#!/usr/bin/env bash
#/ Usage: ghe-backup-repositories
#/ Take an online, incremental snapshot of all Git repository data.
#/
#/ Note: This command typically isn't called directly. It's invoked by
#/ ghe-backup.
# Exit immediately when a command fails and the failure is not otherwise
# handled (e.g. by `||`, `&&` or an `if` condition).
set -e

# This command is designed to allow for transferring active Git repository data
# from a GitHub instance to a backup site in a way that ensures data is
# captured in a consistent state even when being written to.
#
# - All Git GC operations are disabled on the GitHub instance for the duration of
#   the backup. This removes the possibility of objects or packs being removed
#   while the backup is in progress.
#
# - In progress Git GC operations are given a cooldown window to complete. The
#   script will sleep for up to 60 seconds waiting for GC operations to finish.
#
# - Git repository data is transferred in a specific order: auxiliary files,
#   packed refs, loose refs, reflogs, and finally objects and pack files in that
#   order. This ensures that all referenced objects are captured.
#
# - Git GC operations are re-enabled on the GitHub instance.
#
# The script uses multiple runs of rsync to transfer repository files. Each run
# includes a list of filter rules that ensure only specific types of files are
# transferred.
#
# See the "FILTER RULES" and "INCLUDE/EXCLUDE PATTERN RULES" sections of the
# rsync(1) manual for more information:
#      <http://rsync.samba.org/ftp/rsync/rsync.html>

# Bring in the backup configuration
# shellcheck source=share/github-backup-utils/ghe-backup-config
. "$( dirname "${BASH_SOURCE[0]}" )/ghe-backup-config"

bm_start "$(basename "$0")"

# Set up remote host and root backup snapshot directory based on config
host="$GHE_HOSTNAME"
backup_dir="$GHE_SNAPSHOT_DIR/repositories"

# Location of last good backup for rsync --link-dest
backup_current="$GHE_DATA_DIR/current/repositories"

# Verify rsync is available.
if ! rsync --version 1>/dev/null 2>&1; then
  log_error "rsync not found." 1>&2
  exit 1
fi

# Perform a host-check and establish GHE_REMOTE_XXX variables.
ghe_remote_version_required "$host"

# Split host:port into parts
port=$(ssh_port_part "$GHE_HOSTNAME")
host=$(ssh_host_part "$GHE_HOSTNAME")

# Add user / -l option: take the part before "@" when the host carries one,
# otherwise fall back to "admin".
user="${host%@*}"
[ "$user" = "$host" ] && user="admin"

# Defaults for the single-host case; the cluster branch below overrides
# hostnames, ssh_config_file_opt and opts.
hostnames=$host
ssh_config_file_opt=
# Start from a known-empty replica list so that a value inherited from the
# environment is never mistaken for a list of HA replicas.
ha_replica_hosts=
tempdir=$(mktemp -d -t backup-utils-backup-XXXXXX)
remote_tempdir=$(ghe-ssh "$GHE_HOSTNAME" -- mktemp -d -t backup-utils-backup-XXXXXX)
routes_list=$tempdir/routes_list
remote_routes_list=$remote_tempdir/remote_routes_list
opts="$GHE_EXTRA_SSH_OPTS"

# git server hostnames under cluster
if [ "$GHE_BACKUP_STRATEGY" = "cluster" ]; then
  ssh_config_file="$tempdir/ssh_config"
  ssh_config_file_opt="-F $ssh_config_file"
  opts="$opts -o UserKnownHostsFile=/dev/null -o StrictHostKeyChecking=no -o PasswordAuthentication=no"
  hostnames=$(ghe-cluster-find-nodes "$GHE_HOSTNAME" "git-server")
  ghe-ssh-config "$GHE_HOSTNAME" "$hostnames" > "$ssh_config_file"
fi

# Replica hostnames for HA: only queried when the repl-state file exists on the
# remote instance.
if ghe-ssh "$GHE_HOSTNAME" -- "[ -f '$GHE_REMOTE_ROOT_DIR/etc/github/repl-state' ]"; then
  ha_replica_hosts=$(ghe-ssh "$GHE_HOSTNAME" ghe-cluster-nodes --replica)
fi

# Make sure root backup dir exists if this is the first run
mkdir -p "$backup_dir"

# Exit handler. Kills any background jobs still running, removes the remote
# sync-in-progress file (re-enabling GC operations) on every host in
# $hostnames and every HA replica in $ha_replica_hosts, then deletes the
# remote and local temporary directories.
#
# Globals read: hostnames, port, host, ssh_config_file_opt, ha_replica_hosts,
#               SYNC_IN_PROGRESS_FILE, GHE_HOSTNAME, remote_tempdir, tempdir
#
# Every remote step is best-effort: a failure is reported on stderr and the
# remaining steps still run. This matters because the script runs under
# `set -e`; an unguarded failure would abort the handler and leave the local
# temporary directory behind.
cleanup() {
  local pid hostname replica_host

  for pid in $(jobs -p); do
    kill -KILL "$pid" > /dev/null 2>&1 || true
  done

  # Enable remote GC operations
  # $hostnames and $ssh_config_file_opt are intentionally unquoted: the first
  # is a whitespace-separated list, the second is either empty or "-F <file>".
  for hostname in $hostnames; do
    ghe-gc-enable $ssh_config_file_opt "$hostname:$port" || {
      echo "Re-enable gc on $hostname failed, please manually delete $SYNC_IN_PROGRESS_FILE" 1>&2
    }
  done

  # Enable remote GC operations for HA replica. The command is piped to a
  # shell on $host, which in turn reaches the replica over ssh.
  for replica_host in $ha_replica_hosts; do
    echo "set -o pipefail; ssh $replica_host -- 'sudo rm -f $SYNC_IN_PROGRESS_FILE'" | ghe-ssh "$host" /bin/bash || {
      echo "Re-enable gc on $replica_host failed, please manually delete $SYNC_IN_PROGRESS_FILE" 1>&2
    }
  done

  # Remove the remote scratch directory; never let a failure here prevent the
  # local directory from being removed below.
  if [ -n "$remote_tempdir" ]; then
    ghe-ssh "$GHE_HOSTNAME" -- rm -rf "$remote_tempdir" || {
      echo "Warning: failed to remove remote temporary directory $remote_tempdir on $GHE_HOSTNAME" 1>&2
    }
  fi

  if [ -n "$tempdir" ]; then
    rm -rf -- "$tempdir"
  fi
}
# Run cleanup both when the script exits and when it receives SIGINT.
trap 'cleanup' EXIT INT

# Disable remote GC operations
# $hostnames and $ssh_config_file_opt are intentionally unquoted: the first is
# a whitespace-separated list, the second is either empty or "-F <file>".
for hostname in $hostnames; do
  ghe-gc-disable $ssh_config_file_opt "$hostname:$port"
done

# Disable remote GC operations for HA replica
# gc_disable is a function defined in ghe-backup-config
# gc_disable is called on the replica node via the primary node, because the
# replica node is not expected to be reachable from the backup host, but it is
# expected to be reachable from the primary node.
# A failure here is only reported; the backup continues.
for replica_host in $ha_replica_hosts; do
  echo "set -o pipefail; ssh $replica_host -- '$(declare -f gc_disable); gc_disable \"$SYNC_IN_PROGRESS_FILE\" \"$GHE_GIT_COOLDOWN_PERIOD\"'" | ghe-ssh "$host" /bin/bash || {
    echo "Disable gc on $replica_host failed" 1>&2
  }
done

# If we have a previous increment, avoid transferring existing files via rsync's
# --link-dest support. This also decreases physical space usage considerably.
# Reset first so a value inherited from the environment is never used.
link_dest=
if [ -d "$backup_current" ]; then
  link_dest="--link-dest=../../current/repositories"
fi

# Calculate sync routes. This will store the healthy repo paths for each node
#
# This gets a repo path and stores the path in the $node.rsync file
# a/nw/a8/3f/02/100000855 dgit-node1 >> dgit-node1.rsync
# a/nw/a8/bc/8d/100000880 dgit-node3 >> dgit-node3.rsync
# a/nw/a5/06/81/100000659 dgit-node2 >> dgit-node2.rsync
# ...
# One route per line.
#
# NOTE: The route generation is performed on the appliance as it is considerably
# more performant than performing over an SSH pipe.
#
bm_start "$(basename "$0") - Generating routes"
echo "github-env ./bin/dgit-cluster-backup-routes > $remote_routes_list" | ghe-ssh "$GHE_HOSTNAME" -- /bin/bash
ghe-ssh "$GHE_HOSTNAME" -- cat "$remote_routes_list" | ghe_debug
bm_end "$(basename "$0") - Generating routes"

# Copy the route list to the backup host, gzip-compressed over the wire.
bm_start "$(basename "$0") - Fetching routes"
ghe-ssh "$GHE_HOSTNAME" -- gzip -c "$remote_routes_list" | gzip -d > "$routes_list"
< "$routes_list" ghe_debug
bm_end "$(basename "$0") - Fetching routes"

bm_start "$(basename "$0") - Processing routes"
# Outside cluster mode every route is attributed to the single host; in
# cluster mode `server` stays empty and the hosts named on each route line are
# used instead.
server=
if [ "$GHE_BACKUP_STRATEGY" != "cluster" ]; then
  server=$host
fi
# For each route line, field 1 is the repository path and fields 2..NF are
# hosts. The path is appended to "<tempdir>/<host>.rsync" once per host field,
# where <host> is $server when it is non-empty.
< "$routes_list" awk -v tempdir="$tempdir" -v server="$server" '{ for(i=2;i<=NF;i++){ server != "" ? host=server : host=$i; print $1 > (tempdir"/"host".rsync") }}'
ghe_debug "\n$(find "$tempdir" -maxdepth 1 -name '*.rsync')"
bm_end "$(basename "$0") - Processing routes"

# Nothing to transfer when no per-host list was produced.
if [ -z "$(find "$tempdir" -maxdepth 1 -name '*.rsync')" ]; then
  log_warn "no routes found, skipping repositories backup ..."
  exit 0
else
  increment-progress-total-count 3
fi

# Transfer repository data from a GitHub instance to the current snapshot
# directory, using a previous snapshot to avoid transferring files that have
# already been transferred. A set of rsync filter rules are provided on stdin
# for each invocation.
#
# Arguments:
#   $1   - remote in "host:port" form
#   $2   - optional; when it ends in ".rsync" it is consumed as the list of
#          paths to transfer (rsync --files-from). Any other value is passed
#          through to ghe-rsync like the remaining arguments.
#   ...  - extra options passed through to ghe-rsync
# Stdin:   rsync filter rules, read through --include-from=-
# Globals read: opts, ssh_config_file_opt, user, link_dest,
#               GHE_REMOTE_DATA_USER_DIR, backup_dir
# Output:  log markers and all ghe-rsync output are written to fd 3
rsync_repository_data () {
  # Keep port/host local: the script-level `host` and `port` are still needed
  # by the exit handler and must not be overwritten by a per-node transfer.
  local port host
  local -a files_from=()

  port=$(ssh_port_part "$1")
  host=$(ssh_host_part "$1")

  # check if we are syncing from a given file list
  if [[ "$2" == *".rsync" ]]; then
    files_from=(--files-from="$2")
    shift
  fi
  shift

  log_rsync "BEGIN: repositories rsync" 1>&3
  # $link_dest is intentionally unquoted so that an empty value disappears.
  # The ${arr[@]+...} form expands to nothing when no file list was given.
  ghe-rsync -avr \
    -e "ssh -q $opts -p $port $ssh_config_file_opt -l $user" \
    $link_dest "$@" \
    --rsync-path='sudo -u git rsync' \
    --include-from=- --exclude=\* \
    ${files_from[@]+"${files_from[@]}"} \
    --ignore-missing-args \
    "$host:$GHE_REMOTE_DATA_USER_DIR/repositories/" \
    "$backup_dir" 1>&3 2>&3
  log_rsync "END: repositories rsync" 1>&3
}

# Transfer repository data for one host in four rsync passes, in this order:
# auxiliary files, packed-refs, loose refs and reflogs, objects and packs.
# Each pass hands its own filter rules to rsync_repository_data on stdin.
#
# Arguments:
#   $1 - hostname to sync from; ":122" is appended to form the remote
#   $2 - optional file list forwarded to rsync_repository_data
# Output: progress messages are written to fd 3
sync_data (){
  local remote="$1:122"

  # ${2:+"$2"} forwards the file list as exactly one argument, and as no
  # argument at all when it is missing or empty.

  # Sync all auxiliary repository data. This includes files and directories like
  # HEAD, audit_log, config, description, info/, etc. No refs or object data
  # should be transferred here.
  echo 1>&3

  log_info "* Transferring auxiliary files ..." 1>&3
  rsync_repository_data "$remote" ${2:+"$2"} <<'RULES'
- /__*__/
- /info/

+ /*/
+ /*/*.git
- /*/*.git/objects
- /*/*.git/refs
- /*/*.git/packed-refs
- /*/*.git/logs
+ /*/*.git/**

+ /*/??/
+ /*/??/??/
+ /*/??/??/??/
+ /*/??/??/??/gist/
+ /*/??/??/??/gist/*.git
- /*/??/??/??/gist/*.git/objects
- /*/??/??/??/gist/*.git/refs
- /*/??/??/??/gist/*.git/packed-refs
- /*/??/??/??/gist/*.git/logs
+ /*/??/??/??/gist/*.git/**

+ /*/nw/??/??/??/
+ /*/nw/??/??/??/*/
+ /*/nw/??/??/??/*/*.git
- /*/nw/??/??/??/*/*.git/objects
- /*/nw/??/??/??/*/*.git/refs
- /*/nw/??/??/??/*/*.git/packed-refs
- /*/nw/??/??/??/*/*.git/logs
+ /*/nw/??/??/??/*/*.git/**
RULES

  # Sync packed refs files. This is performed before sync'ing loose refs since
  # loose refs trump packed-refs information.
  echo 1>&3
  log_info "* Transferring packed-refs files ..." 1>&3
  rsync_repository_data "$remote" ${2:+"$2"} <<'RULES'
- /__*__/
- /info/

+ /*/
+ /*/*.git
+ /*/*.git/packed-refs

+ /*/??/
+ /*/??/??/
+ /*/??/??/??/
+ /*/??/??/??/gist/
+ /*/??/??/??/gist/*.git
+ /*/??/??/??/gist/*.git/packed-refs

+ /*/nw/??/
+ /*/nw/??/??/
+ /*/nw/??/??/??/
+ /*/nw/??/??/??/*/
+ /*/nw/??/??/??/*/*.git
+ /*/nw/??/??/??/*/*.git/packed-refs
RULES

  # Sync loose refs and reflogs. This must be performed before object data is
  # transferred to ensure that all referenced objects are included.
  echo 1>&3
  log_info "* Transferring refs and reflogs ..."  1>&3
  rsync_repository_data "$remote" ${2:+"$2"} <<'RULES'
- /__*__/
- /info/

+ /*/
+ /*/*.git
+ /*/*.git/refs
+ /*/*.git/refs/**
+ /*/*.git/logs
+ /*/*.git/logs/**

+ /*/??/
+ /*/??/??/
+ /*/??/??/??/
+ /*/??/??/??/gist/
+ /*/??/??/??/gist/*.git
+ /*/??/??/??/gist/*.git/refs
+ /*/??/??/??/gist/*.git/refs/**
+ /*/??/??/??/gist/*.git/logs
+ /*/??/??/??/gist/*.git/logs/**

+ /*/nw/??/
+ /*/nw/??/??/
+ /*/nw/??/??/??/
+ /*/nw/??/??/??/*/
+ /*/nw/??/??/??/*/*.git
+ /*/nw/??/??/??/*/*.git/refs
+ /*/nw/??/??/??/*/*.git/refs/**
+ /*/nw/??/??/??/*/*.git/logs
+ /*/nw/??/??/??/*/*.git/logs/**
RULES

  # Sync git objects and pack files. Temporary tmp_* files under objects/ are
  # excluded. The extra -H is passed through to rsync (hard-link preservation).
  # NOTE(review): this comment used to say compression is disabled during this
  # phase, but nothing in this function changes compression - confirm against
  # ghe-rsync.
  echo 1>&3
  log_info "* Transferring objects and packs ..." 1>&3
  rsync_repository_data "$remote" ${2:+"$2"} -H <<'RULES'
- /__*__/
- /info/

+ /*/
+ /*/*.git
+ /*/*.git/objects
- /*/*.git/objects/**/tmp_*
+ /*/*.git/objects/**

+ /*/??/
+ /*/??/??/
+ /*/??/??/??/
+ /*/??/??/??/gist/
+ /*/??/??/??/gist/*.git
+ /*/??/??/??/gist/*.git/objects
- /*/??/??/??/gist/*.git/objects/**/tmp_*
+ /*/??/??/??/gist/*.git/objects/**

+ /*/nw/??/
+ /*/nw/??/??/
+ /*/nw/??/??/??/
+ /*/nw/??/??/??/*/
+ /*/nw/??/??/??/*/*.git
+ /*/nw/??/??/??/*/*.git/objects
- /*/nw/??/??/??/*/*.git/objects/**/tmp_*
+ /*/nw/??/??/??/*/*.git/objects/**
RULES

  echo 1>&3

}

# rsync all the repositories: one background sync_data job per per-host file
# list found in $tempdir.
bm_start "$(basename "$0") - Repo sync"
for file_list in "$tempdir"/*.rsync; do
  hostname=$(basename "$file_list" .rsync)

  repo_num=$(< "$file_list" wc -l)
  ghe_verbose "* Transferring $repo_num repositories from $hostname"

  sync_data "$hostname" "$file_list" &
done

# Wait for each job individually so that its exit status is observed; with
# `set -e` in effect a failed job aborts the script here.
for pid in $(jobs -p); do
  wait "$pid"
done
bm_end "$(basename "$0") - Repo sync"

# Since there are no routes for special data directories, we need to do this
# serially for all hostnames. Good candidate for future optimizations.

bm_start "$(basename "$0") - Special Data Directories Sync"
for h in $hostnames; do
  # Sync __special__ data directories, including the __alambic_assets__,
  # __hookshot__, and __purgatory__ directories. The __nodeload_archives__,
  # __gitmon__, and __render__ directories are excluded since they act only as
  # caches.
  #
  # Under v2.x and greater, only the special __purgatory__ directory remains under
  # /data/repositories. All other special user data directories have been moved under
  # the /data/user directory.
  echo 1>&3
  log_info "* Transferring special data directories from $h..." 1>&3
  rsync_repository_data "$h:122" <<RULES
- /__nodeload_archives__/
- /__gitmon__/
- /__render__/
+ /__*__/
+ /__*__/**
+ /info/
- /info/lost+found/
+ /info/*
RULES
  echo 1>&3
done
bm_end "$(basename "$0") - Special Data Directories Sync"

# Optional verification: compare the de-duplicated set of routes that were
# requested with the *.git directories actually present in the snapshot.
if [[ "$GHE_ROUTE_VERIFICATION" == "true" ]]; then
  bm_start "$(basename "$0") - Verifying Routes"
  for file_lst in "$tempdir"/*.rsync; do
    < "$file_lst" sort | uniq
  done |sort|uniq > "$tempdir/source_routes"
  (cd "$backup_dir/" && find * -mindepth 5 -maxdepth 6 -type d -name \*.git | fix_paths_for_ghe_version | uniq | sort | uniq) > "$tempdir/destination_routes"

  # A difference only produces a warning; it does not fail the backup.
  git --no-pager diff --unified=0 --no-prefix -- "$tempdir/source_routes" "$tempdir/destination_routes" || echo "Warning: One or more repository networks and/or gists were not found on the source appliance."
  increment-progress-total-count 1
  bm_end "$(basename "$0") - Verifying Routes"
fi

bm_end "$(basename "$0")"
