#!/usr/bin/env bash
# Parallel extraction of the remaining GenSegDataset tars on h800's slow network share.
# Big archives are split by member-list into N chunks, each extracted by a separate
# `tar -x -T <chunk>` process, to saturate the share's parallel small-file throughput.
# Treat expansion of an unset variable as an error, so a misspelled name can
# never silently expand to an empty path in the tar/rm commands below.
set -u
# Destination tree the archives are unpacked into.
BASE=/mnt/tidal-alsh-share2/dataset/qinshengqian/research/c3/NPJ-ACM/Data
# Directory holding the source <dataset>.tar archives.
TARS=$BASE/_tars
# Private scratch directory for member lists and chunk files. A fresh mktemp
# directory (instead of a fixed /tmp name) cannot contain leftovers from an
# earlier or concurrent run, and the script stops if it cannot be created.
WORK=$(mktemp -d "${TMPDIR:-/tmp}/pextract.XXXXXX") || {
  echo "cannot create scratch directory under ${TMPDIR:-/tmp}" >&2
  exit 1
}

# PIDs of every background tar started by launch_ds. They are collected from
# $! because a bare `wait` always returns 0 and would hide a failed stream.
PEXTRACT_PIDS=()
# Becomes 1 when an archive cannot be listed/split or a tar stream fails.
PEXTRACT_FAILED=0

#######################################
# Start background extraction of one dataset archive into $BASE.
# With a chunk count above 1 the archive's member list is split into that
# many line-based chunks and one `tar -x -T <chunk>` is started per chunk.
# Globals:
#   TARS, BASE, WORK (read); PEXTRACT_PIDS, PEXTRACT_FAILED (written)
# Arguments:
#   $1 - dataset name; the archive is $TARS/$1.tar
#   $2 - number of parallel tar processes (<= 1 means one plain extraction)
# Outputs:
#   "MISSING <path>" on stdout when the archive does not exist
# Returns:
#   0 when streams were started or the archive is missing (skipped),
#   1 when the member list could not be produced or split
#######################################
launch_ds() {
  local ds=$1 n=$2 tar="$TARS/$1.tar"
  local list="$WORK/$1.list" prefix="$WORK/$1.chunk." c

  # A missing archive is reported and skipped; it is not counted as a failure.
  [ -f "$tar" ] || { echo "MISSING $tar"; return 0; }

  if [ "$n" -le 1 ]; then
    tar -xf "$tar" -C "$BASE" &
    PEXTRACT_PIDS+=("$!")
    return 0
  fi

  # Entries whose name ends in "/" (directories) are dropped from the list;
  # presumably so that no single tar process pulls in a whole subtree.
  # NOTE(review): hard-link members whose target lands in another chunk may
  # fail to link; such a failure now surfaces through the exit-status check.
  tar -tf "$tar" | grep -v '/$' > "$list"
  # Only tar's status matters here: grep exits 1 on an empty selection.
  if [ "${PIPESTATUS[0]}" -ne 0 ]; then
    echo "LIST_FAILED $tar" >&2
    PEXTRACT_FAILED=1
    return 1
  fi

  # Remove chunk files left by an earlier run so the glob below only ever
  # sees the chunks written by this split.
  rm -f -- "$prefix"*
  if ! split -n "l/$n" -d "$list" "$prefix"; then
    echo "SPLIT_FAILED $list" >&2
    PEXTRACT_FAILED=1
    return 1
  fi

  for c in "$prefix"*; do
    tar -xf "$tar" -C "$BASE" -T "$c" &
    PEXTRACT_PIDS+=("$!")
  done
}

echo "[start] $(date +%T) launching parallel extraction"
# dataset -> parallel chunk count (kits19 = most files)
launch_ds medsegdb_kits19 8
launch_ds pannuke_semantic 4
launch_ds refuge2 1
echo "launched ${#PEXTRACT_PIDS[@]} parallel tar streams"

# Wait for each stream by PID so its exit status is actually observed.
# The ${arr[@]+...} form keeps an empty array safe under `set -u`.
for pid in ${PEXTRACT_PIDS[@]+"${PEXTRACT_PIDS[@]}"}; do
  rc=0
  wait "$pid" || rc=$?
  if [ "$rc" -ne 0 ]; then
    echo "FAILED tar stream pid=$pid status=$rc" >&2
    PEXTRACT_FAILED=1
  fi
done

# The scratch lists are of no further use whatever the outcome.
rm -rf -- "$WORK"

# Never delete the source archives unless every stream succeeded.
if [ "$PEXTRACT_FAILED" -ne 0 ]; then
  echo "PEXTRACT_FAILED $(date +%T) archives kept in $TARS" >&2
  exit 1
fi
echo "PEXTRACT_DONE $(date +%T)"

# cleanup tars
rm -f -- "$TARS"/*.tar
# Best effort: the directory is only removed when nothing else is left in it.
rmdir "$TARS" 2>/dev/null || true
echo "CLEANUP_DONE $(date +%T)"