#!/usr/bin/env bash
#
# Parallel extraction of the remaining GenSegDataset tars on h800's slow
# network share.
#
# Big archives are split by member list into N chunks, each extracted by a
# separate `tar -x -T <chunk>` process, to saturate the share's parallel
# small-file throughput.
#
# Progress markers on stdout:
#   [start] <time> launching parallel extraction
#   launched <N> parallel tar streams
#   PEXTRACT_DONE <time>     every extraction stream exited 0
#   CLEANUP_DONE <time>      extracted archives and scratch files removed
# On any failure "PEXTRACT_FAILED ..." is written to stderr, nothing is
# deleted, and the script exits 1 so the run can be inspected and retried.

set -u

readonly BASE=/mnt/tidal-alsh-share2/dataset/qinshengqian/research/c3/NPJ-ACM/Data
readonly TARS="$BASE/_tars"

# Private scratch directory for member lists and chunk files. A fresh
# mktemp directory (instead of a fixed /tmp name) cannot contain stale chunk
# files from an earlier aborted run and cannot be pre-created by another user.
WORK=$(mktemp -d "${TMPDIR:-/tmp}/pextract.XXXXXX") || {
  echo "cannot create scratch directory" >&2
  exit 1
}
readonly WORK

PIDS=()          # PID of every background tar that was launched
PID_TARS=()      # archive each PID is extracting (parallel to PIDS)
LAUNCHED_TARS=() # archives for which at least one stream was launched

#######################################
# Start background extraction of one dataset archive into BASE.
# Globals:
#   BASE, TARS, WORK (read); PIDS, PID_TARS, LAUNCHED_TARS (appended)
# Arguments:
#   $1 - dataset name; the archive is "$TARS/$1.tar"
#   $2 - number of parallel streams; <= 1 means a single plain `tar -x`
# Outputs:
#   "MISSING <path>" on stdout when the archive does not exist;
#   diagnostics on stderr for real errors
# Returns:
#   0 when streams were launched or the archive is absent (soft skip),
#   1 on a bad chunk count or when the member list cannot be built/split
#######################################
launch_ds() {
  local ds=$1 n=$2
  local archive="$TARS/$ds.tar"
  local list="$WORK/$ds.list"
  local chunk started=0
  local -a st

  if [[ ! -f "$archive" ]]; then
    echo "MISSING $archive"
    return 0
  fi
  if [[ ! "$n" =~ ^[0-9]+$ ]]; then
    echo "launch_ds: chunk count must be a non-negative integer, got '$n'" >&2
    return 1
  fi
  n=$((10#$n)) # force base 10 so a leading zero is not read as octal

  if ((n <= 1)); then
    tar -xf "$archive" -C "$BASE" &
    PIDS+=("$!")
    PID_TARS+=("$archive")
    started=1
  else
    # List only non-directory members: directory entries (trailing '/') are
    # left out so that no stream recurses into a whole subtree and duplicates
    # the work of the other streams.
    tar -tf "$archive" | grep -v '/$' >"$list"
    st=("${PIPESTATUS[@]}")
    # grep exits 1 when it selected no lines; only >1 is a real grep error.
    if ((st[0] != 0 || st[1] > 1)); then
      echo "launch_ds: cannot list members of $archive" >&2
      return 1
    fi

    if ! split -n "l/$n" -d "$list" "$WORK/$ds.chunk."; then
      echo "launch_ds: cannot split member list of $archive" >&2
      return 1
    fi

    for chunk in "$WORK/$ds.chunk."*; do
      # split emits empty chunks when the list has fewer lines than $n;
      # there is nothing to extract for those, so do not spawn a tar.
      [[ -s "$chunk" ]] || continue
      tar -xf "$archive" -C "$BASE" -T "$chunk" &
      PIDS+=("$!")
      PID_TARS+=("$archive")
      started=1
    done
  fi

  # Only archives that actually had a stream started are candidates for
  # deletion during cleanup.
  if ((started)); then
    LAUNCHED_TARS+=("$archive")
  fi
  return 0
}

#######################################
# Launch all datasets, wait for every stream, then clean up on success.
# Globals:
#   TARS, WORK, PIDS, PID_TARS, LAUNCHED_TARS (read)
# Returns:
#   0 when every stream succeeded and cleanup ran, 1 otherwise
#######################################
main() {
  local failures=0 i

  echo "[start] $(date +%T) launching parallel extraction"

  # dataset -> parallel chunk count (kits19 = most files)
  launch_ds medsegdb_kits19 8 || failures=$((failures + 1))
  launch_ds pannuke_semantic 4 || failures=$((failures + 1))
  launch_ds refuge2 1 || failures=$((failures + 1))

  echo "launched ${#PIDS[@]} parallel tar streams"

  # Wait on each PID individually: a bare `wait` would discard the exit
  # statuses and make a failed extraction indistinguishable from a good one.
  for ((i = 0; i < ${#PIDS[@]}; i++)); do
    if ! wait "${PIDS[i]}"; then
      echo "extraction stream for ${PID_TARS[i]} (pid ${PIDS[i]}) failed" >&2
      failures=$((failures + 1))
    fi
  done

  if ((failures > 0)); then
    # Keep the archives (they are the only copy of unextracted data) and the
    # chunk lists (they show which stream covered which members).
    echo "PEXTRACT_FAILED $(date +%T) $failures failure(s);" \
      "archives and $WORK left in place" >&2
    return 1
  fi
  echo "PEXTRACT_DONE $(date +%T)"

  # cleanup tars + work: remove only the archives extracted by this run.
  for ((i = 0; i < ${#LAUNCHED_TARS[@]}; i++)); do
    rm -f -- "${LAUNCHED_TARS[i]}"
  done
  # Best effort: the directory is only removed once it is empty.
  rmdir "$TARS" 2>/dev/null || true
  rm -rf -- "${WORK:?}"
  echo "CLEANUP_DONE $(date +%T)"
}

main "$@"