File size: 3,554 Bytes
3dac39e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
#!/bin/bash
# =============================================================================
# Sync local Arcspan files to GPU server for R8 training
# Usage: bash scripts/sync_to_gpu.sh [--dry-run]
# =============================================================================
set -euo pipefail

# ── Connection and path settings ──
readonly GPU="root@199.126.203.145"
readonly PORT=18732
# Plain command string: the post-sync verification step expands it unquoted.
readonly SSH="ssh -p $PORT"
# Remote-shell command handed to rsync as ONE argument via -e.
readonly RSYNC_SSH="ssh -p $PORT"
# The tilde is quoted on purpose: it must reach the remote side unexpanded so
# it resolves to the remote user's home directory rather than the local one.
# shellcheck disable=SC2088
readonly REMOTE_DIR="~/alkyline"
readonly LOCAL_DIR="/home/ubuntu/alkyline"

# ── Argument handling ──
# Only an optional --dry-run is accepted. Anything else is rejected so that a
# mistyped flag (e.g. --dryrun) cannot silently start a real transfer.
if (( $# > 1 )) || [[ $# -eq 1 && "$1" != "--dry-run" ]]; then
  echo "Usage: bash scripts/sync_to_gpu.sh [--dry-run]" >&2
  exit 2
fi

DRY_RUN=""
if [[ "${1:-}" == "--dry-run" ]]; then
  DRY_RUN="--dry-run"
  echo "🔍 DRY RUN — no files will be transferred"
  echo ""
fi

# The rsync invocation is kept as an array (not a string passed to eval) so
# every option and path stays a single word no matter what characters it holds.
RSYNC_CMD=(rsync -avz --progress)
if [[ -n "$DRY_RUN" ]]; then
  RSYNC_CMD+=("$DRY_RUN")
fi
RSYNC_CMD+=(-e "$RSYNC_SSH")

#######################################
# Print a step heading, then rsync local paths into one remote directory.
# Globals:   RSYNC_CMD, GPU, REMOTE_DIR (read)
# Arguments: $1  - heading line to print
#            $2  - destination directory, relative to REMOTE_DIR
#            $3… - local source paths
# Outputs:   heading, rsync output and a trailing blank line on stdout
# Returns:   non-zero if rsync fails (which aborts the script under set -e)
#######################################
sync_step() {
  local heading=$1 dest=$2
  shift 2
  echo "$heading"
  "${RSYNC_CMD[@]}" "$@" "$GPU:$REMOTE_DIR/$dest"
  echo ""
}

echo "================================================================"
echo "  Arcspan R8 Sync → GPU ($GPU:$PORT)"
echo "================================================================"
echo ""

# ── 1. CRITICAL: Vendor code (train runner + args with --o-downsample) ──
sync_step "── [1/4] Vendor code (modified _train/ files) ──" \
  "vendor/privacy-filter/opf/_train/" \
  "$LOCAL_DIR/vendor/privacy-filter/opf/_train/runner.py" \
  "$LOCAL_DIR/vendor/privacy-filter/opf/_train/args.py"

# ── 2. Data files ──
sync_step "── [2/4] Data files (R8 train/valid + APTNER + SB2 test) ──" \
  "data/processed/" \
  "$LOCAL_DIR/data/processed/r8_5class_train.jsonl" \
  "$LOCAL_DIR/data/processed/r8_5class_valid.jsonl" \
  "$LOCAL_DIR/data/processed/aptner_5class_test_clean.jsonl" \
  "$LOCAL_DIR/data/processed/securebert2_5class_test.jsonl"

# ── 3. Scripts ──
sync_step "── [3/4] Scripts (R8 training + utilities) ──" \
  "scripts/" \
  "$LOCAL_DIR/scripts/run_train_v8.sh" \
  "$LOCAL_DIR/scripts/build_dataset.py" \
  "$LOCAL_DIR/scripts/viterbi_grid_search.py" \
  "$LOCAL_DIR/scripts/early_stop_monitor.sh" \
  "$LOCAL_DIR/scripts/checkpoint_avg.py"

# ── 4. Label spaces (should already match, but ensure) ──
# The trailing slash on the source makes rsync copy the directory's contents.
sync_step "── [4/4] Label spaces ──" \
  "data/label_spaces/" \
  "$LOCAL_DIR/data/label_spaces/"

# ── Completion banner ──
# DRY_RUN is non-empty only when --dry-run was given; in that case nothing was
# transferred, so the banner tells the operator how to perform the real sync.
echo "================================================================"
if [[ -n "$DRY_RUN" ]]; then
  echo "  DRY RUN COMPLETE — re-run without --dry-run to transfer"
else
  echo "  SYNC COMPLETE"
fi
echo "================================================================"

# ── Post-sync verification ──
# Skipped for dry runs, where nothing was transferred. On a real run the remote
# host checks that the critical files exist, that the vendor code contains the
# o-downsample flag, and reinstalls opf in editable mode. Any failed check makes
# the remote script exit non-zero, which fails this script as well.
if [[ -z "$DRY_RUN" ]]; then
  echo ""
  echo "── Post-sync verification ──"
  # The remote script is sent on stdin through a quoted here-document, so
  # nothing in it is expanded locally and no nested quote escaping is needed.
  # REMOTE_DIR is passed as an argument; the remote shell expands its tilde.
  # SSH holds a command string ("ssh -p PORT"), so it is expanded unquoted on
  # purpose.
  # shellcheck disable=SC2086
  if ! $SSH "$GPU" bash -s -- "$REMOTE_DIR" <<'REMOTE_VERIFY'
    root_dir=${1:?remote directory argument missing}
    status=0

    echo "Checking critical files exist..."
    for f in \
      "$root_dir/data/processed/r8_5class_train.jsonl" \
      "$root_dir/data/processed/r8_5class_valid.jsonl" \
      "$root_dir/data/processed/aptner_5class_test_clean.jsonl" \
      "$root_dir/data/processed/securebert2_5class_test.jsonl" \
      "$root_dir/scripts/run_train_v8.sh" \
      "$root_dir/data/label_spaces/cyner_5class.json"; do
      if [ -f "$f" ]; then
        echo "  ✅ $f"
      else
        echo "  ❌ MISSING: $f"
        status=1
      fi
    done
    echo ""

    echo "Checking --o-downsample in vendor code..."
    # The "." in the pattern matches any single character, so both
    # "o-downsample" and "o_downsample" spellings are accepted.
    if grep -q o.downsample "$root_dir/vendor/privacy-filter/opf/_train/args.py"; then
      echo "  ✅ --o-downsample flag present in args.py"
    else
      echo "  ❌ --o-downsample NOT found — vendor sync failed!"
      status=1
    fi
    echo ""

    echo "Reinstalling opf (editable)..."
    # stdin is detached so pip cannot consume the rest of this script.
    if cd "$root_dir/vendor/privacy-filter" && pip install -e . -q </dev/null; then
      echo "  ✅ opf reinstalled"
    else
      echo "  ❌ opf reinstall FAILED"
      status=1
    fi

    exit "$status"
REMOTE_VERIFY
  then
    echo "Post-sync verification FAILED — see messages above" >&2
    exit 1
  fi
fi