#!/usr/bin/env bash
# Training watchdog: periodically appends a status snapshot (processes, GPU,
# checkpoints, prediction counts, recent log lines) to a monitor log and, if
# the training run died with a CUDA out-of-memory error, relaunches the whole
# train/eval/report pipeline once with max_seq_length=1024.
set -euo pipefail


# Configuration — every value is overridable through the environment.
ROOT="${ROOT:-/home/ubuntu/Documents/MWave}"
INTERVAL_SECONDS="${INTERVAL_SECONDS:-1800}"
# Rolling log this monitor appends to on every check.
MONITOR_LOG="$ROOT/outputs/logs/training_monitor.log"
# Persists the fallback_1024_started flag so the fallback is launched at most
# once even across monitor restarts.
STATE_FILE="$ROOT/outputs/logs/training_monitor.state"


# Make sure the log directory and state file exist before the first check.
mkdir -p "$ROOT/outputs/logs"
touch "$STATE_FILE"
|
|
# Print the path of the most recently modified train_resume_*.log under
# $ROOT/outputs/logs, or nothing (still exit 0) when none exist.
latest_train_log() {
  local newest='' candidate
  for candidate in "$ROOT"/outputs/logs/train_resume_*.log; do
    [[ -e "$candidate" ]] || continue   # skip the literal pattern on no match
    if [[ -z "$newest" || "$candidate" -nt "$newest" ]]; then
      newest="$candidate"
    fi
  done
  if [[ -n "$newest" ]]; then
    printf '%s\n' "$newest"
  fi
}
|
|
# List any running training/eval pipeline processes (pid + command line),
# excluding this monitor itself. Always exits 0; prints nothing when idle.
active_pipeline() {
  local pipeline_pattern='scripts/train_qlora.py|scripts/evaluate.py|train_resume_2048|train_resume_1024'
  pgrep -af "$pipeline_pattern" | grep -v monitor_training || true
}
|
|
# Print the basenames of the three highest-numbered checkpoint-* directories
# in the LoRA output dir (version sort), one per line; nothing if none exist.
checkpoint_summary() {
  find "$ROOT/outputs/qwen35_9b_lora" -maxdepth 1 -type d -name 'checkpoint-*' 2>/dev/null \
    | sort -V \
    | tail -n 3 \
    | while IFS= read -r checkpoint_dir; do
        basename "$checkpoint_dir"
      done
}
|
|
# Report line counts for the finetuned prediction files. Each output line is
# "<count> <path>" (wc -l format); a missing file is reported as "0 <path>".
prediction_counts() {
  local prediction_files=(
    "$ROOT/outputs/predictions/finetuned_struct_predictions.jsonl"
    "$ROOT/outputs/predictions/finetuned_qa_predictions.jsonl"
  )
  local path
  for path in "${prediction_files[@]}"; do
    if [[ -f "$path" ]]; then
      wc -l "$path"
    else
      printf '0 %s\n' "$path"
    fi
  done
}
|
|
# Launch the reduced-sequence-length fallback pipeline in the background, but
# only when ALL of the following hold:
#   * the fallback has not already been started (state-file flag),
#   * no pipeline process is currently running,
#   * the finetuned struct metrics file does not exist yet (run incomplete),
#   * the given training log contains a torch.OutOfMemoryError.
# Globals: ROOT, STATE_FILE, MONITOR_LOG (read); deletes and recreates
#          $ROOT/outputs/qwen35_9b_lora via the relaunched training run.
# Arguments: $1 - path to the latest training log ("" when none exists).
restart_1024_if_needed() {
  local log="$1"
  # Idempotence guard: start the fallback at most once, ever.
  if grep -q 'fallback_1024_started=1' "$STATE_FILE"; then
    return 0
  fi
  # Something is still running — do not interfere.
  if [[ -n "$(active_pipeline)" ]]; then
    return 0
  fi
  # Struct metrics present means the pipeline already finished evaluation.
  if [[ -f "$ROOT/outputs/metrics/finetuned_struct_metrics.json" ]]; then
    return 0
  fi
  # NOTE(review): the unescaped '.' makes this a regex any-char match, so it
  # would also match e.g. "torch_OutOfMemoryError"; harmless here, but -F
  # would be stricter — confirm before tightening.
  if [[ -n "$log" ]] && grep -q 'torch.OutOfMemoryError' "$log"; then
    # Record the flag BEFORE launching so a monitor crash/restart cannot
    # double-launch the fallback.
    echo "fallback_1024_started=1" >> "$STATE_FILE"
    local fallback_log="$ROOT/outputs/logs/train_resume_1024_$(date +%Y%m%d_%H%M%S).log"
    echo "[$(date '+%F %T %Z')] OOM detected. Restarting with max_seq_length=1024. log=$fallback_log" >> "$MONITOR_LOG"
    # Discard the OOM-poisoned adapter dir so training starts fresh.
    # NOTE(review): unguarded rm -rf; ROOT has a default so it cannot be
    # empty here, but "${ROOT:?}" would be the defensive form.
    rm -rf "$ROOT/outputs/qwen35_9b_lora"
    # Full pipeline (train -> eval struct -> eval qa -> figures -> report),
    # chained with && so a failed stage stops the rest; runs detached in a
    # subshell with all output captured in $fallback_log.
    (
      cd "$ROOT"
      # Clear any SOCKS proxy and allow the CUDA allocator to grow segments,
      # reducing fragmentation-induced OOMs on the retry.
      export ALL_PROXY=
      export all_proxy=
      export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
      python3 scripts/train_qlora.py \
        --model-name Qwen/Qwen3.5-9B \
        --train-file data/processed/train_mixed.jsonl \
        --val-file data/processed/val_mixed.jsonl \
        --output-dir outputs/qwen35_9b_lora \
        --max-seq-length 1024 \
        && python3 scripts/evaluate.py \
        --model-name Qwen/Qwen3.5-9B \
        --adapter-dir outputs/qwen35_9b_lora \
        --input-file data/processed/val_struct.jsonl \
        --task-type struct \
        --output-dir outputs \
        --run-name finetuned \
        && python3 scripts/evaluate.py \
        --model-name Qwen/Qwen3.5-9B \
        --adapter-dir outputs/qwen35_9b_lora \
        --input-file data/processed/val_qa.jsonl \
        --task-type qa \
        --output-dir outputs \
        --run-name finetuned \
        --max-new-tokens 512 \
        && python3 scripts/visualize_results.py \
        --metrics outputs/metrics/base_struct_metrics.json outputs/metrics/base_qa_metrics.json outputs/metrics/finetuned_struct_metrics.json outputs/metrics/finetuned_qa_metrics.json \
        --predictions outputs/predictions/base_struct_predictions.jsonl outputs/predictions/finetuned_struct_predictions.jsonl \
        --out-dir outputs/figures \
        && python3 scripts/build_report.py
    ) > "$fallback_log" 2>&1 &
  fi
}
|
|
# One monitoring pass: append a timestamped status snapshot to MONITOR_LOG
# (processes, GPU, checkpoints, prediction counts, recent training-log
# progress), then apply the OOM-fallback policy against the newest log.
check_once() {
  local log
  log="$(latest_train_log)"
  {
    printf '===== %s =====\n' "$(date '+%F %T %Z')"
    printf 'latest_train_log=%s\n' "${log:-none}"
    printf '%s\n' '-- active processes --'
    active_pipeline || true
    printf '%s\n' '-- gpu --'
    # Overall memory/utilization, then per-process GPU memory.
    nvidia-smi --query-gpu=memory.used,memory.free,utilization.gpu --format=csv,noheader || true
    nvidia-smi --query-compute-apps=pid,process_name,used_memory --format=csv,noheader || true
    printf '%s\n' '-- checkpoints --'
    checkpoint_summary || true
    printf '%s\n' '-- finetuned prediction counts --'
    prediction_counts || true
    if [[ -n "$log" ]]; then
      printf '%s\n' '-- recent progress --'
      # Pull only progress/failure markers out of the (possibly binary) log.
      grep -aoE '[0-9]+/6283|generated [0-9]+/4030|torch.OutOfMemoryError|Traceback|train_runtime|eval_loss' "$log" | tail -n 20 || true
    fi
  } >> "$MONITOR_LOG"
  restart_1024_if_needed "$log"
}
|
|
# Main loop: run one status check, then wait out the configured interval.
while :; do
  check_once
  sleep "$INTERVAL_SECONDS"
done
|
|