#!/usr/bin/env bash
# Periodically appends a training/eval status snapshot to the monitor log for the
# MWave QLoRA run; if the latest training log shows a CUDA OOM, relaunches the
# full pipeline once with max_seq_length=1024.
set -euo pipefail

ROOT="${ROOT:-/home/ubuntu/Documents/MWave}"
INTERVAL_SECONDS="${INTERVAL_SECONDS:-1800}"
MONITOR_LOG="$ROOT/outputs/logs/training_monitor.log"
STATE_FILE="$ROOT/outputs/logs/training_monitor.state"

mkdir -p "$ROOT/outputs/logs"
touch "$STATE_FILE"

# Path of the most recent training log, if any.
latest_train_log() {
  ls -t "$ROOT"/outputs/logs/train_resume_*.log 2>/dev/null | head -n 1 || true
}

# Training/eval processes currently running (excluding this monitor itself).
active_pipeline() {
  pgrep -af 'scripts/train_qlora.py|scripts/evaluate.py|train_resume_2048|train_resume_1024' | grep -v monitor_training || true
}

# Names of the three newest checkpoint directories.
checkpoint_summary() {
  find "$ROOT/outputs/qwen35_9b_lora" -maxdepth 1 -type d -name 'checkpoint-*' 2>/dev/null | sort -V | tail -n 3 | xargs -r -n 1 basename
}

# Line counts of the fine-tuned prediction files (0 if a file is missing).
prediction_counts() {
  for f in \
    "$ROOT/outputs/predictions/finetuned_struct_predictions.jsonl" \
    "$ROOT/outputs/predictions/finetuned_qa_predictions.jsonl"; do
    if [[ -f "$f" ]]; then
      wc -l "$f"
    else
      echo "0 $f"
    fi
  done
}

# Relaunch training + evaluation + figures + report at max_seq_length=1024, but
# only once (state file guard), only when nothing is running, only when the
# fine-tuned metrics don't already exist, and only after an OOM in the log.
restart_1024_if_needed() {
  local log="$1"
  if grep -q 'fallback_1024_started=1' "$STATE_FILE"; then
    return 0
  fi
  if [[ -n "$(active_pipeline)" ]]; then
    return 0
  fi
  if [[ -f "$ROOT/outputs/metrics/finetuned_struct_metrics.json" ]]; then
    return 0
  fi
  if [[ -n "$log" ]] && grep -q 'torch.OutOfMemoryError' "$log"; then
    echo "fallback_1024_started=1" >> "$STATE_FILE"
    local fallback_log="$ROOT/outputs/logs/train_resume_1024_$(date +%Y%m%d_%H%M%S).log"
    echo "[$(date '+%F %T %Z')] OOM detected. Restarting with max_seq_length=1024. log=$fallback_log" >> "$MONITOR_LOG"
    rm -rf "$ROOT/outputs/qwen35_9b_lora"
    (
      cd "$ROOT"
      export ALL_PROXY=
      export all_proxy=
      export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
      python3 scripts/train_qlora.py \
        --model-name Qwen/Qwen3.5-9B \
        --train-file data/processed/train_mixed.jsonl \
        --val-file data/processed/val_mixed.jsonl \
        --output-dir outputs/qwen35_9b_lora \
        --max-seq-length 1024 \
      && python3 scripts/evaluate.py \
        --model-name Qwen/Qwen3.5-9B \
        --adapter-dir outputs/qwen35_9b_lora \
        --input-file data/processed/val_struct.jsonl \
        --task-type struct \
        --output-dir outputs \
        --run-name finetuned \
      && python3 scripts/evaluate.py \
        --model-name Qwen/Qwen3.5-9B \
        --adapter-dir outputs/qwen35_9b_lora \
        --input-file data/processed/val_qa.jsonl \
        --task-type qa \
        --output-dir outputs \
        --run-name finetuned \
        --max-new-tokens 512 \
      && python3 scripts/visualize_results.py \
        --metrics outputs/metrics/base_struct_metrics.json outputs/metrics/base_qa_metrics.json outputs/metrics/finetuned_struct_metrics.json outputs/metrics/finetuned_qa_metrics.json \
        --predictions outputs/predictions/base_struct_predictions.jsonl outputs/predictions/finetuned_struct_predictions.jsonl \
        --out-dir outputs/figures \
      && python3 scripts/build_report.py
    ) > "$fallback_log" 2>&1 &
  fi
}

# Append one status snapshot to the monitor log, then check the OOM fallback.
check_once() {
  local log
  log="$(latest_train_log)"
  {
    echo "===== $(date '+%F %T %Z') ====="
    echo "latest_train_log=${log:-none}"
    echo "-- active processes --"
    active_pipeline || true
    echo "-- gpu --"
    nvidia-smi --query-gpu=memory.used,memory.free,utilization.gpu --format=csv,noheader || true
    nvidia-smi --query-compute-apps=pid,process_name,used_memory --format=csv,noheader || true
    echo "-- checkpoints --"
    checkpoint_summary || true
    echo "-- finetuned prediction counts --"
    prediction_counts || true
    if [[ -n "$log" ]]; then
      echo "-- recent progress --"
      grep -aoE '[0-9]+/6283|generated [0-9]+/4030|torch.OutOfMemoryError|Traceback|train_runtime|eval_loss' "$log" | tail -n 20 || true
    fi
  } >> "$MONITOR_LOG"
  restart_1024_if_needed "$log"
}

# Main loop: one snapshot every INTERVAL_SECONDS.
while true; do
  check_once
  sleep "$INTERVAL_SECONDS"
done