#!/bin/bash
#
# ORPO eval watchdog.
#
# Waits for the ORPO training process (train/orpo.py, run korean_3b_orpo_v1)
# to exit, then runs eval/orpo_eval_pipeline.py on the latest checkpoint.
# Telegram notifications are sent only when both TELEGRAM_BOT_TOKEN and
# TELEGRAM_CHAT_ID are present in the environment.

set -euo pipefail

PROJECT_ROOT="/PROJECT/0325120031_A/ghong/taketimes/llm-bang"
TRAIN_LOG="${PROJECT_ROOT}/checkpoints/korean_3b_orpo_v1/train.log"

# pgrep exits 1 when nothing matches. With `set -e` and `pipefail` that would
# abort the script right here, so the "training already finished" path could
# never be reached. `|| true` turns "no match" into an empty TRAIN_PID.
TRAIN_PID=$(pgrep -f "train/orpo.py.*korean_3b_orpo_v1" | head -1 || true)
|
|
# Startup banner: current time, the training PID that was detected (or
# "NOT FOUND" when it is empty), and the log file being watched.
banner_rule="=============================================="
printf '%s\n' \
  "${banner_rule}" \
  " ORPO Eval Watchdog Started" \
  "${banner_rule}" \
  " Time : $(date '+%Y-%m-%d %H:%M:%S')" \
  " Train PID : ${TRAIN_PID:-NOT FOUND}" \
  " Train Log : ${TRAIN_LOG}" \
  "${banner_rule}"
|
|
# Block until training is over: either poll the live process, or (when no
# process was found) require a completion marker in the training log.
if [[ -n "${TRAIN_PID}" ]]; then
  echo "[INFO] Watching training PID ${TRAIN_PID}..."
  echo ""

  # `kill -0` sends no signal; it only reports whether the PID can still be
  # signalled. Progress is printed once a minute while it can.
  while kill -0 "${TRAIN_PID}" 2>/dev/null; do
    # Last "<step>/9840" progress marker in the log, "?/?" if there is none.
    step_now=$(grep -oP '\d+/9840' "${TRAIN_LOG}" 2>/dev/null | tail -1 || echo "?/?")
    # Quoted value of the 'loss' key on the last log line carrying one,
    # "?" when it cannot be extracted.
    loss_now=$(
      grep "'loss':" "${TRAIN_LOG}" 2>/dev/null \
        | tail -1 \
        | grep -oP "'loss': '\K[^']+(?=')" \
        || echo "?"
    )
    echo "[$(date '+%H:%M:%S')] Step ${step_now} | Loss: ${loss_now} | PID ${TRAIN_PID} running"
    sleep 60
  done

  echo ""
  echo "=============================================="
  echo "[INFO] Training process ${TRAIN_PID} has ended."
  echo "[INFO] Time: $(date '+%Y-%m-%d %H:%M:%S')"
  echo "=============================================="
else
  echo "[WARN] Training process not found. Checking if already completed..."

  # Either marker in the log is accepted as evidence that training finished.
  if ! grep -q -e "Training completed" -e "Saving model checkpoint" "${TRAIN_LOG}" 2>/dev/null; then
    echo "[ERROR] No training process and no completion marker found. Exiting."
    exit 1
  fi
  echo "[INFO] Training appears to have already completed."
fi
|
|
| |
# Short pause before reading the log and checkpoint directory.
sleep 10

# Show the first 10 comma-separated fields of the last eval_loss line.
echo ""
echo "[INFO] Final training stats:"
# Purely informational: under `set -e` + `pipefail` a log without any
# eval_loss line (grep exits 1), or `head` closing the pipe early, would
# otherwise terminate the watchdog before the evaluation ever starts.
grep "eval_loss" "${TRAIN_LOG}" | tail -1 | tr ',' '\n' | head -10 || true
echo ""
|
|
| |
# Pick the checkpoint-<step> directory with the highest step number.
#
# A glob loop with an explicit numeric comparison is used instead of
# `ls | sort -t- -k2 -n`: PROJECT_ROOT itself contains a '-', so the sort key
# was not the step number and ordering fell back to lexical (checkpoint-999
# ranked above checkpoint-1000). A failing `ls` also aborted the script under
# `pipefail` before the error message below could be printed.
LATEST_CKPT=""
latest_step=-1
for ckpt_dir in "${PROJECT_ROOT}/checkpoints/korean_3b_orpo_v1"/checkpoint-*; do
  # An unmatched glob stays literal; skip it and anything that does not exist.
  [[ -e "${ckpt_dir}" ]] || continue

  ckpt_step="${ckpt_dir##*checkpoint-}"
  if [[ "${ckpt_step}" =~ ^[0-9]+$ ]]; then
    # Force base 10 so a zero-padded step is not parsed as octal.
    ckpt_step=$((10#${ckpt_step}))
  else
    # Non-numeric suffix: only used when nothing else has been seen yet.
    ckpt_step=-1
  fi

  if [[ -z "${LATEST_CKPT}" ]] || ((ckpt_step > latest_step)); then
    LATEST_CKPT="${ckpt_dir}"
    latest_step=${ckpt_step}
  fi
done
echo "[INFO] Latest checkpoint: ${LATEST_CKPT}"

if [[ -z "${LATEST_CKPT}" ]]; then
  echo "[ERROR] No checkpoint found. Cannot proceed with evaluation."
  exit 1
fi
|
|
| |
# Telegram notification: training finished, evaluation is about to start.
# Sent only when TELEGRAM_BOT_TOKEN and TELEGRAM_CHAT_ID are both set.
# The checkpoint name travels through the environment and the Python source
# is a quoted here-document, so the shell never interpolates into the code.
# Best-effort on purpose: stderr is discarded and any failure is ignored so a
# notification problem cannot stop the evaluation.
CKPT_NAME="${LATEST_CKPT##*/}" python3 - 2>/dev/null <<'PY' || true
import os, urllib.request, urllib.parse
token = os.environ.get('TELEGRAM_BOT_TOKEN', '')
chat_id = os.environ.get('TELEGRAM_CHAT_ID', '')
if token and chat_id:
    ckpt_name = os.environ.get('CKPT_NAME', '')
    msg = '🎉 ORPO 학습 완료! 자동 평가 시작합니다.\nCheckpoint: ' + ckpt_name
    url = f'https://api.telegram.org/bot{token}/sendMessage'
    data = urllib.parse.urlencode({'chat_id': chat_id, 'text': msg}).encode()
    urllib.request.urlopen(url, data, timeout=10)
    print('[INFO] Telegram notification sent.')
else:
    print('[INFO] Telegram not configured, skipping notification.')
PY
|
|
| |
| |
| |
# Run the evaluation pipeline on the selected checkpoint, mirroring its
# combined stdout/stderr into eval.log.
echo ""
echo "=============================================="
echo " Starting ORPO Evaluation Pipeline"
echo " Time: $(date '+%Y-%m-%d %H:%M:%S')"
echo "=============================================="

cd "${PROJECT_ROOT}"

# errexit is suspended around the pipeline: with `set -e` + `pipefail` a
# failing evaluation would end the script on the spot, and the exit code
# would never be captured or reported. PIPESTATUS[0] is the status of
# python3 itself, not of tee.
set +e
python3 eval/orpo_eval_pipeline.py \
  --checkpoint "${LATEST_CKPT}" \
  2>&1 | tee -a checkpoints/korean_3b_orpo_v1/eval.log
EVAL_EXIT=${PIPESTATUS[0]}
set -e
|
|
# Print the evaluation summary, send the final Telegram notification and
# exit with the evaluation's exit code.
echo ""
echo "=============================================="
echo " Evaluation Complete"
echo " Exit code: ${EVAL_EXIT}"
echo " Time: $(date '+%Y-%m-%d %H:%M:%S')"
echo "=============================================="

# Sent only when TELEGRAM_BOT_TOKEN and TELEGRAM_CHAT_ID are both set.
# The exit code is handed over through the environment and the Python source
# is a quoted here-document, so no shell value is spliced into the code.
# Best-effort on purpose: stderr is discarded and any failure is ignored so
# the script still exits with the evaluation's status.
EVAL_EXIT_CODE="${EVAL_EXIT}" python3 - 2>/dev/null <<'PY' || true
import os, urllib.request, urllib.parse
token = os.environ.get('TELEGRAM_BOT_TOKEN', '')
chat_id = os.environ.get('TELEGRAM_CHAT_ID', '')
if token and chat_id:
    exit_code = int(os.environ['EVAL_EXIT_CODE'])
    status = '✅ 성공' if exit_code == 0 else '❌ 실패'
    msg = f'ORPO 평가 완료: {status}\nExit code: {exit_code}\n보고서: reports/ 확인'
    url = f'https://api.telegram.org/bot{token}/sendMessage'
    data = urllib.parse.urlencode({'chat_id': chat_id, 'text': msg}).encode()
    urllib.request.urlopen(url, data, timeout=10)
PY

exit "${EVAL_EXIT}"
|
|