File size: 3,122 Bytes

af9853e

import os
import time
import json
import glob
import pandas as pd
from datetime import datetime

def get_latest_checkpoint(checkpoint_dir):
    """Return the most recently modified ``checkpoint-*`` directory.

    Args:
        checkpoint_dir: Directory that contains the ``checkpoint-*`` folders.
            Treated as a literal path, so glob metacharacters in it
            (``[``, ``*``, ``?``) are escaped rather than interpreted.

    Returns:
        The path of the checkpoint directory with the newest modification
        time, or ``None`` when no checkpoint directory exists.
    """
    pattern = os.path.join(glob.escape(checkpoint_dir), "checkpoint-*")

    candidates = []
    for path in glob.glob(pattern):
        # Only folders are checkpoints; skip stray files such as
        # "checkpoint-notes.txt" that happen to match the pattern.
        if not os.path.isdir(path):
            continue
        try:
            candidates.append((os.path.getmtime(path), path))
        except OSError:
            # The directory vanished between listing and stat (e.g. it was
            # deleted while we were polling); ignore it instead of crashing.
            continue

    if not candidates:
        return None

    # Stable sort on mtime only: newest ends up last.
    candidates.sort(key=lambda item: item[0])
    return candidates[-1][1]

def read_metrics(checkpoint_path):
    """Load the ``log_history`` list from a checkpoint's ``trainer_state.json``.

    Args:
        checkpoint_path: Path of a checkpoint directory.

    Returns:
        The value stored under ``"log_history"`` (an empty list when the key
        is absent), or ``None`` when the state file is missing, unreadable,
        not valid JSON, or its top-level value is not a JSON object.
    """
    state_file = os.path.join(checkpoint_path, "trainer_state.json")

    try:
        with open(state_file, "r", encoding="utf-8") as f:
            data = json.load(f)
    except (OSError, ValueError):
        # OSError: file missing or unreadable.
        # ValueError: covers json.JSONDecodeError (e.g. a partially written
        # file) and UnicodeDecodeError. KeyboardInterrupt is deliberately
        # NOT caught so Ctrl+C still stops the caller.
        return None

    if not isinstance(data, dict):
        return None
    return data.get("log_history", [])

def _format_metric(value, spec):
    """Format a numeric metric with ``spec``; return ``"N/A"`` for anything else."""
    if isinstance(value, (int, float)):
        return format(value, spec)
    return "N/A"


def _latest_records(logs):
    """Return ``(train_record, eval_record)``: the newest entries holding 'loss' / 'eval_accuracy'."""
    train_record = None
    eval_record = None
    # log_history mixes training-loss entries with eval-metric entries, so
    # walk it backwards and keep the first hit of each kind.
    for entry in reversed(logs):
        if eval_record is None and 'eval_accuracy' in entry:
            eval_record = entry
        if train_record is None and 'loss' in entry:
            train_record = entry
        if eval_record and train_record:
            break
    return train_record, eval_record


def monitor(checkpoint_dir="checkpoints", poll_interval=10):
    """Poll a checkpoint directory forever and print the newest metrics.

    On every poll the newest checkpoint is located, its log history is read,
    and a report is printed whenever the ``step`` of the last log entry
    differs from the one reported previously. Metrics that are missing or
    non-numeric are printed as ``N/A``.

    Args:
        checkpoint_dir: Directory containing the ``checkpoint-*`` folders.
        poll_interval: Seconds to sleep between polls (default 10).

    This function never returns normally; ``KeyboardInterrupt`` (Ctrl+C)
    propagates to the caller.
    """
    print(f"👀 开始监视训练目录: {checkpoint_dir}")
    print("按 Ctrl+C 退出监视")
    print("-" * 50)

    last_step = -1

    while True:
        latest_ckpt = get_latest_checkpoint(checkpoint_dir)
        logs = read_metrics(latest_ckpt) if latest_ckpt else None

        if logs:
            current_step = logs[-1].get('step', 0)

            # Only report when the log history has advanced.
            if current_step != last_step:
                timestamp = datetime.now().strftime("%H:%M:%S")
                folder_name = os.path.basename(latest_ckpt)
                train_record, eval_record = _latest_records(logs)

                print(f"[{timestamp}] 最新检查点: {folder_name}")
                if train_record:
                    loss = _format_metric(train_record.get('loss'), ".4f")
                    epoch = _format_metric(train_record.get('epoch'), ".2f")
                    print(f"   📉 Training Loss: {loss} (Epoch {epoch})")
                if eval_record:
                    accuracy = _format_metric(eval_record.get('eval_accuracy'), ".4f")
                    f1 = _format_metric(eval_record.get('eval_f1'), ".4f")
                    print(f"   ✅ Eval Accuracy: {accuracy}")
                    print(f"   ✅ Eval F1 Score: {f1}")
                print("-" * 50)

                last_step = current_step

        time.sleep(poll_interval)

if __name__ == "__main__":
    # Prefer the checkpoint directory declared in the project's config
    # module; fall back to the default when it cannot be loaded.
    try:
        from config import Config
        ckpt_dir = Config.CHECKPOINT_DIR
    except Exception:
        # Best-effort fallback for any failure while importing/reading the
        # config. `Exception` (not a bare except) lets KeyboardInterrupt and
        # SystemExit propagate.
        ckpt_dir = "checkpoints"

    monitor(ckpt_dir)