YUNTA88
/

rl4phyx-backup

Safetensors

Model card Files Files and versions

xet

Community

YUNTA88 committed on Mar 26

Commit

d491800

verified ·

1 Parent(s): 3eee49d

Upload scripts/variance_select.py with huggingface_hub

Browse files

Files changed (1) hide show

scripts/variance_select.py +312 -0

scripts/variance_select.py ADDED Viewed

	@@ -0,0 +1,312 @@

+"""
+Variance-based One-Shot Question Selection
+===========================================
+对 SFT 模型在 1533 道开放题上各跑 10 次推理，
+用精确字符串匹配判断对错，选出方差最大的题目。
+用法:
+  docker exec rl4phyx_env python3 /workspace/rl4phyx/RL4Phyx/SFT/variance_select.py
+输出:
+  - variance_results.json: 每道题的正确率和方差
+  - best_question_for_rlvr.json: 方差最大的题目信息
+  - rlvr_train.parquet: 转好的训练数据 (1题 × 128行)
+"""
+import json
+import re
+import os
+import torch
+import numpy as np
+from pathlib import Path
+# ============ Configuration ============
+# Filesystem locations: merged SFT checkpoint, JSONL test set, per-question
+# images (looked up as "<index>.png"), and the directory that receives outputs.
+MODEL_PATH = "/workspace/rl4phyx/RL4Phyx/SFT/checkpoints/sft_qwen25vl_3b/merged"
+TEST_FILE = "/workspace/rl4phyx/RL4Phyx/SFT/sft_test.jsonl"
+IMAGE_DIR = "/workspace/rl4phyx/RL4Phyx/SFT/images"
+OUTPUT_DIR = "/workspace/rl4phyx/RL4Phyx/SFT"
+NUM_RUNS = 10          # number of inference passes per question
+MAX_NEW_TOKENS = 1024  # generation length cap (kept short for speed; only the boxed answer is extracted)
+TEMPERATURE = 0.7      # sampling temperature
+BATCH_SIZE = 4         # inference batch size (tune to GPU memory); NOTE(review): not referenced in the code visible here
+RLVR_COPIES = 128      # how many times the selected question is replicated for one-shot training
def extract_boxed(text):
    """Return the content of the last balanced ``\\boxed{...}`` in *text*.

    Braces are matched by depth counting rather than by a fixed-depth regex,
    so arbitrarily nested LaTeX such as ``\\boxed{\\frac{a}{\\sqrt{b}}}`` is
    captured in full.  A ``\\boxed{`` whose braces never balance (for example
    output that was cut off mid-answer) is skipped, and the scan resumes right
    after that marker so later complete answers are still found.

    Args:
        text: Raw model output to search.

    Returns:
        The whitespace-stripped content of the last balanced ``\\boxed{...}``,
        or ``None`` when *text* is empty or has no balanced occurrence.
    """
    if not text:
        return None

    marker = '\\boxed{'
    last_content = None
    pos = text.find(marker)
    while pos != -1:
        start = pos + len(marker)
        depth = 1
        end = -1
        for i in range(start, len(text)):
            ch = text[i]
            if ch == '{':
                depth += 1
            elif ch == '}':
                depth -= 1
                if depth == 0:
                    end = i
                    break
        if end == -1:
            # Unbalanced marker: retry from the next character so a complete
            # \boxed{...} nested inside the broken one is not lost.
            pos = text.find(marker, pos + 1)
        else:
            last_content = text[start:end]
            # Matches are non-overlapping, like re.findall.
            pos = text.find(marker, end + 1)

    if last_content is None:
        return None
    return last_content.strip()
def normalize_answer(ans):
    """Lightly normalise an answer for exact string comparison.

    The steps, in order: coerce to ``str``, strip surrounding whitespace,
    drop trailing periods, unwrap ``\\text{...}`` wrappers, collapse every
    whitespace run to a single space, and strip once more.  The final strip
    removes whitespace exposed by the earlier steps (e.g. ``"5 ."`` or
    ``"\\text{ yes }"``), which would otherwise defeat the exact match.

    Args:
        ans: The answer to normalise, or ``None``.  Non-string values (such
            as a numeric ground truth decoded from JSON) are converted with
            ``str``.

    Returns:
        The normalised string, or ``None`` when *ans* is ``None``.
    """
    if ans is None:
        return None
    ans = str(ans).strip().rstrip('.')
    # Unwrap \text{...}
    ans = re.sub(r'\\text\{([^}]*)\}', r'\1', ans)
    # Collapse whitespace runs, then trim what the steps above left behind
    ans = re.sub(r'\s+', ' ', ans)
    return ans.strip()
def exact_match(pred_answer, gt_answer):
    """Tell whether a prediction equals the reference after normalisation.

    Both answers go through ``normalize_answer`` and are then compared as
    plain strings.

    Args:
        pred_answer: Extracted model answer, or ``None`` if none was found.
        gt_answer: Reference answer.

    Returns:
        ``True`` only when both answers normalise to the same non-``None``
        string; ``False`` otherwise.
    """
    # A missing prediction can never be correct.
    if pred_answer is None:
        return False

    candidate = normalize_answer(pred_answer)
    reference = normalize_answer(gt_answer)
    both_present = candidate is not None and reference is not None
    return both_present and candidate == reference
def load_test_data(test_file):
    """Load the test set from a JSON Lines file.

    Each non-blank line is decoded as one JSON record.  Blank or
    whitespace-only lines (such as a trailing newline at the end of the
    file) are skipped instead of aborting the load.

    Args:
        test_file: Path to the ``.jsonl`` file, read as UTF-8.

    Returns:
        A list with the decoded records in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON; the
            message names the file and the 1-based line number.
    """
    data = []
    with open(test_file, 'r', encoding='utf-8') as f:
        for line_no, raw_line in enumerate(f, start=1):
            line = raw_line.strip()
            if not line:
                continue
            try:
                data.append(json.loads(line))
            except json.JSONDecodeError as err:
                # Same exception type as before, now with file/line context.
                raise json.JSONDecodeError(
                    f"{err.msg} ({test_file}, line {line_no})", err.doc, err.pos
                ) from err
    print(f"Loaded {len(data)} test samples")
    return data
def run_inference(model, processor, test_data, run_id):
    """Run one sampled generation pass over every question and grade it.

    For each item a single-turn user message is built from the prompt text
    and, when ``IMAGE_DIR/<index>.png`` exists on disk, that image.  One
    completion is sampled per question (``do_sample=True`` with
    ``TEMPERATURE`` and ``top_p=0.9``), the last ``\\boxed{}`` answer is
    extracted and compared to the ground truth with ``exact_match``.
    Questions are processed one at a time.

    Args:
        model: Generation model; ``model.device`` and ``model.generate`` are
            used.
        processor: Matching processor; ``apply_chat_template``, ``__call__``
            and ``decode`` are used.
        test_data: Sequence of dicts providing ``'prompt'``, ``'index'`` and
            ``'ground_truth'``.
        run_id: Zero-based pass number, used only in progress output.

    Returns:
        A list, in ``test_data`` order, of dicts with the keys ``'index'``,
        ``'pred_answer'``, ``'gt_answer'`` and ``'correct'``.
    """
    # Imported lazily because it is only needed for inference, but once per
    # pass rather than once per question.
    from qwen_vl_utils import process_vision_info

    print(f"\n{'='*60}")
    print(f"  Run {run_id + 1}/{NUM_RUNS}")
    print(f"{'='*60}")

    results = []
    for i, item in enumerate(test_data):
        if i % 10 == 0:
            print(f"  Processing {i}/{len(test_data)}...", flush=True)

        prompt_text = item['prompt']
        image_path = os.path.join(IMAGE_DIR, f"{item['index']}.png")

        # Build the message: optional image first, then the prompt text.
        content = []
        if os.path.exists(image_path):
            content.append({
                "type": "image",
                "image": f"file://{image_path}"
            })
        content.append({
            "type": "text",
            "text": prompt_text
        })
        messages = [{"role": "user", "content": content}]

        # Prepare model inputs
        text = processor.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        )
        image_inputs, video_inputs = process_vision_info(messages)
        inputs = processor(
            text=[text],
            images=image_inputs,
            videos=video_inputs,
            padding=True,
            return_tensors="pt"
        ).to(model.device)

        # Generate
        with torch.no_grad():
            output_ids = model.generate(
                **inputs,
                max_new_tokens=MAX_NEW_TOKENS,
                temperature=TEMPERATURE,
                do_sample=True,
                top_p=0.9,
            )

        # Decode only the tokens after the prompt
        input_len = inputs['input_ids'].shape[1]
        generated = output_ids[0][input_len:]
        prediction = processor.decode(generated, skip_special_tokens=True)

        # Extract and grade the answer
        pred_answer = extract_boxed(prediction)
        gt_answer = item['ground_truth']
        results.append({
            'index': item['index'],
            'pred_answer': pred_answer,
            'gt_answer': gt_answer,
            'correct': exact_match(pred_answer, gt_answer),
        })

    correct_count = sum(1 for r in results if r['correct'])
    # Guard against an empty test set, which would otherwise divide by zero.
    accuracy_pct = 100 * correct_count / len(results) if results else 0.0
    print(f"  Run {run_id + 1} accuracy: {correct_count}/{len(results)} "
          f"({accuracy_pct:.1f}%)")
    return results
def compute_variance(all_runs, test_data):
    """Compute per-question accuracy and Bernoulli variance across runs.

    The number of runs is taken from ``len(all_runs)``, so the function also
    works on a partial or differently sized set of passes.  The variance
    ``p * (1 - p)`` is evaluated as ``k * (n - k) / n**2`` from the integer
    counts, which makes questions with ``k`` and ``n - k`` correct answers
    compare exactly equal instead of differing by floating-point rounding.

    Args:
        all_runs: One list per inference pass; entry ``qi`` of each pass is a
            dict with ``'correct'`` and ``'pred_answer'`` for question ``qi``
            of *test_data*.
        test_data: The questions, each providing ``'index'`` and
            ``'ground_truth'`` and optionally ``'category'``.

    Returns:
        One stats dict per question, sorted by variance in descending order.
        The sort is stable, so equal-variance questions keep their
        *test_data* order.

    Raises:
        ValueError: If *all_runs* is empty.
    """
    n_runs = len(all_runs)
    if n_runs == 0:
        raise ValueError("all_runs must contain at least one inference pass")

    stats = []
    for qi, question in enumerate(test_data):
        # Outcomes of this question in every pass
        per_run = [run[qi] for run in all_runs]
        correct_flags = [r['correct'] for r in per_run]
        n_correct = sum(correct_flags)
        stats.append({
            'index': question['index'],
            'category': question.get('category', ''),
            'ground_truth': question['ground_truth'],
            'n_correct': n_correct,
            'accuracy': n_correct / n_runs,
            'variance': n_correct * (n_runs - n_correct) / n_runs ** 2,
            'correct_flags': correct_flags,
            'pred_answers': [r['pred_answer'] for r in per_run],
        })

    # Highest variance first
    stats.sort(key=lambda x: x['variance'], reverse=True)
    return stats
def convert_to_training_format(question_item, copies=RLVR_COPIES):
    """Replicate one question into a training DataFrame.

    Args:
        question_item: Question dict providing ``'prompt'``, ``'index'`` and
            ``'ground_truth'``; ``'category'`` falls back to ``'Physics'``.
        copies: Number of identical rows to produce.

    Returns:
        A pandas DataFrame with *copies* rows and the columns ``prompt``,
        ``answer``, ``image_path``, ``data_source``, ``category`` and
        ``index``.
    """
    import pandas as pd

    question_text = question_item['prompt']
    image_name = f"{question_item['index']}.png"
    # Single-turn chat message list (labelled as the veRL prompt format by
    # the original author); the same list object is used in every row.
    chat = [{"role": "user", "content": question_text}]

    rows = [
        {
            'prompt': chat,
            'answer': question_item['ground_truth'],
            'image_path': image_name,
            'data_source': 'deepscaler',
            'category': question_item.get('category', 'Physics'),
            'index': question_item['index'],
        }
        for _ in range(copies)
    ]
    return pd.DataFrame(rows)
def main():
    """Run the whole selection pipeline end to end.

    Steps: load the SFT model and processor from ``MODEL_PATH``, load the
    test set from ``TEST_FILE``, run ``NUM_RUNS`` sampled inference passes,
    rank questions by the variance of their correctness, and export the
    top-ranked question.

    Files written under ``OUTPUT_DIR`` (created if missing):
      - ``variance_run_<k>.json``: graded results of pass ``k``
      - ``variance_results.json``: per-question stats, highest variance first
      - ``best_question_for_rlvr.json``: the selected question and its stats
      - ``rlvr_train.parquet``: the selected question replicated for training

    Raises:
        SystemExit: If the test file contains no samples.
    """
    print("=" * 60)
    print("  Variance-based One-Shot Question Selection")
    print("=" * 60)

    # 1. Load the model
    print("\n[1/4] Loading SFT model...")
    from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
    # Restrict the process to GPU 0; set before the first .to("cuda") below.
    os.environ["CUDA_VISIBLE_DEVICES"] = "0"
    model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
        MODEL_PATH,
        torch_dtype=torch.bfloat16,
    ).to("cuda")
    processor = AutoProcessor.from_pretrained(MODEL_PATH)
    model.eval()
    print(f"  Model loaded: {sum(p.numel() for p in model.parameters())/1e6:.0f}M params")

    # 2. Load the test data
    print("\n[2/4] Loading test data...")
    test_data = load_test_data(TEST_FILE)
    if not test_data:
        # Without questions there is nothing to rank; stop with a clear
        # message instead of failing later on an empty stats list.
        raise SystemExit(f"No test samples found in {TEST_FILE}; nothing to select.")

    os.makedirs(OUTPUT_DIR, exist_ok=True)

    # 3. Run the inference passes
    print(f"\n[3/4] Running {NUM_RUNS} inference passes on {len(test_data)} questions...")
    all_runs = []
    for run_id in range(NUM_RUNS):
        run_results = run_inference(model, processor, test_data, run_id)
        all_runs.append(run_results)
        # Persist each pass as soon as it finishes
        interim_path = os.path.join(OUTPUT_DIR, f"variance_run_{run_id}.json")
        with open(interim_path, 'w', encoding='utf-8') as f:
            json.dump(run_results, f, ensure_ascii=False, indent=2)
        print(f"  Saved interim results to {interim_path}")

    # 4. Compute variance and pick the question
    print("\n[4/4] Computing variance and selecting best question...")
    stats = compute_variance(all_runs, test_data)

    # Save the full statistics
    stats_path = os.path.join(OUTPUT_DIR, "variance_results.json")
    with open(stats_path, 'w', encoding='utf-8') as f:
        json.dump(stats, f, ensure_ascii=False, indent=2)
    print(f"  Saved all variance stats to {stats_path}")

    # Print the 20 highest-variance questions
    print(f"\n{'='*60}")
    print("  TOP 20 HIGHEST VARIANCE QUESTIONS")
    print(f"{'='*60}")
    for i, s in enumerate(stats[:20]):
        # str() keeps the slice/format valid for non-string ground truths.
        print(f"  #{i+1}: idx={s['index']} | gt={str(s['ground_truth'])[:30]:30s} | "
              f"correct={s['n_correct']}/{NUM_RUNS} | var={s['variance']:.4f} | "
              f"cat={s['category']}")
        print(f"       preds: {s['pred_answers'][:5]}")

    # Select the highest-variance question
    best = stats[0]
    print(f"\n{'='*60}")
    print("  SELECTED QUESTION FOR ONE-SHOT RLVR")
    print(f"{'='*60}")
    print(f"  Index:       {best['index']}")
    print(f"  Category:    {best['category']}")
    print(f"  Ground Truth: {best['ground_truth']}")
    print(f"  Accuracy:    {best['n_correct']}/{NUM_RUNS} ({best['accuracy']*100:.0f}%)")
    print(f"  Variance:    {best['variance']:.4f}")
    print(f"  Pred Answers: {best['pred_answers']}")

    # Look up the full record of the selected question.  The stats index is
    # copied from test_data, so it is compared as-is; no int() conversion
    # that would fail on non-numeric identifiers.
    best_item = next(
        (item for item in test_data if item['index'] == best['index']),
        None,
    )
    best_path = os.path.join(OUTPUT_DIR, "best_question_for_rlvr.json")
    with open(best_path, 'w', encoding='utf-8') as f:
        json.dump({
            'selected_question': best_item,
            'stats': best,
        }, f, ensure_ascii=False, indent=2)
    print(f"  Saved best question to {best_path}")

    # Convert to the training parquet
    if best_item is not None:
        df = convert_to_training_format(best_item)
        parquet_path = os.path.join(OUTPUT_DIR, "rlvr_train.parquet")
        df.to_parquet(parquet_path, index=False)
        print(f"  Saved training parquet ({len(df)} rows) to {parquet_path}")

    print(f"\n{'='*60}")
    print("  DONE!")
    print(f"{'='*60}")


if __name__ == "__main__":
    main()