| """
|
| Variance-based One-Shot Question Selection
|
| ===========================================
|
| 对 SFT 模型在 1533 道开放题上各跑 10 次推理,
|
| 用精确字符串匹配判断对错,选出方差最大的题目。
|
|
|
| 用法:
|
| docker exec rl4phyx_env python3 /workspace/rl4phyx/RL4Phyx/SFT/variance_select.py
|
|
|
| 输出:
|
| - variance_results.json: 每道题的正确率和方差
|
| - best_question_for_rlvr.json: 方差最大的题目信息
|
| - rlvr_train.parquet: 转好的训练数据 (1题 × 128行)
|
| """
|
|
|
| import json
|
| import re
|
| import os
|
| import torch
|
| import numpy as np
|
| from pathlib import Path
|
|
|
|
|
# Paths: merged SFT checkpoint, evaluation JSONL, question images, outputs.
MODEL_PATH = "/workspace/rl4phyx/RL4Phyx/SFT/checkpoints/sft_qwen25vl_3b/merged"
TEST_FILE = "/workspace/rl4phyx/RL4Phyx/SFT/sft_test.jsonl"
IMAGE_DIR = "/workspace/rl4phyx/RL4Phyx/SFT/images"
OUTPUT_DIR = "/workspace/rl4phyx/RL4Phyx/SFT"

# Inference / selection hyper-parameters.
NUM_RUNS = 10          # stochastic inference passes per question
MAX_NEW_TOKENS = 1024  # generation cap per completion
TEMPERATURE = 0.7      # sampling temperature (used with top_p=0.9 in generate)
BATCH_SIZE = 4         # NOTE(review): defined but never used in this file
RLVR_COPIES = 128      # number of duplicated rows for the selected question
|
|
|
|
|
def extract_boxed(text):
    """Extract the answer inside the last \\boxed{...} of a model output.

    Supports one level of nested braces inside the box (e.g. \\frac{1}{2}).
    Returns None when no boxed expression is present.
    """
    boxed = re.compile(r'\\boxed\{([^{}]*(?:\{[^{}]*\}[^{}]*)*)\}')
    found = boxed.findall(text)
    if not found:
        return None
    # The final \boxed{} is taken as the model's definitive answer.
    return found[-1].strip()
|
|
|
|
|
def normalize_answer(ans):
    """Lightly normalize an answer string before comparison.

    Strips surrounding whitespace, drops trailing periods, unwraps LaTeX
    \\text{...} markup, and collapses whitespace runs into single spaces.
    None passes through unchanged.
    """
    if ans is None:
        return None
    cleaned = ans.strip().rstrip('.')
    # Unwrap \text{...} so textual answers compare by their content only.
    cleaned = re.sub(r'\\text\{([^}]*)\}', r'\1', cleaned)
    return re.sub(r'\s+', ' ', cleaned)
|
|
|
|
|
def exact_match(pred_answer, gt_answer):
    """Return True iff prediction equals ground truth after normalization.

    A missing prediction (None) never matches.
    """
    if pred_answer is None:
        return False
    pred = normalize_answer(pred_answer)
    gt = normalize_answer(gt_answer)
    return pred is not None and gt is not None and pred == gt
|
|
|
|
|
def load_test_data(test_file):
    """Load test samples from a JSONL file (one JSON object per line)."""
    with open(test_file, 'r', encoding='utf-8') as f:
        data = [json.loads(line.strip()) for line in f]
    print(f"Loaded {len(data)} test samples")
    return data
|
|
|
|
|
def run_inference(model, processor, test_data, run_id):
    """Run one stochastic inference pass over all questions.

    For each item, build a chat prompt (image attached only if the file
    exists on disk), sample one completion from the model, extract the
    \\boxed{} answer and score it with exact string matching.

    Args:
        model: loaded Qwen2.5-VL model, already on device and in eval mode.
        processor: the matching AutoProcessor.
        test_data: list of dicts with 'prompt', 'index', 'ground_truth'.
        run_id: zero-based index of this pass (for logging only).

    Returns:
        A list (aligned with test_data) of dicts with keys
        'index', 'pred_answer', 'gt_answer', 'correct'.
    """
    # Hoisted: the original imported this inside the per-item loop,
    # re-running the import machinery on every question.
    from qwen_vl_utils import process_vision_info

    print(f"\n{'='*60}")
    print(f" Run {run_id + 1}/{NUM_RUNS}")
    print(f"{'='*60}")

    results = []

    for i, item in enumerate(test_data):
        if i % 10 == 0:
            print(f" Processing {i}/{len(test_data)}...", flush=True)

        prompt_text = item['prompt']
        image_path = os.path.join(IMAGE_DIR, f"{item['index']}.png")

        # Some questions are text-only; attach the image only when present.
        messages = [{"role": "user", "content": []}]
        if os.path.exists(image_path):
            messages[0]["content"].append({
                "type": "image",
                "image": f"file://{image_path}"
            })
        messages[0]["content"].append({
            "type": "text",
            "text": prompt_text
        })

        text = processor.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        )
        image_inputs, video_inputs = process_vision_info(messages)

        inputs = processor(
            text=[text],
            images=image_inputs,
            videos=video_inputs,
            padding=True,
            return_tensors="pt"
        ).to(model.device)

        # Stochastic sampling: repeated runs can disagree, which is exactly
        # the signal the variance selection exploits.
        with torch.no_grad():
            output_ids = model.generate(
                **inputs,
                max_new_tokens=MAX_NEW_TOKENS,
                temperature=TEMPERATURE,
                do_sample=True,
                top_p=0.9,
            )

        # Decode only the newly generated tokens, not the echoed prompt.
        input_len = inputs['input_ids'].shape[1]
        generated = output_ids[0][input_len:]
        prediction = processor.decode(generated, skip_special_tokens=True)

        pred_answer = extract_boxed(prediction)
        gt_answer = item['ground_truth']
        is_correct = exact_match(pred_answer, gt_answer)

        results.append({
            'index': item['index'],
            'pred_answer': pred_answer,
            'gt_answer': gt_answer,
            'correct': is_correct,
        })

    correct_count = sum(1 for r in results if r['correct'])
    if results:  # guard: avoid ZeroDivisionError on an empty test set
        print(f" Run {run_id + 1} accuracy: {correct_count}/{len(results)} "
              f"({100*correct_count/len(results):.1f}%)")

    return results
|
|
|
|
|
def compute_variance(all_runs, test_data):
    """Compute per-question accuracy and Bernoulli variance across runs.

    Generalized to derive the run count from ``len(all_runs)`` instead of
    the module-level NUM_RUNS constant, so the function works for any
    number of completed passes (identical behavior when all runs finished).

    Args:
        all_runs: list of per-run result lists from run_inference;
            all_runs[r][q] corresponds to test_data[q].
        test_data: test items, aligned by position with each run's results.

    Returns:
        List of per-question stat dicts sorted by variance, descending.
        Variance is p*(1-p) with p the empirical accuracy — maximal at
        p = 0.5, i.e. the most unstable questions rank first.
    """
    n_runs = len(all_runs)
    stats = []

    for qi, item in enumerate(test_data):
        correct_flags = [all_runs[r][qi]['correct'] for r in range(n_runs)]
        n_correct = sum(correct_flags)
        p = n_correct / n_runs
        stats.append({
            'index': item['index'],
            'category': item.get('category', ''),
            'ground_truth': item['ground_truth'],
            'n_correct': n_correct,
            'accuracy': p,
            'variance': p * (1 - p),
            'correct_flags': correct_flags,
            'pred_answers': [all_runs[r][qi]['pred_answer'] for r in range(n_runs)],
        })

    # Highest-variance (most unstable) questions first.
    stats.sort(key=lambda x: x['variance'], reverse=True)
    return stats
|
|
|
|
|
def convert_to_training_format(question_item, copies=RLVR_COPIES):
    """Convert the selected question into the RLVR training parquet format.

    The single question is replicated ``copies`` times, one row per copy,
    so the RL trainer sees it as a full (if degenerate) dataset.
    """
    import pandas as pd

    # One canonical row; every copy shares the same field values.
    row = {
        'prompt': [{"role": "user", "content": question_item['prompt']}],
        'answer': question_item['ground_truth'],
        'image_path': f"{question_item['index']}.png",
        'data_source': 'deepscaler',
        'category': question_item.get('category', 'Physics'),
        'index': question_item['index'],
    }
    return pd.DataFrame([dict(row) for _ in range(copies)])
|
|
|
|
|
def main():
    """End-to-end pipeline: load the SFT model, run NUM_RUNS stochastic
    passes, rank questions by variance, and emit the RLVR artifacts.
    """
    print("=" * 60)
    print(" Variance-based One-Shot Question Selection")
    print("=" * 60)

    # ---- [1/4] model ------------------------------------------------------
    print("\n[1/4] Loading SFT model...")
    from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor

    # NOTE(review): torch is already imported at module level; this only
    # restricts device visibility if no CUDA context exists yet — which
    # holds here because nothing has touched the GPU before this line.
    # (Removed the redundant local `import os`; os is imported at the top.)
    os.environ["CUDA_VISIBLE_DEVICES"] = "0"

    model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
        MODEL_PATH,
        torch_dtype=torch.bfloat16,
    ).to("cuda")
    processor = AutoProcessor.from_pretrained(MODEL_PATH)
    model.eval()
    print(f" Model loaded: {sum(p.numel() for p in model.parameters())/1e6:.0f}M params")

    # ---- [2/4] data -------------------------------------------------------
    print("\n[2/4] Loading test data...")
    test_data = load_test_data(TEST_FILE)

    # ---- [3/4] repeated stochastic inference ------------------------------
    print(f"\n[3/4] Running {NUM_RUNS} inference passes on {len(test_data)} questions...")
    all_runs = []
    for run_id in range(NUM_RUNS):
        run_results = run_inference(model, processor, test_data, run_id)
        all_runs.append(run_results)

        # Persist each pass immediately so a mid-run crash loses nothing.
        interim_path = os.path.join(OUTPUT_DIR, f"variance_run_{run_id}.json")
        with open(interim_path, 'w', encoding='utf-8') as f:
            json.dump(run_results, f, ensure_ascii=False, indent=2)
        print(f" Saved interim results to {interim_path}")

    # ---- [4/4] variance ranking and selection -----------------------------
    print(f"\n[4/4] Computing variance and selecting best question...")
    stats = compute_variance(all_runs, test_data)

    stats_path = os.path.join(OUTPUT_DIR, "variance_results.json")
    with open(stats_path, 'w', encoding='utf-8') as f:
        json.dump(stats, f, ensure_ascii=False, indent=2)
    print(f" Saved all variance stats to {stats_path}")

    # Guard: an empty test set would otherwise crash on stats[0] below.
    if not stats:
        print(" No questions found; nothing to select.")
        return

    print(f"\n{'='*60}")
    print(f" TOP 20 HIGHEST VARIANCE QUESTIONS")
    print(f"{'='*60}")
    for i, s in enumerate(stats[:20]):
        print(f" #{i+1}: idx={s['index']} | gt={s['ground_truth'][:30]:30s} | "
              f"correct={s['n_correct']}/{NUM_RUNS} | var={s['variance']:.4f} | "
              f"cat={s['category']}")
        print(f" preds: {s['pred_answers'][:5]}")

    best = stats[0]
    print(f"\n{'='*60}")
    print(f" SELECTED QUESTION FOR ONE-SHOT RLVR")
    print(f"{'='*60}")
    print(f" Index: {best['index']}")
    print(f" Category: {best['category']}")
    print(f" Ground Truth: {best['ground_truth']}")
    print(f" Accuracy: {best['n_correct']}/{NUM_RUNS} ({best['accuracy']*100:.0f}%)")
    print(f" Variance: {best['variance']:.4f}")
    print(f" Pred Answers: {best['pred_answers']}")

    # Recover the full original item for the selected question.
    best_idx = int(best['index'])
    best_item = next(
        (item for item in test_data if int(item['index']) == best_idx), None
    )

    best_path = os.path.join(OUTPUT_DIR, "best_question_for_rlvr.json")
    with open(best_path, 'w', encoding='utf-8') as f:
        json.dump({
            'selected_question': best_item,
            'stats': best,
        }, f, ensure_ascii=False, indent=2)
    print(f" Saved best question to {best_path}")

    if best_item:
        df = convert_to_training_format(best_item)
        parquet_path = os.path.join(OUTPUT_DIR, "rlvr_train.parquet")
        df.to_parquet(parquet_path, index=False)
        print(f" Saved training parquet ({len(df)} rows) to {parquet_path}")

    print(f"\n{'='*60}")
    print(f" DONE!")
    print(f"{'='*60}")
|
|
|
# Script entry point: run the full selection pipeline.
if __name__ == "__main__":
    main()
|
|
|