File size: 10,561 Bytes
d491800
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
"""

Variance-based One-Shot Question Selection

===========================================

对 SFT 模型在 1533 道开放题上各跑 10 次推理,

用精确字符串匹配判断对错,选出方差最大的题目。



用法:

  docker exec rl4phyx_env python3 /workspace/rl4phyx/RL4Phyx/SFT/variance_select.py



输出:

  - variance_results.json: 每道题的正确率和方差

  - best_question_for_rlvr.json: 方差最大的题目信息

  - rlvr_train.parquet: 转好的训练数据 (1题 × 128行)

"""

import json
import re
import os
import torch
import numpy as np
from pathlib import Path

# ============ 配置 ============
MODEL_PATH = "/workspace/rl4phyx/RL4Phyx/SFT/checkpoints/sft_qwen25vl_3b/merged"
TEST_FILE = "/workspace/rl4phyx/RL4Phyx/SFT/sft_test.jsonl"
IMAGE_DIR = "/workspace/rl4phyx/RL4Phyx/SFT/images"
OUTPUT_DIR = "/workspace/rl4phyx/RL4Phyx/SFT"

NUM_RUNS = 10          # 每题推理次数
MAX_NEW_TOKENS = 1024  # 最大生成长度 (缩短加速,只需提取boxed答案)
TEMPERATURE = 0.7      # 采样温度
BATCH_SIZE = 4         # 推理 batch size (根据显存调)
RLVR_COPIES = 128      # one-shot 复制次数


def extract_boxed(text):
    """从模型输出中提取 \\boxed{} 内的答案"""
    # 找最后一个 \boxed{}
    matches = re.findall(r'\\boxed\{([^{}]*(?:\{[^{}]*\}[^{}]*)*)\}', text)
    if matches:
        return matches[-1].strip()
    return None


def normalize_answer(ans):
    """轻度归一化:去空格、去末尾句号"""
    if ans is None:
        return None
    ans = ans.strip()
    ans = ans.rstrip('.')
    # 去掉 \text{} 包裹
    ans = re.sub(r'\\text\{([^}]*)\}', r'\1', ans)
    # 去多余空格
    ans = re.sub(r'\s+', ' ', ans)
    return ans


def exact_match(pred_answer, gt_answer):
    """精确字符串匹配"""
    if pred_answer is None:
        return False
    pred_norm = normalize_answer(pred_answer)
    gt_norm = normalize_answer(gt_answer)
    if pred_norm is None or gt_norm is None:
        return False
    return pred_norm == gt_norm


def load_test_data(test_file):
    """加载测试数据"""
    data = []
    with open(test_file, 'r', encoding='utf-8') as f:
        for line in f:
            r = json.loads(line.strip())
            data.append(r)
    print(f"Loaded {len(data)} test samples")
    return data


def run_inference(model, processor, test_data, run_id):
    """对所有题目跑一次推理"""
    print(f"\n{'='*60}")
    print(f"  Run {run_id + 1}/{NUM_RUNS}")
    print(f"{'='*60}")
    
    results = []
    
    for i, item in enumerate(test_data):
        if i % 10 == 0:
            print(f"  Processing {i}/{len(test_data)}...", flush=True)
        
        prompt_text = item['prompt']
        image_path = os.path.join(IMAGE_DIR, f"{item['index']}.png")
        
        # 构建消息
        messages = [{"role": "user", "content": []}]
        
        # 添加图片(如果存在)
        if os.path.exists(image_path):
            messages[0]["content"].append({
                "type": "image",
                "image": f"file://{image_path}"
            })
        
        messages[0]["content"].append({
            "type": "text",
            "text": prompt_text
        })
        
        # 处理输入
        text = processor.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        )
        
        from qwen_vl_utils import process_vision_info
        image_inputs, video_inputs = process_vision_info(messages)
        
        inputs = processor(
            text=[text],
            images=image_inputs,
            videos=video_inputs,
            padding=True,
            return_tensors="pt"
        ).to(model.device)
        
        # 生成
        with torch.no_grad():
            output_ids = model.generate(
                **inputs,
                max_new_tokens=MAX_NEW_TOKENS,
                temperature=TEMPERATURE,
                do_sample=True,
                top_p=0.9,
            )
        
        # 解码 (只取生成部分)
        input_len = inputs['input_ids'].shape[1]
        generated = output_ids[0][input_len:]
        prediction = processor.decode(generated, skip_special_tokens=True)
        
        # 提取答案
        pred_answer = extract_boxed(prediction)
        gt_answer = item['ground_truth']
        is_correct = exact_match(pred_answer, gt_answer)
        
        results.append({
            'index': item['index'],
            'pred_answer': pred_answer,
            'gt_answer': gt_answer,
            'correct': is_correct,
        })
    
    correct_count = sum(1 for r in results if r['correct'])
    print(f"  Run {run_id + 1} accuracy: {correct_count}/{len(results)} "
          f"({100*correct_count/len(results):.1f}%)")
    
    return results


def compute_variance(all_runs, test_data):
    """计算每道题的正确率方差"""
    n_questions = len(test_data)
    stats = []
    
    for qi in range(n_questions):
        # 收集这道题在 10 次 run 中的对错情况
        correct_flags = [all_runs[run_id][qi]['correct'] for run_id in range(NUM_RUNS)]
        n_correct = sum(correct_flags)
        p = n_correct / NUM_RUNS  # 正确率
        variance = p * (1 - p)     # 伯努利方差
        
        stats.append({
            'index': test_data[qi]['index'],
            'category': test_data[qi].get('category', ''),
            'ground_truth': test_data[qi]['ground_truth'],
            'n_correct': n_correct,
            'accuracy': p,
            'variance': variance,
            'correct_flags': correct_flags,
            'pred_answers': [all_runs[run_id][qi]['pred_answer'] for run_id in range(NUM_RUNS)],
        })
    
    # 按方差降序排列
    stats.sort(key=lambda x: x['variance'], reverse=True)
    
    return stats


def convert_to_training_format(question_item, copies=RLVR_COPIES):
    """将选中的题目转成 RLVR 训练 parquet 格式"""
    import pandas as pd
    
    prompt_text = question_item['prompt']
    image_path = f"{question_item['index']}.png"
    
    # 构建 veRL 格式的 prompt
    prompt_messages = [{"role": "user", "content": prompt_text}]
    
    records = []
    for _ in range(copies):
        records.append({
            'prompt': prompt_messages,
            'answer': question_item['ground_truth'],
            'image_path': image_path,
            'data_source': 'deepscaler',
            'category': question_item.get('category', 'Physics'),
            'index': question_item['index'],
        })
    
    df = pd.DataFrame(records)
    return df


def main():
    print("=" * 60)
    print("  Variance-based One-Shot Question Selection")
    print("=" * 60)
    
    # 1. 加载模型
    print("\n[1/4] Loading SFT model...")
    from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
    
    import os
    os.environ["CUDA_VISIBLE_DEVICES"] = "0"
    
    model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
        MODEL_PATH,
        torch_dtype=torch.bfloat16,
    ).to("cuda")
    processor = AutoProcessor.from_pretrained(MODEL_PATH)
    model.eval()
    print(f"  Model loaded: {sum(p.numel() for p in model.parameters())/1e6:.0f}M params")
    
    # 2. 加载测试数据
    print("\n[2/4] Loading test data...")
    test_data = load_test_data(TEST_FILE)
    
    # 3. 跑 10 次推理
    print(f"\n[3/4] Running {NUM_RUNS} inference passes on {len(test_data)} questions...")
    all_runs = []
    for run_id in range(NUM_RUNS):
        run_results = run_inference(model, processor, test_data, run_id)
        all_runs.append(run_results)
        
        # 每次 run 后保存中间结果
        interim_path = os.path.join(OUTPUT_DIR, f"variance_run_{run_id}.json")
        with open(interim_path, 'w', encoding='utf-8') as f:
            json.dump(run_results, f, ensure_ascii=False, indent=2)
        print(f"  Saved interim results to {interim_path}")
    
    # 4. 计算方差并选题
    print(f"\n[4/4] Computing variance and selecting best question...")
    stats = compute_variance(all_runs, test_data)
    
    # 保存完整统计
    stats_path = os.path.join(OUTPUT_DIR, "variance_results.json")
    with open(stats_path, 'w', encoding='utf-8') as f:
        json.dump(stats, f, ensure_ascii=False, indent=2)
    print(f"  Saved all variance stats to {stats_path}")
    
    # 打印 Top 20 最高方差题目
    print(f"\n{'='*60}")
    print(f"  TOP 20 HIGHEST VARIANCE QUESTIONS")
    print(f"{'='*60}")
    for i, s in enumerate(stats[:20]):
        print(f"  #{i+1}: idx={s['index']} | gt={s['ground_truth'][:30]:30s} | "
              f"correct={s['n_correct']}/{NUM_RUNS} | var={s['variance']:.4f} | "
              f"cat={s['category']}")
        print(f"       preds: {s['pred_answers'][:5]}")
    
    # 选方差最大的题
    best = stats[0]
    print(f"\n{'='*60}")
    print(f"  SELECTED QUESTION FOR ONE-SHOT RLVR")
    print(f"{'='*60}")
    print(f"  Index:       {best['index']}")
    print(f"  Category:    {best['category']}")
    print(f"  Ground Truth: {best['ground_truth']}")
    print(f"  Accuracy:    {best['n_correct']}/{NUM_RUNS} ({best['accuracy']*100:.0f}%)")
    print(f"  Variance:    {best['variance']:.4f}")
    print(f"  Pred Answers: {best['pred_answers']}")
    
    # 保存选中题目
    best_idx = int(best['index'])
    best_item = None
    for item in test_data:
        if int(item['index']) == best_idx:
            best_item = item
            break
    
    best_path = os.path.join(OUTPUT_DIR, "best_question_for_rlvr.json")
    with open(best_path, 'w', encoding='utf-8') as f:
        json.dump({
            'selected_question': best_item,
            'stats': best,
        }, f, ensure_ascii=False, indent=2)
    print(f"  Saved best question to {best_path}")
    
    # 转成训练 parquet
    if best_item:
        df = convert_to_training_format(best_item)
        parquet_path = os.path.join(OUTPUT_DIR, "rlvr_train.parquet")
        df.to_parquet(parquet_path, index=False)
        print(f"  Saved training parquet ({len(df)} rows) to {parquet_path}")
    
    print(f"\n{'='*60}")
    print(f"  DONE!")
    print(f"{'='*60}")


if __name__ == "__main__":
    main()