YUNTA88 commited on
Commit
c6cbaf4
·
verified ·
1 Parent(s): 24f08aa

Upload folder using huggingface_hub

Browse files
Files changed (44) hide show
  1. eval_footprint/__pycache__/eval_openended_inference.cpython-311.pyc +0 -0
  2. eval_footprint/comparison_report.json +97 -0
  3. eval_footprint/convert_openended.py +61 -0
  4. eval_footprint/create_openended_test.py +211 -0
  5. eval_footprint/eval_deepseek_judge.py +382 -0
  6. eval_footprint/eval_fullft_math_nf.py +258 -0
  7. eval_footprint/eval_fullft_math_nf_old.py +259 -0
  8. eval_footprint/eval_fullft_math_nf_old_final.py +258 -0
  9. eval_footprint/eval_fullft_phyx_math_nf.py +258 -0
  10. eval_footprint/eval_fullft_phyx_nf.py +258 -0
  11. eval_footprint/eval_inference_lora_math_f.py +262 -0
  12. eval_footprint/eval_judge_fullft_math_nf.py +293 -0
  13. eval_footprint/eval_judge_fullft_phyx_nf.py +296 -0
  14. eval_footprint/eval_judge_lora_math_f.py +377 -0
  15. eval_footprint/eval_judge_lora_phyx_f.py +340 -0
  16. eval_footprint/eval_lora_phyx_f.py +259 -0
  17. eval_footprint/eval_lora_phyx_f_final.py +258 -0
  18. eval_footprint/eval_openended_inference.py +259 -0
  19. eval_footprint/eval_openended_judge.py +467 -0
  20. eval_footprint/eval_sft_only.py +114 -0
  21. eval_footprint/eval_sft_vs_base_multigpu.py +359 -0
  22. eval_footprint/eval_single_model_template.py +242 -0
  23. eval_footprint/inference_fullft_math_nf.jsonl +0 -0
  24. eval_footprint/inference_fullft_math_nf_old.jsonl +0 -0
  25. eval_footprint/inference_fullft_phyx_math_nf.jsonl +0 -0
  26. eval_footprint/inference_fullft_phyx_nf.jsonl +0 -0
  27. eval_footprint/inference_lora_phyx_f.jsonl +0 -0
  28. eval_footprint/inference_results_base.jsonl +0 -0
  29. eval_footprint/inference_results_lora_math_f.jsonl +0 -0
  30. eval_footprint/inference_results_sft.jsonl +0 -0
  31. eval_footprint/inference_results_sft.jsonl_gpu4.jsonl +0 -0
  32. eval_footprint/inference_results_sft.jsonl_gpu5.jsonl +0 -0
  33. eval_footprint/inference_results_sft.jsonl_gpu6.jsonl +0 -0
  34. eval_footprint/inference_results_sft.jsonl_gpu7.jsonl +0 -0
  35. eval_footprint/report_lora_math_f.json +39 -0
  36. eval_footprint/run_inference_single.py +120 -0
  37. eval_footprint/scored_results_base.jsonl +0 -0
  38. eval_footprint/scored_results_lora_math_f.jsonl +0 -0
  39. eval_footprint/scored_results_sft.jsonl +0 -0
  40. eval_footprint/scored_results_sft_phyx.jsonl +0 -0
  41. eval_footprint/sft_report_phyx.json +32 -0
  42. eval_footprint/simple_eval.py +68 -0
  43. eval_footprint/single_model_eval.py +103 -0
  44. eval_footprint/test_1533_openended.jsonl +0 -0
eval_footprint/__pycache__/eval_openended_inference.cpython-311.pyc ADDED
Binary file (13.9 kB). View file
 
eval_footprint/comparison_report.json ADDED
@@ -0,0 +1,97 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "timestamp": "2026-02-26 04:38:24",
3
+ "scoring_method": "rule-based + Gemini 2.5 Flash judge (parallel)",
4
+ "base": {
5
+ "model": "Base",
6
+ "total": 1533,
7
+ "rule_correct": 48,
8
+ "gemini_rescued": 108,
9
+ "final_correct": 156,
10
+ "final_acc": 10.18,
11
+ "category_stats": {
12
+ "Mechanics": {
13
+ "total": 276,
14
+ "rule_correct": 8,
15
+ "gemini_correct": 33,
16
+ "final_correct": 41
17
+ },
18
+ "Waves/Acoustics": {
19
+ "total": 253,
20
+ "rule_correct": 7,
21
+ "gemini_correct": 20,
22
+ "final_correct": 27
23
+ },
24
+ "Electromagnetism": {
25
+ "total": 275,
26
+ "rule_correct": 7,
27
+ "gemini_correct": 16,
28
+ "final_correct": 23
29
+ },
30
+ "Modern Physics": {
31
+ "total": 222,
32
+ "rule_correct": 6,
33
+ "gemini_correct": 14,
34
+ "final_correct": 20
35
+ },
36
+ "Optics": {
37
+ "total": 252,
38
+ "rule_correct": 16,
39
+ "gemini_correct": 15,
40
+ "final_correct": 31
41
+ },
42
+ "Thermodynamics": {
43
+ "total": 255,
44
+ "rule_correct": 4,
45
+ "gemini_correct": 10,
46
+ "final_correct": 14
47
+ }
48
+ }
49
+ },
50
+ "sft": {
51
+ "model": "SFT",
52
+ "total": 1533,
53
+ "rule_correct": 41,
54
+ "gemini_rescued": 69,
55
+ "final_correct": 110,
56
+ "final_acc": 7.18,
57
+ "category_stats": {
58
+ "Mechanics": {
59
+ "total": 276,
60
+ "rule_correct": 2,
61
+ "gemini_correct": 27,
62
+ "final_correct": 29
63
+ },
64
+ "Waves/Acoustics": {
65
+ "total": 253,
66
+ "rule_correct": 8,
67
+ "gemini_correct": 12,
68
+ "final_correct": 20
69
+ },
70
+ "Electromagnetism": {
71
+ "total": 275,
72
+ "rule_correct": 7,
73
+ "gemini_correct": 8,
74
+ "final_correct": 15
75
+ },
76
+ "Modern Physics": {
77
+ "total": 222,
78
+ "rule_correct": 7,
79
+ "gemini_correct": 10,
80
+ "final_correct": 17
81
+ },
82
+ "Optics": {
83
+ "total": 252,
84
+ "rule_correct": 13,
85
+ "gemini_correct": 6,
86
+ "final_correct": 19
87
+ },
88
+ "Thermodynamics": {
89
+ "total": 255,
90
+ "rule_correct": 4,
91
+ "gemini_correct": 6,
92
+ "final_correct": 10
93
+ }
94
+ }
95
+ },
96
+ "improvement": "-3.00%"
97
+ }
eval_footprint/convert_openended.py ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env python3
"""Convert MCQ test set to open-ended format. Uses pyarrow (not pandas).

Reads the test-set indices from TEST_PARQUET (``extra_info.index``), looks up
the matching records in PHYX_TEST, replaces each letter answer by the text of
the chosen option, and writes one JSON object per line to OUTPUT.
"""
import json
import os
import re
from collections import Counter

import pyarrow.parquet as pq

PHYX_TEST = "/workspace/rl4phyx/RL4Phyx/MetaPhyX/PhyX_test.jsonl"
TEST_PARQUET = "/workspace/rl4phyx/RL4Phyx/SFT/eval_data/test_1533.parquet"
OUTPUT = "/workspace/rl4phyx/RL4Phyx/SFT/sft_eval_footprint/test_1533_openended.jsonl"

os.makedirs(os.path.dirname(OUTPUT), exist_ok=True)

# Get test indices from parquet via pyarrow
table = pq.read_table(TEST_PARQUET)
test_indices = {ei["index"] for ei in table.column("extra_info").to_pylist()}
print(f"Test indices: {len(test_indices)}")

# Read all PhyX samples, keyed by their "index" field.
# Explicit UTF-8: the output is written with ensure_ascii=False, so the
# platform default encoding must not be relied on for either file.
all_samples = {}
with open(PHYX_TEST, encoding="utf-8") as f:
    for line in f:
        if line.strip():
            d = json.loads(line)
            all_samples[d["index"]] = d
print(f"Total PhyX: {len(all_samples)}")

# Fail with a clear message (instead of a bare KeyError) when the parquet
# references samples that are not present in the PhyX file.
missing = sorted(i for i in test_indices if i not in all_samples)
if missing:
    raise SystemExit(
        f"{len(missing)} test indices are missing from {PHYX_TEST} "
        f"(first few: {missing[:5]})"
    )

# Convert
results = []
for idx in sorted(test_indices):
    s = all_samples[idx]
    # "options" may be absent or null; treat both as "no options".
    opts = dict(re.findall(r'([ABCD]):\s*"([^"]*)"', s.get("options") or ""))
    letter = s["answer"].strip().upper()
    # Fall back to the letter itself when the option text cannot be found.
    actual = opts.get(letter, letter)
    results.append({
        "index": idx,
        "category": s.get("category", ""),
        "subfield": s.get("subfield", ""),
        "description": s.get("description", ""),
        "question": s.get("question", ""),
        "image": s.get("image", ""),
        "ground_truth_letter": letter,
        "ground_truth_value": actual,
        "options_original": s.get("options", ""),
        "reasoning_type": s.get("reasoning_type", []),
        "image_caption": s.get("image_caption", ""),
    })

with open(OUTPUT, "w", encoding="utf-8") as f:
    for r in results:
        f.write(json.dumps(r, ensure_ascii=False) + "\n")

print(f"Saved {len(results)} samples to {OUTPUT}")
cats = Counter(r["category"] for r in results)
for c, n in sorted(cats.items(), key=lambda x: -x[1]):
    print(f" {c}: {n}")
if results:  # nothing to show for an empty test set
    ex = results[0]
    print(f"Ex: q={ex['question'][:80]} gt={ex['ground_truth_value']}")
eval_footprint/create_openended_test.py ADDED
@@ -0,0 +1,211 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Convert MCQ test set to open-ended format.
4
+ NO pandas dependency — uses only json, re, os (avoids Docker import issues).
5
+
6
+ Input: PhyX_test.jsonl (3000 questions with MCQ options)
7
+ SFT train indices file (to exclude training samples)
8
+ Output: test_1533_openended.jsonl (open-ended format)
9
+ """
10
+ import json
11
+ import re
12
+ import os
13
+
14
+ # ============ CONFIG ============
15
+ PHYX_TEST = "/workspace/rl4phyx/RL4Phyx/MetaPhyX/PhyX_test.jsonl"
16
+ # The SFT training set indices — we need to exclude these
17
+ SFT_TRAIN_PARQUET_DIR = "/workspace/rl4phyx/RL4Phyx/SFT/data"
18
+ OUTPUT_DIR = "/workspace/rl4phyx/RL4Phyx/SFT/sft_eval_footprint"
19
+ OUTPUT_FILE = os.path.join(OUTPUT_DIR, "test_1533_openended.jsonl")
20
+ TOTAL_PHYX = 3000
21
+ SFT_TRAIN_COUNT = 1467 # 3000 - 1533
22
+ # ================================
23
+
24
+
25
def parse_options(options_str):
    """Parse an MCQ options string into a letter -> value mapping.

    Example input: 'A:"7.55N",B:"5.55N",C:"7.65N",D:"6.65N"'
    Example output: {'A': '7.55N', 'B': '5.55N', 'C': '7.65N', 'D': '6.65N'}

    Only the letters A-D are recognised. If a letter occurs more than once the
    last occurrence wins. An empty or missing (None) string yields an empty
    dict.
    """
    if not options_str:
        return {}
    return dict(re.findall(r'([ABCD]):\s*"([^"]*)"', options_str))
34
+
35
+
36
def get_train_indices(sft_dirs=None):
    """Collect the sample indices found in the SFT training data files.

    Every ``.jsonl`` and ``.json`` file directly inside each directory is
    scanned:

    * ``.jsonl``: each non-blank line is a JSON object; ``extra_info.index``
      is used when present, otherwise the top-level ``index``.
    * ``.json``: the file must hold a JSON list; the top-level ``index`` of
      each object in it is used.

    Args:
        sft_dirs: directories to scan. Defaults to the two SFT data
            directories under /workspace/rl4phyx/RL4Phyx/SFT. Directories
            that do not exist are ignored.

    Returns:
        set of indices. Unparseable lines/files and unhashable index values
        are skipped, and a summary of how many were skipped is printed.
    """
    if sft_dirs is None:
        sft_dirs = [
            "/workspace/rl4phyx/RL4Phyx/SFT/data",
            "/workspace/rl4phyx/RL4Phyx/SFT/sft_data",
        ]

    train_indices = set()
    skipped = 0  # lines / files / values that could not be used

    def _remember(value):
        """Add *value* to the result; return False if it is unhashable."""
        try:
            train_indices.add(value)
        except TypeError:
            return False
        return True

    for sft_dir in sft_dirs:
        if not os.path.isdir(sft_dir):
            continue
        for fname in os.listdir(sft_dir):
            fpath = os.path.join(sft_dir, fname)
            if fname.endswith('.jsonl'):
                with open(fpath, 'r', encoding='utf-8') as f:
                    for line in f:
                        if not line.strip():
                            continue
                        try:
                            data = json.loads(line)
                        except ValueError:  # includes json.JSONDecodeError
                            skipped += 1
                            continue
                        if not isinstance(data, dict):
                            continue
                        extra = data.get('extra_info')
                        if isinstance(extra, dict) and 'index' in extra:
                            ok = _remember(extra['index'])
                        elif 'index' in data:
                            ok = _remember(data['index'])
                        else:
                            ok = True
                        if not ok:
                            skipped += 1
            elif fname.endswith('.json'):
                with open(fpath, 'r', encoding='utf-8') as f:
                    try:
                        data = json.load(f)
                    except ValueError:  # malformed JSON or undecodable bytes
                        skipped += 1
                        continue
                if isinstance(data, list):
                    for item in data:
                        # One malformed item must not discard the rest.
                        if isinstance(item, dict) and 'index' in item:
                            if not _remember(item['index']):
                                skipped += 1

    if skipped:
        print(f"get_train_indices: skipped {skipped} unreadable record(s)/file(s)")
    return train_indices
78
+
79
+
80
def main():
    """Build the open-ended test file from the PhyX MCQ data.

    1. Read every sample from PHYX_TEST.
    2. Determine the SFT training indices: first via get_train_indices(),
       then via a 'split' field on the samples, then via any '*train*.parquet'
       under the SFT directory (best effort, needs pyarrow).
    3. Keep the samples whose index is not a training index (all samples if
       no training index could be determined).
    4. Replace each letter answer by the text of the chosen option.
    5. Print per-category and answer-type statistics.
    6. Write the result to OUTPUT_FILE and copy this script next to it.
    """
    import shutil
    from collections import Counter

    os.makedirs(OUTPUT_DIR, exist_ok=True)

    # Step 1: Read all PhyX_test.jsonl samples
    with open(PHYX_TEST, 'r', encoding='utf-8') as f:
        all_samples = [json.loads(line) for line in f if line.strip()]
    print(f"Total PhyX_test samples: {len(all_samples)}")

    # Step 2: Get training indices to exclude
    train_indices = get_train_indices()
    print(f"Found {len(train_indices)} SFT training indices to exclude")

    # If we couldn't find training indices, try alternate approaches
    if not train_indices:
        print("No training indices found from files, checking for 'split' field...")
        if any('split' in s for s in all_samples):
            test_samples = [s for s in all_samples if s.get('split') == 'test']
            train_samples = [s for s in all_samples if s.get('split') != 'test']
            train_indices = {s['index'] for s in train_samples}
            print(f"Using split field: {len(test_samples)} test, {len(train_samples)} train")
        else:
            # Try reading parquet via pyarrow directly (might work even if pandas broken)
            print("Trying pyarrow directly...")
            try:
                import pyarrow.parquet as pq
                for root, _dirs, files in os.walk("/workspace/rl4phyx/RL4Phyx/SFT"):
                    for f_name in files:
                        if not ('train' in f_name.lower() and f_name.endswith('.parquet')):
                            continue
                        fpath = os.path.join(root, f_name)
                        table = pq.read_table(fpath)
                        if 'extra_info' in table.column_names:
                            for row in table.to_pydict()['extra_info']:
                                if isinstance(row, dict) and 'index' in row:
                                    train_indices.add(row['index'])
                        print(f" Read {fpath}: got {len(train_indices)} indices so far")
            except Exception as e:
                # Deliberately best-effort: pyarrow may be missing or a file
                # may be unreadable; the fallback below still produces output.
                print(f"pyarrow failed: {e}")

    # Step 3: Build test set
    if train_indices:
        test_samples = [s for s in all_samples if s['index'] not in train_indices]
    else:
        # Fallback: use every sample that was read
        print(f"WARNING: Could not identify train/test split. Using all {len(all_samples)} samples.")
        test_samples = all_samples

    print(f"Test samples to convert: {len(test_samples)}")

    # Step 4: Convert to open-ended format
    openended = []
    for sample in test_samples:
        # "options" may be absent or null; treat both as "no options".
        options = parse_options(sample.get('options') or '')
        letter_answer = sample['answer'].strip().upper()
        # Fall back to the letter itself when the option text cannot be found.
        actual_answer = options.get(letter_answer, letter_answer)
        openended.append({
            'index': sample['index'],
            'category': sample.get('category', ''),
            'subfield': sample.get('subfield', ''),
            'description': sample.get('description', ''),
            'question': sample.get('question', ''),
            'image': sample.get('image', ''),
            'ground_truth_letter': letter_answer,
            'ground_truth_value': actual_answer,
            'options_original': sample.get('options', ''),
            'reasoning_type': sample.get('reasoning_type', []),
            'image_caption': sample.get('image_caption', ''),
        })

    # Step 5: Stats
    cats = Counter(e['category'] for e in openended)
    print(f"\nConverted {len(openended)} samples to open-ended format")
    print("Per-category distribution:")
    for cat, cnt in sorted(cats.items(), key=lambda x: -x[1]):
        print(f" {cat}: {cnt}")

    # Answer type analysis: an answer is "numeric-ish" when the characters
    # left after dropping everything but digits, '.', '-', 'e', 'E' parse as
    # a float.
    numeric_count = 0
    text_count = 0
    for e in openended:
        clean = re.sub(r'[^\d.\-eE]', '', e['ground_truth_value'])
        try:
            float(clean)
        except ValueError:
            text_count += 1
        else:
            numeric_count += 1
    print(f"\nAnswer types: {numeric_count} numeric-ish, {text_count} text/symbolic")

    # Step 6: Save
    with open(OUTPUT_FILE, 'w', encoding='utf-8') as f:
        for entry in openended:
            f.write(json.dumps(entry, ensure_ascii=False) + '\n')
    print(f"\nSaved to: {OUTPUT_FILE}")

    # Also copy the conversion script itself
    script_dest = os.path.join(OUTPUT_DIR, "create_openended_test.py")
    try:
        shutil.copy2(__file__, script_dest)
        print(f"Script copied to: {script_dest}")
    except shutil.SameFileError:
        # The script is already being run from OUTPUT_DIR.
        print(f"Script already at: {script_dest}")

    # Show examples (only those that exist)
    for label, position in ((1, 0), (100, 100)):
        if len(openended) > position:
            ex = openended[position]
            print(f"\n=== EXAMPLE {label} ===")
            print(f" index: {ex['index']}")
            print(f" category: {ex['category']}")
            print(f" question: {ex['question']}")
            print(f" ground_truth_value: {ex['ground_truth_value']}")


if __name__ == '__main__':
    main()
eval_footprint/eval_deepseek_judge.py ADDED
@@ -0,0 +1,382 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Score inference results using DeepSeek-V3 as LLM Judge.
4
+ *** EXACTLY aligned with PhyX official evaluation pipeline ***
5
+ (from killthefullmoon/PhyX -> vlmeval/dataset/utils/phyx.py)
6
+
7
+ Pipeline:
8
+ 1. Extract answer from \boxed{} or "final answer:" pattern
9
+ 2. String-level matching
10
+ 3. LLM judge with 5-shot ICE prompt, retry 5 times with increasing temperature
11
+
12
+ Usage:
13
+ python3 eval_deepseek_judge.py
14
+ """
15
+ import json, os, re, time, sys, ast
16
+ from collections import defaultdict
17
+ import urllib.request
18
+ import urllib.error
19
+
20
# ===================== CONFIG =====================
# SECURITY: the API key must not be stored in the source tree. It is read
# from the DEEPSEEK_API_KEY environment variable. The key that was previously
# hard-coded on this line has been published with the repository and should
# be revoked.
DEEPSEEK_API_KEY = os.environ.get("DEEPSEEK_API_KEY", "")
DEEPSEEK_MODEL = "deepseek-chat"  # Official DeepSeek-V3
DEEPSEEK_API_URL = "https://api.deepseek.com/v1/chat/completions"

RESULTS_DIR = "/workspace/rl4phyx/RL4Phyx/SFT/sft_eval_footprint"
BASE_RESULTS = os.path.join(RESULTS_DIR, "inference_results_base.jsonl")
SFT_RESULTS = os.path.join(RESULTS_DIR, "inference_results_sft.jsonl")
OUTPUT_DIR = RESULTS_DIR

# Sentinel returned by call_deepseek() when the request fails.
FAIL_MSG = 'Failed to obtain answer via API.'
# Number of judge attempts per sample.
RETRY = 5
# ==================================================
33
+
34
+
35
+ # ============= PhyX ICE (In-Context Examples) =============
36
+ # Exactly from PhyX source code: get_ICE()
37
+
38
def get_ICE():
    """Return the five in-context examples used in the judge prompt.

    Each example is a text block with a ground-truth answer, a predicted
    answer and the expected judgement (1 = matched, 0 = unmatched). The
    spelling "Judegement" is part of the prompt text and is kept as-is so the
    prompt stays identical to the one the header comment attributes to PhyX.
    """
    example_1 = """
Ground truth answer: 502 \n
Predicted answer: The mass of block (B) is:
[
\\boxed{ 50 \\sqrt{101} }
] \n
Judegement: 1
"""

    example_2 = """
Ground truth answer: 46.3 kN \n
Predicted answer: The tension ( T_B ) in the cable is approximately:
[
\\boxed{46300 }
] \n
Judegement: 1
"""

    example_3 = """
Ground truth answer: 12 m/s \n
Predicted answer: The speed of the box after 2.00 seconds is:
[
\\boxed{11.3, \\text{m/s}}
] \n
Judegement: 0
"""

    example_4 = """
Ground truth answer: 36.00 kg \n
Predicted answer: The mass of the hanging block ( m_2 ) must be approximately:
[
\\boxed{36.1, \\text\\{kg\\}}
] \n
Judegement: 1
"""

    example_5 = """
Ground truth answer: 3.2 m \n
Predicted answer: The stuntman and villain slide approximately \\frac{10}{3.1415} meters**.
Judegement: 1
"""

    return [example_1, example_2, example_3, example_4, example_5]
82
+
83
+
84
+ # ============= PhyX Prompt Builder =============
85
+ # Exactly from PhyX source code: build_phyx_gpt4_prompt()
86
+
87
def build_phyx_gpt4_prompt(gt_answer, pred):
    """Build the few-shot judge prompt for one (ground truth, prediction) pair.

    The prompt is: the task description, every example from get_ICE() (each
    followed by a newline), then the pair to judge, ending with an open
    'Judegement:' line for the model to complete.

    Args:
        gt_answer: ground-truth answer, inserted with str.format.
        pred: predicted answer, inserted with str.format.

    Returns:
        The prompt as a single string.
    """
    task_description = """
Please read the following example. Given predicted answer and ground truth answer,
compare the these two answers, then ONLY output judegement 1/0 for matched/unmatched at the end of the prompt.
If the meaning is expressed in the same way, it is also considered consistent, for example, 0.5m and 50cm.
If the given predicted mentions "approximately", then allow the Approximation Error, \
such as 0.49 and approximately 0.5, 0.81 and approximately 0.8. \n

"""
    prompt = task_description
    examples = get_ICE()
    for example in examples:
        prompt += example + '\n'
    prompt += 'Ground truth answer: {} \n'.format(gt_answer)
    prompt += 'Predicted answer: {} \n'.format(pred)
    # Left open on purpose: the judge model completes this line with 1 or 0.
    prompt += 'Judegement:'
    return prompt
104
+
105
+
106
+ # ============= PhyX Answer Extraction =============
107
+ # Exactly from PhyX source code
108
+
109
def mapping_str(input_str):
    """Normalise a few LaTeX tokens in an extracted answer.

    Replaces ``\\dfrac`` with ``\\frac`` and ``\\pi`` with ``3.14`` (plain
    substring replacement, applied in that order).

    Args:
        input_str: the extracted answer text.

    Returns:
        The normalised string. A non-string argument is returned unchanged.
    """
    # Explicit type guard instead of a bare ``except`` around str.replace:
    # the only way the replacement can fail is a non-string input.
    if not isinstance(input_str, str):
        return input_str
    replacements = {"\\dfrac": "\\frac", "\\pi": "3.14"}
    output = input_str
    for old, new in replacements.items():
        output = output.replace(old, new)
    return output
118
+
119
+
120
def extract_boxed_content(s):
    """Return the text inside the first \\boxed{...} of *s*, or None.

    Nested braces inside the box are kept; the box ends at the first closing
    brace that is not matched by an opening brace inside it. None is returned
    when there is no \\boxed{ or when it is never closed.
    """
    marker = r'\boxed{'
    begin = s.find(marker)
    if begin < 0:
        return None

    body = s[begin + len(marker):]
    nesting = 0  # braces currently open inside the box
    for pos, char in enumerate(body):
        if char == '{':
            nesting += 1
            continue
        if char != '}':
            continue
        if nesting == 0:
            return body[:pos]
        nesting -= 1
    return None
137
+
138
+
139
def PhyX_process_line(prediction_str, gt_answer):
    """Extract the answer from a model output and do rule-based matching.

    Extraction order: content of the first \\boxed{...}; otherwise the text
    following a "final answer" / "correct answer" label; otherwise the marker
    "SAME as predict". Extracted text is passed through mapping_str().

    Returns:
        dict with 'gt', 'pred', 'extracted' and 'match' (0 or 1). 'match' is
        1 when the ground truth equals the extracted text or the whole
        prediction (case-insensitive, stripped), or occurs verbatim inside
        the prediction.
    """
    gt_text = str(gt_answer)
    pred_text = prediction_str.strip()
    ret = {'gt': gt_text, 'pred': pred_text}

    # The prediction is itself the API-failure sentinel: nothing to extract.
    if pred_text == FAIL_MSG:
        ret['match'] = 0
        ret["extracted"] = "Fail to Call API"
        return ret

    boxed = extract_boxed_content(pred_text)
    if boxed is not None:
        ret["extracted"] = mapping_str(boxed)
    else:
        # "final answer:" or "correct answer:" pattern
        label_pattern = r'\b(?:final\s+answer|correct\s+answer)\b[^::]*[::]\s*(.*?)(?=\n\n\n|\Z)'
        found = re.search(label_pattern, pred_text, flags=re.IGNORECASE | re.DOTALL)
        if found:
            ret["extracted"] = mapping_str(found.group(1))
        else:
            ret["extracted"] = "SAME as predict"

    # String-level matching (PhyX logic)
    gt_norm = gt_text.strip().lower()
    extracted_norm = ret["extracted"].strip().lower()
    pred_norm = pred_text.strip().lower()
    is_match = (
        gt_norm == extracted_norm
        or gt_norm == pred_norm
        or gt_text in pred_text
    )
    ret['match'] = 1 if is_match else 0
    return ret
181
+
182
+
183
+ # ============= DeepSeek API =============
184
+
185
def call_deepseek(prompt, temperature=0.0):
    """Send one user message to the DeepSeek chat-completions endpoint.

    Args:
        prompt: text sent as the single user message.
        temperature: sampling temperature forwarded to the API.

    Returns:
        The content of the first choice, or FAIL_MSG if the request or the
        parsing of its response fails for any reason. The failure reason is
        written to stderr so that authentication, rate-limit and network
        problems are visible instead of silently lowering the scores.
    """
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {DEEPSEEK_API_KEY}"
    }
    data = json.dumps({
        "model": DEEPSEEK_MODEL,
        "messages": [{"role": "user", "content": prompt}],
        "temperature": temperature,
        "max_tokens": 200,
    }).encode('utf-8')

    try:
        req = urllib.request.Request(DEEPSEEK_API_URL, data=data, headers=headers)
        with urllib.request.urlopen(req, timeout=30) as resp:
            result = json.loads(resp.read().decode())
        return result['choices'][0]['message']['content']
    except Exception as e:
        # Broad on purpose: callers rely on FAIL_MSG (never an exception) to
        # drive their retry loop. Report the cause rather than discarding it.
        print(f"[call_deepseek] request failed: {type(e).__name__}: {e}",
              file=sys.stderr, flush=True)
        return FAIL_MSG
205
+
206
+
207
+ # ============= PhyX Evaluation Logic =============
208
+ # Exactly from PhyX source code: PhyX_auxeval()
209
+
210
def PhyX_auxeval(gt_answer, prediction):
    """Judge one prediction against its ground truth.

    1. Extract the answer with PhyX_process_line().
    2. Accept immediately on a case-insensitive exact string match.
    3. Otherwise ask the LLM judge up to RETRY times, with temperature
       0.5 * attempt number; the first reply containing "1" or "0" decides.

    Returns:
        dict(log, res, extracted) where res is 1 (correct) or 0.

    NOTE(review): the reply is parsed by substring ("1" is checked before
    "0"), so a reply such as "10" or "0.1" counts as 1. Kept unchanged for
    parity with the pipeline this file says it mirrors.
    """
    extracted = PhyX_process_line(prediction, gt_answer)["extracted"]

    if extracted == "Fail to Call API":
        return dict(log="Fail to Call API", res=0, extracted="Fail to Call API")

    # "SAME as predict" means nothing could be extracted: judge the raw output.
    prediction_extracted = prediction if extracted == "SAME as predict" else extracted

    # String-level match
    if str(gt_answer).strip().lower() == prediction_extracted.strip().lower():
        return dict(log="Matched at string level", res=1, extracted=prediction_extracted)

    # LLM judge with retries
    prompt = build_phyx_gpt4_prompt(gt_answer, prediction_extracted)
    log_parts = []
    for attempt in range(RETRY):
        reply = call_deepseek(prompt, temperature=attempt * 0.5)
        if FAIL_MSG in reply:
            log_parts.append(
                f'Try {attempt}: answer and prediction are {gt_answer} and {prediction_extracted}, failed to compare.\n'
            )
            continue
        log_parts.append('Compared at semantic level. ')
        if "1" in reply:
            log_parts.append("Semantic equal via LLM.")
            return dict(log=''.join(log_parts), res=1, extracted=prediction_extracted)
        if "0" in reply:
            log_parts.append(f"LLM judgement {reply}")
            return dict(log=''.join(log_parts), res=0, extracted=prediction_extracted)

    log_parts.append('All 5 retries failed.\n')
    return dict(log=''.join(log_parts), res=0, extracted=prediction_extracted)
253
+
254
+
255
+ # ============= Main Scoring =============
256
+
257
def score_results(results_file, model_name):
    """Score every record of an inference-results JSONL file.

    Each record must provide 'ground_truth_value' and 'model_output';
    'category' is optional (defaults to 'unknown'). Each record is annotated
    in place with 'extracted_answer', 'eval_log' and 'res'.

    Args:
        results_file: path of the JSONL file to score.
        model_name: label used in the printed report and the returned stats.

    Returns:
        (scored, stats): the annotated records and a summary dict with
        totals, string/LLM match counts, accuracy in percent (0.0 for an
        empty file) and per-category counts.
    """
    with open(results_file, 'r', encoding='utf-8') as f:
        results = [json.loads(line) for line in f if line.strip()]

    print(f"\n{'='*60}")
    print(f" Scoring: {model_name} ({len(results)} samples)")
    print(f" Using PhyX-aligned pipeline with DeepSeek-V3 judge")
    print(f"{'='*60}")

    total = len(results)
    hit = 0
    cat_stats = defaultdict(lambda: {'total': 0, 'correct': 0})
    scored = []
    string_match = 0
    llm_match = 0
    llm_called = 0

    for i, r in enumerate(results):
        gt = r['ground_truth_value']
        prediction = r['model_output']
        cat = r.get('category', 'unknown')
        cat_stats[cat]['total'] += 1

        eval_result = PhyX_auxeval(gt, prediction)

        r['extracted_answer'] = eval_result['extracted']
        r['eval_log'] = eval_result['log']
        r['res'] = eval_result['res']

        if eval_result['res'] == 1:
            hit += 1
            cat_stats[cat]['correct'] += 1

        # The evaluation path is recovered from the log text.
        if "string level" in eval_result['log']:
            string_match += 1
        elif "semantic level" in eval_result['log'] or "LLM judgement" in eval_result['log']:
            llm_called += 1
            if eval_result['res'] == 1:
                llm_match += 1

        scored.append(r)

        if (i + 1) % 50 == 0:
            print(f" [{i+1}/{total}] acc={hit/(i+1)*100:.1f}% "
                  f"(str_match={string_match}, llm_called={llm_called}, llm_match={llm_match})",
                  flush=True)

    # An empty results file must not crash the whole comparison run.
    acc = hit / total * 100 if total else 0.0

    print(f"\n RESULTS for {model_name}:")
    print(f" Total: {total}")
    print(f" String matches: {string_match}")
    print(f" LLM calls: {llm_called}")
    print(f" LLM matches: {llm_match}")
    print(f" Final correct: {hit} ({acc:.1f}%)")
    print(f"\n Per category:")
    for cat, s in sorted(cat_stats.items(), key=lambda x: -x[1]['total']):
        cat_acc = s['correct'] / s['total'] * 100 if s['total'] > 0 else 0
        print(f" {cat:25s}: {s['correct']:3d}/{s['total']:3d} ({cat_acc:5.1f}%)")

    return scored, {
        'model': model_name,
        'total': total,
        'string_matches': string_match,
        'llm_calls': llm_called,
        'llm_matches': llm_match,
        'final_correct': hit,
        'final_acc': round(acc, 2),
        'category_stats': {k: dict(v) for k, v in cat_stats.items()}
    }
331
+
332
+
333
def main():
    """Score the base and SFT inference results and write a comparison report.

    Checks that the judge API answers (exits with status 1 otherwise), scores
    both result files, writes one scored JSONL per model plus
    comparison_report_phyx.json into OUTPUT_DIR, and prints the accuracies.
    """
    rule = "=" * 60

    print(rule)
    print(" PhyX-ALIGNED EVAL: DeepSeek-V3 as Judge")
    print(f" Pipeline: extract → string match → LLM judge (5 retries)")
    print(f" Results dir: {RESULTS_DIR}")
    print(rule)

    # Test API
    print("\nTesting DeepSeek API...")
    probe = call_deepseek("Say 'OK' if you can read this.")
    if probe == FAIL_MSG:
        print(f" API FAILED: {probe}")
        sys.exit(1)
    print(f" API OK: {probe[:50]}")

    # Score both models
    base_scored, base_stats = score_results(BASE_RESULTS, "Base (Qwen2.5-VL-3B)")
    sft_scored, sft_stats = score_results(SFT_RESULTS, "SFT (Cold-Start Full FT)")

    # Save scored results
    for tag, rows in (("base", base_scored), ("sft", sft_scored)):
        out_file = os.path.join(OUTPUT_DIR, f"scored_results_{tag}_phyx.jsonl")
        with open(out_file, 'w', encoding='utf-8') as f:
            for row in rows:
                f.write(json.dumps(row, ensure_ascii=False) + '\n')

    # Save comparison report
    report = {
        'timestamp': time.strftime('%Y-%m-%d %H:%M:%S'),
        'scoring_method': 'PhyX-aligned (DeepSeek-V3 judge, 5-shot ICE, 5 retries)',
        'base': base_stats,
        'sft': sft_stats,
    }
    report_file = os.path.join(OUTPUT_DIR, "comparison_report_phyx.json")
    with open(report_file, 'w', encoding='utf-8') as f:
        json.dump(report, f, indent=2, ensure_ascii=False)

    # Final comparison
    base_acc = base_stats['final_acc']
    sft_acc = sft_stats['final_acc']
    print(f"\n{rule}")
    print(f" FINAL COMPARISON (PhyX-aligned)")
    print(rule)
    print(f" Base accuracy: {base_acc}%")
    print(f" SFT accuracy: {sft_acc}%")
    print(f" Improvement: {sft_acc - base_acc:+.1f}%")
    print(f"\n Report saved: {report_file}")
    print(rule)


if __name__ == '__main__':
    main()
eval_footprint/eval_fullft_math_nf.py ADDED
@@ -0,0 +1,258 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Phase 1: Open-ended inference on cluster (multi-GPU, no internet needed).
4
+
5
+ Runs both Base and SFT models on the 1533 open-ended physics test set.
6
+ Saves raw model outputs for later judging.
7
+
8
+ Usage (inside Docker container):
9
+ cd /tmp && python3 /path/to/eval_openended_inference.py
10
+
11
+ Output:
12
+ sft_eval_footprint/inference_results_base.jsonl
13
+ sft_eval_footprint/inference_results_coldstart.jsonl
14
+ """
15
+ import os
16
+ import sys
17
+ import json
18
+ import re
19
+ import time
20
+ import torch
21
+ import multiprocessing as mp
22
+ from collections import Counter
23
+
24
+ # ============ CONFIG ============
25
+ os.environ["HF_HUB_OFFLINE"] = "1"
26
+ os.environ["TRANSFORMERS_OFFLINE"] = "1"
27
+
28
+ BASE_MODEL = "/workspace/rl4phyx/models/Qwen2.5-VL-3B-Instruct"
29
+ SFT_MODEL = "/workspace/rl4phyx/RL4Phyx/SFT/checkpoints/sft_qwen25vl_3b_fullft_coldstart/final"
30
+ TEST_FILE = "/workspace/rl4phyx/RL4Phyx/SFT/sft_eval_footprint/test_1533_openended.jsonl"
31
+ OUTPUT_DIR = "/workspace/rl4phyx/RL4Phyx/SFT/sft_eval_footprint"
32
+ IMAGE_DIR = "/workspace/rl4phyx/RL4Phyx/MetaPhyX/test_images"
33
+
34
+ # Multi-GPU config: GPU ids used per model (base: 0-3, SFT: 0-7, per the lists below)
35
+ BASE_GPUS = [0, 1, 2, 3]
36
+ SFT_GPUS = [0, 1, 2, 3, 4, 5, 6, 7]
37
+ MAX_NEW_TOKENS = 2048
38
+ # ================================
39
+
40
+
41
def load_test_data():
    """Read TEST_FILE and return its records as a list of dicts.

    The file is JSON Lines; blank lines are ignored.
    """
    with open(TEST_FILE, 'r', encoding='utf-8') as handle:
        return [json.loads(row) for row in handle if row.strip()]
49
+
50
+
51
def build_open_ended_prompt(sample):
    """Return the open-ended (no MCQ options) prompt text for one sample.

    Uses the sample's 'description' and 'question' fields (empty string when
    absent) and asks for the final answer inside \\boxed{}. Leading and
    trailing whitespace is stripped from the result.
    """
    template = (
        "Look at the image and answer the physics question.\n"
        "\n"
        "{desc}\n"
        "\n"
        "{question}\n"
        "\n"
        "Please reason step by step, and put your final answer within \\boxed{{}}.\n"
    )
    filled = template.format(
        desc=sample.get('description', ''),
        question=sample.get('question', ''),
    )
    return filled.strip()
65
+
66
+
67
def worker_inference(gpu_id, model_path, samples, output_file, model_name):
    """Load the model on one GPU and run generation over the assigned samples.

    Runs inside a child process started by run_model_parallel. Each result is
    written to ``output_file`` (JSONL) and flushed as soon as it is produced,
    so samples finished before a crash or kill are not lost.

    Args:
        gpu_id: CUDA device index; the model is placed on ``cuda:{gpu_id}``.
        model_path: Local checkpoint directory passed to ``from_pretrained``.
        samples: Sample dicts. Each must provide 'index', 'image', 'category',
            'question' and 'ground_truth_value'; 'description', 'subfield'
            and 'ground_truth_letter' are optional.
        output_file: Path of the per-GPU JSONL file to (over)write.
        model_name: Label used in log lines and stored in every record.

    Returns:
        The number of result records written.
    """
    # Heavy dependencies are imported inside the worker process.
    import torch
    from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
    from qwen_vl_utils import process_vision_info

    device = f"cuda:{gpu_id}"
    tag = f"[{model_name}][GPU {gpu_id}]"
    print(f"{tag} Loading model...", flush=True)

    processor = AutoProcessor.from_pretrained(
        model_path,
        min_pixels=3136,
        max_pixels=200704,
        local_files_only=True,
        trust_remote_code=True,
    )
    model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
        model_path,
        torch_dtype=torch.bfloat16,
        device_map=device,
        local_files_only=True,
        trust_remote_code=True,
    )
    model.eval()
    total = len(samples)
    print(f"{tag} Model loaded. Processing {total} samples.", flush=True)

    written = 0
    # The file is opened only after the model has loaded, so a failed load
    # does not leave an empty shard behind.
    with open(output_file, 'w', encoding='utf-8') as out:
        for i, sample in enumerate(samples):
            idx = sample['index']
            prompt_text = build_open_ended_prompt(sample)
            image_path = os.path.join(IMAGE_DIR, sample['image'])

            # Single-turn chat message: one image followed by the text prompt.
            messages = [{
                "role": "user",
                "content": [
                    {"type": "image", "image": f"file://{image_path}"},
                    {"type": "text", "text": prompt_text},
                ],
            }]

            try:
                text = processor.apply_chat_template(
                    messages, tokenize=False, add_generation_prompt=True
                )
                image_inputs, video_inputs = process_vision_info(messages)
                inputs = processor(
                    text=[text],
                    images=image_inputs,
                    videos=video_inputs,
                    padding=True,
                    return_tensors="pt",
                ).to(device)

                with torch.no_grad():
                    output_ids = model.generate(**inputs, max_new_tokens=MAX_NEW_TOKENS)

                # Drop the prompt tokens so only the newly generated text is decoded.
                generated = output_ids[0][inputs.input_ids.shape[1]:]
                response = processor.decode(generated, skip_special_tokens=True)
            except Exception as e:
                # Best effort: one bad sample must not abort the whole shard.
                # The error is logged and recorded in place of the model output.
                print(f"{tag} sample {idx} failed: {e!r}", flush=True)
                response = f"ERROR: {str(e)}"

            result = {
                "index": idx,
                "category": sample['category'],
                "subfield": sample.get('subfield', ''),
                "question": sample['question'],
                "ground_truth_value": sample['ground_truth_value'],
                "ground_truth_letter": sample.get('ground_truth_letter', ''),
                "model_output": response,
                "model_name": model_name,
                "gpu_id": gpu_id,
            }
            out.write(json.dumps(result, ensure_ascii=False) + '\n')
            out.flush()
            written += 1

            if (i + 1) % 20 == 0 or (i + 1) == total:
                print(f"{tag} {i+1}/{total} done", flush=True)

    print(f"{tag} Saved {written} results to {output_file}", flush=True)
    return written
151
+
152
+
153
def run_model_parallel(model_path, model_name, gpu_ids, samples, output_base):
    """Split ``samples`` into contiguous chunks and run one worker process per GPU.

    Args:
        model_path: Checkpoint path handed to every worker.
        model_name: Label forwarded to the workers for logging/records.
        gpu_ids: GPU indices to use; must not be empty.
        samples: Full list of samples to distribute.
        output_base: Path prefix; each worker writes ``{output_base}_gpu{id}.jsonl``.

    Returns:
        The list of per-GPU output file paths for the workers that were
        started (GPUs that received no samples are omitted).

    Raises:
        ValueError: If ``gpu_ids`` is empty.
    """
    if not gpu_ids:
        raise ValueError("gpu_ids must contain at least one GPU id")

    n = len(samples)
    k = len(gpu_ids)
    chunk_size = (n + k - 1) // k  # ceiling division

    workers = []
    output_files = []
    for i, gpu_id in enumerate(gpu_ids):
        chunk = samples[i * chunk_size: (i + 1) * chunk_size]
        if not chunk:
            continue
        out_file = f"{output_base}_gpu{gpu_id}.jsonl"
        output_files.append(out_file)
        proc = mp.Process(
            target=worker_inference,
            args=(gpu_id, model_path, chunk, out_file, model_name)
        )
        workers.append((gpu_id, proc))

    for _, proc in workers:
        proc.start()
    for _, proc in workers:
        proc.join()

    # A worker that died (OOM, CUDA error, ...) would otherwise go unnoticed
    # and simply leave a missing or partial shard behind.
    for gpu_id, proc in workers:
        if proc.exitcode != 0:
            print(
                f"[{model_name}][GPU {gpu_id}] WARNING: worker exited with code "
                f"{proc.exitcode}; its results may be missing or incomplete",
                flush=True,
            )

    return output_files
179
+
180
+
181
def merge_results(output_files, final_output):
    """Merge per-GPU JSONL shards into one file sorted by 'index'.

    Args:
        output_files: Paths of the per-GPU shard files. A missing shard is
            reported with a warning and skipped.
        final_output: Path of the merged JSONL file to (over)write.

    Returns:
        The merged list of records, sorted by their 'index' field.

    Side effects:
        Writes ``final_output`` and then deletes the shard files.
    """
    all_results = []
    for path in output_files:
        if not os.path.exists(path):
            # Surface the gap instead of silently producing a short merged file.
            print(
                f"WARNING: shard {path} not found; merged output will be incomplete",
                flush=True,
            )
            continue
        with open(path, 'r', encoding='utf-8') as fh:
            all_results.extend(json.loads(line) for line in fh if line.strip())

    # Sort by index for consistency
    all_results.sort(key=lambda rec: rec['index'])

    with open(final_output, 'w', encoding='utf-8') as out:
        for rec in all_results:
            out.write(json.dumps(rec, ensure_ascii=False) + '\n')

    # Cleanup per-GPU files (only after the merged file has been written)
    for path in output_files:
        if os.path.exists(path):
            os.remove(path)

    return all_results
204
+
205
+
206
def main():
    """Entry point: run SFT inference across SFT_GPUS and merge the shards.

    Base-model inference is skipped in this script; only the SFT checkpoint
    is evaluated and its merged results are written under OUTPUT_DIR.
    """
    mp.set_start_method('spawn', force=True)
    os.makedirs(OUTPUT_DIR, exist_ok=True)

    print("=" * 60)
    print(" OPEN-ENDED EVAL: Base vs SFT (Multi-GPU)")
    print(f" Base model: {BASE_MODEL}")
    print(f" SFT model: {SFT_MODEL}")
    print(f" Base GPUs: {BASE_GPUS}")
    print(f" SFT GPUs: {SFT_GPUS}")
    print("=" * 60)

    # Load test data
    samples = load_test_data()
    print(f"\nLoaded {len(samples)} test samples")

    # Per-category sample counts, most frequent first.
    cats = Counter(s['category'] for s in samples)
    for cat, cnt in cats.most_common():
        print(f" {cat}: {cnt}")

    t0 = time.time()

    sft_output = os.path.join(OUTPUT_DIR, "inference_results_phyx")
    sft_final = os.path.join(OUTPUT_DIR, "inference_results_coldstart.jsonl")

    # Run the SFT model with one worker per GPU in SFT_GPUS.
    print("\n>>> Starting SFT model inference...", flush=True)
    sft_files = run_model_parallel(SFT_MODEL, "sft", SFT_GPUS, samples, sft_output)

    # Merge exactly the shards the workers were asked to write.
    sft_results = merge_results(sft_files, sft_final)

    elapsed = time.time() - t0
    print(f"\n{'=' * 60}")
    print(f" INFERENCE COMPLETE in {elapsed/60:.1f} min")
    print(" Base results: skipped (base inference is disabled in this script)")
    print(f" SFT results: {len(sft_results)} → {sft_final}")
    print(f"{'=' * 60}")


if __name__ == '__main__':
    main()
eval_footprint/eval_fullft_math_nf_old.py ADDED
@@ -0,0 +1,259 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Phase 1: Open-ended inference on cluster (multi-GPU, no internet needed).
4
+
5
+ Runs both Base and SFT models on the 1533 open-ended physics test set.
6
+ Saves raw model outputs for later judging.
7
+
8
+ Usage (inside Docker container):
9
+ cd /tmp && python3 /path/to/eval_openended_inference.py
10
+
11
+ Output:
12
+ sft_eval_footprint/inference_results_base.jsonl
13
+ sft_eval_footprint/inference_results_sft.jsonl
14
+ """
15
+ import os
16
+ import sys
17
+ import json
18
+ import re
19
+ import time
20
+ import torch
21
+ import multiprocessing as mp
22
+ from collections import Counter
23
+
24
# ============ CONFIG ============
# Put Hugging Face Hub / Transformers in offline mode; models are loaded from local paths.
os.environ["HF_HUB_OFFLINE"] = "1"
os.environ["TRANSFORMERS_OFFLINE"] = "1"

# Local checkpoint directories and dataset locations.
BASE_MODEL = "/workspace/rl4phyx/models/Qwen2.5-VL-3B-Instruct"
SFT_MODEL = "/workspace/rl4phyx/RL4Phyx/SFT/checkpoints/sft_qwen25vl_3b_fullft/final"
TEST_FILE = "/workspace/rl4phyx/RL4Phyx/SFT/sft_eval_footprint/test_1533_openended.jsonl"
OUTPUT_DIR = "/workspace/rl4phyx/RL4Phyx/SFT/sft_eval_footprint"
IMAGE_DIR = "/workspace/rl4phyx/RL4Phyx/MetaPhyX/test_images"

# Multi-GPU config: base on GPUs 0-3, SFT on GPUs 4-7 (disjoint sets).
BASE_GPUS = [0, 1, 2, 3]
SFT_GPUS = [4, 5, 6, 7]
# Upper bound on generated tokens per sample (passed to model.generate).
MAX_NEW_TOKENS = 2048
# ================================
39
+
40
+
41
def load_test_data():
    """Read TEST_FILE (JSONL) and return its parsed records as a list.

    Empty or whitespace-only lines are skipped; every other line is parsed
    with json.loads.
    """
    with open(TEST_FILE, 'r', encoding='utf-8') as handle:
        return [json.loads(raw) for raw in handle if raw.strip()]
49
+
50
+
51
def build_open_ended_prompt(sample):
    """Compose the free-form (no multiple-choice options) prompt for one sample.

    The sample's 'description' and 'question' fields (each defaulting to an
    empty string when absent) are placed between a fixed instruction line
    and a fixed request to box the final answer; sections are separated by
    blank lines.
    """
    description = sample.get('description', '')
    question_text = sample.get('question', '')
    sections = (
        "Look at the image and answer the physics question.",
        f"{description}",
        f"{question_text}",
        "Please reason step by step, and put your final answer within \\boxed{}.",
    )
    return "\n\n".join(sections).strip()
65
+
66
+
67
def worker_inference(gpu_id, model_path, samples, output_file, model_name):
    """Load the model on one GPU and run generation over the assigned samples.

    Runs inside a child process started by run_model_parallel. Each result is
    written to ``output_file`` (JSONL) and flushed as soon as it is produced,
    so samples finished before a crash or kill are not lost.

    Args:
        gpu_id: CUDA device index; the model is placed on ``cuda:{gpu_id}``.
        model_path: Local checkpoint directory passed to ``from_pretrained``.
        samples: Sample dicts. Each must provide 'index', 'image', 'category',
            'question' and 'ground_truth_value'; 'description', 'subfield'
            and 'ground_truth_letter' are optional.
        output_file: Path of the per-GPU JSONL file to (over)write.
        model_name: Label used in log lines and stored in every record.

    Returns:
        The number of result records written.
    """
    # Heavy dependencies are imported inside the worker process.
    import torch
    from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
    from qwen_vl_utils import process_vision_info

    device = f"cuda:{gpu_id}"
    tag = f"[{model_name}][GPU {gpu_id}]"
    print(f"{tag} Loading model...", flush=True)

    processor = AutoProcessor.from_pretrained(
        model_path,
        min_pixels=3136,
        max_pixels=200704,
        local_files_only=True,
        trust_remote_code=True,
    )
    model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
        model_path,
        torch_dtype=torch.bfloat16,
        device_map=device,
        local_files_only=True,
        trust_remote_code=True,
    )
    model.eval()
    total = len(samples)
    print(f"{tag} Model loaded. Processing {total} samples.", flush=True)

    written = 0
    # The file is opened only after the model has loaded, so a failed load
    # does not leave an empty shard behind.
    with open(output_file, 'w', encoding='utf-8') as out:
        for i, sample in enumerate(samples):
            idx = sample['index']
            prompt_text = build_open_ended_prompt(sample)
            image_path = os.path.join(IMAGE_DIR, sample['image'])

            # Single-turn chat message: one image followed by the text prompt.
            messages = [{
                "role": "user",
                "content": [
                    {"type": "image", "image": f"file://{image_path}"},
                    {"type": "text", "text": prompt_text},
                ],
            }]

            try:
                text = processor.apply_chat_template(
                    messages, tokenize=False, add_generation_prompt=True
                )
                image_inputs, video_inputs = process_vision_info(messages)
                inputs = processor(
                    text=[text],
                    images=image_inputs,
                    videos=video_inputs,
                    padding=True,
                    return_tensors="pt",
                ).to(device)

                with torch.no_grad():
                    output_ids = model.generate(**inputs, max_new_tokens=MAX_NEW_TOKENS)

                # Drop the prompt tokens so only the newly generated text is decoded.
                generated = output_ids[0][inputs.input_ids.shape[1]:]
                response = processor.decode(generated, skip_special_tokens=True)
            except Exception as e:
                # Best effort: one bad sample must not abort the whole shard.
                # The error is logged and recorded in place of the model output.
                print(f"{tag} sample {idx} failed: {e!r}", flush=True)
                response = f"ERROR: {str(e)}"

            result = {
                "index": idx,
                "category": sample['category'],
                "subfield": sample.get('subfield', ''),
                "question": sample['question'],
                "ground_truth_value": sample['ground_truth_value'],
                "ground_truth_letter": sample.get('ground_truth_letter', ''),
                "model_output": response,
                "model_name": model_name,
                "gpu_id": gpu_id,
            }
            out.write(json.dumps(result, ensure_ascii=False) + '\n')
            out.flush()
            written += 1

            if (i + 1) % 20 == 0 or (i + 1) == total:
                print(f"{tag} {i+1}/{total} done", flush=True)

    print(f"{tag} Saved {written} results to {output_file}", flush=True)
    return written
151
+
152
+
153
def run_model_parallel(model_path, model_name, gpu_ids, samples, output_base):
    """Split ``samples`` into contiguous chunks and run one worker process per GPU.

    Args:
        model_path: Checkpoint path handed to every worker.
        model_name: Label forwarded to the workers for logging/records.
        gpu_ids: GPU indices to use; must not be empty.
        samples: Full list of samples to distribute.
        output_base: Path prefix; each worker writes ``{output_base}_gpu{id}.jsonl``.

    Returns:
        The list of per-GPU output file paths for the workers that were
        started (GPUs that received no samples are omitted).

    Raises:
        ValueError: If ``gpu_ids`` is empty.
    """
    if not gpu_ids:
        raise ValueError("gpu_ids must contain at least one GPU id")

    n = len(samples)
    k = len(gpu_ids)
    chunk_size = (n + k - 1) // k  # ceiling division

    workers = []
    output_files = []
    for i, gpu_id in enumerate(gpu_ids):
        chunk = samples[i * chunk_size: (i + 1) * chunk_size]
        if not chunk:
            continue
        out_file = f"{output_base}_gpu{gpu_id}.jsonl"
        output_files.append(out_file)
        proc = mp.Process(
            target=worker_inference,
            args=(gpu_id, model_path, chunk, out_file, model_name)
        )
        workers.append((gpu_id, proc))

    for _, proc in workers:
        proc.start()
    for _, proc in workers:
        proc.join()

    # A worker that died (OOM, CUDA error, ...) would otherwise go unnoticed
    # and simply leave a missing or partial shard behind.
    for gpu_id, proc in workers:
        if proc.exitcode != 0:
            print(
                f"[{model_name}][GPU {gpu_id}] WARNING: worker exited with code "
                f"{proc.exitcode}; its results may be missing or incomplete",
                flush=True,
            )

    return output_files
179
+
180
+
181
def merge_results(output_files, final_output):
    """Merge per-GPU JSONL shards into one file sorted by 'index'.

    Args:
        output_files: Paths of the per-GPU shard files. A missing shard is
            reported with a warning and skipped.
        final_output: Path of the merged JSONL file to (over)write.

    Returns:
        The merged list of records, sorted by their 'index' field.

    Side effects:
        Writes ``final_output`` and then deletes the shard files.
    """
    all_results = []
    for path in output_files:
        if not os.path.exists(path):
            # Surface the gap instead of silently producing a short merged file.
            print(
                f"WARNING: shard {path} not found; merged output will be incomplete",
                flush=True,
            )
            continue
        with open(path, 'r', encoding='utf-8') as fh:
            all_results.extend(json.loads(line) for line in fh if line.strip())

    # Sort by index for consistency
    all_results.sort(key=lambda rec: rec['index'])

    with open(final_output, 'w', encoding='utf-8') as out:
        for rec in all_results:
            out.write(json.dumps(rec, ensure_ascii=False) + '\n')

    # Cleanup per-GPU files (only after the merged file has been written)
    for path in output_files:
        if os.path.exists(path):
            os.remove(path)

    return all_results
204
+
205
+
206
def main():
    """Entry point: run Base then SFT inference and merge each model's shards.

    The two run_model_parallel calls execute one after the other (each call
    returns only after its workers have been joined), so the models do not
    run concurrently even though BASE_GPUS and SFT_GPUS are disjoint.
    """
    mp.set_start_method('spawn', force=True)
    os.makedirs(OUTPUT_DIR, exist_ok=True)

    print("=" * 60)
    print(" OPEN-ENDED EVAL: Base vs SFT (Multi-GPU)")
    print(f" Base model: {BASE_MODEL}")
    print(f" SFT model: {SFT_MODEL}")
    print(f" Base GPUs: {BASE_GPUS}")
    print(f" SFT GPUs: {SFT_GPUS}")
    print("=" * 60)

    # Load test data
    samples = load_test_data()
    print(f"\nLoaded {len(samples)} test samples")

    # Per-category sample counts, most frequent first.
    cats = Counter(s['category'] for s in samples)
    for cat, cnt in cats.most_common():
        print(f" {cat}: {cnt}")

    t0 = time.time()

    base_output = os.path.join(OUTPUT_DIR, "inference_results_base")
    sft_output = os.path.join(OUTPUT_DIR, "inference_results_sft")
    base_final = os.path.join(OUTPUT_DIR, "inference_results_base.jsonl")
    sft_final = os.path.join(OUTPUT_DIR, "inference_results_sft.jsonl")

    # Base model: one worker per GPU in BASE_GPUS.
    print("\n>>> Starting BASE model inference...", flush=True)
    base_files = run_model_parallel(BASE_MODEL, "base", BASE_GPUS, samples, base_output)

    # SFT model: one worker per GPU in SFT_GPUS (starts after the base run ends).
    print("\n>>> Starting SFT model inference...", flush=True)
    sft_files = run_model_parallel(SFT_MODEL, "sft", SFT_GPUS, samples, sft_output)

    # Merge exactly the shards the workers were asked to write.
    base_results = merge_results(base_files, base_final)
    sft_results = merge_results(sft_files, sft_final)

    elapsed = time.time() - t0
    print(f"\n{'=' * 60}")
    print(f" INFERENCE COMPLETE in {elapsed/60:.1f} min")
    print(f" Base results: {len(base_results)} → {base_final}")
    print(f" SFT results: {len(sft_results)} → {sft_final}")
    print(f"{'=' * 60}")


if __name__ == '__main__':
    main()
eval_footprint/eval_fullft_math_nf_old_final.py ADDED
@@ -0,0 +1,258 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Phase 1: Open-ended inference on cluster (multi-GPU, no internet needed).
4
+
5
+ Runs the SFT model on the 1533 open-ended physics test set (the Base run is skipped in this variant).
6
+ Saves raw model outputs for later judging.
7
+
8
+ Usage (inside Docker container):
9
+ cd /tmp && python3 /path/to/eval_openended_inference.py
10
+
11
+ Output:
12
+ sft_eval_footprint/inference_results_base.jsonl
13
+ sft_eval_footprint/inference_results_phyx_50000.jsonl
14
+ """
15
+ import os
16
+ import sys
17
+ import json
18
+ import re
19
+ import time
20
+ import torch
21
+ import multiprocessing as mp
22
+ from collections import Counter
23
+
24
# ============ CONFIG ============
# Put Hugging Face Hub / Transformers in offline mode; models are loaded from local paths.
os.environ["HF_HUB_OFFLINE"] = "1"
os.environ["TRANSFORMERS_OFFLINE"] = "1"

# Local checkpoint directories and dataset locations.
BASE_MODEL = "/workspace/rl4phyx/models/Qwen2.5-VL-3B-Instruct"
SFT_MODEL = "/workspace/rl4phyx/RL4Phyx/SFT/checkpoints/sft_qwen25vl_3b_fullft_phyx_50000/final"
TEST_FILE = "/workspace/rl4phyx/RL4Phyx/SFT/sft_eval_footprint/test_1533_openended.jsonl"
OUTPUT_DIR = "/workspace/rl4phyx/RL4Phyx/SFT/sft_eval_footprint"
IMAGE_DIR = "/workspace/rl4phyx/RL4Phyx/MetaPhyX/test_images"

# Multi-GPU config: base on GPUs 0-3, SFT on GPUs 4-7.
# main() currently skips the base run, so only SFT_GPUS is used for inference.
BASE_GPUS = [0, 1, 2, 3]
SFT_GPUS = [4, 5, 6, 7]
# Upper bound on generated tokens per sample (passed to model.generate).
MAX_NEW_TOKENS = 2048
# ================================
39
+
40
+
41
def load_test_data():
    """Read TEST_FILE (JSONL) and return its parsed records as a list.

    Empty or whitespace-only lines are skipped; every other line is parsed
    with json.loads.
    """
    with open(TEST_FILE, 'r', encoding='utf-8') as handle:
        return [json.loads(raw) for raw in handle if raw.strip()]
49
+
50
+
51
def build_open_ended_prompt(sample):
    """Compose the free-form (no multiple-choice options) prompt for one sample.

    The sample's 'description' and 'question' fields (each defaulting to an
    empty string when absent) are placed between a fixed instruction line
    and a fixed request to box the final answer; sections are separated by
    blank lines.
    """
    description = sample.get('description', '')
    question_text = sample.get('question', '')
    sections = (
        "Look at the image and answer the physics question.",
        f"{description}",
        f"{question_text}",
        "Please reason step by step, and put your final answer within \\boxed{}.",
    )
    return "\n\n".join(sections).strip()
65
+
66
+
67
def worker_inference(gpu_id, model_path, samples, output_file, model_name):
    """Load the model on one GPU and run generation over the assigned samples.

    Runs inside a child process started by run_model_parallel. Each result is
    written to ``output_file`` (JSONL) and flushed as soon as it is produced,
    so samples finished before a crash or kill are not lost.

    Args:
        gpu_id: CUDA device index; the model is placed on ``cuda:{gpu_id}``.
        model_path: Local checkpoint directory passed to ``from_pretrained``.
        samples: Sample dicts. Each must provide 'index', 'image', 'category',
            'question' and 'ground_truth_value'; 'description', 'subfield'
            and 'ground_truth_letter' are optional.
        output_file: Path of the per-GPU JSONL file to (over)write.
        model_name: Label used in log lines and stored in every record.

    Returns:
        The number of result records written.
    """
    # Heavy dependencies are imported inside the worker process.
    import torch
    from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
    from qwen_vl_utils import process_vision_info

    device = f"cuda:{gpu_id}"
    tag = f"[{model_name}][GPU {gpu_id}]"
    print(f"{tag} Loading model...", flush=True)

    processor = AutoProcessor.from_pretrained(
        model_path,
        min_pixels=3136,
        max_pixels=200704,
        local_files_only=True,
        trust_remote_code=True,
    )
    model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
        model_path,
        torch_dtype=torch.bfloat16,
        device_map=device,
        local_files_only=True,
        trust_remote_code=True,
    )
    model.eval()
    total = len(samples)
    print(f"{tag} Model loaded. Processing {total} samples.", flush=True)

    written = 0
    # The file is opened only after the model has loaded, so a failed load
    # does not leave an empty shard behind.
    with open(output_file, 'w', encoding='utf-8') as out:
        for i, sample in enumerate(samples):
            idx = sample['index']
            prompt_text = build_open_ended_prompt(sample)
            image_path = os.path.join(IMAGE_DIR, sample['image'])

            # Single-turn chat message: one image followed by the text prompt.
            messages = [{
                "role": "user",
                "content": [
                    {"type": "image", "image": f"file://{image_path}"},
                    {"type": "text", "text": prompt_text},
                ],
            }]

            try:
                text = processor.apply_chat_template(
                    messages, tokenize=False, add_generation_prompt=True
                )
                image_inputs, video_inputs = process_vision_info(messages)
                inputs = processor(
                    text=[text],
                    images=image_inputs,
                    videos=video_inputs,
                    padding=True,
                    return_tensors="pt",
                ).to(device)

                with torch.no_grad():
                    output_ids = model.generate(**inputs, max_new_tokens=MAX_NEW_TOKENS)

                # Drop the prompt tokens so only the newly generated text is decoded.
                generated = output_ids[0][inputs.input_ids.shape[1]:]
                response = processor.decode(generated, skip_special_tokens=True)
            except Exception as e:
                # Best effort: one bad sample must not abort the whole shard.
                # The error is logged and recorded in place of the model output.
                print(f"{tag} sample {idx} failed: {e!r}", flush=True)
                response = f"ERROR: {str(e)}"

            result = {
                "index": idx,
                "category": sample['category'],
                "subfield": sample.get('subfield', ''),
                "question": sample['question'],
                "ground_truth_value": sample['ground_truth_value'],
                "ground_truth_letter": sample.get('ground_truth_letter', ''),
                "model_output": response,
                "model_name": model_name,
                "gpu_id": gpu_id,
            }
            out.write(json.dumps(result, ensure_ascii=False) + '\n')
            out.flush()
            written += 1

            if (i + 1) % 20 == 0 or (i + 1) == total:
                print(f"{tag} {i+1}/{total} done", flush=True)

    print(f"{tag} Saved {written} results to {output_file}", flush=True)
    return written
151
+
152
+
153
def run_model_parallel(model_path, model_name, gpu_ids, samples, output_base):
    """Split ``samples`` into contiguous chunks and run one worker process per GPU.

    Args:
        model_path: Checkpoint path handed to every worker.
        model_name: Label forwarded to the workers for logging/records.
        gpu_ids: GPU indices to use; must not be empty.
        samples: Full list of samples to distribute.
        output_base: Path prefix; each worker writes ``{output_base}_gpu{id}.jsonl``.

    Returns:
        The list of per-GPU output file paths for the workers that were
        started (GPUs that received no samples are omitted).

    Raises:
        ValueError: If ``gpu_ids`` is empty.
    """
    if not gpu_ids:
        raise ValueError("gpu_ids must contain at least one GPU id")

    n = len(samples)
    k = len(gpu_ids)
    chunk_size = (n + k - 1) // k  # ceiling division

    workers = []
    output_files = []
    for i, gpu_id in enumerate(gpu_ids):
        chunk = samples[i * chunk_size: (i + 1) * chunk_size]
        if not chunk:
            continue
        out_file = f"{output_base}_gpu{gpu_id}.jsonl"
        output_files.append(out_file)
        proc = mp.Process(
            target=worker_inference,
            args=(gpu_id, model_path, chunk, out_file, model_name)
        )
        workers.append((gpu_id, proc))

    for _, proc in workers:
        proc.start()
    for _, proc in workers:
        proc.join()

    # A worker that died (OOM, CUDA error, ...) would otherwise go unnoticed
    # and simply leave a missing or partial shard behind.
    for gpu_id, proc in workers:
        if proc.exitcode != 0:
            print(
                f"[{model_name}][GPU {gpu_id}] WARNING: worker exited with code "
                f"{proc.exitcode}; its results may be missing or incomplete",
                flush=True,
            )

    return output_files
179
+
180
+
181
def merge_results(output_files, final_output):
    """Merge per-GPU JSONL shards into one file sorted by 'index'.

    Args:
        output_files: Paths of the per-GPU shard files. A missing shard is
            reported with a warning and skipped.
        final_output: Path of the merged JSONL file to (over)write.

    Returns:
        The merged list of records, sorted by their 'index' field.

    Side effects:
        Writes ``final_output`` and then deletes the shard files.
    """
    all_results = []
    for path in output_files:
        if not os.path.exists(path):
            # Surface the gap instead of silently producing a short merged file.
            print(
                f"WARNING: shard {path} not found; merged output will be incomplete",
                flush=True,
            )
            continue
        with open(path, 'r', encoding='utf-8') as fh:
            all_results.extend(json.loads(line) for line in fh if line.strip())

    # Sort by index for consistency
    all_results.sort(key=lambda rec: rec['index'])

    with open(final_output, 'w', encoding='utf-8') as out:
        for rec in all_results:
            out.write(json.dumps(rec, ensure_ascii=False) + '\n')

    # Cleanup per-GPU files (only after the merged file has been written)
    for path in output_files:
        if os.path.exists(path):
            os.remove(path)

    return all_results
204
+
205
+
206
def main():
    """Entry point: run SFT inference across SFT_GPUS and merge the shards.

    Base-model inference is skipped in this script; only the SFT checkpoint
    is evaluated and its merged results are written under OUTPUT_DIR.
    """
    mp.set_start_method('spawn', force=True)
    os.makedirs(OUTPUT_DIR, exist_ok=True)

    print("=" * 60)
    print(" OPEN-ENDED EVAL: Base vs SFT (Multi-GPU)")
    print(f" Base model: {BASE_MODEL}")
    print(f" SFT model: {SFT_MODEL}")
    print(f" Base GPUs: {BASE_GPUS}")
    print(f" SFT GPUs: {SFT_GPUS}")
    print("=" * 60)

    # Load test data
    samples = load_test_data()
    print(f"\nLoaded {len(samples)} test samples")

    # Per-category sample counts, most frequent first.
    cats = Counter(s['category'] for s in samples)
    for cat, cnt in cats.most_common():
        print(f" {cat}: {cnt}")

    t0 = time.time()

    sft_output = os.path.join(OUTPUT_DIR, "inference_results_phyx_50000")
    sft_final = os.path.join(OUTPUT_DIR, "inference_results_phyx_50000.jsonl")

    # Run the SFT model with one worker per GPU in SFT_GPUS.
    print("\n>>> Starting SFT model inference...", flush=True)
    sft_files = run_model_parallel(SFT_MODEL, "sft", SFT_GPUS, samples, sft_output)

    # Merge exactly the shards the workers were asked to write.
    sft_results = merge_results(sft_files, sft_final)

    elapsed = time.time() - t0
    print(f"\n{'=' * 60}")
    print(f" INFERENCE COMPLETE in {elapsed/60:.1f} min")
    print(" Base results: skipped (base inference is disabled in this script)")
    print(f" SFT results: {len(sft_results)} → {sft_final}")
    print(f"{'=' * 60}")


if __name__ == '__main__':
    main()
eval_footprint/eval_fullft_phyx_math_nf.py ADDED
@@ -0,0 +1,258 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Phase 1: Open-ended inference on cluster (multi-GPU, no internet needed).
4
+
5
+ Runs the SFT model on the 1533 open-ended physics test set (the Base run is skipped in this variant).
6
+ Saves raw model outputs for later judging.
7
+
8
+ Usage (inside Docker container):
9
+ cd /tmp && python3 /path/to/eval_openended_inference.py
10
+
11
+ Output:
12
+ sft_eval_footprint/inference_results_base.jsonl
13
+ sft_eval_footprint/inference_results_combined_v3.jsonl
14
+ """
15
+ import os
16
+ import sys
17
+ import json
18
+ import re
19
+ import time
20
+ import torch
21
+ import multiprocessing as mp
22
+ from collections import Counter
23
+
24
# ============ CONFIG ============
# Put Hugging Face Hub / Transformers in offline mode; models are loaded from local paths.
os.environ["HF_HUB_OFFLINE"] = "1"
os.environ["TRANSFORMERS_OFFLINE"] = "1"

# Local checkpoint directories and dataset locations.
BASE_MODEL = "/workspace/rl4phyx/models/Qwen2.5-VL-3B-Instruct"
SFT_MODEL = "/workspace/rl4phyx/RL4Phyx/SFT/checkpoints/sft_combined_v3/final"
TEST_FILE = "/workspace/rl4phyx/RL4Phyx/SFT/sft_eval_footprint/test_1533_openended.jsonl"
OUTPUT_DIR = "/workspace/rl4phyx/RL4Phyx/SFT/sft_eval_footprint"
IMAGE_DIR = "/workspace/rl4phyx/RL4Phyx/MetaPhyX/test_images"

# Multi-GPU config: SFT inference uses all eight GPUs (0-7). BASE_GPUS (0-3)
# overlaps with that set; main() currently skips the base run.
BASE_GPUS = [0, 1, 2, 3]
SFT_GPUS = [0, 1, 2, 3, 4, 5, 6, 7]
# Upper bound on generated tokens per sample (passed to model.generate).
MAX_NEW_TOKENS = 2048
# ================================
39
+
40
+
41
def load_test_data():
    """Read TEST_FILE (JSONL) and return its parsed records as a list.

    Empty or whitespace-only lines are skipped; every other line is parsed
    with json.loads.
    """
    with open(TEST_FILE, 'r', encoding='utf-8') as handle:
        return [json.loads(raw) for raw in handle if raw.strip()]
49
+
50
+
51
def build_open_ended_prompt(sample):
    """Compose the free-form (no multiple-choice options) prompt for one sample.

    The sample's 'description' and 'question' fields (each defaulting to an
    empty string when absent) are placed between a fixed instruction line
    and a fixed request to box the final answer; sections are separated by
    blank lines.
    """
    description = sample.get('description', '')
    question_text = sample.get('question', '')
    sections = (
        "Look at the image and answer the physics question.",
        f"{description}",
        f"{question_text}",
        "Please reason step by step, and put your final answer within \\boxed{}.",
    )
    return "\n\n".join(sections).strip()
65
+
66
+
67
def worker_inference(gpu_id, model_path, samples, output_file, model_name):
    """Load the model on one GPU and run generation over the assigned samples.

    Runs inside a child process started by run_model_parallel. Each result is
    written to ``output_file`` (JSONL) and flushed as soon as it is produced,
    so samples finished before a crash or kill are not lost.

    Args:
        gpu_id: CUDA device index; the model is placed on ``cuda:{gpu_id}``.
        model_path: Local checkpoint directory passed to ``from_pretrained``.
        samples: Sample dicts. Each must provide 'index', 'image', 'category',
            'question' and 'ground_truth_value'; 'description', 'subfield'
            and 'ground_truth_letter' are optional.
        output_file: Path of the per-GPU JSONL file to (over)write.
        model_name: Label used in log lines and stored in every record.

    Returns:
        The number of result records written.
    """
    # Heavy dependencies are imported inside the worker process.
    import torch
    from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
    from qwen_vl_utils import process_vision_info

    device = f"cuda:{gpu_id}"
    tag = f"[{model_name}][GPU {gpu_id}]"
    print(f"{tag} Loading model...", flush=True)

    processor = AutoProcessor.from_pretrained(
        model_path,
        min_pixels=3136,
        max_pixels=200704,
        local_files_only=True,
        trust_remote_code=True,
    )
    model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
        model_path,
        torch_dtype=torch.bfloat16,
        device_map=device,
        local_files_only=True,
        trust_remote_code=True,
    )
    model.eval()
    total = len(samples)
    print(f"{tag} Model loaded. Processing {total} samples.", flush=True)

    written = 0
    # The file is opened only after the model has loaded, so a failed load
    # does not leave an empty shard behind.
    with open(output_file, 'w', encoding='utf-8') as out:
        for i, sample in enumerate(samples):
            idx = sample['index']
            prompt_text = build_open_ended_prompt(sample)
            image_path = os.path.join(IMAGE_DIR, sample['image'])

            # Single-turn chat message: one image followed by the text prompt.
            messages = [{
                "role": "user",
                "content": [
                    {"type": "image", "image": f"file://{image_path}"},
                    {"type": "text", "text": prompt_text},
                ],
            }]

            try:
                text = processor.apply_chat_template(
                    messages, tokenize=False, add_generation_prompt=True
                )
                image_inputs, video_inputs = process_vision_info(messages)
                inputs = processor(
                    text=[text],
                    images=image_inputs,
                    videos=video_inputs,
                    padding=True,
                    return_tensors="pt",
                ).to(device)

                with torch.no_grad():
                    output_ids = model.generate(**inputs, max_new_tokens=MAX_NEW_TOKENS)

                # Drop the prompt tokens so only the newly generated text is decoded.
                generated = output_ids[0][inputs.input_ids.shape[1]:]
                response = processor.decode(generated, skip_special_tokens=True)
            except Exception as e:
                # Best effort: one bad sample must not abort the whole shard.
                # The error is logged and recorded in place of the model output.
                print(f"{tag} sample {idx} failed: {e!r}", flush=True)
                response = f"ERROR: {str(e)}"

            result = {
                "index": idx,
                "category": sample['category'],
                "subfield": sample.get('subfield', ''),
                "question": sample['question'],
                "ground_truth_value": sample['ground_truth_value'],
                "ground_truth_letter": sample.get('ground_truth_letter', ''),
                "model_output": response,
                "model_name": model_name,
                "gpu_id": gpu_id,
            }
            out.write(json.dumps(result, ensure_ascii=False) + '\n')
            out.flush()
            written += 1

            if (i + 1) % 20 == 0 or (i + 1) == total:
                print(f"{tag} {i+1}/{total} done", flush=True)

    print(f"{tag} Saved {written} results to {output_file}", flush=True)
    return written
151
+
152
+
153
def run_model_parallel(model_path, model_name, gpu_ids, samples, output_base):
    """Split ``samples`` into contiguous chunks and run one worker process per GPU.

    Args:
        model_path: Checkpoint path handed to every worker.
        model_name: Label forwarded to the workers for logging/records.
        gpu_ids: GPU indices to use; must not be empty.
        samples: Full list of samples to distribute.
        output_base: Path prefix; each worker writes ``{output_base}_gpu{id}.jsonl``.

    Returns:
        The list of per-GPU output file paths for the workers that were
        started (GPUs that received no samples are omitted).

    Raises:
        ValueError: If ``gpu_ids`` is empty.
    """
    if not gpu_ids:
        raise ValueError("gpu_ids must contain at least one GPU id")

    n = len(samples)
    k = len(gpu_ids)
    chunk_size = (n + k - 1) // k  # ceiling division

    workers = []
    output_files = []
    for i, gpu_id in enumerate(gpu_ids):
        chunk = samples[i * chunk_size: (i + 1) * chunk_size]
        if not chunk:
            continue
        out_file = f"{output_base}_gpu{gpu_id}.jsonl"
        output_files.append(out_file)
        proc = mp.Process(
            target=worker_inference,
            args=(gpu_id, model_path, chunk, out_file, model_name)
        )
        workers.append((gpu_id, proc))

    for _, proc in workers:
        proc.start()
    for _, proc in workers:
        proc.join()

    # A worker that died (OOM, CUDA error, ...) would otherwise go unnoticed
    # and simply leave a missing or partial shard behind.
    for gpu_id, proc in workers:
        if proc.exitcode != 0:
            print(
                f"[{model_name}][GPU {gpu_id}] WARNING: worker exited with code "
                f"{proc.exitcode}; its results may be missing or incomplete",
                flush=True,
            )

    return output_files
179
+
180
+
181
def merge_results(output_files, final_output):
    """Merge per-GPU JSONL shards into one file sorted by 'index'.

    Args:
        output_files: Paths of the per-GPU shard files. A missing shard is
            reported with a warning and skipped.
        final_output: Path of the merged JSONL file to (over)write.

    Returns:
        The merged list of records, sorted by their 'index' field.

    Side effects:
        Writes ``final_output`` and then deletes the shard files.
    """
    all_results = []
    for path in output_files:
        if not os.path.exists(path):
            # Surface the gap instead of silently producing a short merged file.
            print(
                f"WARNING: shard {path} not found; merged output will be incomplete",
                flush=True,
            )
            continue
        with open(path, 'r', encoding='utf-8') as fh:
            all_results.extend(json.loads(line) for line in fh if line.strip())

    # Sort by index for consistency
    all_results.sort(key=lambda rec: rec['index'])

    with open(final_output, 'w', encoding='utf-8') as out:
        for rec in all_results:
            out.write(json.dumps(rec, ensure_ascii=False) + '\n')

    # Cleanup per-GPU files (only after the merged file has been written)
    for path in output_files:
        if os.path.exists(path):
            os.remove(path)

    return all_results
204
+
205
+
206
def main():
    """Entry point: run SFT inference across SFT_GPUS and merge the shards.

    Base-model inference is skipped in this script; only the SFT checkpoint
    is evaluated and its merged results are written under OUTPUT_DIR.
    """
    mp.set_start_method('spawn', force=True)
    os.makedirs(OUTPUT_DIR, exist_ok=True)

    print("=" * 60)
    print(" OPEN-ENDED EVAL: Base vs SFT (Multi-GPU)")
    print(f" Base model: {BASE_MODEL}")
    print(f" SFT model: {SFT_MODEL}")
    print(f" Base GPUs: {BASE_GPUS}")
    print(f" SFT GPUs: {SFT_GPUS}")
    print("=" * 60)

    # Load test data
    samples = load_test_data()
    print(f"\nLoaded {len(samples)} test samples")

    # Per-category sample counts, most frequent first.
    cats = Counter(s['category'] for s in samples)
    for cat, cnt in cats.most_common():
        print(f" {cat}: {cnt}")

    t0 = time.time()

    sft_output = os.path.join(OUTPUT_DIR, "inference_results_combined_v3")
    sft_final = os.path.join(OUTPUT_DIR, "inference_results_combined_v3.jsonl")

    # Run the SFT model with one worker per GPU in SFT_GPUS.
    print("\n>>> Starting SFT model inference...", flush=True)
    sft_files = run_model_parallel(SFT_MODEL, "sft", SFT_GPUS, samples, sft_output)

    # Merge exactly the shards the workers were asked to write.
    sft_results = merge_results(sft_files, sft_final)

    elapsed = time.time() - t0
    print(f"\n{'=' * 60}")
    print(f" INFERENCE COMPLETE in {elapsed/60:.1f} min")
    print(" Base results: skipped (base inference is disabled in this script)")
    print(f" SFT results: {len(sft_results)} → {sft_final}")
    print(f"{'=' * 60}")


if __name__ == '__main__':
    main()
eval_footprint/eval_fullft_phyx_nf.py ADDED
@@ -0,0 +1,258 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Phase 1: Open-ended inference on cluster (multi-GPU, no internet needed).
4
+
5
+ Runs both Base and SFT models on the 1533 open-ended physics test set.
6
+ Saves raw model outputs for later judging.
7
+
8
+ Usage (inside Docker container):
9
+ cd /tmp && python3 /path/to/eval_openended_inference.py
10
+
11
+ Output:
12
+ sft_eval_footprint/inference_results_base.jsonl
13
+ sft_eval_footprint/inference_results_phyx_fullft_v3.jsonl
14
+ """
15
+ import os
16
+ import sys
17
+ import json
18
+ import re
19
+ import time
20
+ import torch
21
+ import multiprocessing as mp
22
+ from collections import Counter
23
+
24
+ # ============ CONFIG ============
25
+ os.environ["HF_HUB_OFFLINE"] = "1"
26
+ os.environ["TRANSFORMERS_OFFLINE"] = "1"
27
+
28
+ BASE_MODEL = "/workspace/rl4phyx/models/Qwen2.5-VL-3B-Instruct"
29
+ SFT_MODEL = "/workspace/rl4phyx/RL4Phyx/SFT/checkpoints/sft_phyx_fullft_v3/final"
30
+ TEST_FILE = "/workspace/rl4phyx/RL4Phyx/SFT/sft_eval_footprint/test_1533_openended.jsonl"
31
+ OUTPUT_DIR = "/workspace/rl4phyx/RL4Phyx/SFT/sft_eval_footprint"
32
+ IMAGE_DIR = "/workspace/rl4phyx/RL4Phyx/MetaPhyX/test_images"
33
+
34
+ # Multi-GPU config: base inference is skipped in main(); SFT inference uses every GPU listed in SFT_GPUS
35
+ BASE_GPUS = [0, 1, 2, 3]
36
+ SFT_GPUS = [0, 1, 2, 3, 4, 5, 6, 7]
37
+ MAX_NEW_TOKENS = 2048
38
+ # ================================
39
+
40
+
41
def load_test_data(path=None):
    """Load test samples from a JSONL file.

    Args:
        path: JSONL file to read. Defaults to the module-level ``TEST_FILE``
            so existing zero-argument callers keep working.

    Returns:
        A list holding one parsed JSON object per non-blank line, in file
        order.
    """
    if path is None:
        path = TEST_FILE
    with open(path, 'r', encoding='utf-8') as f:
        # Blank / whitespace-only lines are skipped rather than parsed.
        return [json.loads(line) for line in f if line.strip()]
49
+
50
+
51
def build_open_ended_prompt(sample):
    """Build an open-ended prompt (no MCQ options) for one sample.

    Args:
        sample: Mapping that may carry 'description' and 'question' entries.

    Returns:
        The prompt text with leading/trailing whitespace stripped.
    """
    # `or ''` also covers keys that are present but null; with a plain
    # .get(key, '') such values were rendered as the literal text "None".
    desc = sample.get('description') or ''
    question = sample.get('question') or ''

    prompt = (
        "Look at the image and answer the physics question.\n"
        "\n"
        f"{desc}\n"
        "\n"
        f"{question}\n"
        "\n"
        "Please reason step by step, and put your final answer within \\boxed{}.\n"
    )
    return prompt.strip()
65
+
66
+
67
def worker_inference(gpu_id, model_path, samples, output_file, model_name):
    """Worker: load the model on one GPU and run inference on the assigned samples.

    Every result is written to ``output_file`` (JSONL) and flushed as soon as
    it is produced, so a worker that dies mid-run still leaves its finished
    samples on disk.

    Args:
        gpu_id: CUDA device ordinal; the model is placed on ``cuda:{gpu_id}``.
        model_path: Local checkpoint directory (loaded with
            ``local_files_only=True``).
        samples: Sample dicts; each must provide 'index', 'image', 'category',
            'question' and 'ground_truth_value'.
        output_file: Path of the per-GPU JSONL file, overwritten on start.
        model_name: Label used in log lines and stored with every result.

    Returns:
        The number of results written.
    """
    import torch
    from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
    from qwen_vl_utils import process_vision_info

    device = f"cuda:{gpu_id}"
    print(f"[{model_name}][GPU {gpu_id}] Loading model...", flush=True)

    processor = AutoProcessor.from_pretrained(
        model_path,
        min_pixels=3136,
        max_pixels=200704,
        local_files_only=True,
        trust_remote_code=True,
    )
    model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
        model_path,
        torch_dtype=torch.bfloat16,
        device_map=device,
        local_files_only=True,
        trust_remote_code=True,
    )
    model.eval()
    print(f"[{model_name}][GPU {gpu_id}] Model loaded. Processing {len(samples)} samples.", flush=True)

    written = 0
    with open(output_file, 'w', encoding='utf-8') as out:
        for i, sample in enumerate(samples):
            idx = sample['index']
            prompt_text = build_open_ended_prompt(sample)
            image_path = os.path.join(IMAGE_DIR, sample['image'])

            # Single-turn chat message: one image followed by the text prompt.
            messages = [{
                "role": "user",
                "content": [
                    {"type": "image", "image": f"file://{image_path}"},
                    {"type": "text", "text": prompt_text},
                ],
            }]

            try:
                text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
                image_inputs, video_inputs = process_vision_info(messages)
                inputs = processor(
                    text=[text],
                    images=image_inputs,
                    videos=video_inputs,
                    padding=True,
                    return_tensors="pt",
                ).to(device)

                with torch.no_grad():
                    output_ids = model.generate(**inputs, max_new_tokens=MAX_NEW_TOKENS)

                # Drop the prompt tokens so only the newly generated text is decoded.
                generated = output_ids[0][inputs.input_ids.shape[1]:]
                response = processor.decode(generated, skip_special_tokens=True)
            except Exception as e:
                # Best effort: a failing sample is recorded and logged, not fatal.
                print(f"[{model_name}][GPU {gpu_id}] sample {idx} failed: {e}", flush=True)
                response = f"ERROR: {str(e)}"

            result = {
                "index": idx,
                "category": sample['category'],
                "subfield": sample.get('subfield', ''),
                "question": sample['question'],
                "ground_truth_value": sample['ground_truth_value'],
                "ground_truth_letter": sample.get('ground_truth_letter', ''),
                "model_output": response,
                "model_name": model_name,
                "gpu_id": gpu_id,
            }
            out.write(json.dumps(result, ensure_ascii=False) + '\n')
            out.flush()
            written += 1

            if (i + 1) % 20 == 0 or (i + 1) == len(samples):
                print(f"[{model_name}][GPU {gpu_id}] {i+1}/{len(samples)} done", flush=True)

    print(f"[{model_name}][GPU {gpu_id}] Saved {written} results to {output_file}", flush=True)
    return written
151
+
152
+
153
def run_model_parallel(model_path, model_name, gpu_ids, samples, output_base):
    """Split samples across GPUs and run one inference worker per GPU in parallel.

    Args:
        model_path: Checkpoint path handed to every worker.
        model_name: Label handed to every worker (used in its logs/results).
        gpu_ids: GPU ordinals to use; one process is started per GPU that
            receives a non-empty chunk.
        samples: Full list of samples, split into contiguous chunks.
        output_base: Path prefix; each worker writes
            ``{output_base}_gpu{gpu_id}.jsonl``.

    Returns:
        The per-GPU output file paths of the workers that were started
        (empty when there are no GPUs or no samples).
    """
    n = len(samples)
    k = len(gpu_ids)
    if n == 0 or k == 0:
        # Nothing to schedule; also avoids dividing by zero for an empty GPU list.
        return []
    chunk_size = (n + k - 1) // k  # ceiling division

    jobs = []
    for i, gpu_id in enumerate(gpu_ids):
        chunk = samples[i * chunk_size: (i + 1) * chunk_size]
        if not chunk:
            continue
        out_file = f"{output_base}_gpu{gpu_id}.jsonl"
        proc = mp.Process(
            target=worker_inference,
            args=(gpu_id, model_path, chunk, out_file, model_name)
        )
        jobs.append((gpu_id, out_file, proc))

    for _, _, proc in jobs:
        proc.start()
    for gpu_id, _, proc in jobs:
        proc.join()
        if proc.exitcode != 0:
            # Surface crashed workers instead of silently losing their chunk.
            print(f"[{model_name}][GPU {gpu_id}] WARNING: worker exited with code "
                  f"{proc.exitcode}; its results may be missing or partial", flush=True)

    return [out_file for _, out_file, _ in jobs]
179
+
180
+
181
def merge_results(output_files, final_output):
    """Merge per-GPU result files into one JSONL file sorted by 'index'.

    Args:
        output_files: Per-GPU JSONL shard paths. Paths that do not exist are
            skipped with a warning.
        final_output: Destination JSONL path (overwritten).

    Returns:
        The merged list of result dicts, sorted by their 'index' value.
    """
    all_results = []
    for path in output_files:
        if not os.path.exists(path):
            print(f"WARNING: shard {path} not found; skipping", flush=True)
            continue
        with open(path, 'r', encoding='utf-8') as fh:
            all_results.extend(json.loads(line) for line in fh if line.strip())

    # Sort by index for consistency
    all_results.sort(key=lambda r: r['index'])

    with open(final_output, 'w', encoding='utf-8') as f:
        f.writelines(json.dumps(r, ensure_ascii=False) + '\n' for r in all_results)

    # Cleanup per-GPU files (only after the merged file has been written)
    for path in output_files:
        if os.path.exists(path):
            os.remove(path)

    return all_results
204
+
205
+
206
def main():
    """Run SFT-model inference on the test set and merge the per-GPU shards.

    Base-model inference is disabled in this script: only the SFT checkpoint
    is evaluated, and its merged output is written to
    ``inference_results_phyx_fullft_v3.jsonl`` under ``OUTPUT_DIR``.
    """
    # Workers are started with 'spawn' so each one gets a fresh interpreter
    # (presumably to keep CUDA initialisation out of forked children - confirm).
    mp.set_start_method('spawn', force=True)
    os.makedirs(OUTPUT_DIR, exist_ok=True)

    print("=" * 60)
    print(" OPEN-ENDED EVAL: Base vs SFT (Multi-GPU)")
    print(f" Base model: {BASE_MODEL}")
    print(f" SFT model: {SFT_MODEL}")
    print(f" Base GPUs: {BASE_GPUS}")
    print(f" SFT GPUs: {SFT_GPUS}")
    print("=" * 60)

    # Load test data
    samples = load_test_data()
    print(f"\nLoaded {len(samples)} test samples")

    cats = Counter(s['category'] for s in samples)
    for cat, cnt in sorted(cats.items(), key=lambda x: -x[1]):
        print(f" {cat}: {cnt}")

    t0 = time.time()

    sft_output = os.path.join(OUTPUT_DIR, "inference_results_phyx_fullft_v3")
    sft_final = os.path.join(OUTPUT_DIR, "inference_results_phyx_fullft_v3.jsonl")

    # One worker per entry in SFT_GPUS. Merge exactly the shards this run
    # produced, so a stale "*_gpuN.jsonl" left by an earlier run is not
    # picked up by accident.
    print("\n>>> Starting SFT model inference...", flush=True)
    sft_files = run_model_parallel(SFT_MODEL, "sft", SFT_GPUS, samples, sft_output)
    sft_results = merge_results(sft_files, sft_final)

    elapsed = time.time() - t0
    print(f"\n{'=' * 60}")
    print(f" INFERENCE COMPLETE in {elapsed/60:.1f} min")
    print(" Base results: skipped (base inference is disabled in this script)")
    print(f" SFT results: {len(sft_results)} → {sft_final}")
    if len(sft_results) != len(samples):
        # A crashed worker leaves no shard behind, so make the gap visible.
        print(f" WARNING: expected {len(samples)} results but merged {len(sft_results)}")
    print(f"{'=' * 60}")


if __name__ == '__main__':
    main()
eval_footprint/eval_inference_lora_math_f.py ADDED
@@ -0,0 +1,262 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Phase 1: Open-ended inference on cluster (multi-GPU, no internet needed).
4
+
5
+ Runs both Base and SFT models on the 1533 open-ended physics test set.
6
+ Saves raw model outputs for later judging.
7
+
8
+ Usage (inside Docker container):
9
+ cd /tmp && python3 /path/to/eval_openended_inference.py
10
+
11
+ Output:
12
+ sft_eval_footprint/inference_results_base.jsonl
13
+ sft_eval_footprint/inference_results_lora_math_f.jsonl
14
+ """
15
+ import os
16
+ import sys
17
+ import json
18
+ import re
19
+ import time
20
+ import torch
21
+ import multiprocessing as mp
22
+ from collections import Counter
23
+
24
+ # ============ CONFIG ============
25
+ os.environ["HF_HUB_OFFLINE"] = "1"
26
+ os.environ["TRANSFORMERS_OFFLINE"] = "1"
27
+
28
+ BASE_MODEL = "/workspace/rl4phyx/models/Qwen2.5-VL-3B-Instruct"
29
+ SFT_MODEL = "/workspace/rl4phyx/RL4Phyx/SFT/checkpoints/lora_math_f/merged"
30
+ TEST_FILE = "/workspace/rl4phyx/RL4Phyx/SFT/sft_eval_footprint/test_1533_openended.jsonl"
31
+ OUTPUT_DIR = "/workspace/rl4phyx/RL4Phyx/SFT/sft_eval_footprint"
32
+ IMAGE_DIR = "/workspace/rl4phyx/RL4Phyx/MetaPhyX/test_images"
33
+
34
+ # Multi-GPU config: an empty BASE_GPUS list skips the base model; SFT inference uses every GPU listed in SFT_GPUS
35
+ BASE_GPUS = []
36
+ SFT_GPUS = [0, 1, 2, 3, 4, 5, 6, 7]
37
+ MAX_NEW_TOKENS = 2048
38
+ # ================================
39
+
40
+
41
def load_test_data(path=None):
    """Load test samples from a JSONL file.

    Args:
        path: JSONL file to read. Defaults to the module-level ``TEST_FILE``
            so existing zero-argument callers keep working.

    Returns:
        A list holding one parsed JSON object per non-blank line, in file
        order.
    """
    if path is None:
        path = TEST_FILE
    with open(path, 'r', encoding='utf-8') as f:
        # Blank / whitespace-only lines are skipped rather than parsed.
        return [json.loads(line) for line in f if line.strip()]
49
+
50
+
51
def build_open_ended_prompt(sample):
    """Build an open-ended prompt (no MCQ options) for one sample.

    Args:
        sample: Mapping that may carry 'description' and 'question' entries.

    Returns:
        The prompt text with leading/trailing whitespace stripped.
    """
    # `or ''` also covers keys that are present but null; with a plain
    # .get(key, '') such values were rendered as the literal text "None".
    desc = sample.get('description') or ''
    question = sample.get('question') or ''

    prompt = (
        "Look at the image and answer the physics question.\n"
        "\n"
        f"{desc}\n"
        "\n"
        f"{question}\n"
        "\n"
        "Please reason step by step, and put your final answer within \\boxed{}.\n"
    )
    return prompt.strip()
65
+
66
+
67
def worker_inference(gpu_id, model_path, samples, output_file, model_name):
    """Worker: load the model on one GPU and run inference on the assigned samples.

    Every result is written to ``output_file`` (JSONL) and flushed as soon as
    it is produced, so a worker that dies mid-run still leaves its finished
    samples on disk.

    Args:
        gpu_id: CUDA device ordinal; the model is placed on ``cuda:{gpu_id}``.
        model_path: Local checkpoint directory (loaded with
            ``local_files_only=True``).
        samples: Sample dicts; each must provide 'index', 'image', 'category',
            'question' and 'ground_truth_value'.
        output_file: Path of the per-GPU JSONL file, overwritten on start.
        model_name: Label used in log lines and stored with every result.

    Returns:
        The number of results written.
    """
    import torch
    from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
    from qwen_vl_utils import process_vision_info

    device = f"cuda:{gpu_id}"
    print(f"[{model_name}][GPU {gpu_id}] Loading model...", flush=True)

    processor = AutoProcessor.from_pretrained(
        model_path,
        min_pixels=3136,
        max_pixels=200704,
        local_files_only=True,
        trust_remote_code=True,
    )
    model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
        model_path,
        torch_dtype=torch.bfloat16,
        device_map=device,
        local_files_only=True,
        trust_remote_code=True,
    )
    model.eval()
    print(f"[{model_name}][GPU {gpu_id}] Model loaded. Processing {len(samples)} samples.", flush=True)

    written = 0
    with open(output_file, 'w', encoding='utf-8') as out:
        for i, sample in enumerate(samples):
            idx = sample['index']
            prompt_text = build_open_ended_prompt(sample)
            image_path = os.path.join(IMAGE_DIR, sample['image'])

            # Single-turn chat message: one image followed by the text prompt.
            messages = [{
                "role": "user",
                "content": [
                    {"type": "image", "image": f"file://{image_path}"},
                    {"type": "text", "text": prompt_text},
                ],
            }]

            try:
                text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
                image_inputs, video_inputs = process_vision_info(messages)
                inputs = processor(
                    text=[text],
                    images=image_inputs,
                    videos=video_inputs,
                    padding=True,
                    return_tensors="pt",
                ).to(device)

                with torch.no_grad():
                    output_ids = model.generate(**inputs, max_new_tokens=MAX_NEW_TOKENS)

                # Drop the prompt tokens so only the newly generated text is decoded.
                generated = output_ids[0][inputs.input_ids.shape[1]:]
                response = processor.decode(generated, skip_special_tokens=True)
            except Exception as e:
                # Best effort: a failing sample is recorded and logged, not fatal.
                print(f"[{model_name}][GPU {gpu_id}] sample {idx} failed: {e}", flush=True)
                response = f"ERROR: {str(e)}"

            result = {
                "index": idx,
                "category": sample['category'],
                "subfield": sample.get('subfield', ''),
                "question": sample['question'],
                "ground_truth_value": sample['ground_truth_value'],
                "ground_truth_letter": sample.get('ground_truth_letter', ''),
                "model_output": response,
                "model_name": model_name,
                "gpu_id": gpu_id,
            }
            out.write(json.dumps(result, ensure_ascii=False) + '\n')
            out.flush()
            written += 1

            if (i + 1) % 20 == 0 or (i + 1) == len(samples):
                print(f"[{model_name}][GPU {gpu_id}] {i+1}/{len(samples)} done", flush=True)

    print(f"[{model_name}][GPU {gpu_id}] Saved {written} results to {output_file}", flush=True)
    return written
151
+
152
+
153
def run_model_parallel(model_path, model_name, gpu_ids, samples, output_base):
    """Split samples across GPUs and run one inference worker per GPU in parallel.

    Args:
        model_path: Checkpoint path handed to every worker.
        model_name: Label handed to every worker (used in its logs/results).
        gpu_ids: GPU ordinals to use; one process is started per GPU that
            receives a non-empty chunk.
        samples: Full list of samples, split into contiguous chunks.
        output_base: Path prefix; each worker writes
            ``{output_base}_gpu{gpu_id}.jsonl``.

    Returns:
        The per-GPU output file paths of the workers that were started
        (empty when there are no GPUs or no samples).
    """
    n = len(samples)
    k = len(gpu_ids)
    if n == 0 or k == 0:
        # Nothing to schedule; also avoids dividing by zero for an empty GPU list.
        return []
    chunk_size = (n + k - 1) // k  # ceiling division

    jobs = []
    for i, gpu_id in enumerate(gpu_ids):
        chunk = samples[i * chunk_size: (i + 1) * chunk_size]
        if not chunk:
            continue
        out_file = f"{output_base}_gpu{gpu_id}.jsonl"
        proc = mp.Process(
            target=worker_inference,
            args=(gpu_id, model_path, chunk, out_file, model_name)
        )
        jobs.append((gpu_id, out_file, proc))

    for _, _, proc in jobs:
        proc.start()
    for gpu_id, _, proc in jobs:
        proc.join()
        if proc.exitcode != 0:
            # Surface crashed workers instead of silently losing their chunk.
            print(f"[{model_name}][GPU {gpu_id}] WARNING: worker exited with code "
                  f"{proc.exitcode}; its results may be missing or partial", flush=True)

    return [out_file for _, out_file, _ in jobs]
179
+
180
+
181
def merge_results(output_files, final_output):
    """Merge per-GPU result files into one JSONL file sorted by 'index'.

    Args:
        output_files: Per-GPU JSONL shard paths. Paths that do not exist are
            skipped with a warning.
        final_output: Destination JSONL path (overwritten).

    Returns:
        The merged list of result dicts, sorted by their 'index' value.
    """
    all_results = []
    for path in output_files:
        if not os.path.exists(path):
            print(f"WARNING: shard {path} not found; skipping", flush=True)
            continue
        with open(path, 'r', encoding='utf-8') as fh:
            all_results.extend(json.loads(line) for line in fh if line.strip())

    # Sort by index for consistency
    all_results.sort(key=lambda r: r['index'])

    with open(final_output, 'w', encoding='utf-8') as f:
        f.writelines(json.dumps(r, ensure_ascii=False) + '\n' for r in all_results)

    # Cleanup per-GPU files (only after the merged file has been written)
    for path in output_files:
        if os.path.exists(path):
            os.remove(path)

    return all_results
204
+
205
+
206
def main():
    """Run inference for the configured models and merge the per-GPU shards.

    The base model is evaluated only when ``BASE_GPUS`` is non-empty; the SFT
    model is always evaluated on ``SFT_GPUS``. Merged outputs are written to
    ``inference_results_base.jsonl`` and ``inference_results_lora_math_f.jsonl``
    under ``OUTPUT_DIR``.
    """
    # Workers are started with 'spawn' so each one gets a fresh interpreter
    # (presumably to keep CUDA initialisation out of forked children - confirm).
    mp.set_start_method('spawn', force=True)
    os.makedirs(OUTPUT_DIR, exist_ok=True)

    print("=" * 60)
    print(" OPEN-ENDED EVAL: Base vs SFT (Multi-GPU)")
    print(f" Base model: {BASE_MODEL}")
    print(f" SFT model: {SFT_MODEL}")
    print(f" Base GPUs: {BASE_GPUS}")
    print(f" SFT GPUs: {SFT_GPUS}")
    print("=" * 60)

    # Load test data
    samples = load_test_data()
    print(f"\nLoaded {len(samples)} test samples")

    cats = Counter(s['category'] for s in samples)
    for cat, cnt in sorted(cats.items(), key=lambda x: -x[1]):
        print(f" {cat}: {cnt}")

    t0 = time.time()

    base_output = os.path.join(OUTPUT_DIR, "inference_results_base")
    sft_output = os.path.join(OUTPUT_DIR, "inference_results_lora_math_f")
    base_final = os.path.join(OUTPUT_DIR, "inference_results_base.jsonl")
    sft_final = os.path.join(OUTPUT_DIR, "inference_results_lora_math_f.jsonl")

    # The two models run one after the other; each run is parallel across
    # its own GPU list.
    if BASE_GPUS:
        print("\n>>> Starting BASE model inference...", flush=True)
        base_files = run_model_parallel(BASE_MODEL, "base", BASE_GPUS, samples, base_output)
    else:
        print("\n>>> SKIPPING BASE model (BASE_GPUS is empty)", flush=True)
        base_files = []

    print("\n>>> Starting SFT model inference...", flush=True)
    sft_files = run_model_parallel(SFT_MODEL, "sft", SFT_GPUS, samples, sft_output)

    # Merge exactly the shards this run produced, so a stale "*_gpuN.jsonl"
    # left by an earlier run is not picked up by accident.
    base_results = merge_results(base_files, base_final) if BASE_GPUS else []
    sft_results = merge_results(sft_files, sft_final)

    elapsed = time.time() - t0
    print(f"\n{'=' * 60}")
    print(f" INFERENCE COMPLETE in {elapsed/60:.1f} min")
    if BASE_GPUS:
        print(f" Base results: {len(base_results)} → {base_final}")
    else:
        print(" Base results: skipped (BASE_GPUS is empty)")
    print(f" SFT results: {len(sft_results)} → {sft_final}")
    if len(sft_results) != len(samples):
        # A crashed worker leaves no shard behind, so make the gap visible.
        print(f" WARNING: expected {len(samples)} SFT results but merged {len(sft_results)}")
    print(f"{'=' * 60}")


if __name__ == '__main__':
    main()
eval_footprint/eval_judge_fullft_math_nf.py ADDED
@@ -0,0 +1,293 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Score the coldstart inference results using the DeepSeek-V3 judge (20 threads).
4
+ PhyX-aligned pipeline: extract -> string match -> LLM judge (5 retries).
5
+ """
6
+ import json, os, re, time, sys
7
+ from collections import defaultdict
8
+ from concurrent.futures import ThreadPoolExecutor, as_completed
9
+ import urllib.request
10
+ import urllib.error
11
+
12
# ===================== CONFIG =====================
# The API key is read from the environment so that no credential is stored in
# the source tree; export DEEPSEEK_API_KEY before running this script.
DEEPSEEK_API_KEY = os.environ.get("DEEPSEEK_API_KEY", "")
DEEPSEEK_MODEL = "deepseek-chat"
DEEPSEEK_API_URL = "https://api.deepseek.com/v1/chat/completions"

RESULTS_DIR = "/workspace/rl4phyx/RL4Phyx/SFT/sft_eval_footprint"
OUTPUT_DIR = "/data1/dhelix_shared/hku/rl4phyx/RL4Phyx/SFT/result"

COLDSTART_RESULTS = os.path.join(RESULTS_DIR, "inference_results_coldstart.jsonl")

# Sentinel returned by call_deepseek() when a request fails.
FAIL_MSG = 'Failed to obtain answer via API.'
# Maximum number of judge attempts per sample.
RETRY = 5
# ==================================================
26
+
27
+ # Return the five in-context examples shown to the judge model. Each example
+ # is a text block holding a ground-truth answer, a predicted answer and the
+ # expected 1/0 label. The "Judegement" spelling is part of the prompt text
+ # (build_phyx_gpt4_prompt ends its prompt with the same word), so it is
+ # runtime data and must not be "corrected" here.
+ def get_ICE():
28
+ example_1 = """
29
+ Ground truth answer: 502 \n
30
+ Predicted answer: The mass of block (B) is:
31
+ [
32
+ \\boxed{ 50 \\sqrt{101} }
33
+ ] \n
34
+ Judegement: 1
35
+ """
36
+ example_2 = """
37
+ Ground truth answer: 46.3 kN \n
38
+ Predicted answer: The tension ( T_B ) in the cable is approximately:
39
+ [
40
+ \\boxed{46300 }
41
+ ] \n
42
+ Judegement: 1
43
+ """
44
+ example_3 = """
45
+ Ground truth answer: 12 m/s \n
46
+ Predicted answer: The speed of the box after 2.00 seconds is:
47
+ [
48
+ \\boxed{11.3, \\text{m/s}}
49
+ ] \n
50
+ Judegement: 0
51
+ """
52
+ example_4 = """
53
+ Ground truth answer: 36.00 kg \n
54
+ Predicted answer: The mass of the hanging block ( m_2 ) must be approximately:
55
+ [
56
+ \\boxed{36.1, \\text\\{kg\\}}
57
+ ] \n
58
+ Judegement: 1
59
+ """
60
+ example_5 = """
61
+ Ground truth answer: 3.2 m \n
62
+ Predicted answer: The stuntman and villain slide approximately \\frac{10}{3.1415} meters**.
63
+ Judegement: 1
64
+ """
65
+ return [example_1, example_2, example_3, example_4, example_5]
66
+
67
+
68
+ # Assemble the judge prompt: the task description, every in-context example
+ # from get_ICE() (each followed by a newline), then the ground-truth and
+ # predicted answers of the sample being judged, ending with an open
+ # "Judegement:" line for the model to complete. Returns the prompt string.
+ def build_phyx_gpt4_prompt(gt_answer, pred):
69
+ task_description = """
70
+ Please read the following example. Given predicted answer and ground truth answer,
71
+ compare the these two answers, then ONLY output judegement 1/0 for matched/unmatched at the end of the prompt.
72
+ If the meaning is expressed in the same way, it is also considered consistent, for example, 0.5m and 50cm.
73
+ If the given predicted mentions "approximately", then allow the Approximation Error, \
74
+ such as 0.49 and approximately 0.5, 0.81 and approximately 0.8. \n
75
+
76
+ """
77
+ prompt = task_description
78
+ for example in get_ICE():
79
+ prompt += example + '\n'
80
+ prompt += 'Ground truth answer: {} \n'.format(gt_answer)
81
+ prompt += 'Predicted answer: {} \n'.format(pred)
82
+ prompt += 'Judegement:'
83
+ return prompt
84
+
85
+
86
def mapping_str(input_str):
    r"""Normalise a few LaTeX fragments in an extracted answer.

    Replaces ``\dfrac`` with ``\frac`` and ``\pi`` with the literal ``3.14``.

    Args:
        input_str: The extracted answer text.

    Returns:
        The normalised string. A non-string argument is returned unchanged.
    """
    if not isinstance(input_str, str):
        # Explicit pass-through instead of relying on a swallowed exception.
        return input_str
    replacements = {"\\dfrac": "\\frac", "\\pi": "3.14"}
    output = input_str
    for old, new in replacements.items():
        output = output.replace(old, new)
    return output
95
+
96
+
97
def extract_boxed_content(s):
    r"""Return the text inside the first ``\boxed{...}`` occurrence in *s*.

    Nested braces inside the box are kept. Returns ``None`` when *s* has no
    ``\boxed{`` or when its closing brace is never found.
    """
    marker = r'\boxed{'
    begin = s.find(marker)
    if begin < 0:
        return None

    body = s[begin + len(marker):]
    open_braces = 0  # braces opened inside the box and not yet closed
    for pos in range(len(body)):
        c = body[pos]
        if c == '}':
            if open_braces == 0:
                # This brace closes the \boxed{ itself.
                return body[:pos]
            open_braces -= 1
        elif c == '{':
            open_braces += 1
    return None
113
+
114
+
115
def PhyX_process_line(prediction_str, gt_answer):
    """Extract the answer from a model prediction and string-match it against the ground truth.

    Returns a dict with keys 'gt' (ground truth as str), 'pred' (stripped
    prediction), 'extracted' (boxed content, the text after a
    "final answer"/"correct answer" label, or the marker "SAME as predict")
    and 'match' (1 or 0).
    """
    gt = str(gt_answer)
    pred = prediction_str.strip()
    ret = {'gt': gt, 'pred': pred}

    # An upstream API failure is never a match.
    if pred == FAIL_MSG:
        ret['match'] = 0
        ret["extracted"] = "Fail to Call API"
        return ret

    boxed = extract_boxed_content(pred)
    if boxed is None:
        # No \boxed{}: fall back to text following a "final/correct answer" label.
        pattern = r'\b(?:final\s+answer|correct\s+answer)\b[^::]*[::]\s*(.*?)(?=\n\n\n|\Z)'
        found = re.search(pattern, pred, re.IGNORECASE | re.DOTALL)
        ret["extracted"] = mapping_str(found.group(1)) if found else "SAME as predict"
    else:
        ret["extracted"] = mapping_str(boxed)

    gt_norm = gt.strip().lower()
    extracted_norm = ret["extracted"].strip().lower()
    pred_norm = pred.strip().lower()
    matched = gt_norm == extracted_norm or gt_norm == pred_norm or gt in pred
    ret['match'] = 1 if matched else 0
    return ret
141
+
142
+
143
def call_deepseek(prompt, temperature=0.0):
    """Send one single-turn chat request to the DeepSeek API.

    Args:
        prompt: User message content.
        temperature: Sampling temperature forwarded to the API.

    Returns:
        The reply text, or ``FAIL_MSG`` when the request fails or the
        response does not contain a text reply.
    """
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {DEEPSEEK_API_KEY}"
    }
    payload = json.dumps({
        "model": DEEPSEEK_MODEL,
        "messages": [{"role": "user", "content": prompt}],
        "temperature": temperature,
        "max_tokens": 200,
    }).encode('utf-8')
    try:
        req = urllib.request.Request(DEEPSEEK_API_URL, data=payload, headers=headers)
        with urllib.request.urlopen(req, timeout=30) as resp:
            result = json.loads(resp.read().decode())
        content = result['choices'][0]['message']['content']
    except Exception as exc:
        # Boundary handler: network, HTTP and malformed-response errors all
        # map to FAIL_MSG, but the reason is logged and KeyboardInterrupt /
        # SystemExit are no longer swallowed as they were by a bare except.
        print(f" DeepSeek request failed: {type(exc).__name__}: {exc}",
              file=sys.stderr, flush=True)
        return FAIL_MSG
    # Callers run substring checks on the reply, so never hand back a non-string.
    return content if isinstance(content, str) else FAIL_MSG
161
+
162
+
163
def PhyX_auxeval(gt_answer, prediction):
    """Judge one prediction: string match first, then the LLM judge with retries.

    Args:
        gt_answer: Ground-truth answer (converted with ``str`` for comparison).
        prediction: Raw model output text.

    Returns:
        A dict with 'log' (how the verdict was reached), 'res' (1 for a
        match, 0 otherwise) and 'extracted' (the answer text that was judged).
    """
    log = ''
    tmp = PhyX_process_line(prediction, gt_answer)
    if tmp["extracted"] == "Fail to Call API":
        return dict(log="Fail to Call API", res=0, extracted="Fail to Call API")

    # When nothing could be extracted, judge the whole prediction.
    prediction_extracted = tmp["extracted"] if tmp["extracted"] != "SAME as predict" else prediction
    if str(gt_answer).strip().lower() == prediction_extracted.strip().lower():
        return dict(log="Matched at string level", res=1, extracted=prediction_extracted)

    prompt = build_phyx_gpt4_prompt(gt_answer, prediction_extracted)
    for i in range(RETRY):
        # Temperature rises by 0.5 per retry; capped at 2.0 so a larger RETRY
        # cannot request an out-of-range value (identical for RETRY <= 5).
        res = call_deepseek(prompt, temperature=min(i * 0.5, 2.0))
        if FAIL_MSG in res:
            log += f'Try {i}: failed.\n'
            continue
        log += 'Compared at semantic level. '
        # NOTE(review): these are substring tests, so a reply such as
        # "0 (11.3 != 12)" is scored as a match. Presumably kept for parity
        # with the PhyX reference scorer - confirm before tightening.
        if "1" in res:
            log += "Semantic equal via LLM."
            return dict(log=log, res=1, extracted=prediction_extracted)
        if "0" in res:
            log += f"LLM judgement {res}"
            return dict(log=log, res=0, extracted=prediction_extracted)
        # Neither digit present: record the reply, then retry.
        log += f'Try {i}: unparseable judgement {res!r}.\n'
    log += f'All {RETRY} retries failed.\n'
    return dict(log=log, res=0, extracted=prediction_extracted)
186
+
187
+
188
def _eval_single(args):
    """Judge one result record in place.

    *args* is an ``(index, record)`` pair. The verdict is stored on the
    record under 'extracted_answer', 'eval_log' and 'res', and the same
    ``(index, record)`` pair is returned.
    """
    idx, record = args
    verdict = PhyX_auxeval(record['ground_truth_value'], record['model_output'])
    record['extracted_answer'] = verdict['extracted']
    record['eval_log'] = verdict['log']
    record['res'] = verdict['res']
    return idx, record
197
+
198
+
199
def score_results(results_file, model_name, output_file):
    """Score every record of an inference-results JSONL file and write the scored copy.

    Args:
        results_file: Input JSONL with one inference result per line.
        model_name: Label used in the printed report and the returned stats.
        output_file: Destination JSONL for the scored records.

    Returns:
        A dict with the overall counts, the accuracy in percent
        ('final_acc', 0.0 for an empty input) and per-category statistics.
    """
    with open(results_file, 'r', encoding='utf-8') as f:
        results = [json.loads(line) for line in f if line.strip()]

    print(f"\n{'='*60}")
    print(f" Scoring: {model_name} ({len(results)} samples)")
    print(f" Using PhyX-aligned pipeline with DeepSeek-V3 judge (20 threads)")
    print(f"{'='*60}")

    total = len(results)
    with ThreadPoolExecutor(max_workers=20) as executor:
        futures = {executor.submit(_eval_single, (i, r)): i for i, r in enumerate(results)}
        done = 0
        for future in as_completed(futures):
            done += 1
            try:
                # Retrieve the result so a worker exception is not lost.
                future.result()
            except Exception as exc:
                failed = results[futures[future]]
                failed.setdefault('extracted_answer', '')
                failed['eval_log'] = f"Evaluation error: {type(exc).__name__}: {exc}"
                failed['res'] = 0
                print(f" sample {futures[future]} could not be scored: {exc}", flush=True)
            if done % 100 == 0 or done == total:
                hit = sum(1 for r in results if r.get('res') == 1)
                print(f" [{done}/{total}] processed, correct={hit}", flush=True)

    hit = 0
    string_match = 0
    llm_match = 0
    llm_called = 0
    cat_stats = defaultdict(lambda: {'total': 0, 'correct': 0})
    for r in results:
        cat = r.get('category', 'unknown')
        cat_stats[cat]['total'] += 1
        if r.get('res') == 1:
            hit += 1
            cat_stats[cat]['correct'] += 1
        # The log text records which stage produced the verdict.
        log = r.get('eval_log', '')
        if "string level" in log:
            string_match += 1
        elif "semantic level" in log or "LLM judgement" in log:
            llm_called += 1
            if r.get('res') == 1:
                llm_match += 1

    # An empty results file yields 0% instead of a ZeroDivisionError.
    acc = hit / total * 100 if total else 0.0
    with open(output_file, 'w', encoding='utf-8') as f:
        for r in results:
            f.write(json.dumps(r, ensure_ascii=False) + '\n')

    print(f"\n {model_name}: {hit}/{total} ({acc:.1f}%)")
    print(f" String: {string_match}, LLM calls: {llm_called}, LLM match: {llm_match}")
    print(f" Per category:")
    for cat, s in sorted(cat_stats.items(), key=lambda x: -x[1]['total']):
        cat_acc = s['correct'] / s['total'] * 100 if s['total'] > 0 else 0
        print(f" {cat:25s}: {s['correct']:3d}/{s['total']:3d} ({cat_acc:5.1f}%)")

    return {
        'model': model_name, 'total': total,
        'string_matches': string_match, 'llm_calls': llm_called, 'llm_matches': llm_match,
        'final_correct': hit, 'final_acc': round(acc, 2),
        'category_stats': {k: dict(v) for k, v in cat_stats.items()}
    }
258
+
259
+
260
def main():
    """Check the judge API, score the coldstart results and write a JSON report."""
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    rule = "=" * 60

    print(rule)
    print(" PhyX-ALIGNED EVAL: Coldstart")
    print(f" Output: {OUTPUT_DIR}")
    print(rule)

    # Fail fast when the judge API cannot be reached.
    print("\nTesting DeepSeek API...")
    probe = call_deepseek("Say 'OK' if you can read this.")
    if probe == FAIL_MSG:
        print(" API FAILED")
        sys.exit(1)
    print(f" API OK: {probe[:50]}")

    scored_path = os.path.join(OUTPUT_DIR, "scored_results_coldstart.jsonl")
    cs_stats = score_results(COLDSTART_RESULTS, "Coldstart (50K math)", scored_path)

    report = {
        'timestamp': time.strftime('%Y-%m-%d %H:%M:%S'),
        'scoring_method': 'PhyX-aligned (DeepSeek-V3 judge, 5-shot ICE, 5 retries)',
        'coldstart': cs_stats,
    }
    report_file = os.path.join(OUTPUT_DIR, "report_coldstart.json")
    with open(report_file, 'w', encoding='utf-8') as fh:
        json.dump(report, fh, indent=2, ensure_ascii=False)

    print("\n" + rule)
    print(" RESULTS")
    print(rule)
    print(f" Coldstart accuracy: {cs_stats['final_acc']}%")
    print(f" Report: {report_file}")
    print(rule)


if __name__ == '__main__':
    main()
eval_footprint/eval_judge_fullft_phyx_nf.py ADDED
@@ -0,0 +1,296 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Score base and fullft inference results using DeepSeek-V3 judge (20 threads).
4
+ PhyX-aligned pipeline: extract -> string match -> LLM judge (5 retries).
5
+ """
6
+ import json, os, re, time, sys
7
+ from collections import defaultdict
8
+ from concurrent.futures import ThreadPoolExecutor, as_completed
9
+ import urllib.request
10
+ import urllib.error
11
+
12
# ===================== CONFIG =====================
# SECURITY: the API key is read from the DEEPSEEK_API_KEY environment variable
# so that no secret lives in the repository. A key that was ever committed
# here must be treated as leaked and revoked.
DEEPSEEK_API_KEY = os.environ.get("DEEPSEEK_API_KEY", "")
DEEPSEEK_MODEL = "deepseek-chat"
DEEPSEEK_API_URL = "https://api.deepseek.com/v1/chat/completions"

# Directory holding the inference outputs to score, and where reports go.
RESULTS_DIR = "/workspace/rl4phyx/RL4Phyx/SFT/sft_eval_footprint"
OUTPUT_DIR = "/data1/dhelix_shared/hku/rl4phyx/RL4Phyx/SFT/result"

BASE_RESULTS = os.path.join(RESULTS_DIR, "inference_results_base.jsonl")
FULLFT_RESULTS = os.path.join(RESULTS_DIR, "inference_results_sft.jsonl")

# Sentinel string that stands for a failed API request.
FAIL_MSG = 'Failed to obtain answer via API.'
# Number of judge attempts per sample.
RETRY = 5
# ==================================================
26
+
27
def get_ICE():
    """Return the five in-context examples shown to the judge.

    Each example is a text block with a ground-truth answer, a predicted
    answer and the expected judgement (1 = matched, 0 = unmatched). Only the
    third example carries judgement 0.
    """
    return [
        """
Ground truth answer: 502 \n
Predicted answer: The mass of block (B) is:
[
\\boxed{ 50 \\sqrt{101} }
] \n
Judegement: 1
""",
        """
Ground truth answer: 46.3 kN \n
Predicted answer: The tension ( T_B ) in the cable is approximately:
[
\\boxed{46300 }
] \n
Judegement: 1
""",
        """
Ground truth answer: 12 m/s \n
Predicted answer: The speed of the box after 2.00 seconds is:
[
\\boxed{11.3, \\text{m/s}}
] \n
Judegement: 0
""",
        """
Ground truth answer: 36.00 kg \n
Predicted answer: The mass of the hanging block ( m_2 ) must be approximately:
[
\\boxed{36.1, \\text\\{kg\\}}
] \n
Judegement: 1
""",
        """
Ground truth answer: 3.2 m \n
Predicted answer: The stuntman and villain slide approximately \\frac{10}{3.1415} meters**.
Judegement: 1
""",
    ]
66
+
67
+
68
def build_phyx_gpt4_prompt(gt_answer, pred):
    """Assemble the judge prompt for one (ground truth, prediction) pair.

    The prompt is the task description, every example from get_ICE() followed
    by a newline, and finally the pair to judge, ending with an open
    ``Judegement:`` label for the model to complete.

    Args:
        gt_answer: Ground-truth answer, formatted into the prompt as text.
        pred: Predicted answer text to compare against it.

    Returns:
        The complete prompt string.
    """
    header = """
Please read the following example. Given predicted answer and ground truth answer,
compare the these two answers, then ONLY output judegement 1/0 for matched/unmatched at the end of the prompt.
If the meaning is expressed in the same way, it is also considered consistent, for example, 0.5m and 50cm.
If the given predicted mentions "approximately", then allow the Approximation Error, \
such as 0.49 and approximately 0.5, 0.81 and approximately 0.8. \n

"""
    shots = ''.join(shot + '\n' for shot in get_ICE())
    query = (
        f'Ground truth answer: {gt_answer} \n'
        f'Predicted answer: {pred} \n'
        'Judegement:'
    )
    return header + shots + query
84
+
85
+
86
def mapping_str(input_str):
    r"""Normalise LaTeX spellings in an extracted answer.

    Rewrites ``\dfrac`` to ``\frac`` and ``\pi`` to the literal ``3.14``.

    Args:
        input_str: Extracted answer text.

    Returns:
        The normalised string. A non-string input is returned unchanged,
        which is the outcome the former blanket ``except: pass`` produced,
        but without also hiding unrelated errors.
    """
    if not isinstance(input_str, str):
        return input_str
    output = input_str
    for old, new in (("\\dfrac", "\\frac"), ("\\pi", "3.14")):
        output = output.replace(old, new)
    return output
95
+
96
+
97
def extract_boxed_content(s):
    r"""Return the text inside the first ``\boxed{...}`` found in *s*.

    Braces nested inside the box are balanced, so ``\boxed{a{b}c}`` yields
    ``a{b}c``. Returns ``None`` when *s* has no ``\boxed{`` or the box is
    never closed.
    """
    marker = r'\boxed{'
    marker_at = s.find(marker)
    if marker_at < 0:
        return None
    body = s[marker_at + len(marker):]
    open_braces = 0  # braces opened inside the box and not yet closed
    for pos, char in enumerate(body):
        if char == '{':
            open_braces += 1
            continue
        if char != '}':
            continue
        if open_braces == 0:
            # This '}' closes the \boxed{ itself.
            return body[:pos]
        open_braces -= 1
    return None
113
+
114
+
115
def PhyX_process_line(prediction_str, gt_answer):
    """Extract the answer from a model output and string-match it to the truth.

    Extraction order: the first ``\\boxed{...}`` content, otherwise the text
    after a "final answer" / "correct answer" label, otherwise the marker
    "SAME as predict" (meaning: use the whole prediction).

    Args:
        prediction_str: Raw model output.
        gt_answer: Ground-truth answer; converted with ``str``.

    Returns:
        dict with keys 'gt', 'pred' (stripped output), 'extracted' and
        'match' (1 if the ground truth equals the extracted answer or the
        whole prediction case-insensitively, or occurs verbatim in the
        prediction; else 0).
    """
    ret = {}
    ret['gt'] = str(gt_answer)
    ret['pred'] = prediction_str.strip()
    if ret['pred'] == FAIL_MSG:
        ret['match'] = 0
        ret["extracted"] = "Fail to Call API"
        return ret

    boxed_answer = extract_boxed_content(ret['pred'])
    if boxed_answer is not None:
        ret["extracted"] = mapping_str(boxed_answer)
    else:
        # The label may end with an ASCII or a full-width colon (U+FF1A).
        # NOTE(review): the previous class listed ':' twice, presumably a
        # full-width colon lost in transcription - confirm against upstream.
        pattern = (r'\b(?:final\s+answer|correct\s+answer)\b'
                   r'[^:\uff1a]*[:\uff1a]\s*(.*?)(?=\n\n\n|\Z)')
        match = re.search(pattern, ret['pred'], re.IGNORECASE | re.DOTALL)
        if match:
            ret["extracted"] = mapping_str(match.group(1))
        else:
            ret["extracted"] = "SAME as predict"

    gt_lower = ret['gt'].strip().lower()
    extracted_lower = ret["extracted"].strip().lower()
    pred_lower = ret["pred"].strip().lower()
    matched = (gt_lower == extracted_lower
               or gt_lower == pred_lower
               or ret['gt'] in ret['pred'])
    ret['match'] = 1 if matched else 0
    return ret
141
+
142
+
143
def call_deepseek(prompt, temperature=0.0):
    """Send one user message to the DeepSeek chat-completions endpoint.

    Args:
        prompt: Text sent as the single user message.
        temperature: Sampling temperature forwarded to the API.

    Returns:
        The message content of the first returned choice, or FAIL_MSG when
        the request or the parsing of its response raises.
    """
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {DEEPSEEK_API_KEY}"
    }
    data = json.dumps({
        "model": DEEPSEEK_MODEL,
        "messages": [{"role": "user", "content": prompt}],
        "temperature": temperature,
        "max_tokens": 200,
    }).encode('utf-8')
    try:
        req = urllib.request.Request(DEEPSEEK_API_URL, data=data, headers=headers)
        with urllib.request.urlopen(req, timeout=30) as resp:
            result = json.loads(resp.read().decode())
        return result['choices'][0]['message']['content']
    except Exception:
        # Best effort: network, HTTP and malformed-response errors all map to
        # the sentinel. Unlike a bare ``except``, this lets KeyboardInterrupt
        # and SystemExit propagate so a run can still be aborted.
        return FAIL_MSG
161
+
162
+
163
def _parse_judgement(res):
    """Return the judge's verdict (1 or 0) from its reply, or None if absent."""
    text = str(res).strip()
    # A verdict right at the start of the reply wins: the prompt ends with an
    # open "Judegement:" label that the model is asked to complete.
    leading = re.match(r'\W*([01])(?!\w|\.\d)', text)
    if leading:
        return int(leading.group(1))
    # Otherwise take the last 0/1 that stands alone, i.e. is not a digit of a
    # longer number or word such as "11.3", "0.5" or "x1".
    standalone = re.findall(r'(?<![\w.])[01](?!\w|\.\d)', text)
    return int(standalone[-1]) if standalone else None


def PhyX_auxeval(gt_answer, prediction):
    """Judge one prediction against its ground truth.

    Steps: extract the answer with PhyX_process_line, accept a
    case-insensitive exact string match, otherwise ask the LLM judge up to
    RETRY times with temperature ``i * 0.5`` on attempt ``i``.

    The judge reply is parsed for a standalone 0/1 verdict. The previous
    check ``"1" in res`` counted any reply containing the digit 1 (for
    example "0 (11.3 vs 12)") as a match.

    Args:
        gt_answer: Ground-truth answer.
        prediction: Raw model output.

    Returns:
        dict(log=..., res=0 or 1, extracted=...). ``res`` is 0 when every
        attempt fails or yields no verdict.
    """
    log = ''
    tmp = PhyX_process_line(prediction, gt_answer)
    if tmp["extracted"] == "Fail to Call API":
        return dict(log="Fail to Call API", res=0, extracted="Fail to Call API")
    prediction_extracted = tmp["extracted"] if tmp["extracted"] != "SAME as predict" else prediction
    if str(gt_answer).strip().lower() == prediction_extracted.strip().lower():
        return dict(log="Matched at string level", res=1, extracted=prediction_extracted)

    prompt = build_phyx_gpt4_prompt(gt_answer, prediction_extracted)
    for i in range(RETRY):
        res = call_deepseek(prompt, temperature=i * 0.5)
        if FAIL_MSG in res:
            log += f'Try {i}: failed.\n'
            continue
        # These log phrases are searched for later when statistics are built.
        log += 'Compared at semantic level. '
        verdict = _parse_judgement(res)
        if verdict == 1:
            log += "Semantic equal via LLM."
            return dict(log=log, res=1, extracted=prediction_extracted)
        if verdict == 0:
            log += f"LLM judgement {res}"
            return dict(log=log, res=0, extracted=prediction_extracted)
        # No verdict in the reply: fall through and retry.
    log += f'All {RETRY} retries failed.\n'
    return dict(log=log, res=0, extracted=prediction_extracted)
186
+
187
+
188
def _eval_single(args):
    """Judge one sample and store the outcome on its record.

    Args:
        args: ``(idx, record)`` where the record has 'ground_truth_value'
            and 'model_output'.

    Returns:
        ``(idx, record)``; the same record object, updated in place with
        'extracted_answer', 'eval_log' and 'res'.
    """
    idx, record = args
    outcome = PhyX_auxeval(record['ground_truth_value'], record['model_output'])
    record.update(
        extracted_answer=outcome['extracted'],
        eval_log=outcome['log'],
        res=outcome['res'],
    )
    return idx, record
197
+
198
+
199
def score_results(results_file, model_name, output_file):
    """Score every record of a JSONL results file with 20 worker threads.

    Each record is judged by _eval_single, which annotates it in place. The
    annotated records are written to *output_file* and a summary is printed.

    A worker that raises no longer disappears silently: its record is marked
    incorrect with the error in 'eval_log' and a warning is printed. An empty
    results file yields 0.0 accuracy instead of a ZeroDivisionError.

    Args:
        results_file: Path of the JSONL file with inference results.
        model_name: Label used in the printout and the returned stats.
        output_file: Path of the JSONL file receiving the scored records.

    Returns:
        dict with 'model', 'total', 'string_matches', 'llm_calls',
        'llm_matches', 'final_correct', 'final_acc' and 'category_stats'.
    """
    with open(results_file, 'r', encoding='utf-8') as f:
        results = [json.loads(line) for line in f if line.strip()]

    print(f"\n{'='*60}")
    print(f" Scoring: {model_name} ({len(results)} samples)")
    print(f" Using PhyX-aligned pipeline with DeepSeek-V3 judge (20 threads)")
    print(f"{'='*60}")

    total = len(results)
    with ThreadPoolExecutor(max_workers=20) as executor:
        futures = {executor.submit(_eval_single, (i, r)): i for i, r in enumerate(results)}
        done = 0
        for future in as_completed(futures):
            done += 1
            error = future.exception()
            if error is not None:
                idx = futures[future]
                failed = results[idx]
                failed.setdefault('extracted_answer', '')
                failed['eval_log'] = f'Evaluation error: {error!r}'
                failed['res'] = 0
                print(f" [warn] sample {idx} could not be evaluated: {error!r}", flush=True)
            if done % 100 == 0 or done == total:
                hit = sum(1 for r in results if r.get('res') == 1)
                print(f" [{done}/{total}] processed, correct={hit}", flush=True)

    hit = 0
    string_match = 0
    llm_match = 0
    llm_called = 0
    cat_stats = defaultdict(lambda: {'total': 0, 'correct': 0})
    for r in results:
        cat = r.get('category', 'unknown')
        cat_stats[cat]['total'] += 1
        correct = r.get('res') == 1
        if correct:
            hit += 1
            cat_stats[cat]['correct'] += 1
        # The evaluation log tells which stage produced the verdict.
        log = r.get('eval_log', '')
        if "string level" in log:
            string_match += 1
        elif "semantic level" in log or "LLM judgement" in log:
            llm_called += 1
            if correct:
                llm_match += 1

    acc = hit / total * 100 if total else 0.0
    with open(output_file, 'w', encoding='utf-8') as f:
        for r in results:
            f.write(json.dumps(r, ensure_ascii=False) + '\n')

    print(f"\n {model_name}: {hit}/{total} ({acc:.1f}%)")
    print(f" String: {string_match}, LLM calls: {llm_called}, LLM match: {llm_match}")
    print(f" Per category:")
    for cat, s in sorted(cat_stats.items(), key=lambda x: -x[1]['total']):
        cat_acc = s['correct'] / s['total'] * 100 if s['total'] > 0 else 0
        print(f" {cat:25s}: {s['correct']:3d}/{s['total']:3d} ({cat_acc:5.1f}%)")

    return {
        'model': model_name, 'total': total,
        'string_matches': string_match, 'llm_calls': llm_called, 'llm_matches': llm_match,
        'final_correct': hit, 'final_acc': round(acc, 2),
        'category_stats': {k: dict(v) for k, v in cat_stats.items()}
    }
258
+
259
+
260
def main():
    """Score the base and full fine-tune results and write a comparison report.

    Creates OUTPUT_DIR, checks that the DeepSeek API answers (exiting with
    status 1 if it does not), scores BASE_RESULTS and FULLFT_RESULTS, and
    stores both statistics in ``comparison_report_base_fullft.json``.
    """
    banner = "=" * 60
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    for line in (banner, " PhyX-ALIGNED EVAL: Base + FullFT", f" Output: {OUTPUT_DIR}", banner):
        print(line)

    # One throw-away request so a broken key/network fails before any scoring.
    print("\nTesting DeepSeek API...")
    probe = call_deepseek("Say 'OK' if you can read this.")
    if probe == FAIL_MSG:
        print(" API FAILED")
        sys.exit(1)
    print(f" API OK: {probe[:50]}")

    base_stats = score_results(
        BASE_RESULTS,
        "Base (Qwen2.5-VL-3B-Instruct)",
        os.path.join(OUTPUT_DIR, "scored_results_base.jsonl"),
    )
    fullft_stats = score_results(
        FULLFT_RESULTS,
        "SFT-fullft (Cold-Start)",
        os.path.join(OUTPUT_DIR, "scored_results_fullft.jsonl"),
    )

    report_file = os.path.join(OUTPUT_DIR, "comparison_report_base_fullft.json")
    report = {
        'timestamp': time.strftime('%Y-%m-%d %H:%M:%S'),
        'scoring_method': 'PhyX-aligned (DeepSeek-V3 judge, 5-shot ICE, 5 retries)',
        'base': base_stats,
        'fullft': fullft_stats,
    }
    with open(report_file, 'w', encoding='utf-8') as handle:
        json.dump(report, handle, indent=2, ensure_ascii=False)

    print("\n" + banner)
    print(" RESULTS")
    print(banner)
    print(f" Base accuracy: {base_stats['final_acc']}%")
    print(f" FullFT accuracy: {fullft_stats['final_acc']}%")
    print(f" Report: {report_file}")
    print(banner)


if __name__ == '__main__':
    main()
eval_footprint/eval_judge_lora_math_f.py ADDED
@@ -0,0 +1,377 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Score inference results using DeepSeek-V3 as LLM Judge.
4
+ *** EXACTLY aligned with PhyX official evaluation pipeline ***
5
+ (from killthefullmoon/PhyX -> vlmeval/dataset/utils/phyx.py)
6
+
7
+ Pipeline:
8
+ 1. Extract answer from \boxed{} or "final answer:" pattern
9
+ 2. String-level matching
10
+ 3. LLM judge with 5-shot ICE prompt, retry 5 times with increasing temperature
11
+
12
+ Usage:
13
+ python3 eval_judge_lora_math_f.py
14
+ """
15
+ import json, os, re, time, sys, ast
16
+ from collections import defaultdict
17
+ import urllib.request
18
+ import urllib.error
19
+
20
# ===================== CONFIG =====================
# SECURITY: the API key is read from the DEEPSEEK_API_KEY environment variable
# so that no secret lives in the repository. A key that was ever committed
# here must be treated as leaked and revoked.
DEEPSEEK_API_KEY = os.environ.get("DEEPSEEK_API_KEY", "")
DEEPSEEK_MODEL = "deepseek-chat"  # Official DeepSeek-V3
DEEPSEEK_API_URL = "https://api.deepseek.com/v1/chat/completions"

# Directory holding the inference outputs; scored results are written back here.
RESULTS_DIR = "/workspace/rl4phyx/RL4Phyx/SFT/sft_eval_footprint"
BASE_RESULTS = os.path.join(RESULTS_DIR, "inference_results_base.jsonl")
SFT_RESULTS = os.path.join(RESULTS_DIR, "inference_results_lora_math_f.jsonl")
OUTPUT_DIR = RESULTS_DIR

# Sentinel string that stands for a failed API request.
FAIL_MSG = 'Failed to obtain answer via API.'
# Number of judge attempts per sample.
RETRY = 5
# ==================================================
33
+
34
+
35
+ # ============= PhyX ICE (In-Context Examples) =============
36
+ # Exactly from PhyX source code: get_ICE()
37
+
38
def get_ICE():
    """Return the five in-context examples shown to the judge.

    Each example is a text block with a ground-truth answer, a predicted
    answer and the expected judgement (1 = matched, 0 = unmatched). Only the
    third example carries judgement 0.
    """
    return [
        """
Ground truth answer: 502 \n
Predicted answer: The mass of block (B) is:
[
\\boxed{ 50 \\sqrt{101} }
] \n
Judegement: 1
""",
        """
Ground truth answer: 46.3 kN \n
Predicted answer: The tension ( T_B ) in the cable is approximately:
[
\\boxed{46300 }
] \n
Judegement: 1
""",
        """
Ground truth answer: 12 m/s \n
Predicted answer: The speed of the box after 2.00 seconds is:
[
\\boxed{11.3, \\text{m/s}}
] \n
Judegement: 0
""",
        """
Ground truth answer: 36.00 kg \n
Predicted answer: The mass of the hanging block ( m_2 ) must be approximately:
[
\\boxed{36.1, \\text\\{kg\\}}
] \n
Judegement: 1
""",
        """
Ground truth answer: 3.2 m \n
Predicted answer: The stuntman and villain slide approximately \\frac{10}{3.1415} meters**.
Judegement: 1
""",
    ]
82
+
83
+
84
+ # ============= PhyX Prompt Builder =============
85
+ # Exactly from PhyX source code: build_phyx_gpt4_prompt()
86
+
87
def build_phyx_gpt4_prompt(gt_answer, pred):
    """Assemble the judge prompt for one (ground truth, prediction) pair.

    The prompt is the task description, every example from get_ICE() followed
    by a newline, and finally the pair to judge, ending with an open
    ``Judegement:`` label for the model to complete.

    Args:
        gt_answer: Ground-truth answer, formatted into the prompt as text.
        pred: Predicted answer text to compare against it.

    Returns:
        The complete prompt string.
    """
    header = """
Please read the following example. Given predicted answer and ground truth answer,
compare the these two answers, then ONLY output judegement 1/0 for matched/unmatched at the end of the prompt.
If the meaning is expressed in the same way, it is also considered consistent, for example, 0.5m and 50cm.
If the given predicted mentions "approximately", then allow the Approximation Error, \
such as 0.49 and approximately 0.5, 0.81 and approximately 0.8. \n

"""
    shots = ''.join(shot + '\n' for shot in get_ICE())
    query = (
        f'Ground truth answer: {gt_answer} \n'
        f'Predicted answer: {pred} \n'
        'Judegement:'
    )
    return header + shots + query
104
+
105
+
106
+ # ============= PhyX Answer Extraction =============
107
+ # Exactly from PhyX source code
108
+
109
def mapping_str(input_str):
    r"""Normalise LaTeX spellings in an extracted answer.

    Rewrites ``\dfrac`` to ``\frac`` and ``\pi`` to the literal ``3.14``.

    Args:
        input_str: Extracted answer text.

    Returns:
        The normalised string. A non-string input is returned unchanged,
        which is the outcome the former blanket ``except: pass`` produced,
        but without also hiding unrelated errors.
    """
    if not isinstance(input_str, str):
        return input_str
    output = input_str
    for old, new in (("\\dfrac", "\\frac"), ("\\pi", "3.14")):
        output = output.replace(old, new)
    return output
118
+
119
+
120
def extract_boxed_content(s):
    r"""Return the text inside the first ``\boxed{...}`` found in *s*.

    Braces nested inside the box are balanced, so ``\boxed{a{b}c}`` yields
    ``a{b}c``. Returns ``None`` when *s* has no ``\boxed{`` or the box is
    never closed.
    """
    marker = r'\boxed{'
    marker_at = s.find(marker)
    if marker_at < 0:
        return None
    body = s[marker_at + len(marker):]
    open_braces = 0  # braces opened inside the box and not yet closed
    for pos, char in enumerate(body):
        if char == '{':
            open_braces += 1
            continue
        if char != '}':
            continue
        if open_braces == 0:
            # This '}' closes the \boxed{ itself.
            return body[:pos]
        open_braces -= 1
    return None
137
+
138
+
139
def PhyX_process_line(prediction_str, gt_answer):
    """Extract the answer from a model output and string-match it to the truth.

    Extraction order: the first ``\\boxed{...}`` content, otherwise the text
    after a "final answer" / "correct answer" label, otherwise the marker
    "SAME as predict" (meaning: use the whole prediction).

    Args:
        prediction_str: Raw model output.
        gt_answer: Ground-truth answer; converted with ``str``.

    Returns:
        dict with keys 'gt', 'pred' (stripped output), 'extracted' and
        'match' (1 if the ground truth equals the extracted answer or the
        whole prediction case-insensitively, or occurs verbatim in the
        prediction; else 0).
    """
    ret = {}
    ret['gt'] = str(gt_answer)
    ret['pred'] = prediction_str.strip()

    if ret['pred'] == FAIL_MSG:
        ret['match'] = 0
        ret["extracted"] = "Fail to Call API"
        return ret

    # Try extracting from \boxed{}
    boxed_answer = extract_boxed_content(ret['pred'])
    if boxed_answer is not None:
        ret["extracted"] = mapping_str(boxed_answer)
    else:
        # Try the "final answer:" / "correct answer:" pattern. The label may
        # end with an ASCII or a full-width colon (U+FF1A).
        # NOTE(review): the previous class listed ':' twice, presumably a
        # full-width colon lost in transcription - confirm against upstream.
        pattern = (r'\b(?:final\s+answer|correct\s+answer)\b'
                   r'[^:\uff1a]*[:\uff1a]\s*(.*?)(?=\n\n\n|\Z)')
        match = re.search(pattern, ret['pred'], flags=re.IGNORECASE | re.DOTALL)
        if match:
            ret["extracted"] = mapping_str(match.group(1))
        else:
            ret["extracted"] = "SAME as predict"

    # String-level matching
    gt_lower = ret['gt'].strip().lower()
    extracted_lower = ret["extracted"].strip().lower()
    pred_lower = ret["pred"].strip().lower()
    matched = (gt_lower == extracted_lower
               or gt_lower == pred_lower
               or ret['gt'] in ret['pred'])
    ret['match'] = 1 if matched else 0
    return ret
181
+
182
+
183
+ # ============= DeepSeek API =============
184
+
185
def call_deepseek(prompt, temperature=0.0):
    """Send one user message to the DeepSeek chat-completions endpoint.

    Args:
        prompt: Text sent as the single user message.
        temperature: Sampling temperature forwarded to the API.

    Returns:
        The message content of the first returned choice, or FAIL_MSG when
        the request or the parsing of its response raises an Exception.
    """
    payload = {
        "model": DEEPSEEK_MODEL,
        "messages": [{"role": "user", "content": prompt}],
        "temperature": temperature,
        "max_tokens": 200,
    }
    body = json.dumps(payload).encode('utf-8')
    auth_headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {DEEPSEEK_API_KEY}",
    }

    try:
        request = urllib.request.Request(DEEPSEEK_API_URL, data=body, headers=auth_headers)
        with urllib.request.urlopen(request, timeout=30) as resp:
            reply = json.loads(resp.read().decode())
        return reply['choices'][0]['message']['content']
    except Exception:
        # Any request or parsing failure is reported through the sentinel.
        return FAIL_MSG
205
+
206
+
207
+ # ============= PhyX Evaluation Logic =============
208
+ # Exactly from PhyX source code: PhyX_auxeval()
209
+
210
def _parse_judgement(res):
    """Return the judge's verdict (1 or 0) from its reply, or None if absent."""
    text = str(res).strip()
    # A verdict right at the start of the reply wins: the prompt ends with an
    # open "Judegement:" label that the model is asked to complete.
    leading = re.match(r'\W*([01])(?!\w|\.\d)', text)
    if leading:
        return int(leading.group(1))
    # Otherwise take the last 0/1 that stands alone, i.e. is not a digit of a
    # longer number or word such as "11.3", "0.5" or "x1".
    standalone = re.findall(r'(?<![\w.])[01](?!\w|\.\d)', text)
    return int(standalone[-1]) if standalone else None


def PhyX_auxeval(gt_answer, prediction):
    """Judge one prediction against its ground truth.

    Steps:
        1. Extract the answer with PhyX_process_line (boxed / regex).
        2. Accept a case-insensitive exact string match.
        3. Otherwise ask the LLM judge up to RETRY times, with temperature
           ``i * 0.5`` on attempt ``i``.

    The judge reply is parsed for a standalone 0/1 verdict. The previous
    check ``"1" in res`` counted any reply containing the digit 1 (for
    example "0 (11.3 vs 12)") as a match.

    Args:
        gt_answer: Ground-truth answer.
        prediction: Raw model output.

    Returns:
        dict(log=..., res=0 or 1, extracted=...). ``res`` is 0 when every
        attempt fails or yields no verdict.
    """
    log = ''

    # Step 1: Rule-based extraction
    tmp = PhyX_process_line(prediction, gt_answer)
    if tmp["extracted"] == "Fail to Call API":
        log += "Fail to Call API"
        return dict(log=log, res=0, extracted="Fail to Call API")
    prediction_extracted = tmp["extracted"] if tmp["extracted"] != "SAME as predict" else prediction

    # Step 2: String-level match
    if str(gt_answer).strip().lower() == prediction_extracted.strip().lower():
        return dict(log="Matched at string level", res=1, extracted=prediction_extracted)

    # Step 3: LLM judge with retries
    prompt = build_phyx_gpt4_prompt(gt_answer, prediction_extracted)
    for i in range(RETRY):
        res = call_deepseek(prompt, temperature=i * 0.5)
        if FAIL_MSG in res:
            log += f'Try {i}: answer and prediction are {gt_answer} and {prediction_extracted}, failed to compare.\n'
            continue
        # These log phrases are searched for later when statistics are built.
        log += 'Compared at semantic level. '
        verdict = _parse_judgement(res)
        if verdict == 1:
            log += "Semantic equal via LLM."
            return dict(log=log, res=1, extracted=prediction_extracted)
        if verdict == 0:
            log += f"LLM judgement {res}"
            return dict(log=log, res=0, extracted=prediction_extracted)
        # No verdict in the reply: fall through and retry.
    log += f'All {RETRY} retries failed.\n'
    return dict(log=log, res=0, extracted=prediction_extracted)
253
+
254
+
255
+ # ============= Main Scoring =============
256
+
257
def score_results(results_file, model_name):
    """Score every record of a JSONL results file, one after another.

    Each record is judged with PhyX_auxeval and annotated in place with
    'extracted_answer', 'eval_log' and 'res'. Progress is printed every 50
    samples and a summary at the end. An empty results file yields 0.0
    accuracy instead of a ZeroDivisionError.

    Args:
        results_file: Path of the JSONL file with inference results.
        model_name: Label used in the printout and the returned stats.

    Returns:
        ``(scored, stats)``: the list of annotated records and a dict with
        'model', 'total', 'string_matches', 'llm_calls', 'llm_matches',
        'final_correct', 'final_acc' and 'category_stats'.
    """
    with open(results_file, 'r', encoding='utf-8') as f:
        results = [json.loads(line) for line in f if line.strip()]

    print(f"\n{'='*60}")
    print(f" Scoring: {model_name} ({len(results)} samples)")
    print(f" Using PhyX-aligned pipeline with DeepSeek-V3 judge")
    print(f"{'='*60}")

    total = len(results)
    hit = 0
    string_match = 0
    llm_match = 0
    llm_called = 0
    cat_stats = defaultdict(lambda: {'total': 0, 'correct': 0})

    for i, r in enumerate(results):
        gt = r['ground_truth_value']
        prediction = r['model_output']
        cat = r.get('category', 'unknown')
        cat_stats[cat]['total'] += 1

        eval_result = PhyX_auxeval(gt, prediction)
        r['extracted_answer'] = eval_result['extracted']
        r['eval_log'] = eval_result['log']
        r['res'] = eval_result['res']

        correct = eval_result['res'] == 1
        if correct:
            hit += 1
            cat_stats[cat]['correct'] += 1

        # The evaluation log tells which stage produced the verdict.
        log = eval_result['log']
        if "string level" in log:
            string_match += 1
        elif "semantic level" in log or "LLM judgement" in log:
            llm_called += 1
            if correct:
                llm_match += 1

        if (i + 1) % 50 == 0:
            print(f" [{i+1}/{total}] acc={hit/(i+1)*100:.1f}% "
                  f"(str_match={string_match}, llm_called={llm_called}, llm_match={llm_match})",
                  flush=True)

    acc = hit / total * 100 if total else 0.0

    print(f"\n RESULTS for {model_name}:")
    print(f" Total: {total}")
    print(f" String matches: {string_match}")
    print(f" LLM calls: {llm_called}")
    print(f" LLM matches: {llm_match}")
    print(f" Final correct: {hit} ({acc:.1f}%)")
    print(f"\n Per category:")
    for cat, s in sorted(cat_stats.items(), key=lambda x: -x[1]['total']):
        cat_acc = s['correct'] / s['total'] * 100 if s['total'] > 0 else 0
        print(f" {cat:25s}: {s['correct']:3d}/{s['total']:3d} ({cat_acc:5.1f}%)")

    # Every record was annotated in place, so the input list is the scored list.
    return results, {
        'model': model_name,
        'total': total,
        'string_matches': string_match,
        'llm_calls': llm_called,
        'llm_matches': llm_match,
        'final_correct': hit,
        'final_acc': round(acc, 2),
        'category_stats': {k: dict(v) for k, v in cat_stats.items()}
    }
331
+
332
+
333
def main():
    """Score the lora_math_f results and write the scored file and a report.

    Checks that the DeepSeek API answers (exiting with status 1 if it does
    not), scores SFT_RESULTS, then writes ``scored_results_lora_math_f.jsonl``
    and ``report_lora_math_f.json`` into OUTPUT_DIR.
    """
    banner = "=" * 60
    print(banner)
    print(" PhyX-ALIGNED EVAL: DeepSeek-V3 as Judge")
    print(" Scoring: lora_math_f ONLY (base already scored)")
    print(f" Results dir: {RESULTS_DIR}")
    print(banner)

    # Test API: one throw-away request before any scoring starts.
    print("\nTesting DeepSeek API...")
    probe = call_deepseek("Say 'OK' if you can read this.")
    if probe == FAIL_MSG:
        print(f" API FAILED: {probe}")
        sys.exit(1)
    print(f" API OK: {probe[:50]}")

    # Score lora_math_f only
    sft_scored, sft_stats = score_results(SFT_RESULTS, "lora_math_f (LoRA+freeze+math)")

    # Save scored results, one JSON object per line
    out_file = os.path.join(OUTPUT_DIR, "scored_results_lora_math_f.jsonl")
    with open(out_file, 'w', encoding='utf-8') as handle:
        for record in sft_scored:
            handle.write(json.dumps(record, ensure_ascii=False) + '\n')

    # Save report
    report_file = os.path.join(OUTPUT_DIR, "report_lora_math_f.json")
    report = {
        'timestamp': time.strftime('%Y-%m-%d %H:%M:%S'),
        'scoring_method': 'PhyX-aligned (DeepSeek-V3 judge, 5-shot ICE, 5 retries)',
        'model': sft_stats,
    }
    with open(report_file, 'w', encoding='utf-8') as handle:
        json.dump(report, handle, indent=2, ensure_ascii=False)

    print("\n" + banner)
    print(" SCORING COMPLETE: lora_math_f")
    print(banner)
    print(f" Accuracy: {sft_stats['final_acc']}%")
    print(f" Scored: {out_file}")
    print(f" Report: {report_file}")
    print(banner)


if __name__ == '__main__':
    main()
eval_footprint/eval_judge_lora_phyx_f.py ADDED
@@ -0,0 +1,340 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Score phyx and phyx_50000 inference results using DeepSeek-V3 judge.
4
+ PhyX-aligned pipeline: extract → string match → LLM judge (5 retries).
5
+ """
6
+ import json, os, re, time, sys, ast
7
+ from collections import defaultdict
8
+ import urllib.request
9
+ import urllib.error
10
+
11
# ===================== CONFIG =====================
# SECURITY: the API key is read from the DEEPSEEK_API_KEY environment variable
# so that no secret lives in the repository. A key that was ever committed
# here must be treated as leaked and revoked.
DEEPSEEK_API_KEY = os.environ.get("DEEPSEEK_API_KEY", "")
DEEPSEEK_MODEL = "deepseek-chat"
DEEPSEEK_API_URL = "https://api.deepseek.com/v1/chat/completions"

# Directory holding the inference outputs to score, and where reports go.
RESULTS_DIR = "/workspace/rl4phyx/RL4Phyx/SFT/sft_eval_footprint"
OUTPUT_DIR = "/data1/dhelix_shared/hku/rl4phyx/RL4Phyx/SFT/result"

PHYX_RESULTS = os.path.join(RESULTS_DIR, "inference_results_phyx.jsonl")
PHYX50K_RESULTS = os.path.join(RESULTS_DIR, "inference_results_phyx_50000.jsonl")

# Sentinel string that stands for a failed API request.
FAIL_MSG = 'Failed to obtain answer via API.'
# Number of judge attempts per sample.
RETRY = 5
# ==================================================
25
+
26
+
27
+ # ============= PhyX ICE =============
28
def get_ICE():
    """Return the five in-context examples shown to the judge.

    Each example is a text block with a ground-truth answer, a predicted
    answer and the expected judgement (1 = matched, 0 = unmatched). Only the
    third example carries judgement 0.
    """
    return [
        """
Ground truth answer: 502 \n
Predicted answer: The mass of block (B) is:
[
\\boxed{ 50 \\sqrt{101} }
] \n
Judegement: 1
""",
        """
Ground truth answer: 46.3 kN \n
Predicted answer: The tension ( T_B ) in the cable is approximately:
[
\\boxed{46300 }
] \n
Judegement: 1
""",
        """
Ground truth answer: 12 m/s \n
Predicted answer: The speed of the box after 2.00 seconds is:
[
\\boxed{11.3, \\text{m/s}}
] \n
Judegement: 0
""",
        """
Ground truth answer: 36.00 kg \n
Predicted answer: The mass of the hanging block ( m_2 ) must be approximately:
[
\\boxed{36.1, \\text\\{kg\\}}
] \n
Judegement: 1
""",
        """
Ground truth answer: 3.2 m \n
Predicted answer: The stuntman and villain slide approximately \\frac{10}{3.1415} meters**.
Judegement: 1
""",
    ]
67
+
68
+
69
def build_phyx_gpt4_prompt(gt_answer, pred):
    """Assemble the judge prompt for one (ground truth, prediction) pair.

    The prompt is the task description, every example from get_ICE() followed
    by a newline, and finally the pair to judge, ending with an open
    ``Judegement:`` label for the model to complete.

    Args:
        gt_answer: Ground-truth answer, formatted into the prompt as text.
        pred: Predicted answer text to compare against it.

    Returns:
        The complete prompt string.
    """
    header = """
Please read the following example. Given predicted answer and ground truth answer,
compare the these two answers, then ONLY output judegement 1/0 for matched/unmatched at the end of the prompt.
If the meaning is expressed in the same way, it is also considered consistent, for example, 0.5m and 50cm.
If the given predicted mentions "approximately", then allow the Approximation Error, \
such as 0.49 and approximately 0.5, 0.81 and approximately 0.8. \n

"""
    shots = ''.join(shot + '\n' for shot in get_ICE())
    query = (
        f'Ground truth answer: {gt_answer} \n'
        f'Predicted answer: {pred} \n'
        'Judegement:'
    )
    return header + shots + query
85
+
86
+
87
+ # ============= PhyX Answer Extraction =============
88
def mapping_str(input_str):
    r"""Normalise LaTeX spellings in an extracted answer.

    Rewrites ``\dfrac`` to ``\frac`` and ``\pi`` to the literal ``3.14``.

    Args:
        input_str: Extracted answer text.

    Returns:
        The normalised string. A non-string input is returned unchanged,
        which is the outcome the former blanket ``except: pass`` produced,
        but without also hiding unrelated errors.
    """
    if not isinstance(input_str, str):
        return input_str
    output = input_str
    for old, new in (("\\dfrac", "\\frac"), ("\\pi", "3.14")):
        output = output.replace(old, new)
    return output
97
+
98
+
99
def extract_boxed_content(s):
    r"""Return the text inside the first ``\boxed{...}`` found in *s*.

    Braces nested inside the box are balanced, so ``\boxed{a{b}c}`` yields
    ``a{b}c``. Returns ``None`` when *s* has no ``\boxed{`` or the box is
    never closed.
    """
    marker = r'\boxed{'
    marker_at = s.find(marker)
    if marker_at < 0:
        return None
    body = s[marker_at + len(marker):]
    open_braces = 0  # braces opened inside the box and not yet closed
    for pos, char in enumerate(body):
        if char == '{':
            open_braces += 1
            continue
        if char != '}':
            continue
        if open_braces == 0:
            # This '}' closes the \boxed{ itself.
            return body[:pos]
        open_braces -= 1
    return None
115
+
116
+
117
def PhyX_process_line(prediction_str, gt_answer):
    """Extract the answer from a model output and string-match it to the truth.

    Extraction order: the first ``\\boxed{...}`` content, otherwise the text
    after a "final answer" / "correct answer" label, otherwise the marker
    "SAME as predict" (meaning: use the whole prediction).

    Args:
        prediction_str: Raw model output.
        gt_answer: Ground-truth answer; converted with ``str``.

    Returns:
        dict with keys 'gt', 'pred' (stripped output), 'extracted' and
        'match' (1 if the ground truth equals the extracted answer or the
        whole prediction case-insensitively, or occurs verbatim in the
        prediction; else 0).
    """
    ret = {}
    ret['gt'] = str(gt_answer)
    ret['pred'] = prediction_str.strip()

    if ret['pred'] == FAIL_MSG:
        ret['match'] = 0
        ret["extracted"] = "Fail to Call API"
        return ret

    boxed_answer = extract_boxed_content(ret['pred'])
    if boxed_answer is not None:
        ret["extracted"] = mapping_str(boxed_answer)
    else:
        # The label may end with an ASCII or a full-width colon (U+FF1A).
        # NOTE(review): the previous class listed ':' twice, presumably a
        # full-width colon lost in transcription - confirm against upstream.
        pattern = (r'\b(?:final\s+answer|correct\s+answer)\b'
                   r'[^:\uff1a]*[:\uff1a]\s*(.*?)(?=\n\n\n|\Z)')
        match = re.search(pattern, ret['pred'], flags=re.IGNORECASE | re.DOTALL)
        if match:
            ret["extracted"] = mapping_str(match.group(1))
        else:
            ret["extracted"] = "SAME as predict"

    gt_lower = ret['gt'].strip().lower()
    extracted_lower = ret["extracted"].strip().lower()
    pred_lower = ret["pred"].strip().lower()
    matched = (gt_lower == extracted_lower
               or gt_lower == pred_lower
               or ret['gt'] in ret['pred'])
    ret['match'] = 1 if matched else 0
    return ret
150
+
151
+
152
+ # ============= DeepSeek API =============
153
def call_deepseek(prompt, temperature=0.0):
    """Send one user message to the DeepSeek chat-completions endpoint.

    Args:
        prompt: Text sent as the single user message.
        temperature: Sampling temperature forwarded to the API.

    Returns:
        The message content of the first returned choice, or FAIL_MSG when
        the request or the parsing of its response raises an Exception.
    """
    payload = {
        "model": DEEPSEEK_MODEL,
        "messages": [{"role": "user", "content": prompt}],
        "temperature": temperature,
        "max_tokens": 200,
    }
    body = json.dumps(payload).encode('utf-8')
    auth_headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {DEEPSEEK_API_KEY}",
    }

    try:
        request = urllib.request.Request(DEEPSEEK_API_URL, data=body, headers=auth_headers)
        with urllib.request.urlopen(request, timeout=30) as resp:
            reply = json.loads(resp.read().decode())
        return reply['choices'][0]['message']['content']
    except Exception:
        # Any request or parsing failure is reported through the sentinel.
        return FAIL_MSG
172
+
173
+
174
+ # ============= PhyX Evaluation =============
175
def _parse_judgement(res):
    """Return the judge's verdict (1 or 0) from its reply, or None if absent."""
    text = str(res).strip()
    # A verdict right at the start of the reply wins: the prompt ends with an
    # open "Judegement:" label that the model is asked to complete.
    leading = re.match(r'\W*([01])(?!\w|\.\d)', text)
    if leading:
        return int(leading.group(1))
    # Otherwise take the last 0/1 that stands alone, i.e. is not a digit of a
    # longer number or word such as "11.3", "0.5" or "x1".
    standalone = re.findall(r'(?<![\w.])[01](?!\w|\.\d)', text)
    return int(standalone[-1]) if standalone else None


def PhyX_auxeval(gt_answer, prediction):
    """Judge one prediction against its ground truth.

    Steps: extract the answer with PhyX_process_line, accept a
    case-insensitive exact string match, otherwise ask the LLM judge up to
    RETRY times with temperature ``i * 0.5`` on attempt ``i``.

    The judge reply is parsed for a standalone 0/1 verdict. The previous
    check ``"1" in res`` counted any reply containing the digit 1 (for
    example "0 (11.3 vs 12)") as a match.

    Args:
        gt_answer: Ground-truth answer.
        prediction: Raw model output.

    Returns:
        dict(log=..., res=0 or 1, extracted=...). ``res`` is 0 when every
        attempt fails or yields no verdict.
    """
    log = ''
    tmp = PhyX_process_line(prediction, gt_answer)

    if tmp["extracted"] == "Fail to Call API":
        return dict(log="Fail to Call API", res=0, extracted="Fail to Call API")

    prediction_extracted = tmp["extracted"] if tmp["extracted"] != "SAME as predict" else prediction

    if str(gt_answer).strip().lower() == prediction_extracted.strip().lower():
        return dict(log="Matched at string level", res=1, extracted=prediction_extracted)

    prompt = build_phyx_gpt4_prompt(gt_answer, prediction_extracted)
    for i in range(RETRY):
        res = call_deepseek(prompt, temperature=i * 0.5)
        if FAIL_MSG in res:
            log += f'Try {i}: failed to compare.\n'
            continue
        # These log phrases are searched for later when statistics are built.
        log += 'Compared at semantic level. '
        verdict = _parse_judgement(res)
        if verdict == 1:
            log += "Semantic equal via LLM."
            return dict(log=log, res=1, extracted=prediction_extracted)
        if verdict == 0:
            log += f"LLM judgement {res}"
            return dict(log=log, res=0, extracted=prediction_extracted)
        # No verdict in the reply: fall through and retry.
    log += f'All {RETRY} retries failed.\n'
    return dict(log=log, res=0, extracted=prediction_extracted)
202
+
203
+
204
+ # ============= Scoring =============
205
def _eval_single(args):
    """Score one record in place and return it with its position (thread-pool task).

    ``args`` is an ``(index, record)`` pair. The record is mutated: the fields
    ``extracted_answer``, ``eval_log`` and ``res`` are filled from the result
    of ``PhyX_auxeval``.
    """
    position, record = args
    truth = record['ground_truth_value']
    answer = record['model_output']
    outcome = PhyX_auxeval(truth, answer)
    for field, key in (('extracted_answer', 'extracted'),
                       ('eval_log', 'log'),
                       ('res', 'res')):
        record[field] = outcome[key]
    return position, record
215
+
216
+
217
def score_results(results_file, model_name, output_file):
    """Score every record of a JSONL inference file and write the scored records out.

    Records are evaluated concurrently by ``_eval_single`` (20 threads), which
    annotates each record in place. Afterwards overall and per-category
    accuracy are computed, printed, and the annotated records are written to
    ``output_file`` as JSONL.

    Args:
        results_file: Input JSONL path; blank lines are ignored.
        model_name: Label used in the printed report and the returned summary.
        output_file: Destination JSONL path for the scored records.

    Returns:
        A summary dict with the keys ``model``, ``total``, ``string_matches``,
        ``llm_calls``, ``llm_matches``, ``final_correct``, ``final_acc`` and
        ``category_stats``.
    """
    from concurrent.futures import ThreadPoolExecutor, as_completed

    results = []
    with open(results_file, 'r', encoding='utf-8') as f:
        for line in f:
            if line.strip():
                results.append(json.loads(line))

    print(f"\n{'='*60}")
    print(f" Scoring: {model_name} ({len(results)} samples)")
    print(f" Using PhyX-aligned pipeline with DeepSeek-V3 judge (20 threads)")
    print(f"{'='*60}")

    total = len(results)

    # Parallel evaluation with 20 threads
    with ThreadPoolExecutor(max_workers=20) as executor:
        futures = {executor.submit(_eval_single, (i, r)): i for i, r in enumerate(results)}
        done = 0
        for future in as_completed(futures):
            done += 1
            try:
                # Surfaces exceptions raised inside the worker; without this
                # call a failed sample would disappear without a trace.
                future.result()
            except Exception as e:
                idx = futures[future]
                results[idx]['res'] = 0
                results[idx]['eval_log'] = f"Evaluation error: {type(e).__name__}: {e}"
                print(f" [sample {idx}] evaluation failed: {type(e).__name__}: {e}", flush=True)
            if done % 100 == 0 or done == total:
                # Count current stats
                hit = sum(1 for r in results if r.get('res') == 1)
                print(f" [{done}/{total}] processed, current correct={hit}", flush=True)

    # Compute final stats
    hit = 0
    string_match = 0
    llm_match = 0
    llm_called = 0
    cat_stats = defaultdict(lambda: {'total': 0, 'correct': 0})

    for r in results:
        cat = r.get('category', 'unknown')
        cat_stats[cat]['total'] += 1
        if r.get('res') == 1:
            hit += 1
            cat_stats[cat]['correct'] += 1
        # The evaluation log tells which stage decided the verdict.
        log = r.get('eval_log', '')
        if "string level" in log:
            string_match += 1
        elif "semantic level" in log or "LLM judgement" in log:
            llm_called += 1
            if r.get('res') == 1:
                llm_match += 1

    # An empty input file yields 0% instead of a ZeroDivisionError.
    acc = hit / total * 100 if total else 0.0

    with open(output_file, 'w', encoding='utf-8') as f:
        for r in results:
            f.write(json.dumps(r, ensure_ascii=False) + '\n')

    print(f"\n {model_name}: {hit}/{total} ({acc:.1f}%)")
    print(f" String matches: {string_match}, LLM calls: {llm_called}, LLM matches: {llm_match}")
    print(f" Per category:")
    for cat, s in sorted(cat_stats.items(), key=lambda x: -x[1]['total']):
        cat_acc = s['correct'] / s['total'] * 100 if s['total'] > 0 else 0
        print(f" {cat:25s}: {s['correct']:3d}/{s['total']:3d} ({cat_acc:5.1f}%)")

    return {
        'model': model_name,
        'total': total,
        'string_matches': string_match,
        'llm_calls': llm_called,
        'llm_matches': llm_match,
        'final_correct': hit,
        'final_acc': round(acc, 2),
        'category_stats': {k: dict(v) for k, v in cat_stats.items()}
    }
288
+
289
+
290
def main():
    """Run the DeepSeek-judged scoring for both result sets and save a comparison report.

    Steps: create ``OUTPUT_DIR``, probe the API once (exit with status 1 when
    the probe returns ``FAIL_MSG``), score ``PHYX_RESULTS`` and
    ``PHYX50K_RESULTS`` with ``score_results``, then write
    ``comparison_report.json`` and print both accuracies.
    """
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    rule = "=" * 60

    print(rule)
    print(" PhyX-ALIGNED EVAL: DeepSeek-V3 as Judge")
    print(f" Pipeline: extract -> string match -> LLM judge (5 retries)")
    print(f" Output dir: {OUTPUT_DIR}")
    print(rule)

    # Fail fast when the judge API is unreachable.
    print("\nTesting DeepSeek API...")
    probe = call_deepseek("Say 'OK' if you can read this.")
    if probe == FAIL_MSG:
        print(f" API FAILED: {probe}")
        sys.exit(1)
    print(f" API OK: {probe[:50]}")

    # Score the physics-only model first, then the combined-data model.
    phyx_path = os.path.join(OUTPUT_DIR, "scored_results_phyx.jsonl")
    phyx_stats = score_results(PHYX_RESULTS, "SFT-phyx (1467 physics)", phyx_path)

    phyx50k_path = os.path.join(OUTPUT_DIR, "scored_results_phyx_50000.jsonl")
    phyx50k_stats = score_results(
        PHYX50K_RESULTS, "SFT-phyx_50000 (53001 combined)", phyx50k_path
    )

    # Persist both summaries side by side.
    report_file = os.path.join(OUTPUT_DIR, "comparison_report.json")
    report = {
        'timestamp': time.strftime('%Y-%m-%d %H:%M:%S'),
        'scoring_method': 'PhyX-aligned (DeepSeek-V3 judge, 5-shot ICE, 5 retries)',
        'phyx': phyx_stats,
        'phyx_50000': phyx50k_stats,
    }
    with open(report_file, 'w', encoding='utf-8') as out:
        json.dump(report, out, indent=2, ensure_ascii=False)

    print(f"\n{rule}")
    print(f" RESULTS")
    print(rule)
    print(f" phyx accuracy: {phyx_stats['final_acc']}%")
    print(f" phyx_50000 accuracy: {phyx50k_stats['final_acc']}%")
    print(f"\n Report: {report_file}")
    print(rule)
337
+
338
+
339
+ if __name__ == '__main__':
340
+ main()
eval_footprint/eval_lora_phyx_f.py ADDED
@@ -0,0 +1,259 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Phase 1: Open-ended inference on cluster (multi-GPU, no internet needed).
4
+
5
+ Runs both Base and SFT models on the 1533 open-ended physics test set.
6
+ Saves raw model outputs for later judging.
7
+
8
+ Usage (inside Docker container):
9
+ cd /tmp && python3 /path/to/eval_openended_inference.py
10
+
11
+ Output:
12
+ sft_eval_footprint/inference_results_base.jsonl
13
+ sft_eval_footprint/inference_results_sft.jsonl
14
+ """
15
+ import os
16
+ import sys
17
+ import json
18
+ import re
19
+ import time
20
+ import torch
21
+ import multiprocessing as mp
22
+ from collections import Counter
23
+
24
+ # ============ CONFIG ============
25
+ os.environ["HF_HUB_OFFLINE"] = "1"
26
+ os.environ["TRANSFORMERS_OFFLINE"] = "1"
27
+
28
+ BASE_MODEL = "/workspace/rl4phyx/models/Qwen2.5-VL-3B-Instruct"
29
+ SFT_MODEL = "/workspace/rl4phyx/RL4Phyx/SFT/checkpoints/sft_qwen25vl_3b_fullft/final"
30
+ TEST_FILE = "/workspace/rl4phyx/RL4Phyx/SFT/sft_eval_footprint/test_1533_openended.jsonl"
31
+ OUTPUT_DIR = "/workspace/rl4phyx/RL4Phyx/SFT/sft_eval_footprint"
32
+ IMAGE_DIR = "/workspace/rl4phyx/RL4Phyx/MetaPhyX/test_images"
33
+
34
+ # Multi-GPU config: base on GPUs 0-3, SFT on GPUs 4-7
35
+ BASE_GPUS = [0, 1, 2, 3]
36
+ SFT_GPUS = [4, 5, 6, 7]
37
+ MAX_NEW_TOKENS = 2048
38
+ # ================================
39
+
40
+
41
def load_test_data():
    """Return the records of ``TEST_FILE``: one parsed JSON value per non-blank line."""
    with open(TEST_FILE, 'r', encoding='utf-8') as handle:
        return [json.loads(raw) for raw in handle if raw.strip()]
49
+
50
+
51
def build_open_ended_prompt(sample):
    """Build the open-ended (no multiple-choice options) prompt for one sample.

    Args:
        sample: Mapping that may carry ``description`` and ``question``.
            A missing key or an explicit ``None`` (JSON ``null``) is rendered
            as empty text rather than the literal string "None".

    Returns:
        The prompt text with surrounding whitespace stripped.
    """
    desc = sample.get('description', '')
    if desc is None:
        desc = ''
    question = sample.get('question', '')
    if question is None:
        question = ''

    prompt = f"""Look at the image and answer the physics question.

{desc}

{question}

Please reason step by step, and put your final answer within \\boxed{{}}.
"""
    return prompt.strip()
65
+
66
+
67
def worker_inference(gpu_id, model_path, samples, output_file, model_name):
    """Load one model replica on a single GPU and generate answers for ``samples``.

    Args:
        gpu_id: CUDA device ordinal; model and inputs are placed on ``cuda:{gpu_id}``.
        model_path: Checkpoint directory, loaded with ``local_files_only=True``.
        samples: Records to process. Each must provide ``index``, ``image``,
            ``category``, ``question`` and ``ground_truth_value``.
        output_file: JSONL path, written once after all samples are processed.
        model_name: Label used in log lines and stored in every result record.

    Returns:
        The number of result records written.
    """
    import torch
    from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
    from qwen_vl_utils import process_vision_info

    device = f"cuda:{gpu_id}"
    print(f"[{model_name}][GPU {gpu_id}] Loading model...", flush=True)

    processor = AutoProcessor.from_pretrained(
        model_path,
        min_pixels=3136,
        max_pixels=200704,
        local_files_only=True,
        trust_remote_code=True,
    )
    model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
        model_path,
        torch_dtype=torch.bfloat16,
        device_map=device,
        local_files_only=True,
        trust_remote_code=True,
    )
    model.eval()
    print(f"[{model_name}][GPU {gpu_id}] Model loaded. Processing {len(samples)} samples.", flush=True)

    results = []
    for i, sample in enumerate(samples):
        idx = sample['index']
        prompt_text = build_open_ended_prompt(sample)
        image_path = os.path.join(IMAGE_DIR, sample['image'])

        # Build messages
        messages = [{
            "role": "user",
            "content": [
                {"type": "image", "image": f"file://{image_path}"},
                {"type": "text", "text": prompt_text},
            ],
        }]

        try:
            text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
            image_inputs, video_inputs = process_vision_info(messages)
            inputs = processor(
                text=[text],
                images=image_inputs,
                videos=video_inputs,
                padding=True,
                return_tensors="pt",
            ).to(device)

            with torch.no_grad():
                output_ids = model.generate(**inputs, max_new_tokens=MAX_NEW_TOKENS)

            # Drop the prompt tokens so only the newly generated part is decoded.
            generated = output_ids[0][inputs.input_ids.shape[1]:]
            response = processor.decode(generated, skip_special_tokens=True)
        except Exception as e:
            # Best-effort: one bad sample must not stop the shard. The error text
            # becomes the recorded output, and is also logged so failures are
            # visible while the job runs.
            response = f"ERROR: {str(e)}"
            print(f"[{model_name}][GPU {gpu_id}] sample {idx} failed: "
                  f"{type(e).__name__}: {e}", flush=True)

        result = {
            "index": idx,
            "category": sample['category'],
            "subfield": sample.get('subfield', ''),
            "question": sample['question'],
            "ground_truth_value": sample['ground_truth_value'],
            "ground_truth_letter": sample.get('ground_truth_letter', ''),
            "model_output": response,
            "model_name": model_name,
            "gpu_id": gpu_id,
        }
        results.append(result)

        if (i + 1) % 20 == 0 or (i + 1) == len(samples):
            print(f"[{model_name}][GPU {gpu_id}] {i+1}/{len(samples)} done", flush=True)

    # Write results
    with open(output_file, 'w', encoding='utf-8') as f:
        for r in results:
            f.write(json.dumps(r, ensure_ascii=False) + '\n')

    print(f"[{model_name}][GPU {gpu_id}] Saved {len(results)} results to {output_file}", flush=True)
    return len(results)
151
+
152
+
153
def run_model_parallel(model_path, model_name, gpu_ids, samples, output_base):
    """Split ``samples`` into contiguous chunks and run one worker process per GPU.

    Blocks until every started worker has exited.

    Args:
        model_path: Checkpoint path handed to each worker.
        model_name: Label handed to each worker and used in warnings.
        gpu_ids: GPU ordinals to use; GPUs whose chunk is empty get no worker.
        samples: Records to distribute.
        output_base: Path prefix; each worker writes ``{output_base}_gpu{id}.jsonl``.

    Returns:
        The shard paths of the workers that were started (empty when
        ``gpu_ids`` is empty or there is nothing to process).
    """
    n = len(samples)
    k = len(gpu_ids)
    if k == 0:
        # Nothing to run on; avoids dividing by zero below.
        return []
    chunk_size = (n + k - 1) // k  # ceiling division

    processes = []
    output_files = []
    used_gpus = []
    for i, gpu_id in enumerate(gpu_ids):
        chunk = samples[i * chunk_size: (i + 1) * chunk_size]
        if not chunk:
            continue
        out_file = f"{output_base}_gpu{gpu_id}.jsonl"
        output_files.append(out_file)
        used_gpus.append(gpu_id)
        p = mp.Process(
            target=worker_inference,
            args=(gpu_id, model_path, chunk, out_file, model_name)
        )
        processes.append(p)

    for p in processes:
        p.start()
    for p in processes:
        p.join()

    # A worker that died leaves no (or a stale) shard; make that visible.
    for gpu_id, p in zip(used_gpus, processes):
        if p.exitcode != 0:
            print(f"[{model_name}][GPU {gpu_id}] WARNING: worker exited with code "
                  f"{p.exitcode}; its results may be missing", flush=True)

    return output_files
179
+
180
+
181
def merge_results(output_files, final_output):
    """Merge per-GPU JSONL shards into one file ordered by ``index``.

    Args:
        output_files: Shard paths to merge. Missing shards are skipped with a
            warning so an incomplete merge does not go unnoticed.
        final_output: Destination JSONL path.

    Returns:
        The merged records, sorted by their ``index`` field.

    Side effects:
        Writes ``final_output`` and deletes every shard that exists.
    """
    all_results = []
    for shard in output_files:
        if not os.path.exists(shard):
            print(f"WARNING: shard not found, skipping: {shard}", flush=True)
            continue
        with open(shard, 'r', encoding='utf-8') as fh:
            for line in fh:
                if line.strip():
                    all_results.append(json.loads(line))

    # Sort by index for consistency
    all_results.sort(key=lambda x: x['index'])

    with open(final_output, 'w', encoding='utf-8') as out:
        for r in all_results:
            out.write(json.dumps(r, ensure_ascii=False) + '\n')

    # Cleanup per-GPU files
    for shard in output_files:
        if os.path.exists(shard):
            os.remove(shard)

    return all_results
204
+
205
+
206
def main():
    """Run base and SFT inference over the test set and merge the per-GPU shards.

    The two models run one after the other: ``run_model_parallel`` blocks until
    all of its workers have exited. Only the workers of one model run in
    parallel with each other.
    """
    mp.set_start_method('spawn', force=True)
    os.makedirs(OUTPUT_DIR, exist_ok=True)

    print("=" * 60)
    print(" OPEN-ENDED EVAL: Base vs SFT (Multi-GPU)")
    print(f" Base model: {BASE_MODEL}")
    print(f" SFT model: {SFT_MODEL}")
    print(f" Base GPUs: {BASE_GPUS}")
    print(f" SFT GPUs: {SFT_GPUS}")
    print("=" * 60)

    # Load test data
    samples = load_test_data()
    print(f"\nLoaded {len(samples)} test samples")

    cats = Counter(s['category'] for s in samples)
    for cat, cnt in sorted(cats.items(), key=lambda x: -x[1]):
        print(f" {cat}: {cnt}")

    t0 = time.time()

    base_output = os.path.join(OUTPUT_DIR, "inference_results_base")
    sft_output = os.path.join(OUTPUT_DIR, "inference_results_sft")

    # Base model on BASE_GPUS (one worker per GPU).
    print("\n>>> Starting BASE model inference...", flush=True)
    base_files = run_model_parallel(BASE_MODEL, "base", BASE_GPUS, samples, base_output)

    # SFT model on SFT_GPUS (one worker per GPU).
    print("\n>>> Starting SFT model inference...", flush=True)
    sft_files = run_model_parallel(SFT_MODEL, "sft", SFT_GPUS, samples, sft_output)

    # Merge exactly the shards this run produced. Rebuilding the list from the
    # GPU ids could pick up a stale shard left by an earlier run for a GPU that
    # received no chunk this time.
    base_final = os.path.join(OUTPUT_DIR, "inference_results_base.jsonl")
    sft_final = os.path.join(OUTPUT_DIR, "inference_results_sft.jsonl")

    base_results = merge_results(base_files, base_final)
    sft_results = merge_results(sft_files, sft_final)

    elapsed = time.time() - t0
    print(f"\n{'=' * 60}")
    print(f" INFERENCE COMPLETE in {elapsed/60:.1f} min")
    print(f" Base results: {len(base_results)} → {base_final}")
    print(f" SFT results: {len(sft_results)} → {sft_final}")
    print(f"{'=' * 60}")
256
+
257
+
258
+ if __name__ == '__main__':
259
+ main()
eval_footprint/eval_lora_phyx_f_final.py ADDED
@@ -0,0 +1,258 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Phase 1: Open-ended inference on cluster (multi-GPU, no internet needed).
4
+
5
+ Runs both Base and SFT models on the 1533 open-ended physics test set.
6
+ Saves raw model outputs for later judging.
7
+
8
+ Usage (inside Docker container):
9
+ cd /tmp && python3 /path/to/eval_openended_inference.py
10
+
11
+ Output:
12
+ sft_eval_footprint/inference_results_base.jsonl
13
+ sft_eval_footprint/inference_results_phyx.jsonl
14
+ """
15
+ import os
16
+ import sys
17
+ import json
18
+ import re
19
+ import time
20
+ import torch
21
+ import multiprocessing as mp
22
+ from collections import Counter
23
+
24
+ # ============ CONFIG ============
25
+ os.environ["HF_HUB_OFFLINE"] = "1"
26
+ os.environ["TRANSFORMERS_OFFLINE"] = "1"
27
+
28
+ BASE_MODEL = "/workspace/rl4phyx/models/Qwen2.5-VL-3B-Instruct"
29
+ SFT_MODEL = "/workspace/rl4phyx/RL4Phyx/SFT/checkpoints/sft_qwen25vl_3b_fullft_phyx/final"
30
+ TEST_FILE = "/workspace/rl4phyx/RL4Phyx/SFT/sft_eval_footprint/test_1533_openended.jsonl"
31
+ OUTPUT_DIR = "/workspace/rl4phyx/RL4Phyx/SFT/sft_eval_footprint"
32
+ IMAGE_DIR = "/workspace/rl4phyx/RL4Phyx/MetaPhyX/test_images"
33
+
34
+ # Multi-GPU config: the base run is skipped in this script; the SFT model runs on GPUs 0-3
35
+ BASE_GPUS = [0, 1, 2, 3]
36
+ SFT_GPUS = [0, 1, 2, 3]
37
+ MAX_NEW_TOKENS = 2048
38
+ # ================================
39
+
40
+
41
def load_test_data():
    """Return the records of ``TEST_FILE``: one parsed JSON value per non-blank line."""
    with open(TEST_FILE, 'r', encoding='utf-8') as handle:
        return [json.loads(raw) for raw in handle if raw.strip()]
49
+
50
+
51
def build_open_ended_prompt(sample):
    """Build the open-ended (no multiple-choice options) prompt for one sample.

    Args:
        sample: Mapping that may carry ``description`` and ``question``.
            A missing key or an explicit ``None`` (JSON ``null``) is rendered
            as empty text rather than the literal string "None".

    Returns:
        The prompt text with surrounding whitespace stripped.
    """
    desc = sample.get('description', '')
    if desc is None:
        desc = ''
    question = sample.get('question', '')
    if question is None:
        question = ''

    prompt = f"""Look at the image and answer the physics question.

{desc}

{question}

Please reason step by step, and put your final answer within \\boxed{{}}.
"""
    return prompt.strip()
65
+
66
+
67
def worker_inference(gpu_id, model_path, samples, output_file, model_name):
    """Load one model replica on a single GPU and generate answers for ``samples``.

    Args:
        gpu_id: CUDA device ordinal; model and inputs are placed on ``cuda:{gpu_id}``.
        model_path: Checkpoint directory, loaded with ``local_files_only=True``.
        samples: Records to process. Each must provide ``index``, ``image``,
            ``category``, ``question`` and ``ground_truth_value``.
        output_file: JSONL path, written once after all samples are processed.
        model_name: Label used in log lines and stored in every result record.

    Returns:
        The number of result records written.
    """
    import torch
    from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
    from qwen_vl_utils import process_vision_info

    device = f"cuda:{gpu_id}"
    print(f"[{model_name}][GPU {gpu_id}] Loading model...", flush=True)

    processor = AutoProcessor.from_pretrained(
        model_path,
        min_pixels=3136,
        max_pixels=200704,
        local_files_only=True,
        trust_remote_code=True,
    )
    model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
        model_path,
        torch_dtype=torch.bfloat16,
        device_map=device,
        local_files_only=True,
        trust_remote_code=True,
    )
    model.eval()
    print(f"[{model_name}][GPU {gpu_id}] Model loaded. Processing {len(samples)} samples.", flush=True)

    results = []
    for i, sample in enumerate(samples):
        idx = sample['index']
        prompt_text = build_open_ended_prompt(sample)
        image_path = os.path.join(IMAGE_DIR, sample['image'])

        # Build messages
        messages = [{
            "role": "user",
            "content": [
                {"type": "image", "image": f"file://{image_path}"},
                {"type": "text", "text": prompt_text},
            ],
        }]

        try:
            text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
            image_inputs, video_inputs = process_vision_info(messages)
            inputs = processor(
                text=[text],
                images=image_inputs,
                videos=video_inputs,
                padding=True,
                return_tensors="pt",
            ).to(device)

            with torch.no_grad():
                output_ids = model.generate(**inputs, max_new_tokens=MAX_NEW_TOKENS)

            # Drop the prompt tokens so only the newly generated part is decoded.
            generated = output_ids[0][inputs.input_ids.shape[1]:]
            response = processor.decode(generated, skip_special_tokens=True)
        except Exception as e:
            # Best-effort: one bad sample must not stop the shard. The error text
            # becomes the recorded output, and is also logged so failures are
            # visible while the job runs.
            response = f"ERROR: {str(e)}"
            print(f"[{model_name}][GPU {gpu_id}] sample {idx} failed: "
                  f"{type(e).__name__}: {e}", flush=True)

        result = {
            "index": idx,
            "category": sample['category'],
            "subfield": sample.get('subfield', ''),
            "question": sample['question'],
            "ground_truth_value": sample['ground_truth_value'],
            "ground_truth_letter": sample.get('ground_truth_letter', ''),
            "model_output": response,
            "model_name": model_name,
            "gpu_id": gpu_id,
        }
        results.append(result)

        if (i + 1) % 20 == 0 or (i + 1) == len(samples):
            print(f"[{model_name}][GPU {gpu_id}] {i+1}/{len(samples)} done", flush=True)

    # Write results
    with open(output_file, 'w', encoding='utf-8') as f:
        for r in results:
            f.write(json.dumps(r, ensure_ascii=False) + '\n')

    print(f"[{model_name}][GPU {gpu_id}] Saved {len(results)} results to {output_file}", flush=True)
    return len(results)
151
+
152
+
153
def run_model_parallel(model_path, model_name, gpu_ids, samples, output_base):
    """Split ``samples`` into contiguous chunks and run one worker process per GPU.

    Blocks until every started worker has exited.

    Args:
        model_path: Checkpoint path handed to each worker.
        model_name: Label handed to each worker and used in warnings.
        gpu_ids: GPU ordinals to use; GPUs whose chunk is empty get no worker.
        samples: Records to distribute.
        output_base: Path prefix; each worker writes ``{output_base}_gpu{id}.jsonl``.

    Returns:
        The shard paths of the workers that were started (empty when
        ``gpu_ids`` is empty or there is nothing to process).
    """
    n = len(samples)
    k = len(gpu_ids)
    if k == 0:
        # Nothing to run on; avoids dividing by zero below.
        return []
    chunk_size = (n + k - 1) // k  # ceiling division

    processes = []
    output_files = []
    used_gpus = []
    for i, gpu_id in enumerate(gpu_ids):
        chunk = samples[i * chunk_size: (i + 1) * chunk_size]
        if not chunk:
            continue
        out_file = f"{output_base}_gpu{gpu_id}.jsonl"
        output_files.append(out_file)
        used_gpus.append(gpu_id)
        p = mp.Process(
            target=worker_inference,
            args=(gpu_id, model_path, chunk, out_file, model_name)
        )
        processes.append(p)

    for p in processes:
        p.start()
    for p in processes:
        p.join()

    # A worker that died leaves no (or a stale) shard; make that visible.
    for gpu_id, p in zip(used_gpus, processes):
        if p.exitcode != 0:
            print(f"[{model_name}][GPU {gpu_id}] WARNING: worker exited with code "
                  f"{p.exitcode}; its results may be missing", flush=True)

    return output_files
179
+
180
+
181
def merge_results(output_files, final_output):
    """Merge per-GPU JSONL shards into one file ordered by ``index``.

    Args:
        output_files: Shard paths to merge. Missing shards are skipped with a
            warning so an incomplete merge does not go unnoticed.
        final_output: Destination JSONL path.

    Returns:
        The merged records, sorted by their ``index`` field.

    Side effects:
        Writes ``final_output`` and deletes every shard that exists.
    """
    all_results = []
    for shard in output_files:
        if not os.path.exists(shard):
            print(f"WARNING: shard not found, skipping: {shard}", flush=True)
            continue
        with open(shard, 'r', encoding='utf-8') as fh:
            for line in fh:
                if line.strip():
                    all_results.append(json.loads(line))

    # Sort by index for consistency
    all_results.sort(key=lambda x: x['index'])

    with open(final_output, 'w', encoding='utf-8') as out:
        for r in all_results:
            out.write(json.dumps(r, ensure_ascii=False) + '\n')

    # Cleanup per-GPU files
    for shard in output_files:
        if os.path.exists(shard):
            os.remove(shard)

    return all_results
204
+
205
+
206
def main():
    """Run SFT inference over the test set and merge the per-GPU shards.

    The base model is intentionally not run by this script; only the SFT
    checkpoint is evaluated, on ``SFT_GPUS``.
    """
    mp.set_start_method('spawn', force=True)
    os.makedirs(OUTPUT_DIR, exist_ok=True)

    print("=" * 60)
    print(" OPEN-ENDED EVAL: Base vs SFT (Multi-GPU)")
    print(f" Base model: {BASE_MODEL}")
    print(f" SFT model: {SFT_MODEL}")
    print(f" Base GPUs: {BASE_GPUS}")
    print(f" SFT GPUs: {SFT_GPUS}")
    print("=" * 60)

    # Load test data
    samples = load_test_data()
    print(f"\nLoaded {len(samples)} test samples")

    cats = Counter(s['category'] for s in samples)
    for cat, cnt in sorted(cats.items(), key=lambda x: -x[1]):
        print(f" {cat}: {cnt}")

    t0 = time.time()

    sft_output = os.path.join(OUTPUT_DIR, "inference_results_phyx")

    # Base model inference is skipped in this script.

    # SFT model on SFT_GPUS (one worker per GPU).
    print("\n>>> Starting SFT model inference...", flush=True)
    sft_files = run_model_parallel(SFT_MODEL, "sft", SFT_GPUS, samples, sft_output)

    # Merge exactly the shards this run produced. Rebuilding the list from the
    # GPU ids could pick up a stale shard left by an earlier run for a GPU that
    # received no chunk this time.
    sft_final = os.path.join(OUTPUT_DIR, "inference_results_phyx.jsonl")
    sft_results = merge_results(sft_files, sft_final)

    elapsed = time.time() - t0
    print(f"\n{'=' * 60}")
    print(f" INFERENCE COMPLETE in {elapsed/60:.1f} min")
    # No base file is written here, so no count or path is reported for it.
    print(f" Base results: skipped (not run by this script)")
    print(f" SFT results: {len(sft_results)} → {sft_final}")
    print(f"{'=' * 60}")
255
+
256
+
257
+ if __name__ == '__main__':
258
+ main()
eval_footprint/eval_openended_inference.py ADDED
@@ -0,0 +1,259 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Phase 1: Open-ended inference on cluster (multi-GPU, no internet needed).
4
+
5
+ Runs both Base and SFT models on the 1533 open-ended physics test set.
6
+ Saves raw model outputs for later judging.
7
+
8
+ Usage (inside Docker container):
9
+ cd /tmp && python3 /path/to/eval_openended_inference.py
10
+
11
+ Output:
12
+ sft_eval_footprint/inference_results_base.jsonl
13
+ sft_eval_footprint/inference_results_sft.jsonl
14
+ """
15
+ import os
16
+ import sys
17
+ import json
18
+ import re
19
+ import time
20
+ import torch
21
+ import multiprocessing as mp
22
+ from collections import Counter
23
+
24
+ # ============ CONFIG ============
25
+ os.environ["HF_HUB_OFFLINE"] = "1"
26
+ os.environ["TRANSFORMERS_OFFLINE"] = "1"
27
+
28
+ BASE_MODEL = "/workspace/rl4phyx/models/Qwen2.5-VL-3B-Instruct"
29
+ SFT_MODEL = "/workspace/rl4phyx/RL4Phyx/SFT/checkpoints/sft_qwen25vl_3b_fullft/final"
30
+ TEST_FILE = "/workspace/rl4phyx/RL4Phyx/SFT/sft_eval_footprint/test_1533_openended.jsonl"
31
+ OUTPUT_DIR = "/workspace/rl4phyx/RL4Phyx/SFT/sft_eval_footprint"
32
+ IMAGE_DIR = "/workspace/rl4phyx/RL4Phyx/MetaPhyX/test_images"
33
+
34
+ # Multi-GPU config: base on GPUs 0-3, SFT on GPUs 4-7
35
+ BASE_GPUS = [0, 1, 2, 3]
36
+ SFT_GPUS = [4, 5, 6, 7]
37
+ MAX_NEW_TOKENS = 2048
38
+ # ================================
39
+
40
+
41
def load_test_data():
    """Return the records of ``TEST_FILE``: one parsed JSON value per non-blank line."""
    with open(TEST_FILE, 'r', encoding='utf-8') as handle:
        return [json.loads(raw) for raw in handle if raw.strip()]
49
+
50
+
51
def build_open_ended_prompt(sample):
    """Build the open-ended (no multiple-choice options) prompt for one sample.

    Args:
        sample: Mapping that may carry ``description`` and ``question``.
            A missing key or an explicit ``None`` (JSON ``null``) is rendered
            as empty text rather than the literal string "None".

    Returns:
        The prompt text with surrounding whitespace stripped.
    """
    desc = sample.get('description', '')
    if desc is None:
        desc = ''
    question = sample.get('question', '')
    if question is None:
        question = ''

    prompt = f"""Look at the image and answer the physics question.

{desc}

{question}

Please reason step by step, and put your final answer within \\boxed{{}}.
"""
    return prompt.strip()
65
+
66
+
67
def worker_inference(gpu_id, model_path, samples, output_file, model_name):
    """Load one model replica on a single GPU and generate answers for ``samples``.

    Args:
        gpu_id: CUDA device ordinal; model and inputs are placed on ``cuda:{gpu_id}``.
        model_path: Checkpoint directory, loaded with ``local_files_only=True``.
        samples: Records to process. Each must provide ``index``, ``image``,
            ``category``, ``question`` and ``ground_truth_value``.
        output_file: JSONL path, written once after all samples are processed.
        model_name: Label used in log lines and stored in every result record.

    Returns:
        The number of result records written.
    """
    import torch
    from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
    from qwen_vl_utils import process_vision_info

    device = f"cuda:{gpu_id}"
    print(f"[{model_name}][GPU {gpu_id}] Loading model...", flush=True)

    processor = AutoProcessor.from_pretrained(
        model_path,
        min_pixels=3136,
        max_pixels=200704,
        local_files_only=True,
        trust_remote_code=True,
    )
    model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
        model_path,
        torch_dtype=torch.bfloat16,
        device_map=device,
        local_files_only=True,
        trust_remote_code=True,
    )
    model.eval()
    print(f"[{model_name}][GPU {gpu_id}] Model loaded. Processing {len(samples)} samples.", flush=True)

    results = []
    for i, sample in enumerate(samples):
        idx = sample['index']
        prompt_text = build_open_ended_prompt(sample)
        image_path = os.path.join(IMAGE_DIR, sample['image'])

        # Build messages
        messages = [{
            "role": "user",
            "content": [
                {"type": "image", "image": f"file://{image_path}"},
                {"type": "text", "text": prompt_text},
            ],
        }]

        try:
            text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
            image_inputs, video_inputs = process_vision_info(messages)
            inputs = processor(
                text=[text],
                images=image_inputs,
                videos=video_inputs,
                padding=True,
                return_tensors="pt",
            ).to(device)

            with torch.no_grad():
                output_ids = model.generate(**inputs, max_new_tokens=MAX_NEW_TOKENS)

            # Drop the prompt tokens so only the newly generated part is decoded.
            generated = output_ids[0][inputs.input_ids.shape[1]:]
            response = processor.decode(generated, skip_special_tokens=True)
        except Exception as e:
            # Best-effort: one bad sample must not stop the shard. The error text
            # becomes the recorded output, and is also logged so failures are
            # visible while the job runs.
            response = f"ERROR: {str(e)}"
            print(f"[{model_name}][GPU {gpu_id}] sample {idx} failed: "
                  f"{type(e).__name__}: {e}", flush=True)

        result = {
            "index": idx,
            "category": sample['category'],
            "subfield": sample.get('subfield', ''),
            "question": sample['question'],
            "ground_truth_value": sample['ground_truth_value'],
            "ground_truth_letter": sample.get('ground_truth_letter', ''),
            "model_output": response,
            "model_name": model_name,
            "gpu_id": gpu_id,
        }
        results.append(result)

        if (i + 1) % 20 == 0 or (i + 1) == len(samples):
            print(f"[{model_name}][GPU {gpu_id}] {i+1}/{len(samples)} done", flush=True)

    # Write results
    with open(output_file, 'w', encoding='utf-8') as f:
        for r in results:
            f.write(json.dumps(r, ensure_ascii=False) + '\n')

    print(f"[{model_name}][GPU {gpu_id}] Saved {len(results)} results to {output_file}", flush=True)
    return len(results)
151
+
152
+
153
def run_model_parallel(model_path, model_name, gpu_ids, samples, output_base):
    """Split ``samples`` into contiguous chunks and run one worker process per GPU.

    Blocks until every started worker has exited.

    Args:
        model_path: Checkpoint path handed to each worker.
        model_name: Label handed to each worker and used in warnings.
        gpu_ids: GPU ordinals to use; GPUs whose chunk is empty get no worker.
        samples: Records to distribute.
        output_base: Path prefix; each worker writes ``{output_base}_gpu{id}.jsonl``.

    Returns:
        The shard paths of the workers that were started (empty when
        ``gpu_ids`` is empty or there is nothing to process).
    """
    n = len(samples)
    k = len(gpu_ids)
    if k == 0:
        # Nothing to run on; avoids dividing by zero below.
        return []
    chunk_size = (n + k - 1) // k  # ceiling division

    processes = []
    output_files = []
    used_gpus = []
    for i, gpu_id in enumerate(gpu_ids):
        chunk = samples[i * chunk_size: (i + 1) * chunk_size]
        if not chunk:
            continue
        out_file = f"{output_base}_gpu{gpu_id}.jsonl"
        output_files.append(out_file)
        used_gpus.append(gpu_id)
        p = mp.Process(
            target=worker_inference,
            args=(gpu_id, model_path, chunk, out_file, model_name)
        )
        processes.append(p)

    for p in processes:
        p.start()
    for p in processes:
        p.join()

    # A worker that died leaves no (or a stale) shard; make that visible.
    for gpu_id, p in zip(used_gpus, processes):
        if p.exitcode != 0:
            print(f"[{model_name}][GPU {gpu_id}] WARNING: worker exited with code "
                  f"{p.exitcode}; its results may be missing", flush=True)

    return output_files
179
+
180
+
181
def merge_results(output_files, final_output):
    """Merge per-GPU JSONL shards into one file ordered by ``index``.

    Args:
        output_files: Shard paths to merge. Missing shards are skipped with a
            warning so an incomplete merge does not go unnoticed.
        final_output: Destination JSONL path.

    Returns:
        The merged records, sorted by their ``index`` field.

    Side effects:
        Writes ``final_output`` and deletes every shard that exists.
    """
    all_results = []
    for shard in output_files:
        if not os.path.exists(shard):
            print(f"WARNING: shard not found, skipping: {shard}", flush=True)
            continue
        with open(shard, 'r', encoding='utf-8') as fh:
            for line in fh:
                if line.strip():
                    all_results.append(json.loads(line))

    # Sort by index for consistency
    all_results.sort(key=lambda x: x['index'])

    with open(final_output, 'w', encoding='utf-8') as out:
        for r in all_results:
            out.write(json.dumps(r, ensure_ascii=False) + '\n')

    # Cleanup per-GPU files
    for shard in output_files:
        if os.path.exists(shard):
            os.remove(shard)

    return all_results
204
+
205
+
206
def main():
    """Run base and SFT inference over the test set and merge the per-GPU shards.

    The two models run one after the other: ``run_model_parallel`` blocks until
    all of its workers have exited. Only the workers of one model run in
    parallel with each other.
    """
    mp.set_start_method('spawn', force=True)
    os.makedirs(OUTPUT_DIR, exist_ok=True)

    print("=" * 60)
    print(" OPEN-ENDED EVAL: Base vs SFT (Multi-GPU)")
    print(f" Base model: {BASE_MODEL}")
    print(f" SFT model: {SFT_MODEL}")
    print(f" Base GPUs: {BASE_GPUS}")
    print(f" SFT GPUs: {SFT_GPUS}")
    print("=" * 60)

    # Load test data
    samples = load_test_data()
    print(f"\nLoaded {len(samples)} test samples")

    cats = Counter(s['category'] for s in samples)
    for cat, cnt in sorted(cats.items(), key=lambda x: -x[1]):
        print(f" {cat}: {cnt}")

    t0 = time.time()

    base_output = os.path.join(OUTPUT_DIR, "inference_results_base")
    sft_output = os.path.join(OUTPUT_DIR, "inference_results_sft")

    # Base model on BASE_GPUS (one worker per GPU).
    print("\n>>> Starting BASE model inference...", flush=True)
    base_files = run_model_parallel(BASE_MODEL, "base", BASE_GPUS, samples, base_output)

    # SFT model on SFT_GPUS (one worker per GPU).
    print("\n>>> Starting SFT model inference...", flush=True)
    sft_files = run_model_parallel(SFT_MODEL, "sft", SFT_GPUS, samples, sft_output)

    # Merge exactly the shards this run produced. Rebuilding the list from the
    # GPU ids could pick up a stale shard left by an earlier run for a GPU that
    # received no chunk this time.
    base_final = os.path.join(OUTPUT_DIR, "inference_results_base.jsonl")
    sft_final = os.path.join(OUTPUT_DIR, "inference_results_sft.jsonl")

    base_results = merge_results(base_files, base_final)
    sft_results = merge_results(sft_files, sft_final)

    elapsed = time.time() - t0
    print(f"\n{'=' * 60}")
    print(f" INFERENCE COMPLETE in {elapsed/60:.1f} min")
    print(f" Base results: {len(base_results)} → {base_final}")
    print(f" SFT results: {len(sft_results)} → {sft_final}")
    print(f"{'=' * 60}")
256
+
257
+
258
+ if __name__ == '__main__':
259
+ main()
eval_footprint/eval_openended_judge.py ADDED
@@ -0,0 +1,467 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Phase 2: Score open-ended inference results.
4
+
5
+ Two-stage scoring (adapted from eval_dual.py + MetaPhyX DeepSeek judge):
6
+ Stage 1: Rule-based (boxed extraction + normalization + numeric tolerance)
7
+ - If CORRECT → done, count as correct
8
+ - If WRONG or UNCERTAIN → go to Stage 2
9
+ Stage 2: Gemini 2.5 Flash LLM-as-Judge
10
+ - Sends model's full response + ground truth to Gemini
11
+ - Gemini determines [[YES]] or [[NO]] equivalence
12
+
13
+ Usage:
14
+ python eval_openended_judge.py [--results_dir PATH] [--api_key KEY]
15
+
16
+ Inputs:
17
+ inference_results_base.jsonl
18
+ inference_results_sft.jsonl
19
+
20
+ Outputs:
21
+ scored_results_base.jsonl
22
+ scored_results_sft.jsonl
23
+ comparison_report.json
24
+ """
25
+ import json, os, re, time, sys, argparse
26
+ from collections import defaultdict, Counter
27
+
28
+ # ===================== CONFIG =====================
29
# SECURITY: never commit API keys. The key is read from the GEMINI_API_KEY
# environment variable (or supplied via --api_key); there is deliberately no
# hard-coded fallback. Any key that was previously committed in this file must
# be treated as compromised and rotated.
GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY", "")
GEMINI_MODEL = "gemini-2.5-flash"
MAX_RETRIES = 3
RATE_LIMIT_DELAY = 0.5  # seconds between Gemini calls
33
+
34
+ # ===================== RULE-BASED SCORING =====================
35
+ # Adapted from eval_dual.py (verl/utils/reward_score/utils/utils.py approach)
36
+
37
def _strip_string(string):
    """Normalize a math answer string.

    Drops newlines and LaTeX spacing/sizing commands, unifies ``tfrac``/``dfrac``
    to ``frac``, removes degree/dollar/percent markers, cuts a single trailing
    ``\\text{ ...}`` unit, adds a leading zero to bare decimals, strips a short
    ``x =`` prefix and finally removes all spaces.
    """
    # Literal substitutions, applied strictly in this order.
    for old, new in (
        ("\n", ""),
        ("\\!", ""),
        ("\\\\", "\\"),
        ("tfrac", "frac"),
        ("dfrac", "frac"),
        ("\\left", ""),
        ("\\right", ""),
        ("^{\\circ}", ""),
        ("^\\circ", ""),
        ("\\$", ""),
    ):
        string = string.replace(old, new)

    # Exactly one "\text{ " marker: keep only what precedes it (the value).
    text_parts = string.split("\\text{ ")
    if len(text_parts) == 2:
        string = text_parts[0]

    for old, new in (("\\%", ""), (" .", " 0."), ("{.", "{0.")):
        string = string.replace(old, new)

    if not string:
        return string
    if string.startswith("."):
        string = "0" + string

    # "x = 5" style answers: keep the right-hand side when the left is short.
    sides = string.split("=")
    if len(sides) == 2 and len(sides[0]) <= 2:
        string = sides[1]

    return string.replace(" ", "")
65
+
66
+
67
# Units written as words or multi-character symbols. They are matched
# case-insensitively but only as whole tokens (not inside a longer word).
_NORMALIZE_UNIT_WORDS = (
    "degree", "cm", "centimeter", "meter", "mile", "second", "minute",
    "hour", "day", "week", "month", "year", "foot", "feet", "inch", "yard",
    "newton", "joule", "watt", "ampere", "volt", "ohm", "hertz",
    "kilogram", "gram", "liter", "mole", "kelvin", "pascal",
    "m/s", "km/h", "rad/s", "Hz", "Pa", "kg", "mol",
)
# Single-letter SI symbols. These are only treated as units when they directly
# follow a numeric value, and they are matched case-sensitively; otherwise they
# would erase ordinary letters (and answer choices such as "A").
_NORMALIZE_UNIT_LETTERS = ("N", "J", "W", "A", "V")

_UNIT_WORD_RE = re.compile(
    r"(?<![A-Za-z])\s*(?:"
    + "|".join(
        re.escape(u)
        for u in sorted(_NORMALIZE_UNIT_WORDS, key=len, reverse=True)
    )
    + r")(?:es|s)?(?![A-Za-z])\s*(?:\^[0-9]+)?",
    re.IGNORECASE,
)
_UNIT_LETTER_RE = re.compile(
    r"(?<=[0-9})])\s*(?:"
    + "|".join(_NORMALIZE_UNIT_LETTERS)
    + r")(?![A-Za-z0-9])\s*(?:\^[0-9]+)?"
)


def _strip_units(expr):
    """Remove unit tokens from *expr*, repeating until nothing more changes."""
    previous = None
    while previous != expr:
        previous = expr
        expr = _UNIT_LETTER_RE.sub("", expr)
        expr = _UNIT_WORD_RE.sub("", expr)
    return expr


def _normalize(expr):
    """Normalize an answer expression for comparison.

    Args:
        expr: answer text (any non-string value is converted with ``str``),
            or ``None``.

    Returns:
        ``None`` if *expr* is ``None``; otherwise a lower-cased string with a
        ``\\text{}`` wrapper, ``$``/``%`` signs, units, braces and spaces
        removed, and integer-valued decimals (``"3.0"``) collapsed to ``"3"``.
    """
    if expr is None:
        return None
    if not isinstance(expr, str):
        expr = str(expr)

    m = re.search("^\\\\text\\{(?P<text>.+?)\\}$", expr)
    if m is not None:
        expr = m.group("text")

    expr = expr.replace("\\%", "%")
    expr = expr.replace("\\$", "$")
    expr = expr.replace("$", "")
    expr = expr.replace("%", "")
    expr = expr.replace(" or ", " , ")
    expr = expr.replace(" and ", " , ")

    # Boundary-aware unit removal: the previous substring-based, fully
    # case-insensitive version deleted every n/j/w/a/v letter from the answer,
    # so e.g. "A" and "V" both normalized to "" and compared equal.
    expr = _strip_units(expr)

    if len(expr) > 0 and expr[0] == "{" and expr[-1] == "}":
        expr = expr[1:-1]

    # Collapse integer-valued decimals ("3.0" -> "3"); leave anything that is
    # not a plain float untouched.
    if "." in expr:
        try:
            val = float(expr)
            if abs(val - int(round(val))) <= 1e-7:
                expr = str(int(round(val)))
        except (ValueError, OverflowError):
            pass

    expr = re.sub("- *", "-", expr)
    expr = expr.replace(" ", "")
    expr = expr.replace("{", "")
    expr = expr.replace("}", "")
    expr = expr.lower()
    return expr
101
+
102
+
103
def extract_boxed_answer(text):
    """Extract the content of the last ``\\boxed{...}`` in *text*.

    If no ``\\boxed`` is present, the last ``\\fbox{...}`` is used instead.
    Nested braces inside the box are balanced.

    Returns:
        The text between the outer braces, or ``None`` when there is no box,
        the braces never close, or the command is not immediately followed
        by ``{``.
    """
    command = "\\boxed"
    start = text.rfind(command)
    if start < 0:
        command = "\\fbox"
        start = text.rfind(command)
        if start < 0:
            return None

    # Walk forward until the brace opened after the command is closed again.
    depth = 0
    end = None
    for pos in range(start, len(text)):
        ch = text[pos]
        if ch == "{":
            depth += 1
        elif ch == "}":
            depth -= 1
            if depth == 0:
                end = pos
                break
    if end is None:
        return None

    # Strip the prefix of the command that was actually found; previously only
    # "\boxed{" was recognised here, so every \fbox answer was discarded.
    prefix = command + "{"
    boxed = text[start:end + 1]
    if boxed.startswith(prefix):
        return boxed[len(prefix):-1]
    return None
129
+
130
+
131
def extract_answer_from_text(text):
    """Extract the final answer from a model response.

    Anything up to the last ``</think>`` is discarded first. The answer is then
    taken from the last ``\\boxed{}`` if present, otherwise from the first
    "the answer is ..." / "therefore ..." style phrase.

    Returns:
        The extracted answer string, or ``None`` if nothing usable was found
        (phrase matches of 100 characters or more are rejected).
    """
    # Handle <think>...</think>
    if '<think>' in text and '</think>' in text:
        text = text.split('</think>')[-1]

    # Priority 1: \boxed{}
    boxed = extract_boxed_answer(text)
    if boxed:
        return boxed

    # Priority 2: Common answer patterns.
    # A '.' only ends the answer when it is not a decimal point; otherwise
    # "the answer is 3.14" would be extracted as "3". The leading \b keeps
    # "so" from matching inside words such as "also".
    patterns = [
        r'(?:the answer is|answer is|答案是|答案为)[:\s]*(.+?)(?:\.(?!\d)|$)',
        r'\b(?:therefore|thus|so|hence)[,\s]+(?:the answer is\s+)?(.+?)(?:\.(?!\d)|$)',
    ]
    for p in patterns:
        m = re.search(p, text, re.IGNORECASE)
        if m:
            ans = m.group(1).strip()
            if len(ans) < 100:
                return ans

    return None
155
+
156
+
157
def rule_based_score(prediction, ground_truth):
    """Rule-based scoring: extract the answer, normalize it and compare.

    This stage is deliberately conservative: a miss is re-checked by the LLM
    judge, whereas a false positive here is never reviewed.

    Args:
        prediction: full model response.
        ground_truth: reference answer (converted to ``str`` if needed).

    Returns:
        ``(is_correct, reason)`` where *reason* names the rule that fired.
    """
    prediction = "" if prediction is None else str(prediction)
    ground_truth = "" if ground_truth is None else str(ground_truth)

    model_answer = extract_answer_from_text(prediction)
    if model_answer is None:
        return False, "no_answer_extracted"

    gt_norm = _normalize(ground_truth)
    pred_norm = _normalize(model_answer)

    if gt_norm is None or pred_norm is None:
        return False, "normalize_failed"

    # Direct match after normalization. Two empty strings carry no
    # information, so they are not accepted as a match.
    if gt_norm and gt_norm == pred_norm:
        return True, "exact_match"

    # Numeric comparison (1% relative tolerance). The absolute tolerance is
    # only used when the reference is exactly zero; applied to every value it
    # would accept e.g. 5e-7 for a reference of 1e-9.
    try:
        gt_float = float(gt_norm.replace(",", ""))
        pred_float = float(pred_norm.replace(",", ""))
    except ValueError:
        pass
    else:
        if gt_float == pred_float or (gt_float == 0 and abs(pred_float) < 1e-6):
            return True, "numeric_match"
        if gt_float != 0 and abs((gt_float - pred_float) / gt_float) < 0.01:
            return True, "numeric_close"

    # Short answer containment (e.g., "III", "decreasing"). Only the extracted
    # answer is searched, never the whole reasoning text, and numeric or empty
    # references are excluded: a reference such as "2" or "A" would otherwise
    # match almost any response.
    gt_clean = ground_truth.strip()
    if gt_clean and len(gt_clean) <= 10 and not any(c.isdigit() for c in gt_clean):
        # A single-letter reference must match case-sensitively so that the
        # article "a" is not mistaken for choice "A".
        flags = re.IGNORECASE if len(gt_clean) > 1 else 0
        if re.search(r'(?<!\w)' + re.escape(gt_clean) + r'(?!\w)', model_answer, flags):
            return True, "containment_match"

    return False, f"no_match(pred={pred_norm[:30]},gt={gt_norm[:30]})"
194
+
195
+
196
+ # ===================== GEMINI LLM-AS-JUDGE =====================
197
+ # Adapted from eval_dual.py + MetaPhyX deepscaler ORM prompt
198
+
199
+ ORM_PROMPT = """You are an expert in verifying if two physics answers are the same.
200
+ Your input is a physics question prompt and two answers:
201
+ - Answer 1: the model's prediction
202
+ - Answer 2: the ground truth answer
203
+
204
+ Determine if they are equivalent.
205
+
206
+ Guidelines for equivalence:
207
+ - Different forms of the same number (0.5 = 1/2 = 50%)
208
+ - Same physical quantity with different units or notation (7.55N = 7.55 N = 7.55 newtons)
209
+ - Semantically equivalent descriptions ("point III" and "III", "decreasing" and "the velocity is decreasing")
210
+ - Algebraically equivalent expressions (x+1)^2 = x^2+2x+1
211
+ - Same choice letter or option name
212
+ - Correct numerical value even if formatting differs
213
+ - Minor rounding differences within 2% are acceptable
214
+
215
+ Your output must follow this format:
216
+ 1) Brief explanation for why the answers are equivalent or not.
217
+ 2) Final answer: [[YES]] or [[NO]]
218
+ """
219
+
220
+
221
def call_gemini(prompt, api_key):
    """Call the Gemini generateContent endpoint using urllib (no external deps).

    Args:
        prompt: full prompt text sent as a single user part.
        api_key: Gemini API key.

    Returns:
        The stripped text of the first candidate, or ``None`` if every one of
        the ``MAX_RETRIES`` attempts failed (HTTP error, network error or a
        response without the expected text field).
    """
    import urllib.request, urllib.error

    url = f"https://generativelanguage.googleapis.com/v1beta/models/{GEMINI_MODEL}:generateContent"
    payload = json.dumps({
        "contents": [{"parts": [{"text": prompt}]}],
        "generationConfig": {
            "temperature": 0.0,
            "maxOutputTokens": 512,
        }
    }).encode('utf-8')

    # The key travels in a request header rather than the URL query string so
    # it does not end up in logs, proxies or error messages that echo the URL.
    req = urllib.request.Request(
        url, data=payload,
        headers={
            "Content-Type": "application/json",
            "x-goog-api-key": api_key,
        },
        method="POST",
    )

    for attempt in range(MAX_RETRIES):
        is_last_attempt = attempt == MAX_RETRIES - 1
        try:
            with urllib.request.urlopen(req, timeout=30) as resp:
                result = json.loads(resp.read().decode('utf-8'))
            text = result['candidates'][0]['content']['parts'][0]['text']
            return text.strip()
        except urllib.error.HTTPError as e:
            if e.code == 429:
                # Linear back-off on rate limiting.
                wait = (attempt + 1) * 5
                if not is_last_attempt:
                    print(f" Rate limited, waiting {wait}s...")
                else:
                    print(" Rate limited, giving up")
            else:
                print(f" HTTP error {e.code}")
                wait = 2
        except Exception as e:
            # Network failures, invalid JSON, or a response without text.
            print(f" Error: {e}")
            wait = 2

        # No point in sleeping when there is no further attempt.
        if is_last_attempt:
            return None
        time.sleep(wait)
    return None
262
+
263
+
264
def gemini_judge(prediction, ground_truth, api_key):
    """Use Gemini to judge if the model's prediction matches the ground truth.

    Args:
        prediction: full model response (reasoning and answer).
        ground_truth: reference answer.
        api_key: Gemini API key.

    Returns:
        ``(is_match, reason)``. *reason* is ``"api_error"`` when no response
        could be obtained, otherwise the first 200 characters of the reply.
    """
    # Long responses are cut from the FRONT: the final answer sits at the end
    # of the response, so keeping only the first 2000 characters could remove
    # exactly the part the judge needs.
    max_chars = 2000
    if len(prediction) > max_chars:
        excerpt = "[... earlier reasoning truncated ...]\n" + prediction[-max_chars:]
    else:
        excerpt = prediction

    user_msg = (
        "\nModel's full response (contains reasoning and answer):\n"
        f"{excerpt}\n"
        "\n"
        f"Ground truth answer: {ground_truth}\n"
    )
    response = call_gemini(ORM_PROMPT + "\n\n" + user_msg, api_key)

    if response is None:
        return False, "api_error"

    # If both markers occur (e.g. the explanation quotes the output format),
    # the one that appears last is the verdict.
    yes_at = response.rfind("[[YES]]")
    no_at = response.rfind("[[NO]]")
    if yes_at >= 0 or no_at >= 0:
        return yes_at > no_at, response[:200]

    # Fallback for replies without markers: whole-word yes/no only, so that
    # "not" or "know" are not read as "no".
    lower = response.lower()
    said_yes = re.search(r"\byes\b", lower) is not None
    said_no = re.search(r"\bno\b", lower) is not None
    return (said_yes and not said_no), response[:200]
286
+
287
+
288
+ # ===================== MAIN EVALUATION =====================
289
+
290
def score_model(results, model_name, api_key, output_file):
    """Score all results using the two-stage approach.

    1. Rule-based first; if it says correct, the sample is done.
    2. Otherwise the Gemini judge decides.

    Each result dict is annotated in place (``rule_match``, ``rule_reason``,
    ``gemini_called``, ``gemini_match``, ``gemini_reason``, ``final_correct``,
    ``final_method``) and all of them are written to *output_file* as JSONL.

    Args:
        results: list of inference result dicts (reads ``category``,
            ``model_output`` and ``ground_truth_value``).
        model_name: label used in the printed report.
        api_key: Gemini API key.
        output_file: path of the scored JSONL file to write.

    Returns:
        Summary dict with totals, accuracy (percent, 2 decimals) and
        per-category statistics.
    """
    print(f"\n{'='*60}")
    print(f" Scoring: {model_name} ({len(results)} samples)")
    print(f"{'='*60}")

    rule_correct = 0
    rule_wrong_gemini_correct = 0
    rule_wrong_gemini_wrong = 0
    gemini_errors = 0
    total = len(results)

    cat_stats = defaultdict(lambda: {'total': 0, 'rule_correct': 0, 'gemini_correct': 0, 'final_correct': 0})

    for i, r in enumerate(results):
        cat = r.get('category', 'Unknown')
        pred = r.get('model_output', '')
        gt = r.get('ground_truth_value', '')
        cat_stats[cat]['total'] += 1

        # === Stage 1: Rule-based ===
        rule_match, rule_reason = rule_based_score(pred, gt)
        r['rule_match'] = rule_match
        r['rule_reason'] = rule_reason

        if rule_match:
            # Rule says CORRECT -> done
            rule_correct += 1
            cat_stats[cat]['rule_correct'] += 1
            cat_stats[cat]['final_correct'] += 1
            r['final_correct'] = True
            r['final_method'] = f"rule:{rule_reason}"
            r['gemini_called'] = False
        else:
            # Rule says WRONG -> Gemini fallback
            r['gemini_called'] = True
            gemini_match, gemini_reason = gemini_judge(pred, gt, api_key)
            r['gemini_match'] = gemini_match
            r['gemini_reason'] = gemini_reason

            if gemini_match:
                rule_wrong_gemini_correct += 1
                cat_stats[cat]['gemini_correct'] += 1
                cat_stats[cat]['final_correct'] += 1
                r['final_correct'] = True
                r['final_method'] = "gemini_override"
            else:
                rule_wrong_gemini_wrong += 1
                # A failed API call is scored as wrong, but it is counted
                # separately so an outage is visible in the report.
                if gemini_reason == "api_error":
                    gemini_errors += 1
                r['final_correct'] = False
                r['final_method'] = f"wrong:{rule_reason}"

            time.sleep(RATE_LIMIT_DELAY)

        # Progress
        final_correct_so_far = rule_correct + rule_wrong_gemini_correct
        if (i + 1) % 10 == 0 or (i + 1) == total:
            acc_so_far = final_correct_so_far / (i + 1)
            print(f" [{i+1}/{total}] acc={acc_so_far:.1%} "
                  f"(rule✓={rule_correct} gemini✓={rule_wrong_gemini_correct} ✗={rule_wrong_gemini_wrong})",
                  flush=True)

    # Save scored results
    with open(output_file, 'w', encoding='utf-8') as f:
        for r in results:
            f.write(json.dumps(r, ensure_ascii=False) + '\n')

    # Summary (both ratios are guarded so an empty result list does not crash)
    final_correct = rule_correct + rule_wrong_gemini_correct
    final_acc = final_correct / total if total > 0 else 0
    rule_pct = 100 * rule_correct / total if total > 0 else 0

    print(f"\n{'─'*60}")
    print(f" {model_name} — RESULTS")
    print(f"{'─'*60}")
    print(f" Rule-based correct : {rule_correct}/{total} ({rule_pct:.1f}%)")
    print(f" Gemini rescued : {rule_wrong_gemini_correct} (rule wrong → Gemini correct)")
    print(f" Final accuracy : {final_correct}/{total} ({100*final_acc:.1f}%)")
    print(f" Gemini calls made : {rule_wrong_gemini_correct + rule_wrong_gemini_wrong}")
    if gemini_errors:
        print(f" Gemini API errors : {gemini_errors} (scored as wrong)")
    print(f"\n Per-category:")
    for cat, s in sorted(cat_stats.items()):
        acc = s['final_correct'] / s['total'] if s['total'] > 0 else 0
        print(f" {cat:25s}: {s['final_correct']}/{s['total']} ({acc:.1%})"
              f" [rule={s['rule_correct']}, gemini+={s['gemini_correct']}]")

    return {
        'model': model_name,
        'total': total,
        'rule_correct': rule_correct,
        'gemini_rescued': rule_wrong_gemini_correct,
        'gemini_errors': gemini_errors,
        'final_correct': final_correct,
        'final_acc': round(100 * final_acc, 2),
        'category_stats': {cat: dict(s) for cat, s in cat_stats.items()},
    }
386
+
387
+
388
def _load_jsonl(path):
    """Read a UTF-8 JSONL file and return its non-blank lines as parsed objects."""
    with open(path, 'r', encoding='utf-8') as f:
        return [json.loads(line) for line in f if line.strip()]


def main():
    """Score the base and SFT inference results and write a comparison report.

    Reads ``inference_results_base.jsonl`` and ``inference_results_sft.jsonl``
    from the results directory, writes ``scored_results_*.jsonl`` and
    ``comparison_report.json`` next to them, and prints a summary. Exits with
    status 1 when no results directory can be found.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--results_dir', type=str, default=None)
    parser.add_argument('--api_key', type=str, default=None)
    args = parser.parse_args()

    api_key = args.api_key or GEMINI_API_KEY
    if not api_key:
        print("WARNING: no Gemini API key (set GEMINI_API_KEY or pass --api_key); "
              "judge calls will fail and be scored as wrong")

    # Find results directory
    results_dir = args.results_dir
    if results_dir is None:
        for d in [os.path.dirname(os.path.abspath(__file__)),
                  '/workspace/rl4phyx/RL4Phyx/SFT/sft_eval_footprint/']:
            if os.path.exists(os.path.join(d, 'inference_results_base.jsonl')):
                results_dir = d
                break
    if results_dir is None:
        print("ERROR: Cannot find inference results. Use --results_dir")
        sys.exit(1)

    print("=" * 60)
    print(" OPEN-ENDED EVAL: Rule-based + Gemini 2.5 Flash Judge")
    print(f" Results dir: {results_dir}")
    print("=" * 60)

    # Load test data for context (only its size is reported here).
    test_file = os.path.join(results_dir, 'test_1533_openended.jsonl')
    if os.path.exists(test_file):
        test_data = {row['index']: row for row in _load_jsonl(test_file)}
        print(f"Test data loaded: {len(test_data)} samples")

    # Load and score base model. Files are read as UTF-8 explicitly because
    # the scored output is written as UTF-8 with ensure_ascii=False.
    base_file = os.path.join(results_dir, 'inference_results_base.jsonl')
    base_results = _load_jsonl(base_file)
    base_scored_file = os.path.join(results_dir, 'scored_results_base.jsonl')
    base_stats = score_model(base_results, "Qwen2.5-VL-3B (Base)", api_key, base_scored_file)

    # Load and score SFT model
    sft_file = os.path.join(results_dir, 'inference_results_sft.jsonl')
    sft_results = _load_jsonl(sft_file)
    sft_scored_file = os.path.join(results_dir, 'scored_results_sft.jsonl')
    sft_stats = score_model(sft_results, "Qwen2.5-VL-3B (SFT)", api_key, sft_scored_file)

    # Comparison (delta is in percentage points)
    delta = sft_stats['final_acc'] - base_stats['final_acc']
    report = {
        'timestamp': time.strftime('%Y-%m-%d %H:%M:%S'),
        'scoring_method': 'rule-based + Gemini 2.5 Flash judge (fallback)',
        'base': base_stats,
        'sft': sft_stats,
        'improvement': f"{delta:+.2f}%",
    }

    report_file = os.path.join(results_dir, 'comparison_report.json')
    with open(report_file, 'w', encoding='utf-8') as f:
        json.dump(report, f, indent=2, ensure_ascii=False)

    print(f"\n{'='*60}")
    print(f" FINAL COMPARISON")
    print(f"{'='*60}")
    print(f" Base accuracy: {base_stats['final_acc']}% ({base_stats['final_correct']}/{base_stats['total']})")
    print(f" SFT accuracy: {sft_stats['final_acc']}% ({sft_stats['final_correct']}/{sft_stats['total']})")
    print(f" Improvement: {delta:+.2f}%")
    print(f"\n Per-category:")
    all_cats = sorted(set(list(base_stats['category_stats'].keys()) + list(sft_stats['category_stats'].keys())))
    for cat in all_cats:
        b = base_stats['category_stats'].get(cat, {'final_correct': 0, 'total': 0})
        s = sft_stats['category_stats'].get(cat, {'final_correct': 0, 'total': 0})
        b_acc = b['final_correct'] / b['total'] if b['total'] > 0 else 0
        s_acc = s['final_correct'] / s['total'] if s['total'] > 0 else 0
        print(f" {cat:25s} Base: {b_acc:.1%} SFT: {s_acc:.1%} Δ: {(s_acc-b_acc)*100:+.1f}%")
    print(f"\n Report: {report_file}")
    print(f"{'='*60}")
465
+
466
+ if __name__ == '__main__':
467
+ main()
eval_footprint/eval_sft_only.py ADDED
@@ -0,0 +1,114 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import json, os, re, time, sys, ast
3
+ from collections import defaultdict
4
+ import urllib.request
5
+
6
# SECURITY: never commit API keys. The key is read from the DEEPSEEK_API_KEY
# environment variable; there is deliberately no hard-coded fallback. Any key
# that was previously committed in this file must be treated as compromised
# and rotated.
DEEPSEEK_API_KEY = os.environ.get("DEEPSEEK_API_KEY", "")
DEEPSEEK_MODEL = "deepseek-chat"
DEEPSEEK_API_URL = "https://api.deepseek.com/v1/chat/completions"
RESULTS_DIR = "/workspace/rl4phyx/RL4Phyx/SFT/sft_eval_footprint"
SFT_RESULTS = os.path.join(RESULTS_DIR, "inference_results_sft.jsonl")
FAIL_MSG = "Failed to obtain answer via API."  # sentinel returned by call_deepseek
RETRY = 5  # judge attempts per sample
13
+
14
def get_ICE():
    """Return the five in-context examples shown to the judge model.

    Each example is a ground-truth / predicted-answer pair followed by the
    expected judgement (1 = matched, 0 = unmatched).
    """
    examples = (
        "\nGround truth answer: 502 \\n\nPredicted answer: The mass of block (B) is:\n[\n\\\\boxed{ 50 \\\\sqrt{101} }\n] \\n\nJudegement: 1\n",
        "\nGround truth answer: 46.3 kN \\n\nPredicted answer: The tension ( T_B ) in the cable is approximately:\n[\n\\\\boxed{46300 }\n] \\n\nJudegement: 1\n",
        "\nGround truth answer: 12 m/s \\n\nPredicted answer: The speed of the box after 2.00 seconds is:\n[\n\\\\boxed{11.3, \\\\text{m/s}}\n] \\n\nJudegement: 0\n",
        "\nGround truth answer: 36.00 kg \\n\nPredicted answer: The mass of the hanging block ( m_2 ) must be approximately:\n[\n\\\\boxed{36.1, \\\\text\\\\{kg\\\\}}\n] \\n\nJudegement: 1\n",
        "\nGround truth answer: 3.2 m \\n\nPredicted answer: The stuntman and villain slide approximately \\\\frac{10}{3.1415} meters**.\nJudegement: 1\n",
    )
    return list(examples)
21
+
22
def build_phyx_gpt4_prompt(gt_answer, pred):
    """Assemble the judge prompt.

    The prompt is the task description, then every in-context example, then
    the ground truth and predicted answer to be judged, ending with an open
    "Judegement:" line for the model to complete.
    """
    task_description = "\nPlease read the following example. Given predicted answer and ground truth answer,\ncompare the these two answers, then ONLY output judegement 1/0 for matched/unmatched at the end of the prompt.\nIf the meaning is expressed in the same way, it is also considered consistent, for example, 0.5m and 50cm.\nIf the given predicted mentions \"approximately\", then allow the Approximation Error, \\\nsuch as 0.49 and approximately 0.5, 0.81 and approximately 0.8. \\n\n\n"
    pieces = [task_description]
    pieces.extend(example + "\n" for example in get_ICE())
    pieces.append("Ground truth answer: {} \n".format(gt_answer))
    pieces.append("Predicted answer: {} \n".format(pred))
    pieces.append("Judegement:")
    return "".join(pieces)
31
+
32
def mapping_str(s):
    """Rewrite LaTeX in an extracted answer into a form that is easier to compare.

    ``\\dfrac`` becomes ``\\frac`` and ``\\pi`` becomes ``3.14``. When ``\\pi``
    directly follows a number or a closing brace/parenthesis, a ``*`` is
    inserted, so ``2\\pi`` becomes ``2*3.14`` instead of the different number
    ``23.14``.
    """
    s = s.replace("\\dfrac", "\\frac")
    # Coefficient case first: "2\pi", "\frac{3}{2}\pi", "(a+b)\pi".
    s = re.sub(r"([0-9})])\s*\\pi(?![A-Za-z])", r"\1*3.14", s)
    # Remaining stand-alone \pi (the lookahead skips longer command names).
    return re.sub(r"\\pi(?![A-Za-z])", "3.14", s)
34
+
35
def extract_boxed_content(s):
    """Return the content of the first ``\\boxed{...}`` in *s*.

    Nested braces are balanced. Returns ``None`` when there is no ``\\boxed{``
    or its closing brace is missing.
    """
    marker = r"\boxed{"
    begin = s.find(marker)
    if begin < 0:
        return None

    body = s[begin + len(marker):]
    nesting = 0  # braces opened inside the box that are still unclosed
    pos = 0
    while pos < len(body):
        ch = body[pos]
        if ch == "{":
            nesting += 1
        elif ch == "}":
            if nesting == 0:
                # This brace closes the \boxed{ itself.
                return body[:pos]
            nesting -= 1
        pos += 1
    return None
46
+
47
def call_deepseek(prompt, temperature=0.0):
    """Send *prompt* to the DeepSeek chat completions endpoint.

    Args:
        prompt: user message content.
        temperature: sampling temperature for this request.

    Returns:
        The content of the first choice, or ``FAIL_MSG`` if the request or the
        parsing of its response failed.
    """
    headers = {"Content-Type": "application/json", "Authorization": f"Bearer {DEEPSEEK_API_KEY}"}
    data = json.dumps({
        "model": DEEPSEEK_MODEL,
        "messages": [{"role": "user", "content": prompt}],
        "temperature": temperature,
        "max_tokens": 200,
    }).encode("utf-8")
    try:
        req = urllib.request.Request(DEEPSEEK_API_URL, data=data, headers=headers)
        with urllib.request.urlopen(req, timeout=30) as resp:
            return json.loads(resp.read().decode())["choices"][0]["message"]["content"]
    except Exception:
        # Best effort: any request/parse failure is reported through FAIL_MSG.
        # KeyboardInterrupt/SystemExit are intentionally NOT caught (a bare
        # `except:` made the scoring loop impossible to interrupt).
        return FAIL_MSG
56
+
57
# A stand-alone 0 or 1: not part of a longer number or word ("12", "11.3",
# "0.5" do not qualify; "1", "1." and "Judgement: 0" do).
_JUDGEMENT_RE = re.compile(r"(?<![\w.])([01])(?!\w|\.\d)")


def PhyX_auxeval(gt_answer, prediction):
    """Judge one prediction against the ground truth.

    The answer is taken from the first ``\\boxed{}``, else from a "final
    answer:" / "correct answer:" phrase, else the whole prediction is used.
    An exact (case-insensitive) string match counts as correct; otherwise the
    DeepSeek judge is asked up to ``RETRY`` times.

    Returns:
        dict with ``log`` (how the verdict was reached), ``res`` (1 or 0) and
        ``extracted`` (the answer text that was compared).
    """
    pred = prediction.strip()
    boxed = extract_boxed_content(pred)
    if boxed is not None:
        extracted = mapping_str(boxed)
    else:
        m = re.search(r"\b(?:final\s+answer|correct\s+answer)\b[^:]*[:]\s*(.*?)(?=\n\n\n|\Z)", pred, re.IGNORECASE | re.DOTALL)
        extracted = mapping_str(m.group(1)) if m else pred

    if str(gt_answer).strip().lower() == extracted.strip().lower():
        return dict(log="Matched at string level", res=1, extracted=extracted)

    prompt = build_phyx_gpt4_prompt(gt_answer, extracted)
    for i in range(RETRY):
        # Temperature rises with each retry to get a different completion.
        res = call_deepseek(prompt, temperature=i * 0.5)
        if FAIL_MSG in res:
            continue
        # Take the first stand-alone 0/1 as the verdict. A plain `"1" in res`
        # check counted replies such as "0 (12 vs 11.3)" as correct.
        verdict = _JUDGEMENT_RE.search(res)
        if verdict is None:
            continue  # unparseable reply: retry
        if verdict.group(1) == "1":
            return dict(log="Semantic equal via LLM", res=1, extracted=extracted)
        return dict(log=f"LLM judgement {res}", res=0, extracted=extracted)
    return dict(log="All retries failed", res=0, extracted=extracted)
77
+
78
# Main: score every SFT inference result, print per-category accuracy, and
# write the scored samples plus a JSON summary into RESULTS_DIR.
results = []
with open(SFT_RESULTS, "r", encoding="utf-8") as f:
    for line in f:
        if line.strip():
            results.append(json.loads(line))

print(f"Scoring SFT model ({len(results)} samples)")
hit = 0
cat_stats = defaultdict(lambda: {"total": 0, "correct": 0})
scored = []

for i, r in enumerate(results):
    gt = r["ground_truth_value"]
    ev = PhyX_auxeval(gt, r["model_output"])
    r["extracted_answer"] = ev["extracted"]
    r["eval_log"] = ev["log"]
    r["res"] = ev["res"]
    category = r.get("category", "unknown")
    if ev["res"] == 1:
        hit += 1
        cat_stats[category]["correct"] += 1
    cat_stats[category]["total"] += 1
    scored.append(r)
    if (i+1) % 50 == 0:
        print(f" [{i+1}/{len(results)}] acc={hit/(i+1)*100:.1f}%", flush=True)

# Guarded so an empty results file yields 0% instead of ZeroDivisionError.
acc = hit / len(results) * 100 if results else 0.0
print(f"\nSFT Final: {hit}/{len(results)} ({acc:.1f}%)")
for cat, s in sorted(cat_stats.items(), key=lambda x: -x[1]["total"]):
    print(f" {cat}: {s['correct']}/{s['total']} ({s['correct']/s['total']*100:.1f}%)")

# Output is written as UTF-8 because ensure_ascii=False emits raw non-ASCII.
with open(os.path.join(RESULTS_DIR, "scored_results_sft_phyx.jsonl"), "w", encoding="utf-8") as f:
    for r in scored:
        f.write(json.dumps(r, ensure_ascii=False) + "\n")

# The report file is opened in a `with` block so the handle is closed (and the
# data flushed) deterministically.
with open(os.path.join(RESULTS_DIR, "sft_report_phyx.json"), "w", encoding="utf-8") as f:
    json.dump({"model": "SFT", "total": len(results), "correct": hit, "acc": round(acc, 2),
               "categories": {k: dict(v) for k, v in cat_stats.items()}},
              f, indent=2, ensure_ascii=False)
114
+
eval_footprint/eval_sft_vs_base_multigpu.py ADDED
@@ -0,0 +1,359 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Multi-GPU parallel evaluation: Base vs SFT on 1533 physics test set.
4
+
5
+ Strategy:
6
+ - Base model: 4 workers on GPU 0,1,2,3 (each processes ~383 samples)
7
+ - SFT model: 4 workers on GPU 4,5,6,7 (each processes ~383 samples)
8
+ - All 8 workers run SIMULTANEOUSLY → ~8x speedup vs single GPU
9
+ """
10
+ import json
11
+ import os
12
+ import sys
13
+ import time
14
+
15
+ # CRITICAL: Set offline mode before importing transformers
16
+ os.environ['HF_HUB_OFFLINE'] = '1'
17
+ os.environ['TRANSFORMERS_OFFLINE'] = '1'
18
+
19
+ import pandas as pd
20
+ import numpy as np
21
+ import torch
22
+ import re
23
+ from PIL import Image
24
+ from collections import defaultdict
25
+ from multiprocessing import Process, Queue
26
+
27
+ # ============ CONFIG ============
28
+ TEST_PARQUET = "/workspace/rl4phyx/RL4Phyx/SFT/eval_data/test_1533.parquet"
29
+ IMAGE_DIR = "/workspace/rl4phyx/RL4Phyx/MetaPhyX/test_images"
30
+
31
+ MODELS = {
32
+ "base": "/workspace/rl4phyx/models/Qwen2.5-VL-3B-Instruct",
33
+ "sft": "/workspace/rl4phyx/RL4Phyx/SFT/checkpoints/sft_qwen25vl_3b/merged",
34
+ }
35
+
36
+ # GPU assignment: base on GPUs 0-3, sft on GPUs 4-7
37
+ MODEL_GPUS = {
38
+ "base": [0, 1, 2, 3],
39
+ "sft": [4, 5, 6, 7],
40
+ }
41
+
42
+ OUTPUT_DIR = "/workspace/rl4phyx/RL4Phyx/SFT/eval_results"
43
+ MAX_NEW_TOKENS = 1024
44
+ # ================================
45
+
46
+
47
def extract_choice(solution_str):
    """Extract the chosen option (A/B/C/D) from model output.

    Tried in order: a ``\\boxed{X}``, a few "answer ... X" / "X is correct"
    phrases, and finally the last stand-alone capital A-D in the text.

    Returns:
        The upper-case choice letter, or ``None`` if nothing was found or the
        input is empty.
    """
    if not solution_str:
        return None
    boxed = re.search(r'\\boxed\{([ABCD])\}', solution_str, re.IGNORECASE)
    if boxed:
        return boxed.group(1).upper()
    patterns = [
        r'(?:answer|choice)[是为:\s]*([ABCD])\b',
        r'\b([ABCD])\s*(?:is correct|is the correct)',
        r'(?:correct answer is)\s*([ABCD])\b',
    ]
    for p in patterns:
        match = re.search(p, solution_str, re.IGNORECASE)
        if match:
            return match.group(1).upper()
    # Last resort: the last stand-alone CAPITAL letter A-D. The text must not
    # be upper-cased first, or the English article "a" becomes choice "A".
    matches = re.findall(r'\b([ABCD])\b', solution_str)
    if matches:
        return matches[-1]
    return None
67
+
68
+
69
def compute_score(solution_str, ground_truth):
    """Score a multiple-choice answer: 1.0 if the extracted choice is correct, else 0.0.

    When the output contains a ``<think>...</think>`` block, only the text
    after the last ``</think>`` is considered.
    """
    has_think_block = '<think>' in solution_str and '</think>' in solution_str
    answer_text = solution_str.split('</think>')[-1] if has_think_block else solution_str

    predicted = extract_choice(answer_text)
    expected = ground_truth.strip().upper()
    return 1.0 if predicted is not None and predicted == expected else 0.0
78
+
79
+
80
def load_test_data(parquet_path):
    """Read the test parquet file and return one dict per row.

    Each dict has ``index``, ``category`` and ``image_path`` (from the row's
    ``extra_info``), ``ground_truth`` (from ``reward_model``) and
    ``prompt_text`` (the last non-empty text item of the first prompt message,
    or ``""`` if there is none).
    """
    frame = pd.read_parquet(parquet_path)
    samples = []
    for _, row in frame.iterrows():
        extra = row['extra_info']
        reward = row['reward_model']
        # Collect every non-empty text part; the last one wins.
        text_parts = [
            part['text']
            for part in row['prompt'][0]['content']
            if part.get('type') == 'text' and part.get('text')
        ]
        prompt_text = text_parts[-1] if text_parts else ""
        sample = {
            'index': extra['index'],
            'category': extra['category'],
            'image_path': extra['image_path'],
            'ground_truth': reward['ground_truth'],
            'prompt_text': prompt_text,
        }
        samples.append(sample)
    return samples
100
+
101
+
102
def worker_evaluate(model_name, model_path, gpu_id, chunk, image_dir, result_queue):
    """Single GPU worker: load model, run inference on chunk, put results in queue.

    Args:
        model_name: label used in log lines and in the result message.
        model_path: local directory the processor and model are loaded from.
        gpu_id: physical GPU index; exported as CUDA_VISIBLE_DEVICES.
        chunk: list of sample dicts (reads 'index', 'category', 'image_path',
            'ground_truth' and 'prompt_text').
        image_dir: directory that each sample's 'image_path' is joined to.
        result_queue: queue that receives exactly one dict from this worker,
            either the results or, on any exception, a dict with an 'error' key.
    """
    # Restrict this process to one GPU; inside the process it is then "cuda:0".
    os.environ['CUDA_VISIBLE_DEVICES'] = str(gpu_id)

    try:
        from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration

        print(f"[{model_name}@GPU{gpu_id}] Loading model... ({len(chunk)} samples)", flush=True)

        processor = AutoProcessor.from_pretrained(
            model_path,
            min_pixels=3136,
            max_pixels=200704,
            local_files_only=True,
            trust_remote_code=True,
        )
        model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
            model_path,
            torch_dtype=torch.bfloat16,
            attn_implementation="sdpa",
            device_map="cuda:0",  # maps to the CUDA_VISIBLE_DEVICES GPU
            local_files_only=True,
            trust_remote_code=True,
        )
        model.eval()
        print(f"[{model_name}@GPU{gpu_id}] Model loaded, starting inference", flush=True)

        results = []
        correct = 0
        t0 = time.time()

        for i, s in enumerate(chunk):
            img_path = os.path.join(image_dir, s['image_path'])
            try:
                image = Image.open(img_path).convert('RGB')
            except Exception as e:
                # An unreadable image is replaced by a blank white one so the
                # sample is still evaluated (and counted) instead of skipped.
                print(f"[{model_name}@GPU{gpu_id}] WARN: Cannot load {img_path}: {e}", flush=True)
                image = Image.new('RGB', (224, 224), 'white')

            messages = [{
                "role": "user",
                "content": [
                    {"type": "image", "image": image},
                    {"type": "text", "text": s['prompt_text']},
                ]
            }]

            text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
            inputs = processor(
                text=[text],
                images=[image],
                return_tensors="pt",
                padding=True,
            ).to("cuda:0")

            # Greedy decoding: sampling is off and the sampling parameters
            # are explicitly unset.
            with torch.no_grad():
                output_ids = model.generate(
                    **inputs,
                    max_new_tokens=MAX_NEW_TOKENS,
                    do_sample=False,
                    temperature=None,
                    top_p=None,
                )

            # Drop the prompt tokens so only the generated continuation is decoded.
            gen_ids = output_ids[:, inputs['input_ids'].shape[1]:]
            prediction = processor.batch_decode(gen_ids, skip_special_tokens=True)[0]

            score = compute_score(prediction, s['ground_truth'])
            is_correct = score == 1.0
            if is_correct:
                correct += 1

            # NOTE(review): extract_choice runs on the full prediction here,
            # while compute_score first drops any <think> block, so the stored
            # 'extracted_answer' can differ from the choice that was scored.
            extracted = extract_choice(prediction)
            results.append({
                'index': s['index'],
                'category': s['category'],
                'ground_truth': s['ground_truth'],
                'extracted_answer': extracted,
                'prediction': prediction[:300],  # only the first 300 characters are kept
                'correct': is_correct,
            })

            # Progress line on the first sample, every 50th, and the last one.
            if (i + 1) % 50 == 0 or i == 0 or (i + 1) == len(chunk):
                elapsed = time.time() - t0
                eta = elapsed / (i + 1) * (len(chunk) - i - 1)
                acc = correct / (i + 1) * 100
                print(f"[{model_name}@GPU{gpu_id}] {i+1}/{len(chunk)} "
                      f"acc={acc:.1f}% elapsed={elapsed:.0f}s eta={eta:.0f}s", flush=True)

        elapsed_total = time.time() - t0
        # NOTE(review): an empty chunk makes correct/len(chunk) raise
        # ZeroDivisionError, which the handler below reports as a worker error.
        print(f"[{model_name}@GPU{gpu_id}] DONE: {correct}/{len(chunk)} "
              f"({correct/len(chunk)*100:.1f}%) in {elapsed_total:.0f}s", flush=True)

        result_queue.put({
            'model': model_name,
            'gpu_id': gpu_id,
            'results': results,
            'correct': correct,
            'total': len(chunk),
            'time_seconds': elapsed_total,
        })
    except Exception as e:
        # Any failure still puts one message on the queue (with an 'error'
        # key), so a failed worker is reported rather than silent.
        import traceback
        print(f"[{model_name}@GPU{gpu_id}] ERROR: {e}", flush=True)
        traceback.print_exc()
        result_queue.put({
            'model': model_name,
            'gpu_id': gpu_id,
            'results': [],
            'correct': 0,
            'total': len(chunk),
            'time_seconds': 0,
            'error': str(e),
        })
216
+
217
+
218
def split_chunks(data, n):
    """Split *data* into *n* contiguous chunks whose sizes differ by at most one.

    The first ``len(data) % n`` chunks receive the extra element.
    """
    base_size, extra = divmod(len(data), n)
    chunks = []
    start = 0
    for i in range(n):
        size = base_size + (1 if i < extra else 0)
        chunks.append(data[start:start + size])
        start += size
    return chunks
222
+
223
+
224
def main():
    """Evaluate every model in MODELS on the test set and compare Base vs SFT.

    One worker process is started per (model, GPU) pair listed in MODEL_GPUS;
    each worker receives one slice of the samples and reports back through a
    shared queue. Per-model results are written to ``eval_<model>.jsonl`` and
    the summary to ``comparison.json`` inside OUTPUT_DIR.
    """
    import queue  # only needed for the queue.Empty raised by a timed-out get()

    print("=" * 60)
    print(" MULTI-GPU EVAL: Base vs SFT on Physics Test Set")
    print(" 8 GPUs: Base→[0,1,2,3] SFT→[4,5,6,7]")
    print("=" * 60, flush=True)

    # Load test data
    samples = load_test_data(TEST_PARQUET)
    print(f"\nLoaded {len(samples)} test samples")
    cats = defaultdict(int)
    for s in samples:
        cats[s['category']] += 1
    for cat, cnt in sorted(cats.items(), key=lambda x: -x[1]):
        print(f" {cat}: {cnt}")

    os.makedirs(OUTPUT_DIR, exist_ok=True)

    # Launch all workers simultaneously
    result_queue = Queue()
    processes = []

    for model_name, model_path in MODELS.items():
        gpus = MODEL_GPUS[model_name]
        chunks = split_chunks(samples, len(gpus))

        for gpu_id, chunk in zip(gpus, chunks):
            p = Process(
                target=worker_evaluate,
                args=(model_name, model_path, gpu_id, chunk, IMAGE_DIR, result_queue),
            )
            p.start()
            processes.append(p)
            print(f" Launched {model_name} worker on GPU {gpu_id} ({len(chunk)} samples)")

    print(f"\nAll {len(processes)} workers launched, waiting for completion...", flush=True)

    # Collect results. A plain blocking get() would hang forever if a worker
    # dies without reporting (e.g. it is killed), so poll with a timeout and
    # stop waiting once no worker is alive any more.
    worker_results = []
    while len(worker_results) < len(processes):
        try:
            worker_results.append(result_queue.get(timeout=10))
        except queue.Empty:
            if any(p.is_alive() for p in processes):
                continue
            # Every worker has exited: pick up anything still in flight.
            try:
                while len(worker_results) < len(processes):
                    worker_results.append(result_queue.get(timeout=1))
            except queue.Empty:
                pass
            break

    for p in processes:
        p.join()

    unreported = len(processes) - len(worker_results)
    if unreported:
        print(f" WARNING: {unreported} worker(s) exited without reporting results", flush=True)

    # Aggregate per model
    all_evals = {}
    for model_name in MODELS:
        model_results = [w for w in worker_results if w['model'] == model_name]

        all_results = []
        total_correct = 0
        total_count = 0
        total_time = 0
        cat_stats = defaultdict(lambda: {'correct': 0, 'total': 0})

        for w in model_results:
            if 'error' in w:
                print(f" WARNING: {model_name}@GPU{w['gpu_id']} had error: {w['error']}")
                continue
            all_results.extend(w['results'])
            total_correct += w['correct']
            total_count += w['total']
            # Workers run concurrently, so wall-clock time is the slowest one.
            total_time = max(total_time, w['time_seconds'])

        for r in all_results:
            cat = r['category']
            cat_stats[cat]['total'] += 1
            if r['correct']:
                cat_stats[cat]['correct'] += 1

        accuracy = total_correct / total_count * 100 if total_count > 0 else 0
        all_evals[model_name] = {
            'accuracy': accuracy,
            'correct': total_correct,
            'total': total_count,
            'category_stats': dict(cat_stats),
            'results': all_results,
            'time_seconds': total_time,
        }

        # Save per-model results
        out_path = os.path.join(OUTPUT_DIR, f"eval_{model_name}.jsonl")
        with open(out_path, 'w', encoding='utf-8') as f:
            for r in all_results:
                f.write(json.dumps(r, ensure_ascii=False) + '\n')

        print(f"\n{'─'*60}")
        print(f" {model_name.upper()} — RESULTS")
        print(f"{'─'*60}")
        print(f" Overall accuracy: {total_correct}/{total_count} ({accuracy:.1f}%)")
        print(f" Wall-clock time: {total_time:.0f}s")
        for cat in sorted(cat_stats.keys()):
            s = cat_stats[cat]
            pct = s['correct'] / s['total'] * 100 if s['total'] > 0 else 0
            print(f" {cat:25s}: {s['correct']:3d}/{s['total']:3d} ({pct:.1f}%)")

    # Final comparison
    base = all_evals['base']
    sft = all_evals['sft']

    print(f"\n{'='*60}")
    print(f" FINAL COMPARISON: Base vs SFT")
    print(f"{'='*60}")
    print(f"\n{'Metric':30s} {'Base':>10s} {'SFT':>10s} {'Delta':>10s}")
    print("─" * 60)
    print(f"{'Overall Accuracy':30s} {base['accuracy']:>9.1f}% {sft['accuracy']:>9.1f}% {sft['accuracy']-base['accuracy']:>+9.1f}%")

    all_cats = sorted(set(base['category_stats']) | set(sft['category_stats']))
    print(f"\n Per-category comparison:")
    for cat in all_cats:
        bs = base['category_stats'].get(cat, {'correct': 0, 'total': 0})
        ss = sft['category_stats'].get(cat, {'correct': 0, 'total': 0})
        b_pct = bs['correct'] / bs['total'] * 100 if bs['total'] > 0 else 0
        s_pct = ss['correct'] / ss['total'] * 100 if ss['total'] > 0 else 0
        delta = s_pct - b_pct
        print(f" {cat:25s}: Base={b_pct:5.1f}% SFT={s_pct:5.1f}% Δ={delta:+5.1f}%")

    # Save comparison
    comparison = {
        'base_accuracy': base['accuracy'],
        'sft_accuracy': sft['accuracy'],
        'delta': sft['accuracy'] - base['accuracy'],
        'base_time': base['time_seconds'],
        'sft_time': sft['time_seconds'],
        'base_categories': base['category_stats'],
        'sft_categories': sft['category_stats'],
    }
    comp_path = os.path.join(OUTPUT_DIR, "comparison.json")
    # ensure_ascii=False can emit non-ASCII text, so the encoding must be explicit.
    with open(comp_path, 'w', encoding='utf-8') as f:
        json.dump(comparison, f, indent=2, ensure_ascii=False)
    print(f"\n Comparison saved to: {comp_path}")
    print("\n=== EVALUATION COMPLETE ===", flush=True)
356
+
357
+
358
+ if __name__ == '__main__':
359
+ main()
eval_footprint/eval_single_model_template.py ADDED
@@ -0,0 +1,242 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Phase 1: Open-ended inference on cluster (multi-GPU, no internet needed).
4
+
5
+ Runs both Base and SFT models on the 1533 open-ended physics test set.
6
+ Saves raw model outputs for later judging.
7
+
8
+ Usage (inside Docker container):
9
+ cd /tmp && python3 /path/to/eval_openended_inference.py
10
+
11
+ Output:
12
+ sft_eval_footprint/inference_results_base.jsonl
13
+ sft_eval_footprint/inference_results_sft.jsonl
14
+ """
15
+ import os
16
+ import sys
17
+ import json
18
+ import re
19
+ import time
20
+ import torch
21
+ import multiprocessing as mp
22
+ from collections import Counter
23
+
24
+ # ============ CONFIG ============
25
+ os.environ["HF_HUB_OFFLINE"] = "1"
26
+ os.environ["TRANSFORMERS_OFFLINE"] = "1"
27
+
28
+ BASE_MODEL = "/workspace/rl4phyx/models/Qwen2.5-VL-3B-Instruct"
29
+ SFT_MODEL = "MODEL_PATH_PLACEHOLDER"
30
+ TEST_FILE = "/workspace/rl4phyx/RL4Phyx/SFT/sft_eval_footprint/test_1533_openended.jsonl"
31
+ OUTPUT_DIR = "/workspace/rl4phyx/RL4Phyx/SFT/sft_eval_footprint"
32
+ IMAGE_DIR = "/workspace/rl4phyx/RL4Phyx/MetaPhyX/test_images"
33
+
34
+ # Multi-GPU config: base on GPUs 0-3, SFT on GPUs 4-7
35
+ BASE_GPUS = [0, 1, 2, 3]
36
+ SFT_GPUS = [4, 5, 6, 7]
37
+ MAX_NEW_TOKENS = 2048
38
+ # ================================
39
+
40
+
41
def load_test_data():
    """Return the JSON objects stored one per line in TEST_FILE, skipping blank lines."""
    with open(TEST_FILE, 'r', encoding='utf-8') as handle:
        return [json.loads(raw) for raw in handle if raw.strip()]
49
+
50
+
51
+ def build_open_ended_prompt(sample):
52
+ """Build an open-ended prompt (no MCQ options)."""
53
+ desc = sample.get('description', '')
54
+ question = sample.get('question', '')
55
+
56
+ prompt = f"""Look at the image and answer the physics question.
57
+
58
+ {desc}
59
+
60
+ {question}
61
+
62
+ Please reason step by step, and put your final answer within \\boxed{{}}.
63
+ """
64
+ return prompt.strip()
65
+
66
+
67
def worker_inference(gpu_id, model_path, samples, output_file, model_name):
    """Run generation for ``samples`` on one GPU and dump the outputs as JSONL.

    The processor and model are loaded from ``model_path`` onto
    ``cuda:<gpu_id>``. A failure while preparing inputs or generating for a
    sample is recorded as an ``"ERROR: ..."`` string in that sample's
    ``model_output`` instead of aborting the worker. All results are written to
    ``output_file`` once every sample has been processed.

    Returns the number of results written.
    """
    import torch
    from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
    from qwen_vl_utils import process_vision_info
    from PIL import Image

    tag = f"[{model_name}][GPU {gpu_id}]"
    device = f"cuda:{gpu_id}"
    print(f"{tag} Loading model...", flush=True)

    processor = AutoProcessor.from_pretrained(
        model_path,
        min_pixels=3136,
        max_pixels=200704,
        local_files_only=True,
        trust_remote_code=True,
    )
    model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
        model_path,
        torch_dtype=torch.bfloat16,
        device_map=device,
        local_files_only=True,
        trust_remote_code=True,
    )
    model.eval()
    total = len(samples)
    print(f"{tag} Model loaded. Processing {total} samples.", flush=True)

    results = []
    for done, sample in enumerate(samples, start=1):
        idx = sample['index']
        prompt_text = build_open_ended_prompt(sample)
        image_path = os.path.join(IMAGE_DIR, sample['image'])

        # Single-turn chat message: the image followed by the text prompt.
        user_content = [
            {"type": "image", "image": f"file://{image_path}"},
            {"type": "text", "text": prompt_text},
        ]
        messages = [{"role": "user", "content": user_content}]

        try:
            text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
            image_inputs, video_inputs = process_vision_info(messages)
            inputs = processor(
                text=[text],
                images=image_inputs,
                videos=video_inputs,
                padding=True,
                return_tensors="pt",
            ).to(device)

            with torch.no_grad():
                output_ids = model.generate(**inputs, max_new_tokens=MAX_NEW_TOKENS)

            # Keep only the tokens generated after the prompt.
            prompt_len = inputs.input_ids.shape[1]
            response = processor.decode(output_ids[0][prompt_len:], skip_special_tokens=True)
        except Exception as e:
            response = "ERROR: " + str(e)

        results.append({
            "index": idx,
            "category": sample['category'],
            "subfield": sample.get('subfield', ''),
            "question": sample['question'],
            "ground_truth_value": sample['ground_truth_value'],
            "ground_truth_letter": sample.get('ground_truth_letter', ''),
            "model_output": response,
            "model_name": model_name,
            "gpu_id": gpu_id,
        })

        if done % 20 == 0 or done == total:
            print(f"{tag} {done}/{total} done", flush=True)

    # Write results
    with open(output_file, 'w', encoding='utf-8') as sink:
        for record in results:
            sink.write(json.dumps(record, ensure_ascii=False) + '\n')

    print(f"{tag} Saved {len(results)} results to {output_file}", flush=True)
    return len(results)
151
+
152
+
153
def run_model_parallel(model_path, model_name, gpu_ids, samples, output_base):
    """Fan ``samples`` out over ``gpu_ids``, one worker process per non-empty shard.

    Shards are contiguous and sized by ceiling division, so trailing GPUs may
    receive fewer samples or none at all (no process is created for an empty
    shard). All workers are started before any is joined.

    Returns the list of per-GPU output paths, ``<output_base>_gpu<id>.jsonl``.
    """
    per_gpu = (len(samples) + len(gpu_ids) - 1) // len(gpu_ids)

    workers = []
    shard_paths = []
    for slot, gpu_id in enumerate(gpu_ids):
        shard = samples[slot * per_gpu: (slot + 1) * per_gpu]
        if shard:
            shard_path = f"{output_base}_gpu{gpu_id}.jsonl"
            shard_paths.append(shard_path)
            workers.append(mp.Process(
                target=worker_inference,
                args=(gpu_id, model_path, shard, shard_path, model_name)
            ))

    for worker in workers:
        worker.start()
    for worker in workers:
        worker.join()

    return shard_paths
179
+
180
+
181
def merge_results(output_files, final_output):
    """Combine per-GPU JSONL shards into ``final_output`` and delete the shards.

    Shards that do not exist are skipped. Records are sorted by their
    ``'index'`` field before being written. Returns the sorted record list.
    """
    records = []
    for shard_path in output_files:
        if not os.path.exists(shard_path):
            continue
        with open(shard_path, 'r', encoding='utf-8') as shard:
            records.extend(json.loads(row) for row in shard if row.strip())

    # Sort by index for consistency
    records.sort(key=lambda rec: rec['index'])

    with open(final_output, 'w', encoding='utf-8') as sink:
        sink.writelines(json.dumps(rec, ensure_ascii=False) + '\n' for rec in records)

    # Cleanup per-GPU files
    for shard_path in output_files:
        if os.path.exists(shard_path):
            os.remove(shard_path)

    return records
204
+
205
+
206
def main():
    """Run the SFT model on the open-ended test set across 8 GPUs and merge the shards.

    The final file is ``<OUTPUT_DIR>/<output name>.jsonl``. Shards written by
    this run are always merged into it (overwriting any earlier file of the
    same name) and then deleted.

    Raises:
        RuntimeError: if no worker produced a shard file.
    """
    test_file = os.path.join(OUTPUT_DIR, "test_1533_openended.jsonl")
    samples = []
    with open(test_file, encoding="utf-8") as f:
        for line in f:
            if line.strip():
                samples.append(json.loads(line))
    print(f"Loaded {len(samples)} test samples")
    print(f"Model: {SFT_MODEL}")

    sft_output = os.path.join(OUTPUT_DIR, "OUTPUT_NAME_PLACEHOLDER")
    gpu_ids = list(range(8))

    # Derive the shard prefix without a ".jsonl" suffix so shards are not
    # named "<name>.jsonl_gpuN.jsonl" when the output name already has one.
    suffix = ".jsonl"
    if sft_output.endswith(suffix):
        sft_final = sft_output
        shard_base = sft_output[:-len(suffix)]
    else:
        sft_final = sft_output + suffix
        shard_base = sft_output

    # Drop shards left behind by an earlier run so that a worker crashing now
    # cannot contribute stale results to the merge.
    for gpu in gpu_ids:
        stale = f"{shard_base}_gpu{gpu}.jsonl"
        if os.path.exists(stale):
            os.remove(stale)

    shard_files = run_model_parallel(SFT_MODEL, "sft", gpu_ids, samples, shard_base)

    produced = [path for path in shard_files if os.path.exists(path)]
    if not produced:
        raise RuntimeError(f"No per-GPU result files were produced for {sft_final}")
    if len(produced) < len(shard_files):
        print(f"WARNING: only {len(produced)}/{len(shard_files)} workers wrote results", flush=True)

    # Always merge: skipping the merge when the final file already exists
    # would report a stale file as this run's output.
    merged_lines = []
    for path in produced:
        with open(path, encoding="utf-8") as f:
            merged_lines.extend(line for line in f if line.strip())
        os.remove(path)
    with open(sft_final, 'w', encoding="utf-8") as f:
        f.writelines(merged_lines)

    print(f"Total: {len(merged_lines)} results -> {sft_final}")
240
+
241
+ if __name__ == "__main__":
242
+ main()
eval_footprint/inference_fullft_math_nf.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_footprint/inference_fullft_math_nf_old.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_footprint/inference_fullft_phyx_math_nf.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_footprint/inference_fullft_phyx_nf.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_footprint/inference_lora_phyx_f.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_footprint/inference_results_base.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_footprint/inference_results_lora_math_f.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_footprint/inference_results_sft.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_footprint/inference_results_sft.jsonl_gpu4.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_footprint/inference_results_sft.jsonl_gpu5.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_footprint/inference_results_sft.jsonl_gpu6.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_footprint/inference_results_sft.jsonl_gpu7.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_footprint/report_lora_math_f.json ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "timestamp": "2026-03-16 17:48:12",
3
+ "scoring_method": "PhyX-aligned (DeepSeek-V3 judge, 5-shot ICE, 5 retries)",
4
+ "model": {
5
+ "model": "lora_math_f (LoRA+freeze+math)",
6
+ "total": 1533,
7
+ "string_matches": 13,
8
+ "llm_calls": 1520,
9
+ "llm_matches": 297,
10
+ "final_correct": 310,
11
+ "final_acc": 20.22,
12
+ "category_stats": {
13
+ "Mechanics": {
14
+ "total": 276,
15
+ "correct": 63
16
+ },
17
+ "Waves/Acoustics": {
18
+ "total": 253,
19
+ "correct": 47
20
+ },
21
+ "Electromagnetism": {
22
+ "total": 275,
23
+ "correct": 61
24
+ },
25
+ "Modern Physics": {
26
+ "total": 222,
27
+ "correct": 55
28
+ },
29
+ "Optics": {
30
+ "total": 252,
31
+ "correct": 46
32
+ },
33
+ "Thermodynamics": {
34
+ "total": 255,
35
+ "correct": 38
36
+ }
37
+ }
38
+ }
39
+ }
eval_footprint/run_inference_single.py ADDED
@@ -0,0 +1,120 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """Single-model inference on 8 GPUs. Usage: python3 run_inference_single.py <MODEL_PATH> <MODEL_NAME>"""
3
+ import os, sys, json, torch, multiprocessing as mp
4
+
5
+ os.environ["HF_HUB_OFFLINE"] = "1"
6
+ os.environ["TRANSFORMERS_OFFLINE"] = "1"
7
+
8
+ MODEL_PATH = sys.argv[1] if len(sys.argv) > 1 else "/workspace/rl4phyx/RL4Phyx/SFT/checkpoints/lora_math_f/merged"
9
+ MODEL_NAME = sys.argv[2] if len(sys.argv) > 2 else "lora_math_f"
10
+ TEST_FILE = "/workspace/rl4phyx/RL4Phyx/SFT/sft_eval_footprint/test_1533_openended.jsonl"
11
+ OUTPUT_DIR = "/workspace/rl4phyx/RL4Phyx/SFT/sft_eval_footprint"
12
+ IMAGE_DIR = "/workspace/rl4phyx/RL4Phyx/MetaPhyX/test_images"
13
+ GPUS = [0, 1, 2, 3, 4, 5, 6, 7]
14
+ MAX_NEW_TOKENS = 2048
15
+
16
def load_test_data():
    """Parse TEST_FILE as JSON Lines and return the records, ignoring blank lines."""
    with open(TEST_FILE, "r", encoding="utf-8") as handle:
        return [json.loads(raw) for raw in handle if raw.strip()]
23
+
24
def build_prompt(sample):
    """Build the open-ended physics prompt from a sample's description and question.

    Missing ``description`` / ``question`` keys are rendered as empty strings.
    """
    template = (
        "Look at the image and answer the physics question.\n\n"
        "{desc}\n\n"
        "{question}\n\n"
        "Please reason step by step, and put your final answer within \\boxed{{}}."
    )
    return template.format(
        desc=sample.get("description", ""),
        question=sample.get("question", ""),
    )
28
+
29
def worker_inference(gpu_id, model_path, samples, output_file, model_name):
    """Run greedy generation for ``samples`` on one GPU and write the outputs as JSONL.

    Each output record is the input sample extended with ``model_output`` and
    ``model_name``. A failure on a sample is logged and recorded as an
    ``"ERROR: ..."`` ``model_output`` so the remaining samples still run.
    Results are written to ``output_file`` after the last sample.
    """
    from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
    from qwen_vl_utils import process_vision_info

    device = f"cuda:{gpu_id}"
    print(f"[{model_name}][GPU {gpu_id}] Loading model from {model_path}...", flush=True)

    processor = AutoProcessor.from_pretrained(model_path, min_pixels=3136, max_pixels=200704)
    model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
        model_path, torch_dtype=torch.bfloat16, device_map=device, attn_implementation="sdpa"
    )
    model.eval()
    print(f"[{model_name}][GPU {gpu_id}] Model loaded. Processing {len(samples)} samples...", flush=True)

    results = []
    for i, sample in enumerate(samples):
        try:
            # The key must be the string "image"; the bare name `image` is
            # undefined and made every sample fail with a NameError.
            image_path = os.path.join(IMAGE_DIR, sample["image"])

            prompt = build_prompt(sample)
            messages = [{"role": "user", "content": [
                {"type": "image", "image": f"file://{image_path}"},
                {"type": "text", "text": prompt}
            ]}]

            text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
            image_inputs, video_inputs = process_vision_info(messages)
            inputs = processor(text=[text], images=image_inputs, videos=video_inputs, padding=True, return_tensors="pt").to(device)

            with torch.no_grad():
                output_ids = model.generate(**inputs, max_new_tokens=MAX_NEW_TOKENS, do_sample=False)

            # Decode only the tokens generated after the prompt.
            input_len = inputs["input_ids"].shape[1]
            response = processor.decode(output_ids[0][input_len:], skip_special_tokens=True)
            result = {**sample, "model_output": response, "model_name": model_name}
            results.append(result)

            if (i + 1) % 10 == 0:
                print(f"[{model_name}][GPU {gpu_id}] {i+1}/{len(samples)} done", flush=True)
        except Exception as e:
            print(f"[{model_name}][GPU {gpu_id}] Error on sample {i}: {e}", flush=True)
            results.append({**sample, "model_output": f"ERROR: {e}", "model_name": model_name})

    with open(output_file, "w", encoding="utf-8") as f:
        for r in results:
            f.write(json.dumps(r, ensure_ascii=False) + "\n")
    print(f"[{model_name}][GPU {gpu_id}] Done. Saved {len(results)} results to {output_file}", flush=True)
76
+
77
if __name__ == "__main__":
    # Worker processes are created with the "spawn" start method.
    mp.set_start_method("spawn", force=True)
    samples = load_test_data()
    print(f"Model: {MODEL_PATH}")
    print(f"Name: {MODEL_NAME}")
    print(f"Loaded {len(samples)} test samples")

    n = len(samples)
    n_gpus = len(GPUS)
    chunk_size = (n + n_gpus - 1) // n_gpus  # ceiling division

    part_files = {
        gpu_id: os.path.join(OUTPUT_DIR, f"inference_results_{MODEL_NAME}_gpu{gpu_id}.jsonl")
        for gpu_id in GPUS
    }
    # Remove shards from an earlier run: if a worker crashes (or gets no
    # samples) this time, its old shard must not end up in the merged file.
    for part in part_files.values():
        if os.path.exists(part):
            os.remove(part)

    processes = []
    for idx, gpu_id in enumerate(GPUS):
        start = idx * chunk_size
        end = min(start + chunk_size, n)
        chunk = samples[start:end]
        if not chunk:
            continue
        p = mp.Process(target=worker_inference, args=(gpu_id, MODEL_PATH, chunk, part_files[gpu_id], MODEL_NAME))
        p.start()
        processes.append((gpu_id, p))

    for gpu_id, p in processes:
        p.join()
        if p.exitcode != 0:
            print(f"WARNING: worker on GPU {gpu_id} exited with code {p.exitcode}", flush=True)

    # Merge (shards are written as UTF-8, so read them back as UTF-8)
    merged = os.path.join(OUTPUT_DIR, f"inference_results_{MODEL_NAME}.jsonl")
    with open(merged, "w", encoding="utf-8") as out:
        for part in part_files.values():
            if os.path.exists(part):
                with open(part, encoding="utf-8") as inp:
                    for line in inp:
                        out.write(line)

    # Count and check errors
    with open(merged, encoding="utf-8") as f:
        results = [json.loads(l) for l in f if l.strip()]
    errors = sum(1 for r in results if r.get("model_output", "").startswith("ERROR"))
    if len(results) != n:
        print(f"WARNING: expected {n} results but merged {len(results)}", flush=True)
    print(f"\n===== Inference Complete =====")
    print(f"Total: {len(results)}, Valid: {len(results)-errors}, Errors: {errors}")
    print(f"Output: {merged}")
120
+
eval_footprint/scored_results_base.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_footprint/scored_results_lora_math_f.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_footprint/scored_results_sft.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_footprint/scored_results_sft_phyx.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_footprint/sft_report_phyx.json ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model": "SFT",
3
+ "total": 1533,
4
+ "correct": 351,
5
+ "acc": 22.9,
6
+ "categories": {
7
+ "Mechanics": {
8
+ "total": 276,
9
+ "correct": 76
10
+ },
11
+ "Waves/Acoustics": {
12
+ "total": 253,
13
+ "correct": 45
14
+ },
15
+ "Electromagnetism": {
16
+ "total": 275,
17
+ "correct": 67
18
+ },
19
+ "Modern Physics": {
20
+ "total": 222,
21
+ "correct": 66
22
+ },
23
+ "Optics": {
24
+ "total": 252,
25
+ "correct": 53
26
+ },
27
+ "Thermodynamics": {
28
+ "total": 255,
29
+ "correct": 44
30
+ }
31
+ }
32
+ }
eval_footprint/simple_eval.py ADDED
@@ -0,0 +1,68 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import json, os, sys, time, torch
3
+
4
# Single-GPU inference over the open-ended test set.
# Usage: python3 simple_eval.py <MODEL_PATH> <OUTPUT_NAME>
if len(sys.argv) < 3:
    # Fail with a usage message instead of a bare IndexError.
    sys.exit(f"Usage: python3 {sys.argv[0]} <MODEL_PATH> <OUTPUT_NAME>")

MODEL_PATH = sys.argv[1]
OUTPUT_NAME = sys.argv[2]
EVAL_DIR = "/workspace/rl4phyx/RL4Phyx/SFT/sft_eval_footprint"
TEST_FILE = os.path.join(EVAL_DIR, "test_1533_openended.jsonl")

print(f"Model: {MODEL_PATH}")
print(f"Output: {OUTPUT_NAME}")

from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
from qwen_vl_utils import process_vision_info

model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    MODEL_PATH, torch_dtype=torch.bfloat16, device_map="cuda",
    attn_implementation="sdpa"
)
processor = AutoProcessor.from_pretrained(MODEL_PATH)
model.eval()

samples = []
with open(TEST_FILE, encoding="utf-8") as f:
    for line in f:
        if line.strip():
            samples.append(json.loads(line))
print(f"Loaded {len(samples)} samples")

results = []
missing_images = 0  # samples that were run text-only because no image file was found
t0 = time.time()
for idx, sample in enumerate(samples):
    desc = sample.get("description", "")
    q = sample.get("question", "")
    parts = [p for p in [desc, q] if p]
    parts.append("Please reason step by step, and put your final answer within \\boxed{}.")
    prompt_text = "\n\n".join(parts)

    img = sample.get("image_path", "")
    content = []
    if img and os.path.exists(img):
        content.append({"type": "image", "image": f"file://{img}"})
    else:
        missing_images += 1
    content.append({"type": "text", "text": prompt_text})

    messages = [{"role": "user", "content": content}]
    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(text=[text], images=image_inputs, videos=video_inputs,
                       padding=True, return_tensors="pt").to("cuda")

    with torch.no_grad():
        ids = model.generate(**inputs, max_new_tokens=2048, do_sample=False)

    # Keep only the tokens generated after the prompt.
    out_ids = ids[0][len(inputs.input_ids[0]):]
    response = processor.decode(out_ids, skip_special_tokens=True)
    sample["model_output"] = response
    results.append(sample)

    if (idx + 1) % 50 == 0:
        elapsed = time.time() - t0
        rate = (idx + 1) / elapsed
        eta = (len(samples) - idx - 1) / rate / 60
        print(f" {idx+1}/{len(samples)} ({rate:.1f}/s, ETA {eta:.0f}min)", flush=True)

if missing_images:
    # Make text-only fallbacks visible; they would otherwise go unnoticed.
    print(f"WARNING: {missing_images}/{len(samples)} samples had no readable image "
          f"('image_path' missing or file not found) and were run text-only", flush=True)

output_file = os.path.join(EVAL_DIR, f"inference_results_{OUTPUT_NAME}.jsonl")
with open(output_file, "w", encoding="utf-8") as f:
    for r in results:
        f.write(json.dumps(r, ensure_ascii=False) + "\n")
print(f"\nDone: {len(results)} -> {output_file}")
eval_footprint/single_model_eval.py ADDED
@@ -0,0 +1,103 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import sys, os, json, torch, time
3
+ from multiprocessing import Process
4
+ from PIL import Image
5
+
6
+ EVAL_DIR = "/workspace/rl4phyx/RL4Phyx/SFT/sft_eval_footprint"
7
+ TEST_FILE = os.path.join(EVAL_DIR, "test_1533_openended.jsonl")
8
+
9
+ MODEL_PATH = sys.argv[1]
10
+ OUTPUT_NAME = sys.argv[2]
11
+ NUM_GPUS = 8
12
+
13
def build_open_ended_prompt(sample):
    """Join the sample's non-empty description and question with the answer-format instruction.

    Sections are separated by a blank line; an empty or missing
    ``description`` / ``question`` is left out entirely.
    """
    candidates = (sample.get("description", ""), sample.get("question", ""))
    sections = [text for text in candidates if text]
    sections.append("Please reason step by step, and put your final answer within \\boxed{}.")
    return "\n\n".join(sections)
21
+
22
def worker_inference(gpu_id, model_path, samples, out_file, model_name):
    """Generate answers for ``samples`` on a single GPU and save them as JSONL.

    ``CUDA_VISIBLE_DEVICES`` is set to ``gpu_id`` before the model is loaded.
    Each sample dict is updated in place with a ``model_output`` entry; when a
    sample fails, that entry holds ``"Error: <message>"`` instead. The image is
    attached only when ``image_path`` is set and the file exists.
    """
    os.environ["CUDA_VISIBLE_DEVICES"] = str(gpu_id)
    from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
    from qwen_vl_utils import process_vision_info

    model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
        model_path, torch_dtype=torch.bfloat16, device_map="cuda",
        attn_implementation="sdpa"
    )
    processor = AutoProcessor.from_pretrained(model_path)
    model.eval()

    total = len(samples)
    results = []
    for done, sample in enumerate(samples, start=1):
        try:
            prompt_text = build_open_ended_prompt(sample)
            img = sample.get("image_path", "")
            text_item = {"type": "text", "text": prompt_text}
            if img and os.path.exists(img):
                content = [{"type": "image", "image": f"file://{img}"}, text_item]
            else:
                content = [text_item]
            messages = [{"role": "user", "content": content}]
            text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
            image_inputs, video_inputs = process_vision_info(messages)
            inputs = processor(text=[text], images=image_inputs, videos=video_inputs,
                               padding=True, return_tensors="pt").to("cuda")
            with torch.no_grad():
                ids = model.generate(**inputs, max_new_tokens=2048, do_sample=False)
            # Strip the prompt tokens so only the generated continuation is decoded.
            prompt_len = len(inputs.input_ids[0])
            sample["model_output"] = processor.decode(ids[0][prompt_len:], skip_special_tokens=True)
        except Exception as e:
            sample["model_output"] = "Error: " + str(e)
        results.append(sample)
        if done % 50 == 0:
            print(f" [GPU {gpu_id}] {done}/{total}", flush=True)

    with open(out_file, "w", encoding="utf-8") as sink:
        sink.writelines(json.dumps(rec, ensure_ascii=False) + "\n" for rec in results)
    print(f" [GPU {gpu_id}] Done: {len(results)} -> {out_file}", flush=True)
63
+
64
def main():
    """Shard the test set over NUM_GPUS worker processes and merge their outputs.

    Each worker writes a temporary ``_temp_<OUTPUT_NAME>_gpu<N>.jsonl`` file in
    EVAL_DIR; after all workers finish these are concatenated in GPU order into
    ``inference_results_<OUTPUT_NAME>.jsonl`` and removed.
    """
    print(f"\nModel: {MODEL_PATH}")
    print(f"Output: {OUTPUT_NAME}")

    samples = []
    with open(TEST_FILE, encoding="utf-8") as f:
        for line in f:
            if line.strip():
                samples.append(json.loads(line))
    print(f"Test samples: {len(samples)}")

    output_file = os.path.join(EVAL_DIR, f"inference_results_{OUTPUT_NAME}.jsonl")

    # Ceiling division keeps shards balanced. Floor division would give a
    # zero-length shard to every GPU but the last when there are fewer samples
    # than GPUs, and each of those workers would still load the model.
    chunk_size = (len(samples) + NUM_GPUS - 1) // NUM_GPUS
    procs = []
    for gpu in range(NUM_GPUS):
        chunk = samples[gpu * chunk_size:(gpu + 1) * chunk_size]
        if not chunk:
            continue
        gpu_file = os.path.join(EVAL_DIR, f"_temp_{OUTPUT_NAME}_gpu{gpu}.jsonl")
        # A temp file left by an earlier run must not be merged if this worker fails.
        if os.path.exists(gpu_file):
            os.remove(gpu_file)
        p = Process(target=worker_inference, args=(gpu, MODEL_PATH, chunk, gpu_file, OUTPUT_NAME))
        p.start()
        procs.append((p, gpu, gpu_file))

    for p, gpu, _ in procs:
        p.join()
        if p.exitcode != 0:
            print(f"WARNING: worker on GPU {gpu} exited with code {p.exitcode}", flush=True)

    all_results = []
    for _, _, gf in procs:
        if os.path.exists(gf):
            # Workers write UTF-8 with ensure_ascii=False; read it back the same way.
            with open(gf, encoding="utf-8") as f:
                all_results.extend(line for line in f if line.strip())
            os.remove(gf)

    with open(output_file, "w", encoding="utf-8") as f:
        f.writelines(all_results)
    if len(all_results) != len(samples):
        print(f"WARNING: expected {len(samples)} results but merged {len(all_results)}", flush=True)
    print(f"Total: {len(all_results)} results -> {output_file}")
101
+
102
+ if __name__ == "__main__":
103
+ main()
eval_footprint/test_1533_openended.jsonl ADDED
The diff for this file is too large to render. See raw diff