YUNTA88 committed on
Commit
0c7080e
·
verified ·
1 Parent(s): 7b4cf9d

Upload root_scripts/analyze_gt.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. root_scripts/analyze_gt.py +71 -0
root_scripts/analyze_gt.py ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import json, re, sys
3
+
4
# Reuse parse logic from physics_reward
# NOTE(review): if these tables are meant to mirror physics_reward exactly
# (TODO confirm), apply the micro-sign additions below there as well.

# Prefix symbol -> multiplier. Every key is a single character because
# parse_gt splits off exactly one leading character when it looks for a prefix.
UNIT_PREFIXES = {
    'T': 1e12, 'G': 1e9, 'M': 1e6, 'k': 1e3, 'h': 1e2,
    'c': 1e-2, 'm': 1e-3, 'u': 1e-6, 'n': 1e-9, 'p': 1e-12,
    # Non-ASCII spellings of "micro". They are two distinct code points that
    # render almost identically, so both are accepted alongside ASCII 'u'.
    '\u00b5': 1e-6,  # MICRO SIGN
    '\u03bc': 1e-6,  # GREEK SMALL LETTER MU
}

# Unit strings accepted as-is and also allowed after a prefix.
BASE_UNITS = {
    'N', 'Pa', 'J', 'W', 'Hz', 'm', 's', 'g', 'kg', 'm/s', 'm/s^2',
    'V', 'A', 'C', 'F', 'H', 'T', 'Wb', 'K', 'mol', 'cd', 'rad', 'sr',
    'dB', 'eV',
}

# Multi-part unit strings accepted as-is. parse_gt consults this set together
# with BASE_UNITS, so the overlap on 'm/s^2' is harmless.
COMPOUND_UNITS = {
    'km/h', 'km/s', 'cm/s', 'mm/s', 'kg/m^3', 'g/cm^3', 'N/m', 'N/m^2',
    'J/s', 'W/m^2', 'rad/s', 'rpm', 'm/s^2',
}
11
+
12
# Leading number followed by arbitrary trailing text (the candidate unit).
# The mantissa accepts "12", "12.", "12.5" and also ".5"; an optional
# exponent ("e-3", "E+6") may follow.
_GT_NUMBER_RE = re.compile(
    r'^([+-]?(?:\d+\.?\d*|\.\d+)(?:[eE][+-]?\d+)?)\s*(.*)$'
)


def parse_gt(text):
    """Split a ground-truth string of the form ``<number> [unit]``.

    Args:
        text: The ground-truth answer as a string; surrounding whitespace
            is ignored.

    Returns:
        A ``(value, unit, status)`` tuple. ``status`` is one of:

        * ``"no_number"``: ``text`` does not start with a number; ``value``
          and ``unit`` are both ``None``.
        * ``"number_only"``: a number with nothing after it; ``unit`` is
          ``None``.
        * ``"ok"``: the trailing text is listed in ``BASE_UNITS`` or
          ``COMPOUND_UNITS``.
        * ``"ok_prefixed"``: the first character of the trailing text is a
          key of ``UNIT_PREFIXES`` and the remainder is a known unit;
          ``value`` is multiplied by the prefix factor and ``unit`` is
          returned without the prefix.
        * ``"unknown_unit"``: anything else; the trailing text is returned
          unchanged as ``unit``.

    NOTE(review): leading-dot decimals such as ".5" are now accepted. If this
    function mirrors a parser in physics_reward (TODO confirm), apply the same
    change there so both classify inputs identically.
    """
    match = _GT_NUMBER_RE.match(text.strip())
    if match is None:
        return None, None, "no_number"

    value = float(match.group(1))
    unit_str = match.group(2).strip()
    if not unit_str:
        return value, None, "number_only"

    # Check the whole string first so that units whose first letter is also a
    # prefix symbol ("m", "T", "kg", "cd", "mol", ...) are not split apart.
    if unit_str in COMPOUND_UNITS or unit_str in BASE_UNITS:
        return value, unit_str, "ok"

    # Otherwise try "<one-character prefix><known unit>".
    if len(unit_str) >= 2 and unit_str[0] in UNIT_PREFIXES:
        rest = unit_str[1:]
        if rest in BASE_UNITS or rest in COMPOUND_UNITS:
            return value * UNIT_PREFIXES[unit_str[0]], rest, "ok_prefixed"

    return value, unit_str, "unknown_unit"
29
+
30
# Load the test questions and report how their ground-truth strings parse.

# JSONL file analysed when no path is given on the command line.
DEFAULT_RESULTS_PATH = "/workspace/rl4phyx/RL4Phyx/SFT/sft_eval_footprint/inference_results_base.jsonl"
# parse_gt statuses whose ground-truth strings are listed individually.
HARD_STATUSES = ("no_number", "unknown_unit")
# Maximum number of distinct ground-truth strings printed per status.
MAX_EXAMPLES = 30


def percentage(count, total):
    """Return ``count`` as a percentage of ``total`` (0.0 when ``total`` is 0)."""
    return count / total * 100 if total else 0.0


def unique_cases(cases, limit=MAX_EXAMPLES):
    """Pick the first ``limit`` cases with distinct ``"gt"`` values.

    Args:
        cases: Iterable of dicts that each have a ``"gt"`` key.
        limit: Maximum number of distinct values to return.

    Returns:
        ``(shown, hidden)``: ``shown`` is the list of selected cases in input
        order and ``hidden`` is the number of further distinct ``"gt"`` values
        that were left out because of ``limit``.
    """
    shown = []
    seen = set()
    for case in cases:
        gt = case["gt"]
        if gt in seen:
            continue
        seen.add(gt)
        if len(shown) < limit:
            shown.append(case)
    return shown, len(seen) - len(shown)


def load_records(path):
    """Read a JSONL file and return one parsed object per non-blank line."""
    with open(path, encoding="utf-8") as fh:
        return [json.loads(line) for line in fh if line.strip()]


def main(argv=None):
    """Print a summary of how each record's ground truth is classified.

    Args:
        argv: Command-line arguments without the program name. The first
            one, if present, overrides ``DEFAULT_RESULTS_PATH``. Defaults to
            ``sys.argv[1:]``.
    """
    if argv is None:
        argv = sys.argv[1:]
    records = load_records(argv[0] if argv else DEFAULT_RESULTS_PATH)

    categories = {}   # status -> list of ground-truth strings
    hard_cases = []   # records whose status is in HARD_STATUSES
    for record in records:
        gt = str(record.get("ground_truth_value", "")).strip()
        # str() keeps the ':20s' format below from failing on a non-string
        # category value.
        category = str(record.get("category", "unknown"))
        _value, _unit, status = parse_gt(gt)

        categories.setdefault(status, []).append(gt)
        if status in HARD_STATUSES:
            hard_cases.append({"gt": gt, "category": category, "status": status})

    total = len(records)
    print(f"=== GT Format Analysis ({total} questions) ===\n")
    # Most frequent status first.
    for status, items in sorted(categories.items(), key=lambda kv: -len(kv[1])):
        print(f" {status:20s}: {len(items):4d} ({percentage(len(items), total):.1f}%)")

    print(f"\n=== Hard Cases ({len(hard_cases)} total) ===\n")

    # Group hard cases by status
    for status_type in HARD_STATUSES:
        cases = [c for c in hard_cases if c["status"] == status_type]
        if not cases:
            continue
        print(f"--- {status_type} ({len(cases)}) ---")
        # Show unique GT values (deduplicated); the "more" line counts only
        # distinct values that were not shown, not repeated occurrences.
        shown, hidden = unique_cases(cases)
        for case in shown:
            print(f" [{case['category']:20s}] {case['gt'][:80]}")
        if hidden:
            print(f" ... and {hidden} more")
        print()


if __name__ == "__main__":
    main()