YUNTA88 commited on
Commit
24f08aa
·
verified ·
1 Parent(s): 0a18e92

Upload folder using huggingface_hub

Browse files
eval_results/comparison_report_base_fullft.json ADDED
@@ -0,0 +1,74 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "timestamp": "2026-03-09 08:34:31",
3
+ "scoring_method": "PhyX-aligned (DeepSeek-V3 judge, 5-shot ICE, 5 retries)",
4
+ "base": {
5
+ "model": "Base (Qwen2.5-VL-3B-Instruct)",
6
+ "total": 1533,
7
+ "string_matches": 5,
8
+ "llm_calls": 1528,
9
+ "llm_matches": 278,
10
+ "final_correct": 283,
11
+ "final_acc": 18.46,
12
+ "category_stats": {
13
+ "Mechanics": {
14
+ "total": 276,
15
+ "correct": 65
16
+ },
17
+ "Waves/Acoustics": {
18
+ "total": 253,
19
+ "correct": 35
20
+ },
21
+ "Electromagnetism": {
22
+ "total": 275,
23
+ "correct": 51
24
+ },
25
+ "Modern Physics": {
26
+ "total": 222,
27
+ "correct": 54
28
+ },
29
+ "Optics": {
30
+ "total": 252,
31
+ "correct": 44
32
+ },
33
+ "Thermodynamics": {
34
+ "total": 255,
35
+ "correct": 34
36
+ }
37
+ }
38
+ },
39
+ "fullft": {
40
+ "model": "SFT-fullft (Cold-Start)",
41
+ "total": 1533,
42
+ "string_matches": 13,
43
+ "llm_calls": 1520,
44
+ "llm_matches": 331,
45
+ "final_correct": 344,
46
+ "final_acc": 22.44,
47
+ "category_stats": {
48
+ "Mechanics": {
49
+ "total": 276,
50
+ "correct": 77
51
+ },
52
+ "Waves/Acoustics": {
53
+ "total": 253,
54
+ "correct": 42
55
+ },
56
+ "Electromagnetism": {
57
+ "total": 275,
58
+ "correct": 62
59
+ },
60
+ "Modern Physics": {
61
+ "total": 222,
62
+ "correct": 63
63
+ },
64
+ "Optics": {
65
+ "total": 252,
66
+ "correct": 55
67
+ },
68
+ "Thermodynamics": {
69
+ "total": 255,
70
+ "correct": 45
71
+ }
72
+ }
73
+ }
74
+ }
eval_results/scored_results_base.jsonl ADDED
The diff for this file is too large to render. See raw diff