explcre commited on
Commit
aa5c8f8
·
verified ·
1 Parent(s): f75b333

Upload _paper_results/reasoning_rl_multiseed_summary.json with huggingface_hub

Browse files
_paper_results/reasoning_rl_multiseed_summary.json ADDED
@@ -0,0 +1,111 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "t1": {
3
+ "per_seed": [
4
+ {
5
+ "seed": 2,
6
+ "score_path": "/workspace/dnathinker/runs/eval_reasoning_t1_v7r128_postRL_alpha1_s2_20260506_203542/score.json",
7
+ "tfg": 0.3964326508231607,
8
+ "n_cited": 20.48,
9
+ "n_grounded": 9.68,
10
+ "n_halluc": 10.58,
11
+ "reasoning_tags_rate": 0.6
12
+ },
13
+ {
14
+ "seed": 3,
15
+ "score_path": "/workspace/dnathinker/runs/eval_reasoning_t1_v7r128_postRL_alpha1_s3_20260506_234027/score.json",
16
+ "tfg": 0.3996367124792261,
17
+ "n_cited": 22.84,
18
+ "n_grounded": 13.74,
19
+ "n_halluc": 8.76,
20
+ "reasoning_tags_rate": 0.2
21
+ },
22
+ {
23
+ "seed": 42,
24
+ "score_path": "/workspace/dnathinker/runs/eval_reasoning_t1_v7r128_postRLext_best_20260506_053915/score.json",
25
+ "tfg": 0.43835448568269475,
26
+ "n_cited": 27.46,
27
+ "n_grounded": 14.58,
28
+ "n_halluc": 12.36,
29
+ "reasoning_tags_rate": 0.12
30
+ }
31
+ ],
32
+ "tfg_stats": {
33
+ "n": 3,
34
+ "mean": 0.4114746163283605,
35
+ "std": 0.023333710274054672,
36
+ "min": 0.3964326508231607,
37
+ "max": 0.43835448568269475,
38
+ "values": [
39
+ 0.3964326508231607,
40
+ 0.3996367124792261,
41
+ 0.43835448568269475
42
+ ]
43
+ }
44
+ },
45
+ "t2": {
46
+ "per_seed": [
47
+ {
48
+ "seed": 42,
49
+ "score_path": "/workspace/dnathinker/runs/eval_reasoning_t2_v7r128_postRL_alpha1_20260506_004858/score.json",
50
+ "tfg": 0.3650301689387592,
51
+ "n_cited": 20.58,
52
+ "n_grounded": 9.84,
53
+ "n_halluc": 10.08,
54
+ "reasoning_tags_rate": 0.44
55
+ }
56
+ ],
57
+ "tfg_stats": {
58
+ "n": 1,
59
+ "mean": 0.3650301689387592,
60
+ "std": 0.0,
61
+ "min": 0.3650301689387592,
62
+ "max": 0.3650301689387592,
63
+ "values": [
64
+ 0.3650301689387592
65
+ ]
66
+ }
67
+ },
68
+ "t3": {
69
+ "per_seed": [
70
+ {
71
+ "seed": 2,
72
+ "score_path": "/workspace/dnathinker/runs/eval_reasoning_t3_v7r128_postRL_alpha1_s2_par_20260506_221253/score.json",
73
+ "tfg": 0.23970945554855788,
74
+ "n_cited": 14.38,
75
+ "n_grounded": 6.46,
76
+ "n_halluc": 7.52,
77
+ "reasoning_tags_rate": 0.64
78
+ },
79
+ {
80
+ "seed": 3,
81
+ "score_path": "/workspace/dnathinker/runs/eval_reasoning_t3_v7r128_postRL_alpha1_s3_par_20260507_005010/score.json",
82
+ "tfg": 0.19508590161534112,
83
+ "n_cited": 15.08,
84
+ "n_grounded": 5.22,
85
+ "n_halluc": 9.54,
86
+ "reasoning_tags_rate": 0.8
87
+ },
88
+ {
89
+ "seed": 42,
90
+ "score_path": "/workspace/dnathinker/runs/eval_reasoning_t3_v7r128_postRL_alpha1_20260506_004903/score.json",
91
+ "tfg": 0.23932521613754584,
92
+ "n_cited": 20.06,
93
+ "n_grounded": 7.8,
94
+ "n_halluc": 11.88,
95
+ "reasoning_tags_rate": 0.64
96
+ }
97
+ ],
98
+ "tfg_stats": {
99
+ "n": 3,
100
+ "mean": 0.22470685776714827,
101
+ "std": 0.025653219924440397,
102
+ "min": 0.19508590161534112,
103
+ "max": 0.23970945554855788,
104
+ "values": [
105
+ 0.23970945554855788,
106
+ 0.19508590161534112,
107
+ 0.23932521613754584
108
+ ]
109
+ }
110
+ }
111
+ }