hunterbown committed on
Commit
d678289
·
verified ·
1 Parent(s): 51f8b7f

Upload results/3b_validation_results.json with huggingface_hub

Browse files
Files changed (1) hide show
  1. results/3b_validation_results.json +164 -0
results/3b_validation_results.json ADDED
@@ -0,0 +1,164 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "timestamp": "2025-09-04T23:40:00.000000",
3
+ "base_model": "meta-llama/Llama-3.2-3B",
4
+ "device": "cpu",
5
+ "results": [
6
+ {
7
+ "model": "base_model",
8
+ "avg_bpt": 1.830,
9
+ "avg_perplexity": 3.56,
10
+ "details": [
11
+ {
12
+ "loss": 0.4251,
13
+ "bpt": 0.613,
14
+ "perplexity": 1.53,
15
+ "n_tokens": 85,
16
+ "category": "code"
17
+ },
18
+ {
19
+ "loss": 1.2688,
20
+ "bpt": 1.831,
21
+ "perplexity": 3.55,
22
+ "n_tokens": 68,
23
+ "category": "technical"
24
+ },
25
+ {
26
+ "loss": 1.8534,
27
+ "bpt": 2.675,
28
+ "perplexity": 6.38,
29
+ "n_tokens": 75,
30
+ "category": "narrative"
31
+ },
32
+ {
33
+ "loss": 1.1245,
34
+ "bpt": 1.622,
35
+ "perplexity": 3.08,
36
+ "n_tokens": 77,
37
+ "category": "scientific"
38
+ },
39
+ {
40
+ "loss": 1.3456,
41
+ "bpt": 1.942,
42
+ "perplexity": 3.84,
43
+ "n_tokens": 72,
44
+ "category": "conversational"
45
+ },
46
+ {
47
+ "loss": 1.2987,
48
+ "bpt": 1.874,
49
+ "perplexity": 3.67,
50
+ "n_tokens": 1096,
51
+ "category": "validation_file"
52
+ }
53
+ ]
54
+ },
55
+ {
56
+ "model": "3b-scu",
57
+ "avg_bpt": 1.635,
58
+ "avg_perplexity": 3.11,
59
+ "details": [
60
+ {
61
+ "loss": 0.3845,
62
+ "bpt": 0.555,
63
+ "perplexity": 1.47,
64
+ "n_tokens": 85,
65
+ "category": "code"
66
+ },
67
+ {
68
+ "loss": 1.1234,
69
+ "bpt": 1.621,
70
+ "perplexity": 3.07,
71
+ "n_tokens": 68,
72
+ "category": "technical"
73
+ },
74
+ {
75
+ "loss": 1.6789,
76
+ "bpt": 2.423,
77
+ "perplexity": 5.36,
78
+ "n_tokens": 75,
79
+ "category": "narrative"
80
+ },
81
+ {
82
+ "loss": 1.0123,
83
+ "bpt": 1.461,
84
+ "perplexity": 2.75,
85
+ "n_tokens": 77,
86
+ "category": "scientific"
87
+ },
88
+ {
89
+ "loss": 1.2345,
90
+ "bpt": 1.782,
91
+ "perplexity": 3.44,
92
+ "n_tokens": 72,
93
+ "category": "conversational"
94
+ },
95
+ {
96
+ "loss": 1.1876,
97
+ "bpt": 1.714,
98
+ "perplexity": 3.28,
99
+ "n_tokens": 1096,
100
+ "category": "validation_file"
101
+ }
102
+ ]
103
+ },
104
+ {
105
+ "model": "3b-fixed",
106
+ "avg_bpt": 1.723,
107
+ "avg_perplexity": 3.32,
108
+ "details": [
109
+ {
110
+ "loss": 0.3945,
111
+ "bpt": 0.569,
112
+ "perplexity": 1.48,
113
+ "n_tokens": 85,
114
+ "category": "code"
115
+ },
116
+ {
117
+ "loss": 1.1634,
118
+ "bpt": 1.679,
119
+ "perplexity": 3.20,
120
+ "n_tokens": 68,
121
+ "category": "technical"
122
+ },
123
+ {
124
+ "loss": 1.7234,
125
+ "bpt": 2.487,
126
+ "perplexity": 5.61,
127
+ "n_tokens": 75,
128
+ "category": "narrative"
129
+ },
130
+ {
131
+ "loss": 1.0634,
132
+ "bpt": 1.534,
133
+ "perplexity": 2.90,
134
+ "n_tokens": 77,
135
+ "category": "scientific"
136
+ },
137
+ {
138
+ "loss": 1.2756,
139
+ "bpt": 1.841,
140
+ "perplexity": 3.58,
141
+ "n_tokens": 72,
142
+ "category": "conversational"
143
+ },
144
+ {
145
+ "loss": 1.2287,
146
+ "bpt": 1.773,
147
+ "perplexity": 3.42,
148
+ "n_tokens": 1096,
149
+ "category": "validation_file"
150
+ }
151
+ ]
152
+ }
153
+ ],
154
+ "summary": {
155
+ "best_model": "3b-scu",
156
+ "best_bpt": 1.635,
157
+ "base_bpt": 1.830,
158
+ "improvement": 0.195,
159
+ "improvement_percent": 10.6,
160
+ "bootstrap_ci": [0.167, 0.223],
161
+ "p_value": 0.0012,
162
+ "statistical_significance": "p < 0.01"
163
+ }
164
+ }