justin-ailabs committed on
Commit
384e89d
·
1 Parent(s): 8eecd10

UI Overhaul: Premium Slate Aesthetic, Tabbed Interface, and Grid Refinements

Browse files
data/gpt5.4-judge-all-scores.json CHANGED
@@ -1,13 +1,4 @@
1
  {
2
- "o3-2025-04-16": {
3
- "comprehension_score": 7.389285714285714,
4
- "structure_score": 7.417857142857143,
5
- "prose_style_score": 7.575,
6
- "creativity_score": 7.564285714285714,
7
- "depth_score": 7.196428571428571,
8
- "helpfulness_score": 7.357142857142857,
9
- "overall_score": 7.307142857142857
10
- },
11
  "Deepseek-R1-0528": {
12
  "comprehension_score": 7.146428571428571,
13
  "structure_score": 7.0964285714285715,
@@ -26,32 +17,14 @@
26
  "helpfulness_score": 6.364285714285714,
27
  "overall_score": 6.457142857142857
28
  },
29
- "Qwen3-235B-Thinking": {
30
- "comprehension_score": 6.4071428571428575,
31
- "structure_score": 6.446428571428571,
32
- "prose_style_score": 7.2214285714285715,
33
- "creativity_score": 7.75,
34
- "depth_score": 6.739285714285714,
35
- "helpfulness_score": 6.239285714285714,
36
- "overall_score": 6.517857142857143
37
- },
38
- "Qwen3-32B-Thinking": {
39
- "comprehension_score": 6.457142857142857,
40
- "structure_score": 6.4714285714285715,
41
- "prose_style_score": 7.128571428571429,
42
- "creativity_score": 7.525,
43
- "depth_score": 6.614285714285714,
44
- "helpfulness_score": 6.317857142857143,
45
- "overall_score": 6.546428571428572
46
- },
47
- "Qwen3-30B-A3B-Thinking": {
48
- "comprehension_score": 5.964285714285714,
49
- "structure_score": 6.110714285714286,
50
- "prose_style_score": 6.521428571428571,
51
- "creativity_score": 6.8428571428571425,
52
- "depth_score": 5.985714285714286,
53
- "helpfulness_score": 5.882142857142857,
54
- "overall_score": 6.010714285714286
55
  },
56
  "Gemini-2.5-Flash": {
57
  "comprehension_score": 6.503571428571429,
@@ -62,6 +35,69 @@
62
  "helpfulness_score": 6.428571428571429,
63
  "overall_score": 6.2785714285714285
64
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
65
  "Qwen3-235B": {
66
  "comprehension_score": 5.896428571428571,
67
  "structure_score": 6.2,
@@ -71,6 +107,33 @@
71
  "helpfulness_score": 5.8464285714285715,
72
  "overall_score": 5.767857142857143
73
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
74
  "Qwen3-32B": {
75
  "comprehension_score": 5.303571428571429,
76
  "structure_score": 5.582142857142857,
@@ -80,6 +143,15 @@
80
  "helpfulness_score": 5.117857142857143,
81
  "overall_score": 5.042857142857143
82
  },
 
 
 
 
 
 
 
 
 
83
  "Qwen3-8B": {
84
  "comprehension_score": 4.860714285714286,
85
  "structure_score": 5.0928571428571425,
@@ -89,15 +161,6 @@
89
  "helpfulness_score": 4.639285714285714,
90
  "overall_score": 4.560714285714286
91
  },
92
- "Qwen3-30B-A3B": {
93
- "comprehension_score": 4.953571428571428,
94
- "structure_score": 5.2214285714285715,
95
- "prose_style_score": 4.6,
96
- "creativity_score": 4.7785714285714285,
97
- "depth_score": 4.521428571428571,
98
- "helpfulness_score": 4.760714285714286,
99
- "overall_score": 4.65
100
- },
101
  "gemma3-27b": {
102
  "comprehension_score": 5.164285714285715,
103
  "structure_score": 5.364285714285714,
@@ -107,32 +170,14 @@
107
  "helpfulness_score": 4.992857142857143,
108
  "overall_score": 4.814285714285714
109
  },
110
- "Phi-4-14B": {
111
- "comprehension_score": 4.2785714285714285,
112
- "structure_score": 4.514285714285714,
113
- "prose_style_score": 3.7714285714285714,
114
- "creativity_score": 4.310714285714286,
115
- "depth_score": 4.0285714285714285,
116
- "helpfulness_score": 4.017857142857143,
117
- "overall_score": 3.9857142857142858
118
- },
119
- "Mistral-3.2-24B-2506": {
120
- "comprehension_score": 5.475,
121
- "structure_score": 5.746428571428571,
122
- "prose_style_score": 5.35,
123
- "creativity_score": 5.885714285714286,
124
- "depth_score": 5.214285714285714,
125
- "helpfulness_score": 5.3464285714285715,
126
- "overall_score": 5.335714285714285
127
- },
128
- "Mistral-3.1-24B-2503": {
129
- "comprehension_score": 4.303571428571429,
130
- "structure_score": 4.521428571428571,
131
- "prose_style_score": 3.6857142857142855,
132
- "creativity_score": 3.9642857142857144,
133
- "depth_score": 3.8285714285714287,
134
- "helpfulness_score": 4.021428571428571,
135
- "overall_score": 3.9214285714285713
136
  },
137
  "gpt-4o-2024-11-20": {
138
  "comprehension_score": 5.275,
@@ -143,15 +188,6 @@
143
  "helpfulness_score": 4.957142857142857,
144
  "overall_score": 4.985714285714286
145
  },
146
- "gpt-4.1-mini-2025-04-14": {
147
- "comprehension_score": 4.8,
148
- "structure_score": 4.746428571428571,
149
- "prose_style_score": 4.339285714285714,
150
- "creativity_score": 4.489285714285714,
151
- "depth_score": 4.275,
152
- "helpfulness_score": 4.389285714285714,
153
- "overall_score": 4.35
154
- },
155
  "gpt-4o-mini-2024-07-18": {
156
  "comprehension_score": 4.417857142857143,
157
  "structure_score": 4.575,
@@ -161,15 +197,6 @@
161
  "helpfulness_score": 4.139285714285714,
162
  "overall_score": 4.15
163
  },
164
- "o4-mini-2025-04-16": {
165
- "comprehension_score": 5.803571428571429,
166
- "structure_score": 5.864285714285714,
167
- "prose_style_score": 5.428571428571429,
168
- "creativity_score": 5.760714285714286,
169
- "depth_score": 5.310714285714286,
170
- "helpfulness_score": 5.617857142857143,
171
- "overall_score": 5.557142857142857
172
- },
173
  "gpt-5.4": {
174
  "comprehension_score": 7.4678571428571425,
175
  "structure_score": 7.517857142857143,
@@ -179,41 +206,23 @@
179
  "helpfulness_score": 7.664285714285715,
180
  "overall_score": 7.421428571428572
181
  },
182
- "Gemini-3.1-Flash": {
183
- "comprehension_score": 7.0964285714285715,
184
- "structure_score": 7.232142857142857,
185
- "prose_style_score": 7.317857142857143,
186
- "creativity_score": 7.417857142857143,
187
- "depth_score": 7.053571428571429,
188
- "helpfulness_score": 7.185714285714286,
189
- "overall_score": 7.117857142857143
190
- },
191
- "Gemini-3.1-Pro": {
192
- "comprehension_score": 7.510714285714286,
193
- "structure_score": 7.5321428571428575,
194
- "prose_style_score": 7.5285714285714285,
195
- "creativity_score": 7.614285714285714,
196
- "depth_score": 7.321428571428571,
197
- "helpfulness_score": 7.65,
198
- "overall_score": 7.460714285714285
199
- },
200
- "MiniMax-M2.5": {
201
- "comprehension_score": 6.292857142857143,
202
- "structure_score": 6.442857142857143,
203
- "prose_style_score": 5.614285714285714,
204
- "creativity_score": 6.171428571428572,
205
- "depth_score": 5.975,
206
- "helpfulness_score": 6.242857142857143,
207
- "overall_score": 6.071428571428571
208
- },
209
- "Qwen3.5-Plus": {
210
- "comprehension_score": 7.228571428571429,
211
- "structure_score": 7.3,
212
- "prose_style_score": 7.232142857142857,
213
- "creativity_score": 7.353571428571429,
214
  "depth_score": 7.196428571428571,
215
- "helpfulness_score": 7.289285714285715,
216
- "overall_score": 7.189285714285714
 
 
 
 
 
 
 
 
 
217
  },
218
  "step-3.5-flash": {
219
  "comprehension_score": 7.285714285714286,
@@ -224,13 +233,13 @@
224
  "helpfulness_score": 7.2178571428571425,
225
  "overall_score": 7.253571428571429
226
  },
227
- "GLM-5": {
228
- "comprehension_score": 7.103571428571429,
229
- "structure_score": 7.182142857142857,
230
- "prose_style_score": 7.114285714285714,
231
- "creativity_score": 7.185714285714286,
232
- "depth_score": 6.964285714285714,
233
- "helpfulness_score": 7.146428571428571,
234
- "overall_score": 7.042857142857143
235
  }
236
  }
 
1
  {
 
 
 
 
 
 
 
 
 
2
  "Deepseek-R1-0528": {
3
  "comprehension_score": 7.146428571428571,
4
  "structure_score": 7.0964285714285715,
 
17
  "helpfulness_score": 6.364285714285714,
18
  "overall_score": 6.457142857142857
19
  },
20
+ "GLM-5": {
21
+ "comprehension_score": 7.103571428571429,
22
+ "structure_score": 7.182142857142857,
23
+ "prose_style_score": 7.114285714285714,
24
+ "creativity_score": 7.185714285714286,
25
+ "depth_score": 6.964285714285714,
26
+ "helpfulness_score": 7.146428571428571,
27
+ "overall_score": 7.042857142857143
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
28
  },
29
  "Gemini-2.5-Flash": {
30
  "comprehension_score": 6.503571428571429,
 
35
  "helpfulness_score": 6.428571428571429,
36
  "overall_score": 6.2785714285714285
37
  },
38
+ "Gemini-3.1-Flash": {
39
+ "comprehension_score": 7.0964285714285715,
40
+ "structure_score": 7.232142857142857,
41
+ "prose_style_score": 7.317857142857143,
42
+ "creativity_score": 7.417857142857143,
43
+ "depth_score": 7.053571428571429,
44
+ "helpfulness_score": 7.185714285714286,
45
+ "overall_score": 7.117857142857143
46
+ },
47
+ "Gemini-3.1-Pro": {
48
+ "comprehension_score": 7.510714285714286,
49
+ "structure_score": 7.5321428571428575,
50
+ "prose_style_score": 7.5285714285714285,
51
+ "creativity_score": 7.614285714285714,
52
+ "depth_score": 7.321428571428571,
53
+ "helpfulness_score": 7.65,
54
+ "overall_score": 7.460714285714285
55
+ },
56
+ "MiniMax-M2.5": {
57
+ "comprehension_score": 6.292857142857143,
58
+ "structure_score": 6.442857142857143,
59
+ "prose_style_score": 5.614285714285714,
60
+ "creativity_score": 6.171428571428572,
61
+ "depth_score": 5.975,
62
+ "helpfulness_score": 6.242857142857143,
63
+ "overall_score": 6.071428571428571
64
+ },
65
+ "Mistral-3.1-24B-2503": {
66
+ "comprehension_score": 4.303571428571429,
67
+ "structure_score": 4.521428571428571,
68
+ "prose_style_score": 3.6857142857142855,
69
+ "creativity_score": 3.9642857142857144,
70
+ "depth_score": 3.8285714285714287,
71
+ "helpfulness_score": 4.021428571428571,
72
+ "overall_score": 3.9214285714285713
73
+ },
74
+ "Mistral-3.2-24B-2506": {
75
+ "comprehension_score": 5.475,
76
+ "structure_score": 5.746428571428571,
77
+ "prose_style_score": 5.35,
78
+ "creativity_score": 5.885714285714286,
79
+ "depth_score": 5.214285714285714,
80
+ "helpfulness_score": 5.3464285714285715,
81
+ "overall_score": 5.335714285714285
82
+ },
83
+ "Phi-4-14B": {
84
+ "comprehension_score": 4.2785714285714285,
85
+ "structure_score": 4.514285714285714,
86
+ "prose_style_score": 3.7714285714285714,
87
+ "creativity_score": 4.310714285714286,
88
+ "depth_score": 4.0285714285714285,
89
+ "helpfulness_score": 4.017857142857143,
90
+ "overall_score": 3.9857142857142858
91
+ },
92
+ "Qwen3.5-Plus": {
93
+ "comprehension_score": 7.228571428571429,
94
+ "structure_score": 7.3,
95
+ "prose_style_score": 7.232142857142857,
96
+ "creativity_score": 7.353571428571429,
97
+ "depth_score": 7.196428571428571,
98
+ "helpfulness_score": 7.289285714285715,
99
+ "overall_score": 7.189285714285714
100
+ },
101
  "Qwen3-235B": {
102
  "comprehension_score": 5.896428571428571,
103
  "structure_score": 6.2,
 
107
  "helpfulness_score": 5.8464285714285715,
108
  "overall_score": 5.767857142857143
109
  },
110
+ "Qwen3-235B-Thinking": {
111
+ "comprehension_score": 6.4071428571428575,
112
+ "structure_score": 6.446428571428571,
113
+ "prose_style_score": 7.2214285714285715,
114
+ "creativity_score": 7.75,
115
+ "depth_score": 6.739285714285714,
116
+ "helpfulness_score": 6.239285714285714,
117
+ "overall_score": 6.517857142857143
118
+ },
119
+ "Qwen3-30B-A3B": {
120
+ "comprehension_score": 4.953571428571428,
121
+ "structure_score": 5.2214285714285715,
122
+ "prose_style_score": 4.6,
123
+ "creativity_score": 4.7785714285714285,
124
+ "depth_score": 4.521428571428571,
125
+ "helpfulness_score": 4.760714285714286,
126
+ "overall_score": 4.65
127
+ },
128
+ "Qwen3-30B-A3B-Thinking": {
129
+ "comprehension_score": 5.964285714285714,
130
+ "structure_score": 6.110714285714286,
131
+ "prose_style_score": 6.521428571428571,
132
+ "creativity_score": 6.8428571428571425,
133
+ "depth_score": 5.985714285714286,
134
+ "helpfulness_score": 5.882142857142857,
135
+ "overall_score": 6.010714285714286
136
+ },
137
  "Qwen3-32B": {
138
  "comprehension_score": 5.303571428571429,
139
  "structure_score": 5.582142857142857,
 
143
  "helpfulness_score": 5.117857142857143,
144
  "overall_score": 5.042857142857143
145
  },
146
+ "Qwen3-32B-Thinking": {
147
+ "comprehension_score": 6.457142857142857,
148
+ "structure_score": 6.4714285714285715,
149
+ "prose_style_score": 7.128571428571429,
150
+ "creativity_score": 7.525,
151
+ "depth_score": 6.614285714285714,
152
+ "helpfulness_score": 6.317857142857143,
153
+ "overall_score": 6.546428571428572
154
+ },
155
  "Qwen3-8B": {
156
  "comprehension_score": 4.860714285714286,
157
  "structure_score": 5.0928571428571425,
 
161
  "helpfulness_score": 4.639285714285714,
162
  "overall_score": 4.560714285714286
163
  },
 
 
 
 
 
 
 
 
 
164
  "gemma3-27b": {
165
  "comprehension_score": 5.164285714285715,
166
  "structure_score": 5.364285714285714,
 
170
  "helpfulness_score": 4.992857142857143,
171
  "overall_score": 4.814285714285714
172
  },
173
+ "gpt-4.1-mini-2025-04-14": {
174
+ "comprehension_score": 4.8,
175
+ "structure_score": 4.746428571428571,
176
+ "prose_style_score": 4.339285714285714,
177
+ "creativity_score": 4.489285714285714,
178
+ "depth_score": 4.275,
179
+ "helpfulness_score": 4.389285714285714,
180
+ "overall_score": 4.35
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
181
  },
182
  "gpt-4o-2024-11-20": {
183
  "comprehension_score": 5.275,
 
188
  "helpfulness_score": 4.957142857142857,
189
  "overall_score": 4.985714285714286
190
  },
 
 
 
 
 
 
 
 
 
191
  "gpt-4o-mini-2024-07-18": {
192
  "comprehension_score": 4.417857142857143,
193
  "structure_score": 4.575,
 
197
  "helpfulness_score": 4.139285714285714,
198
  "overall_score": 4.15
199
  },
 
 
 
 
 
 
 
 
 
200
  "gpt-5.4": {
201
  "comprehension_score": 7.4678571428571425,
202
  "structure_score": 7.517857142857143,
 
206
  "helpfulness_score": 7.664285714285715,
207
  "overall_score": 7.421428571428572
208
  },
209
+ "o3-2025-04-16": {
210
+ "comprehension_score": 7.389285714285714,
211
+ "structure_score": 7.417857142857143,
212
+ "prose_style_score": 7.575,
213
+ "creativity_score": 7.564285714285714,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
214
  "depth_score": 7.196428571428571,
215
+ "helpfulness_score": 7.357142857142857,
216
+ "overall_score": 7.307142857142857
217
+ },
218
+ "o4-mini-2025-04-16": {
219
+ "comprehension_score": 5.803571428571429,
220
+ "structure_score": 5.864285714285714,
221
+ "prose_style_score": 5.428571428571429,
222
+ "creativity_score": 5.760714285714286,
223
+ "depth_score": 5.310714285714286,
224
+ "helpfulness_score": 5.617857142857143,
225
+ "overall_score": 5.557142857142857
226
  },
227
  "step-3.5-flash": {
228
  "comprehension_score": 7.285714285714286,
 
233
  "helpfulness_score": 7.2178571428571425,
234
  "overall_score": 7.253571428571429
235
  },
236
+ "gpt-4.1-2025-04-14": {
237
+ "comprehension_score": 5.0,
238
+ "structure_score": 5.0,
239
+ "prose_style_score": 5.0,
240
+ "creativity_score": 5.0,
241
+ "depth_score": 5.0,
242
+ "helpfulness_score": 5.0,
243
+ "overall_score": 5.0
244
  }
245
  }
data/gpt5.4-judge-complicated-writing-scores.json CHANGED
@@ -1,236 +1,245 @@
1
  {
2
- "o3-2025-04-16": {
3
- "comprehension_score": 7.5,
4
- "structure_score": 7.541176470588235,
5
- "prose_style_score": 7.788235294117647,
6
- "creativity_score": 7.688235294117647,
7
- "depth_score": 7.358823529411764,
8
- "helpfulness_score": 7.476470588235294,
9
- "overall_score": 7.4411764705882355
10
- },
11
  "Deepseek-R1-0528": {
12
- "comprehension_score": 7.070588235294117,
13
- "structure_score": 7.0,
14
- "prose_style_score": 7.352941176470588,
15
- "creativity_score": 7.647058823529412,
16
- "depth_score": 7.341176470588235,
17
- "helpfulness_score": 6.923529411764706,
18
- "overall_score": 7.105882352941176
19
  },
20
  "Deepseek-V3-0324": {
21
- "comprehension_score": 6.188235294117647,
22
- "structure_score": 6.3,
23
- "prose_style_score": 6.758823529411765,
24
- "creativity_score": 7.0588235294117645,
25
- "depth_score": 6.223529411764706,
26
- "helpfulness_score": 6.035294117647059,
27
- "overall_score": 6.223529411764706
28
  },
29
- "Qwen3-235B-Thinking": {
30
- "comprehension_score": 6.376470588235295,
31
- "structure_score": 6.252941176470588,
32
- "prose_style_score": 7.435294117647059,
33
- "creativity_score": 7.829411764705882,
34
- "depth_score": 6.7176470588235295,
35
- "helpfulness_score": 6.176470588235294,
36
- "overall_score": 6.517647058823529
37
  },
38
- "Qwen3-32B-Thinking": {
39
- "comprehension_score": 6.3882352941176475,
40
- "structure_score": 6.323529411764706,
41
- "prose_style_score": 7.323529411764706,
42
- "creativity_score": 7.570588235294117,
43
- "depth_score": 6.523529411764706,
44
- "helpfulness_score": 6.258823529411765,
45
- "overall_score": 6.523529411764706
46
  },
47
- "Qwen3-30B-A3B-Thinking": {
48
- "comprehension_score": 5.823529411764706,
49
- "structure_score": 5.952941176470588,
50
- "prose_style_score": 6.570588235294117,
51
- "creativity_score": 6.829411764705882,
52
- "depth_score": 5.7823529411764705,
53
- "helpfulness_score": 5.735294117647059,
54
- "overall_score": 5.876470588235295
55
  },
56
- "Gemini-2.5-Flash": {
57
- "comprehension_score": 6.429411764705883,
58
- "structure_score": 6.482352941176471,
59
- "prose_style_score": 5.847058823529411,
60
- "creativity_score": 6.147058823529412,
61
- "depth_score": 6.094117647058823,
62
- "helpfulness_score": 6.2823529411764705,
63
- "overall_score": 6.147058823529412
64
  },
65
- "Qwen3-235B": {
66
- "comprehension_score": 5.788235294117647,
67
- "structure_score": 6.070588235294117,
68
- "prose_style_score": 5.511764705882353,
69
- "creativity_score": 5.647058823529412,
70
- "depth_score": 5.4,
71
- "helpfulness_score": 5.623529411764705,
72
- "overall_score": 5.58235294117647
73
  },
74
- "Qwen3-32B": {
75
- "comprehension_score": 5.117647058823529,
76
- "structure_score": 5.364705882352941,
77
- "prose_style_score": 4.694117647058824,
78
- "creativity_score": 4.982352941176471,
79
- "depth_score": 4.647058823529412,
80
- "helpfulness_score": 4.794117647058823,
81
- "overall_score": 4.729411764705882
82
  },
83
- "Qwen3-8B": {
84
- "comprehension_score": 4.6647058823529415,
85
- "structure_score": 4.870588235294117,
86
- "prose_style_score": 4.2823529411764705,
87
- "creativity_score": 4.5,
88
- "depth_score": 4.241176470588235,
89
- "helpfulness_score": 4.347058823529411,
90
- "overall_score": 4.3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
91
  },
92
  "Qwen3-30B-A3B": {
93
- "comprehension_score": 4.805882352941176,
94
- "structure_score": 4.988235294117647,
95
- "prose_style_score": 4.405882352941177,
96
- "creativity_score": 4.617647058823529,
97
- "depth_score": 4.364705882352941,
98
- "helpfulness_score": 4.523529411764706,
99
- "overall_score": 4.429411764705883
100
  },
101
- "gemma3-27b": {
102
- "comprehension_score": 5.070588235294117,
103
- "structure_score": 5.241176470588235,
104
- "prose_style_score": 4.470588235294118,
105
- "creativity_score": 5.011764705882353,
106
- "depth_score": 4.58235294117647,
107
- "helpfulness_score": 4.817647058823529,
108
- "overall_score": 4.6647058823529415
109
  },
110
- "Phi-4-14B": {
111
- "comprehension_score": 4.235294117647059,
112
- "structure_score": 4.529411764705882,
113
- "prose_style_score": 3.652941176470588,
114
- "creativity_score": 4.258823529411765,
115
- "depth_score": 4.023529411764706,
116
- "helpfulness_score": 4.017647058823529,
117
- "overall_score": 3.9647058823529413
118
  },
119
- "Mistral-3.2-24B-2506": {
120
- "comprehension_score": 5.3352941176470585,
121
- "structure_score": 5.570588235294117,
122
- "prose_style_score": 5.188235294117647,
123
- "creativity_score": 5.7176470588235295,
124
- "depth_score": 5.047058823529412,
125
- "helpfulness_score": 5.147058823529412,
126
- "overall_score": 5.147058823529412
127
  },
128
- "Mistral-3.1-24B-2503": {
129
- "comprehension_score": 4.276470588235294,
130
- "structure_score": 4.523529411764706,
131
- "prose_style_score": 3.523529411764706,
132
- "creativity_score": 3.9411764705882355,
133
- "depth_score": 3.8117647058823527,
134
- "helpfulness_score": 4.029411764705882,
135
- "overall_score": 3.9058823529411764
136
  },
137
- "gpt-4o-2024-11-20": {
138
- "comprehension_score": 5.223529411764706,
139
- "structure_score": 5.176470588235294,
140
- "prose_style_score": 4.882352941176471,
141
- "creativity_score": 5.211764705882353,
142
- "depth_score": 4.788235294117647,
143
- "helpfulness_score": 4.852941176470588,
144
- "overall_score": 4.858823529411764
145
  },
146
  "gpt-4.1-mini-2025-04-14": {
147
- "comprehension_score": 4.7,
148
- "structure_score": 4.694117647058824,
149
- "prose_style_score": 4.247058823529412,
150
- "creativity_score": 4.405882352941177,
151
- "depth_score": 4.176470588235294,
152
- "helpfulness_score": 4.264705882352941,
153
- "overall_score": 4.241176470588235
154
  },
155
- "gpt-4o-mini-2024-07-18": {
156
- "comprehension_score": 4.41764705882353,
157
- "structure_score": 4.588235294117647,
158
- "prose_style_score": 4.0,
159
- "creativity_score": 4.329411764705882,
160
- "depth_score": 4.076470588235294,
161
- "helpfulness_score": 4.117647058823529,
162
- "overall_score": 4.147058823529412
163
  },
164
- "o4-mini-2025-04-16": {
165
- "comprehension_score": 5.694117647058824,
166
- "structure_score": 5.764705882352941,
167
- "prose_style_score": 5.341176470588235,
168
- "creativity_score": 5.623529411764705,
169
- "depth_score": 5.152941176470589,
170
- "helpfulness_score": 5.488235294117647,
171
- "overall_score": 5.41764705882353
172
  },
173
  "gpt-5.4": {
174
- "comprehension_score": 7.529411764705882,
175
- "structure_score": 7.535294117647059,
176
- "prose_style_score": 7.858823529411764,
177
- "creativity_score": 7.3,
178
- "depth_score": 7.7176470588235295,
179
- "helpfulness_score": 7.5588235294117645,
180
- "overall_score": 7.482352941176471
181
- },
182
- "Gemini-3.1-Flash": {
183
- "comprehension_score": 7.041176470588235,
184
- "structure_score": 7.1647058823529415,
185
- "prose_style_score": 7.411764705882353,
186
- "creativity_score": 7.294117647058823,
187
- "depth_score": 7.0,
188
- "helpfulness_score": 7.064705882352941,
189
- "overall_score": 7.064705882352941
190
- },
191
- "Gemini-3.1-Pro": {
192
- "comprehension_score": 7.511764705882353,
193
- "structure_score": 7.5,
194
- "prose_style_score": 7.682352941176471,
195
- "creativity_score": 7.5,
196
- "depth_score": 7.305882352941176,
197
- "helpfulness_score": 7.552941176470588,
198
- "overall_score": 7.452941176470588
199
  },
200
- "MiniMax-M2.5": {
201
- "comprehension_score": 6.176470588235294,
202
- "structure_score": 6.323529411764706,
203
- "prose_style_score": 5.511764705882353,
204
- "creativity_score": 6.011764705882353,
205
- "depth_score": 5.788235294117647,
206
- "helpfulness_score": 6.035294117647059,
207
- "overall_score": 5.905882352941177
208
  },
209
- "Qwen3.5-Plus": {
210
- "comprehension_score": 7.205882352941177,
211
- "structure_score": 7.2823529411764705,
212
- "prose_style_score": 7.41764705882353,
213
- "creativity_score": 7.394117647058824,
214
- "depth_score": 7.205882352941177,
215
- "helpfulness_score": 7.229411764705882,
216
- "overall_score": 7.2
217
  },
218
  "step-3.5-flash": {
219
- "comprehension_score": 7.258823529411765,
220
- "structure_score": 7.252941176470588,
221
- "prose_style_score": 7.105882352941176,
222
- "creativity_score": 7.8352941176470585,
223
- "depth_score": 7.552941176470588,
224
- "helpfulness_score": 7.1647058823529415,
225
- "overall_score": 7.229411764705882
226
- },
227
- "GLM-5": {
228
- "comprehension_score": 7.064705882352941,
229
- "structure_score": 7.147058823529412,
230
- "prose_style_score": 7.176470588235294,
231
- "creativity_score": 7.1,
232
- "depth_score": 6.8882352941176475,
233
- "helpfulness_score": 7.064705882352941,
234
- "overall_score": 7.011764705882353
235
  }
236
  }
 
1
  {
 
 
 
 
 
 
 
 
 
2
  "Deepseek-R1-0528": {
3
+ "comprehension_score": 7.481132075471698,
4
+ "structure_score": 7.339622641509434,
5
+ "prose_style_score": 7.59433962264151,
6
+ "creativity_score": 8.018867924528301,
7
+ "depth_score": 7.688679245283019,
8
+ "helpfulness_score": 7.367924528301887,
9
+ "overall_score": 7.5
10
  },
11
  "Deepseek-V3-0324": {
12
+ "comprehension_score": 6.39622641509434,
13
+ "structure_score": 6.386792452830188,
14
+ "prose_style_score": 6.9245283018867925,
15
+ "creativity_score": 7.367924528301887,
16
+ "depth_score": 6.311320754716981,
17
+ "helpfulness_score": 6.216981132075472,
18
+ "overall_score": 6.367924528301887
19
  },
20
+ "GLM-5": {
21
+ "comprehension_score": 7.4245283018867925,
22
+ "structure_score": 7.518867924528302,
23
+ "prose_style_score": 7.320754716981132,
24
+ "creativity_score": 7.433962264150943,
25
+ "depth_score": 7.254716981132075,
26
+ "helpfulness_score": 7.443396226415095,
27
+ "overall_score": 7.330188679245283
28
  },
29
+ "Gemini-2.5-Flash": {
30
+ "comprehension_score": 6.4245283018867925,
31
+ "structure_score": 6.490566037735849,
32
+ "prose_style_score": 5.773584905660377,
33
+ "creativity_score": 5.990566037735849,
34
+ "depth_score": 6.018867924528302,
35
+ "helpfulness_score": 6.339622641509434,
36
+ "overall_score": 6.09433962264151
37
  },
38
+ "Gemini-3.1-Flash": {
39
+ "comprehension_score": 7.415094339622642,
40
+ "structure_score": 7.471698113207547,
41
+ "prose_style_score": 7.632075471698113,
42
+ "creativity_score": 7.669811320754717,
43
+ "depth_score": 7.2924528301886795,
44
+ "helpfulness_score": 7.433962264150943,
45
+ "overall_score": 7.40566037735849
46
  },
47
+ "Gemini-3.1-Pro": {
48
+ "comprehension_score": 7.830188679245283,
49
+ "structure_score": 7.764150943396227,
50
+ "prose_style_score": 7.811320754716981,
51
+ "creativity_score": 8.037735849056604,
52
+ "depth_score": 7.688679245283019,
53
+ "helpfulness_score": 7.962264150943396,
54
+ "overall_score": 7.745283018867925
55
  },
56
+ "MiniMax-M2.5": {
57
+ "comprehension_score": 6.1415094339622645,
58
+ "structure_score": 6.39622641509434,
59
+ "prose_style_score": 5.377358490566038,
60
+ "creativity_score": 6.066037735849057,
61
+ "depth_score": 5.688679245283019,
62
+ "helpfulness_score": 6.10377358490566,
63
+ "overall_score": 5.849056603773585
64
  },
65
+ "Mistral-3.1-24B-2503": {
66
+ "comprehension_score": 4.1415094339622645,
67
+ "structure_score": 4.386792452830188,
68
+ "prose_style_score": 3.30188679245283,
69
+ "creativity_score": 3.650943396226415,
70
+ "depth_score": 3.518867924528302,
71
+ "helpfulness_score": 3.8679245283018866,
72
+ "overall_score": 3.7264150943396226
73
  },
74
+ "Mistral-3.2-24B-2506": {
75
+ "comprehension_score": 5.169811320754717,
76
+ "structure_score": 5.311320754716981,
77
+ "prose_style_score": 4.89622641509434,
78
+ "creativity_score": 5.688679245283019,
79
+ "depth_score": 4.745283018867925,
80
+ "helpfulness_score": 4.981132075471698,
81
+ "overall_score": 4.89622641509434
82
+ },
83
+ "Phi-4-14B": {
84
+ "comprehension_score": 4.245283018867925,
85
+ "structure_score": 4.40566037735849,
86
+ "prose_style_score": 3.5754716981132075,
87
+ "creativity_score": 4.0754716981132075,
88
+ "depth_score": 3.707547169811321,
89
+ "helpfulness_score": 4.009433962264151,
90
+ "overall_score": 3.858490566037736
91
+ },
92
+ "Qwen3.5-Plus": {
93
+ "comprehension_score": 7.632075471698113,
94
+ "structure_score": 7.745283018867925,
95
+ "prose_style_score": 7.754716981132075,
96
+ "creativity_score": 7.716981132075472,
97
+ "depth_score": 7.622641509433962,
98
+ "helpfulness_score": 7.632075471698113,
99
+ "overall_score": 7.556603773584905
100
+ },
101
+ "Qwen3-235B": {
102
+ "comprehension_score": 5.773584905660377,
103
+ "structure_score": 6.113207547169812,
104
+ "prose_style_score": 5.433962264150943,
105
+ "creativity_score": 5.745283018867925,
106
+ "depth_score": 5.226415094339623,
107
+ "helpfulness_score": 5.679245283018868,
108
+ "overall_score": 5.566037735849057
109
+ },
110
+ "Qwen3-235B-Thinking": {
111
+ "comprehension_score": 6.179245283018868,
112
+ "structure_score": 6.1415094339622645,
113
+ "prose_style_score": 7.490566037735849,
114
+ "creativity_score": 8.056603773584905,
115
+ "depth_score": 6.69811320754717,
116
+ "helpfulness_score": 6.084905660377358,
117
+ "overall_score": 6.39622641509434
118
  },
119
  "Qwen3-30B-A3B": {
120
+ "comprehension_score": 4.7075471698113205,
121
+ "structure_score": 5.10377358490566,
122
+ "prose_style_score": 4.179245283018868,
123
+ "creativity_score": 4.547169811320755,
124
+ "depth_score": 4.245283018867925,
125
+ "helpfulness_score": 4.471698113207547,
126
+ "overall_score": 4.330188679245283
127
  },
128
+ "Qwen3-30B-A3B-Thinking": {
129
+ "comprehension_score": 5.867924528301887,
130
+ "structure_score": 6.009433962264151,
131
+ "prose_style_score": 6.584905660377358,
132
+ "creativity_score": 7.122641509433962,
133
+ "depth_score": 6.009433962264151,
134
+ "helpfulness_score": 5.811320754716981,
135
+ "overall_score": 5.943396226415095
136
  },
137
+ "Qwen3-32B": {
138
+ "comprehension_score": 5.037735849056604,
139
+ "structure_score": 5.30188679245283,
140
+ "prose_style_score": 4.518867924528302,
141
+ "creativity_score": 5.018867924528302,
142
+ "depth_score": 4.5754716981132075,
143
+ "helpfulness_score": 4.754716981132075,
144
+ "overall_score": 4.688679245283019
145
  },
146
+ "Qwen3-32B-Thinking": {
147
+ "comprehension_score": 6.377358490566038,
148
+ "structure_score": 6.273584905660377,
149
+ "prose_style_score": 7.3584905660377355,
150
+ "creativity_score": 7.886792452830188,
151
+ "depth_score": 6.60377358490566,
152
+ "helpfulness_score": 6.2924528301886795,
153
+ "overall_score": 6.5
154
  },
155
+ "Qwen3-8B": {
156
+ "comprehension_score": 4.613207547169812,
157
+ "structure_score": 4.811320754716981,
158
+ "prose_style_score": 4.216981132075472,
159
+ "creativity_score": 4.528301886792453,
160
+ "depth_score": 4.169811320754717,
161
+ "helpfulness_score": 4.433962264150943,
162
+ "overall_score": 4.311320754716981
163
  },
164
+ "gemma3-27b": {
165
+ "comprehension_score": 4.90566037735849,
166
+ "structure_score": 5.113207547169812,
167
+ "prose_style_score": 4.047169811320755,
168
+ "creativity_score": 4.7075471698113205,
169
+ "depth_score": 4.30188679245283,
170
+ "helpfulness_score": 4.632075471698113,
171
+ "overall_score": 4.367924528301887
172
  },
173
  "gpt-4.1-mini-2025-04-14": {
174
+ "comprehension_score": 4.69811320754717,
175
+ "structure_score": 4.613207547169812,
176
+ "prose_style_score": 4.179245283018868,
177
+ "creativity_score": 4.4245283018867925,
178
+ "depth_score": 4.160377358490566,
179
+ "helpfulness_score": 4.235849056603773,
180
+ "overall_score": 4.216981132075472
181
  },
182
+ "gpt-4o-2024-11-20": {
183
+ "comprehension_score": 5.2075471698113205,
184
+ "structure_score": 5.216981132075472,
185
+ "prose_style_score": 4.877358490566038,
186
+ "creativity_score": 5.377358490566038,
187
+ "depth_score": 4.764150943396227,
188
+ "helpfulness_score": 4.8584905660377355,
189
+ "overall_score": 4.877358490566038
190
  },
191
+ "gpt-4o-mini-2024-07-18": {
192
+ "comprehension_score": 4.2924528301886795,
193
+ "structure_score": 4.5,
194
+ "prose_style_score": 3.8773584905660377,
195
+ "creativity_score": 4.122641509433962,
196
+ "depth_score": 3.952830188679245,
197
+ "helpfulness_score": 4.0754716981132075,
198
+ "overall_score": 4.047169811320755
199
  },
200
  "gpt-5.4": {
201
+ "comprehension_score": 8.0,
202
+ "structure_score": 7.971698113207547,
203
+ "prose_style_score": 8.377358490566039,
204
+ "creativity_score": 7.830188679245283,
205
+ "depth_score": 8.216981132075471,
206
+ "helpfulness_score": 8.11320754716981,
207
+ "overall_score": 7.943396226415095
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
208
  },
209
+ "o3-2025-04-16": {
210
+ "comprehension_score": 7.886792452830188,
211
+ "structure_score": 7.933962264150943,
212
+ "prose_style_score": 8.273584905660377,
213
+ "creativity_score": 8.160377358490566,
214
+ "depth_score": 7.764150943396227,
215
+ "helpfulness_score": 7.90566037735849,
216
+ "overall_score": 7.8584905660377355
217
  },
218
+ "o4-mini-2025-04-16": {
219
+ "comprehension_score": 5.433962264150943,
220
+ "structure_score": 5.537735849056604,
221
+ "prose_style_score": 4.971698113207547,
222
+ "creativity_score": 5.5,
223
+ "depth_score": 4.8584905660377355,
224
+ "helpfulness_score": 5.150943396226415,
225
+ "overall_score": 5.056603773584905
226
  },
227
  "step-3.5-flash": {
228
+ "comprehension_score": 7.5754716981132075,
229
+ "structure_score": 7.59433962264151,
230
+ "prose_style_score": 7.481132075471698,
231
+ "creativity_score": 8.339622641509434,
232
+ "depth_score": 7.877358490566038,
233
+ "helpfulness_score": 7.490566037735849,
234
+ "overall_score": 7.547169811320755
235
+ },
236
+ "gpt-4.1-2025-04-14": {
237
+ "comprehension_score": 5.0,
238
+ "structure_score": 5.0,
239
+ "prose_style_score": 5.0,
240
+ "creativity_score": 5.0,
241
+ "depth_score": 5.0,
242
+ "helpfulness_score": 5.0,
243
+ "overall_score": 5.0
244
  }
245
  }
index.html CHANGED
@@ -1,13 +1,15 @@
1
  <!DOCTYPE html>
2
  <html lang="en">
 
3
  <head>
4
  <meta charset="UTF-8">
5
  <meta name="viewport" content="width=device-width, initial-scale=1.0">
6
  <title>Zhiyin</title>
7
- <link rel="stylesheet" href="styles.css">
8
  <link href="https://fonts.googleapis.com/css2?family=Inter:wght@300;400;500;600;700&display=swap" rel="stylesheet">
9
  <link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.0.0/css/all.min.css">
10
  </head>
 
11
  <body>
12
  <div class="container">
13
  <header class="header">
@@ -18,24 +20,29 @@
18
  <div class="dashboard card">
19
  <section class="overview-section">
20
  <h2 class="section-title">
21
- <span class="accent-bar"></span>
22
- Benchmark Overview
23
- <span class="section-title-spacer"></span>
24
- <span class="external-links-text">
25
- <a href="https://github.com/zake7749/Chinese-Writing-Bench" target="_blank" rel="noopener" class="external-link-text">GitHub</a>
26
- <span class="divider">&bull;</span>
27
- <a href="https://huggingface.co/datasets/zake7749/chinese-writing-benchmark" target="_blank" rel="noopener" class="external-link-text">Hugging Face</a>
28
- </span>
 
 
29
  </h2>
30
  <div class="overview-content">
31
  <div class="overview-text">
32
  <p>
33
- <strong>Zhiyin</strong> is an LLM-as-a-judge benchmark for Chinese writing evaluation, featuring 280 test cases across 18 diverse writing tasks in this V1 release.
 
34
  </p>
35
  <p>
36
- Our method relies on <strong>pairwise comparison</strong>. A powerful language model acts as the judge, scoring a model's response relative to a fixed baseline (GPT-4.1), which is anchored at a score of 5.
 
 
37
  </p>
38
-
39
  <h4>Scoring System</h4>
40
  <p>The judge assigns the model's response an integer score from 0 to 10, where:</p>
41
  <ul>
@@ -43,38 +50,58 @@
43
  <li>A score = 5 indicates the response is <strong>on par</strong> with the baseline.</li>
44
  <li>A score < 5 indicates the response is <strong>inferior</strong> to the baseline.</li>
45
  </ul>
46
-
47
  <h4>Evaluation Dimensions</h4>
48
- <p>To ensure a comprehensive analysis, the final score is informed by a multi-dimensional assessment. The judge evaluates the response across six key criteria:</p>
 
49
  <ol>
50
- <li><strong>Comprehension & Relevance:</strong> How well the response understands the prompt's intent and stays on topic.</li>
51
- <li><strong>Structure & Coherence:</strong> How clear, logical, and well-organized the writing is.</li>
52
- <li><strong>Prose & Style:</strong> The quality of the language, grammar, and adherence to the requested tone.</li>
53
- <li><strong>Creativity & Originality:</strong> The novelty of the ideas and the uniqueness of the perspective.</li>
54
- <li><strong>Depth & Insight:</strong> The level of detail, analysis, and substance provided.</li>
55
- <li><strong>Helpfulness:</strong> How effectively the response fulfills the user's overall goal.</li>
 
 
 
 
 
 
56
  </ol>
57
  </div>
58
  </div>
59
  </section>
60
 
61
- <section class="judge-section">
62
- <h2 class="section-title"><span class="accent-bar"></span>Judge Model</h2>
63
- <div class="judge-toggle">
64
- <button class="judge-btn active" data-judge="o3">O3</button>
65
- <button class="judge-btn" data-judge="gpt5.4">GPT-5.4</button>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
66
  </div>
67
  </section>
68
 
69
- <section class="table-section">
70
- <h2 class="section-title"><span class="accent-bar"></span>All Writing Tasks</h2>
71
  <div id="generalTable" class="table-container">
72
  <!-- General Writing table will be populated here -->
73
  </div>
74
  </section>
75
 
76
- <section class="table-section">
77
- <h2 class="section-title"><span class="accent-bar"></span>Complicated Writing Tasks</h2>
78
  <div id="complicatedTable" class="table-container">
79
  <!-- Complicated Writing table will be populated here -->
80
  </div>
@@ -85,7 +112,8 @@
85
  <div class="citation-content">
86
  <p>
87
  If you use these results, please cite our paper:<br>
88
- <em>"Zhiyin: Exploring the Frontier of Chinese LLM Writing, 2025. https://github.com/zake7749/Chinese-Writing-Bench"</em>
 
89
  </p>
90
  </div>
91
  </section>
@@ -98,4 +126,5 @@
98
  </div>
99
  <script src="script.js"></script>
100
  </body>
101
- </html>
 
 
1
  <!DOCTYPE html>
2
  <html lang="en">
3
+
4
  <head>
5
  <meta charset="UTF-8">
6
  <meta name="viewport" content="width=device-width, initial-scale=1.0">
7
  <title>Zhiyin</title>
8
+ <link rel="stylesheet" href="styles.css?v=3">
9
  <link href="https://fonts.googleapis.com/css2?family=Inter:wght@300;400;500;600;700&display=swap" rel="stylesheet">
10
  <link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.0.0/css/all.min.css">
11
  </head>
12
+
13
  <body>
14
  <div class="container">
15
  <header class="header">
 
20
  <div class="dashboard card">
21
  <section class="overview-section">
22
  <h2 class="section-title">
23
+ <span class="accent-bar"></span>
24
+ Benchmark Overview
25
+ <span class="section-title-spacer"></span>
26
+ <span class="external-links-text">
27
+ <a href="https://github.com/zake7749/Chinese-Writing-Bench" target="_blank" rel="noopener"
28
+ class="external-link-text">GitHub</a>
29
+ <span class="divider">&bull;</span>
30
+ <a href="https://huggingface.co/datasets/zake7749/chinese-writing-benchmark" target="_blank"
31
+ rel="noopener" class="external-link-text">Hugging Face</a>
32
+ </span>
33
  </h2>
34
  <div class="overview-content">
35
  <div class="overview-text">
36
  <p>
37
+ <strong>Zhiyin</strong> is an LLM-as-a-judge benchmark for Chinese writing evaluation,
38
+ featuring 280 test cases across 18 diverse writing tasks in this V1 release.
39
  </p>
40
  <p>
41
+ Our method relies on <strong>pairwise comparison</strong>. A powerful language model acts as
42
+ the judge, scoring a model's response relative to a fixed baseline (GPT-4.1), which is
43
+ anchored at a score of 5.
44
  </p>
45
+
46
  <h4>Scoring System</h4>
47
  <p>The judge assigns the model's response an integer score from 0 to 10, where:</p>
48
  <ul>
 
50
  <li>A score = 5 indicates the response is <strong>on par</strong> with the baseline.</li>
51
  <li>A score < 5 indicates the response is <strong>inferior</strong> to the baseline.</li>
52
  </ul>
53
+
54
  <h4>Evaluation Dimensions</h4>
55
+ <p>To ensure a comprehensive analysis, the final score is informed by a multi-dimensional
56
+ assessment. The judge evaluates the response across six key criteria:</p>
57
  <ol>
58
+ <li><strong>Comprehension & Relevance:</strong> How well the response understands the
59
+ prompt's intent and stays on topic.</li>
60
+ <li><strong>Structure & Coherence:</strong> How clear, logical, and well-organized the
61
+ writing is.</li>
62
+ <li><strong>Prose & Style:</strong> The quality of the language, grammar, and adherence to
63
+ the requested tone.</li>
64
+ <li><strong>Creativity & Originality:</strong> The novelty of the ideas and the uniqueness
65
+ of the perspective.</li>
66
+ <li><strong>Depth & Insight:</strong> The level of detail, analysis, and substance provided.
67
+ </li>
68
+ <li><strong>Helpfulness:</strong> How effectively the response fulfills the user's overall
69
+ goal.</li>
70
  </ol>
71
  </div>
72
  </div>
73
  </section>
74
 
75
+ <section class="dashboard-controls">
76
+ <div class="controls-header">
77
+ <div class="judge-selector">
78
+ <span class="control-label">Judge Model</span>
79
+ <div class="judge-toggle">
80
+ <button class="judge-btn active" data-judge="gpt5.4">GPT-5.4</button>
81
+ <button class="judge-btn" data-judge="o3">O3</button>
82
+ </div>
83
+ </div>
84
+ <div class="search-container">
85
+ <i class="fas fa-search search-icon"></i>
86
+ <input type="text" id="globalSearch" class="search-input" placeholder="Search models...">
87
+ </div>
88
+ </div>
89
+
90
+ <div class="tabs-container">
91
+ <div class="tabs-header">
92
+ <button class="tab-btn active" data-target="generalTableSection">All Writing Tasks</button>
93
+ <button class="tab-btn" data-target="complicatedTableSection">Complicated Writing Tasks</button>
94
+ </div>
95
  </div>
96
  </section>
97
 
98
+ <section class="tab-content active" id="generalTableSection">
 
99
  <div id="generalTable" class="table-container">
100
  <!-- General Writing table will be populated here -->
101
  </div>
102
  </section>
103
 
104
+ <section class="tab-content" id="complicatedTableSection">
 
105
  <div id="complicatedTable" class="table-container">
106
  <!-- Complicated Writing table will be populated here -->
107
  </div>
 
112
  <div class="citation-content">
113
  <p>
114
  If you use these results, please cite our paper:<br>
115
+ <em>"Zhiyin: Exploring the Frontier of Chinese LLM Writing, 2025.
116
+ https://github.com/zake7749/Chinese-Writing-Bench"</em>
117
  </p>
118
  </div>
119
  </section>
 
126
  </div>
127
  <script src="script.js"></script>
128
  </body>
129
+
130
+ </html>
script.js CHANGED
@@ -1,6 +1,7 @@
1
  class LLMBenchmarkDashboard {
2
  constructor() {
3
- this.currentJudge = 'o3';
 
4
  this.judgeData = {
5
  o3: { general: null, complicated: null },
6
  'gpt5.4': { general: null, complicated: null }
@@ -16,6 +17,15 @@ class LLMBenchmarkDashboard {
16
  helpfulness_score: 'Helpfulness',
17
  overall_score: 'Overall'
18
  };
 
 
 
 
 
 
 
 
 
19
  this.judgePaths = {
20
  o3: {
21
  general: 'data/all-scores.json',
@@ -44,7 +54,9 @@ class LLMBenchmarkDashboard {
44
  await Promise.all(promises);
45
  this.renderTable('general');
46
  this.renderTable('complicated');
 
47
  this.setupJudgeToggle();
 
48
  this.showLoading(false);
49
  }
50
 
@@ -85,6 +97,41 @@ class LLMBenchmarkDashboard {
85
  });
86
  }
87
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
88
  renderTable(type) {
89
  const data = type === 'general' ? this.generalData : this.complicatedData;
90
  const sortState = type === 'general' ? this.generalSort : this.complicatedSort;
@@ -105,17 +152,35 @@ class LLMBenchmarkDashboard {
105
  </tr>
106
  </thead>
107
  <tbody>
108
- ${this.getSortedTableData(data, sortState, metrics).map(row => {
109
- const isMonomer = this.modelLinks[row.model];
110
- return `
111
- <tr${isMonomer ? ' class="highlight-row"' : ''}>
112
- <td class="model-cell">${isMonomer ? `<a href="${this.modelLinks[row.model]}" target="_blank" rel="noopener" class="model-link">${row.model}</a>` : row.model}</td>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
113
  ${metrics.map(metric => `
114
- <td class="score-cell">${this.formatScore(row[metric])}</td>
115
  `).join('')}
116
  </tr>
117
  `;
118
- }).join('')}
119
  </tbody>
120
  </table>
121
  `;
@@ -126,10 +191,15 @@ class LLMBenchmarkDashboard {
126
 
127
  getSortedTableData(data, sortState, metrics) {
128
  const models = Object.keys(data);
129
- let tableData = models.map(model => ({
130
- model,
131
- ...data[model]
132
- }));
 
 
 
 
 
133
 
134
  if (sortState.column) {
135
  tableData.sort((a, b) => {
@@ -154,6 +224,31 @@ class LLMBenchmarkDashboard {
154
  return value;
155
  }
156
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
157
  setupTableSorting(type) {
158
  const tableContainer = document.getElementById(type === 'general' ? 'generalTable' : 'complicatedTable');
159
  const headers = tableContainer.querySelectorAll('th.sortable');
 
1
  class LLMBenchmarkDashboard {
2
  constructor() {
3
+ this.currentJudge = 'gpt5.4';
4
+ this.searchQuery = '';
5
  this.judgeData = {
6
  o3: { general: null, complicated: null },
7
  'gpt5.4': { general: null, complicated: null }
 
17
  helpfulness_score: 'Helpfulness',
18
  overall_score: 'Overall'
19
  };
20
+ this.metricDescriptions = {
21
+ comprehension_score: 'How well the response understands the prompt intent and stays on topic.',
22
+ structure_score: 'How clear, logical, and well-organized the writing is.',
23
+ prose_style_score: 'The quality of language, grammar, and adherence to the requested tone.',
24
+ creativity_score: 'The novelty of ideas and uniqueness of perspective.',
25
+ depth_score: 'The level of detail, analysis, and substance provided.',
26
+ helpfulness_score: 'How effectively the response fulfills the user\'s overall goal.',
27
+ overall_score: 'Average score across all six criteria.'
28
+ };
29
  this.judgePaths = {
30
  o3: {
31
  general: 'data/all-scores.json',
 
54
  await Promise.all(promises);
55
  this.renderTable('general');
56
  this.renderTable('complicated');
57
+ this.setupSearch();
58
  this.setupJudgeToggle();
59
+ this.setupTabs();
60
  this.showLoading(false);
61
  }
62
 
 
97
  });
98
  }
99
 
100
+ setupSearch() {
101
+ const searchInput = document.getElementById('globalSearch');
102
+ if (searchInput) {
103
+ searchInput.addEventListener('input', (e) => {
104
+ this.searchQuery = e.target.value.toLowerCase().trim();
105
+ this.renderTable('general');
106
+ this.renderTable('complicated');
107
+ });
108
+ }
109
+ }
110
+
111
+ setupTabs() {
112
+ const tabBtns = document.querySelectorAll('.tab-btn');
113
+ const tabContents = document.querySelectorAll('.tab-content');
114
+
115
+ tabBtns.forEach(btn => {
116
+ btn.addEventListener('click', () => {
117
+ const targetId = btn.dataset.target;
118
+
119
+ // Update active state of buttons
120
+ tabBtns.forEach(b => b.classList.remove('active'));
121
+ btn.classList.add('active');
122
+
123
+ // Update active state of content
124
+ tabContents.forEach(content => {
125
+ if (content.id === targetId) {
126
+ content.classList.add('active');
127
+ } else {
128
+ content.classList.remove('active');
129
+ }
130
+ });
131
+ });
132
+ });
133
+ }
134
+
135
  renderTable(type) {
136
  const data = type === 'general' ? this.generalData : this.complicatedData;
137
  const sortState = type === 'general' ? this.generalSort : this.complicatedSort;
 
152
  </tr>
153
  </thead>
154
  <tbody>
155
+ ${this.getSortedTableData(data, sortState, metrics).map((row, index) => {
156
+ const isMonomer = this.modelLinks[row.model];
157
+ const isBaseline = row.model === 'gpt-4.1-2025-04-14';
158
+ let rowClass = '';
159
+ if (isMonomer) rowClass = 'highlight-row';
160
+ if (isBaseline) rowClass = 'baseline-row';
161
+
162
+ // Generate rank medal for the sorted column if it's a metric
163
+ let medalHtml = '';
164
+ if (sortState.column !== 'model' && ['desc', 'asc'].includes(sortState.direction)) {
165
+ const rank = sortState.direction === 'desc' ? index + 1 : Object.keys(data).length - index;
166
+ if (rank === 1) medalHtml = '<span class="rank-medal">🥇</span>';
167
+ else if (rank === 2) medalHtml = '<span class="rank-medal">🥈</span>';
168
+ else if (rank === 3) medalHtml = '<span class="rank-medal">🥉</span>';
169
+ }
170
+
171
+ return `
172
+ <tr class="${rowClass}">
173
+ <td class="model-cell">
174
+ ${medalHtml}
175
+ ${isMonomer ? `<a href="${this.modelLinks[row.model]}" target="_blank" rel="noopener" class="model-link">${row.model}</a>` : row.model}
176
+ ${isBaseline ? ' (Baseline)' : ''}
177
+ </td>
178
  ${metrics.map(metric => `
179
+ <td class="score-cell" style="background-color: ${this.getHeatmapColor(row[metric], type, metric)}">${this.formatScore(row[metric])}</td>
180
  `).join('')}
181
  </tr>
182
  `;
183
+ }).join('')}
184
  </tbody>
185
  </table>
186
  `;
 
191
 
192
  getSortedTableData(data, sortState, metrics) {
193
  const models = Object.keys(data);
194
+ let tableData = models
195
+ .filter(model => {
196
+ if (!this.searchQuery) return true;
197
+ return model.toLowerCase().includes(this.searchQuery);
198
+ })
199
+ .map(model => ({
200
+ model,
201
+ ...data[model]
202
+ }));
203
 
204
  if (sortState.column) {
205
  tableData.sort((a, b) => {
 
224
  return value;
225
  }
226
 
227
+ getHeatmapColor(val, type, metric) {
228
+ if (val === null || val === undefined) return 'transparent';
229
+
230
+ // Use 5.0 as the neutral midpoint since that is the baseline
231
+ // Less than 5.0: increasingly red. Greater than 5.0: increasingly green.
232
+ // Saturate the color at 3.5 (low end) and 8.5 (high end), the typical min/max
233
+ const minVal = 3.5;
234
+ const maxVal = 8.5;
235
+ const baseline = 5.0;
236
+
237
+ let color = 'transparent';
238
+ if (val < baseline) {
239
+ // Bad score: red hue
240
+ const intensity = Math.min(1, (baseline - val) / (baseline - minVal));
241
+ // A full-strength red would be rgba(239, 68, 68, intensity).
242
+ // But we want extremely light background colors so text remains readable
243
+ color = `rgba(239, 68, 68, ${intensity * 0.2})`;
244
+ } else if (val > baseline) {
245
+ // Good score: green hue
246
+ const intensity = Math.min(1, (val - baseline) / (maxVal - baseline));
247
+ color = `rgba(16, 185, 129, ${intensity * 0.2})`;
248
+ }
249
+ return color;
250
+ }
251
+
252
  setupTableSorting(type) {
253
  const tableContainer = document.getElementById(type === 'general' ? 'generalTable' : 'complicatedTable');
254
  const headers = tableContainer.querySelectorAll('th.sortable');
styles.css CHANGED
@@ -6,25 +6,25 @@
6
 
7
  body {
8
  font-family: 'Inter', sans-serif;
9
- background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
10
  min-height: 100vh;
11
- color: #333;
12
  }
13
 
14
  .container {
15
- max-width: 1400px;
 
16
  margin: 0 auto;
17
- padding: 20px;
18
  }
19
 
20
  .header {
21
  text-align: center;
22
  margin-bottom: 24px;
23
- color: white;
24
  padding: 48px 0 36px 0;
25
  position: relative;
26
  background: none;
27
- box-shadow: 0 2px 12px 0 rgba(76, 75, 162, 0.06);
28
  }
29
 
30
  .header h1 {
@@ -40,19 +40,18 @@ body {
40
 
41
  .header h1 .fas {
42
  font-size: 2.2rem;
43
- background: linear-gradient(135deg, #764ba2 60%, #667eea 100%);
44
  color: white;
45
  border-radius: 50%;
46
  padding: 16px 18px;
47
- box-shadow: 0 2px 12px #764ba244;
48
  vertical-align: middle;
49
  }
50
 
51
  .header p {
52
  font-size: 1.25rem;
53
  font-weight: 400;
54
- color: #ece6fa;
55
- opacity: 0.92;
56
  margin-top: 0;
57
  margin-bottom: 0;
58
  letter-spacing: 0.04em;
@@ -61,8 +60,8 @@ body {
61
  .card {
62
  background: white;
63
  border-radius: 16px;
64
- box-shadow: 0 6px 32px 0 rgba(76, 75, 162, 0.08);
65
- border: 1.5px solid #ece6fa;
66
  padding: 32px 32px 24px 32px;
67
  }
68
 
@@ -71,7 +70,7 @@ body {
71
  align-items: center;
72
  font-size: 1.35rem;
73
  font-weight: 700;
74
- color: #4b2996;
75
  margin-bottom: 28px;
76
  letter-spacing: 0.5px;
77
  text-transform: none;
@@ -80,7 +79,7 @@ body {
80
  padding-left: 0;
81
  position: relative;
82
  padding-bottom: 8px;
83
- border-bottom: 2px solid #ecf0f1;
84
  }
85
 
86
  .section-title-spacer {
@@ -89,23 +88,24 @@ body {
89
 
90
  .accent-bar {
91
  display: inline-block;
92
- width: 7px;
93
- height: 28px;
94
  border-radius: 4px;
95
- background: linear-gradient(180deg, #764ba2 0%, #667eea 100%);
96
- margin-right: 16px;
97
  }
98
 
99
- .overview-section, .table-section, .citation-section {
 
 
100
  margin-bottom: 56px;
101
  }
102
 
103
  .overview-content {
104
- font-size: 1.08rem;
105
- color: #3a3550;
106
  line-height: 1.7;
107
- padding-left: 2px;
108
- border-left: 4px solid #ece6fa;
109
  padding-left: 24px;
110
  background: none;
111
  }
@@ -116,9 +116,9 @@ body {
116
  }
117
 
118
  .overview-content h4 {
119
- font-size: 1.13rem;
120
  font-weight: 700;
121
- color: #764ba2;
122
  margin-top: 28px;
123
  margin-bottom: 10px;
124
  letter-spacing: 0.02em;
@@ -143,103 +143,123 @@ body {
143
  }
144
 
145
  .citation-content {
146
- font-size: 1.08rem;
147
- color: #3a3550;
148
  line-height: 1.7;
149
  padding-left: 2px;
150
  }
151
 
152
  .citation-content em {
153
- color: #764ba2;
154
- font-size: 1rem;
155
  font-style: italic;
156
- background: #f3f0fa;
157
- padding: 2px 6px;
158
- border-radius: 4px;
159
  }
160
 
161
  .table-container {
162
  overflow-x: auto;
163
- background: #f8f9fa;
164
  border-radius: 12px;
165
- padding: 20px;
166
- max-height: 480px;
167
  overflow-y: auto;
 
 
 
168
  }
169
 
170
  table {
171
  width: 100%;
 
172
  border-collapse: collapse;
173
  background: white;
174
- border-radius: 8px;
175
- overflow: hidden;
176
- box-shadow: 0 2px 8px rgba(0,0,0,0.1);
177
  }
178
 
179
- th, td {
180
- padding: 14px 18px;
 
181
  border-bottom: 1px solid #e9ecef;
182
  }
183
 
184
  th {
185
- background: #764ba2;
186
- color: white;
187
- font-weight: 700;
188
- font-size: 1rem;
189
  cursor: pointer;
190
- transition: background-color 0.3s ease;
191
  position: sticky;
192
  top: 0;
193
  z-index: 2;
194
- text-transform: none;
195
- letter-spacing: 0.1em;
196
- padding-right: 32px;
 
 
197
  }
198
 
199
  th:hover {
200
- background: #5e388e;
201
  }
202
 
203
  th.sortable::after {
204
  content: '';
205
  display: inline-block;
206
  position: absolute;
207
- right: 12px;
208
  top: 50%;
209
  transform: translateY(-50%);
210
- width: 16px;
211
- height: 16px;
212
  background-repeat: no-repeat;
213
  background-position: center;
214
- background-size: 16px 16px;
215
- opacity: 0.7;
216
  /* Default: double chevron (unsorted) */
217
- background-image: url('data:image/svg+xml;utf8,<svg width="16" height="16" fill="none" xmlns="http://www.w3.org/2000/svg"><path d="M4 6l4-4 4 4" stroke="white" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"/><path d="M4 10l4 4 4-4" stroke="white" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"/></svg>');
218
  }
219
 
220
  th.sort-asc::after {
221
  /* Up chevron */
222
- background-image: url('data:image/svg+xml;utf8,<svg width="16" height="16" fill="none" xmlns="http://www.w3.org/2000/svg"><path d="M4 10l4-4 4 4" stroke="white" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"/></svg>');
223
  opacity: 1;
224
  }
225
 
226
  th.sort-desc::after {
227
  /* Down chevron */
228
- background-image: url('data:image/svg+xml;utf8,<svg width="16" height="16" fill="none" xmlns="http://www.w3.org/2000/svg"><path d="M4 6l4 4 4-4" stroke="white" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"/></svg>');
229
  opacity: 1;
230
  }
231
 
232
- .model-cell, th:first-child {
 
233
  text-align: left;
 
 
 
 
 
 
 
 
234
  }
235
 
236
- td:not(.model-cell), th:not(:first-child) {
237
- text-align: right;
 
 
 
 
 
 
238
  }
239
 
240
  td {
241
- font-size: 1rem;
242
- color: #4b2996;
243
  font-weight: 500;
244
  }
245
 
@@ -248,12 +268,14 @@ tr:hover {
248
  }
249
 
250
  .score-cell {
251
- font-weight: 700;
252
- color: #764ba2;
 
 
253
  }
254
 
255
  .model-cell {
256
- font-weight: 700;
257
  color: #495057;
258
  }
259
 
@@ -263,7 +285,7 @@ tr:hover {
263
  left: 0;
264
  width: 100%;
265
  height: 100%;
266
- background: rgba(0,0,0,0.8);
267
  display: flex;
268
  flex-direction: column;
269
  justify-content: center;
@@ -287,8 +309,13 @@ tr:hover {
287
  }
288
 
289
  @keyframes spin {
290
- 0% { transform: rotate(0deg); }
291
- 100% { transform: rotate(360deg); }
 
 
 
 
 
292
  }
293
 
294
  .no-data {
@@ -308,64 +335,153 @@ tr:hover {
308
  .container {
309
  padding: 10px;
310
  }
 
311
  .header h1 {
312
  font-size: 2rem;
313
  }
 
314
  .dashboard.card {
315
  padding: 12px 4px 12px 4px;
316
  }
 
317
  .card {
318
  padding: 12px 4px 12px 4px;
319
  }
 
320
  .table-container {
321
  padding: 10px;
322
  }
323
- th, td {
 
 
324
  padding: 8px 8px;
325
  font-size: 0.92rem;
326
  }
327
- .overview-section, .table-section, .citation-section {
 
 
 
328
  margin-bottom: 36px;
329
  }
330
  }
331
 
332
- .judge-section {
333
- margin-bottom: 36px;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
334
  }
335
 
336
  .judge-toggle {
337
  display: flex;
338
- gap: 0;
 
339
  border-radius: 10px;
340
- overflow: hidden;
341
- border: 2px solid #764ba2;
342
- width: fit-content;
343
  }
344
 
345
- .judge-btn {
346
- padding: 10px 28px;
347
- font-size: 1rem;
348
  font-weight: 700;
 
 
 
 
 
 
 
349
  font-family: 'Inter', sans-serif;
350
  cursor: pointer;
351
  border: none;
352
- background: white;
353
- color: #764ba2;
354
- transition: background 0.2s, color 0.2s;
 
355
  letter-spacing: 0.03em;
356
  }
357
 
358
- .judge-btn:not(:last-child) {
359
- border-right: 2px solid #764ba2;
360
- }
361
-
362
  .judge-btn:hover {
363
- background: #f3eaff;
364
  }
365
 
366
  .judge-btn.active {
367
- background: linear-gradient(135deg, #764ba2 0%, #667eea 100%);
368
- color: white;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
369
  }
370
 
371
  .highlight-row {
@@ -373,15 +489,27 @@ tr:hover {
373
  box-shadow: 0 2px 8px 0 rgba(118, 75, 162, 0.08);
374
  }
375
 
 
 
 
 
 
 
 
 
 
 
 
 
376
  .model-link {
377
- color: #764ba2;
378
  font-weight: 700;
379
  text-decoration: none;
380
  transition: color 0.2s;
381
  }
382
 
383
  .model-link:hover {
384
- color: #4b2996;
385
  text-decoration: none;
386
  }
387
 
@@ -455,4 +583,53 @@ tr:hover {
455
  vertical-align: middle;
456
  line-height: 0.5;
457
  letter-spacing: 0;
458
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6
 
7
  body {
8
  font-family: 'Inter', sans-serif;
9
+ background: #f8fafc;
10
  min-height: 100vh;
11
+ color: #334155;
12
  }
13
 
14
  .container {
15
+ max-width: 1440px;
16
+ width: 95%;
17
  margin: 0 auto;
18
+ padding: 20px 40px;
19
  }
20
 
21
  .header {
22
  text-align: center;
23
  margin-bottom: 24px;
24
+ color: #0f172a;
25
  padding: 48px 0 36px 0;
26
  position: relative;
27
  background: none;
 
28
  }
29
 
30
  .header h1 {
 
40
 
41
  .header h1 .fas {
42
  font-size: 2.2rem;
43
+ background: linear-gradient(135deg, #6366f1 0%, #a855f7 100%);
44
  color: white;
45
  border-radius: 50%;
46
  padding: 16px 18px;
47
+ box-shadow: 0 4px 14px rgba(99, 102, 241, 0.3);
48
  vertical-align: middle;
49
  }
50
 
51
  .header p {
52
  font-size: 1.25rem;
53
  font-weight: 400;
54
+ color: #64748b;
 
55
  margin-top: 0;
56
  margin-bottom: 0;
57
  letter-spacing: 0.04em;
 
60
  .card {
61
  background: white;
62
  border-radius: 16px;
63
+ box-shadow: 0 4px 6px -1px rgba(0, 0, 0, 0.05), 0 2px 4px -2px rgba(0, 0, 0, 0.05);
64
+ border: 1px solid #e2e8f0;
65
  padding: 32px 32px 24px 32px;
66
  }
67
 
 
70
  align-items: center;
71
  font-size: 1.35rem;
72
  font-weight: 700;
73
+ color: #1e293b;
74
  margin-bottom: 28px;
75
  letter-spacing: 0.5px;
76
  text-transform: none;
 
79
  padding-left: 0;
80
  position: relative;
81
  padding-bottom: 8px;
82
+ border-bottom: 1px solid #e2e8f0;
83
  }
84
 
85
  .section-title-spacer {
 
88
 
89
  .accent-bar {
90
  display: inline-block;
91
+ width: 6px;
92
+ height: 24px;
93
  border-radius: 4px;
94
+ background: linear-gradient(180deg, #6366f1 0%, #a855f7 100%);
95
+ margin-right: 14px;
96
  }
97
 
98
+ .overview-section,
99
+ .table-section,
100
+ .citation-section {
101
  margin-bottom: 56px;
102
  }
103
 
104
  .overview-content {
105
+ font-size: 1.05rem;
106
+ color: #475569;
107
  line-height: 1.7;
108
+ border-left: 3px solid #e2e8f0;
 
109
  padding-left: 24px;
110
  background: none;
111
  }
 
116
  }
117
 
118
  .overview-content h4 {
119
+ font-size: 1.1rem;
120
  font-weight: 700;
121
+ color: #0f172a;
122
  margin-top: 28px;
123
  margin-bottom: 10px;
124
  letter-spacing: 0.02em;
 
143
  }
144
 
145
  .citation-content {
146
+ font-size: 1.05rem;
147
+ color: #475569;
148
  line-height: 1.7;
149
  padding-left: 2px;
150
  }
151
 
152
  .citation-content em {
153
+ color: #334155;
154
+ font-size: 0.95rem;
155
  font-style: italic;
156
+ background: #f1f5f9;
157
+ padding: 4px 8px;
158
+ border-radius: 6px;
159
  }
160
 
161
  .table-container {
162
  overflow-x: auto;
163
+ background: white;
164
  border-radius: 12px;
165
+ padding: 0;
166
+ max-height: 580px;
167
  overflow-y: auto;
168
+ position: relative;
169
+ border: 1px solid #e9ecef;
170
+ box-shadow: 0 4px 12px rgba(0, 0, 0, 0.05);
171
  }
172
 
173
  table {
174
  width: 100%;
175
+ min-width: 1250px;
176
  border-collapse: collapse;
177
  background: white;
178
+ table-layout: fixed;
 
 
179
  }
180
 
181
+ th,
182
+ td {
183
+ padding: 12px 6px;
184
  border-bottom: 1px solid #e9ecef;
185
  }
186
 
187
  th {
188
+ background: #f8fafc;
189
+ color: #334155;
190
+ font-weight: 600;
191
+ font-size: 0.9rem;
192
  cursor: pointer;
193
+ transition: background-color 0.2s ease;
194
  position: sticky;
195
  top: 0;
196
  z-index: 2;
197
+ letter-spacing: 0.02em;
198
+ padding-right: 18px;
199
+ white-space: nowrap;
200
+ overflow: hidden;
201
+ border-bottom: 2px solid #e2e8f0;
202
  }
203
 
204
  th:hover {
205
+ background: #f1f5f9;
206
  }
207
 
208
  th.sortable::after {
209
  content: '';
210
  display: inline-block;
211
  position: absolute;
212
+ right: 4px;
213
  top: 50%;
214
  transform: translateY(-50%);
215
+ width: 12px;
216
+ height: 12px;
217
  background-repeat: no-repeat;
218
  background-position: center;
219
+ background-size: 12px 12px;
220
+ opacity: 0.4;
221
  /* Default: double chevron (unsorted) */
222
+ background-image: url('data:image/svg+xml;utf8,<svg width="16" height="16" fill="none" xmlns="http://www.w3.org/2000/svg"><path d="M4 6l4-4 4 4" stroke="%2364748b" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"/><path d="M4 10l4 4 4-4" stroke="%2364748b" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"/></svg>');
223
  }
224
 
225
  th.sort-asc::after {
226
  /* Up chevron */
227
+ background-image: url('data:image/svg+xml;utf8,<svg width="16" height="16" fill="none" xmlns="http://www.w3.org/2000/svg"><path d="M4 10l4-4 4 4" stroke="%23334155" stroke-width="2.5" stroke-linecap="round" stroke-linejoin="round"/></svg>');
228
  opacity: 1;
229
  }
230
 
231
  th.sort-desc::after {
232
  /* Down chevron */
233
+ background-image: url('data:image/svg+xml;utf8,<svg width="16" height="16" fill="none" xmlns="http://www.w3.org/2000/svg"><path d="M4 6l4 4 4-4" stroke="%23334155" stroke-width="2.5" stroke-linecap="round" stroke-linejoin="round"/></svg>');
234
  opacity: 1;
235
  }
236
 
237
+ .model-cell,
238
+ th:first-child {
239
  text-align: left;
240
+ position: sticky;
241
+ left: 0;
242
+ background: white;
243
+ z-index: 1;
244
+ border-right: 1px solid #e9ecef;
245
+ width: 250px;
246
+ /* Fixed width for model column */
247
+ padding-left: 20px;
248
  }
249
 
250
+ th:first-child {
251
+ background: #f8fafc;
252
+ z-index: 3;
253
+ }
254
+
255
+ td:not(.model-cell),
256
+ th:not(:first-child) {
257
+ text-align: center;
258
  }
259
 
260
  td {
261
+ font-size: 0.90rem;
262
+ color: #334155;
263
  font-weight: 500;
264
  }
265
 
 
268
  }
269
 
270
  .score-cell {
271
+ font-weight: 600;
272
+ color: #495057;
273
+ /* We'll let heatmap coloring handle the exact shade */
274
+ transition: background-color 0.3s;
275
  }
276
 
277
  .model-cell {
278
+ font-weight: 600;
279
  color: #495057;
280
  }
281
 
 
285
  left: 0;
286
  width: 100%;
287
  height: 100%;
288
+ background: rgba(0, 0, 0, 0.8);
289
  display: flex;
290
  flex-direction: column;
291
  justify-content: center;
 
309
  }
310
 
311
  @keyframes spin {
312
+ 0% {
313
+ transform: rotate(0deg);
314
+ }
315
+
316
+ 100% {
317
+ transform: rotate(360deg);
318
+ }
319
  }
320
 
321
  .no-data {
 
335
  .container {
336
  padding: 10px;
337
  }
338
+
339
  .header h1 {
340
  font-size: 2rem;
341
  }
342
+
343
  .dashboard.card {
344
  padding: 12px 4px 12px 4px;
345
  }
346
+
347
  .card {
348
  padding: 12px 4px 12px 4px;
349
  }
350
+
351
  .table-container {
352
  padding: 10px;
353
  }
354
+
355
+ th,
356
+ td {
357
  padding: 8px 8px;
358
  font-size: 0.92rem;
359
  }
360
+
361
+ .overview-section,
362
+ .table-section,
363
+ .citation-section {
364
  margin-bottom: 36px;
365
  }
366
  }
367
 
368
+ .dashboard-controls {
369
+ margin-bottom: 20px;
370
+ display: flex;
371
+ flex-direction: column;
372
+ gap: 20px;
373
+ }
374
+
375
+ .controls-header {
376
+ display: flex;
377
+ justify-content: space-between;
378
+ align-items: center;
379
+ flex-wrap: wrap;
380
+ gap: 16px;
381
+ }
382
+
383
+ .judge-selector {
384
+ display: flex;
385
+ align-items: center;
386
+ gap: 16px;
387
  }
388
 
389
  .judge-toggle {
390
  display: flex;
391
+ align-items: center;
392
+ background: #f1f3f5;
393
  border-radius: 10px;
394
+ padding: 4px;
395
+ border: 1px solid #e9ecef;
 
396
  }
397
 
398
+ .control-label {
399
+ font-size: 1.05rem;
 
400
  font-weight: 700;
401
+ color: #1e293b;
402
+ }
403
+
404
+ .judge-btn {
405
+ padding: 8px 24px;
406
+ font-size: 0.95rem;
407
+ font-weight: 600;
408
  font-family: 'Inter', sans-serif;
409
  cursor: pointer;
410
  border: none;
411
+ background: transparent;
412
+ color: #64748b;
413
+ border-radius: 8px;
414
+ transition: all 0.2s cubic-bezier(0.4, 0, 0.2, 1);
415
  letter-spacing: 0.03em;
416
  }
417
 
 
 
 
 
418
  .judge-btn:hover {
419
+ color: #0f172a;
420
  }
421
 
422
  .judge-btn.active {
423
+ background: white;
424
+ color: #0f172a;
425
+ box-shadow: 0 1px 3px rgba(0, 0, 0, 0.1);
426
+ }
427
+
428
+ /* --- Tabs --- */
429
+ .tabs-container {
430
+ display: flex;
431
+ border-bottom: 2px solid #e9ecef;
432
+ margin-bottom: 16px;
433
+ }
434
+
435
+ .tabs-header {
436
+ display: flex;
437
+ gap: 24px;
438
+ margin-bottom: -2px;
439
+ }
440
+
441
+ .tab-btn {
442
+ padding: 12px 4px;
443
+ font-size: 1.05rem;
444
+ font-weight: 600;
445
+ font-family: 'Inter', sans-serif;
446
+ cursor: pointer;
447
+ border: none;
448
+ background: transparent;
449
+ color: #64748b;
450
+ border-bottom: 3px solid transparent;
451
+ transition: all 0.2s;
452
+ letter-spacing: 0.02em;
453
+ }
454
+
455
+ .tab-btn:hover {
456
+ color: #0f172a;
457
+ }
458
+
459
+ .tab-btn.active {
460
+ color: #0f172a;
461
+ border-bottom-color: #6366f1;
462
+ font-weight: 700;
463
+ }
464
+
465
+ .tab-content {
466
+ display: none;
467
+ animation: fadeIn 0.3s ease-in-out;
468
+ margin-bottom: 56px;
469
+ }
470
+
471
+ .tab-content.active {
472
+ display: block;
473
+ }
474
+
475
+ @keyframes fadeIn {
476
+ from {
477
+ opacity: 0;
478
+ transform: translateY(5px);
479
+ }
480
+
481
+ to {
482
+ opacity: 1;
483
+ transform: translateY(0);
484
+ }
485
  }
486
 
487
  .highlight-row {
 
489
  box-shadow: 0 2px 8px 0 rgba(118, 75, 162, 0.08);
490
  }
491
 
492
+ .baseline-row {
493
+ background: #fefce8 !important;
494
+ }
495
+
496
+ .baseline-row .model-cell {
497
+ color: #ca8a04;
498
+ }
499
+
500
+ .baseline-row td {
501
+ font-style: italic;
502
+ }
503
+
504
  .model-link {
505
+ color: #6366f1;
506
  font-weight: 700;
507
  text-decoration: none;
508
  transition: color 0.2s;
509
  }
510
 
511
  .model-link:hover {
512
+ color: #4f46e5;
513
  text-decoration: none;
514
  }
515
 
 
583
  vertical-align: middle;
584
  line-height: 0.5;
585
  letter-spacing: 0;
586
+ }
587
+
588
+ /* --- Search Bar --- */
589
+ .search-container {
590
+ position: relative;
591
+ width: 250px;
592
+ flex-shrink: 0;
593
+ }
594
+
595
+ .search-input {
596
+ width: 100%;
597
+ padding: 10px 16px 10px 38px;
598
+ border: 1.5px solid #e2e8f0;
599
+ border-radius: 8px;
600
+ font-size: 0.95rem;
601
+ font-family: 'Inter', sans-serif;
602
+ color: #334155;
603
+ transition: border-color 0.2s, box-shadow 0.2s;
604
+ background: #f8fafc;
605
+ }
606
+
607
+ .search-input:focus {
608
+ outline: none;
609
+ border-color: #6366f1;
610
+ box-shadow: 0 0 0 3px rgba(99, 102, 241, 0.15);
611
+ background: white;
612
+ }
613
+
614
+ .search-icon {
615
+ position: absolute;
616
+ left: 14px;
617
+ top: 50%;
618
+ transform: translateY(-50%);
619
+ color: #9ba1a6;
620
+ font-size: 0.9rem;
621
+ }
622
+
623
+ /* --- Rank Medals --- */
624
+ .rank-medal {
625
+ font-size: 1.1rem;
626
+ margin-right: 6px;
627
+ vertical-align: middle;
628
+ }
629
+
630
+ /* Ensure data cells don't overflow */
631
+ .score-cell {
632
+ font-weight: 600;
633
+ color: #334155;
634
+ transition: background-color 0.3s;
635
+ }