Jongyoon Song committed on
Commit
3ae20b5
·
1 Parent(s): a452b10

Add Dhanishtha-2.0 Preview results

Browse files
src/data/open/length_data.json CHANGED
@@ -747,6 +747,74 @@
747
  "Med Resp": 1623.0
748
  }
749
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
750
  "GPT-5 (Reasoning: medium)": {
751
  "Overall": {
752
  "Min": -10,
 
747
  "Med Resp": 1623.0
748
  }
749
  },
750
+ "Dhanishtha-2.0 Preview": {
751
+ "Overall": {
752
+ "Min": 93,
753
+ "Max": 6076,
754
+ "Med": 520.0,
755
+ "Med Resp": 356.0
756
+ },
757
+ "Content Generation": {
758
+ "Min": 102,
759
+ "Max": 3978,
760
+ "Med": 589.0,
761
+ "Med Resp": 439.0
762
+ },
763
+ "Editing": {
764
+ "Min": 116,
765
+ "Max": 1716,
766
+ "Med": 437.5,
767
+ "Med Resp": 301.5
768
+ },
769
+ "Data Analysis": {
770
+ "Min": 116,
771
+ "Max": 4070,
772
+ "Med": 470.0,
773
+ "Med Resp": 288.0
774
+ },
775
+ "Reasoning": {
776
+ "Min": 182,
777
+ "Max": 2451,
778
+ "Med": 625.0,
779
+ "Med Resp": 366.0
780
+ },
781
+ "Hallucination": {
782
+ "Min": 160,
783
+ "Max": 4068,
784
+ "Med": 494.0,
785
+ "Med Resp": 318.5
786
+ },
787
+ "Safety": {
788
+ "Min": 121,
789
+ "Max": 1470,
790
+ "Med": 381.0,
791
+ "Med Resp": 236.0
792
+ },
793
+ "Repetition": {
794
+ "Min": 240,
795
+ "Max": 3982,
796
+ "Med": 576.5,
797
+ "Med Resp": 384.5
798
+ },
799
+ "Summarization": {
800
+ "Min": 93,
801
+ "Max": 2578,
802
+ "Med": 385.0,
803
+ "Med Resp": 289.0
804
+ },
805
+ "Translation": {
806
+ "Min": 107,
807
+ "Max": 3331,
808
+ "Med": 426.0,
809
+ "Med Resp": 331.5
810
+ },
811
+ "Multi-Turn": {
812
+ "Min": 362,
813
+ "Max": 6076,
814
+ "Med": 1462.0,
815
+ "Med Resp": 1095.0
816
+ }
817
+ },
818
  "GPT-5 (Reasoning: medium)": {
819
  "Overall": {
820
  "Min": -10,
src/data/open/stats.csv CHANGED
@@ -59,6 +59,8 @@ top-p: 0.95" "Exaone" "1274.5" "503.0" "40.64476558326666" "52.11687910556793" "
59
  top-p: 0.95" "Apriel" "2238.0" "375.0" "299.8162105011457" "379.46853709220886" "14.66275339770088" "15.0" "Open" "Think" "On" "31.92" "44.25" "26.56" "47.41" "59.09" "22.99" "37.19" "20.0" "26.98" "20.22" "10.07"
60
  "HyperCLOVAX SEED Think 14B (think)" "https://huggingface.co/naver-hyperclovax/HyperCLOVAX-SEED-Think-14B" "temperature: 0.5
61
  top-p: 0.6" "HCX" "1444.0" "382.5" "16.12651202553951" "24.703290462493896" "83.75171982150616" "14.7" "Open" "Hybrid" "On" "31.84" "35.0" "26.56" "53.78" "58.68" "27.59" "26.45" "17.14" "29.76" "17.13" "20.47"
 
 
62
  "ERNIE 4.5 21B A3B Thinking" "https://huggingface.co/baidu/ERNIE-4.5-21B-A3B-Thinking" "temperature: 0.6
63
  top-p: 0.95" "ERNIE" "1637.0" "541.0" "48.24206436969081" "56.95321476459503" "78.52955859303597" "21.0" "Open" "Think" "On" "25.32" "27.25" "20.31" "42.23" "49.59" "23.56" "31.4" "17.14" "28.17" "7.3" "13.76"
64
  "Solar Pro Preview (top_p:0.95, temp: 0.7)" "https://huggingface.co/upstage/solar-pro-preview-instruct" "temperature: 0.7
 
59
  top-p: 0.95" "Apriel" "2238.0" "375.0" "299.8162105011457" "379.46853709220886" "14.66275339770088" "15.0" "Open" "Think" "On" "31.92" "44.25" "26.56" "47.41" "59.09" "22.99" "37.19" "20.0" "26.98" "20.22" "10.07"
60
  "HyperCLOVAX SEED Think 14B (think)" "https://huggingface.co/naver-hyperclovax/HyperCLOVAX-SEED-Think-14B" "temperature: 0.5
61
  top-p: 0.6" "HCX" "1444.0" "382.5" "16.12651202553951" "24.703290462493896" "83.75171982150616" "14.7" "Open" "Hybrid" "On" "31.84" "35.0" "26.56" "53.78" "58.68" "27.59" "26.45" "17.14" "29.76" "17.13" "20.47"
62
+ "Dhanishtha-2.0 Preview" "https://huggingface.co/HelpingAI/Dhanishtha-2.0-preview" "temperature: 0.7
63
+ top-p: 0.9" "HelpingAI" "520.0" "356.0" "4.368606805801392" "35.15699875354767" "17.75738514863349" "14.8" "Open" "Think" "On" "25.81" "28.25" "19.38" "30.28" "33.47" "43.1" "47.93" "20.0" "31.75" "12.08" "13.09"
64
  "ERNIE 4.5 21B A3B Thinking" "https://huggingface.co/baidu/ERNIE-4.5-21B-A3B-Thinking" "temperature: 0.6
65
  top-p: 0.95" "ERNIE" "1637.0" "541.0" "48.24206436969081" "56.95321476459503" "78.52955859303597" "21.0" "Open" "Think" "On" "25.32" "27.25" "20.31" "42.23" "49.59" "23.56" "31.4" "17.14" "28.17" "7.3" "13.76"
66
  "Solar Pro Preview (top_p:0.95, temp: 0.7)" "https://huggingface.co/upstage/solar-pro-preview-instruct" "temperature: 0.7
src/data/open/stats_lang.csv CHANGED
@@ -59,6 +59,8 @@ top-p: 0.95" "Exaone" "1274.5" "503.0" "40.64476558326666" "52.11687910556793" "
59
  top-p: 0.95" "Apriel" "2238.0" "375.0" "299.8162105011457" "379.46853709220886" "14.66275339770088" "15.0" "Open" "Think" "On" "31.92" "23.61" "39.72" "30.72" "38.41" "24.44" "40.88" "37.99" "32.43" "32.61" "22.95" "28.65" "31.71"
60
  "HyperCLOVAX SEED Think 14B (think)" "https://huggingface.co/naver-hyperclovax/HyperCLOVAX-SEED-Think-14B" "temperature: 0.5
61
  top-p: 0.6" "HCX" "1444.0" "382.5" "16.12651202553951" "24.703290462493896" "83.75171982150616" "14.7" "Open" "Hybrid" "On" "31.84" "32.22" "37.22" "31.93" "38.41" "27.78" "32.6" "30.17" "29.19" "32.07" "33.33" "25.28" "26.22"
 
 
62
  "ERNIE 4.5 21B A3B Thinking" "https://huggingface.co/baidu/ERNIE-4.5-21B-A3B-Thinking" "temperature: 0.6
63
  top-p: 0.95" "ERNIE" "1637.0" "541.0" "48.24206436969081" "56.95321476459503" "78.52955859303597" "21.0" "Open" "Think" "On" "25.32" "17.5" "31.11" "18.67" "39.02" "23.33" "24.31" "24.58" "26.49" "24.46" "30.6" "19.1" "27.44"
64
  "Solar Pro Preview (top_p:0.95, temp: 0.7)" "https://huggingface.co/upstage/solar-pro-preview-instruct" "temperature: 0.7
 
59
  top-p: 0.95" "Apriel" "2238.0" "375.0" "299.8162105011457" "379.46853709220886" "14.66275339770088" "15.0" "Open" "Think" "On" "31.92" "23.61" "39.72" "30.72" "38.41" "24.44" "40.88" "37.99" "32.43" "32.61" "22.95" "28.65" "31.71"
60
  "HyperCLOVAX SEED Think 14B (think)" "https://huggingface.co/naver-hyperclovax/HyperCLOVAX-SEED-Think-14B" "temperature: 0.5
61
  top-p: 0.6" "HCX" "1444.0" "382.5" "16.12651202553951" "24.703290462493896" "83.75171982150616" "14.7" "Open" "Hybrid" "On" "31.84" "32.22" "37.22" "31.93" "38.41" "27.78" "32.6" "30.17" "29.19" "32.07" "33.33" "25.28" "26.22"
62
+ "Dhanishtha-2.0 Preview" "https://huggingface.co/HelpingAI/Dhanishtha-2.0-preview" "temperature: 0.7
63
+ top-p: 0.9" "HelpingAI" "520.0" "356.0" "4.368606805801392" "35.15699875354767" "17.75738514863349" "14.8" "Open" "Think" "On" "25.81" "23.33" "27.22" "30.12" "32.32" "20.56" "20.99" "26.26" "25.95" "25.54" "30.6" "23.6" "25.0"
64
  "ERNIE 4.5 21B A3B Thinking" "https://huggingface.co/baidu/ERNIE-4.5-21B-A3B-Thinking" "temperature: 0.6
65
  top-p: 0.95" "ERNIE" "1637.0" "541.0" "48.24206436969081" "56.95321476459503" "78.52955859303597" "21.0" "Open" "Think" "On" "25.32" "17.5" "31.11" "18.67" "39.02" "23.33" "24.31" "24.58" "26.49" "24.46" "30.6" "19.1" "27.44"
66
  "Solar Pro Preview (top_p:0.95, temp: 0.7)" "https://huggingface.co/upstage/solar-pro-preview-instruct" "temperature: 0.7
src/data/open/time_data.json CHANGED
@@ -2089,6 +2089,196 @@
2089
  }
2090
  }
2091
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2092
  "GPT-5 (Reasoning: medium)": {
2093
  "NUM_GPUS": 0,
2094
  "Overall": {
 
2089
  }
2090
  }
2091
  },
2092
+ "Dhanishtha-2.0 Preview": {
2093
+ "NUM_GPUS": 1,
2094
+ "Overall": {
2095
+ "Time to Answer": {
2096
+ "Min": 1.5839078426361084,
2097
+ "Max": 31.054526805877686,
2098
+ "Med": 4.368606805801392
2099
+ },
2100
+ "Latency": {
2101
+ "Min": 9.942606449127197,
2102
+ "Max": 131.58703541755676,
2103
+ "Med": 35.15699875354767
2104
+ },
2105
+ "Speed": {
2106
+ "Min": 3.7315392457146315,
2107
+ "Max": 83.12757800749918,
2108
+ "Med": 17.75738514863349
2109
+ }
2110
+ },
2111
+ "Content Generation": {
2112
+ "Time to Answer": {
2113
+ "Min": 1.5839078426361084,
2114
+ "Max": 14.249487161636353,
2115
+ "Med": 4.174551248550415
2116
+ },
2117
+ "Latency": {
2118
+ "Min": 15.61229681968689,
2119
+ "Max": 68.42075729370117,
2120
+ "Med": 35.556575536727905
2121
+ },
2122
+ "Speed": {
2123
+ "Min": 3.7315392457146315,
2124
+ "Max": 78.70559503273634,
2125
+ "Med": 18.569924892372086
2126
+ }
2127
+ },
2128
+ "Editing": {
2129
+ "Time to Answer": {
2130
+ "Min": 1.9209108352661133,
2131
+ "Max": 14.220961093902588,
2132
+ "Med": 4.214044094085693
2133
+ },
2134
+ "Latency": {
2135
+ "Min": 15.878032684326172,
2136
+ "Max": 51.72106313705444,
2137
+ "Med": 34.23416888713837
2138
+ },
2139
+ "Speed": {
2140
+ "Min": 4.349397272605004,
2141
+ "Max": 53.4553157438125,
2142
+ "Med": 15.59048741844877
2143
+ }
2144
+ },
2145
+ "Data Analysis": {
2146
+ "Time to Answer": {
2147
+ "Min": 2.0026886463165283,
2148
+ "Max": 14.330097436904907,
2149
+ "Med": 4.264540910720825
2150
+ },
2151
+ "Latency": {
2152
+ "Min": 15.492263317108154,
2153
+ "Max": 86.72672486305237,
2154
+ "Med": 34.66960024833679
2155
+ },
2156
+ "Speed": {
2157
+ "Min": 5.711889702488061,
2158
+ "Max": 78.99341854325922,
2159
+ "Med": 16.223466498569906
2160
+ }
2161
+ },
2162
+ "Reasoning": {
2163
+ "Time to Answer": {
2164
+ "Min": 1.6599221229553223,
2165
+ "Max": 14.43197774887085,
2166
+ "Med": 3.9918036460876465
2167
+ },
2168
+ "Latency": {
2169
+ "Min": 15.9655921459198,
2170
+ "Max": 51.98690748214722,
2171
+ "Med": 35.49437427520752
2172
+ },
2173
+ "Speed": {
2174
+ "Min": 6.382185072861903,
2175
+ "Max": 77.07097090018262,
2176
+ "Med": 20.275446641877387
2177
+ }
2178
+ },
2179
+ "Hallucination": {
2180
+ "Time to Answer": {
2181
+ "Min": 1.6478140354156494,
2182
+ "Max": 12.624572038650513,
2183
+ "Med": 4.188554286956787
2184
+ },
2185
+ "Latency": {
2186
+ "Min": 14.814888954162598,
2187
+ "Max": 84.91817879676819,
2188
+ "Med": 34.31693959236145
2189
+ },
2190
+ "Speed": {
2191
+ "Min": 5.576878248063783,
2192
+ "Max": 79.0218001634417,
2193
+ "Med": 16.994417851887242
2194
+ }
2195
+ },
2196
+ "Safety": {
2197
+ "Time to Answer": {
2198
+ "Min": 2.003626585006714,
2199
+ "Max": 12.981850862503052,
2200
+ "Med": 4.0139992237091064
2201
+ },
2202
+ "Latency": {
2203
+ "Min": 14.703532457351685,
2204
+ "Max": 46.97670245170593,
2205
+ "Med": 33.39446020126343
2206
+ },
2207
+ "Speed": {
2208
+ "Min": 5.835292654577009,
2209
+ "Max": 57.624133083858574,
2210
+ "Med": 13.772987124238506
2211
+ }
2212
+ },
2213
+ "Repetition": {
2214
+ "Time to Answer": {
2215
+ "Min": 1.6010737419128418,
2216
+ "Max": 12.8489511013031,
2217
+ "Med": 3.9335577487945557
2218
+ },
2219
+ "Latency": {
2220
+ "Min": 15.96399211883545,
2221
+ "Max": 83.84577012062073,
2222
+ "Med": 35.164939522743225
2223
+ },
2224
+ "Speed": {
2225
+ "Min": 7.913820950042516,
2226
+ "Max": 61.16802651575593,
2227
+ "Med": 19.237509583858355
2228
+ }
2229
+ },
2230
+ "Summarization": {
2231
+ "Time to Answer": {
2232
+ "Min": 2.0594398975372314,
2233
+ "Max": 12.99567985534668,
2234
+ "Med": 4.562246680259705
2235
+ },
2236
+ "Latency": {
2237
+ "Min": 9.942606449127197,
2238
+ "Max": 49.9459753036499,
2239
+ "Med": 33.85763645172119
2240
+ },
2241
+ "Speed": {
2242
+ "Min": 5.511090930667453,
2243
+ "Max": 74.90896346809664,
2244
+ "Med": 14.041764327517052
2245
+ }
2246
+ },
2247
+ "Translation": {
2248
+ "Time to Answer": {
2249
+ "Min": 1.6257987022399902,
2250
+ "Max": 18.216102361679077,
2251
+ "Med": 4.544387936592102
2252
+ },
2253
+ "Latency": {
2254
+ "Min": 13.830479621887207,
2255
+ "Max": 51.186935901641846,
2256
+ "Med": 34.61378490924835
2257
+ },
2258
+ "Speed": {
2259
+ "Min": 3.7880436790911665,
2260
+ "Max": 83.12757800749918,
2261
+ "Med": 15.479612589205376
2262
+ }
2263
+ },
2264
+ "Multi-Turn": {
2265
+ "Time to Answer": {
2266
+ "Min": 2.355271100997925,
2267
+ "Max": 31.054526805877686,
2268
+ "Med": 14.455448269844055
2269
+ },
2270
+ "Latency": {
2271
+ "Min": 24.809295892715454,
2272
+ "Max": 131.58703541755676,
2273
+ "Med": 61.22457039356232
2274
+ },
2275
+ "Speed": {
2276
+ "Min": 24.85843420960008,
2277
+ "Max": 78.04249041256476,
2278
+ "Med": 43.742556439353024
2279
+ }
2280
+ }
2281
+ },
2282
  "GPT-5 (Reasoning: medium)": {
2283
  "NUM_GPUS": 0,
2284
  "Overall": {