djstrong committed on
Commit
fd35185
·
1 Parent(s): 74253ba

Refactor app.py to use JSON for benchmark data, removing CSV and metadata dependencies. Update performance plotting to reflect new data structure and enhance visualization with cultural context. Introduce benchmark report JSON file for structured model evaluation results.

Browse files
Files changed (7) hide show
  1. app.py +55 -81
  2. benchmark_report.json +142 -0
  3. benchmark_results.csv +0 -189
  4. metadata.json +0 -355
  5. plot_results.py +100 -98
  6. script.py +0 -322
  7. src/about.py +27 -9
app.py CHANGED
@@ -19,92 +19,54 @@ with demo:
19
  gr.HTML(TITLE)
20
  gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
21
 
22
- NUMBER_OF_QUESTIONS = 171.0
23
-
24
- # load dataframe from csv
25
- # leaderboard_df = pd.read_csv("benchmark_results.csv")
26
- leaderboard_df = []
27
- with open("benchmark_results.csv", "r") as f:
28
- header = f.readline().strip().split(",")
29
- header = [h.strip() for h in header]
30
- for i, line in enumerate(f):
31
- leaderboard_df.append(line.strip().split(",", 13))
32
-
33
- metadata = json.load(open('metadata.json'))
34
- for k, v in list(metadata.items()):
35
- metadata[k.split(",")[0]] = v
36
-
37
- # create dataframe from list and header
38
- leaderboard_df = pd.DataFrame(leaderboard_df, columns=header)
39
- # filter column with value eq-bench_v2_pl
40
- print(header)
41
- leaderboard_df = leaderboard_df[(leaderboard_df["Benchmark Version"] == "eq-bench_v2_pl") | (
42
- leaderboard_df["Benchmark Version"] == 'eq-bench_pl')]
43
- # fix: ValueError: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().
44
-
45
- # leave only defined columns
46
- leaderboard_df = leaderboard_df[["Model Path", "Benchmark Score", "Num Questions Parseable", "Error"]]
47
-
48
-
49
- # create new column with model name
50
- def parse_parseable(x):
51
- if x["Num Questions Parseable"] == 'FAILED':
52
- m = re.match(r'(\d+)\.0 questions were parseable', x["Error"])
53
- return m.group(1)
54
- return x["Num Questions Parseable"]
55
-
56
-
57
- leaderboard_df["Num Questions Parseable"] = leaderboard_df[["Num Questions Parseable", "Error"]].apply(
58
- lambda x: parse_parseable(x), axis=1)
59
-
60
- def fraction_to_percentage(numerator: float, denominator: float) -> float:
61
- return (numerator / denominator) * 100
62
-
63
- leaderboard_df["Num Questions Parseable"] = leaderboard_df["Num Questions Parseable"].apply(lambda x: fraction_to_percentage(float(x), NUMBER_OF_QUESTIONS))
64
-
65
- def get_params(model_name):
66
- if model_name in metadata:
67
- return metadata[model_name]
68
- else:
69
- print(model_name)
70
- return numpy.nan
71
-
72
-
73
- leaderboard_df["Params"] = leaderboard_df["Model Path"].apply(lambda x: get_params(x))
74
-
75
- # move column order
76
- leaderboard_df = leaderboard_df[["Model Path", "Params", "Benchmark Score", "Num Questions Parseable", 'Error']]
77
-
78
- # change value of column to nan
79
- leaderboard_df["Benchmark Score"] = leaderboard_df["Benchmark Score"].replace('FAILED', numpy.nan)
80
-
81
- #scale Benchmark Score by Num Questions Parseable*100
82
- leaderboard_df["Benchmark Score"] = leaderboard_df["Benchmark Score"].astype(float) * ((leaderboard_df["Num Questions Parseable"].astype(float) / 100))
83
-
84
- # set datatype of column
85
- leaderboard_df["Benchmark Score"] = leaderboard_df["Benchmark Score"].astype(float)
86
- leaderboard_df["Num Questions Parseable"] = leaderboard_df["Num Questions Parseable"].astype(float)
87
 
88
- # set nan if value of column is less than 0
89
- leaderboard_df.loc[leaderboard_df["Benchmark Score"] < 0, "Benchmark Score"] = 0
 
 
90
 
91
- # sort by 2 columns
92
- leaderboard_df = leaderboard_df.sort_values(by=["Benchmark Score", "Num Questions Parseable"],
93
- ascending=[False, False])
94
 
95
  # Print model names and scores to console before HTML formatting
96
  print("\n===== MODEL RESULTS =====")
 
97
  for index, row in leaderboard_df.iterrows():
98
- print(f"{row['Model Path']}: {row['Benchmark Score']:.2f}")
99
  print("========================\n")
100
 
101
  # Apply HTML formatting for display
102
  leaderboard_df["Model Path"] = leaderboard_df["Model Path"].apply(lambda x: make_clickable_model(x))
103
 
104
- # rename columns
105
  leaderboard_df = leaderboard_df.rename(columns={"Model Path": "Model"})
106
- leaderboard_df = leaderboard_df.rename(columns={"Num Questions Parseable": "Percentage Questions Parseable"})
107
-
108
  leaderboard_df.to_csv("output.csv")
109
 
110
  # Set midpoint for gradient coloring based on data ranges
@@ -118,17 +80,29 @@ with demo:
118
  vmax=150
119
  )
120
 
121
- rounding = {}
122
- # for col in ["Benchmark Score", "Num Questions Parseable"]:
123
-
124
- rounding["Benchmark Score"] = "{:.2f}"
125
- rounding["Percentage Questions Parseable"] = "{:.2f}"
126
- rounding["Params"] = "{:.0f}"
 
 
 
 
 
 
 
 
 
127
  leaderboard_df_styled = leaderboard_df_styled.format(rounding)
128
 
 
 
 
129
  leaderboard_table = gr.components.Dataframe(
130
  value=leaderboard_df_styled,
131
- datatype=['markdown', 'number', 'number', 'number', 'str'],
132
  elem_id="leaderboard-table",
133
  interactive=False,
134
  visible=True,
 
19
  gr.HTML(TITLE)
20
  gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
21
 
22
+ # Load dataframe from JSON
23
+ with open("benchmark_report.json", "r") as f:
24
+ json_data = json.load(f)
25
+
26
+ # Create dataframe from JSON data
27
+ leaderboard_df = pd.DataFrame(json_data)
28
+
29
+ # Rename columns for consistency
30
+ leaderboard_df = leaderboard_df.rename(columns={
31
+ "Model Name": "Model Path",
32
+ "Model Size": "Params"
33
+ })
34
+
35
+ # Calculate overall benchmark score as average of Avg (object) and Avg (country)
36
+ leaderboard_df["Avg"] = (leaderboard_df["Avg (object)"] + leaderboard_df["Avg (country)"]) / 2
37
+
38
+ # Select and reorder columns for display (removed Percentage Questions Parseable)
39
+ display_columns = [
40
+ "Model Path", "Params", "Avg",
41
+ "Avg (object)", "Avg (country)",
42
+ "History (object)", "History (country)",
43
+ "Geography (object)", "Geography (country)",
44
+ "Art & Entertainment (object)", "Art & Entertainment (country)",
45
+ "Culture & Tradition (object)", "Culture & Tradition (country)"
46
+ ]
47
+
48
+ leaderboard_df = leaderboard_df[display_columns]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
49
 
50
+ # Convert Params column - replace "-" with NaN and convert numeric strings to float
51
+ leaderboard_df["Params"] = leaderboard_df["Params"].replace("-", numpy.nan)
52
+ # Convert numeric strings directly to float (no regex needed since values are already clean numbers)
53
+ leaderboard_df.loc[leaderboard_df["Params"].notna(), "Params"] = leaderboard_df.loc[leaderboard_df["Params"].notna(), "Params"].astype(float)
54
 
55
+ # Sort by benchmark score
56
+ leaderboard_df = leaderboard_df.sort_values(by=["Avg"], ascending=[False])
 
57
 
58
  # Print model names and scores to console before HTML formatting
59
  print("\n===== MODEL RESULTS =====")
60
+ print("Avg is calculated as: (Avg (object) + Avg (country)) / 2")
61
  for index, row in leaderboard_df.iterrows():
62
+ print(f"{row['Model Path']}: {row['Avg']:.2f}")
63
  print("========================\n")
64
 
65
  # Apply HTML formatting for display
66
  leaderboard_df["Model Path"] = leaderboard_df["Model Path"].apply(lambda x: make_clickable_model(x))
67
 
68
+ # Rename column for display
69
  leaderboard_df = leaderboard_df.rename(columns={"Model Path": "Model"})
 
 
70
  leaderboard_df.to_csv("output.csv")
71
 
72
  # Set midpoint for gradient coloring based on data ranges
 
80
  vmax=150
81
  )
82
 
83
+ # Set up number formatting (removed Percentage Questions Parseable)
84
+ rounding = {
85
+ "Avg": "{:.2f}",
86
+ "Params": "{:.0f}",
87
+ "Avg (object)": "{:.2f}",
88
+ "Avg (country)": "{:.2f}",
89
+ "History (object)": "{:.2f}",
90
+ "History (country)": "{:.2f}",
91
+ "Geography (object)": "{:.2f}",
92
+ "Geography (country)": "{:.2f}",
93
+ "Art & Entertainment (object)": "{:.2f}",
94
+ "Art & Entertainment (country)": "{:.2f}",
95
+ "Culture & Tradition (object)": "{:.2f}",
96
+ "Culture & Tradition (country)": "{:.2f}"
97
+ }
98
  leaderboard_df_styled = leaderboard_df_styled.format(rounding)
99
 
100
+ # Create dataframe component with appropriate datatypes
101
+ datatypes = ['markdown', 'number'] + ['number'] * (len(display_columns) - 1)
102
+
103
  leaderboard_table = gr.components.Dataframe(
104
  value=leaderboard_df_styled,
105
+ datatype=datatypes,
106
  elem_id="leaderboard-table",
107
  interactive=False,
108
  visible=True,
benchmark_report.json ADDED
@@ -0,0 +1,142 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "Model Name":"Anthropic Claude 3.7 Sonnet",
4
+ "Model Size":"-",
5
+ "Avg (object)":37.06,
6
+ "Avg (country)":62.46,
7
+ "History (object)":52.5,
8
+ "History (country)":80.0,
9
+ "Geography (object)":58.33,
10
+ "Geography (country)":83.33,
11
+ "Art & Entertainment (object)":22.41,
12
+ "Art & Entertainment (country)":44.83,
13
+ "Culture & Tradition (object)":15.0,
14
+ "Culture & Tradition (country)":41.67
15
+ },
16
+ {
17
+ "Model Name":"OpenAI GPT-4o",
18
+ "Model Size":"-",
19
+ "Avg (object)":28.94,
20
+ "Avg (country)":42.49,
21
+ "History (object)":30.0,
22
+ "History (country)":37.5,
23
+ "Geography (object)":45.0,
24
+ "Geography (country)":55.0,
25
+ "Art & Entertainment (object)":22.41,
26
+ "Art & Entertainment (country)":24.14,
27
+ "Culture & Tradition (object)":18.33,
28
+ "Culture & Tradition (country)":53.33
29
+ },
30
+ {
31
+ "Model Name":"Qwen 2.5 VL 72B",
32
+ "Model Size":"72",
33
+ "Avg (object)":23.91,
34
+ "Avg (country)":51.51,
35
+ "History (object)":35.0,
36
+ "History (country)":70.0,
37
+ "Geography (object)":31.67,
38
+ "Geography (country)":71.67,
39
+ "Art & Entertainment (object)":18.97,
40
+ "Art & Entertainment (country)":31.03,
41
+ "Culture & Tradition (object)":10.0,
42
+ "Culture & Tradition (country)":33.33
43
+ },
44
+ {
45
+ "Model Name":"Qwen 2.5 VL 32B",
46
+ "Model Size":"32",
47
+ "Avg (object)":22.27,
48
+ "Avg (country)":48.8,
49
+ "History (object)":30.0,
50
+ "History (country)":67.5,
51
+ "Geography (object)":28.33,
52
+ "Geography (country)":66.67,
53
+ "Art & Entertainment (object)":22.41,
54
+ "Art & Entertainment (country)":31.03,
55
+ "Culture & Tradition (object)":8.33,
56
+ "Culture & Tradition (country)":30.0
57
+ },
58
+ {
59
+ "Model Name":"Qwen 2.5 VL 7B",
60
+ "Model Size":"7",
61
+ "Avg (object)":21.62,
62
+ "Avg (country)":44.72,
63
+ "History (object)":32.5,
64
+ "History (country)":65.0,
65
+ "Geography (object)":28.33,
66
+ "Geography (country)":66.67,
67
+ "Art & Entertainment (object)":18.97,
68
+ "Art & Entertainment (country)":15.52,
69
+ "Culture & Tradition (object)":6.67,
70
+ "Culture & Tradition (country)":31.67
71
+ },
72
+ {
73
+ "Model Name":"Google Gemma 3 27B",
74
+ "Model Size":"27",
75
+ "Avg (object)":19.14,
76
+ "Avg (country)":43.76,
77
+ "History (object)":12.5,
78
+ "History (country)":52.5,
79
+ "Geography (object)":28.33,
80
+ "Geography (country)":48.33,
81
+ "Art & Entertainment (object)":22.41,
82
+ "Art & Entertainment (country)":25.86,
83
+ "Culture & Tradition (object)":13.33,
84
+ "Culture & Tradition (country)":48.33
85
+ },
86
+ {
87
+ "Model Name":"Meta Llama 4 Maverick",
88
+ "Model Size":"402",
89
+ "Avg (object)":17.49,
90
+ "Avg (country)":42.98,
91
+ "History (object)":17.5,
92
+ "History (country)":52.5,
93
+ "Geography (object)":20.0,
94
+ "Geography (country)":50.0,
95
+ "Art & Entertainment (object)":24.14,
96
+ "Art & Entertainment (country)":32.76,
97
+ "Culture & Tradition (object)":8.33,
98
+ "Culture & Tradition (country)":36.67
99
+ },
100
+ {
101
+ "Model Name":"Mistral Medium 3",
102
+ "Model Size":"-",
103
+ "Avg (object)":17.45,
104
+ "Avg (country)":45.99,
105
+ "History (object)":12.5,
106
+ "History (country)":65.0,
107
+ "Geography (object)":31.67,
108
+ "Geography (country)":56.67,
109
+ "Art & Entertainment (object)":18.97,
110
+ "Art & Entertainment (country)":18.97,
111
+ "Culture & Tradition (object)":6.67,
112
+ "Culture & Tradition (country)":43.33
113
+ },
114
+ {
115
+ "Model Name":"Google Gemma 3 12B",
116
+ "Model Size":"12",
117
+ "Avg (object)":13.06,
118
+ "Avg (country)":40.04,
119
+ "History (object)":10.0,
120
+ "History (country)":42.5,
121
+ "Geography (object)":15.0,
122
+ "Geography (country)":46.67,
123
+ "Art & Entertainment (object)":17.24,
124
+ "Art & Entertainment (country)":29.31,
125
+ "Culture & Tradition (object)":10.0,
126
+ "Culture & Tradition (country)":41.67
127
+ },
128
+ {
129
+ "Model Name":"Google Gemma 3 4B",
130
+ "Model Size":"4",
131
+ "Avg (object)":9.72,
132
+ "Avg (country)":35.84,
133
+ "History (object)":5.0,
134
+ "History (country)":47.5,
135
+ "Geography (object)":8.33,
136
+ "Geography (country)":38.33,
137
+ "Art & Entertainment (object)":17.24,
138
+ "Art & Entertainment (country)":25.86,
139
+ "Culture & Tradition (object)":8.33,
140
+ "Culture & Tradition (country)":31.67
141
+ }
142
+ ]
benchmark_results.csv DELETED
@@ -1,189 +0,0 @@
1
- Run ID, Benchmark Completed, Prompt Format, Model Path, Lora Path, Quantization, Benchmark Score, Benchmark Version, Num Questions Parseable, Num Iterations, Inference Engine, Ooba Params, Download Filters, Error
2
- Bielik_v0.1,2024-06-18 12:48:51,,speakleash/Bielik-7B-Instruct-v0.1,,,47.1,eq-bench_v2,170.0,1,transformers, ,,
3
- Bielik_v0.1,2024-06-18 13:44:54,,speakleash/Bielik-7B-Instruct-v0.1,,,34.17,eq-bench_v2_pl,149.0,1,transformers, ,,
4
- Bielik_v0.1,2024-06-18 14:01:46,,speakleash/Bielik-7B-Instruct-v0.1,,,34.27,eq-bench_v2_pl,156.0,1,transformers, ,,
5
- openchat-gemma,2024-06-18 14:03:04,,openchat/openchat-3.5-0106-gemma,,,FAILED,eq-bench,FAILED,1,transformers, ,,System role not supported
6
- openchat-35-0106,2024-06-18 14:30:24,,openchat/openchat-3.5-0106,,,45.69,eq-bench_v2_pl,170.0,1,transformers, ,,
7
- openchat-35-0106,2024-06-18 15:15:03,,openchat/openchat-3.5-0106,,,45.69,eq-bench_v2_pl,170.0,1,transformers, ,,
8
- glm-4-9b-chat,2024-06-18 15:16:14,,THUDM/glm-4-9b-chat,,,FAILED,eq-bench,FAILED,1,transformers, ,,
9
- openchat-35-0106,2024-06-18 15:19:01,,openchat/openchat-3.5-0106,,,72.92,eq-bench_v2,171.0,1,transformers, ,,
10
- glm-4-9b-chat,2024-06-18 15:20:10,,THUDM/glm-4-9b-chat,,,FAILED,eq-bench,FAILED,1,transformers, ,,
11
- openchat-35-0106,2024-06-18 15:22:41,,openchat/openchat-3.5-0106,,,45.69,eq-bench_v2_pl,170.0,1,transformers, ,,
12
- glm-4-9b-chat,2024-06-18 15:23:50,,THUDM/glm-4-9b-chat,,,FAILED,eq-bench,FAILED,1,transformers, ,,
13
- glm-4-9b-chat,2024-06-18 15:26:30,,THUDM/glm-4-9b-chat,,,FAILED,eq-bench,FAILED,1,transformers, ,,
14
- glm-4-9b-chat,2024-06-18 16:30:21,,THUDM/glm-4-9b-chat,,,FAILED,eq-bench,FAILED,1,transformers, ,,
15
- glm-4-9b-chat-1m,2024-06-18 16:54:28,,THUDM/glm-4-9b-chat-1m,,,FAILED,eq-bench,FAILED,1,transformers, ,,
16
- glm-4-9b-chat-1m,2024-06-18 17:05:16,,THUDM/glm-4-9b-chat-1m,,,FAILED,eq-bench,FAILED,1,transformers, ,,
17
- openchat-3.6-8b-20240522,2024-06-18 17:12:00,,openchat/openchat-3.6-8b-20240522,,,-1.339640900815702e+23,eq-bench_v2,171.0,1,transformers, ,,
18
- openchat-gemma,2024-06-18 17:13:12,,openchat/openchat-3.5-0106-gemma,,,FAILED,eq-bench,FAILED,1,transformers, ,,System role not supported
19
- Meta-Llama-3-8B-Instruct,2024-06-18 21:29:03,,meta-llama/Meta-Llama-3-8B-Instruct,,,69.09,eq-bench_v2,171.0,1,transformers, ,,
20
- Starling-LM-7B-alpha,2024-06-18 21:45:18,,berkeley-nest/Starling-LM-7B-alpha,,,49.63,eq-bench_v2_pl,171.0,1,transformers, ,,
21
- Starling-LM-7B-beta,2024-06-18 21:51:54,,Nexusflow/Starling-LM-7B-beta,,,44.91,eq-bench_v2_pl,159.0,1,transformers, ,,
22
- Mistral-7B-Instruct-v0.2,2024-06-18 21:52:17,,mistralai/Mistral-7B-Instruct-v0.2,,,FAILED,eq-bench,FAILED,1,transformers, ,,Conversation roles must alternate user/assistant/user/assistant/...
23
- Mistral-7B-Instruct-v0.1,2024-06-18 22:26:07,,mistralai/Mistral-7B-Instruct-v0.1,,,FAILED,eq-bench,FAILED,1,transformers, ,,Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cpu! (when checking argument for argument index in method wrapper_CUDA__index_select)
24
- Meta-Llama-3-8B-Instruct,2024-06-18 22:35:53,,meta-llama/Meta-Llama-3-8B-Instruct,,,46.53,eq-bench_v2_pl,171.0,1,transformers, ,,
25
- openchat-gemma,2024-06-19 09:30:28,,openchat/openchat-3.5-0106-gemma,,,FAILED,eq-bench,FAILED,1,transformers, ,,System role not supported
26
- Mistral-7B-Instruct-v0.2,2024-06-19 09:30:46,,mistralai/Mistral-7B-Instruct-v0.2,,,FAILED,eq-bench,FAILED,1,transformers, ,,Conversation roles must alternate user/assistant/user/assistant/...
27
- openchat-gemma,2024-06-19 09:35:50,,openchat/openchat-3.5-0106-gemma,,,FAILED,eq-bench,FAILED,1,transformers, ,,System role not supported
28
- Mistral-7B-Instruct-v0.2,2024-06-19 09:36:01,,mistralai/Mistral-7B-Instruct-v0.2,,,FAILED,eq-bench,FAILED,1,transformers, ,,Conversation roles must alternate user/assistant/user/assistant/...
29
- openchat-gemma,2024-06-19 09:43:53,,openchat/openchat-3.5-0106-gemma,,,60.11,eq-bench_v2_pl,169.0,1,transformers, ,,
30
- Mistral-7B-Instruct-v0.2,2024-06-19 09:49:42,,mistralai/Mistral-7B-Instruct-v0.2,,,52.99,eq-bench_v2_pl,148.0,1,transformers, ,,
31
- openchat-gemma,2024-06-19 09:54:01,,openchat/openchat-3.5-0106-gemma,,,60.11,eq-bench_v2_pl,169.0,1,transformers, ,,
32
- openchat-gemma,2024-06-19 10:16:52,,openchat/openchat-3.5-0106-gemma,,,59.93,eq-bench_v2_pl,170.0,1,transformers, ,,
33
- openchat-gemma,2024-06-19 10:19:44,,openchat/openchat-3.5-0106-gemma,,,59.93,eq-bench_v2_pl,170.0,1,transformers, ,,
34
- Nous-Hermes-2-SOLAR-10.7B,2024-06-19 10:27:36,,NousResearch/Nous-Hermes-2-SOLAR-10.7B,,,48.22,eq-bench_v2_pl,169.0,1,transformers, ,,
35
- SOLAR-10.7B-Instruct-v1.0,2024-06-19 10:43:47,,upstage/SOLAR-10.7B-Instruct-v1.0,,,57.57,eq-bench_v2_pl,164.0,1,transformers, ,,
36
- Qwen2-7B-Instruct,2024-06-19 10:46:52,,Qwen/Qwen2-7B-Instruct,,,53.08,eq-bench_v2_pl,171.0,1,transformers, ,,
37
- models/gwint2,2024-06-19 11:21:15,,speakleash/Bielik-11B-v2.0-Instruct,,,68.24,eq-bench_v2_pl,171.0,1,transformers, ,,
38
- Azurro/APT3-275M-Base,2024-06-19 11:36:43,,Azurro/APT3-275M-Base,,,FAILED,eq-bench_pl,FAILED,1,transformers, ,,0.0 questions were parseable (min is 83%)
39
- Qwen/Qwen2-0.5B,2024-06-19 11:47:44,,Qwen/Qwen2-0.5B,,,FAILED,eq-bench_pl,FAILED,1,transformers, ,,18.0 questions were parseable (min is 83%)
40
- Qwen/Qwen2-0.5B-Instruct,2024-06-19 11:51:21,,Qwen/Qwen2-0.5B-Instruct,,,FAILED,eq-bench_pl,FAILED,1,transformers, ,,125.0 questions were parseable (min is 83%)
41
- allegro/plt5-large,2024-06-19 11:51:22,,allegro/plt5-large,,,FAILED,eq-bench,FAILED,1,transformers, ,,Unrecognized configuration class <class 'transformers.models.t5.configuration_t5.T5Config'> for this kind of AutoModel: AutoModelForCausalLM. Model type should be one of BartConfig, BertConfig, BertGenerationConfig, BigBirdConfig, BigBirdPegasusConfig, BioGptConfig, BlenderbotConfig, BlenderbotSmallConfig, BloomConfig, CamembertConfig, LlamaConfig, CodeGenConfig, CohereConfig, CpmAntConfig, CTRLConfig, Data2VecTextConfig, DbrxConfig, ElectraConfig, ErnieConfig, FalconConfig, FuyuConfig, GemmaConfig, GitConfig, GPT2Config, GPT2Config, GPTBigCodeConfig, GPTNeoConfig, GPTNeoXConfig, GPTNeoXJapaneseConfig, GPTJConfig, JambaConfig, JetMoeConfig, LlamaConfig, MambaConfig, MarianConfig, MBartConfig, MegaConfig, MegatronBertConfig, MistralConfig, MixtralConfig, MptConfig, MusicgenConfig, MusicgenMelodyConfig, MvpConfig, OlmoConfig, OpenLlamaConfig, OpenAIGPTConfig, OPTConfig, PegasusConfig, PersimmonConfig, PhiConfig, Phi3Config, PLBartConfig, ProphetNetConfig, QDQBertConfig, Qwen2Config, Qwen2MoeConfig, RecurrentGemmaConfig, ReformerConfig, RemBertConfig, RobertaConfig, RobertaPreLayerNormConfig, RoCBertConfig, RoFormerConfig, RwkvConfig, Speech2Text2Config, StableLmConfig, Starcoder2Config, TransfoXLConfig, TrOCRConfig, WhisperConfig, XGLMConfig, XLMConfig, XLMProphetNetConfig, XLMRobertaConfig, XLMRobertaXLConfig, XLNetConfig, XmodConfig.
42
- APT3-1B-Instruct-e1,2024-06-19 11:51:22,,APT3-1B-Instruct-e1,,,FAILED,eq-bench,FAILED,1,transformers, ,,APT3-1B-Instruct-e1 is not a local folder and is not a valid model identifier listed on 'https://huggingface.co/models' If this is a private repository, make sure to pass a token having permission to this repo either by logging in with `huggingface-cli login` or by passing `token=<your_token>`
43
- APT3-1B-Instruct-e2,2024-06-19 11:51:23,,APT3-1B-Instruct-e2,,,FAILED,eq-bench,FAILED,1,transformers, ,,APT3-1B-Instruct-e2 is not a local folder and is not a valid model identifier listed on 'https://huggingface.co/models' If this is a private repository, make sure to pass a token having permission to this repo either by logging in with `huggingface-cli login` or by passing `token=<your_token>`
44
- Azurro/APT3-1B-Base,2024-06-19 12:00:40,,Azurro/APT3-1B-Base,,,FAILED,eq-bench_pl,FAILED,1,transformers, ,,0.0 questions were parseable (min is 83%)
45
- OPI-PG/Qra-1b,2024-06-19 12:13:15,,OPI-PG/Qra-1b,,,FAILED,eq-bench_pl,FAILED,1,transformers, ,,0.0 questions were parseable (min is 83%)
46
- TinyLlama/TinyLlama-1.1B-Chat-v1.0,2024-06-19 12:23:45,,TinyLlama/TinyLlama-1.1B-Chat-v1.0,,,FAILED,eq-bench_pl,FAILED,1,transformers, ,,36.0 questions were parseable (min is 83%)
47
- Qwen/Qwen2-1.5B,2024-06-19 12:35:37,,Qwen/Qwen2-1.5B,,,FAILED,eq-bench_pl,FAILED,1,transformers, ,,54.0 questions were parseable (min is 83%)
48
- Qwen/Qwen2-1.5B-Instruct,2024-06-19 12:38:29,,Qwen/Qwen2-1.5B-Instruct,,,15.33,eq-bench_v2_pl,165.0,1,transformers, ,,
49
- sdadas/polish-gpt2-xl,2024-06-19 12:54:39,,sdadas/polish-gpt2-xl,,,FAILED,eq-bench_pl,FAILED,1,transformers, ,,0.0 questions were parseable (min is 83%)
50
- internlm/internlm2-1_8b,2024-06-19 13:08:50,,internlm/internlm2-1_8b,,,FAILED,eq-bench_pl,FAILED,1,transformers, ,,0.0 questions were parseable (min is 83%)
51
- internlm/internlm2-chat-1_8b,2024-06-19 13:13:21,,internlm/internlm2-chat-1_8b,,,13.83,eq-bench_v2_pl,150.0,1,transformers, ,,
52
- google/gemma-1.1-2b-it,2024-06-19 13:15:24,,google/gemma-1.1-2b-it,,,16.47,eq-bench_v2_pl,171.0,1,transformers, ,,
53
- microsoft/phi-2,2024-06-19 13:28:07,,microsoft/phi-2,,,FAILED,eq-bench_pl,FAILED,1,transformers, ,,0.0 questions were parseable (min is 83%)
54
- google/mt5-xl,2024-06-19 13:28:10,,google/mt5-xl,,,FAILED,eq-bench,FAILED,1,transformers, ,,Unrecognized configuration class <class 'transformers.models.mt5.configuration_mt5.MT5Config'> for this kind of AutoModel: AutoModelForCausalLM. Model type should be one of BartConfig, BertConfig, BertGenerationConfig, BigBirdConfig, BigBirdPegasusConfig, BioGptConfig, BlenderbotConfig, BlenderbotSmallConfig, BloomConfig, CamembertConfig, LlamaConfig, CodeGenConfig, CohereConfig, CpmAntConfig, CTRLConfig, Data2VecTextConfig, DbrxConfig, ElectraConfig, ErnieConfig, FalconConfig, FuyuConfig, GemmaConfig, GitConfig, GPT2Config, GPT2Config, GPTBigCodeConfig, GPTNeoConfig, GPTNeoXConfig, GPTNeoXJapaneseConfig, GPTJConfig, JambaConfig, JetMoeConfig, LlamaConfig, MambaConfig, MarianConfig, MBartConfig, MegaConfig, MegatronBertConfig, MistralConfig, MixtralConfig, MptConfig, MusicgenConfig, MusicgenMelodyConfig, MvpConfig, OlmoConfig, OpenLlamaConfig, OpenAIGPTConfig, OPTConfig, PegasusConfig, PersimmonConfig, PhiConfig, Phi3Config, PLBartConfig, ProphetNetConfig, QDQBertConfig, Qwen2Config, Qwen2MoeConfig, RecurrentGemmaConfig, ReformerConfig, RemBertConfig, RobertaConfig, RobertaPreLayerNormConfig, RoCBertConfig, RoFormerConfig, RwkvConfig, Speech2Text2Config, StableLmConfig, Starcoder2Config, TransfoXLConfig, TrOCRConfig, WhisperConfig, XGLMConfig, XLMConfig, XLMProphetNetConfig, XLMRobertaConfig, XLMRobertaXLConfig, XLNetConfig, XmodConfig, InternLM2Config, InternLM2Config.
55
- microsoft/Phi-3-mini-4k-instruct,2024-06-19 13:34:56,,microsoft/Phi-3-mini-4k-instruct,,,28.05,eq-bench_v2_pl,159.0,1,transformers, ,,
56
- ssmits/Falcon2-5.5B-Polish,2024-06-19 13:47:21,,ssmits/Falcon2-5.5B-Polish,,,FAILED,eq-bench_pl,FAILED,1,transformers, ,,0.0 questions were parseable (min is 83%)
57
- 01-ai/Yi-1.5-6B,2024-06-19 14:04:20,,01-ai/Yi-1.5-6B,,,FAILED,eq-bench_pl,FAILED,1,transformers, ,,1.0 questions were parseable (min is 83%)
58
- 01-ai/Yi-1.5-6B-Chat,2024-06-19 14:11:22,,01-ai/Yi-1.5-6B-Chat,,,5.19,eq-bench_v2_pl,161.0,1,transformers, ,,
59
- THUDM/chatglm3-6b,2024-06-19 14:12:11,,THUDM/chatglm3-6b,,,FAILED,eq-bench,FAILED,1,transformers, ,,too many values to unpack (expected 2)
60
- THUDM/chatglm3-6b-base,2024-06-19 14:13:00,,THUDM/chatglm3-6b-base,,,FAILED,eq-bench,FAILED,1,transformers, ,,too many values to unpack (expected 2)
61
- alpindale/Mistral-7B-v0.2-hf,2024-06-19 14:16:37,,alpindale/Mistral-7B-v0.2-hf,,,FAILED,eq-bench_pl,FAILED,1,transformers, ,,45.0 questions were parseable (min is 83%)
62
- berkeley-nest/Starling-LM-7B-alpha,2024-06-19 14:22:32,,berkeley-nest/Starling-LM-7B-alpha,,,46.26,eq-bench_v2_pl,171.0,1,transformers, ,,
63
- google/gemma-7b,2024-06-19 14:38:02,,google/gemma-7b,,,FAILED,eq-bench_pl,FAILED,1,transformers, ,,0.0 questions were parseable (min is 83%)
64
- google/gemma-7b-it,2024-06-19 14:53:28,,google/gemma-7b-it,,,FAILED,eq-bench_pl,FAILED,1,transformers, ,,0.0 questions were parseable (min is 83%)
65
- HuggingFaceH4/zephyr-7b-alpha,2024-06-19 15:05:31,,HuggingFaceH4/zephyr-7b-alpha,,,FAILED,eq-bench_pl,FAILED,1,transformers, ,,99.0 questions were parseable (min is 83%)
66
- HuggingFaceH4/zephyr-7b-beta,2024-06-19 15:18:24,,HuggingFaceH4/zephyr-7b-beta,,,FAILED,eq-bench_pl,FAILED,1,transformers, ,,88.0 questions were parseable (min is 83%)
67
- internlm/internlm2-7b,2024-06-19 15:36:06,,internlm/internlm2-7b,,,FAILED,eq-bench_pl,FAILED,1,transformers, ,,43.0 questions were parseable (min is 83%)
68
- internlm/internlm2-base-7b,2024-06-19 15:54:53,,internlm/internlm2-base-7b,,,FAILED,eq-bench_pl,FAILED,1,transformers, ,,6.0 questions were parseable (min is 83%)
69
- internlm/internlm2-chat-7b,2024-06-19 16:02:07,,internlm/internlm2-chat-7b,,,40.0,eq-bench_v2_pl,169.0,1,transformers, ,,
70
- internlm/internlm2-chat-7b-sft,2024-06-19 16:07:04,,internlm/internlm2-chat-7b-sft,,,41.62,eq-bench_v2_pl,170.0,1,transformers, ,,
71
- lex-hue/Delexa-7b,2024-06-19 16:12:19,,lex-hue/Delexa-7b,,,49.03,eq-bench_v2_pl,169.0,1,transformers, ,,
72
- meta-llama/Llama-2-7b-chat-hf,2024-06-19 16:21:08,,meta-llama/Llama-2-7b-chat-hf,,,FAILED,eq-bench_pl,FAILED,1,transformers, ,,116.0 questions were parseable (min is 83%)
73
- meta-llama/Llama-2-7b-hf,2024-06-19 16:36:41,,meta-llama/Llama-2-7b-hf,,,FAILED,eq-bench_pl,FAILED,1,transformers, ,,1.0 questions were parseable (min is 83%)
74
- microsoft/WizardLM-2-7B,2024-06-19 16:44:22,,microsoft/WizardLM-2-7B,,,FAILED,eq-bench_pl,FAILED,1,transformers, ,,137.0 questions were parseable (min is 83%)
75
- mistralai/Mistral-7B-Instruct-v0.1,2024-06-19 16:44:33,,mistralai/Mistral-7B-Instruct-v0.1,,,FAILED,eq-bench,FAILED,1,transformers, ,,Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cpu! (when checking argument for argument index in method wrapper_CUDA__index_select)
76
- mistralai/Mistral-7B-Instruct-v0.2,2024-06-19 16:50:36,,mistralai/Mistral-7B-Instruct-v0.2,,,53.25,eq-bench_v2_pl,151.0,1,transformers, ,,
77
- mistralai/Mistral-7B-Instruct-v0.3,2024-06-19 16:54:49,,mistralai/Mistral-7B-Instruct-v0.3,,,45.21,eq-bench_v2_pl,171.0,1,transformers, ,,
78
- mistralai/Mistral-7B-v0.1,2024-06-19 16:59:50,,mistralai/Mistral-7B-v0.1,,,FAILED,eq-bench_pl,FAILED,1,transformers, ,,65.0 questions were parseable (min is 83%)
79
- mistralai/Mistral-7B-v0.3,2024-06-19 17:16:38,,mistralai/Mistral-7B-v0.3,,,FAILED,eq-bench_pl,FAILED,1,transformers, ,,14.0 questions were parseable (min is 83%)
80
- Nexusflow/Starling-LM-7B-beta,2024-06-19 17:23:18,,Nexusflow/Starling-LM-7B-beta,,,45.1,eq-bench_v2_pl,166.0,1,transformers, ,,
81
- openchat/openchat-3.5-0106,2024-06-19 17:27:10,,openchat/openchat-3.5-0106,,,43.81,eq-bench_v2_pl,171.0,1,transformers, ,,
82
- openchat/openchat-3.5-0106-gemma,2024-06-19 17:30:31,,openchat/openchat-3.5-0106-gemma,,,58.62,eq-bench_v2_pl,169.0,1,transformers, ,,
83
- openchat/openchat-3.5-1210,2024-06-19 17:34:27,,openchat/openchat-3.5-1210,,,49.04,eq-bench_v2_pl,171.0,1,transformers, ,,
84
- OPI-PG/Qra-7b,2024-06-19 17:50:28,,OPI-PG/Qra-7b,,,FAILED,eq-bench_pl,FAILED,1,transformers, ,,0.0 questions were parseable (min is 83%)
85
- Qwen/Qwen1.5-7B,2024-06-19 17:57:53,,Qwen/Qwen1.5-7B,,,23.11,eq-bench_v2_pl,155.0,1,transformers, ,,
86
- Qwen/Qwen1.5-7B-Chat,2024-06-19 18:03:34,,Qwen/Qwen1.5-7B-Chat,,,25.0,eq-bench_v2_pl,164.0,1,transformers, ,,
87
- Qwen/Qwen2-7B,2024-06-19 18:09:23,,Qwen/Qwen2-7B,,,36.58,eq-bench_v2_pl,166.0,1,transformers, ,,
88
- Qwen/Qwen2-7B-Instruct,2024-06-19 18:12:42,,Qwen/Qwen2-7B-Instruct,,,53.74,eq-bench_v2_pl,171.0,1,transformers, ,,
89
- Remek/Kruk-7B-SP-001,2024-06-19 18:17:13,,Remek/Kruk-7B-SP-001,,,44.44,eq-bench_v2_pl,171.0,1,transformers, ,,
90
- Remek/OpenChat-3.5-0106-PL-Omnibusv2,2024-06-19 18:17:24,,Remek/OpenChat-3.5-0106-PL-Omnibusv2,,,FAILED,eq-bench,FAILED,1,transformers, ,,'system_message' is undefined
91
- Remek/OpenChat3.5-0106-Spichlerz-Bocian,2024-06-19 18:24:08,,Remek/OpenChat3.5-0106-Spichlerz-Bocian,,,44.13,eq-bench_v2_pl,166.0,1,transformers, ,,
92
- Remek/OpenChat3.5-0106-Spichlerz-Inst-001,2024-06-19 18:28:48,,Remek/OpenChat3.5-0106-Spichlerz-Inst-001,,,41.6,eq-bench_v2_pl,171.0,1,transformers, ,,
93
- RWKV/HF_v5-Eagle-7B,2024-06-19 19:16:27,,RWKV/HF_v5-Eagle-7B,,,FAILED,eq-bench_pl,FAILED,1,transformers, ,,0.0 questions were parseable (min is 83%)
94
- RWKV/v5-Eagle-7B-HF,2024-06-19 20:04:12,,RWKV/v5-Eagle-7B-HF,,,FAILED,eq-bench_pl,FAILED,1,transformers, ,,0.0 questions were parseable (min is 83%)
95
- speakleash/Bielik-7B-v0.1,2024-06-19 20:11:16,,speakleash/Bielik-7B-v0.1,,,FAILED,eq-bench_pl,FAILED,1,transformers, ,,139.0 questions were parseable (min is 83%)
96
- szymonrucinski/Curie-7B-v1,2024-06-19 20:29:24,,szymonrucinski/Curie-7B-v1,,,FAILED,eq-bench_pl,FAILED,1,transformers, ,,1.0 questions were parseable (min is 83%)
97
- teknium/OpenHermes-2.5-Mistral-7B,2024-06-19 20:34:12,,teknium/OpenHermes-2.5-Mistral-7B,,,37.48,eq-bench_v2_pl,171.0,1,transformers, ,,
98
- Voicelab/trurl-2-7b,2024-06-19 20:39:26,,Voicelab/trurl-2-7b,,,FAILED,eq-bench_pl,FAILED,1,transformers, ,,141.0 questions were parseable (min is 83%)
99
- microsoft/Phi-3-small-8k-instruct,2024-06-19 20:39:31,,microsoft/Phi-3-small-8k-instruct,,,FAILED,eq-bench,FAILED,1,transformers, ,,No module named 'pytest'
100
- CohereForAI/aya-23-8B,2024-06-19 20:44:01,,CohereForAI/aya-23-8B,,,45.43,eq-bench_v2_pl,171.0,1,transformers, ,,
101
- meta-llama/Meta-Llama-3-8B,2024-06-19 21:01:55,,meta-llama/Meta-Llama-3-8B,,,FAILED,eq-bench_pl,FAILED,1,transformers, ,,0.0 questions were parseable (min is 83%)
102
- meta-llama/Meta-Llama-3-8B-Instruct,2024-06-19 21:06:08,,meta-llama/Meta-Llama-3-8B-Instruct,,,46.27,eq-bench_v2_pl,171.0,1,transformers, ,,
103
- mlabonne/NeuralDaredevil-8B-abliterated,2024-06-19 21:13:31,,mlabonne/NeuralDaredevil-8B-abliterated,,,54.74,eq-bench_v2_pl,171.0,1,transformers, ,,
104
- NousResearch/Hermes-2-Pro-Llama-3-8B,2024-06-19 21:18:18,,NousResearch/Hermes-2-Pro-Llama-3-8B,,,54.57,eq-bench_v2_pl,171.0,1,transformers, ,,
105
- NousResearch/Hermes-2-Theta-Llama-3-8B,2024-06-19 21:25:22,,NousResearch/Hermes-2-Theta-Llama-3-8B,,,54.88,eq-bench_v2_pl,171.0,1,transformers, ,,
106
- nvidia/Llama3-ChatQA-1.5-8B,2024-06-19 22:27:24,,nvidia/Llama3-ChatQA-1.5-8B,,,40.55,eq-bench_v2_pl,166.0,1,transformers, ,,
107
- openchat/openchat-3.6-8b-20240522,2024-06-19 22:34:56,,openchat/openchat-3.6-8b-20240522,,,-2.0090659464796595e+18,eq-bench_v2_pl,170.0,1,transformers, ,,
108
- Remek/Llama-3-8B-Omnibus-1-PL-v01-INSTRUCT,2024-06-19 22:39:46,,Remek/Llama-3-8B-Omnibus-1-PL-v01-INSTRUCT,,,26.63,eq-bench_v2_pl,171.0,1,transformers, ,,
109
- 01-ai/Yi-1.5-9B,2024-06-19 23:07:56,,01-ai/Yi-1.5-9B,,,FAILED,eq-bench_pl,FAILED,1,transformers, ,,1.0 questions were parseable (min is 83%)
110
- 01-ai/Yi-1.5-9B-Chat,2024-06-19 23:19:16,,01-ai/Yi-1.5-9B-Chat,,,48.78,eq-bench_v2_pl,163.0,1,transformers, ,,
111
- google/recurrentgemma-9b-it,2024-06-19 23:28:19,,google/recurrentgemma-9b-it,,,52.82,eq-bench_v2_pl,171.0,1,transformers, ,,
112
- THUDM/glm-4-9b,2024-06-19 23:28:41,,THUDM/glm-4-9b,,,FAILED,eq-bench,FAILED,1,transformers, ,,too many values to unpack (expected 2)
113
- THUDM/glm-4-9b-chat,2024-06-19 23:29:01,,THUDM/glm-4-9b-chat,,,FAILED,eq-bench,FAILED,1,transformers, ,,too many values to unpack (expected 2)
114
- NousResearch/Nous-Hermes-2-SOLAR-10.7B,2024-06-19 23:51:07,,NousResearch/Nous-Hermes-2-SOLAR-10.7B,,,49.85,eq-bench_v2_pl,169.0,1,transformers, ,,
115
- TeeZee/Bielik-SOLAR-LIKE-10.7B-Instruct-v0.1,2024-06-20 00:00:02,,TeeZee/Bielik-SOLAR-LIKE-10.7B-Instruct-v0.1,,,35.63,eq-bench_v2_pl,164.0,1,transformers, ,,
116
- upstage/SOLAR-10.7B-Instruct-v1.0,2024-06-20 00:19:48,,upstage/SOLAR-10.7B-Instruct-v1.0,,,57.35,eq-bench_v2_pl,162.0,1,transformers, ,,
117
- upstage/SOLAR-10.7B-v1.0,2024-06-20 01:12:51,,upstage/SOLAR-10.7B-v1.0,,,FAILED,eq-bench_pl,FAILED,1,transformers, ,,1.0 questions were parseable (min is 83%)
118
- tiiuae/falcon-11B,2024-06-20 01:23:54,,tiiuae/falcon-11B,,,42.41,eq-bench_v2_pl,171.0,1,transformers, ,,
119
- lmsys/vicuna-13b-v1.5,2024-06-20 01:43:40,,lmsys/vicuna-13b-v1.5,,,FAILED,eq-bench_pl,FAILED,1,transformers, ,,84.0 questions were parseable (min is 83%)
120
- OPI-PG/Qra-13b,2024-06-20 02:07:48,,OPI-PG/Qra-13b,,,FAILED,eq-bench_pl,FAILED,1,transformers, ,,0.0 questions were parseable (min is 83%)
121
- teknium/OpenHermes-13B,2024-06-20 02:32:04,,teknium/OpenHermes-13B,,,36.85,eq-bench_v2_pl,162.0,1,transformers, ,,
122
- Voicelab/trurl-2-13b-academic,2024-06-20 02:38:04,,Voicelab/trurl-2-13b-academic,,,25.92,eq-bench_v2_pl,162.0,1,transformers, ,,
123
- microsoft/Phi-3-medium-4k-instruct,2024-06-20 02:46:38,,microsoft/Phi-3-medium-4k-instruct,,,57.07,eq-bench_v2_pl,169.0,1,transformers, ,,
124
- Qwen/Qwen1.5-14B-Chat,2024-06-20 02:52:13,,Qwen/Qwen1.5-14B-Chat,,,51.26,eq-bench_v2_pl,160.0,1,transformers, ,,
125
- internlm/internlm2-20b,2024-06-20 09:04:33,,internlm/internlm2-20b,,,FAILED,eq-bench_pl,FAILED,1,transformers, ,,4.0 questions were parseable (min is 83%)
126
- internlm/internlm2-chat-20b,2024-06-20 09:47:11,,internlm/internlm2-chat-20b,,,36.52,eq-bench_v2_pl,170.0,1,transformers, ,,
127
- Qwen/Qwen1.5-32B,2024-06-20 13:25:12,,Qwen/Qwen1.5-32B,,,54.35,eq-bench_v2_pl,170.0,1,transformers, ,,
128
- Qwen/Qwen1.5-32B-Chat,2024-06-20 13:34:52,,Qwen/Qwen1.5-32B-Chat,,,60.69,eq-bench_v2_pl,168.0,1,transformers, ,,
129
- 01-ai/Yi-1.5-34B-Chat,2024-06-20 13:51:30,,01-ai/Yi-1.5-34B-Chat,,,46.32,eq-bench_v2_pl,171.0,1,transformers, ,,
130
- CohereForAI/aya-23-35B,2024-06-20 14:03:07,,CohereForAI/aya-23-35B,,,58.41,eq-bench_v2_pl,171.0,1,transformers, ,,
131
- CohereForAI/c4ai-command-r-v01,2024-06-20 14:14:54,,CohereForAI/c4ai-command-r-v01,,,56.43,eq-bench_v2_pl,171.0,1,transformers, ,,
132
- mistralai/Mixtral-8x7B-Instruct-v0.1,2024-06-20 14:35:28,,mistralai/Mixtral-8x7B-Instruct-v0.1,,,58.64,eq-bench_v2_pl,168.0,1,transformers, ,,
133
- mistralai/Mixtral-8x7B-v0.1,2024-06-20 15:30:24,,mistralai/Mixtral-8x7B-v0.1,,,FAILED,eq-bench_pl,FAILED,1,transformers, ,,10.0 questions were parseable (min is 83%)
134
- Qwen/Qwen2-57B-A14B-Instruct,2024-06-20 16:19:41,,Qwen/Qwen2-57B-A14B-Instruct,,,57.64,eq-bench_v2_pl,171.0,1,transformers, ,,
135
- meta-llama/Meta-Llama-3-70B,2024-06-20 16:59:30,,meta-llama/Meta-Llama-3-70B,,,46.1,eq-bench_v2_pl,145.0,1,transformers, ,,
136
- meta-llama/Meta-Llama-3-70B-Instruct,2024-06-20 17:15:58,,meta-llama/Meta-Llama-3-70B-Instruct,,,71.21,eq-bench_v2_pl,171.0,1,transformers, ,,
137
- Qwen/Qwen1.5-72B,2024-06-20 17:50:17,,Qwen/Qwen1.5-72B,,,53.96,eq-bench_v2_pl,163.0,1,transformers, ,,
138
- Qwen/Qwen1.5-72B-Chat,2024-06-20 18:06:58,,Qwen/Qwen1.5-72B-Chat,,,68.03,eq-bench_v2_pl,171.0,1,transformers, ,,
139
- Qwen/Qwen2-72B,2024-06-20 18:36:22,,Qwen/Qwen2-72B,,,69.75,eq-bench_v2_pl,169.0,1,transformers, ,,
140
- Qwen/Qwen2-72B-Instruct,2024-06-20 18:55:02,,Qwen/Qwen2-72B-Instruct,,,72.07,eq-bench_v2_pl,169.0,1,transformers, ,,
141
- mistralai/Mixtral-8x22B-v0.1,2024-06-21 20:20:37,,mistralai/Mixtral-8x22B-v0.1,,,FAILED,eq-bench_pl,FAILED,1,transformers, ,,34.0 questions were parseable (min is 83%)
142
- mistralai/Mixtral-8x22B-Instruct-v0.1,2024-06-26 23:40:01,,mistralai/Mixtral-8x22B-Instruct-v0.1,,,67.63,eq-bench_v2_pl,171.0,1,transformers, ,,
143
- mistralai/Mixtral-8x22B-v0.1,2024-06-27 01:17:13,,mistralai/Mixtral-8x22B-v0.1,,,FAILED,eq-bench_pl,FAILED,1,transformers, ,,50.0 questions were parseable (min is 83%)
144
- alpindale/WizardLM-2-8x22B,2024-06-27 01:50:42,,alpindale/WizardLM-2-8x22B,,,69.56,eq-bench_v2_pl,171.0,1,transformers, ,,
145
- Bielik_v2.2b,2024-08-24 09:54:33,,speakleash/Bielik-11B-v2.2-Instruct,,,69.05,eq-bench_v2_pl,171.0,1,transformers, ,,
146
- Bielik_v2.1,2024-08-24 10:07:46,,speakleash/Bielik-11B-v2.1-Instruct,,,66.27,eq-bench_v2_pl,155.0,1,transformers, ,,
147
- meta-llama/Meta-Llama-3.1-70B-Instruct,2024-08-24 21:24:39,,meta-llama/Meta-Llama-3.1-70B-Instruct,,,FAILED,eq-bench,FAILED,1,transformers, ,,`rope_scaling` must be a dictionary with two fields, `type` and `factor`, got {'factor': 8.0, 'low_freq_factor': 1.0, 'high_freq_factor': 4.0, 'original_max_position_embeddings': 8192, 'rope_type': 'llama3'}
148
- mistralai/Mistral-Large-Instruct-2407,2024-08-24 21:51:53,,mistralai/Mistral-Large-Instruct-2407,,,78.07,eq-bench_v2_pl,171.0,1,transformers, ,,
149
- meta-llama/Meta-Llama-3.1-70B-Instruct,2024-08-24 22:23:40,,meta-llama/Meta-Llama-3.1-70B-Instruct,,,72.53,eq-bench_v2_pl,171.0,1,transformers, ,,
150
- meta-llama/Meta-Llama-3.1-405B-Instruct-FP8,2024-08-25 20:59:04,openai_api,meta-llama/Meta-Llama-3.1-405B-Instruct-FP8,,,77.23,eq-bench_v2_pl,171.0,1,openai,,,
151
- gpt-3.5-turbo,2024-08-25 21:14:25,openai_api,gpt-3.5-turbo,,,57.7,eq-bench_v2_pl,171.0,1,openai,,,
152
- gpt-4o-mini-2024-07-18,2024-08-25 21:17:34,openai_api,gpt-4o-mini-2024-07-18,,,71.15,eq-bench_v2_pl,171.0,1,openai,,,
153
- gpt-4o-2024-08-06,2024-08-25 21:24:35,openai_api,gpt-4o-2024-08-06,,,75.15,eq-bench_v2_pl,171.0,1,openai,,,
154
- gpt-4-turbo-2024-04-09,2024-08-25 21:31:42,openai_api,gpt-4-turbo-2024-04-09,,,77.77,eq-bench_v2_pl,164.0,1,openai,,,
155
- Bielik_v2.3,2024-09-14 10:40:57,,speakleash/Bielik-11B-v2.3-Instruct,,,70.86,eq-bench_v2_pl,171.0,1,transformers, ,,
156
- PLLuM-12B-nc-chat,2025-02-24 15:02:07,,CYFRAGOVPL/PLLuM-12B-nc-chat,,,49.23,eq-bench_pl,123.0,1,transformers, ,,123.0 questions were parseable (min is 83%)
157
- Llama-PLLuM-8B-instruct,2025-02-24 16:55:16,,CYFRAGOVPL/Llama-PLLuM-8B-instruct,,,43.56,eq-bench_pl,124.0,1,transformers, ,,124.0 questions were parseable (min is 83%)
158
- PLLuM-12B-nc-instruct,2025-02-24 17:38:48,,CYFRAGOVPL/PLLuM-12B-nc-instruct,,,29.50,eq-bench_pl,76.0,1,transformers, ,,76.0 questions were parseable (min is 83%)
159
- PLLuM-12B-chat,2025-02-24 17:56:34,,CYFRAGOVPL/PLLuM-12B-chat,,,57.29,eq-bench_v2_pl,156.0,1,transformers, ,,
160
- PLLuM-12B-instruct,2025-02-24 18:03:06,,CYFRAGOVPL/PLLuM-12B-instruct,,,40.21,eq-bench_v2_pl,154.0,1,transformers, ,,
161
- Llama-PLLuM-8B-chat,2025-02-24 18:40:04,,CYFRAGOVPL/Llama-PLLuM-8B-chat,,,50.97,eq-bench_v2_pl,155.0,1,transformers, ,,
162
- Llama-PLLuM-70B-instruct,2025-02-23 22:45:37,,CYFRAGOVPL/Llama-PLLuM-70B-instruct,,,69.99,eq-bench_v2_pl,171.0,1,transformers, ,,
163
- Llama-PLLuM-70B-chat,2025-02-24 22:32:57,,CYFRAGOVPL/Llama-PLLuM-70B-chat,,,72.99,eq-bench_v2_pl,170.0,1,transformers, ,,
164
- PLLuM-8x7B-nc-chat,2025-02-23 14:33:22,openai_api,CYFRAGOVPL/PLLuM-8x7B-nc-chat,,,47.29,eq-bench_v2_pl,171.0,1,openai,,,
165
- PLLuM-8x7B-nc-instruct,2025-02-23 14:33:22,openai_api,CYFRAGOVPL/PLLuM-8x7B-nc-instruct,,,41.75,eq-bench_v2_pl,171.0,1,openai,,,
166
- PLLuM-8x7B-chat,2025-02-23 14:33:22,openai_api,CYFRAGOVPL/PLLuM-8x7B-chat,,,45.22,eq-bench_v2_pl,171.0,1,openai,,,
167
- PLLuM-8x7B-instruct,2025-02-23 14:33:22,openai_api,CYFRAGOVPL/PLLuM-8x7B-instruct,,,39.55,eq-bench_v2_pl,171.0,1,openai,,,
168
- Qwen2.5-7B-Instruct,2025-03-01 11:49:28,,Qwen/Qwen2.5-7B-Instruct,,,58.58,eq-bench_v2_pl,171.0,1,transformers,,,
169
- Qwen2.5-14B-Instruct,2025-03-01 12:01:56,,Qwen/Qwen2.5-14B-Instruct,,,69.58,eq-bench_v2_pl,170.0,1,transformers,,,
170
- Qwen2.5-1.5B-Instruct,2025-03-01 12:09:18,,Qwen/Qwen2.5-1.5B-Instruct,,,27.79,eq-bench_v2_pl,170.0,1,transformers,,,
171
- phi-4,2025-03-01 12:19:38,,microsoft/phi-4,,,64.37,eq-bench_v2_pl,157.0,1,transformers,,,
172
- glm-4-9b-chat,2025-03-01 12:23:46,,THUDM/glm-4-9b-chat,,,61.79,eq-bench_v2_pl,171.0,1,transformers,,,
173
- openchat-3.6-8b-20240522,2025-03-01 12:29:29,,openchat/openchat-3.6-8b-20240522,,,-2.0090659464796536e+18,eq-bench_v2_pl,170.0,1,transformers,,,
174
- Qwen2.5-32B-Instruct,2025-03-02 14:08:52,,Qwen/Qwen2.5-32B-Instruct,,,71.15,eq-bench_v2_pl,171.0,1,transformers,,,
175
- Qwen2.5-72B-Instruct,2025-03-02 14:25:32,,Qwen/Qwen2.5-72B-Instruct,,,68.89,eq-bench_v2_pl,170.0,1,transformers,,,
176
- Llama-3.1-Nemotron-70B-Instruct-HF,2025-03-02 15:04:25,,nvidia/Llama-3.1-Nemotron-70B-Instruct-HF,,,74.75,eq-bench_pl,133.0,1,transformers,,,133.0 questions were parseable (min is 83%)
177
- Llama-3.2-1B-Instruct,2025-03-02 16:35:24,,meta-llama/Llama-3.2-1B-Instruct,,,20.59,eq-bench_v2_pl,148.0,1,transformers,,,
178
- EuroLLM-9B-Instruct,2025-03-02 16:41:02,,utter-project/EuroLLM-9B-Instruct,,,54.75,eq-bench_v2_pl,169.0,1,transformers,,,
179
- Llama-3.3-70B-Instruct,2025-03-02 16:59:31,,meta-llama/Llama-3.3-70B-Instruct,,,72.86,eq-bench_v2_pl,166.0,1,transformers,,,
180
- Llama-3.2-3B-Instruct,2025-03-02 17:14:17,,meta-llama/Llama-3.2-3B-Instruct,,,46.46,eq-bench_v2_pl,170.0,1,transformers,,,
181
- Qwen2.5-3B-Instruct,2025-03-02 17:26:57,,Qwen/Qwen2.5-3B-Instruct,,,36.08,eq-bench_v2_pl,170.0,1,transformers,,,
182
- Mistral-Small-24B-Instruct-2501,2025-03-02 17:33:14,,mistralai/Mistral-Small-24B-Instruct-2501,,,70.52,eq-bench_v2_pl,171.0,1,transformers,,,
183
- Mistral-Small-Instruct-2409,2025-03-02 17:43:01,,mistralai/Mistral-Small-Instruct-2409,,,72.85,eq-bench_v2_pl,171.0,1,transformers,,,
184
- Mistral-Nemo-Instruct-2407,2025-03-03 10:29:42,,mistralai/Mistral-Nemo-Instruct-2407,,,61.76,eq-bench_v2_pl,171.0,1,transformers,,,
185
- Phi-4-mini-instruct,2025-03-03 13:20:03,,microsoft/Phi-4-mini-instruct,,,50.82,eq-bench_v2_pl,170.0,1,transformers,,,
186
- Mistral-Large-Instruct-2411,2025-03-07 12:17:17,,mistralai/Mistral-Large-Instruct-2411,,,77.29,eq-bench_v2_pl,171.0,1,transformers,,,
187
- Bielik-11B-v2.5-Instruct,2025-05-01 19:27:42,,speakleash/Bielik-11B-v2.5-Instruct,,,72.42,eq-bench_v2_pl,170.0,1,transformers,,,
188
- Bielik-1.5B-v3.0-Instruct,2025-05-04 00:28:45,,speakleash/Bielik-1.5B-v3.0-Instruct,,,18.99,eq-bench_pl,125.0,1,transformers,,,125.0 questions were parseable (min is 83%)
189
- Bielik-4.5B-v3.0-Instruct,2025-05-04 16:16:42,,speakleash/Bielik-4.5B-v3.0-Instruct,,,56.21,eq-bench_v2_pl,163.0,1,transformers,,,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
metadata.json DELETED
@@ -1,355 +0,0 @@
1
- {
2
- "Azurro/APT3-1B-Base": 1,
3
- "HuggingFaceH4/zephyr-7b-alpha": 7,
4
- "Voicelab/trurl-2-13b-academic": 13,
5
- "HuggingFaceH4/zephyr-7b-beta": 7,
6
- "Voicelab/trurl-2-7b": 7,
7
- "mistralai/Mistral-7B-v0.1": 7,
8
- "mistralai/Mistral-7B-v0.1,peft=lora/output/mistral-7b-v0.1-lora-pl/checkpoint-400/adapter_model": 7,
9
- "mistralai/Mistral-7B-v0.1,peft=lora/output/mistral-7b-v0.1-lora-pl/checkpoint-200/adapter_model": 7,
10
- "mistralai/Mistral-7B-v0.1,load_in_8bit=True": 7,
11
- "Nondzu/zephyr-speakleash-007-pl-8192-32-16-0.05": 7,
12
- "openchat/openchat-3.5-0106": 7,
13
- "mistralai/Mistral-7B-v0.1,peft=lora/output/mistral-7b-v0.1-lora-pl/checkpoint-2000/adapter_model": 7,
14
- "mistralai/Mistral-7B-v0.1,peft=lora/output/mistral-7b-v0.1-lora-pl/checkpoint-2200/adapter_model": 7,
15
- "mistralai/Mistral-7B-Instruct-v0.1": 7,
16
- "APT3-1B-Instruct-e1": 1,
17
- "mistralai/Mistral-7B-v0.1,peft=lora/output/mistral-7b-v0.1-lora-pl/checkpoint-800/adapter_model": 7,
18
- "mistralai/Mistral-7B-v0.1,peft=lora/output/mistral-7b-v0.1-lora-pl/checkpoint-600/adapter_model": 7,
19
- "APT3-1B-Instruct-e2": 1,
20
- "mistralai/Mistral-7B-v0.1,load_in_4bit=True": 7,
21
- "speakleash/3-5B_high_base/epoch_2_hf": 3.5,
22
- "speakleash/3-5B_high_base/epoch_1_hf": 3.5,
23
- "speakleash/3-5B_high_base/epoch_0_hf": 3.5,
24
- "speakleash/7B_high_base/epoch_1_hf": 7,
25
- "speakleash/7B_high_base/epoch_0_hf": 7,
26
- "Nondzu/zephyr-speakleash-010-pl-3072-32-16-0.01": 7,
27
- "google/mt5-xl": 3.7,
28
- "speakleash/7B_high_sft/epoch_2_base/epoch_2_hf": 7,
29
- "OPI-PG/Qra-1b": 1,
30
- "OPI-PG/Qra-13b": 13,
31
- "OPI-PG/Qra-7b": 7,
32
- "teknium/OpenHermes-2.5-Mistral-7B": 7,
33
- "openchat/openchat-3.5-1210": 7,
34
- "speakleash/apt3-1B_base/apt3-1B-sequential_hf": 1,
35
- "speakleash/apt3-1B_base/apt3-1B-shuffled_hf": 1,
36
- "speakleash/1B_high_base/like_apt3-1B_hf": 1,
37
- "speakleash/1B_high_base/epoch_3_hf": 1,
38
- "speakleash/7B_high_sft/epoch_1_base/epoch_2_hf": 7,
39
- "speakleash/7B_high_sft/epoch_1_base/epoch_1_hf": 7,
40
- "speakleash/7B_high_sft/epoch_0_base/epoch_0_hf": 7,
41
- "speakleash/7B_high_sft/epoch_2_base/epoch_1_hf": 7,
42
- "speakleash/3-5B_high_sft/epoch_3_base/epoch_2_hf": 3.5,
43
- "allegro/plt5-large": 0.82,
44
- "internlm/internlm2-7b": 7,
45
- "sdadas/polish-gpt2-xl": 1.67,
46
- "speakleash/1B_4k_high_sft/epoch_3_base/epoch_1_hf": 1,
47
- "speakleash/mistral-PL_7B/epoch_0_hf": 7,
48
- "speakleash/1B_high_sft/epoch_3_base/epoch_1_hf": 1,
49
- "speakleash/polish-mistral-7B/epoch_0_hf": 7,
50
- "speakleash/3-5B_high_sft/epoch_0_base/epoch_2_hf": 3.5,
51
- "speakleash/3-5B_high_sft/epoch_0_base/epoch_1_hf": 3.5,
52
- "speakleash/3-5B_high_sft/epoch_0_base/epoch_0_hf": 3.5,
53
- "speakleash/7B_high_base/epoch_2_hf": 7,
54
- "speakleash/10B-4k_high_sft/epoch_3_base/epoch_1_hf": 10,
55
- "speakleash/3-5B_high_base/epoch_3_hf": 3.5,
56
- "microsoft/phi-2": 2.7,
57
- "RWKV/HF_v5-Eagle-7B": 7,
58
- "mistralai/Mistral-7B-Instruct-v0.2": 7,
59
- "speakleash/llama-apt3-7B/only-spi-e0_hf": 7,
60
- "speakleash/llama-apt3-7B/spkl-only_sft/e4_hf": 7,
61
- "speakleash/llama-apt3-7B/spkl-only_sft/e5_hf": 7,
62
- "speakleash/llama-apt3-7B/spkl-only_sft/e3_hf": 7,
63
- "speakleash/llama-apt3-7B/spkl-only_sft/e2_hf": 7,
64
- "meta-llama/Llama-2-7b-hf": 7,
65
- "meta-llama/Llama-2-7b-chat-hf": 7,
66
- "internlm/internlm2-chat-7b": 7,
67
- "internlm/internlm2-base-7b": 7,
68
- "internlm/internlm2-1_8b": 1.8,
69
- "internlm/internlm2-chat-1_8b": 1.8,
70
- "speakleash/mistral-apt3-7B/only-spi_sft/e0_hf": 7,
71
- "speakleash/mistral-apt3-7B/only-spi-e0_hf": 7,
72
- "speakleash/mistral-apt3-7B/apt3-e0_hf": 7,
73
- "speakleash/mistral-apt3-7B/spi-e0_hf": 7,
74
- "speakleash/mistral-apt3-7B/spkl_sft_v2/e4_hf": 7,
75
- "speakleash/mistral-apt3-7B/spkl_sft_v2/e5_hf": 7,
76
- "speakleash/mistral-apt3-7B/spkl_sft_v2/e3_hf": 7,
77
- "speakleash/mistral-apt3-7B/spkl_sft_v2/e2_hf": 7,
78
- "speakleash/mistral-apt3-7B/only-spi_sft_v2/e4_bb62a5b8": 7,
79
- "speakleash/mistral-apt3-7B/only-spi_sft_v2/e6_6b0aa8d6": 7,
80
- "speakleash/mistral-apt3-7B/only-spi_sft_v2/e3_f8b5e568": 7,
81
- "speakleash/mistral-apt3-7B/only-spi_sft_v2/e2_3b7fc53e": 7,
82
- "speakleash/mistral-apt3-7B/only-spi_sft_v2/e5_f75cbc76": 7,
83
- "speakleash/mistral-apt3-7B/only-spi_sft_v2/e7_642f3822": 7,
84
- "speakleash/mistral-apt3-7B/spkl_sft/e3_17ef3119": 7,
85
- "speakleash/mistral-apt3-7B/spkl_sft/e2_7dc8df86": 7,
86
- "google/gemma-7b": 7,
87
- "google/gemma-7b-it": 7,
88
- "SOTA FT HerBERT (large)": 1,
89
- "Baseline (majority class)": 0,
90
- "SOTA FT Polish RoBERTa": 1,
91
- "SOTA FT ULMFiT-SP-PL": 0.1,
92
- "speakleash/llama-apt3-13B/spkl-plus/e0_caa5ad79": 13,
93
- "speakleash/llama-apt3-13B/spkl-only/e0_cc0931c5": 13,
94
- "eryk-mazus/polka-1.1b": 1.1,
95
- "berkeley-nest/Starling-LM-7B-alpha": 7,
96
- "Remek/OpenChat3.5-0106-Spichlerz-Inst-001": 7,
97
- "speakleash/mistral_7B-v2/spkl-all-e2_5bd6027d": 7,
98
- "speakleash/mistral_7B-v2/spkl-all-e0_8cf0987d": 7,
99
- "speakleash/mistral_7B-v2/spkl-only-e0_ef715d74": 7,
100
- "speakleash/mistral_7B-v2/spkl-only-e1_333887a5": 7,
101
- "speakleash/mistral_7B-v2/spkl-all-e1_0b514ce9": 7,
102
- "speakleash/mistral_7B-v2/spkl-only-e2_5dac700d": 7,
103
- "speakleash/llama-apt3-13B/spkl-only_e0_sft/ext_e3_23b6bc9b": 13,
104
- "speakleash/llama-apt3-13B/spkl-only_e0_sft/spkl_e4_e3a666b1": 13,
105
- "speakleash/llama-apt3-13B/spkl-only_e0_sft/spkl_e3_45ef6b63": 13,
106
- "speakleash/llama-apt3-13B/spkl-only_e0_sft/spkl_e5_bf95416b": 13,
107
- "speakleash/llama-apt3-13B/spkl-only_e0_sft/ext_e2_f7606252": 13,
108
- "speakleash/llama-apt3-13B/spkl-only_e0_sft/spkl_e2_898ae6c6": 13,
109
- "speakleash/apt4-1B/spkl-only-e3_756856c4": 1,
110
- "speakleash/apt4-1B/spkl-all-e0_7f6a991e": 1,
111
- "speakleash/apt4-1B/spkl-only-e2_969e76b4": 1,
112
- "speakleash/apt4-1B/spkl-all-e2_bfb44ded": 1,
113
- "speakleash/apt4-1B/spkl-all-e3_063753f9": 1,
114
- "speakleash/apt4-1B/spkl-all-e1_74a293c8": 1,
115
- "speakleash/apt4-1B/spkl-only-e0_b9c8bb39": 1,
116
- "speakleash/apt4-1B/spkl-only-e1_fea4b41b": 1,
117
- "upstage/SOLAR-10.7B-Instruct-v1.0": 10.7,
118
- "upstage/SOLAR-10.7B-v1.0": 10.7,
119
- "speakleash/mistral_7B-v2/spkl-all_sft/e1_base/spkl-all-e1_9aee511a": 7,
120
- "speakleash/mistral_7B-v2/spkl-all_sft/e1_base/spkl-all-e0_dd9d2777": 7,
121
- "speakleash/mistral_7B-v2/spkl-only_sft/e1_base/spkl-only-e1_d0ac34b7": 7,
122
- "speakleash/mistral_7B-v2/spkl-only_sft/e1_base/spkl-only-e0_9eea5944": 7,
123
- "Remek/Kruk-7B-SP-001": 7,
124
- "TinyLlama/TinyLlama-1.1B-Chat-v1.0": 1.1,
125
- "internlm/internlm2-chat-7b-sft": 7,
126
- "speakleash/mistral_7B-v2/spkl-all_sft/e1_base/spkl-all-e3_72a6c52a": 7,
127
- "speakleash/mistral_7B-v2/spkl-only_sft/e1_base/spkl-only-e3_08a0fd89": 7,
128
- "speakleash/mistral_7B-v2/spkl-all_sft/e1_base/spkl-all-e2_0a1a62c0": 7,
129
- "speakleash/mistral_7B-v2/spkl-only_sft/e1_base/spkl-only-e2_a7c66ac5": 7,
130
- "speakleash/mistral-apt3-7B_v2/spkl-only_2e5-e0_116fa2bc": 7,
131
- "speakleash/mistral-apt3-7B_v2/spkl-only_7e6-e0_8544bbd3": 7,
132
- "speakleash/mistral-apt3-7B_v2/spkl-only_2e5-e1_013bd434": 7,
133
- "speakleash/mistral-apt3-7B_v2/spkl-only-e1_87bfffac": 7,
134
- "speakleash/mistral-apt3-7B_v2/spkl-only-e2_939d897f": 7,
135
- "speakleash/mistral-apt3-7B_v2/spkl-only-e0_2a5be0dc": 7,
136
- "speakleash/mistral-apt3-7B_v2/spkl-only_sft/e1_base_2e5/spkl-only-e1_0303962d": 7,
137
- "speakleash/mistral-apt3-7B_v2/spkl-only_sft/e1_base_2e5/spkl-only-e0_f4aaf490": 7,
138
- "speakleash/mistral-apt3-7B_v2/spkl-only_sft/e0_base_2e5/spkl-only-e0_009b090e": 7,
139
- "speakleash/mistral-apt3-7B_v2/spkl-only_sft/e0_base_2e5/spkl-only-e1_91aae327": 7,
140
- "speakleash/mistral_7B-v2/spkl-only_sft_v2/e1_base/spkl-only_7e6w-e1_14d52992": 7,
141
- "speakleash/mistral_7B-v2/spkl-only_sft_v2/e1_base/spkl-only_7e6w-e2_72422a32": 7,
142
- "speakleash/mistral_7B-v2/spkl-only_sft_v2/e1_base/spkl-only-e2_dcb87efc": 7,
143
- "speakleash/mistral_7B-v2/spkl-only_sft_v2/e1_base/spkl-only_7e6-e2_04382c38": 7,
144
- "speakleash/mistral_7B-v2/spkl-only_sft_v2/e1_base/spkl-only_7e6-e3_860889b1": 7,
145
- "speakleash/mistral_7B-v2/spkl-only_sft_v2/e1_base/spkl-only_7e6w-e3_78cf3243": 7,
146
- "speakleash/mistral_7B-v2/spkl-only_sft_v2/e1_base/spkl-only_9e7-e0_27275908": 7,
147
- "speakleash/mistral_7B-v2/spkl-only_sft_v2/e1_base/spkl-only-e0_d31a18b7": 7,
148
- "speakleash/mistral_7B-v2/spkl-only_sft_v2/e1_base/spkl-only_7e6-e0_c26126c8": 7,
149
- "speakleash/mistral_7B-v2/spkl-only_sft_v2/e1_base/spkl-only-e3_a5833b75": 7,
150
- "speakleash/mistral_7B-v2/spkl-only_sft_v2/e1_base/spkl-only_7e6w-e0_6c834bf7": 7,
151
- "speakleash/mistral_7B-v2/spkl-only_sft_v2/e1_base/spkl-only_7e6-e1_87b7c12f": 7,
152
- "speakleash/mistral_7B-v2/spkl-only_sft_v2/e1_base/spkl-only_9e7-e2_5ce06dd2": 7,
153
- "speakleash/mistral_7B-v2/spkl-only_sft_v2/e1_base/spkl-only_9e7-e1_561ac4bb": 7,
154
- "speakleash/mistral_7B-v2/spkl-only_sft_v2/e1_base/spkl-only-e1_392d55d9": 7,
155
- "speakleash/mistral_7B-v2/spkl-all_sft_v2/e1_base/spkl-all_2e6-e2_db0cd739": 7,
156
- "speakleash/mistral_7B-v2/spkl-all_sft_v2/e1_base/spkl-all_2e6-e3_4960543c": 7,
157
- "speakleash/mistral_7B-v2/spkl-all_sft_v2/e1_base/spkl-all_2e6-e0_1b65c3ac": 7,
158
- "speakleash/mistral_7B-v2/spkl-all_sft_v2/e1_base/spkl-all_2e6-e1_70c70cc6": 7,
159
- "speakleash/mistral-apt3-7B-v2/spkl-only_sft/e1_base/spkl-only-e2_3a071212": 7,
160
- "speakleash/mistral-apt3-7B-v2/spkl-only_sft/e1_base/spkl-only-e0_6dc2e217": 7,
161
- "speakleash/mistral-apt3-7B-v2/spkl-only_sft/e1_base/spkl-only-e1_46610eb1": 7,
162
- "speakleash/mistral-apt3-7B-v2/spkl-only_sft-weighted/e1_base/spkl-only-e0_e79dcb9f": 7,
163
- "speakleash/mistral-apt3-7B-v2/spkl-only_sft-weighted/e1_base/spkl-only-e1_10a78140": 7,
164
- "Remek/OpenChat3.5-0106-Spichlerz-Bocian": 7,
165
- "alpindale/Mistral-7B-v0.2-hf": 7,
166
- "Azurro/APT3-275M-Base": 0.3,
167
- "szymonrucinski/Curie-7B-v1": 7,
168
- "speakleash/mistral-apt3-7B/spkl-all_sft_v4/e0_base/spkl-all-e0-lr5e5_a47a2047": 7,
169
- "speakleash/mistral-apt3-7B/spkl-all_sft_v4/e0_base/spkl-all-e1_1774eb92": 7,
170
- "speakleash/mistral-apt3-7B/spkl-all_sft_v4/e0_base/spkl-all-e0-lr2e6_71659188": 7,
171
- "speakleash/mistral-apt3-7B/spkl-all_sft_v4/e0_base/spkl-all-e0_35239ee5": 7,
172
- "speakleash/mistral-apt3-7B/spkl-all_sft_v4/e0_base/spkl-all-e2_5257da77": 7,
173
- "speakleash/mistral-apt3-7B/spkl-all_sft_v4/e0_base/spkl-all-e3_5ca4603b": 7,
174
- "speakleash/mistral-apt3-7B/spkl-only_sft_v3/e0_base/spkl-only-e3_90666ab5": 7,
175
- "speakleash/mistral-apt3-7B/spkl-only_sft_v3/e0_base/spkl-only-e1_4e524cad": 7,
176
- "speakleash/mistral-apt3-7B/spkl-only_sft_v3/e0_base/spkl-only-e0_40cdde38": 7,
177
- "speakleash/mistral-apt3-7B/spkl-all_sft_v3/e0_base/spkl-all-e0_67274d1b": 7,
178
- "speakleash/mistral-apt3-7B/spkl-all_sft_v3/e0_base/spkl-all-e1_695e8b44": 7,
179
- "speakleash/mistral-apt3-7B/spkl-all_sft_v3/e0_base/spkl-all-e2_a9e6a2f0": 7,
180
- "speakleash/mistral-apt3-7B/spkl-all_sft_v3/e0_base/spkl-all-e3_2ff00c2b": 7,
181
- "speakleash/mistral-apt3-7B/spkl_sft_v2/e1_4067e14e": 7,
182
- "speakleash/mistral-apt3-7B/spkl_sft_v2/e0_6214300a": 7,
183
- "speakleash/mistral-apt3-7B/only-spi_sft_v2/e1_596202b3": 7,
184
- "speakleash/mistral-apt3-7B/only-spi_sft_v2/e0_c4ea165e": 7,
185
- "speakleash/mistral-apt3-7B/spkl-only_sft_v4/e0_base/spkl-only-e0_c00001c4": 7,
186
- "speakleash/mistral-apt3-7B/spkl-only_sft_v4/e0_base/spkl-only-e3_2bcd3961": 7,
187
- "speakleash/mistral-apt3-7B/spkl-only_sft_v4/e0_base/spkl-only-e1_f2730438": 7,
188
- "speakleash/mistral-apt3-7B/spkl-only_sft_v4/e0_base/spkl-only-e2_f39a22a2": 7,
189
- "speakleash/mistral-apt3-7B/spkl-all_sft_v3-lr2/e0_base/spkl-all-e0-lr6_376eb1d5": 7,
190
- "speakleash/mistral-apt3-7B/spkl-all_sft_v3-lr2/e0_base/spkl-all-e0-lr5_54b6226f": 7,
191
- "speakleash/mistral-apt3-7B/spkl-only_sft_v3/e0_base/spkl-only-e2_f036d0fd": 7,
192
- "speakleash/mistral-apt3-7B_v2/spkl-only_7e5-e0_e143e6ce": 7,
193
- "Nexusflow/Starling-LM-7B-beta": 7,
194
- "RWKV/v5-Eagle-7B-HF": 7,
195
- "speakleash/mistral-apt3-7B_v2/spkl-only_sft/e0_base_2e5/spkl-only-e2_afcfbe2d": 7,
196
- "speakleash/mistral-apt3-7B_v2/spkl-only_sft/e0_base_2e5/spkl-only-e3_6908149d": 7,
197
- "speakleash/mistral-apt3-7B_v2/spkl-only_sft/e1_base_2e5/spkl-only-e2_d5a874b1": 7,
198
- "speakleash/mistral-apt3-7B_v2/spkl-only_sft/e1_base_2e5/spkl-only-e3_1be744af": 7,
199
- "speakleash/mistral-apt3-7B_v2/spkl-only_sft/e1_base_2e5_v6/spkl-only-e0_4efab00a": 7,
200
- "speakleash/mistral-apt3-7B_v2/spkl-only_sft/e1_base_2e5_v6/spkl-only-e1_1b706f85": 7,
201
- "speakleash/mistral-apt3-7B_v2/spkl-only_sft/e1_base_2e5_v6/spkl-only-e2_f86f7889": 7,
202
- "speakleash/mistral-apt3-7B_v2/spkl-only_sft/e1_base_2e5_v6/spkl-only-e3_13641875": 7,
203
- "speakleash/mistral-apt3-7B_v2/spkl-only_sft/e1_base_2e5_v7w/spkl-only-e0_1f5f4968": 7,
204
- "speakleash/mistral-apt3-7B_v2/spkl-only_sft/e1_base_2e5_v7w/spkl-only-e1_50de9812": 7,
205
- "speakleash/mistral-apt3-7B_v2/spkl-only_sft/e1_base_2e5_v7w/spkl-only-e2_dd38abb9": 7,
206
- "speakleash/mistral-apt3-7B_v2/spkl-only_sft/e1_base_2e5_v7w/spkl-only-e3_36236df3": 7,
207
- "speakleash/mistral-apt3-7B_v2/spkl-only_sft/e1_base_2e5_v8w/spkl-only-e0_e185fb84": 7,
208
- "speakleash/mistral-apt3-7B_v2/spkl-only_sft/e1_base_2e5_v8w/spkl-only-e1_fb5d327f": 7,
209
- "speakleash/mistral-apt3-7B_v2/spkl-only_sft/e1_base_2e5_v8w/spkl-only-e2_dd71be08": 7,
210
- "speakleash/mistral_7B-v2/spkl-all_sft_v2/e1_base/spkl-all_3e6_v8w-e0_d2d8a320": 7,
211
- "speakleash/mistral_7B-v2/spkl-all_sft_v2/e1_base/spkl-all_3e6_v8w-e1_cd7c61a1": 7,
212
- "speakleash/mistral_7B-v2/spkl-all_sft_v2/e1_base/spkl-all_v8wa_9e6-e0_32c27aa5": 7,
213
- "speakleash/mistral_7B-v2/spkl-all_sft_v2/e1_base/spkl-all_v8wa_9e6-e1_518b38ca": 7,
214
- "speakleash/mistral_7B-v2/spkl-all_sft_v2/e1_base/spkl-all_v8wa_9e6-e2_84fb05a1": 7,
215
- "speakleash/mistral_7B-v2/spkl-only_sft_v2/e1_base/spkl-only_v10wa_3e6-e0_2ba34bd9": 7,
216
- "speakleash/mistral_7B-v2/spkl-only_sft_v2/e1_base/spkl-only_v10wa_3e6-e1_35ecfaaa": 7,
217
- "speakleash/mistral_7B-v2/spkl-only_sft_v2/e1_base/spkl-only_v10wa_3e6-e2_920b5c3f": 7,
218
- "speakleash/mistral_7B-v2/spkl-only_sft_v2/e1_base/spkl-only_v10wa_7e6-e0_d137146f": 7,
219
- "speakleash/mistral_7B-v2/spkl-only_sft_v2/e1_base/spkl-only_v10wa_7e6-e1_5bddbd74": 7,
220
- "speakleash/mistral_7B-v2/spkl-only_sft_v2/e1_base/spkl-only_v10wa_7e6-e2_bbc67e89": 7,
221
- "speakleash/mistral_7B-v2/spkl-only_sft_v2/e1_base/spkl-only_v10wa_7e6-e2b_53f28c53": 7,
222
- "speakleash/mistral_7B-v2/spkl-only_sft_v2/e1_base/spkl-only_v10wa_7e6-e3_9931f988": 7,
223
- "speakleash/mistral_7B-v2/spkl-only_sft_v2/e1_base/spkl-only_v10wa_7e6-e4_0bc82b61": 7,
224
- "speakleash/mistral_7B-v2/spkl-only_sft_v2/e1_base/spkl-only_v8wa_9e6-e0_8aa4a0ae": 7,
225
- "speakleash/mistral_7B-v2/spkl-only_sft_v2/e1_base/spkl-only_v8wa_9e6-e1_57357d6c": 7,
226
- "speakleash/mistral_7B-v2/spkl-only_sft_v2/e1_base/spkl-only_v8wa_9e6-e2_5eb84913": 7,
227
- "speakleash/mistral_7B-v2/spkl-only_sft_v2/e1_base/spkl-only_v9wa_3e6-e0_ae5e354c": 7,
228
- "speakleash/mistral_7B-v2/spkl-only_sft_v2/e1_base/spkl-only_v9wa_7e6-e0_724b2d41": 7,
229
- "speakleash/mistral_7B-v2/spkl-only_sft_v2/e1_base/spkl-only_v9wa_7e6-e1_d962636b": 7,
230
- "speakleash/Bielik-7B-v0.1": 7,
231
- "NousResearch/Nous-Hermes-2-SOLAR-10.7B": 10.7,
232
- "Qwen/Qwen1.5-7B-Chat": 7,
233
- "THUDM/chatglm3-6b-base": 6,
234
- "THUDM/chatglm3-6b": 6,
235
- "TeeZee/Bielik-SOLAR-LIKE-10.7B-Instruct-v0.1": 10.7,
236
- "google/gemma-1.1-2b-it": 2,
237
- "meta-llama/Meta-Llama-3-8B-Instruct": 8,
238
- "meta-llama/Meta-Llama-3-8B-Instruct,max_length=4096": 8,
239
- "meta-llama/Meta-Llama-3-8B": 8,
240
- "meta-llama/Meta-Llama-3-8B,max_length=4096": 8,
241
- "microsoft/WizardLM-2-7B": 7,
242
- "speakleash/mistral_7B-v2/spkl-only_sft_v2/e1_base/spkl-only_v10wa4_9e6-e0_193ad881": 7,
243
- "speakleash/mistral_7B-v2/spkl-only_sft_v2/e1_base/spkl-only_v10wa4_9e6-e1_f40e0808": 7,
244
- "speakleash/Bielik-7B-Instruct-v0.1": 7,
245
- "speakleash/mistral_7B-v3/spkl-only_sft_v0/e0_base/spkl-only_v11wa_9e6-e0_fe38d62e": 7,
246
- "speakleash/mistral_7B-v3/spkl-only_sft_v0/e0_base/spkl-only_v11wa_9e6-e1_6f84698e": 7,
247
- "speakleash/mistral_7B-v3/spkl-only_sft_v0/e0_base/spkl-only_v11wap_9e6-e0_5c6927dd": 7,
248
- "speakleash/mistral_7B-v3/spkl-only_sft_v0/e0_base/spkl-only_v11wap_9e6-e1_1d6755a9": 7,
249
- "speakleash/mistral_7B-v3/spkl-only_v0-e0_b93294c8": 7,
250
- "speakleash/mistral_7B-v3/spkl-only_v2-e0_e5547fd5": 7,
251
- "speakleash/Bielik-7B-Instruct-v0.1-GPTQ,autogptq=True": 7,
252
- "speakleash/Bielik-7B-Instruct-v0.1,load_in_4bit=True": 7,
253
- "speakleash/Test-v02-ep3": 7,
254
- "speakleash/mistral_7B-v3/spkl-only_v2-e1.34500_a9c75816": 7,
255
- "CohereForAI/c4ai-command-r-v01,max_length=4096": 35,
256
- "Qwen/Qwen1.5-14B-Chat": 14,
257
- "Remek/Llama-3-8B-Omnibus-1-PL-v01-INSTRUCT": 8,
258
- "Remek/Llama-3-8B-Omnibus-1-PL-v01-INSTRUCT,max_length=4096": 8,
259
- "internlm/internlm2-20b,max_length=4096": 20,
260
- "internlm/internlm2-chat-20b,max_length=4096": 20,
261
- "lex-hue/Delexa-7b": 7,
262
- "lmsys/vicuna-13b-v1.5": 13,
263
- "maciek-pioro/Mixtral-8x7B-v0.1-pl,max_length=4096": 46.7,
264
- "mistralai/Mixtral-8x7B-Instruct-v0.1,max_length=4096": 46.7,
265
- "mistralai/Mixtral-8x7B-v0.1,max_length=4096": 46.7,
266
- "speakleash/Test-001-wiki": 7,
267
- "speakleash/Test-002": 7,
268
- "teknium/OpenHermes-13B": 13,
269
- "meta-llama/Meta-Llama-3-70B-Instruct,max_length=4096": 70,
270
- "meta-llama/Meta-Llama-3-70B,max_length=4096": 70,
271
- "mistralai/Mixtral-8x22B-Instruct-v0.1,max_length=4096": 141,
272
- "mistralai/Mixtral-8x22B-v0.1,max_length=4096": 141,
273
- "Qwen/Qwen1.5-14B-Chat,max_length=4096": 14,
274
- "Qwen/Qwen1.5-32B-Chat,max_length=4096": 32,
275
- "Qwen/Qwen1.5-72B-Chat,max_length=4096": 72,
276
- "Qwen/Qwen1.5-32B,max_length=4096": 32,
277
- "Qwen/Qwen1.5-72B,max_length=4096": 72,
278
- "Qwen/Qwen1.5-7B": 7,
279
- "Qwen/Qwen2-0.5B-Instruct": 0.5,
280
- "Qwen/Qwen2-0.5B": 0.5,
281
- "Qwen/Qwen2-1.5B-Instruct": 1.5,
282
- "Qwen/Qwen2-1.5B": 1.5,
283
- "Qwen/Qwen2-7B-Instruct": 7,
284
- "Qwen/Qwen2-7B": 7,
285
- "model=gpt-3.5-turbo-instruct": 20,
286
- "model=gpt-4-turbo-2024-04-09": 1000,
287
- "01-ai/Yi-1.5-6B-Chat": 6,
288
- "01-ai/Yi-1.5-6B": 6,
289
- "01-ai/Yi-1.5-9B-Chat": 9,
290
- "01-ai/Yi-1.5-9B": 9,
291
- "CohereForAI/aya-23-35B,max_length=4096": 35,
292
- "CohereForAI/aya-23-8B": 8,
293
- "NousResearch/Hermes-2-Pro-Llama-3-8B": 8,
294
- "NousResearch/Hermes-2-Theta-Llama-3-8B": 8,
295
- "Remek/OpenChat-3.5-0106-PL-Omnibusv2": 7,
296
- "mistralai/Mistral-7B-Instruct-v0.3": 7,
297
- "mistralai/Mistral-7B-v0.3": 7,
298
- "nvidia/Llama3-ChatQA-1.5-8B": 8,
299
- "openchat/openchat-3.5-0106-gemma": 7,
300
- "openchat/openchat-3.6-8b-20240522": 8,
301
- "tiiuae/falcon-11B": 11,
302
- "mlabonne/NeuralDaredevil-8B-abliterated": 8,
303
- "01-ai/Yi-1.5-34B-Chat,max_length=4096": 34,
304
- "Qwen/Qwen2-57B-A14B-Instruct,max_length=4096": 57,
305
- "Qwen/Qwen2-72B-Instruct,max_length=4096": 72,
306
- "Qwen/Qwen2-72B,max_length=4096": 72,
307
- "THUDM/glm-4-9b-chat": 9,
308
- "THUDM/glm-4-9b": 9,
309
- "google/recurrentgemma-9b-it": 9,
310
- "microsoft/Phi-3-medium-4k-instruct,max_length=4096": 14,
311
- "microsoft/Phi-3-mini-4k-instruct": 3.8,
312
- "microsoft/Phi-3-small-8k-instruct": 7.4,
313
- "ssmits/Falcon2-5.5B-Polish": 5.5,
314
- "alpindale/WizardLM-2-8x22B,max_length=4096": 141,
315
- "dreamgen/WizardLM-2-7B": 7,
316
- "mistralai/Mistral-Large-Instruct-2407": 123,
317
- "meta-llama/Meta-Llama-3.1-70B-Instruct": 70,
318
- "meta-llama/Meta-Llama-3.1-405B-Instruct-FP8": 405,
319
- "speakleash/Bielik-11B-v2.0-Instruct": 11,
320
- "speakleash/Bielik-11B-v2.2-Instruct": 11,
321
- "speakleash/Bielik-11B-v2.1-Instruct": 11,
322
- "speakleash/Bielik-11B-v2.3-Instruct": 11,
323
- "CYFRAGOVPL/PLLuM-12B-nc-chat": 12,
324
- "CYFRAGOVPL/PLLuM-12B-chat": 12,
325
- "CYFRAGOVPL/PLLuM-12B-instruct": 12,
326
- "CYFRAGOVPL/Llama-PLLuM-8B-instruct": 8,
327
- "CYFRAGOVPL/PLLuM-12B-nc-instruct": 12,
328
- "CYFRAGOVPL/Llama-PLLuM-8B-chat": 8,
329
- "CYFRAGOVPL/PLLuM-8x7B-nc-chat": 46.7,
330
- "CYFRAGOVPL/PLLuM-8x7B-nc-instruct": 46.7,
331
- "CYFRAGOVPL/PLLuM-8x7B-chat": 46.7,
332
- "CYFRAGOVPL/PLLuM-8x7B-instruct": 46.7,
333
- "CYFRAGOVPL/Llama-PLLuM-70B-chat": 70,
334
- "CYFRAGOVPL/Llama-PLLuM-70B-instruct": 70,
335
- "Qwen/Qwen2.5-7B-Instruct": 7,
336
- "Qwen/Qwen2.5-14B-Instruct": 14,
337
- "Qwen/Qwen2.5-1.5B-Instruct": 1.5,
338
- "microsoft/phi-4": 14.7,
339
- "Qwen/Qwen2.5-32B-Instruct": 32,
340
- "Qwen/Qwen2.5-72B-Instruct": 72,
341
- "nvidia/Llama-3.1-Nemotron-70B-Instruct-HF": 70,
342
- "meta-llama/Llama-3.2-1B-Instruct": 1,
343
- "utter-project/EuroLLM-9B-Instruct": 9,
344
- "mistralai/Mistral-Small-Instruct-2409": 22.2,
345
- "mistralai/Mistral-Small-24B-Instruct-2501": 24,
346
- "meta-llama/Llama-3.3-70B-Instruct": 70,
347
- "meta-llama/Llama-3.2-3B-Instruct": 3,
348
- "Qwen/Qwen2.5-3B-Instruct": 3,
349
- "mistralai/Mistral-Nemo-Instruct-2407": 12,
350
- "microsoft/Phi-4-mini-instruct": 4,
351
- "mistralai/Mistral-Large-Instruct-2411": 123,
352
- "speakleash/Bielik-11B-v2.5-Instruct": 11,
353
- "speakleash/Bielik-4.5B-v3.0-Instruct": 4.5,
354
- "speakleash/Bielik-1.5B-v3.0-Instruct": 1.5
355
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
plot_results.py CHANGED
@@ -2,90 +2,78 @@ import pandas as pd
2
  import matplotlib.pyplot as plt
3
  import numpy as np
4
  import json
5
- import csv
6
 
7
- def create_performance_plot(csv_path='benchmark_results.csv', metadata_path='metadata.json'):
8
  # Define whitelist of interesting models (partial matches)
9
  WHITELIST = [
10
- 'Meta-Llama-3.1-70B-Instruct'
 
 
11
  ]
12
 
13
- # Read the benchmark results with error handling for inconsistent rows
14
- valid_rows = []
15
- expected_fields = 14 # Number of expected fields in each row
16
-
17
- with open(csv_path, 'r') as f:
18
- reader = csv.reader(f)
19
- header = next(reader) # Get header row
20
- # Strip whitespace from header names
21
- header = [h.strip() for h in header]
22
- for row in reader:
23
- if len(row) == expected_fields: # Only keep rows with correct number of fields
24
- # Strip whitespace from values
25
- valid_rows.append([val.strip() for val in row])
26
-
27
- # Create DataFrame from valid rows
28
- df = pd.DataFrame(valid_rows, columns=header)
29
-
30
- # Read model sizes from metadata
31
- with open(metadata_path, 'r') as f:
32
- metadata = json.load(f)
33
-
34
- # Process the data
35
- # Keep only successful runs (where Benchmark Score is not FAILED)
36
- df = df[df['Benchmark Score'] != 'FAILED']
37
- df = df[df['Benchmark Score'].notna()]
38
- # Convert score to numeric, handling invalid values
39
- df['Benchmark Score'] = pd.to_numeric(df['Benchmark Score'], errors='coerce')
40
- df = df[df['Benchmark Score'].notna()] # Remove rows where conversion failed
41
-
42
- # Convert Num Questions Parseable to numeric and calculate adjusted score
43
- df['Num Questions Parseable'] = pd.to_numeric(df['Num Questions Parseable'], errors='coerce')
44
- df['Benchmark Score'] = df['Benchmark Score'] * (df['Num Questions Parseable'] / 171)
45
-
46
- # For each model, keep only the latest run
47
- df['Run ID'] = df['Run ID'].fillna('')
48
- df['timestamp'] = pd.to_datetime(df['Benchmark Completed'])
49
- df = df.sort_values('timestamp')
50
- df = df.drop_duplicates(subset=['Model Path'], keep='last')
51
-
52
- # Get model sizes
53
- def get_model_size(model_path):
54
- # Try exact match first
55
- if model_path in metadata:
56
- return metadata[model_path]
57
- # Try with max_length suffix
58
- if f"{model_path},max_length=4096" in metadata:
59
- return metadata[f"{model_path},max_length=4096"]
60
- return None
61
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
62
  # Print models without size before filtering
63
  print("\nModels without size assigned:")
64
- models_without_size = df[df['Model Path'].apply(get_model_size).isna()]
65
- for model in models_without_size['Model Path']:
66
- print(f"- {model}")
67
-
68
- df['Model Size'] = df['Model Path'].apply(get_model_size)
69
- df = df[df['Model Size'].notna()]
70
 
71
  # Remove extreme outliers (scores that are clearly errors)
72
- q1 = df['Benchmark Score'].quantile(0.25)
73
- q3 = df['Benchmark Score'].quantile(0.75)
74
- iqr = q3 - q1
75
- df = df[
76
- (df['Benchmark Score'] >= q1 - 1.5 * iqr) &
77
- (df['Benchmark Score'] <= q3 + 1.5 * iqr)
78
- ]
 
79
 
80
  # Find models on Pareto frontier
81
- sizes = sorted(df['Model Size'].unique())
82
  frontier_points = []
83
  max_score = float('-inf')
84
  frontier_models = set()
85
 
86
  for size in sizes:
87
  # Get scores for models of this size or smaller
88
- subset = df[df['Model Size'] <= size]
89
  if len(subset) > 0:
90
  max_score_idx = subset['Benchmark Score'].idxmax()
91
  current_max = subset.loc[max_score_idx, 'Benchmark Score']
@@ -95,59 +83,73 @@ def create_performance_plot(csv_path='benchmark_results.csv', metadata_path='met
95
  frontier_models.add(subset.loc[max_score_idx, 'Model Path'])
96
 
97
  # Filter models - keep those on Pareto frontier or matching whitelist
98
- df['Keep'] = False
99
- for idx, row in df.iterrows():
100
  if row['Model Path'] in frontier_models:
101
- df.loc[idx, 'Keep'] = True
102
  else:
103
  for pattern in WHITELIST:
104
  if pattern in row['Model Path']:
105
- df.loc[idx, 'Keep'] = True
106
  break
107
 
108
- df = df[df['Keep']]
 
 
 
 
 
 
 
 
 
 
109
 
110
  # Create the plot
111
  fig = plt.figure(figsize=(12, 8))
112
 
113
- # Create scatter plot
114
- plt.scatter(df['Model Size'],
115
- df['Benchmark Score'],
116
- alpha=0.6)
 
117
 
118
- # Add labels for points
119
- for idx, row in df.iterrows():
120
- # Get model name - either last part of path or full name for special cases
121
- model_name = row['Model Path'].split('/')[-1]
122
- if any(pattern in row['Model Path'] for pattern in ['gpt-3', 'gpt-4']):
123
  model_name = row['Model Path']
124
 
125
- plt.annotate(model_name,
126
- (row['Model Size'], row['Benchmark Score']),
127
- xytext=(5, 5), textcoords='offset points',
128
- fontsize=8,
129
- bbox=dict(facecolor='white', alpha=0.7, edgecolor='none', pad=0.5))
130
-
131
- # Plot the Pareto frontier line
132
- if frontier_points:
133
- frontier_x, frontier_y = zip(*frontier_points)
134
- plt.plot(frontier_x, frontier_y, 'r--', label='Pareto frontier')
135
-
136
- # Add vertical line for consumer GPU budget
137
- plt.axvline(x=12, color='gray', linestyle=':', label='Consumer-budget GPU limit', ymin=-0.15, clip_on=False)
138
- plt.text(12, -0.15, 'Consumer-budget\nGPU (24GB) limit\nin half precision',
139
- horizontalalignment='center', verticalalignment='top',
140
- transform=plt.gca().get_xaxis_transform())
141
 
142
  # Customize the plot
143
  plt.grid(True, linestyle='--', alpha=0.7)
144
  plt.xlabel('Model Size (billions of parameters)')
145
- plt.ylabel('Benchmark Score')
146
- plt.title('Model Performance vs Size (Pareto Frontier)')
147
 
148
  # Add legend
149
  plt.legend()
150
 
 
 
 
 
 
151
  # Adjust layout to prevent label cutoff
152
  plt.tight_layout()
153
 
 
2
  import matplotlib.pyplot as plt
3
  import numpy as np
4
  import json
 
5
 
6
+ def create_performance_plot(json_path='benchmark_report.json'):
7
  # Define whitelist of interesting models (partial matches)
8
  WHITELIST = [
9
+ 'Meta Llama 4 Maverick',
10
+ 'Anthropic Claude 3.7 Sonnet',
11
+ 'OpenAI GPT-4o'
12
  ]
13
 
14
+ # Load the benchmark results from JSON
15
+ with open(json_path, 'r') as f:
16
+ json_data = json.load(f)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
17
 
18
+ # Create DataFrame from JSON data
19
+ df = pd.DataFrame(json_data)
20
+
21
+ # Rename columns for consistency
22
+ df = df.rename(columns={
23
+ "Model Name": "Model Path",
24
+ "Model Size": "Model Size Raw"
25
+ })
26
+
27
+ # Calculate overall benchmark score as average of Avg (object) and Avg (country)
28
+ df['Benchmark Score'] = (df['Avg (object)'] + df['Avg (country)']) / 2
29
+
30
+ # Process model sizes - convert to numeric, handle "-" and extract numbers
31
+ df['Model Size'] = df['Model Size Raw'].replace("-", np.nan)
32
+
33
+ # Extract numeric values from size strings like "72 MB" -> 72 or plain "72" -> 72
34
+ def extract_size(size_val):
35
+ if pd.isna(size_val):
36
+ return np.nan
37
+ if isinstance(size_val, (int, float)):
38
+ return float(size_val)
39
+ if isinstance(size_val, str):
40
+ # Try to extract number from string (handles both "72" and "72 MB")
41
+ import re
42
+ match = re.search(r'(\d+(?:\.\d+)?)', str(size_val))
43
+ if match:
44
+ return float(match.group(1))
45
+ return np.nan
46
+
47
+ df['Model Size'] = df['Model Size'].apply(extract_size)
48
+
49
+ # Remove models without size information for plotting
50
+ df_with_size = df[df['Model Size'].notna()].copy()
51
+
52
  # Print models without size before filtering
53
  print("\nModels without size assigned:")
54
+ models_without_size = df[df['Model Size'].isna()]
55
+ for idx, row in models_without_size.iterrows():
56
+ print(f"- {row['Model Path']}")
 
 
 
57
 
58
  # Remove extreme outliers (scores that are clearly errors)
59
+ if len(df_with_size) > 0:
60
+ q1 = df_with_size['Benchmark Score'].quantile(0.25)
61
+ q3 = df_with_size['Benchmark Score'].quantile(0.75)
62
+ iqr = q3 - q1
63
+ df_with_size = df_with_size[
64
+ (df_with_size['Benchmark Score'] >= q1 - 1.5 * iqr) &
65
+ (df_with_size['Benchmark Score'] <= q3 + 1.5 * iqr)
66
+ ]
67
 
68
  # Find models on Pareto frontier
69
+ sizes = sorted(df_with_size['Model Size'].unique())
70
  frontier_points = []
71
  max_score = float('-inf')
72
  frontier_models = set()
73
 
74
  for size in sizes:
75
  # Get scores for models of this size or smaller
76
+ subset = df_with_size[df_with_size['Model Size'] <= size]
77
  if len(subset) > 0:
78
  max_score_idx = subset['Benchmark Score'].idxmax()
79
  current_max = subset.loc[max_score_idx, 'Benchmark Score']
 
83
  frontier_models.add(subset.loc[max_score_idx, 'Model Path'])
84
 
85
  # Filter models - keep those on Pareto frontier or matching whitelist
86
+ df_with_size['Keep'] = False
87
+ for idx, row in df_with_size.iterrows():
88
  if row['Model Path'] in frontier_models:
89
+ df_with_size.loc[idx, 'Keep'] = True
90
  else:
91
  for pattern in WHITELIST:
92
  if pattern in row['Model Path']:
93
+ df_with_size.loc[idx, 'Keep'] = True
94
  break
95
 
96
+ # Also include models without size if they're in whitelist
97
+ df_no_size = df[df['Model Size'].isna()].copy()
98
+ df_no_size['Keep'] = False
99
+ for idx, row in df_no_size.iterrows():
100
+ for pattern in WHITELIST:
101
+ if pattern in row['Model Path']:
102
+ df_no_size.loc[idx, 'Keep'] = True
103
+ break
104
+
105
+ # Combine datasets for plotting
106
+ plot_df = df_with_size[df_with_size['Keep']].copy()
107
 
108
  # Create the plot
109
  fig = plt.figure(figsize=(12, 8))
110
 
111
+ if len(plot_df) > 0:
112
+ # Create scatter plot
113
+ plt.scatter(plot_df['Model Size'],
114
+ plot_df['Benchmark Score'],
115
+ alpha=0.6, s=60)
116
 
117
+ # Add labels for points
118
+ for idx, row in plot_df.iterrows():
119
+ # Use the full model name for labeling
 
 
120
  model_name = row['Model Path']
121
 
122
+ plt.annotate(model_name,
123
+ (row['Model Size'], row['Benchmark Score']),
124
+ xytext=(5, 5), textcoords='offset points',
125
+ fontsize=8,
126
+ bbox=dict(facecolor='white', alpha=0.7, edgecolor='none', pad=0.5))
127
+
128
+ # Plot the Pareto frontier line
129
+ if frontier_points:
130
+ frontier_x, frontier_y = zip(*frontier_points)
131
+ plt.plot(frontier_x, frontier_y, 'r--', label='Pareto frontier', linewidth=2)
132
+
133
+ # Add vertical line for consumer GPU budget (assuming 24GB can handle ~12B parameters)
134
+ plt.axvline(x=12, color='gray', linestyle=':', label='Consumer-budget GPU limit', ymin=-0.15, clip_on=False)
135
+ plt.text(12, plt.ylim()[0] - (plt.ylim()[1] - plt.ylim()[0]) * 0.1,
136
+ 'Consumer-budget\nGPU (24GB) limit\nin half precision',
137
+ horizontalalignment='center', verticalalignment='top')
138
 
139
  # Customize the plot
140
  plt.grid(True, linestyle='--', alpha=0.7)
141
  plt.xlabel('Model Size (billions of parameters)')
142
+ plt.ylabel('Benchmark Score (Average of Object & Country Recognition)')
143
+ plt.title('Polish Photo Recognition: Model Performance vs Size')
144
 
145
  # Add legend
146
  plt.legend()
147
 
148
+ # Set reasonable axis limits
149
+ if len(plot_df) > 0:
150
+ plt.xlim(left=0)
151
+ plt.ylim(bottom=0)
152
+
153
  # Adjust layout to prevent label cutoff
154
  plt.tight_layout()
155
 
script.py DELETED
@@ -1,322 +0,0 @@
1
- import pandas as pd
2
- import json
3
- import re
4
-
5
- # Load the CSV file
6
- leaderboard_df = []
7
- with open("benchmark_results.csv", "r") as f:
8
- header = f.readline().strip().split(",")
9
- header = [h.strip() for h in header]
10
- for i, line in enumerate(f):
11
- leaderboard_df.append(line.strip().split(",", 13))
12
-
13
# Load metadata; the context manager closes the file handle as soon as the
# JSON has been parsed instead of leaving it to the garbage collector.
with open('metadata.json') as metadata_file:
    metadata = json.load(metadata_file)

# Register every entry a second time under the part of its key that precedes
# the first comma, so a key of the form "name,extra" can also be looked up as
# "name". Iterate over a snapshot because the dict is mutated inside the loop.
for k, v in list(metadata.items()):
    metadata[k.split(",")[0]] = v
17
-
18
- # Create DataFrame
19
- leaderboard_df = pd.DataFrame(leaderboard_df, columns=header)
20
-
21
- # Filter and process DataFrame
22
- leaderboard_df = leaderboard_df[(leaderboard_df["Benchmark Version"] == "eq-bench_v2_pl") | (
23
- leaderboard_df["Benchmark Version"] == 'eq-bench_pl')]
24
- leaderboard_df = leaderboard_df[["Model Path", "Benchmark Score", "Num Questions Parseable", "Error"]]
25
-
26
def parse_parseable(x):
    """Return the number of parseable questions recorded for one result row.

    Args:
        x: Mapping-like row providing the "Num Questions Parseable" and
            "Error" fields.

    Returns:
        The "Num Questions Parseable" value unchanged unless it is the
        'FAILED' marker. For a failed row, the integer part of the count found
        at the start of the error text ("<N>.0 questions were parseable"), as
        a string; "0" when the error text is not a string or does not start
        with that message.
    """
    if x["Num Questions Parseable"] != 'FAILED':
        return x["Num Questions Parseable"]

    error_text = x["Error"]
    if isinstance(error_text, str):
        m = re.match(r'(\d+)\.0 questions were parseable', error_text)
        if m:
            return m.group(1)

    # A failed run whose error is not the "N.0 questions were parseable"
    # message carries no count. Report zero rather than raising on the failed
    # match, so one unexpected error text cannot abort the whole table build.
    return "0"
31
-
32
- leaderboard_df["Num Questions Parseable"] = leaderboard_df[["Num Questions Parseable", "Error"]].apply(
33
- lambda x: parse_parseable(x), axis=1)
34
-
35
- NUMBER_OF_QUESTIONS = 171.0
36
-
37
def fraction_to_percentage(numerator: float, denominator: float) -> float:
    """Express ``numerator / denominator`` as a percentage.

    Raises:
        ZeroDivisionError: If ``denominator`` is zero.
    """
    ratio = numerator / denominator
    return ratio * 100
39
-
40
- leaderboard_df["Num Questions Parseable"] = leaderboard_df["Num Questions Parseable"].apply(lambda x: fraction_to_percentage(float(x), NUMBER_OF_QUESTIONS))
41
-
42
def get_params(model_name):
    """Look up the metadata entry stored for ``model_name``.

    Returns:
        The value held in the module-level ``metadata`` mapping for
        ``model_name``, or ``None`` when there is no such key. A missing name
        is printed to stdout so that gaps in the metadata are visible.
    """
    if model_name not in metadata:
        print(model_name)
        return None
    return metadata[model_name]
48
-
49
- leaderboard_df["Params"] = leaderboard_df["Model Path"].apply(lambda x: get_params(x))
50
- leaderboard_df["Benchmark Score"] = leaderboard_df["Benchmark Score"].replace('FAILED', None)
51
- leaderboard_df["Benchmark Score"] = leaderboard_df["Benchmark Score"].astype(float) * ((leaderboard_df["Num Questions Parseable"].astype(float) / 100))
52
- leaderboard_df.loc[leaderboard_df["Benchmark Score"] < 0, "Benchmark Score"] = 0
53
- leaderboard_df = leaderboard_df.sort_values(by=["Benchmark Score", "Num Questions Parseable"], ascending=[False, False])
54
- leaderboard_df = leaderboard_df.rename(columns={"Model Path": "Model", "Num Questions Parseable": "Percentage Questions Parseable"})
55
-
56
- # Generate HTML with DataTables
57
- html = """
58
- <!DOCTYPE html>
59
- <html lang="en">
60
- <head>
61
- <meta charset="UTF-8">
62
- <meta name="viewport" content="width=device-width, initial-scale=1.0">
63
- <title>Leaderboard</title>
64
- <link rel="stylesheet" href="https://cdn.datatables.net/1.11.5/css/jquery.dataTables.min.css">
65
- <script src="https://code.jquery.com/jquery-3.6.0.min.js"></script>
66
- <script src="https://cdn.datatables.net/1.11.5/js/jquery.dataTables.min.js"></script>
67
- <style>
68
- body {
69
- font: 90%/1.45em "Helvetica Neue", HelveticaNeue, Verdana, Arial, Helvetica, sans-serif;
70
- margin: 0;
71
- padding: 20px;
72
- color: #333;
73
- background-color: #fff;
74
- }
75
- .numeric-cell {
76
- text-align: right;
77
- padding: 8px !important;
78
- }
79
- </style>
80
- <script>
81
- (function($) {
82
- $.fn.colorize = function(oOptions) {
83
- var settings = $.extend({
84
- parse: function(e) {
85
- return parseFloat(e.html());
86
- },
87
- min: undefined,
88
- max: undefined,
89
- readable: true,
90
- themes: {
91
- "default": {
92
- color_min: "#C80000",
93
- color_mid: "#FFFFFF",
94
- color_max: "#10A54A"
95
- }
96
- },
97
- theme: "default",
98
- center: undefined,
99
- percent: false
100
- }, oOptions);
101
-
102
- function getColor(color1, color2, ratio) {
103
- var hex = function(x) {
104
- x = x.toString(16);
105
- return (x.length == 1) ? '0' + x : x;
106
- }
107
- color1 = (color1.charAt(0) == "#") ? color1.slice(1) : color1
108
- color2 = (color2.charAt(0) == "#") ? color2.slice(1) : color2
109
- var r = Math.ceil(parseInt(color1.substring(0,2), 16) * ratio + parseInt(color2.substring(0,2), 16) * (1-ratio));
110
- var g = Math.ceil(parseInt(color1.substring(2,4), 16) * ratio + parseInt(color2.substring(2,4), 16) * (1-ratio));
111
- var b = Math.ceil(parseInt(color1.substring(4,6), 16) * ratio + parseInt(color2.substring(4,6), 16) * (1-ratio));
112
- return "#" + (hex(r) + hex(g) + hex(b)).toUpperCase();
113
- }
114
-
115
- function getContrastYIQ(hexcolor) {
116
- var hex = (hexcolor.charAt(0) == "#") ? hexcolor.slice(1) : hexcolor;
117
- var r = parseInt(hex.substr(0,2),16);
118
- var g = parseInt(hex.substr(2,2),16);
119
- var b = parseInt(hex.substr(4,2),16);
120
- var yiq = ((r*299)+(g*587)+(b*114))/1000;
121
- return (yiq >= 128) ? 'black' : 'white';
122
- }
123
-
124
- var min = settings.min;
125
- var max = settings.max;
126
- if (min === undefined || max === undefined) {
127
- min = Infinity;
128
- max = -Infinity;
129
- this.each(function() {
130
- var value = parseFloat(settings.parse($(this)));
131
- if (!isNaN(value) && isFinite(value)) {
132
- min = Math.min(min, value);
133
- max = Math.max(max, value);
134
- }
135
- });
136
- }
137
-
138
- var center = settings.center !== undefined ? settings.center : (max + min) / 2;
139
- var adj = Math.max(Math.abs(max - center), Math.abs(center - min));
140
-
141
- this.each(function() {
142
- var value = parseFloat(settings.parse($(this)));
143
- if (isNaN(value) || !isFinite(value)) return;
144
-
145
- var ratio = (value - center) / adj;
146
- var color1, color2;
147
-
148
- if (value < center) {
149
- ratio = Math.abs(ratio);
150
- if (ratio > 1) ratio = 1;
151
- color1 = settings.themes[settings.theme].color_min;
152
- color2 = settings.themes[settings.theme].color_mid;
153
- } else {
154
- ratio = Math.abs(ratio);
155
- if (ratio > 1) ratio = 1;
156
- color1 = settings.themes[settings.theme].color_max;
157
- color2 = settings.themes[settings.theme].color_mid;
158
- }
159
- var color = getColor(color1, color2, ratio);
160
- $(this).css('background-color', color);
161
- if (settings.readable)
162
- $(this).css('color', getContrastYIQ(color));
163
- });
164
-
165
- return this;
166
- };
167
- }(jQuery));
168
-
169
- $(document).ready(function() {
170
- // Add custom filtering function
171
- $.fn.dataTable.ext.search.push(function(settings, data, dataIndex) {
172
- var searchValue = $('.dataTables_filter input').val();
173
- if (!searchValue) return true;
174
-
175
- // Split search terms by semicolon and trim whitespace
176
- var searchTerms = searchValue.split(';').map(term => term.trim().toLowerCase());
177
- var modelName = data[0].toLowerCase(); // Model name is in first column
178
-
179
- // Return true if ANY search terms are found in the model name (OR logic)
180
- return searchTerms.some(term => modelName.includes(term));
181
- });
182
-
183
- // Custom sorting function for benchmark scores
184
- $.fn.dataTable.ext.type.order['score-pre'] = function(data) {
185
- var score = parseFloat(data);
186
- return isNaN(score) ? -Infinity : score;
187
- };
188
-
189
- // Get min/max values for each numeric column before initializing DataTables
190
- var columnRanges = {
191
- 1: { min: Infinity, max: -Infinity }, // Params
192
- 2: { min: Infinity, max: -Infinity }, // Benchmark Score
193
- 3: { min: Infinity, max: -Infinity } // Percentage Questions Parseable
194
- };
195
-
196
- $('#leaderboard tbody td').each(function() {
197
- var columnIdx = $(this).index();
198
- if (columnIdx in columnRanges) {
199
- var value = parseFloat($(this).text());
200
- if (!isNaN(value) && isFinite(value)) {
201
- columnRanges[columnIdx].min = Math.min(columnRanges[columnIdx].min, value);
202
- columnRanges[columnIdx].max = Math.max(columnRanges[columnIdx].max, value);
203
- }
204
- }
205
- });
206
-
207
- var table = $('#leaderboard').DataTable({
208
- "order": [[2, "desc"]], // Sort by Benchmark Score by default
209
- "pageLength": 20, // Show 20 results per page
210
- "lengthMenu": [[10, 20, 50, 100, -1], [10, 20, 50, 100, "All"]], // Update length menu options
211
- "columnDefs": [
212
- {
213
- "targets": [1],
214
- "className": "numeric-cell"
215
- },
216
- {
217
- "type": "score",
218
- "targets": [2], // Apply custom sorting to Benchmark Score column
219
- "className": "numeric-cell"
220
- },
221
- {
222
- "targets": [3],
223
- "className": "numeric-cell"
224
- }
225
- ],
226
- "drawCallback": function() {
227
- // Apply colorization with pre-calculated ranges
228
- $("#leaderboard tbody td:nth-child(2)").colorize({
229
- parse: function(e) { return parseFloat($(e).text()); },
230
- min: columnRanges[1].min,
231
- max: columnRanges[1].max,
232
- themes: {
233
- "default": {
234
- color_min: "#10A54A", // Green for smaller models
235
- color_mid: "#FFD700", // Gold/yellow for medium models
236
- color_max: "#C80000" // Red for larger models
237
- }
238
- }
239
- });
240
- $("#leaderboard tbody td:nth-child(3)").colorize({
241
- parse: function(e) { return parseFloat($(e).text()); },
242
- min: columnRanges[2].min,
243
- max: columnRanges[2].max,
244
- themes: {
245
- "default": {
246
- color_min: "#C80000", // Red for lower scores
247
- color_mid: "#FFD700", // Gold/yellow for medium scores
248
- color_max: "#10A54A" // Green for higher scores
249
- }
250
- }
251
- });
252
- $("#leaderboard tbody td:nth-child(4)").colorize({
253
- parse: function(e) { return parseFloat($(e).text()); },
254
- min: columnRanges[3].min,
255
- max: columnRanges[3].max,
256
- themes: {
257
- "default": {
258
- color_min: "#C80000", // Red for lower percentages
259
- color_mid: "#FFD700", // Gold/yellow for medium percentages
260
- color_max: "#10A54A" // Green for higher percentages
261
- }
262
- }
263
- });
264
- },
265
- // Override the default search behavior
266
- "search": {
267
- "smart": false
268
- },
269
-
270
- // Update search on input change
271
- "initComplete": function() {
272
- var table = this.api();
273
- $('.dataTables_filter input')
274
- .off() // Remove default binding
275
- .on('input', function() {
276
- table.draw();
277
- });
278
- }
279
- });
280
- });
281
- </script>
282
- </head>
283
- <body>
284
- <h1>Leaderboard</h1>
285
- <table id="leaderboard" class="display" style="width:100%">
286
- <thead>
287
- <tr>
288
- <th>Model</th>
289
- <th>Params</th>
290
- <th>Benchmark Score</th>
291
- <th>Percentage Questions Parseable</th>
292
- <th>Error</th>
293
- </tr>
294
- </thead>
295
- <tbody>
296
- """
297
-
298
# Add rows to the HTML table.
# The name `html` in this script is the page string, so only the escaping
# helper is imported (under a private alias) rather than the `html` module.
from html import escape as _escape

_row_chunks = []
for _, row in leaderboard_df.iterrows():
    # Model, Params and Error are free text: escape them so characters such
    # as "<" or "&" cannot break the table markup. The two numeric columns
    # are rendered with two decimals.
    _row_chunks.append(f"""
            <tr>
                <td>{_escape(str(row['Model']))}</td>
                <td>{_escape(str(row['Params']))}</td>
                <td>{row['Benchmark Score']:.2f}</td>
                <td>{row['Percentage Questions Parseable']:.2f}</td>
                <td>{_escape(str(row['Error']))}</td>
            </tr>
""")

# Join once instead of re-growing the page string on every row.
html += "".join(_row_chunks)
309
-
310
- # Close the HTML tags
311
- html += """
312
- </tbody>
313
- </table>
314
- </body>
315
- </html>
316
- """
317
-
318
# Save the HTML to a file. The page declares <meta charset="UTF-8">, so it is
# written as UTF-8 explicitly instead of in the platform's default encoding.
with open("leaderboard.html", "w", encoding="utf-8") as file:
    file.write(html)
321
-
322
- print("HTML leaderboard generated and saved as leaderboard.html")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/about.py CHANGED
@@ -2,25 +2,43 @@
2
  TITLE = """<div style="display: flex; flex-wrap: wrap; justify-content: space-around;">
3
  <img src="https://speakleash.org/wp-content/uploads/2023/09/SpeakLeash_logo.svg">
4
  <div>
5
- <h1 align="center" id="space-title">Polish EQ-Bench Leaderboard</h1>
6
- <h2 align="center" id="space-subtitle">Leaderboard was created as part of an open-science project SpeakLeash.org</h2>
7
  </div>
8
  </div>"""
9
 
10
  # What does your leaderboard evaluate?
11
  INTRODUCTION_TEXT = """
12
- Polish Emotional Intelligence Benchmark for LLMs
 
 
 
 
 
 
 
 
 
 
 
 
13
 
14
  Help us develop Polish Large Language Model Bielik by using [Arena](https://arena.speakleash.org.pl/).
15
 
16
  We gratefully acknowledge Polish high-performance computing infrastructure PLGrid (HPC Centers: ACK Cyfronet AGH) for providing computer facilities and support within computational grant no. PLG/2024/016951.
17
  """
18
 
19
- AUTHORS = """Authors:
20
- * Automatic translation: [Remigiusz Kinas](https://www.linkedin.com/in/remigiusz-kinas/)
21
- * Translation proofreading and localization: [Maria Filipkowska](https://www.linkedin.com/in/maria-filipkowska/), [Zuzanna Dabić](https://www.linkedin.com/in/zuzanna-dabic/)
22
- * Preparing dataset: [Kacper Milan](https://www.linkedin.com/in/kacper-milan/)
23
- * Running benchmark and leaderboard: [Krzysztof Wróbel](https://www.linkedin.com/in/wrobelkrzysztof/)
 
 
 
 
 
 
24
 
25
- Based on: EQ-Bench: An Emotional Intelligence Benchmark for Large Language Models, Samuel J. Paech, 2023"""
26
 
 
2
  TITLE = """<div style="display: flex; flex-wrap: wrap; justify-content: space-around;">
3
  <img src="https://speakleash.org/wp-content/uploads/2023/09/SpeakLeash_logo.svg">
4
  <div>
5
+ <h1 align="center" id="space-title">Polish Cultural Vision Benchmark (PCVB)</h1>
6
+ <h2 align="center" id="space-subtitle">Evaluating Vision-Language Models on Polish Cultural Heritage</h2>
7
  </div>
8
  </div>"""
9
 
10
  # What does your leaderboard evaluate?
11
  INTRODUCTION_TEXT = """
12
+ A specialized evaluation dataset designed to assess vision-language models' understanding of Polish cultural heritage, history, geography, and traditions. This benchmark addresses the critical gap in multilingual and culturally-specific evaluation of multimodal AI systems.
13
+
14
+ **Benchmark Scope:**
15
+ - **Domain**: Polish Cultural Knowledge
16
+ - **Modality**: Vision + Language
17
+ - **Task Type**: Visual Recognition and Cultural Classification
18
+ - **Dataset Size**: ~220 curated image-text pairs across 11 subcategories
19
+
20
+ **Categories Evaluated:**
21
+ - 🎭 **Art & Entertainment**: Movies, Art, Theatre
22
+ - 🏛️ **Culture & Tradition**: Food, Folk Culture, Traditions
23
+ - 🗺️ **Geography**: Cities, Nature, Architecture
24
+ - 📚 **History**: Historical Figures, Historical Sites
25
 
26
  Help us develop Polish Large Language Model Bielik by using [Arena](https://arena.speakleash.org.pl/).
27
 
28
  We gratefully acknowledge Polish high-performance computing infrastructure PLGrid (HPC Centers: ACK Cyfronet AGH) for providing computer facilities and support within computational grant no. PLG/2024/016951.
29
  """
30
 
31
+ AUTHORS = """**Benchmark Details:**
32
+
33
+ **Methodology**: Each test item consists of carefully selected and manually verified images that represent authentic Polish cultural elements. Models are prompted to identify specific cultural objects, landmarks, foods, or personalities shown in images, along with their country of origin.
34
+
35
+ **Evaluation Protocol**: Responses are evaluated for both object accuracy and geographical attribution using binary scoring (correct/incorrect) across all categories.
36
+
37
+ **Unique Value Proposition**:
38
+ - Cultural Specificity: Tests deep understanding of Polish heritage beyond generic object recognition
39
+ - Multimodal Integration: Requires both visual processing and cultural knowledge
40
+ - Bias Detection: Reveals potential Western-centric biases in vision-language models
41
+ - Real-world Relevance: Evaluates practically useful cultural knowledge for Polish applications
42
 
43
+ This benchmark is maintained as a private evaluation suite to ensure result integrity and prevent training data contamination."""
44