Refactor app.py to use JSON for benchmark data, removing CSV and metadata dependencies. Update performance plotting to reflect new data structure and enhance visualization with cultural context. Introduce benchmark report JSON file for structured model evaluation results.
Browse files
- app.py +55 -81
- benchmark_report.json +142 -0
- benchmark_results.csv +0 -189
- metadata.json +0 -355
- plot_results.py +100 -98
- script.py +0 -322
- src/about.py +27 -9
app.py
CHANGED
|
@@ -19,92 +19,54 @@ with demo:
|
|
| 19 |
gr.HTML(TITLE)
|
| 20 |
gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
|
| 21 |
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
# create new column with model name
|
| 50 |
-
def parse_parseable(x):
|
| 51 |
-
if x["Num Questions Parseable"] == 'FAILED':
|
| 52 |
-
m = re.match(r'(\d+)\.0 questions were parseable', x["Error"])
|
| 53 |
-
return m.group(1)
|
| 54 |
-
return x["Num Questions Parseable"]
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
leaderboard_df["Num Questions Parseable"] = leaderboard_df[["Num Questions Parseable", "Error"]].apply(
|
| 58 |
-
lambda x: parse_parseable(x), axis=1)
|
| 59 |
-
|
| 60 |
-
def fraction_to_percentage(numerator: float, denominator: float) -> float:
|
| 61 |
-
return (numerator / denominator) * 100
|
| 62 |
-
|
| 63 |
-
leaderboard_df["Num Questions Parseable"] = leaderboard_df["Num Questions Parseable"].apply(lambda x: fraction_to_percentage(float(x), NUMBER_OF_QUESTIONS))
|
| 64 |
-
|
| 65 |
-
def get_params(model_name):
|
| 66 |
-
if model_name in metadata:
|
| 67 |
-
return metadata[model_name]
|
| 68 |
-
else:
|
| 69 |
-
print(model_name)
|
| 70 |
-
return numpy.nan
|
| 71 |
-
|
| 72 |
-
|
| 73 |
-
leaderboard_df["Params"] = leaderboard_df["Model Path"].apply(lambda x: get_params(x))
|
| 74 |
-
|
| 75 |
-
# move column order
|
| 76 |
-
leaderboard_df = leaderboard_df[["Model Path", "Params", "Benchmark Score", "Num Questions Parseable", 'Error']]
|
| 77 |
-
|
| 78 |
-
# change value of column to nan
|
| 79 |
-
leaderboard_df["Benchmark Score"] = leaderboard_df["Benchmark Score"].replace('FAILED', numpy.nan)
|
| 80 |
-
|
| 81 |
-
#scale Benchmark Score by Num Questions Parseable*100
|
| 82 |
-
leaderboard_df["Benchmark Score"] = leaderboard_df["Benchmark Score"].astype(float) * ((leaderboard_df["Num Questions Parseable"].astype(float) / 100))
|
| 83 |
-
|
| 84 |
-
# set datatype of column
|
| 85 |
-
leaderboard_df["Benchmark Score"] = leaderboard_df["Benchmark Score"].astype(float)
|
| 86 |
-
leaderboard_df["Num Questions Parseable"] = leaderboard_df["Num Questions Parseable"].astype(float)
|
| 87 |
|
| 88 |
-
#
|
| 89 |
-
leaderboard_df
|
|
|
|
|
|
|
| 90 |
|
| 91 |
-
#
|
| 92 |
-
leaderboard_df = leaderboard_df.sort_values(by=["
|
| 93 |
-
ascending=[False, False])
|
| 94 |
|
| 95 |
# Print model names and scores to console before HTML formatting
|
| 96 |
print("\n===== MODEL RESULTS =====")
|
|
|
|
| 97 |
for index, row in leaderboard_df.iterrows():
|
| 98 |
-
print(f"{row['Model Path']}: {row['
|
| 99 |
print("========================\n")
|
| 100 |
|
| 101 |
# Apply HTML formatting for display
|
| 102 |
leaderboard_df["Model Path"] = leaderboard_df["Model Path"].apply(lambda x: make_clickable_model(x))
|
| 103 |
|
| 104 |
-
#
|
| 105 |
leaderboard_df = leaderboard_df.rename(columns={"Model Path": "Model"})
|
| 106 |
-
leaderboard_df = leaderboard_df.rename(columns={"Num Questions Parseable": "Percentage Questions Parseable"})
|
| 107 |
-
|
| 108 |
leaderboard_df.to_csv("output.csv")
|
| 109 |
|
| 110 |
# Set midpoint for gradient coloring based on data ranges
|
|
@@ -118,17 +80,29 @@ with demo:
|
|
| 118 |
vmax=150
|
| 119 |
)
|
| 120 |
|
| 121 |
-
|
| 122 |
-
|
| 123 |
-
|
| 124 |
-
|
| 125 |
-
|
| 126 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 127 |
leaderboard_df_styled = leaderboard_df_styled.format(rounding)
|
| 128 |
|
|
|
|
|
|
|
|
|
|
| 129 |
leaderboard_table = gr.components.Dataframe(
|
| 130 |
value=leaderboard_df_styled,
|
| 131 |
-
datatype=
|
| 132 |
elem_id="leaderboard-table",
|
| 133 |
interactive=False,
|
| 134 |
visible=True,
|
|
|
|
| 19 |
gr.HTML(TITLE)
|
| 20 |
gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
|
| 21 |
|
| 22 |
+
# Load dataframe from JSON
|
| 23 |
+
with open("benchmark_report.json", "r") as f:
|
| 24 |
+
json_data = json.load(f)
|
| 25 |
+
|
| 26 |
+
# Create dataframe from JSON data
|
| 27 |
+
leaderboard_df = pd.DataFrame(json_data)
|
| 28 |
+
|
| 29 |
+
# Rename columns for consistency
|
| 30 |
+
leaderboard_df = leaderboard_df.rename(columns={
|
| 31 |
+
"Model Name": "Model Path",
|
| 32 |
+
"Model Size": "Params"
|
| 33 |
+
})
|
| 34 |
+
|
| 35 |
+
# Calculate overall benchmark score as average of Avg (object) and Avg (country)
|
| 36 |
+
leaderboard_df["Avg"] = (leaderboard_df["Avg (object)"] + leaderboard_df["Avg (country)"]) / 2
|
| 37 |
+
|
| 38 |
+
# Select and reorder columns for display (removed Percentage Questions Parseable)
|
| 39 |
+
display_columns = [
|
| 40 |
+
"Model Path", "Params", "Avg",
|
| 41 |
+
"Avg (object)", "Avg (country)",
|
| 42 |
+
"History (object)", "History (country)",
|
| 43 |
+
"Geography (object)", "Geography (country)",
|
| 44 |
+
"Art & Entertainment (object)", "Art & Entertainment (country)",
|
| 45 |
+
"Culture & Tradition (object)", "Culture & Tradition (country)"
|
| 46 |
+
]
|
| 47 |
+
|
| 48 |
+
leaderboard_df = leaderboard_df[display_columns]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 49 |
|
| 50 |
+
# Convert Params column - replace "-" with NaN and convert numeric strings to float
|
| 51 |
+
leaderboard_df["Params"] = leaderboard_df["Params"].replace("-", numpy.nan)
|
| 52 |
+
# Convert numeric strings directly to float (no regex needed since values are already clean numbers)
|
| 53 |
+
leaderboard_df.loc[leaderboard_df["Params"].notna(), "Params"] = leaderboard_df.loc[leaderboard_df["Params"].notna(), "Params"].astype(float)
|
| 54 |
|
| 55 |
+
# Sort by benchmark score
|
| 56 |
+
leaderboard_df = leaderboard_df.sort_values(by=["Avg"], ascending=[False])
|
|
|
|
| 57 |
|
| 58 |
# Print model names and scores to console before HTML formatting
|
| 59 |
print("\n===== MODEL RESULTS =====")
|
| 60 |
+
print("Avg is calculated as: (Avg (object) + Avg (country)) / 2")
|
| 61 |
for index, row in leaderboard_df.iterrows():
|
| 62 |
+
print(f"{row['Model Path']}: {row['Avg']:.2f}")
|
| 63 |
print("========================\n")
|
| 64 |
|
| 65 |
# Apply HTML formatting for display
|
| 66 |
leaderboard_df["Model Path"] = leaderboard_df["Model Path"].apply(lambda x: make_clickable_model(x))
|
| 67 |
|
| 68 |
+
# Rename column for display
|
| 69 |
leaderboard_df = leaderboard_df.rename(columns={"Model Path": "Model"})
|
|
|
|
|
|
|
| 70 |
leaderboard_df.to_csv("output.csv")
|
| 71 |
|
| 72 |
# Set midpoint for gradient coloring based on data ranges
|
|
|
|
| 80 |
vmax=150
|
| 81 |
)
|
| 82 |
|
| 83 |
+
# Set up number formatting (removed Percentage Questions Parseable)
|
| 84 |
+
rounding = {
|
| 85 |
+
"Avg": "{:.2f}",
|
| 86 |
+
"Params": "{:.0f}",
|
| 87 |
+
"Avg (object)": "{:.2f}",
|
| 88 |
+
"Avg (country)": "{:.2f}",
|
| 89 |
+
"History (object)": "{:.2f}",
|
| 90 |
+
"History (country)": "{:.2f}",
|
| 91 |
+
"Geography (object)": "{:.2f}",
|
| 92 |
+
"Geography (country)": "{:.2f}",
|
| 93 |
+
"Art & Entertainment (object)": "{:.2f}",
|
| 94 |
+
"Art & Entertainment (country)": "{:.2f}",
|
| 95 |
+
"Culture & Tradition (object)": "{:.2f}",
|
| 96 |
+
"Culture & Tradition (country)": "{:.2f}"
|
| 97 |
+
}
|
| 98 |
leaderboard_df_styled = leaderboard_df_styled.format(rounding)
|
| 99 |
|
| 100 |
+
# Create dataframe component with appropriate datatypes
|
| 101 |
+
datatypes = ['markdown', 'number'] + ['number'] * (len(display_columns) - 1)
|
| 102 |
+
|
| 103 |
leaderboard_table = gr.components.Dataframe(
|
| 104 |
value=leaderboard_df_styled,
|
| 105 |
+
datatype=datatypes,
|
| 106 |
elem_id="leaderboard-table",
|
| 107 |
interactive=False,
|
| 108 |
visible=True,
|
benchmark_report.json
ADDED
|
@@ -0,0 +1,142 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[
|
| 2 |
+
{
|
| 3 |
+
"Model Name":"Anthropic Claude 3.7 Sonnet",
|
| 4 |
+
"Model Size":"-",
|
| 5 |
+
"Avg (object)":37.06,
|
| 6 |
+
"Avg (country)":62.46,
|
| 7 |
+
"History (object)":52.5,
|
| 8 |
+
"History (country)":80.0,
|
| 9 |
+
"Geography (object)":58.33,
|
| 10 |
+
"Geography (country)":83.33,
|
| 11 |
+
"Art & Entertainment (object)":22.41,
|
| 12 |
+
"Art & Entertainment (country)":44.83,
|
| 13 |
+
"Culture & Tradition (object)":15.0,
|
| 14 |
+
"Culture & Tradition (country)":41.67
|
| 15 |
+
},
|
| 16 |
+
{
|
| 17 |
+
"Model Name":"OpenAI GPT-4o",
|
| 18 |
+
"Model Size":"-",
|
| 19 |
+
"Avg (object)":28.94,
|
| 20 |
+
"Avg (country)":42.49,
|
| 21 |
+
"History (object)":30.0,
|
| 22 |
+
"History (country)":37.5,
|
| 23 |
+
"Geography (object)":45.0,
|
| 24 |
+
"Geography (country)":55.0,
|
| 25 |
+
"Art & Entertainment (object)":22.41,
|
| 26 |
+
"Art & Entertainment (country)":24.14,
|
| 27 |
+
"Culture & Tradition (object)":18.33,
|
| 28 |
+
"Culture & Tradition (country)":53.33
|
| 29 |
+
},
|
| 30 |
+
{
|
| 31 |
+
"Model Name":"Qwen 2.5 VL 72B",
|
| 32 |
+
"Model Size":"72",
|
| 33 |
+
"Avg (object)":23.91,
|
| 34 |
+
"Avg (country)":51.51,
|
| 35 |
+
"History (object)":35.0,
|
| 36 |
+
"History (country)":70.0,
|
| 37 |
+
"Geography (object)":31.67,
|
| 38 |
+
"Geography (country)":71.67,
|
| 39 |
+
"Art & Entertainment (object)":18.97,
|
| 40 |
+
"Art & Entertainment (country)":31.03,
|
| 41 |
+
"Culture & Tradition (object)":10.0,
|
| 42 |
+
"Culture & Tradition (country)":33.33
|
| 43 |
+
},
|
| 44 |
+
{
|
| 45 |
+
"Model Name":"Qwen 2.5 VL 32B",
|
| 46 |
+
"Model Size":"32",
|
| 47 |
+
"Avg (object)":22.27,
|
| 48 |
+
"Avg (country)":48.8,
|
| 49 |
+
"History (object)":30.0,
|
| 50 |
+
"History (country)":67.5,
|
| 51 |
+
"Geography (object)":28.33,
|
| 52 |
+
"Geography (country)":66.67,
|
| 53 |
+
"Art & Entertainment (object)":22.41,
|
| 54 |
+
"Art & Entertainment (country)":31.03,
|
| 55 |
+
"Culture & Tradition (object)":8.33,
|
| 56 |
+
"Culture & Tradition (country)":30.0
|
| 57 |
+
},
|
| 58 |
+
{
|
| 59 |
+
"Model Name":"Qwen 2.5 VL 7B",
|
| 60 |
+
"Model Size":"7",
|
| 61 |
+
"Avg (object)":21.62,
|
| 62 |
+
"Avg (country)":44.72,
|
| 63 |
+
"History (object)":32.5,
|
| 64 |
+
"History (country)":65.0,
|
| 65 |
+
"Geography (object)":28.33,
|
| 66 |
+
"Geography (country)":66.67,
|
| 67 |
+
"Art & Entertainment (object)":18.97,
|
| 68 |
+
"Art & Entertainment (country)":15.52,
|
| 69 |
+
"Culture & Tradition (object)":6.67,
|
| 70 |
+
"Culture & Tradition (country)":31.67
|
| 71 |
+
},
|
| 72 |
+
{
|
| 73 |
+
"Model Name":"Google Gemma 3 27B",
|
| 74 |
+
"Model Size":"27",
|
| 75 |
+
"Avg (object)":19.14,
|
| 76 |
+
"Avg (country)":43.76,
|
| 77 |
+
"History (object)":12.5,
|
| 78 |
+
"History (country)":52.5,
|
| 79 |
+
"Geography (object)":28.33,
|
| 80 |
+
"Geography (country)":48.33,
|
| 81 |
+
"Art & Entertainment (object)":22.41,
|
| 82 |
+
"Art & Entertainment (country)":25.86,
|
| 83 |
+
"Culture & Tradition (object)":13.33,
|
| 84 |
+
"Culture & Tradition (country)":48.33
|
| 85 |
+
},
|
| 86 |
+
{
|
| 87 |
+
"Model Name":"Meta Llama 4 Maverick",
|
| 88 |
+
"Model Size":"402",
|
| 89 |
+
"Avg (object)":17.49,
|
| 90 |
+
"Avg (country)":42.98,
|
| 91 |
+
"History (object)":17.5,
|
| 92 |
+
"History (country)":52.5,
|
| 93 |
+
"Geography (object)":20.0,
|
| 94 |
+
"Geography (country)":50.0,
|
| 95 |
+
"Art & Entertainment (object)":24.14,
|
| 96 |
+
"Art & Entertainment (country)":32.76,
|
| 97 |
+
"Culture & Tradition (object)":8.33,
|
| 98 |
+
"Culture & Tradition (country)":36.67
|
| 99 |
+
},
|
| 100 |
+
{
|
| 101 |
+
"Model Name":"Mistral Medium 3",
|
| 102 |
+
"Model Size":"-",
|
| 103 |
+
"Avg (object)":17.45,
|
| 104 |
+
"Avg (country)":45.99,
|
| 105 |
+
"History (object)":12.5,
|
| 106 |
+
"History (country)":65.0,
|
| 107 |
+
"Geography (object)":31.67,
|
| 108 |
+
"Geography (country)":56.67,
|
| 109 |
+
"Art & Entertainment (object)":18.97,
|
| 110 |
+
"Art & Entertainment (country)":18.97,
|
| 111 |
+
"Culture & Tradition (object)":6.67,
|
| 112 |
+
"Culture & Tradition (country)":43.33
|
| 113 |
+
},
|
| 114 |
+
{
|
| 115 |
+
"Model Name":"Google Gemma 3 12B",
|
| 116 |
+
"Model Size":"12",
|
| 117 |
+
"Avg (object)":13.06,
|
| 118 |
+
"Avg (country)":40.04,
|
| 119 |
+
"History (object)":10.0,
|
| 120 |
+
"History (country)":42.5,
|
| 121 |
+
"Geography (object)":15.0,
|
| 122 |
+
"Geography (country)":46.67,
|
| 123 |
+
"Art & Entertainment (object)":17.24,
|
| 124 |
+
"Art & Entertainment (country)":29.31,
|
| 125 |
+
"Culture & Tradition (object)":10.0,
|
| 126 |
+
"Culture & Tradition (country)":41.67
|
| 127 |
+
},
|
| 128 |
+
{
|
| 129 |
+
"Model Name":"Google Gemma 3 4B",
|
| 130 |
+
"Model Size":"4",
|
| 131 |
+
"Avg (object)":9.72,
|
| 132 |
+
"Avg (country)":35.84,
|
| 133 |
+
"History (object)":5.0,
|
| 134 |
+
"History (country)":47.5,
|
| 135 |
+
"Geography (object)":8.33,
|
| 136 |
+
"Geography (country)":38.33,
|
| 137 |
+
"Art & Entertainment (object)":17.24,
|
| 138 |
+
"Art & Entertainment (country)":25.86,
|
| 139 |
+
"Culture & Tradition (object)":8.33,
|
| 140 |
+
"Culture & Tradition (country)":31.67
|
| 141 |
+
}
|
| 142 |
+
]
|
benchmark_results.csv
DELETED
|
@@ -1,189 +0,0 @@
|
|
| 1 |
-
Run ID, Benchmark Completed, Prompt Format, Model Path, Lora Path, Quantization, Benchmark Score, Benchmark Version, Num Questions Parseable, Num Iterations, Inference Engine, Ooba Params, Download Filters, Error
|
| 2 |
-
Bielik_v0.1,2024-06-18 12:48:51,,speakleash/Bielik-7B-Instruct-v0.1,,,47.1,eq-bench_v2,170.0,1,transformers, ,,
|
| 3 |
-
Bielik_v0.1,2024-06-18 13:44:54,,speakleash/Bielik-7B-Instruct-v0.1,,,34.17,eq-bench_v2_pl,149.0,1,transformers, ,,
|
| 4 |
-
Bielik_v0.1,2024-06-18 14:01:46,,speakleash/Bielik-7B-Instruct-v0.1,,,34.27,eq-bench_v2_pl,156.0,1,transformers, ,,
|
| 5 |
-
openchat-gemma,2024-06-18 14:03:04,,openchat/openchat-3.5-0106-gemma,,,FAILED,eq-bench,FAILED,1,transformers, ,,System role not supported
|
| 6 |
-
openchat-35-0106,2024-06-18 14:30:24,,openchat/openchat-3.5-0106,,,45.69,eq-bench_v2_pl,170.0,1,transformers, ,,
|
| 7 |
-
openchat-35-0106,2024-06-18 15:15:03,,openchat/openchat-3.5-0106,,,45.69,eq-bench_v2_pl,170.0,1,transformers, ,,
|
| 8 |
-
glm-4-9b-chat,2024-06-18 15:16:14,,THUDM/glm-4-9b-chat,,,FAILED,eq-bench,FAILED,1,transformers, ,,
|
| 9 |
-
openchat-35-0106,2024-06-18 15:19:01,,openchat/openchat-3.5-0106,,,72.92,eq-bench_v2,171.0,1,transformers, ,,
|
| 10 |
-
glm-4-9b-chat,2024-06-18 15:20:10,,THUDM/glm-4-9b-chat,,,FAILED,eq-bench,FAILED,1,transformers, ,,
|
| 11 |
-
openchat-35-0106,2024-06-18 15:22:41,,openchat/openchat-3.5-0106,,,45.69,eq-bench_v2_pl,170.0,1,transformers, ,,
|
| 12 |
-
glm-4-9b-chat,2024-06-18 15:23:50,,THUDM/glm-4-9b-chat,,,FAILED,eq-bench,FAILED,1,transformers, ,,
|
| 13 |
-
glm-4-9b-chat,2024-06-18 15:26:30,,THUDM/glm-4-9b-chat,,,FAILED,eq-bench,FAILED,1,transformers, ,,
|
| 14 |
-
glm-4-9b-chat,2024-06-18 16:30:21,,THUDM/glm-4-9b-chat,,,FAILED,eq-bench,FAILED,1,transformers, ,,
|
| 15 |
-
glm-4-9b-chat-1m,2024-06-18 16:54:28,,THUDM/glm-4-9b-chat-1m,,,FAILED,eq-bench,FAILED,1,transformers, ,,
|
| 16 |
-
glm-4-9b-chat-1m,2024-06-18 17:05:16,,THUDM/glm-4-9b-chat-1m,,,FAILED,eq-bench,FAILED,1,transformers, ,,
|
| 17 |
-
openchat-3.6-8b-20240522,2024-06-18 17:12:00,,openchat/openchat-3.6-8b-20240522,,,-1.339640900815702e+23,eq-bench_v2,171.0,1,transformers, ,,
|
| 18 |
-
openchat-gemma,2024-06-18 17:13:12,,openchat/openchat-3.5-0106-gemma,,,FAILED,eq-bench,FAILED,1,transformers, ,,System role not supported
|
| 19 |
-
Meta-Llama-3-8B-Instruct,2024-06-18 21:29:03,,meta-llama/Meta-Llama-3-8B-Instruct,,,69.09,eq-bench_v2,171.0,1,transformers, ,,
|
| 20 |
-
Starling-LM-7B-alpha,2024-06-18 21:45:18,,berkeley-nest/Starling-LM-7B-alpha,,,49.63,eq-bench_v2_pl,171.0,1,transformers, ,,
|
| 21 |
-
Starling-LM-7B-beta,2024-06-18 21:51:54,,Nexusflow/Starling-LM-7B-beta,,,44.91,eq-bench_v2_pl,159.0,1,transformers, ,,
|
| 22 |
-
Mistral-7B-Instruct-v0.2,2024-06-18 21:52:17,,mistralai/Mistral-7B-Instruct-v0.2,,,FAILED,eq-bench,FAILED,1,transformers, ,,Conversation roles must alternate user/assistant/user/assistant/...
|
| 23 |
-
Mistral-7B-Instruct-v0.1,2024-06-18 22:26:07,,mistralai/Mistral-7B-Instruct-v0.1,,,FAILED,eq-bench,FAILED,1,transformers, ,,Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cpu! (when checking argument for argument index in method wrapper_CUDA__index_select)
|
| 24 |
-
Meta-Llama-3-8B-Instruct,2024-06-18 22:35:53,,meta-llama/Meta-Llama-3-8B-Instruct,,,46.53,eq-bench_v2_pl,171.0,1,transformers, ,,
|
| 25 |
-
openchat-gemma,2024-06-19 09:30:28,,openchat/openchat-3.5-0106-gemma,,,FAILED,eq-bench,FAILED,1,transformers, ,,System role not supported
|
| 26 |
-
Mistral-7B-Instruct-v0.2,2024-06-19 09:30:46,,mistralai/Mistral-7B-Instruct-v0.2,,,FAILED,eq-bench,FAILED,1,transformers, ,,Conversation roles must alternate user/assistant/user/assistant/...
|
| 27 |
-
openchat-gemma,2024-06-19 09:35:50,,openchat/openchat-3.5-0106-gemma,,,FAILED,eq-bench,FAILED,1,transformers, ,,System role not supported
|
| 28 |
-
Mistral-7B-Instruct-v0.2,2024-06-19 09:36:01,,mistralai/Mistral-7B-Instruct-v0.2,,,FAILED,eq-bench,FAILED,1,transformers, ,,Conversation roles must alternate user/assistant/user/assistant/...
|
| 29 |
-
openchat-gemma,2024-06-19 09:43:53,,openchat/openchat-3.5-0106-gemma,,,60.11,eq-bench_v2_pl,169.0,1,transformers, ,,
|
| 30 |
-
Mistral-7B-Instruct-v0.2,2024-06-19 09:49:42,,mistralai/Mistral-7B-Instruct-v0.2,,,52.99,eq-bench_v2_pl,148.0,1,transformers, ,,
|
| 31 |
-
openchat-gemma,2024-06-19 09:54:01,,openchat/openchat-3.5-0106-gemma,,,60.11,eq-bench_v2_pl,169.0,1,transformers, ,,
|
| 32 |
-
openchat-gemma,2024-06-19 10:16:52,,openchat/openchat-3.5-0106-gemma,,,59.93,eq-bench_v2_pl,170.0,1,transformers, ,,
|
| 33 |
-
openchat-gemma,2024-06-19 10:19:44,,openchat/openchat-3.5-0106-gemma,,,59.93,eq-bench_v2_pl,170.0,1,transformers, ,,
|
| 34 |
-
Nous-Hermes-2-SOLAR-10.7B,2024-06-19 10:27:36,,NousResearch/Nous-Hermes-2-SOLAR-10.7B,,,48.22,eq-bench_v2_pl,169.0,1,transformers, ,,
|
| 35 |
-
SOLAR-10.7B-Instruct-v1.0,2024-06-19 10:43:47,,upstage/SOLAR-10.7B-Instruct-v1.0,,,57.57,eq-bench_v2_pl,164.0,1,transformers, ,,
|
| 36 |
-
Qwen2-7B-Instruct,2024-06-19 10:46:52,,Qwen/Qwen2-7B-Instruct,,,53.08,eq-bench_v2_pl,171.0,1,transformers, ,,
|
| 37 |
-
models/gwint2,2024-06-19 11:21:15,,speakleash/Bielik-11B-v2.0-Instruct,,,68.24,eq-bench_v2_pl,171.0,1,transformers, ,,
|
| 38 |
-
Azurro/APT3-275M-Base,2024-06-19 11:36:43,,Azurro/APT3-275M-Base,,,FAILED,eq-bench_pl,FAILED,1,transformers, ,,0.0 questions were parseable (min is 83%)
|
| 39 |
-
Qwen/Qwen2-0.5B,2024-06-19 11:47:44,,Qwen/Qwen2-0.5B,,,FAILED,eq-bench_pl,FAILED,1,transformers, ,,18.0 questions were parseable (min is 83%)
|
| 40 |
-
Qwen/Qwen2-0.5B-Instruct,2024-06-19 11:51:21,,Qwen/Qwen2-0.5B-Instruct,,,FAILED,eq-bench_pl,FAILED,1,transformers, ,,125.0 questions were parseable (min is 83%)
|
| 41 |
-
allegro/plt5-large,2024-06-19 11:51:22,,allegro/plt5-large,,,FAILED,eq-bench,FAILED,1,transformers, ,,Unrecognized configuration class <class 'transformers.models.t5.configuration_t5.T5Config'> for this kind of AutoModel: AutoModelForCausalLM. Model type should be one of BartConfig, BertConfig, BertGenerationConfig, BigBirdConfig, BigBirdPegasusConfig, BioGptConfig, BlenderbotConfig, BlenderbotSmallConfig, BloomConfig, CamembertConfig, LlamaConfig, CodeGenConfig, CohereConfig, CpmAntConfig, CTRLConfig, Data2VecTextConfig, DbrxConfig, ElectraConfig, ErnieConfig, FalconConfig, FuyuConfig, GemmaConfig, GitConfig, GPT2Config, GPT2Config, GPTBigCodeConfig, GPTNeoConfig, GPTNeoXConfig, GPTNeoXJapaneseConfig, GPTJConfig, JambaConfig, JetMoeConfig, LlamaConfig, MambaConfig, MarianConfig, MBartConfig, MegaConfig, MegatronBertConfig, MistralConfig, MixtralConfig, MptConfig, MusicgenConfig, MusicgenMelodyConfig, MvpConfig, OlmoConfig, OpenLlamaConfig, OpenAIGPTConfig, OPTConfig, PegasusConfig, PersimmonConfig, PhiConfig, Phi3Config, PLBartConfig, ProphetNetConfig, QDQBertConfig, Qwen2Config, Qwen2MoeConfig, RecurrentGemmaConfig, ReformerConfig, RemBertConfig, RobertaConfig, RobertaPreLayerNormConfig, RoCBertConfig, RoFormerConfig, RwkvConfig, Speech2Text2Config, StableLmConfig, Starcoder2Config, TransfoXLConfig, TrOCRConfig, WhisperConfig, XGLMConfig, XLMConfig, XLMProphetNetConfig, XLMRobertaConfig, XLMRobertaXLConfig, XLNetConfig, XmodConfig.
|
| 42 |
-
APT3-1B-Instruct-e1,2024-06-19 11:51:22,,APT3-1B-Instruct-e1,,,FAILED,eq-bench,FAILED,1,transformers, ,,APT3-1B-Instruct-e1 is not a local folder and is not a valid model identifier listed on 'https://huggingface.co/models' If this is a private repository, make sure to pass a token having permission to this repo either by logging in with `huggingface-cli login` or by passing `token=<your_token>`
|
| 43 |
-
APT3-1B-Instruct-e2,2024-06-19 11:51:23,,APT3-1B-Instruct-e2,,,FAILED,eq-bench,FAILED,1,transformers, ,,APT3-1B-Instruct-e2 is not a local folder and is not a valid model identifier listed on 'https://huggingface.co/models' If this is a private repository, make sure to pass a token having permission to this repo either by logging in with `huggingface-cli login` or by passing `token=<your_token>`
|
| 44 |
-
Azurro/APT3-1B-Base,2024-06-19 12:00:40,,Azurro/APT3-1B-Base,,,FAILED,eq-bench_pl,FAILED,1,transformers, ,,0.0 questions were parseable (min is 83%)
|
| 45 |
-
OPI-PG/Qra-1b,2024-06-19 12:13:15,,OPI-PG/Qra-1b,,,FAILED,eq-bench_pl,FAILED,1,transformers, ,,0.0 questions were parseable (min is 83%)
|
| 46 |
-
TinyLlama/TinyLlama-1.1B-Chat-v1.0,2024-06-19 12:23:45,,TinyLlama/TinyLlama-1.1B-Chat-v1.0,,,FAILED,eq-bench_pl,FAILED,1,transformers, ,,36.0 questions were parseable (min is 83%)
|
| 47 |
-
Qwen/Qwen2-1.5B,2024-06-19 12:35:37,,Qwen/Qwen2-1.5B,,,FAILED,eq-bench_pl,FAILED,1,transformers, ,,54.0 questions were parseable (min is 83%)
|
| 48 |
-
Qwen/Qwen2-1.5B-Instruct,2024-06-19 12:38:29,,Qwen/Qwen2-1.5B-Instruct,,,15.33,eq-bench_v2_pl,165.0,1,transformers, ,,
|
| 49 |
-
sdadas/polish-gpt2-xl,2024-06-19 12:54:39,,sdadas/polish-gpt2-xl,,,FAILED,eq-bench_pl,FAILED,1,transformers, ,,0.0 questions were parseable (min is 83%)
|
| 50 |
-
internlm/internlm2-1_8b,2024-06-19 13:08:50,,internlm/internlm2-1_8b,,,FAILED,eq-bench_pl,FAILED,1,transformers, ,,0.0 questions were parseable (min is 83%)
|
| 51 |
-
internlm/internlm2-chat-1_8b,2024-06-19 13:13:21,,internlm/internlm2-chat-1_8b,,,13.83,eq-bench_v2_pl,150.0,1,transformers, ,,
|
| 52 |
-
google/gemma-1.1-2b-it,2024-06-19 13:15:24,,google/gemma-1.1-2b-it,,,16.47,eq-bench_v2_pl,171.0,1,transformers, ,,
|
| 53 |
-
microsoft/phi-2,2024-06-19 13:28:07,,microsoft/phi-2,,,FAILED,eq-bench_pl,FAILED,1,transformers, ,,0.0 questions were parseable (min is 83%)
|
| 54 |
-
google/mt5-xl,2024-06-19 13:28:10,,google/mt5-xl,,,FAILED,eq-bench,FAILED,1,transformers, ,,Unrecognized configuration class <class 'transformers.models.mt5.configuration_mt5.MT5Config'> for this kind of AutoModel: AutoModelForCausalLM. Model type should be one of BartConfig, BertConfig, BertGenerationConfig, BigBirdConfig, BigBirdPegasusConfig, BioGptConfig, BlenderbotConfig, BlenderbotSmallConfig, BloomConfig, CamembertConfig, LlamaConfig, CodeGenConfig, CohereConfig, CpmAntConfig, CTRLConfig, Data2VecTextConfig, DbrxConfig, ElectraConfig, ErnieConfig, FalconConfig, FuyuConfig, GemmaConfig, GitConfig, GPT2Config, GPT2Config, GPTBigCodeConfig, GPTNeoConfig, GPTNeoXConfig, GPTNeoXJapaneseConfig, GPTJConfig, JambaConfig, JetMoeConfig, LlamaConfig, MambaConfig, MarianConfig, MBartConfig, MegaConfig, MegatronBertConfig, MistralConfig, MixtralConfig, MptConfig, MusicgenConfig, MusicgenMelodyConfig, MvpConfig, OlmoConfig, OpenLlamaConfig, OpenAIGPTConfig, OPTConfig, PegasusConfig, PersimmonConfig, PhiConfig, Phi3Config, PLBartConfig, ProphetNetConfig, QDQBertConfig, Qwen2Config, Qwen2MoeConfig, RecurrentGemmaConfig, ReformerConfig, RemBertConfig, RobertaConfig, RobertaPreLayerNormConfig, RoCBertConfig, RoFormerConfig, RwkvConfig, Speech2Text2Config, StableLmConfig, Starcoder2Config, TransfoXLConfig, TrOCRConfig, WhisperConfig, XGLMConfig, XLMConfig, XLMProphetNetConfig, XLMRobertaConfig, XLMRobertaXLConfig, XLNetConfig, XmodConfig, InternLM2Config, InternLM2Config.
|
| 55 |
-
microsoft/Phi-3-mini-4k-instruct,2024-06-19 13:34:56,,microsoft/Phi-3-mini-4k-instruct,,,28.05,eq-bench_v2_pl,159.0,1,transformers, ,,
|
| 56 |
-
ssmits/Falcon2-5.5B-Polish,2024-06-19 13:47:21,,ssmits/Falcon2-5.5B-Polish,,,FAILED,eq-bench_pl,FAILED,1,transformers, ,,0.0 questions were parseable (min is 83%)
|
| 57 |
-
01-ai/Yi-1.5-6B,2024-06-19 14:04:20,,01-ai/Yi-1.5-6B,,,FAILED,eq-bench_pl,FAILED,1,transformers, ,,1.0 questions were parseable (min is 83%)
|
| 58 |
-
01-ai/Yi-1.5-6B-Chat,2024-06-19 14:11:22,,01-ai/Yi-1.5-6B-Chat,,,5.19,eq-bench_v2_pl,161.0,1,transformers, ,,
|
| 59 |
-
THUDM/chatglm3-6b,2024-06-19 14:12:11,,THUDM/chatglm3-6b,,,FAILED,eq-bench,FAILED,1,transformers, ,,too many values to unpack (expected 2)
|
| 60 |
-
THUDM/chatglm3-6b-base,2024-06-19 14:13:00,,THUDM/chatglm3-6b-base,,,FAILED,eq-bench,FAILED,1,transformers, ,,too many values to unpack (expected 2)
|
| 61 |
-
alpindale/Mistral-7B-v0.2-hf,2024-06-19 14:16:37,,alpindale/Mistral-7B-v0.2-hf,,,FAILED,eq-bench_pl,FAILED,1,transformers, ,,45.0 questions were parseable (min is 83%)
|
| 62 |
-
berkeley-nest/Starling-LM-7B-alpha,2024-06-19 14:22:32,,berkeley-nest/Starling-LM-7B-alpha,,,46.26,eq-bench_v2_pl,171.0,1,transformers, ,,
|
| 63 |
-
google/gemma-7b,2024-06-19 14:38:02,,google/gemma-7b,,,FAILED,eq-bench_pl,FAILED,1,transformers, ,,0.0 questions were parseable (min is 83%)
|
| 64 |
-
google/gemma-7b-it,2024-06-19 14:53:28,,google/gemma-7b-it,,,FAILED,eq-bench_pl,FAILED,1,transformers, ,,0.0 questions were parseable (min is 83%)
|
| 65 |
-
HuggingFaceH4/zephyr-7b-alpha,2024-06-19 15:05:31,,HuggingFaceH4/zephyr-7b-alpha,,,FAILED,eq-bench_pl,FAILED,1,transformers, ,,99.0 questions were parseable (min is 83%)
|
| 66 |
-
HuggingFaceH4/zephyr-7b-beta,2024-06-19 15:18:24,,HuggingFaceH4/zephyr-7b-beta,,,FAILED,eq-bench_pl,FAILED,1,transformers, ,,88.0 questions were parseable (min is 83%)
|
| 67 |
-
internlm/internlm2-7b,2024-06-19 15:36:06,,internlm/internlm2-7b,,,FAILED,eq-bench_pl,FAILED,1,transformers, ,,43.0 questions were parseable (min is 83%)
|
| 68 |
-
internlm/internlm2-base-7b,2024-06-19 15:54:53,,internlm/internlm2-base-7b,,,FAILED,eq-bench_pl,FAILED,1,transformers, ,,6.0 questions were parseable (min is 83%)
|
| 69 |
-
internlm/internlm2-chat-7b,2024-06-19 16:02:07,,internlm/internlm2-chat-7b,,,40.0,eq-bench_v2_pl,169.0,1,transformers, ,,
|
| 70 |
-
internlm/internlm2-chat-7b-sft,2024-06-19 16:07:04,,internlm/internlm2-chat-7b-sft,,,41.62,eq-bench_v2_pl,170.0,1,transformers, ,,
|
| 71 |
-
lex-hue/Delexa-7b,2024-06-19 16:12:19,,lex-hue/Delexa-7b,,,49.03,eq-bench_v2_pl,169.0,1,transformers, ,,
|
| 72 |
-
meta-llama/Llama-2-7b-chat-hf,2024-06-19 16:21:08,,meta-llama/Llama-2-7b-chat-hf,,,FAILED,eq-bench_pl,FAILED,1,transformers, ,,116.0 questions were parseable (min is 83%)
|
| 73 |
-
meta-llama/Llama-2-7b-hf,2024-06-19 16:36:41,,meta-llama/Llama-2-7b-hf,,,FAILED,eq-bench_pl,FAILED,1,transformers, ,,1.0 questions were parseable (min is 83%)
|
| 74 |
-
microsoft/WizardLM-2-7B,2024-06-19 16:44:22,,microsoft/WizardLM-2-7B,,,FAILED,eq-bench_pl,FAILED,1,transformers, ,,137.0 questions were parseable (min is 83%)
|
| 75 |
-
mistralai/Mistral-7B-Instruct-v0.1,2024-06-19 16:44:33,,mistralai/Mistral-7B-Instruct-v0.1,,,FAILED,eq-bench,FAILED,1,transformers, ,,Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cpu! (when checking argument for argument index in method wrapper_CUDA__index_select)
|
| 76 |
-
mistralai/Mistral-7B-Instruct-v0.2,2024-06-19 16:50:36,,mistralai/Mistral-7B-Instruct-v0.2,,,53.25,eq-bench_v2_pl,151.0,1,transformers, ,,
|
| 77 |
-
mistralai/Mistral-7B-Instruct-v0.3,2024-06-19 16:54:49,,mistralai/Mistral-7B-Instruct-v0.3,,,45.21,eq-bench_v2_pl,171.0,1,transformers, ,,
|
| 78 |
-
mistralai/Mistral-7B-v0.1,2024-06-19 16:59:50,,mistralai/Mistral-7B-v0.1,,,FAILED,eq-bench_pl,FAILED,1,transformers, ,,65.0 questions were parseable (min is 83%)
|
| 79 |
-
mistralai/Mistral-7B-v0.3,2024-06-19 17:16:38,,mistralai/Mistral-7B-v0.3,,,FAILED,eq-bench_pl,FAILED,1,transformers, ,,14.0 questions were parseable (min is 83%)
|
| 80 |
-
Nexusflow/Starling-LM-7B-beta,2024-06-19 17:23:18,,Nexusflow/Starling-LM-7B-beta,,,45.1,eq-bench_v2_pl,166.0,1,transformers, ,,
|
| 81 |
-
openchat/openchat-3.5-0106,2024-06-19 17:27:10,,openchat/openchat-3.5-0106,,,43.81,eq-bench_v2_pl,171.0,1,transformers, ,,
|
| 82 |
-
openchat/openchat-3.5-0106-gemma,2024-06-19 17:30:31,,openchat/openchat-3.5-0106-gemma,,,58.62,eq-bench_v2_pl,169.0,1,transformers, ,,
|
| 83 |
-
openchat/openchat-3.5-1210,2024-06-19 17:34:27,,openchat/openchat-3.5-1210,,,49.04,eq-bench_v2_pl,171.0,1,transformers, ,,
|
| 84 |
-
OPI-PG/Qra-7b,2024-06-19 17:50:28,,OPI-PG/Qra-7b,,,FAILED,eq-bench_pl,FAILED,1,transformers, ,,0.0 questions were parseable (min is 83%)
|
| 85 |
-
Qwen/Qwen1.5-7B,2024-06-19 17:57:53,,Qwen/Qwen1.5-7B,,,23.11,eq-bench_v2_pl,155.0,1,transformers, ,,
|
| 86 |
-
Qwen/Qwen1.5-7B-Chat,2024-06-19 18:03:34,,Qwen/Qwen1.5-7B-Chat,,,25.0,eq-bench_v2_pl,164.0,1,transformers, ,,
|
| 87 |
-
Qwen/Qwen2-7B,2024-06-19 18:09:23,,Qwen/Qwen2-7B,,,36.58,eq-bench_v2_pl,166.0,1,transformers, ,,
|
| 88 |
-
Qwen/Qwen2-7B-Instruct,2024-06-19 18:12:42,,Qwen/Qwen2-7B-Instruct,,,53.74,eq-bench_v2_pl,171.0,1,transformers, ,,
|
| 89 |
-
Remek/Kruk-7B-SP-001,2024-06-19 18:17:13,,Remek/Kruk-7B-SP-001,,,44.44,eq-bench_v2_pl,171.0,1,transformers, ,,
|
| 90 |
-
Remek/OpenChat-3.5-0106-PL-Omnibusv2,2024-06-19 18:17:24,,Remek/OpenChat-3.5-0106-PL-Omnibusv2,,,FAILED,eq-bench,FAILED,1,transformers, ,,'system_message' is undefined
|
| 91 |
-
Remek/OpenChat3.5-0106-Spichlerz-Bocian,2024-06-19 18:24:08,,Remek/OpenChat3.5-0106-Spichlerz-Bocian,,,44.13,eq-bench_v2_pl,166.0,1,transformers, ,,
|
| 92 |
-
Remek/OpenChat3.5-0106-Spichlerz-Inst-001,2024-06-19 18:28:48,,Remek/OpenChat3.5-0106-Spichlerz-Inst-001,,,41.6,eq-bench_v2_pl,171.0,1,transformers, ,,
|
| 93 |
-
RWKV/HF_v5-Eagle-7B,2024-06-19 19:16:27,,RWKV/HF_v5-Eagle-7B,,,FAILED,eq-bench_pl,FAILED,1,transformers, ,,0.0 questions were parseable (min is 83%)
|
| 94 |
-
RWKV/v5-Eagle-7B-HF,2024-06-19 20:04:12,,RWKV/v5-Eagle-7B-HF,,,FAILED,eq-bench_pl,FAILED,1,transformers, ,,0.0 questions were parseable (min is 83%)
|
| 95 |
-
speakleash/Bielik-7B-v0.1,2024-06-19 20:11:16,,speakleash/Bielik-7B-v0.1,,,FAILED,eq-bench_pl,FAILED,1,transformers, ,,139.0 questions were parseable (min is 83%)
|
| 96 |
-
szymonrucinski/Curie-7B-v1,2024-06-19 20:29:24,,szymonrucinski/Curie-7B-v1,,,FAILED,eq-bench_pl,FAILED,1,transformers, ,,1.0 questions were parseable (min is 83%)
|
| 97 |
-
teknium/OpenHermes-2.5-Mistral-7B,2024-06-19 20:34:12,,teknium/OpenHermes-2.5-Mistral-7B,,,37.48,eq-bench_v2_pl,171.0,1,transformers, ,,
|
| 98 |
-
Voicelab/trurl-2-7b,2024-06-19 20:39:26,,Voicelab/trurl-2-7b,,,FAILED,eq-bench_pl,FAILED,1,transformers, ,,141.0 questions were parseable (min is 83%)
|
| 99 |
-
microsoft/Phi-3-small-8k-instruct,2024-06-19 20:39:31,,microsoft/Phi-3-small-8k-instruct,,,FAILED,eq-bench,FAILED,1,transformers, ,,No module named 'pytest'
|
| 100 |
-
CohereForAI/aya-23-8B,2024-06-19 20:44:01,,CohereForAI/aya-23-8B,,,45.43,eq-bench_v2_pl,171.0,1,transformers, ,,
|
| 101 |
-
meta-llama/Meta-Llama-3-8B,2024-06-19 21:01:55,,meta-llama/Meta-Llama-3-8B,,,FAILED,eq-bench_pl,FAILED,1,transformers, ,,0.0 questions were parseable (min is 83%)
|
| 102 |
-
meta-llama/Meta-Llama-3-8B-Instruct,2024-06-19 21:06:08,,meta-llama/Meta-Llama-3-8B-Instruct,,,46.27,eq-bench_v2_pl,171.0,1,transformers, ,,
|
| 103 |
-
mlabonne/NeuralDaredevil-8B-abliterated,2024-06-19 21:13:31,,mlabonne/NeuralDaredevil-8B-abliterated,,,54.74,eq-bench_v2_pl,171.0,1,transformers, ,,
|
| 104 |
-
NousResearch/Hermes-2-Pro-Llama-3-8B,2024-06-19 21:18:18,,NousResearch/Hermes-2-Pro-Llama-3-8B,,,54.57,eq-bench_v2_pl,171.0,1,transformers, ,,
|
| 105 |
-
NousResearch/Hermes-2-Theta-Llama-3-8B,2024-06-19 21:25:22,,NousResearch/Hermes-2-Theta-Llama-3-8B,,,54.88,eq-bench_v2_pl,171.0,1,transformers, ,,
|
| 106 |
-
nvidia/Llama3-ChatQA-1.5-8B,2024-06-19 22:27:24,,nvidia/Llama3-ChatQA-1.5-8B,,,40.55,eq-bench_v2_pl,166.0,1,transformers, ,,
|
| 107 |
-
openchat/openchat-3.6-8b-20240522,2024-06-19 22:34:56,,openchat/openchat-3.6-8b-20240522,,,-2.0090659464796595e+18,eq-bench_v2_pl,170.0,1,transformers, ,,
|
| 108 |
-
Remek/Llama-3-8B-Omnibus-1-PL-v01-INSTRUCT,2024-06-19 22:39:46,,Remek/Llama-3-8B-Omnibus-1-PL-v01-INSTRUCT,,,26.63,eq-bench_v2_pl,171.0,1,transformers, ,,
|
| 109 |
-
01-ai/Yi-1.5-9B,2024-06-19 23:07:56,,01-ai/Yi-1.5-9B,,,FAILED,eq-bench_pl,FAILED,1,transformers, ,,1.0 questions were parseable (min is 83%)
|
| 110 |
-
01-ai/Yi-1.5-9B-Chat,2024-06-19 23:19:16,,01-ai/Yi-1.5-9B-Chat,,,48.78,eq-bench_v2_pl,163.0,1,transformers, ,,
|
| 111 |
-
google/recurrentgemma-9b-it,2024-06-19 23:28:19,,google/recurrentgemma-9b-it,,,52.82,eq-bench_v2_pl,171.0,1,transformers, ,,
|
| 112 |
-
THUDM/glm-4-9b,2024-06-19 23:28:41,,THUDM/glm-4-9b,,,FAILED,eq-bench,FAILED,1,transformers, ,,too many values to unpack (expected 2)
|
| 113 |
-
THUDM/glm-4-9b-chat,2024-06-19 23:29:01,,THUDM/glm-4-9b-chat,,,FAILED,eq-bench,FAILED,1,transformers, ,,too many values to unpack (expected 2)
|
| 114 |
-
NousResearch/Nous-Hermes-2-SOLAR-10.7B,2024-06-19 23:51:07,,NousResearch/Nous-Hermes-2-SOLAR-10.7B,,,49.85,eq-bench_v2_pl,169.0,1,transformers, ,,
|
| 115 |
-
TeeZee/Bielik-SOLAR-LIKE-10.7B-Instruct-v0.1,2024-06-20 00:00:02,,TeeZee/Bielik-SOLAR-LIKE-10.7B-Instruct-v0.1,,,35.63,eq-bench_v2_pl,164.0,1,transformers, ,,
|
| 116 |
-
upstage/SOLAR-10.7B-Instruct-v1.0,2024-06-20 00:19:48,,upstage/SOLAR-10.7B-Instruct-v1.0,,,57.35,eq-bench_v2_pl,162.0,1,transformers, ,,
|
| 117 |
-
upstage/SOLAR-10.7B-v1.0,2024-06-20 01:12:51,,upstage/SOLAR-10.7B-v1.0,,,FAILED,eq-bench_pl,FAILED,1,transformers, ,,1.0 questions were parseable (min is 83%)
|
| 118 |
-
tiiuae/falcon-11B,2024-06-20 01:23:54,,tiiuae/falcon-11B,,,42.41,eq-bench_v2_pl,171.0,1,transformers, ,,
|
| 119 |
-
lmsys/vicuna-13b-v1.5,2024-06-20 01:43:40,,lmsys/vicuna-13b-v1.5,,,FAILED,eq-bench_pl,FAILED,1,transformers, ,,84.0 questions were parseable (min is 83%)
|
| 120 |
-
OPI-PG/Qra-13b,2024-06-20 02:07:48,,OPI-PG/Qra-13b,,,FAILED,eq-bench_pl,FAILED,1,transformers, ,,0.0 questions were parseable (min is 83%)
|
| 121 |
-
teknium/OpenHermes-13B,2024-06-20 02:32:04,,teknium/OpenHermes-13B,,,36.85,eq-bench_v2_pl,162.0,1,transformers, ,,
|
| 122 |
-
Voicelab/trurl-2-13b-academic,2024-06-20 02:38:04,,Voicelab/trurl-2-13b-academic,,,25.92,eq-bench_v2_pl,162.0,1,transformers, ,,
|
| 123 |
-
microsoft/Phi-3-medium-4k-instruct,2024-06-20 02:46:38,,microsoft/Phi-3-medium-4k-instruct,,,57.07,eq-bench_v2_pl,169.0,1,transformers, ,,
|
| 124 |
-
Qwen/Qwen1.5-14B-Chat,2024-06-20 02:52:13,,Qwen/Qwen1.5-14B-Chat,,,51.26,eq-bench_v2_pl,160.0,1,transformers, ,,
|
| 125 |
-
internlm/internlm2-20b,2024-06-20 09:04:33,,internlm/internlm2-20b,,,FAILED,eq-bench_pl,FAILED,1,transformers, ,,4.0 questions were parseable (min is 83%)
|
| 126 |
-
internlm/internlm2-chat-20b,2024-06-20 09:47:11,,internlm/internlm2-chat-20b,,,36.52,eq-bench_v2_pl,170.0,1,transformers, ,,
|
| 127 |
-
Qwen/Qwen1.5-32B,2024-06-20 13:25:12,,Qwen/Qwen1.5-32B,,,54.35,eq-bench_v2_pl,170.0,1,transformers, ,,
|
| 128 |
-
Qwen/Qwen1.5-32B-Chat,2024-06-20 13:34:52,,Qwen/Qwen1.5-32B-Chat,,,60.69,eq-bench_v2_pl,168.0,1,transformers, ,,
|
| 129 |
-
01-ai/Yi-1.5-34B-Chat,2024-06-20 13:51:30,,01-ai/Yi-1.5-34B-Chat,,,46.32,eq-bench_v2_pl,171.0,1,transformers, ,,
|
| 130 |
-
CohereForAI/aya-23-35B,2024-06-20 14:03:07,,CohereForAI/aya-23-35B,,,58.41,eq-bench_v2_pl,171.0,1,transformers, ,,
|
| 131 |
-
CohereForAI/c4ai-command-r-v01,2024-06-20 14:14:54,,CohereForAI/c4ai-command-r-v01,,,56.43,eq-bench_v2_pl,171.0,1,transformers, ,,
|
| 132 |
-
mistralai/Mixtral-8x7B-Instruct-v0.1,2024-06-20 14:35:28,,mistralai/Mixtral-8x7B-Instruct-v0.1,,,58.64,eq-bench_v2_pl,168.0,1,transformers, ,,
|
| 133 |
-
mistralai/Mixtral-8x7B-v0.1,2024-06-20 15:30:24,,mistralai/Mixtral-8x7B-v0.1,,,FAILED,eq-bench_pl,FAILED,1,transformers, ,,10.0 questions were parseable (min is 83%)
|
| 134 |
-
Qwen/Qwen2-57B-A14B-Instruct,2024-06-20 16:19:41,,Qwen/Qwen2-57B-A14B-Instruct,,,57.64,eq-bench_v2_pl,171.0,1,transformers, ,,
|
| 135 |
-
meta-llama/Meta-Llama-3-70B,2024-06-20 16:59:30,,meta-llama/Meta-Llama-3-70B,,,46.1,eq-bench_v2_pl,145.0,1,transformers, ,,
|
| 136 |
-
meta-llama/Meta-Llama-3-70B-Instruct,2024-06-20 17:15:58,,meta-llama/Meta-Llama-3-70B-Instruct,,,71.21,eq-bench_v2_pl,171.0,1,transformers, ,,
|
| 137 |
-
Qwen/Qwen1.5-72B,2024-06-20 17:50:17,,Qwen/Qwen1.5-72B,,,53.96,eq-bench_v2_pl,163.0,1,transformers, ,,
|
| 138 |
-
Qwen/Qwen1.5-72B-Chat,2024-06-20 18:06:58,,Qwen/Qwen1.5-72B-Chat,,,68.03,eq-bench_v2_pl,171.0,1,transformers, ,,
|
| 139 |
-
Qwen/Qwen2-72B,2024-06-20 18:36:22,,Qwen/Qwen2-72B,,,69.75,eq-bench_v2_pl,169.0,1,transformers, ,,
|
| 140 |
-
Qwen/Qwen2-72B-Instruct,2024-06-20 18:55:02,,Qwen/Qwen2-72B-Instruct,,,72.07,eq-bench_v2_pl,169.0,1,transformers, ,,
|
| 141 |
-
mistralai/Mixtral-8x22B-v0.1,2024-06-21 20:20:37,,mistralai/Mixtral-8x22B-v0.1,,,FAILED,eq-bench_pl,FAILED,1,transformers, ,,34.0 questions were parseable (min is 83%)
|
| 142 |
-
mistralai/Mixtral-8x22B-Instruct-v0.1,2024-06-26 23:40:01,,mistralai/Mixtral-8x22B-Instruct-v0.1,,,67.63,eq-bench_v2_pl,171.0,1,transformers, ,,
|
| 143 |
-
mistralai/Mixtral-8x22B-v0.1,2024-06-27 01:17:13,,mistralai/Mixtral-8x22B-v0.1,,,FAILED,eq-bench_pl,FAILED,1,transformers, ,,50.0 questions were parseable (min is 83%)
|
| 144 |
-
alpindale/WizardLM-2-8x22B,2024-06-27 01:50:42,,alpindale/WizardLM-2-8x22B,,,69.56,eq-bench_v2_pl,171.0,1,transformers, ,,
|
| 145 |
-
Bielik_v2.2b,2024-08-24 09:54:33,,speakleash/Bielik-11B-v2.2-Instruct,,,69.05,eq-bench_v2_pl,171.0,1,transformers, ,,
|
| 146 |
-
Bielik_v2.1,2024-08-24 10:07:46,,speakleash/Bielik-11B-v2.1-Instruct,,,66.27,eq-bench_v2_pl,155.0,1,transformers, ,,
|
| 147 |
-
meta-llama/Meta-Llama-3.1-70B-Instruct,2024-08-24 21:24:39,,meta-llama/Meta-Llama-3.1-70B-Instruct,,,FAILED,eq-bench,FAILED,1,transformers, ,,`rope_scaling` must be a dictionary with two fields, `type` and `factor`, got {'factor': 8.0, 'low_freq_factor': 1.0, 'high_freq_factor': 4.0, 'original_max_position_embeddings': 8192, 'rope_type': 'llama3'}
|
| 148 |
-
mistralai/Mistral-Large-Instruct-2407,2024-08-24 21:51:53,,mistralai/Mistral-Large-Instruct-2407,,,78.07,eq-bench_v2_pl,171.0,1,transformers, ,,
|
| 149 |
-
meta-llama/Meta-Llama-3.1-70B-Instruct,2024-08-24 22:23:40,,meta-llama/Meta-Llama-3.1-70B-Instruct,,,72.53,eq-bench_v2_pl,171.0,1,transformers, ,,
|
| 150 |
-
meta-llama/Meta-Llama-3.1-405B-Instruct-FP8,2024-08-25 20:59:04,openai_api,meta-llama/Meta-Llama-3.1-405B-Instruct-FP8,,,77.23,eq-bench_v2_pl,171.0,1,openai,,,
|
| 151 |
-
gpt-3.5-turbo,2024-08-25 21:14:25,openai_api,gpt-3.5-turbo,,,57.7,eq-bench_v2_pl,171.0,1,openai,,,
|
| 152 |
-
gpt-4o-mini-2024-07-18,2024-08-25 21:17:34,openai_api,gpt-4o-mini-2024-07-18,,,71.15,eq-bench_v2_pl,171.0,1,openai,,,
|
| 153 |
-
gpt-4o-2024-08-06,2024-08-25 21:24:35,openai_api,gpt-4o-2024-08-06,,,75.15,eq-bench_v2_pl,171.0,1,openai,,,
|
| 154 |
-
gpt-4-turbo-2024-04-09,2024-08-25 21:31:42,openai_api,gpt-4-turbo-2024-04-09,,,77.77,eq-bench_v2_pl,164.0,1,openai,,,
|
| 155 |
-
Bielik_v2.3,2024-09-14 10:40:57,,speakleash/Bielik-11B-v2.3-Instruct,,,70.86,eq-bench_v2_pl,171.0,1,transformers, ,,
|
| 156 |
-
PLLuM-12B-nc-chat,2025-02-24 15:02:07,,CYFRAGOVPL/PLLuM-12B-nc-chat,,,49.23,eq-bench_pl,123.0,1,transformers, ,,123.0 questions were parseable (min is 83%)
|
| 157 |
-
Llama-PLLuM-8B-instruct,2025-02-24 16:55:16,,CYFRAGOVPL/Llama-PLLuM-8B-instruct,,,43.56,eq-bench_pl,124.0,1,transformers, ,,124.0 questions were parseable (min is 83%)
|
| 158 |
-
PLLuM-12B-nc-instruct,2025-02-24 17:38:48,,CYFRAGOVPL/PLLuM-12B-nc-instruct,,,29.50,eq-bench_pl,76.0,1,transformers, ,,76.0 questions were parseable (min is 83%)
|
| 159 |
-
PLLuM-12B-chat,2025-02-24 17:56:34,,CYFRAGOVPL/PLLuM-12B-chat,,,57.29,eq-bench_v2_pl,156.0,1,transformers, ,,
|
| 160 |
-
PLLuM-12B-instruct,2025-02-24 18:03:06,,CYFRAGOVPL/PLLuM-12B-instruct,,,40.21,eq-bench_v2_pl,154.0,1,transformers, ,,
|
| 161 |
-
Llama-PLLuM-8B-chat,2025-02-24 18:40:04,,CYFRAGOVPL/Llama-PLLuM-8B-chat,,,50.97,eq-bench_v2_pl,155.0,1,transformers, ,,
|
| 162 |
-
Llama-PLLuM-70B-instruct,2025-02-23 22:45:37,,CYFRAGOVPL/Llama-PLLuM-70B-instruct,,,69.99,eq-bench_v2_pl,171.0,1,transformers, ,,
|
| 163 |
-
Llama-PLLuM-70B-chat,2025-02-24 22:32:57,,CYFRAGOVPL/Llama-PLLuM-70B-chat,,,72.99,eq-bench_v2_pl,170.0,1,transformers, ,,
|
| 164 |
-
PLLuM-8x7B-nc-chat,2025-02-23 14:33:22,openai_api,CYFRAGOVPL/PLLuM-8x7B-nc-chat,,,47.29,eq-bench_v2_pl,171.0,1,openai,,,
|
| 165 |
-
PLLuM-8x7B-nc-instruct,2025-02-23 14:33:22,openai_api,CYFRAGOVPL/PLLuM-8x7B-nc-instruct,,,41.75,eq-bench_v2_pl,171.0,1,openai,,,
|
| 166 |
-
PLLuM-8x7B-chat,2025-02-23 14:33:22,openai_api,CYFRAGOVPL/PLLuM-8x7B-chat,,,45.22,eq-bench_v2_pl,171.0,1,openai,,,
|
| 167 |
-
PLLuM-8x7B-instruct,2025-02-23 14:33:22,openai_api,CYFRAGOVPL/PLLuM-8x7B-instruct,,,39.55,eq-bench_v2_pl,171.0,1,openai,,,
|
| 168 |
-
Qwen2.5-7B-Instruct,2025-03-01 11:49:28,,Qwen/Qwen2.5-7B-Instruct,,,58.58,eq-bench_v2_pl,171.0,1,transformers,,,
|
| 169 |
-
Qwen2.5-14B-Instruct,2025-03-01 12:01:56,,Qwen/Qwen2.5-14B-Instruct,,,69.58,eq-bench_v2_pl,170.0,1,transformers,,,
|
| 170 |
-
Qwen2.5-1.5B-Instruct,2025-03-01 12:09:18,,Qwen/Qwen2.5-1.5B-Instruct,,,27.79,eq-bench_v2_pl,170.0,1,transformers,,,
|
| 171 |
-
phi-4,2025-03-01 12:19:38,,microsoft/phi-4,,,64.37,eq-bench_v2_pl,157.0,1,transformers,,,
|
| 172 |
-
glm-4-9b-chat,2025-03-01 12:23:46,,THUDM/glm-4-9b-chat,,,61.79,eq-bench_v2_pl,171.0,1,transformers,,,
|
| 173 |
-
openchat-3.6-8b-20240522,2025-03-01 12:29:29,,openchat/openchat-3.6-8b-20240522,,,-2.0090659464796536e+18,eq-bench_v2_pl,170.0,1,transformers,,,
|
| 174 |
-
Qwen2.5-32B-Instruct,2025-03-02 14:08:52,,Qwen/Qwen2.5-32B-Instruct,,,71.15,eq-bench_v2_pl,171.0,1,transformers,,,
|
| 175 |
-
Qwen2.5-72B-Instruct,2025-03-02 14:25:32,,Qwen/Qwen2.5-72B-Instruct,,,68.89,eq-bench_v2_pl,170.0,1,transformers,,,
|
| 176 |
-
Llama-3.1-Nemotron-70B-Instruct-HF,2025-03-02 15:04:25,,nvidia/Llama-3.1-Nemotron-70B-Instruct-HF,,,74.75,eq-bench_pl,133.0,1,transformers,,,133.0 questions were parseable (min is 83%)
|
| 177 |
-
Llama-3.2-1B-Instruct,2025-03-02 16:35:24,,meta-llama/Llama-3.2-1B-Instruct,,,20.59,eq-bench_v2_pl,148.0,1,transformers,,,
|
| 178 |
-
EuroLLM-9B-Instruct,2025-03-02 16:41:02,,utter-project/EuroLLM-9B-Instruct,,,54.75,eq-bench_v2_pl,169.0,1,transformers,,,
|
| 179 |
-
Llama-3.3-70B-Instruct,2025-03-02 16:59:31,,meta-llama/Llama-3.3-70B-Instruct,,,72.86,eq-bench_v2_pl,166.0,1,transformers,,,
|
| 180 |
-
Llama-3.2-3B-Instruct,2025-03-02 17:14:17,,meta-llama/Llama-3.2-3B-Instruct,,,46.46,eq-bench_v2_pl,170.0,1,transformers,,,
|
| 181 |
-
Qwen2.5-3B-Instruct,2025-03-02 17:26:57,,Qwen/Qwen2.5-3B-Instruct,,,36.08,eq-bench_v2_pl,170.0,1,transformers,,,
|
| 182 |
-
Mistral-Small-24B-Instruct-2501,2025-03-02 17:33:14,,mistralai/Mistral-Small-24B-Instruct-2501,,,70.52,eq-bench_v2_pl,171.0,1,transformers,,,
|
| 183 |
-
Mistral-Small-Instruct-2409,2025-03-02 17:43:01,,mistralai/Mistral-Small-Instruct-2409,,,72.85,eq-bench_v2_pl,171.0,1,transformers,,,
|
| 184 |
-
Mistral-Nemo-Instruct-2407,2025-03-03 10:29:42,,mistralai/Mistral-Nemo-Instruct-2407,,,61.76,eq-bench_v2_pl,171.0,1,transformers,,,
|
| 185 |
-
Phi-4-mini-instruct,2025-03-03 13:20:03,,microsoft/Phi-4-mini-instruct,,,50.82,eq-bench_v2_pl,170.0,1,transformers,,,
|
| 186 |
-
Mistral-Large-Instruct-2411,2025-03-07 12:17:17,,mistralai/Mistral-Large-Instruct-2411,,,77.29,eq-bench_v2_pl,171.0,1,transformers,,,
|
| 187 |
-
Bielik-11B-v2.5-Instruct,2025-05-01 19:27:42,,speakleash/Bielik-11B-v2.5-Instruct,,,72.42,eq-bench_v2_pl,170.0,1,transformers,,,
|
| 188 |
-
Bielik-1.5B-v3.0-Instruct,2025-05-04 00:28:45,,speakleash/Bielik-1.5B-v3.0-Instruct,,,18.99,eq-bench_pl,125.0,1,transformers,,,125.0 questions were parseable (min is 83%)
|
| 189 |
-
Bielik-4.5B-v3.0-Instruct,2025-05-04 16:16:42,,speakleash/Bielik-4.5B-v3.0-Instruct,,,56.21,eq-bench_v2_pl,163.0,1,transformers,,,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
metadata.json
DELETED
|
@@ -1,355 +0,0 @@
|
|
| 1 |
-
{
|
| 2 |
-
"Azurro/APT3-1B-Base": 1,
|
| 3 |
-
"HuggingFaceH4/zephyr-7b-alpha": 7,
|
| 4 |
-
"Voicelab/trurl-2-13b-academic": 13,
|
| 5 |
-
"HuggingFaceH4/zephyr-7b-beta": 7,
|
| 6 |
-
"Voicelab/trurl-2-7b": 7,
|
| 7 |
-
"mistralai/Mistral-7B-v0.1": 7,
|
| 8 |
-
"mistralai/Mistral-7B-v0.1,peft=lora/output/mistral-7b-v0.1-lora-pl/checkpoint-400/adapter_model": 7,
|
| 9 |
-
"mistralai/Mistral-7B-v0.1,peft=lora/output/mistral-7b-v0.1-lora-pl/checkpoint-200/adapter_model": 7,
|
| 10 |
-
"mistralai/Mistral-7B-v0.1,load_in_8bit=True": 7,
|
| 11 |
-
"Nondzu/zephyr-speakleash-007-pl-8192-32-16-0.05": 7,
|
| 12 |
-
"openchat/openchat-3.5-0106": 7,
|
| 13 |
-
"mistralai/Mistral-7B-v0.1,peft=lora/output/mistral-7b-v0.1-lora-pl/checkpoint-2000/adapter_model": 7,
|
| 14 |
-
"mistralai/Mistral-7B-v0.1,peft=lora/output/mistral-7b-v0.1-lora-pl/checkpoint-2200/adapter_model": 7,
|
| 15 |
-
"mistralai/Mistral-7B-Instruct-v0.1": 7,
|
| 16 |
-
"APT3-1B-Instruct-e1": 1,
|
| 17 |
-
"mistralai/Mistral-7B-v0.1,peft=lora/output/mistral-7b-v0.1-lora-pl/checkpoint-800/adapter_model": 7,
|
| 18 |
-
"mistralai/Mistral-7B-v0.1,peft=lora/output/mistral-7b-v0.1-lora-pl/checkpoint-600/adapter_model": 7,
|
| 19 |
-
"APT3-1B-Instruct-e2": 1,
|
| 20 |
-
"mistralai/Mistral-7B-v0.1,load_in_4bit=True": 7,
|
| 21 |
-
"speakleash/3-5B_high_base/epoch_2_hf": 3.5,
|
| 22 |
-
"speakleash/3-5B_high_base/epoch_1_hf": 3.5,
|
| 23 |
-
"speakleash/3-5B_high_base/epoch_0_hf": 3.5,
|
| 24 |
-
"speakleash/7B_high_base/epoch_1_hf": 7,
|
| 25 |
-
"speakleash/7B_high_base/epoch_0_hf": 7,
|
| 26 |
-
"Nondzu/zephyr-speakleash-010-pl-3072-32-16-0.01": 7,
|
| 27 |
-
"google/mt5-xl": 3.7,
|
| 28 |
-
"speakleash/7B_high_sft/epoch_2_base/epoch_2_hf": 7,
|
| 29 |
-
"OPI-PG/Qra-1b": 1,
|
| 30 |
-
"OPI-PG/Qra-13b": 13,
|
| 31 |
-
"OPI-PG/Qra-7b": 7,
|
| 32 |
-
"teknium/OpenHermes-2.5-Mistral-7B": 7,
|
| 33 |
-
"openchat/openchat-3.5-1210": 7,
|
| 34 |
-
"speakleash/apt3-1B_base/apt3-1B-sequential_hf": 1,
|
| 35 |
-
"speakleash/apt3-1B_base/apt3-1B-shuffled_hf": 1,
|
| 36 |
-
"speakleash/1B_high_base/like_apt3-1B_hf": 1,
|
| 37 |
-
"speakleash/1B_high_base/epoch_3_hf": 1,
|
| 38 |
-
"speakleash/7B_high_sft/epoch_1_base/epoch_2_hf": 7,
|
| 39 |
-
"speakleash/7B_high_sft/epoch_1_base/epoch_1_hf": 7,
|
| 40 |
-
"speakleash/7B_high_sft/epoch_0_base/epoch_0_hf": 7,
|
| 41 |
-
"speakleash/7B_high_sft/epoch_2_base/epoch_1_hf": 7,
|
| 42 |
-
"speakleash/3-5B_high_sft/epoch_3_base/epoch_2_hf": 3.5,
|
| 43 |
-
"allegro/plt5-large": 0.82,
|
| 44 |
-
"internlm/internlm2-7b": 7,
|
| 45 |
-
"sdadas/polish-gpt2-xl": 1.67,
|
| 46 |
-
"speakleash/1B_4k_high_sft/epoch_3_base/epoch_1_hf": 1,
|
| 47 |
-
"speakleash/mistral-PL_7B/epoch_0_hf": 7,
|
| 48 |
-
"speakleash/1B_high_sft/epoch_3_base/epoch_1_hf": 1,
|
| 49 |
-
"speakleash/polish-mistral-7B/epoch_0_hf": 7,
|
| 50 |
-
"speakleash/3-5B_high_sft/epoch_0_base/epoch_2_hf": 3.5,
|
| 51 |
-
"speakleash/3-5B_high_sft/epoch_0_base/epoch_1_hf": 3.5,
|
| 52 |
-
"speakleash/3-5B_high_sft/epoch_0_base/epoch_0_hf": 3.5,
|
| 53 |
-
"speakleash/7B_high_base/epoch_2_hf": 7,
|
| 54 |
-
"speakleash/10B-4k_high_sft/epoch_3_base/epoch_1_hf": 10,
|
| 55 |
-
"speakleash/3-5B_high_base/epoch_3_hf": 3.5,
|
| 56 |
-
"microsoft/phi-2": 2.7,
|
| 57 |
-
"RWKV/HF_v5-Eagle-7B": 7,
|
| 58 |
-
"mistralai/Mistral-7B-Instruct-v0.2": 7,
|
| 59 |
-
"speakleash/llama-apt3-7B/only-spi-e0_hf": 7,
|
| 60 |
-
"speakleash/llama-apt3-7B/spkl-only_sft/e4_hf": 7,
|
| 61 |
-
"speakleash/llama-apt3-7B/spkl-only_sft/e5_hf": 7,
|
| 62 |
-
"speakleash/llama-apt3-7B/spkl-only_sft/e3_hf": 7,
|
| 63 |
-
"speakleash/llama-apt3-7B/spkl-only_sft/e2_hf": 7,
|
| 64 |
-
"meta-llama/Llama-2-7b-hf": 7,
|
| 65 |
-
"meta-llama/Llama-2-7b-chat-hf": 7,
|
| 66 |
-
"internlm/internlm2-chat-7b": 7,
|
| 67 |
-
"internlm/internlm2-base-7b": 7,
|
| 68 |
-
"internlm/internlm2-1_8b": 1.8,
|
| 69 |
-
"internlm/internlm2-chat-1_8b": 1.8,
|
| 70 |
-
"speakleash/mistral-apt3-7B/only-spi_sft/e0_hf": 7,
|
| 71 |
-
"speakleash/mistral-apt3-7B/only-spi-e0_hf": 7,
|
| 72 |
-
"speakleash/mistral-apt3-7B/apt3-e0_hf": 7,
|
| 73 |
-
"speakleash/mistral-apt3-7B/spi-e0_hf": 7,
|
| 74 |
-
"speakleash/mistral-apt3-7B/spkl_sft_v2/e4_hf": 7,
|
| 75 |
-
"speakleash/mistral-apt3-7B/spkl_sft_v2/e5_hf": 7,
|
| 76 |
-
"speakleash/mistral-apt3-7B/spkl_sft_v2/e3_hf": 7,
|
| 77 |
-
"speakleash/mistral-apt3-7B/spkl_sft_v2/e2_hf": 7,
|
| 78 |
-
"speakleash/mistral-apt3-7B/only-spi_sft_v2/e4_bb62a5b8": 7,
|
| 79 |
-
"speakleash/mistral-apt3-7B/only-spi_sft_v2/e6_6b0aa8d6": 7,
|
| 80 |
-
"speakleash/mistral-apt3-7B/only-spi_sft_v2/e3_f8b5e568": 7,
|
| 81 |
-
"speakleash/mistral-apt3-7B/only-spi_sft_v2/e2_3b7fc53e": 7,
|
| 82 |
-
"speakleash/mistral-apt3-7B/only-spi_sft_v2/e5_f75cbc76": 7,
|
| 83 |
-
"speakleash/mistral-apt3-7B/only-spi_sft_v2/e7_642f3822": 7,
|
| 84 |
-
"speakleash/mistral-apt3-7B/spkl_sft/e3_17ef3119": 7,
|
| 85 |
-
"speakleash/mistral-apt3-7B/spkl_sft/e2_7dc8df86": 7,
|
| 86 |
-
"google/gemma-7b": 7,
|
| 87 |
-
"google/gemma-7b-it": 7,
|
| 88 |
-
"SOTA FT HerBERT (large)": 1,
|
| 89 |
-
"Baseline (majority class)": 0,
|
| 90 |
-
"SOTA FT Polish RoBERTa": 1,
|
| 91 |
-
"SOTA FT ULMFiT-SP-PL": 0.1,
|
| 92 |
-
"speakleash/llama-apt3-13B/spkl-plus/e0_caa5ad79": 13,
|
| 93 |
-
"speakleash/llama-apt3-13B/spkl-only/e0_cc0931c5": 13,
|
| 94 |
-
"eryk-mazus/polka-1.1b": 1.1,
|
| 95 |
-
"berkeley-nest/Starling-LM-7B-alpha": 7,
|
| 96 |
-
"Remek/OpenChat3.5-0106-Spichlerz-Inst-001": 7,
|
| 97 |
-
"speakleash/mistral_7B-v2/spkl-all-e2_5bd6027d": 7,
|
| 98 |
-
"speakleash/mistral_7B-v2/spkl-all-e0_8cf0987d": 7,
|
| 99 |
-
"speakleash/mistral_7B-v2/spkl-only-e0_ef715d74": 7,
|
| 100 |
-
"speakleash/mistral_7B-v2/spkl-only-e1_333887a5": 7,
|
| 101 |
-
"speakleash/mistral_7B-v2/spkl-all-e1_0b514ce9": 7,
|
| 102 |
-
"speakleash/mistral_7B-v2/spkl-only-e2_5dac700d": 7,
|
| 103 |
-
"speakleash/llama-apt3-13B/spkl-only_e0_sft/ext_e3_23b6bc9b": 13,
|
| 104 |
-
"speakleash/llama-apt3-13B/spkl-only_e0_sft/spkl_e4_e3a666b1": 13,
|
| 105 |
-
"speakleash/llama-apt3-13B/spkl-only_e0_sft/spkl_e3_45ef6b63": 13,
|
| 106 |
-
"speakleash/llama-apt3-13B/spkl-only_e0_sft/spkl_e5_bf95416b": 13,
|
| 107 |
-
"speakleash/llama-apt3-13B/spkl-only_e0_sft/ext_e2_f7606252": 13,
|
| 108 |
-
"speakleash/llama-apt3-13B/spkl-only_e0_sft/spkl_e2_898ae6c6": 13,
|
| 109 |
-
"speakleash/apt4-1B/spkl-only-e3_756856c4": 1,
|
| 110 |
-
"speakleash/apt4-1B/spkl-all-e0_7f6a991e": 1,
|
| 111 |
-
"speakleash/apt4-1B/spkl-only-e2_969e76b4": 1,
|
| 112 |
-
"speakleash/apt4-1B/spkl-all-e2_bfb44ded": 1,
|
| 113 |
-
"speakleash/apt4-1B/spkl-all-e3_063753f9": 1,
|
| 114 |
-
"speakleash/apt4-1B/spkl-all-e1_74a293c8": 1,
|
| 115 |
-
"speakleash/apt4-1B/spkl-only-e0_b9c8bb39": 1,
|
| 116 |
-
"speakleash/apt4-1B/spkl-only-e1_fea4b41b": 1,
|
| 117 |
-
"upstage/SOLAR-10.7B-Instruct-v1.0": 10.7,
|
| 118 |
-
"upstage/SOLAR-10.7B-v1.0": 10.7,
|
| 119 |
-
"speakleash/mistral_7B-v2/spkl-all_sft/e1_base/spkl-all-e1_9aee511a": 7,
|
| 120 |
-
"speakleash/mistral_7B-v2/spkl-all_sft/e1_base/spkl-all-e0_dd9d2777": 7,
|
| 121 |
-
"speakleash/mistral_7B-v2/spkl-only_sft/e1_base/spkl-only-e1_d0ac34b7": 7,
|
| 122 |
-
"speakleash/mistral_7B-v2/spkl-only_sft/e1_base/spkl-only-e0_9eea5944": 7,
|
| 123 |
-
"Remek/Kruk-7B-SP-001": 7,
|
| 124 |
-
"TinyLlama/TinyLlama-1.1B-Chat-v1.0": 1.1,
|
| 125 |
-
"internlm/internlm2-chat-7b-sft": 7,
|
| 126 |
-
"speakleash/mistral_7B-v2/spkl-all_sft/e1_base/spkl-all-e3_72a6c52a": 7,
|
| 127 |
-
"speakleash/mistral_7B-v2/spkl-only_sft/e1_base/spkl-only-e3_08a0fd89": 7,
|
| 128 |
-
"speakleash/mistral_7B-v2/spkl-all_sft/e1_base/spkl-all-e2_0a1a62c0": 7,
|
| 129 |
-
"speakleash/mistral_7B-v2/spkl-only_sft/e1_base/spkl-only-e2_a7c66ac5": 7,
|
| 130 |
-
"speakleash/mistral-apt3-7B_v2/spkl-only_2e5-e0_116fa2bc": 7,
|
| 131 |
-
"speakleash/mistral-apt3-7B_v2/spkl-only_7e6-e0_8544bbd3": 7,
|
| 132 |
-
"speakleash/mistral-apt3-7B_v2/spkl-only_2e5-e1_013bd434": 7,
|
| 133 |
-
"speakleash/mistral-apt3-7B_v2/spkl-only-e1_87bfffac": 7,
|
| 134 |
-
"speakleash/mistral-apt3-7B_v2/spkl-only-e2_939d897f": 7,
|
| 135 |
-
"speakleash/mistral-apt3-7B_v2/spkl-only-e0_2a5be0dc": 7,
|
| 136 |
-
"speakleash/mistral-apt3-7B_v2/spkl-only_sft/e1_base_2e5/spkl-only-e1_0303962d": 7,
|
| 137 |
-
"speakleash/mistral-apt3-7B_v2/spkl-only_sft/e1_base_2e5/spkl-only-e0_f4aaf490": 7,
|
| 138 |
-
"speakleash/mistral-apt3-7B_v2/spkl-only_sft/e0_base_2e5/spkl-only-e0_009b090e": 7,
|
| 139 |
-
"speakleash/mistral-apt3-7B_v2/spkl-only_sft/e0_base_2e5/spkl-only-e1_91aae327": 7,
|
| 140 |
-
"speakleash/mistral_7B-v2/spkl-only_sft_v2/e1_base/spkl-only_7e6w-e1_14d52992": 7,
|
| 141 |
-
"speakleash/mistral_7B-v2/spkl-only_sft_v2/e1_base/spkl-only_7e6w-e2_72422a32": 7,
|
| 142 |
-
"speakleash/mistral_7B-v2/spkl-only_sft_v2/e1_base/spkl-only-e2_dcb87efc": 7,
|
| 143 |
-
"speakleash/mistral_7B-v2/spkl-only_sft_v2/e1_base/spkl-only_7e6-e2_04382c38": 7,
|
| 144 |
-
"speakleash/mistral_7B-v2/spkl-only_sft_v2/e1_base/spkl-only_7e6-e3_860889b1": 7,
|
| 145 |
-
"speakleash/mistral_7B-v2/spkl-only_sft_v2/e1_base/spkl-only_7e6w-e3_78cf3243": 7,
|
| 146 |
-
"speakleash/mistral_7B-v2/spkl-only_sft_v2/e1_base/spkl-only_9e7-e0_27275908": 7,
|
| 147 |
-
"speakleash/mistral_7B-v2/spkl-only_sft_v2/e1_base/spkl-only-e0_d31a18b7": 7,
|
| 148 |
-
"speakleash/mistral_7B-v2/spkl-only_sft_v2/e1_base/spkl-only_7e6-e0_c26126c8": 7,
|
| 149 |
-
"speakleash/mistral_7B-v2/spkl-only_sft_v2/e1_base/spkl-only-e3_a5833b75": 7,
|
| 150 |
-
"speakleash/mistral_7B-v2/spkl-only_sft_v2/e1_base/spkl-only_7e6w-e0_6c834bf7": 7,
|
| 151 |
-
"speakleash/mistral_7B-v2/spkl-only_sft_v2/e1_base/spkl-only_7e6-e1_87b7c12f": 7,
|
| 152 |
-
"speakleash/mistral_7B-v2/spkl-only_sft_v2/e1_base/spkl-only_9e7-e2_5ce06dd2": 7,
|
| 153 |
-
"speakleash/mistral_7B-v2/spkl-only_sft_v2/e1_base/spkl-only_9e7-e1_561ac4bb": 7,
|
| 154 |
-
"speakleash/mistral_7B-v2/spkl-only_sft_v2/e1_base/spkl-only-e1_392d55d9": 7,
|
| 155 |
-
"speakleash/mistral_7B-v2/spkl-all_sft_v2/e1_base/spkl-all_2e6-e2_db0cd739": 7,
|
| 156 |
-
"speakleash/mistral_7B-v2/spkl-all_sft_v2/e1_base/spkl-all_2e6-e3_4960543c": 7,
|
| 157 |
-
"speakleash/mistral_7B-v2/spkl-all_sft_v2/e1_base/spkl-all_2e6-e0_1b65c3ac": 7,
|
| 158 |
-
"speakleash/mistral_7B-v2/spkl-all_sft_v2/e1_base/spkl-all_2e6-e1_70c70cc6": 7,
|
| 159 |
-
"speakleash/mistral-apt3-7B-v2/spkl-only_sft/e1_base/spkl-only-e2_3a071212": 7,
|
| 160 |
-
"speakleash/mistral-apt3-7B-v2/spkl-only_sft/e1_base/spkl-only-e0_6dc2e217": 7,
|
| 161 |
-
"speakleash/mistral-apt3-7B-v2/spkl-only_sft/e1_base/spkl-only-e1_46610eb1": 7,
|
| 162 |
-
"speakleash/mistral-apt3-7B-v2/spkl-only_sft-weighted/e1_base/spkl-only-e0_e79dcb9f": 7,
|
| 163 |
-
"speakleash/mistral-apt3-7B-v2/spkl-only_sft-weighted/e1_base/spkl-only-e1_10a78140": 7,
|
| 164 |
-
"Remek/OpenChat3.5-0106-Spichlerz-Bocian": 7,
|
| 165 |
-
"alpindale/Mistral-7B-v0.2-hf": 7,
|
| 166 |
-
"Azurro/APT3-275M-Base": 0.3,
|
| 167 |
-
"szymonrucinski/Curie-7B-v1": 7,
|
| 168 |
-
"speakleash/mistral-apt3-7B/spkl-all_sft_v4/e0_base/spkl-all-e0-lr5e5_a47a2047": 7,
|
| 169 |
-
"speakleash/mistral-apt3-7B/spkl-all_sft_v4/e0_base/spkl-all-e1_1774eb92": 7,
|
| 170 |
-
"speakleash/mistral-apt3-7B/spkl-all_sft_v4/e0_base/spkl-all-e0-lr2e6_71659188": 7,
|
| 171 |
-
"speakleash/mistral-apt3-7B/spkl-all_sft_v4/e0_base/spkl-all-e0_35239ee5": 7,
|
| 172 |
-
"speakleash/mistral-apt3-7B/spkl-all_sft_v4/e0_base/spkl-all-e2_5257da77": 7,
|
| 173 |
-
"speakleash/mistral-apt3-7B/spkl-all_sft_v4/e0_base/spkl-all-e3_5ca4603b": 7,
|
| 174 |
-
"speakleash/mistral-apt3-7B/spkl-only_sft_v3/e0_base/spkl-only-e3_90666ab5": 7,
|
| 175 |
-
"speakleash/mistral-apt3-7B/spkl-only_sft_v3/e0_base/spkl-only-e1_4e524cad": 7,
|
| 176 |
-
"speakleash/mistral-apt3-7B/spkl-only_sft_v3/e0_base/spkl-only-e0_40cdde38": 7,
|
| 177 |
-
"speakleash/mistral-apt3-7B/spkl-all_sft_v3/e0_base/spkl-all-e0_67274d1b": 7,
|
| 178 |
-
"speakleash/mistral-apt3-7B/spkl-all_sft_v3/e0_base/spkl-all-e1_695e8b44": 7,
|
| 179 |
-
"speakleash/mistral-apt3-7B/spkl-all_sft_v3/e0_base/spkl-all-e2_a9e6a2f0": 7,
|
| 180 |
-
"speakleash/mistral-apt3-7B/spkl-all_sft_v3/e0_base/spkl-all-e3_2ff00c2b": 7,
|
| 181 |
-
"speakleash/mistral-apt3-7B/spkl_sft_v2/e1_4067e14e": 7,
|
| 182 |
-
"speakleash/mistral-apt3-7B/spkl_sft_v2/e0_6214300a": 7,
|
| 183 |
-
"speakleash/mistral-apt3-7B/only-spi_sft_v2/e1_596202b3": 7,
|
| 184 |
-
"speakleash/mistral-apt3-7B/only-spi_sft_v2/e0_c4ea165e": 7,
|
| 185 |
-
"speakleash/mistral-apt3-7B/spkl-only_sft_v4/e0_base/spkl-only-e0_c00001c4": 7,
|
| 186 |
-
"speakleash/mistral-apt3-7B/spkl-only_sft_v4/e0_base/spkl-only-e3_2bcd3961": 7,
|
| 187 |
-
"speakleash/mistral-apt3-7B/spkl-only_sft_v4/e0_base/spkl-only-e1_f2730438": 7,
|
| 188 |
-
"speakleash/mistral-apt3-7B/spkl-only_sft_v4/e0_base/spkl-only-e2_f39a22a2": 7,
|
| 189 |
-
"speakleash/mistral-apt3-7B/spkl-all_sft_v3-lr2/e0_base/spkl-all-e0-lr6_376eb1d5": 7,
|
| 190 |
-
"speakleash/mistral-apt3-7B/spkl-all_sft_v3-lr2/e0_base/spkl-all-e0-lr5_54b6226f": 7,
|
| 191 |
-
"speakleash/mistral-apt3-7B/spkl-only_sft_v3/e0_base/spkl-only-e2_f036d0fd": 7,
|
| 192 |
-
"speakleash/mistral-apt3-7B_v2/spkl-only_7e5-e0_e143e6ce": 7,
|
| 193 |
-
"Nexusflow/Starling-LM-7B-beta": 7,
|
| 194 |
-
"RWKV/v5-Eagle-7B-HF": 7,
|
| 195 |
-
"speakleash/mistral-apt3-7B_v2/spkl-only_sft/e0_base_2e5/spkl-only-e2_afcfbe2d": 7,
|
| 196 |
-
"speakleash/mistral-apt3-7B_v2/spkl-only_sft/e0_base_2e5/spkl-only-e3_6908149d": 7,
|
| 197 |
-
"speakleash/mistral-apt3-7B_v2/spkl-only_sft/e1_base_2e5/spkl-only-e2_d5a874b1": 7,
|
| 198 |
-
"speakleash/mistral-apt3-7B_v2/spkl-only_sft/e1_base_2e5/spkl-only-e3_1be744af": 7,
|
| 199 |
-
"speakleash/mistral-apt3-7B_v2/spkl-only_sft/e1_base_2e5_v6/spkl-only-e0_4efab00a": 7,
|
| 200 |
-
"speakleash/mistral-apt3-7B_v2/spkl-only_sft/e1_base_2e5_v6/spkl-only-e1_1b706f85": 7,
|
| 201 |
-
"speakleash/mistral-apt3-7B_v2/spkl-only_sft/e1_base_2e5_v6/spkl-only-e2_f86f7889": 7,
|
| 202 |
-
"speakleash/mistral-apt3-7B_v2/spkl-only_sft/e1_base_2e5_v6/spkl-only-e3_13641875": 7,
|
| 203 |
-
"speakleash/mistral-apt3-7B_v2/spkl-only_sft/e1_base_2e5_v7w/spkl-only-e0_1f5f4968": 7,
|
| 204 |
-
"speakleash/mistral-apt3-7B_v2/spkl-only_sft/e1_base_2e5_v7w/spkl-only-e1_50de9812": 7,
|
| 205 |
-
"speakleash/mistral-apt3-7B_v2/spkl-only_sft/e1_base_2e5_v7w/spkl-only-e2_dd38abb9": 7,
|
| 206 |
-
"speakleash/mistral-apt3-7B_v2/spkl-only_sft/e1_base_2e5_v7w/spkl-only-e3_36236df3": 7,
|
| 207 |
-
"speakleash/mistral-apt3-7B_v2/spkl-only_sft/e1_base_2e5_v8w/spkl-only-e0_e185fb84": 7,
|
| 208 |
-
"speakleash/mistral-apt3-7B_v2/spkl-only_sft/e1_base_2e5_v8w/spkl-only-e1_fb5d327f": 7,
|
| 209 |
-
"speakleash/mistral-apt3-7B_v2/spkl-only_sft/e1_base_2e5_v8w/spkl-only-e2_dd71be08": 7,
|
| 210 |
-
"speakleash/mistral_7B-v2/spkl-all_sft_v2/e1_base/spkl-all_3e6_v8w-e0_d2d8a320": 7,
|
| 211 |
-
"speakleash/mistral_7B-v2/spkl-all_sft_v2/e1_base/spkl-all_3e6_v8w-e1_cd7c61a1": 7,
|
| 212 |
-
"speakleash/mistral_7B-v2/spkl-all_sft_v2/e1_base/spkl-all_v8wa_9e6-e0_32c27aa5": 7,
|
| 213 |
-
"speakleash/mistral_7B-v2/spkl-all_sft_v2/e1_base/spkl-all_v8wa_9e6-e1_518b38ca": 7,
|
| 214 |
-
"speakleash/mistral_7B-v2/spkl-all_sft_v2/e1_base/spkl-all_v8wa_9e6-e2_84fb05a1": 7,
|
| 215 |
-
"speakleash/mistral_7B-v2/spkl-only_sft_v2/e1_base/spkl-only_v10wa_3e6-e0_2ba34bd9": 7,
|
| 216 |
-
"speakleash/mistral_7B-v2/spkl-only_sft_v2/e1_base/spkl-only_v10wa_3e6-e1_35ecfaaa": 7,
|
| 217 |
-
"speakleash/mistral_7B-v2/spkl-only_sft_v2/e1_base/spkl-only_v10wa_3e6-e2_920b5c3f": 7,
|
| 218 |
-
"speakleash/mistral_7B-v2/spkl-only_sft_v2/e1_base/spkl-only_v10wa_7e6-e0_d137146f": 7,
|
| 219 |
-
"speakleash/mistral_7B-v2/spkl-only_sft_v2/e1_base/spkl-only_v10wa_7e6-e1_5bddbd74": 7,
|
| 220 |
-
"speakleash/mistral_7B-v2/spkl-only_sft_v2/e1_base/spkl-only_v10wa_7e6-e2_bbc67e89": 7,
|
| 221 |
-
"speakleash/mistral_7B-v2/spkl-only_sft_v2/e1_base/spkl-only_v10wa_7e6-e2b_53f28c53": 7,
|
| 222 |
-
"speakleash/mistral_7B-v2/spkl-only_sft_v2/e1_base/spkl-only_v10wa_7e6-e3_9931f988": 7,
|
| 223 |
-
"speakleash/mistral_7B-v2/spkl-only_sft_v2/e1_base/spkl-only_v10wa_7e6-e4_0bc82b61": 7,
|
| 224 |
-
"speakleash/mistral_7B-v2/spkl-only_sft_v2/e1_base/spkl-only_v8wa_9e6-e0_8aa4a0ae": 7,
|
| 225 |
-
"speakleash/mistral_7B-v2/spkl-only_sft_v2/e1_base/spkl-only_v8wa_9e6-e1_57357d6c": 7,
|
| 226 |
-
"speakleash/mistral_7B-v2/spkl-only_sft_v2/e1_base/spkl-only_v8wa_9e6-e2_5eb84913": 7,
|
| 227 |
-
"speakleash/mistral_7B-v2/spkl-only_sft_v2/e1_base/spkl-only_v9wa_3e6-e0_ae5e354c": 7,
|
| 228 |
-
"speakleash/mistral_7B-v2/spkl-only_sft_v2/e1_base/spkl-only_v9wa_7e6-e0_724b2d41": 7,
|
| 229 |
-
"speakleash/mistral_7B-v2/spkl-only_sft_v2/e1_base/spkl-only_v9wa_7e6-e1_d962636b": 7,
|
| 230 |
-
"speakleash/Bielik-7B-v0.1": 7,
|
| 231 |
-
"NousResearch/Nous-Hermes-2-SOLAR-10.7B": 10.7,
|
| 232 |
-
"Qwen/Qwen1.5-7B-Chat": 7,
|
| 233 |
-
"THUDM/chatglm3-6b-base": 6,
|
| 234 |
-
"THUDM/chatglm3-6b": 6,
|
| 235 |
-
"TeeZee/Bielik-SOLAR-LIKE-10.7B-Instruct-v0.1": 10.7,
|
| 236 |
-
"google/gemma-1.1-2b-it": 2,
|
| 237 |
-
"meta-llama/Meta-Llama-3-8B-Instruct": 8,
|
| 238 |
-
"meta-llama/Meta-Llama-3-8B-Instruct,max_length=4096": 8,
|
| 239 |
-
"meta-llama/Meta-Llama-3-8B": 8,
|
| 240 |
-
"meta-llama/Meta-Llama-3-8B,max_length=4096": 8,
|
| 241 |
-
"microsoft/WizardLM-2-7B": 7,
|
| 242 |
-
"speakleash/mistral_7B-v2/spkl-only_sft_v2/e1_base/spkl-only_v10wa4_9e6-e0_193ad881": 7,
|
| 243 |
-
"speakleash/mistral_7B-v2/spkl-only_sft_v2/e1_base/spkl-only_v10wa4_9e6-e1_f40e0808": 7,
|
| 244 |
-
"speakleash/Bielik-7B-Instruct-v0.1": 7,
|
| 245 |
-
"speakleash/mistral_7B-v3/spkl-only_sft_v0/e0_base/spkl-only_v11wa_9e6-e0_fe38d62e": 7,
|
| 246 |
-
"speakleash/mistral_7B-v3/spkl-only_sft_v0/e0_base/spkl-only_v11wa_9e6-e1_6f84698e": 7,
|
| 247 |
-
"speakleash/mistral_7B-v3/spkl-only_sft_v0/e0_base/spkl-only_v11wap_9e6-e0_5c6927dd": 7,
|
| 248 |
-
"speakleash/mistral_7B-v3/spkl-only_sft_v0/e0_base/spkl-only_v11wap_9e6-e1_1d6755a9": 7,
|
| 249 |
-
"speakleash/mistral_7B-v3/spkl-only_v0-e0_b93294c8": 7,
|
| 250 |
-
"speakleash/mistral_7B-v3/spkl-only_v2-e0_e5547fd5": 7,
|
| 251 |
-
"speakleash/Bielik-7B-Instruct-v0.1-GPTQ,autogptq=True": 7,
|
| 252 |
-
"speakleash/Bielik-7B-Instruct-v0.1,load_in_4bit=True": 7,
|
| 253 |
-
"speakleash/Test-v02-ep3": 7,
|
| 254 |
-
"speakleash/mistral_7B-v3/spkl-only_v2-e1.34500_a9c75816": 7,
|
| 255 |
-
"CohereForAI/c4ai-command-r-v01,max_length=4096": 35,
|
| 256 |
-
"Qwen/Qwen1.5-14B-Chat": 14,
|
| 257 |
-
"Remek/Llama-3-8B-Omnibus-1-PL-v01-INSTRUCT": 8,
|
| 258 |
-
"Remek/Llama-3-8B-Omnibus-1-PL-v01-INSTRUCT,max_length=4096": 8,
|
| 259 |
-
"internlm/internlm2-20b,max_length=4096": 20,
|
| 260 |
-
"internlm/internlm2-chat-20b,max_length=4096": 20,
|
| 261 |
-
"lex-hue/Delexa-7b": 7,
|
| 262 |
-
"lmsys/vicuna-13b-v1.5": 13,
|
| 263 |
-
"maciek-pioro/Mixtral-8x7B-v0.1-pl,max_length=4096": 46.7,
|
| 264 |
-
"mistralai/Mixtral-8x7B-Instruct-v0.1,max_length=4096": 46.7,
|
| 265 |
-
"mistralai/Mixtral-8x7B-v0.1,max_length=4096": 46.7,
|
| 266 |
-
"speakleash/Test-001-wiki": 7,
|
| 267 |
-
"speakleash/Test-002": 7,
|
| 268 |
-
"teknium/OpenHermes-13B": 13,
|
| 269 |
-
"meta-llama/Meta-Llama-3-70B-Instruct,max_length=4096": 70,
|
| 270 |
-
"meta-llama/Meta-Llama-3-70B,max_length=4096": 70,
|
| 271 |
-
"mistralai/Mixtral-8x22B-Instruct-v0.1,max_length=4096": 141,
|
| 272 |
-
"mistralai/Mixtral-8x22B-v0.1,max_length=4096": 141,
|
| 273 |
-
"Qwen/Qwen1.5-14B-Chat,max_length=4096": 14,
|
| 274 |
-
"Qwen/Qwen1.5-32B-Chat,max_length=4096": 32,
|
| 275 |
-
"Qwen/Qwen1.5-72B-Chat,max_length=4096": 72,
|
| 276 |
-
"Qwen/Qwen1.5-32B,max_length=4096": 32,
|
| 277 |
-
"Qwen/Qwen1.5-72B,max_length=4096": 72,
|
| 278 |
-
"Qwen/Qwen1.5-7B": 7,
|
| 279 |
-
"Qwen/Qwen2-0.5B-Instruct": 0.5,
|
| 280 |
-
"Qwen/Qwen2-0.5B": 0.5,
|
| 281 |
-
"Qwen/Qwen2-1.5B-Instruct": 1.5,
|
| 282 |
-
"Qwen/Qwen2-1.5B": 1.5,
|
| 283 |
-
"Qwen/Qwen2-7B-Instruct": 7,
|
| 284 |
-
"Qwen/Qwen2-7B": 7,
|
| 285 |
-
"model=gpt-3.5-turbo-instruct": 20,
|
| 286 |
-
"model=gpt-4-turbo-2024-04-09": 1000,
|
| 287 |
-
"01-ai/Yi-1.5-6B-Chat": 6,
|
| 288 |
-
"01-ai/Yi-1.5-6B": 6,
|
| 289 |
-
"01-ai/Yi-1.5-9B-Chat": 9,
|
| 290 |
-
"01-ai/Yi-1.5-9B": 9,
|
| 291 |
-
"CohereForAI/aya-23-35B,max_length=4096": 35,
|
| 292 |
-
"CohereForAI/aya-23-8B": 8,
|
| 293 |
-
"NousResearch/Hermes-2-Pro-Llama-3-8B": 8,
|
| 294 |
-
"NousResearch/Hermes-2-Theta-Llama-3-8B": 8,
|
| 295 |
-
"Remek/OpenChat-3.5-0106-PL-Omnibusv2": 7,
|
| 296 |
-
"mistralai/Mistral-7B-Instruct-v0.3": 7,
|
| 297 |
-
"mistralai/Mistral-7B-v0.3": 7,
|
| 298 |
-
"nvidia/Llama3-ChatQA-1.5-8B": 8,
|
| 299 |
-
"openchat/openchat-3.5-0106-gemma": 7,
|
| 300 |
-
"openchat/openchat-3.6-8b-20240522": 8,
|
| 301 |
-
"tiiuae/falcon-11B": 11,
|
| 302 |
-
"mlabonne/NeuralDaredevil-8B-abliterated": 8,
|
| 303 |
-
"01-ai/Yi-1.5-34B-Chat,max_length=4096": 34,
|
| 304 |
-
"Qwen/Qwen2-57B-A14B-Instruct,max_length=4096": 57,
|
| 305 |
-
"Qwen/Qwen2-72B-Instruct,max_length=4096": 72,
|
| 306 |
-
"Qwen/Qwen2-72B,max_length=4096": 72,
|
| 307 |
-
"THUDM/glm-4-9b-chat": 9,
|
| 308 |
-
"THUDM/glm-4-9b": 9,
|
| 309 |
-
"google/recurrentgemma-9b-it": 9,
|
| 310 |
-
"microsoft/Phi-3-medium-4k-instruct,max_length=4096": 14,
|
| 311 |
-
"microsoft/Phi-3-mini-4k-instruct": 3.8,
|
| 312 |
-
"microsoft/Phi-3-small-8k-instruct": 7.4,
|
| 313 |
-
"ssmits/Falcon2-5.5B-Polish": 5.5,
|
| 314 |
-
"alpindale/WizardLM-2-8x22B,max_length=4096": 141,
|
| 315 |
-
"dreamgen/WizardLM-2-7B": 7,
|
| 316 |
-
"mistralai/Mistral-Large-Instruct-2407": 123,
|
| 317 |
-
"meta-llama/Meta-Llama-3.1-70B-Instruct": 70,
|
| 318 |
-
"meta-llama/Meta-Llama-3.1-405B-Instruct-FP8": 405,
|
| 319 |
-
"speakleash/Bielik-11B-v2.0-Instruct": 11,
|
| 320 |
-
"speakleash/Bielik-11B-v2.2-Instruct": 11,
|
| 321 |
-
"speakleash/Bielik-11B-v2.1-Instruct": 11,
|
| 322 |
-
"speakleash/Bielik-11B-v2.3-Instruct": 11,
|
| 323 |
-
"CYFRAGOVPL/PLLuM-12B-nc-chat": 12,
|
| 324 |
-
"CYFRAGOVPL/PLLuM-12B-chat": 12,
|
| 325 |
-
"CYFRAGOVPL/PLLuM-12B-instruct": 12,
|
| 326 |
-
"CYFRAGOVPL/Llama-PLLuM-8B-instruct": 8,
|
| 327 |
-
"CYFRAGOVPL/PLLuM-12B-nc-instruct": 12,
|
| 328 |
-
"CYFRAGOVPL/Llama-PLLuM-8B-chat": 8,
|
| 329 |
-
"CYFRAGOVPL/PLLuM-8x7B-nc-chat": 46.7,
|
| 330 |
-
"CYFRAGOVPL/PLLuM-8x7B-nc-instruct": 46.7,
|
| 331 |
-
"CYFRAGOVPL/PLLuM-8x7B-chat": 46.7,
|
| 332 |
-
"CYFRAGOVPL/PLLuM-8x7B-instruct": 46.7,
|
| 333 |
-
"CYFRAGOVPL/Llama-PLLuM-70B-chat": 70,
|
| 334 |
-
"CYFRAGOVPL/Llama-PLLuM-70B-instruct": 70,
|
| 335 |
-
"Qwen/Qwen2.5-7B-Instruct": 7,
|
| 336 |
-
"Qwen/Qwen2.5-14B-Instruct": 14,
|
| 337 |
-
"Qwen/Qwen2.5-1.5B-Instruct": 1.5,
|
| 338 |
-
"microsoft/phi-4": 14.7,
|
| 339 |
-
"Qwen/Qwen2.5-32B-Instruct": 32,
|
| 340 |
-
"Qwen/Qwen2.5-72B-Instruct": 72,
|
| 341 |
-
"nvidia/Llama-3.1-Nemotron-70B-Instruct-HF": 70,
|
| 342 |
-
"meta-llama/Llama-3.2-1B-Instruct": 1,
|
| 343 |
-
"utter-project/EuroLLM-9B-Instruct": 9,
|
| 344 |
-
"mistralai/Mistral-Small-Instruct-2409": 22.2,
|
| 345 |
-
"mistralai/Mistral-Small-24B-Instruct-2501": 24,
|
| 346 |
-
"meta-llama/Llama-3.3-70B-Instruct": 70,
|
| 347 |
-
"meta-llama/Llama-3.2-3B-Instruct": 3,
|
| 348 |
-
"Qwen/Qwen2.5-3B-Instruct": 3,
|
| 349 |
-
"mistralai/Mistral-Nemo-Instruct-2407": 12,
|
| 350 |
-
"microsoft/Phi-4-mini-instruct": 4,
|
| 351 |
-
"mistralai/Mistral-Large-Instruct-2411": 123,
|
| 352 |
-
"speakleash/Bielik-11B-v2.5-Instruct": 11,
|
| 353 |
-
"speakleash/Bielik-4.5B-v3.0-Instruct": 4.5,
|
| 354 |
-
"speakleash/Bielik-1.5B-v3.0-Instruct": 1.5
|
| 355 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
plot_results.py
CHANGED
|
@@ -2,90 +2,78 @@ import pandas as pd
|
|
| 2 |
import matplotlib.pyplot as plt
|
| 3 |
import numpy as np
|
| 4 |
import json
|
| 5 |
-
import csv
|
| 6 |
|
| 7 |
-
def create_performance_plot(
|
| 8 |
# Define whitelist of interesting models (partial matches)
|
| 9 |
WHITELIST = [
|
| 10 |
-
'Meta
|
|
|
|
|
|
|
| 11 |
]
|
| 12 |
|
| 13 |
-
#
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
with open(csv_path, 'r') as f:
|
| 18 |
-
reader = csv.reader(f)
|
| 19 |
-
header = next(reader) # Get header row
|
| 20 |
-
# Strip whitespace from header names
|
| 21 |
-
header = [h.strip() for h in header]
|
| 22 |
-
for row in reader:
|
| 23 |
-
if len(row) == expected_fields: # Only keep rows with correct number of fields
|
| 24 |
-
# Strip whitespace from values
|
| 25 |
-
valid_rows.append([val.strip() for val in row])
|
| 26 |
-
|
| 27 |
-
# Create DataFrame from valid rows
|
| 28 |
-
df = pd.DataFrame(valid_rows, columns=header)
|
| 29 |
-
|
| 30 |
-
# Read model sizes from metadata
|
| 31 |
-
with open(metadata_path, 'r') as f:
|
| 32 |
-
metadata = json.load(f)
|
| 33 |
-
|
| 34 |
-
# Process the data
|
| 35 |
-
# Keep only successful runs (where Benchmark Score is not FAILED)
|
| 36 |
-
df = df[df['Benchmark Score'] != 'FAILED']
|
| 37 |
-
df = df[df['Benchmark Score'].notna()]
|
| 38 |
-
# Convert score to numeric, handling invalid values
|
| 39 |
-
df['Benchmark Score'] = pd.to_numeric(df['Benchmark Score'], errors='coerce')
|
| 40 |
-
df = df[df['Benchmark Score'].notna()] # Remove rows where conversion failed
|
| 41 |
-
|
| 42 |
-
# Convert Num Questions Parseable to numeric and calculate adjusted score
|
| 43 |
-
df['Num Questions Parseable'] = pd.to_numeric(df['Num Questions Parseable'], errors='coerce')
|
| 44 |
-
df['Benchmark Score'] = df['Benchmark Score'] * (df['Num Questions Parseable'] / 171)
|
| 45 |
-
|
| 46 |
-
# For each model, keep only the latest run
|
| 47 |
-
df['Run ID'] = df['Run ID'].fillna('')
|
| 48 |
-
df['timestamp'] = pd.to_datetime(df['Benchmark Completed'])
|
| 49 |
-
df = df.sort_values('timestamp')
|
| 50 |
-
df = df.drop_duplicates(subset=['Model Path'], keep='last')
|
| 51 |
-
|
| 52 |
-
# Get model sizes
|
| 53 |
-
def get_model_size(model_path):
|
| 54 |
-
# Try exact match first
|
| 55 |
-
if model_path in metadata:
|
| 56 |
-
return metadata[model_path]
|
| 57 |
-
# Try with max_length suffix
|
| 58 |
-
if f"{model_path},max_length=4096" in metadata:
|
| 59 |
-
return metadata[f"{model_path},max_length=4096"]
|
| 60 |
-
return None
|
| 61 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 62 |
# Print models without size before filtering
|
| 63 |
print("\nModels without size assigned:")
|
| 64 |
-
models_without_size = df[df['Model
|
| 65 |
-
for
|
| 66 |
-
print(f"- {
|
| 67 |
-
|
| 68 |
-
df['Model Size'] = df['Model Path'].apply(get_model_size)
|
| 69 |
-
df = df[df['Model Size'].notna()]
|
| 70 |
|
| 71 |
# Remove extreme outliers (scores that are clearly errors)
|
| 72 |
-
|
| 73 |
-
|
| 74 |
-
|
| 75 |
-
|
| 76 |
-
|
| 77 |
-
|
| 78 |
-
|
|
|
|
| 79 |
|
| 80 |
# Find models on Pareto frontier
|
| 81 |
-
sizes = sorted(
|
| 82 |
frontier_points = []
|
| 83 |
max_score = float('-inf')
|
| 84 |
frontier_models = set()
|
| 85 |
|
| 86 |
for size in sizes:
|
| 87 |
# Get scores for models of this size or smaller
|
| 88 |
-
subset =
|
| 89 |
if len(subset) > 0:
|
| 90 |
max_score_idx = subset['Benchmark Score'].idxmax()
|
| 91 |
current_max = subset.loc[max_score_idx, 'Benchmark Score']
|
|
@@ -95,59 +83,73 @@ def create_performance_plot(csv_path='benchmark_results.csv', metadata_path='met
|
|
| 95 |
frontier_models.add(subset.loc[max_score_idx, 'Model Path'])
|
| 96 |
|
| 97 |
# Filter models - keep those on Pareto frontier or matching whitelist
|
| 98 |
-
|
| 99 |
-
for idx, row in
|
| 100 |
if row['Model Path'] in frontier_models:
|
| 101 |
-
|
| 102 |
else:
|
| 103 |
for pattern in WHITELIST:
|
| 104 |
if pattern in row['Model Path']:
|
| 105 |
-
|
| 106 |
break
|
| 107 |
|
| 108 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 109 |
|
| 110 |
# Create the plot
|
| 111 |
fig = plt.figure(figsize=(12, 8))
|
| 112 |
|
| 113 |
-
|
| 114 |
-
|
| 115 |
-
|
| 116 |
-
|
|
|
|
| 117 |
|
| 118 |
-
|
| 119 |
-
|
| 120 |
-
|
| 121 |
-
model_name = row['Model Path'].split('/')[-1]
|
| 122 |
-
if any(pattern in row['Model Path'] for pattern in ['gpt-3', 'gpt-4']):
|
| 123 |
model_name = row['Model Path']
|
| 124 |
|
| 125 |
-
|
| 126 |
-
|
| 127 |
-
|
| 128 |
-
|
| 129 |
-
|
| 130 |
-
|
| 131 |
-
|
| 132 |
-
|
| 133 |
-
|
| 134 |
-
|
| 135 |
-
|
| 136 |
-
|
| 137 |
-
|
| 138 |
-
|
| 139 |
-
|
| 140 |
-
|
| 141 |
|
| 142 |
# Customize the plot
|
| 143 |
plt.grid(True, linestyle='--', alpha=0.7)
|
| 144 |
plt.xlabel('Model Size (billions of parameters)')
|
| 145 |
-
plt.ylabel('Benchmark Score')
|
| 146 |
-
plt.title('Model Performance vs Size
|
| 147 |
|
| 148 |
# Add legend
|
| 149 |
plt.legend()
|
| 150 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 151 |
# Adjust layout to prevent label cutoff
|
| 152 |
plt.tight_layout()
|
| 153 |
|
|
|
|
| 2 |
import matplotlib.pyplot as plt
|
| 3 |
import numpy as np
|
| 4 |
import json
|
|
|
|
| 5 |
|
| 6 |
+
def create_performance_plot(json_path='benchmark_report.json'):
|
| 7 |
# Define whitelist of interesting models (partial matches)
|
| 8 |
WHITELIST = [
|
| 9 |
+
'Meta Llama 4 Maverick',
|
| 10 |
+
'Anthropic Claude 3.7 Sonnet',
|
| 11 |
+
'OpenAI GPT-4o'
|
| 12 |
]
|
| 13 |
|
| 14 |
+
# Load the benchmark results from JSON
|
| 15 |
+
with open(json_path, 'r') as f:
|
| 16 |
+
json_data = json.load(f)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 17 |
|
| 18 |
+
# Create DataFrame from JSON data
|
| 19 |
+
df = pd.DataFrame(json_data)
|
| 20 |
+
|
| 21 |
+
# Rename columns for consistency
|
| 22 |
+
df = df.rename(columns={
|
| 23 |
+
"Model Name": "Model Path",
|
| 24 |
+
"Model Size": "Model Size Raw"
|
| 25 |
+
})
|
| 26 |
+
|
| 27 |
+
# Calculate overall benchmark score as average of Avg (object) and Avg (country)
|
| 28 |
+
df['Benchmark Score'] = (df['Avg (object)'] + df['Avg (country)']) / 2
|
| 29 |
+
|
| 30 |
+
# Process model sizes - convert to numeric, handle "-" and extract numbers
|
| 31 |
+
df['Model Size'] = df['Model Size Raw'].replace("-", np.nan)
|
| 32 |
+
|
| 33 |
+
# Extract numeric values from size strings like "72 MB" -> 72 or plain "72" -> 72
|
| 34 |
+
def extract_size(size_val):
|
| 35 |
+
if pd.isna(size_val):
|
| 36 |
+
return np.nan
|
| 37 |
+
if isinstance(size_val, (int, float)):
|
| 38 |
+
return float(size_val)
|
| 39 |
+
if isinstance(size_val, str):
|
| 40 |
+
# Try to extract number from string (handles both "72" and "72 MB")
|
| 41 |
+
import re
|
| 42 |
+
match = re.search(r'(\d+(?:\.\d+)?)', str(size_val))
|
| 43 |
+
if match:
|
| 44 |
+
return float(match.group(1))
|
| 45 |
+
return np.nan
|
| 46 |
+
|
| 47 |
+
df['Model Size'] = df['Model Size'].apply(extract_size)
|
| 48 |
+
|
| 49 |
+
# Remove models without size information for plotting
|
| 50 |
+
df_with_size = df[df['Model Size'].notna()].copy()
|
| 51 |
+
|
| 52 |
# Print models without size before filtering
|
| 53 |
print("\nModels without size assigned:")
|
| 54 |
+
models_without_size = df[df['Model Size'].isna()]
|
| 55 |
+
for idx, row in models_without_size.iterrows():
|
| 56 |
+
print(f"- {row['Model Path']}")
|
|
|
|
|
|
|
|
|
|
| 57 |
|
| 58 |
# Remove score outliers with the 1.5 x IQR rule: keep only rows whose Benchmark Score lies within [Q1 - 1.5*IQR, Q3 + 1.5*IQR]
|
| 59 |
+
if len(df_with_size) > 0:
|
| 60 |
+
q1 = df_with_size['Benchmark Score'].quantile(0.25)
|
| 61 |
+
q3 = df_with_size['Benchmark Score'].quantile(0.75)
|
| 62 |
+
iqr = q3 - q1
|
| 63 |
+
df_with_size = df_with_size[
|
| 64 |
+
(df_with_size['Benchmark Score'] >= q1 - 1.5 * iqr) &
|
| 65 |
+
(df_with_size['Benchmark Score'] <= q3 + 1.5 * iqr)
|
| 66 |
+
]
|
| 67 |
|
| 68 |
# Find models on Pareto frontier
|
| 69 |
+
sizes = sorted(df_with_size['Model Size'].unique())
|
| 70 |
frontier_points = []
|
| 71 |
max_score = float('-inf')
|
| 72 |
frontier_models = set()
|
| 73 |
|
| 74 |
for size in sizes:
|
| 75 |
# Get scores for models of this size or smaller
|
| 76 |
+
subset = df_with_size[df_with_size['Model Size'] <= size]
|
| 77 |
if len(subset) > 0:
|
| 78 |
max_score_idx = subset['Benchmark Score'].idxmax()
|
| 79 |
current_max = subset.loc[max_score_idx, 'Benchmark Score']
|
|
|
|
| 83 |
frontier_models.add(subset.loc[max_score_idx, 'Model Path'])
|
| 84 |
|
| 85 |
# Filter models - keep those on Pareto frontier or matching whitelist
|
| 86 |
+
df_with_size['Keep'] = False
|
| 87 |
+
for idx, row in df_with_size.iterrows():
|
| 88 |
if row['Model Path'] in frontier_models:
|
| 89 |
+
df_with_size.loc[idx, 'Keep'] = True
|
| 90 |
else:
|
| 91 |
for pattern in WHITELIST:
|
| 92 |
if pattern in row['Model Path']:
|
| 93 |
+
df_with_size.loc[idx, 'Keep'] = True
|
| 94 |
break
|
| 95 |
|
| 96 |
+
# Flag whitelisted models that have no size; they are only marked with Keep here and are not added to the plotted data (they have no x-coordinate)
|
| 97 |
+
df_no_size = df[df['Model Size'].isna()].copy()
|
| 98 |
+
df_no_size['Keep'] = False
|
| 99 |
+
for idx, row in df_no_size.iterrows():
|
| 100 |
+
for pattern in WHITELIST:
|
| 101 |
+
if pattern in row['Model Path']:
|
| 102 |
+
df_no_size.loc[idx, 'Keep'] = True
|
| 103 |
+
break
|
| 104 |
+
|
| 105 |
+
# Select the sized models flagged with Keep for plotting (df_no_size is not merged in)
|
| 106 |
+
plot_df = df_with_size[df_with_size['Keep']].copy()
|
| 107 |
|
| 108 |
# Create the plot
|
| 109 |
fig = plt.figure(figsize=(12, 8))
|
| 110 |
|
| 111 |
+
if len(plot_df) > 0:
|
| 112 |
+
# Create scatter plot
|
| 113 |
+
plt.scatter(plot_df['Model Size'],
|
| 114 |
+
plot_df['Benchmark Score'],
|
| 115 |
+
alpha=0.6, s=60)
|
| 116 |
|
| 117 |
+
# Add labels for points
|
| 118 |
+
for idx, row in plot_df.iterrows():
|
| 119 |
+
# Use the full model name for labeling
|
|
|
|
|
|
|
| 120 |
model_name = row['Model Path']
|
| 121 |
|
| 122 |
+
plt.annotate(model_name,
|
| 123 |
+
(row['Model Size'], row['Benchmark Score']),
|
| 124 |
+
xytext=(5, 5), textcoords='offset points',
|
| 125 |
+
fontsize=8,
|
| 126 |
+
bbox=dict(facecolor='white', alpha=0.7, edgecolor='none', pad=0.5))
|
| 127 |
+
|
| 128 |
+
# Plot the Pareto frontier line
|
| 129 |
+
if frontier_points:
|
| 130 |
+
frontier_x, frontier_y = zip(*frontier_points)
|
| 131 |
+
plt.plot(frontier_x, frontier_y, 'r--', label='Pareto frontier', linewidth=2)
|
| 132 |
+
|
| 133 |
+
# Add vertical line for consumer GPU budget (assuming 24GB can handle ~12B parameters)
|
| 134 |
+
plt.axvline(x=12, color='gray', linestyle=':', label='Consumer-budget GPU limit', ymin=-0.15, clip_on=False)
|
| 135 |
+
plt.text(12, plt.ylim()[0] - (plt.ylim()[1] - plt.ylim()[0]) * 0.1,
|
| 136 |
+
'Consumer-budget\nGPU (24GB) limit\nin half precision',
|
| 137 |
+
horizontalalignment='center', verticalalignment='top')
|
| 138 |
|
| 139 |
# Customize the plot
|
| 140 |
plt.grid(True, linestyle='--', alpha=0.7)
|
| 141 |
plt.xlabel('Model Size (billions of parameters)')
|
| 142 |
+
plt.ylabel('Benchmark Score (Average of Object & Country Recognition)')
|
| 143 |
+
plt.title('Polish Photo Recognition: Model Performance vs Size')
|
| 144 |
|
| 145 |
# Add legend
|
| 146 |
plt.legend()
|
| 147 |
|
| 148 |
+
# Set reasonable axis limits
|
| 149 |
+
if len(plot_df) > 0:
|
| 150 |
+
plt.xlim(left=0)
|
| 151 |
+
plt.ylim(bottom=0)
|
| 152 |
+
|
| 153 |
# Adjust layout to prevent label cutoff
|
| 154 |
plt.tight_layout()
|
| 155 |
|
script.py
DELETED
|
@@ -1,322 +0,0 @@
|
|
| 1 |
-
import pandas as pd
|
| 2 |
-
import json
|
| 3 |
-
import re
|
| 4 |
-
|
| 5 |
-
# Load the CSV file
|
| 6 |
-
leaderboard_df = []
|
| 7 |
-
with open("benchmark_results.csv", "r") as f:
|
| 8 |
-
header = f.readline().strip().split(",")
|
| 9 |
-
header = [h.strip() for h in header]
|
| 10 |
-
for i, line in enumerate(f):
|
| 11 |
-
leaderboard_df.append(line.strip().split(",", 13))
|
| 12 |
-
|
| 13 |
-
# Load metadata
|
| 14 |
-
metadata = json.load(open('metadata.json'))
|
| 15 |
-
for k, v in list(metadata.items()):
|
| 16 |
-
metadata[k.split(",")[0]] = v
|
| 17 |
-
|
| 18 |
-
# Create DataFrame
|
| 19 |
-
leaderboard_df = pd.DataFrame(leaderboard_df, columns=header)
|
| 20 |
-
|
| 21 |
-
# Filter and process DataFrame
|
| 22 |
-
leaderboard_df = leaderboard_df[(leaderboard_df["Benchmark Version"] == "eq-bench_v2_pl") | (
|
| 23 |
-
leaderboard_df["Benchmark Version"] == 'eq-bench_pl')]
|
| 24 |
-
leaderboard_df = leaderboard_df[["Model Path", "Benchmark Score", "Num Questions Parseable", "Error"]]
|
| 25 |
-
|
| 26 |
-
def parse_parseable(x):
|
| 27 |
-
if x["Num Questions Parseable"] == 'FAILED':
|
| 28 |
-
m = re.match(r'(\d+)\.0 questions were parseable', x["Error"])
|
| 29 |
-
return m.group(1)
|
| 30 |
-
return x["Num Questions Parseable"]
|
| 31 |
-
|
| 32 |
-
leaderboard_df["Num Questions Parseable"] = leaderboard_df[["Num Questions Parseable", "Error"]].apply(
|
| 33 |
-
lambda x: parse_parseable(x), axis=1)
|
| 34 |
-
|
| 35 |
-
NUMBER_OF_QUESTIONS = 171.0
|
| 36 |
-
|
| 37 |
-
def fraction_to_percentage(numerator: float, denominator: float) -> float:
|
| 38 |
-
return (numerator / denominator) * 100
|
| 39 |
-
|
| 40 |
-
leaderboard_df["Num Questions Parseable"] = leaderboard_df["Num Questions Parseable"].apply(lambda x: fraction_to_percentage(float(x), NUMBER_OF_QUESTIONS))
|
| 41 |
-
|
| 42 |
-
def get_params(model_name):
|
| 43 |
-
if model_name in metadata:
|
| 44 |
-
return metadata[model_name]
|
| 45 |
-
else:
|
| 46 |
-
print(model_name)
|
| 47 |
-
return None
|
| 48 |
-
|
| 49 |
-
leaderboard_df["Params"] = leaderboard_df["Model Path"].apply(lambda x: get_params(x))
|
| 50 |
-
leaderboard_df["Benchmark Score"] = leaderboard_df["Benchmark Score"].replace('FAILED', None)
|
| 51 |
-
leaderboard_df["Benchmark Score"] = leaderboard_df["Benchmark Score"].astype(float) * ((leaderboard_df["Num Questions Parseable"].astype(float) / 100))
|
| 52 |
-
leaderboard_df.loc[leaderboard_df["Benchmark Score"] < 0, "Benchmark Score"] = 0
|
| 53 |
-
leaderboard_df = leaderboard_df.sort_values(by=["Benchmark Score", "Num Questions Parseable"], ascending=[False, False])
|
| 54 |
-
leaderboard_df = leaderboard_df.rename(columns={"Model Path": "Model", "Num Questions Parseable": "Percentage Questions Parseable"})
|
| 55 |
-
|
| 56 |
-
# Generate HTML with DataTables
|
| 57 |
-
html = """
|
| 58 |
-
<!DOCTYPE html>
|
| 59 |
-
<html lang="en">
|
| 60 |
-
<head>
|
| 61 |
-
<meta charset="UTF-8">
|
| 62 |
-
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
| 63 |
-
<title>Leaderboard</title>
|
| 64 |
-
<link rel="stylesheet" href="https://cdn.datatables.net/1.11.5/css/jquery.dataTables.min.css">
|
| 65 |
-
<script src="https://code.jquery.com/jquery-3.6.0.min.js"></script>
|
| 66 |
-
<script src="https://cdn.datatables.net/1.11.5/js/jquery.dataTables.min.js"></script>
|
| 67 |
-
<style>
|
| 68 |
-
body {
|
| 69 |
-
font: 90%/1.45em "Helvetica Neue", HelveticaNeue, Verdana, Arial, Helvetica, sans-serif;
|
| 70 |
-
margin: 0;
|
| 71 |
-
padding: 20px;
|
| 72 |
-
color: #333;
|
| 73 |
-
background-color: #fff;
|
| 74 |
-
}
|
| 75 |
-
.numeric-cell {
|
| 76 |
-
text-align: right;
|
| 77 |
-
padding: 8px !important;
|
| 78 |
-
}
|
| 79 |
-
</style>
|
| 80 |
-
<script>
|
| 81 |
-
(function($) {
|
| 82 |
-
$.fn.colorize = function(oOptions) {
|
| 83 |
-
var settings = $.extend({
|
| 84 |
-
parse: function(e) {
|
| 85 |
-
return parseFloat(e.html());
|
| 86 |
-
},
|
| 87 |
-
min: undefined,
|
| 88 |
-
max: undefined,
|
| 89 |
-
readable: true,
|
| 90 |
-
themes: {
|
| 91 |
-
"default": {
|
| 92 |
-
color_min: "#C80000",
|
| 93 |
-
color_mid: "#FFFFFF",
|
| 94 |
-
color_max: "#10A54A"
|
| 95 |
-
}
|
| 96 |
-
},
|
| 97 |
-
theme: "default",
|
| 98 |
-
center: undefined,
|
| 99 |
-
percent: false
|
| 100 |
-
}, oOptions);
|
| 101 |
-
|
| 102 |
-
function getColor(color1, color2, ratio) {
|
| 103 |
-
var hex = function(x) {
|
| 104 |
-
x = x.toString(16);
|
| 105 |
-
return (x.length == 1) ? '0' + x : x;
|
| 106 |
-
}
|
| 107 |
-
color1 = (color1.charAt(0) == "#") ? color1.slice(1) : color1
|
| 108 |
-
color2 = (color2.charAt(0) == "#") ? color2.slice(1) : color2
|
| 109 |
-
var r = Math.ceil(parseInt(color1.substring(0,2), 16) * ratio + parseInt(color2.substring(0,2), 16) * (1-ratio));
|
| 110 |
-
var g = Math.ceil(parseInt(color1.substring(2,4), 16) * ratio + parseInt(color2.substring(2,4), 16) * (1-ratio));
|
| 111 |
-
var b = Math.ceil(parseInt(color1.substring(4,6), 16) * ratio + parseInt(color2.substring(4,6), 16) * (1-ratio));
|
| 112 |
-
return "#" + (hex(r) + hex(g) + hex(b)).toUpperCase();
|
| 113 |
-
}
|
| 114 |
-
|
| 115 |
-
function getContrastYIQ(hexcolor) {
|
| 116 |
-
var hex = (hexcolor.charAt(0) == "#") ? hexcolor.slice(1) : hexcolor;
|
| 117 |
-
var r = parseInt(hex.substr(0,2),16);
|
| 118 |
-
var g = parseInt(hex.substr(2,2),16);
|
| 119 |
-
var b = parseInt(hex.substr(4,2),16);
|
| 120 |
-
var yiq = ((r*299)+(g*587)+(b*114))/1000;
|
| 121 |
-
return (yiq >= 128) ? 'black' : 'white';
|
| 122 |
-
}
|
| 123 |
-
|
| 124 |
-
var min = settings.min;
|
| 125 |
-
var max = settings.max;
|
| 126 |
-
if (min === undefined || max === undefined) {
|
| 127 |
-
min = Infinity;
|
| 128 |
-
max = -Infinity;
|
| 129 |
-
this.each(function() {
|
| 130 |
-
var value = parseFloat(settings.parse($(this)));
|
| 131 |
-
if (!isNaN(value) && isFinite(value)) {
|
| 132 |
-
min = Math.min(min, value);
|
| 133 |
-
max = Math.max(max, value);
|
| 134 |
-
}
|
| 135 |
-
});
|
| 136 |
-
}
|
| 137 |
-
|
| 138 |
-
var center = settings.center !== undefined ? settings.center : (max + min) / 2;
|
| 139 |
-
var adj = Math.max(Math.abs(max - center), Math.abs(center - min));
|
| 140 |
-
|
| 141 |
-
this.each(function() {
|
| 142 |
-
var value = parseFloat(settings.parse($(this)));
|
| 143 |
-
if (isNaN(value) || !isFinite(value)) return;
|
| 144 |
-
|
| 145 |
-
var ratio = (value - center) / adj;
|
| 146 |
-
var color1, color2;
|
| 147 |
-
|
| 148 |
-
if (value < center) {
|
| 149 |
-
ratio = Math.abs(ratio);
|
| 150 |
-
if (ratio > 1) ratio = 1;
|
| 151 |
-
color1 = settings.themes[settings.theme].color_min;
|
| 152 |
-
color2 = settings.themes[settings.theme].color_mid;
|
| 153 |
-
} else {
|
| 154 |
-
ratio = Math.abs(ratio);
|
| 155 |
-
if (ratio > 1) ratio = 1;
|
| 156 |
-
color1 = settings.themes[settings.theme].color_max;
|
| 157 |
-
color2 = settings.themes[settings.theme].color_mid;
|
| 158 |
-
}
|
| 159 |
-
var color = getColor(color1, color2, ratio);
|
| 160 |
-
$(this).css('background-color', color);
|
| 161 |
-
if (settings.readable)
|
| 162 |
-
$(this).css('color', getContrastYIQ(color));
|
| 163 |
-
});
|
| 164 |
-
|
| 165 |
-
return this;
|
| 166 |
-
};
|
| 167 |
-
}(jQuery));
|
| 168 |
-
|
| 169 |
-
$(document).ready(function() {
|
| 170 |
-
// Add custom filtering function
|
| 171 |
-
$.fn.dataTable.ext.search.push(function(settings, data, dataIndex) {
|
| 172 |
-
var searchValue = $('.dataTables_filter input').val();
|
| 173 |
-
if (!searchValue) return true;
|
| 174 |
-
|
| 175 |
-
// Split search terms by semicolon and trim whitespace
|
| 176 |
-
var searchTerms = searchValue.split(';').map(term => term.trim().toLowerCase());
|
| 177 |
-
var modelName = data[0].toLowerCase(); // Model name is in first column
|
| 178 |
-
|
| 179 |
-
// Return true if ANY search terms are found in the model name (OR logic)
|
| 180 |
-
return searchTerms.some(term => modelName.includes(term));
|
| 181 |
-
});
|
| 182 |
-
|
| 183 |
-
// Custom sorting function for benchmark scores
|
| 184 |
-
$.fn.dataTable.ext.type.order['score-pre'] = function(data) {
|
| 185 |
-
var score = parseFloat(data);
|
| 186 |
-
return isNaN(score) ? -Infinity : score;
|
| 187 |
-
};
|
| 188 |
-
|
| 189 |
-
// Get min/max values for each numeric column before initializing DataTables
|
| 190 |
-
var columnRanges = {
|
| 191 |
-
1: { min: Infinity, max: -Infinity }, // Params
|
| 192 |
-
2: { min: Infinity, max: -Infinity }, // Benchmark Score
|
| 193 |
-
3: { min: Infinity, max: -Infinity } // Percentage Questions Parseable
|
| 194 |
-
};
|
| 195 |
-
|
| 196 |
-
$('#leaderboard tbody td').each(function() {
|
| 197 |
-
var columnIdx = $(this).index();
|
| 198 |
-
if (columnIdx in columnRanges) {
|
| 199 |
-
var value = parseFloat($(this).text());
|
| 200 |
-
if (!isNaN(value) && isFinite(value)) {
|
| 201 |
-
columnRanges[columnIdx].min = Math.min(columnRanges[columnIdx].min, value);
|
| 202 |
-
columnRanges[columnIdx].max = Math.max(columnRanges[columnIdx].max, value);
|
| 203 |
-
}
|
| 204 |
-
}
|
| 205 |
-
});
|
| 206 |
-
|
| 207 |
-
var table = $('#leaderboard').DataTable({
|
| 208 |
-
"order": [[2, "desc"]], // Sort by Benchmark Score by default
|
| 209 |
-
"pageLength": 20, // Show 20 results per page
|
| 210 |
-
"lengthMenu": [[10, 20, 50, 100, -1], [10, 20, 50, 100, "All"]], // Update length menu options
|
| 211 |
-
"columnDefs": [
|
| 212 |
-
{
|
| 213 |
-
"targets": [1],
|
| 214 |
-
"className": "numeric-cell"
|
| 215 |
-
},
|
| 216 |
-
{
|
| 217 |
-
"type": "score",
|
| 218 |
-
"targets": [2], // Apply custom sorting to Benchmark Score column
|
| 219 |
-
"className": "numeric-cell"
|
| 220 |
-
},
|
| 221 |
-
{
|
| 222 |
-
"targets": [3],
|
| 223 |
-
"className": "numeric-cell"
|
| 224 |
-
}
|
| 225 |
-
],
|
| 226 |
-
"drawCallback": function() {
|
| 227 |
-
// Apply colorization with pre-calculated ranges
|
| 228 |
-
$("#leaderboard tbody td:nth-child(2)").colorize({
|
| 229 |
-
parse: function(e) { return parseFloat($(e).text()); },
|
| 230 |
-
min: columnRanges[1].min,
|
| 231 |
-
max: columnRanges[1].max,
|
| 232 |
-
themes: {
|
| 233 |
-
"default": {
|
| 234 |
-
color_min: "#10A54A", // Green for smaller models
|
| 235 |
-
color_mid: "#FFD700", // Gold/yellow for medium models
|
| 236 |
-
color_max: "#C80000" // Red for larger models
|
| 237 |
-
}
|
| 238 |
-
}
|
| 239 |
-
});
|
| 240 |
-
$("#leaderboard tbody td:nth-child(3)").colorize({
|
| 241 |
-
parse: function(e) { return parseFloat($(e).text()); },
|
| 242 |
-
min: columnRanges[2].min,
|
| 243 |
-
max: columnRanges[2].max,
|
| 244 |
-
themes: {
|
| 245 |
-
"default": {
|
| 246 |
-
color_min: "#C80000", // Red for lower scores
|
| 247 |
-
color_mid: "#FFD700", // Gold/yellow for medium scores
|
| 248 |
-
color_max: "#10A54A" // Green for higher scores
|
| 249 |
-
}
|
| 250 |
-
}
|
| 251 |
-
});
|
| 252 |
-
$("#leaderboard tbody td:nth-child(4)").colorize({
|
| 253 |
-
parse: function(e) { return parseFloat($(e).text()); },
|
| 254 |
-
min: columnRanges[3].min,
|
| 255 |
-
max: columnRanges[3].max,
|
| 256 |
-
themes: {
|
| 257 |
-
"default": {
|
| 258 |
-
color_min: "#C80000", // Red for lower percentages
|
| 259 |
-
color_mid: "#FFD700", // Gold/yellow for medium percentages
|
| 260 |
-
color_max: "#10A54A" // Green for higher percentages
|
| 261 |
-
}
|
| 262 |
-
}
|
| 263 |
-
});
|
| 264 |
-
},
|
| 265 |
-
// Override the default search behavior
|
| 266 |
-
"search": {
|
| 267 |
-
"smart": false
|
| 268 |
-
},
|
| 269 |
-
|
| 270 |
-
// Update search on input change
|
| 271 |
-
"initComplete": function() {
|
| 272 |
-
var table = this.api();
|
| 273 |
-
$('.dataTables_filter input')
|
| 274 |
-
.off() // Remove default binding
|
| 275 |
-
.on('input', function() {
|
| 276 |
-
table.draw();
|
| 277 |
-
});
|
| 278 |
-
}
|
| 279 |
-
});
|
| 280 |
-
});
|
| 281 |
-
</script>
|
| 282 |
-
</head>
|
| 283 |
-
<body>
|
| 284 |
-
<h1>Leaderboard</h1>
|
| 285 |
-
<table id="leaderboard" class="display" style="width:100%">
|
| 286 |
-
<thead>
|
| 287 |
-
<tr>
|
| 288 |
-
<th>Model</th>
|
| 289 |
-
<th>Params</th>
|
| 290 |
-
<th>Benchmark Score</th>
|
| 291 |
-
<th>Percentage Questions Parseable</th>
|
| 292 |
-
<th>Error</th>
|
| 293 |
-
</tr>
|
| 294 |
-
</thead>
|
| 295 |
-
<tbody>
|
| 296 |
-
"""
|
| 297 |
-
|
| 298 |
-
# Add rows to the HTML table
|
| 299 |
-
for _, row in leaderboard_df.iterrows():
|
| 300 |
-
html += f"""
|
| 301 |
-
<tr>
|
| 302 |
-
<td>{row['Model']}</td>
|
| 303 |
-
<td>{row['Params']}</td>
|
| 304 |
-
<td>{row['Benchmark Score']:.2f}</td>
|
| 305 |
-
<td>{row['Percentage Questions Parseable']:.2f}</td>
|
| 306 |
-
<td>{row['Error']}</td>
|
| 307 |
-
</tr>
|
| 308 |
-
"""
|
| 309 |
-
|
| 310 |
-
# Close the HTML tags
|
| 311 |
-
html += """
|
| 312 |
-
</tbody>
|
| 313 |
-
</table>
|
| 314 |
-
</body>
|
| 315 |
-
</html>
|
| 316 |
-
"""
|
| 317 |
-
|
| 318 |
-
# Save the HTML to a file
|
| 319 |
-
with open("leaderboard.html", "w") as file:
|
| 320 |
-
file.write(html)
|
| 321 |
-
|
| 322 |
-
print("HTML leaderboard generated and saved as leaderboard.html")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
src/about.py
CHANGED
|
@@ -2,25 +2,43 @@
|
|
| 2 |
TITLE = """<div style="display: flex; flex-wrap: wrap; justify-content: space-around;">
|
| 3 |
<img src="https://speakleash.org/wp-content/uploads/2023/09/SpeakLeash_logo.svg">
|
| 4 |
<div>
|
| 5 |
-
<h1 align="center" id="space-title">Polish
|
| 6 |
-
<h2 align="center" id="space-subtitle">
|
| 7 |
</div>
|
| 8 |
</div>"""
|
| 9 |
|
| 10 |
# What does your leaderboard evaluate?
|
| 11 |
INTRODUCTION_TEXT = """
|
| 12 |
-
Polish
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 13 |
|
| 14 |
Help us develop Polish Large Language Model Bielik by using [Arena](https://arena.speakleash.org.pl/).
|
| 15 |
|
| 16 |
We gratefully acknowledge Polish high-performance computing infrastructure PLGrid (HPC Centers: ACK Cyfronet AGH) for providing computer facilities and support within computational grant no. PLG/2024/016951.
|
| 17 |
"""
|
| 18 |
|
| 19 |
-
AUTHORS = """
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 24 |
|
| 25 |
-
|
| 26 |
|
|
|
|
| 2 |
TITLE = """<div style="display: flex; flex-wrap: wrap; justify-content: space-around;">
|
| 3 |
<img src="https://speakleash.org/wp-content/uploads/2023/09/SpeakLeash_logo.svg">
|
| 4 |
<div>
|
| 5 |
+
<h1 align="center" id="space-title">Polish Cultural Vision Benchmark (PCVB)</h1>
|
| 6 |
+
<h2 align="center" id="space-subtitle">Evaluating Vision-Language Models on Polish Cultural Heritage</h2>
|
| 7 |
</div>
|
| 8 |
</div>"""
|
| 9 |
|
| 10 |
# What does your leaderboard evaluate?
|
| 11 |
INTRODUCTION_TEXT = """
|
| 12 |
+
A specialized evaluation dataset designed to assess vision-language models' understanding of Polish cultural heritage, history, geography, and traditions. This benchmark addresses the critical gap in multilingual and culturally-specific evaluation of multimodal AI systems.
|
| 13 |
+
|
| 14 |
+
**Benchmark Scope:**
|
| 15 |
+
- **Domain**: Polish Cultural Knowledge
|
| 16 |
+
- **Modality**: Vision + Language
|
| 17 |
+
- **Task Type**: Visual Recognition and Cultural Classification
|
| 18 |
+
- **Dataset Size**: ~220 curated image-text pairs across 11 subcategories
|
| 19 |
+
|
| 20 |
+
**Categories Evaluated:**
|
| 21 |
+
- 🎭 **Art & Entertainment**: Movies, Art, Theatre
|
| 22 |
+
- 🏛️ **Culture & Tradition**: Food, Folk Culture, Traditions
|
| 23 |
+
- 🗺️ **Geography**: Cities, Nature, Architecture
|
| 24 |
+
- 📚 **History**: Historical Figures, Historical Sites
|
| 25 |
|
| 26 |
Help us develop Polish Large Language Model Bielik by using [Arena](https://arena.speakleash.org.pl/).
|
| 27 |
|
| 28 |
We gratefully acknowledge Polish high-performance computing infrastructure PLGrid (HPC Centers: ACK Cyfronet AGH) for providing computer facilities and support within computational grant no. PLG/2024/016951.
|
| 29 |
"""
|
| 30 |
|
| 31 |
+
AUTHORS = """**Benchmark Details:**
|
| 32 |
+
|
| 33 |
+
**Methodology**: Each test item consists of carefully selected and manually verified images that represent authentic Polish cultural elements. Models are prompted to identify specific cultural objects, landmarks, foods, or personalities shown in images, along with their country of origin.
|
| 34 |
+
|
| 35 |
+
**Evaluation Protocol**: Responses are evaluated for both object accuracy and geographical attribution using binary scoring (correct/incorrect) across all categories.
|
| 36 |
+
|
| 37 |
+
**Unique Value Proposition**:
|
| 38 |
+
- Cultural Specificity: Tests deep understanding of Polish heritage beyond generic object recognition
|
| 39 |
+
- Multimodal Integration: Requires both visual processing and cultural knowledge
|
| 40 |
+
- Bias Detection: Reveals potential Western-centric biases in vision-language models
|
| 41 |
+
- Real-world Relevance: Evaluates practically useful cultural knowledge for Polish applications
|
| 42 |
|
| 43 |
+
This benchmark is maintained as a private evaluation suite to ensure result integrity and prevent training data contamination."""
|
| 44 |
|