Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -4,6 +4,9 @@ from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
|
|
| 4 |
from datasets import load_dataset
|
| 5 |
import torch
|
| 6 |
import re
|
|
|
|
|
|
|
|
|
|
| 7 |
|
| 8 |
# Cache to avoid reloading the model
|
| 9 |
model_cache = {}
|
|
@@ -20,8 +23,7 @@ def load_model(model_id):
|
|
| 20 |
return generator
|
| 21 |
|
| 22 |
def format_prompt(item):
|
| 23 |
-
system_instruction = "
|
| 24 |
-
Only answer with a single letter: A, B, C, or D."
|
| 25 |
prompt = f"{item['question']}
|
| 26 |
A. {item['choices'][0]}
|
| 27 |
B. {item['choices'][1]}
|
|
@@ -55,10 +57,23 @@ def evaluate(model_id, sample_count, config_name):
|
|
| 55 |
|
| 56 |
def run(model_id, sample_count, config_name):
|
| 57 |
score, details = evaluate(model_id, sample_count, config_name)
|
| 58 |
-
formatted = "
|
| 59 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 60 |
for q, o, a, g, c in details
|
| 61 |
])
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 62 |
return score, formatted
|
| 63 |
|
| 64 |
def save_text(text):
|
|
@@ -105,4 +120,23 @@ with gr.Blocks(css="body {font-family: Inter, sans-serif; padding: 1em; max-widt
|
|
| 105 |
run_button.click(run, inputs=[model_id, sample_count, config_name], outputs=[acc_output, detail_output])
|
| 106 |
download_button.click(save_text, inputs=detail_output, outputs=gr.File())
|
| 107 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 108 |
demo.launch()
|
|
|
|
| 4 |
from datasets import load_dataset
|
| 5 |
import torch
|
| 6 |
import re
|
| 7 |
+
import json
|
| 8 |
+
import pandas as pd
|
| 9 |
+
import matplotlib.pyplot as plt
|
| 10 |
|
| 11 |
# Cache to avoid reloading the model
|
| 12 |
model_cache = {}
|
|
|
|
| 23 |
return generator
|
| 24 |
|
| 25 |
def format_prompt(item):
|
| 26 |
+
system_instruction = " Only answer with a single letter: A, B, C, or D."
|
|
|
|
| 27 |
prompt = f"{item['question']}
|
| 28 |
A. {item['choices'][0]}
|
| 29 |
B. {item['choices'][1]}
|
|
|
|
| 57 |
|
| 58 |
def run(model_id, sample_count, config_name):
    """Evaluate a model and return its score plus a markdown report.

    Args:
        model_id: Hub model identifier passed through to ``evaluate``.
        sample_count: Number of samples to evaluate.
        config_name: Dataset subject/config name.

    Returns:
        tuple[str, str]: the score string produced by ``evaluate`` and a
        markdown-formatted per-question breakdown.
    """
    score, details = evaluate(model_id, sample_count, config_name)

    # One markdown section per question; details yields
    # (question, model_output, expected, predicted, correct) tuples.
    formatted = "\n\n".join([
        f"### Question:\n{q}\n\n"
        f"**Model Answer:** {o}\n"
        f"**Expected:** {a}\n"
        f"**Predicted:** {g}\n"
        f"**Correct:** {c}"
        for q, o, a, g, c in details
    ])

    # Best-effort leaderboard logging. The previous code did
    # float(score.split()[1][:-1]) with no guard, so any change to the
    # score string format crashed the whole callback; a failed log entry
    # must never break the evaluation display.
    try:
        match = re.search(r"(\d+(?:\.\d+)?)\s*%", score)
        if match:
            accuracy_value = float(match.group(1))
        else:
            # Fallback to the original positional parse (assumes a format
            # like "Accuracy: 85.0%" — TODO confirm against evaluate()).
            accuracy_value = float(score.split()[1].rstrip("%"))
        record = {"model_id": model_id, "subject": config_name, "accuracy": accuracy_value}
        with open("eval.jsonl", "a") as f:
            f.write(json.dumps(record) + "\n")
    except (ValueError, IndexError, OSError):
        # Leaderboard entry skipped; the evaluation result is still shown.
        pass

    return score, formatted
|
| 78 |
|
| 79 |
def save_text(text):
|
|
|
|
| 120 |
run_button.click(run, inputs=[model_id, sample_count, config_name], outputs=[acc_output, detail_output])
|
| 121 |
download_button.click(save_text, inputs=detail_output, outputs=gr.File())
|
| 122 |
|
| 123 |
+
with gr.Row():
    leaderboard_plot = gr.Plot(label="Leaderboard Chart")
    leaderboard_table = gr.Dataframe(headers=["Model ID", "Subject", "Accuracy"], interactive=False)

def load_leaderboard():
    """Build the top-10 leaderboard chart and table from ``eval.jsonl``.

    Returns:
        tuple: a matplotlib Figure (horizontal bar chart of the top-10
        accuracies) and the corresponding sorted DataFrame. On any failure
        an empty figure and an empty DataFrame are returned instead.
    """
    try:
        # One JSON record per line, appended by run(); the file may not
        # exist until the first evaluation has been logged.
        board = pd.read_json("eval.jsonl", lines=True)
        top_models = board.sort_values(by="accuracy", ascending=False).head(10)

        fig, ax = plt.subplots()
        ax.barh(top_models["model_id"], top_models["accuracy"])
        ax.set_xlabel("Accuracy")
        ax.set_ylabel("Model")
        ax.set_title("Top 10 Models")
        return fig, top_models
    except Exception:
        # Deliberately broad: a missing/empty eval.jsonl or malformed rows
        # should render an empty leaderboard, not break page load.
        # (Previously bound the exception as `e` without using it.)
        return plt.figure(), pd.DataFrame(columns=["model_id", "subject", "accuracy"])

demo.load(load_leaderboard, inputs=[], outputs=[leaderboard_plot, leaderboard_table])

demo.launch()
|