Spaces:
Build error
Build error
Create app.py
Browse files
app.py
ADDED
|
@@ -0,0 +1,59 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# app.py
|
| 2 |
+
from datasets import load_dataset
|
| 3 |
+
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
|
| 4 |
+
import gradio as gr
|
| 5 |
+
|
| 6 |
+
# 1️⃣ Load the dataset
|
| 7 |
+
ds = load_dataset("cais/hle")
|
| 8 |
+
|
| 9 |
+
# 2️⃣ Load a model (I picked 'gpt2' for simplicity; you can replace with a bigger HF model)
|
| 10 |
+
model_name = "gpt2"
|
| 11 |
+
tokenizer = AutoTokenizer.from_pretrained(model_name)
|
| 12 |
+
model = AutoModelForCausalLM.from_pretrained(model_name)
|
| 13 |
+
|
| 14 |
+
# 3️⃣ Create a pipeline for text generation
|
| 15 |
+
generator = pipeline("text-generation", model=model, tokenizer=tokenizer)
|
| 16 |
+
|
| 17 |
+
# 4️⃣ Function to test all questions
def run_hle_test(dataset=None, gen=None):
    """Run the model over every example in the dataset's 'test' split.

    Args:
        dataset: Mapping with a ``'test'`` split of records that carry
            ``'question'`` and ``'answer'`` keys. Defaults to the
            module-level ``ds``.
        gen: Text-generation callable with the HF pipeline interface,
            i.e. ``gen(prompt, ...) -> [{'generated_text': ...}]``.
            Defaults to the module-level ``generator``.

    Returns:
        A ``(results, final_score)`` pair: a list of per-example dicts
        (question, model_answer, reference, correct) and the accuracy as
        a percentage in [0, 100] (0.0 for an empty split).
    """
    dataset = ds if dataset is None else dataset
    gen = generator if gen is None else gen

    examples = dataset['test']
    total = len(examples)
    if total == 0:
        # Guard: the original divided by len(...) unconditionally, which
        # raises ZeroDivisionError on an empty split.
        return [], 0.0

    results = []
    score = 0
    for example in examples:
        question = example['question']
        reference = example['answer']  # gold answer
        generated = gen(question, max_length=100, do_sample=False)[0]['generated_text']
        # BUG FIX: the text-generation pipeline echoes the prompt at the
        # start of 'generated_text'. The original compared the whole
        # prompt+continuation against the gold answer, so exact match
        # could essentially never succeed. Score only the continuation.
        response = generated[len(question):] if generated.startswith(question) else generated

        # Simple scoring: exact match (can customize to official scoring)
        is_correct = int(response.strip().lower() == reference.strip().lower())
        score += is_correct
        results.append({
            "question": question,
            "model_answer": response,
            "reference": reference,
            "correct": bool(is_correct)
        })

    final_score = (score / total) * 100
    return results, final_score
|
| 40 |
+
|
| 41 |
+
# 5️⃣ Gradio interface
def run_interface():
    """Build and launch the Gradio UI for the HLE benchmark.

    One button triggers the full benchmark run; a preview of the first
    ten results and the final accuracy are written to two textboxes.
    Blocks until the server is stopped (``demo.launch()``).
    """
    with gr.Blocks() as demo:
        gr.Markdown("# HLE Benchmark Tester")
        with gr.Row():
            output_text = gr.Textbox(label="Live Output", lines=20)
            score_box = gr.Textbox(label="Final Score")

        def on_click():
            # Runs the whole benchmark synchronously; with a real model
            # and the full split this click can take a long time.
            results, final_score = run_hle_test()
            # Show results for the first few examples as live output
            output_preview = "\n\n".join(
                f"Q: {r['question']}\nA: {r['model_answer']}\nRef: {r['reference']}\nCorrect: {r['correct']}"
                for r in results[:10]
            )
            # BUG FIX: the original assigned to output_text.value and
            # score_box.value inside the handler, which does NOT update
            # the rendered UI. Gradio event handlers must RETURN the new
            # values, mapped to components via `outputs=` on .click().
            return output_preview, f"{final_score:.2f}%"

        run_btn = gr.Button("Run Full Test")
        run_btn.click(on_click, inputs=None, outputs=[output_text, score_box])

    demo.launch()
|
| 57 |
+
|
| 58 |
+
# Entry point: build the UI and start the Gradio server (blocks forever).
if __name__ == "__main__":
    run_interface()
|