# app.py
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
import gradio as gr
# 1️⃣ Load the dataset
ds = load_dataset("cais/hle")
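# Optional sanity check (my addition): printing the DatasetDict lists the
# splits, row counts, and column names, confirming the 'test' split and
# the 'question'/'answer' fields assumed below.
print(ds)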
# 2️⃣ Load a model (I picked 'gpt2' for simplicity; you can replace with a bigger HF model)
model_name = "gpt2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)
# 3️⃣ Create a pipeline for text generation
generator = pipeline("text-generation", model=model, tokenizer=tokenizer)
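# Quick smoke test (my addition, not part of the benchmark): one short
# deterministic generation confirms the pipeline is wired up correctly.
print(generator("Hello, world.", max_new_tokens=5, do_sample=False)[0]['generated_text'])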
# 4️⃣ Function to test all questions
def run_hle_test():
    results = []
    score = 0
    total = len(ds['test'])
    for example in ds['test']:
        question = example['question']
        reference = example['answer']  # gold answer
        # max_new_tokens caps the answer length without counting the
        # question against the budget; return_full_text=False strips the
        # echoed prompt so only the model's continuation is scored.
        response = generator(
            question,
            max_new_tokens=100,
            do_sample=False,
            return_full_text=False,
            pad_token_id=tokenizer.eos_token_id,
        )[0]['generated_text']
        # Simple scoring: exact match (can customize to official scoring)
        is_correct = int(response.strip().lower() == reference.strip().lower())
        score += is_correct
        results.append({
            "question": question,
            "model_answer": response,
            "reference": reference,
            "correct": bool(is_correct)
        })
    final_score = (score / total) * 100
    return results, final_score
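# The exact-match scorer above is deliberately strict and will mark most
# free-form generations wrong. A looser sketch (my own normalization, not
# HLE's official scoring) could strip punctuation and test containment;
# swap it into the is_correct line above if desired.
import re

def loose_match(prediction: str, reference: str) -> bool:
    # Hypothetical helper: lowercase, drop non-alphanumerics, then test
    # whether the normalized reference appears in the normalized output.
    def norm(s: str) -> str:
        return re.sub(r"[^a-z0-9 ]", "", s.lower()).strip()
    nr, np = norm(reference), norm(prediction)
    return bool(nr) and nr in np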
# 5️⃣ Gradio interface
def run_interface():
    with gr.Blocks() as demo:
        gr.Markdown("# HLE Benchmark Tester")
        with gr.Row():
            output_text = gr.Textbox(label="Live Output", lines=20)
            score_box = gr.Textbox(label="Final Score")

        def on_click():
            results, final_score = run_hle_test()
            # Show results for the first few examples as live output
            output_preview = "\n\n".join(
                f"Q: {r['question']}\nA: {r['model_answer']}\n"
                f"Ref: {r['reference']}\nCorrect: {r['correct']}"
                for r in results[:10]
            )
            # Gradio handlers must return the new values; assigning to a
            # component's .value after launch does not update the UI.
            return output_preview, f"{final_score:.2f}%"

        run_btn = gr.Button("Run Full Test")
        run_btn.click(on_click, outputs=[output_text, score_box])
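    # A full pass over the test split takes a while; enabling Gradio's
    # request queue (demo.queue() is a standard Blocks method) keeps the
    # long-running click handler from timing out. This line is my addition.
    demo.queue()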
    demo.launch()
if __name__ == "__main__":
    run_interface()
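# Usage note: `python app.py` serves the app locally; a Hugging Face
# Space with the Gradio SDK runs app.py the same way at startup.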