Spaces:
Build error
Build error
| # app.py | |
| from datasets import load_dataset | |
| from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline | |
| import gradio as gr | |
# Benchmark data: the "cais/hle" dataset from the Hugging Face Hub.
ds = load_dataset("cais/hle")

# Checkpoint under test. 'gpt2' is a small placeholder; any causal-LM
# checkpoint name can be substituted here.
model_name = "gpt2"
model = AutoModelForCausalLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Text-generation pipeline built from the model/tokenizer pair above.
generator = pipeline("text-generation", tokenizer=tokenizer, model=model)
| # 4️⃣ Function to test all questions | |
def run_hle_test():
    """Run the model over every example of the test split and score it.

    Each question is passed to the module-level ``generator`` pipeline with
    sampling disabled, and the generated completion is compared with the
    gold answer using a case-insensitive, whitespace-trimmed exact match.

    Returns:
        A ``(results, final_score)`` tuple. ``results`` is a list of dicts
        with the keys ``question``, ``model_answer``, ``reference`` and
        ``correct``. ``final_score`` is the percentage of exact matches,
        or ``0.0`` when the split contains no examples.
    """
    results = []
    score = 0
    for example in ds['test']:
        question = example['question']
        reference = example['answer']  # gold answer

        # return_full_text=False: by default the pipeline echoes the prompt
        # in front of the completion, which would make an exact match with
        # the reference impossible.
        # max_new_tokens (instead of max_length): max_length also counts the
        # prompt tokens, so long questions would leave no room for an answer.
        response = generator(
            question,
            max_new_tokens=100,
            do_sample=False,
            return_full_text=False,
        )[0]['generated_text']

        # Simple scoring: exact match (can be customized to official scoring).
        is_correct = response.strip().lower() == reference.strip().lower()
        score += int(is_correct)
        results.append({
            "question": question,
            "model_answer": response,
            "reference": reference,
            "correct": is_correct,
        })

    total = len(results)
    # Guard against an empty split instead of raising ZeroDivisionError.
    final_score = (score / total) * 100 if total else 0.0
    return results, final_score
| # 5️⃣ Gradio interface | |
def run_interface():
    """Build the Gradio UI for the benchmark and launch it.

    The page has a "Run Full Test" button, a multi-line box that previews
    the first ten question/answer pairs, and a box for the final score.
    """

    def on_click():
        """Run the benchmark and return ``(preview_text, score_text)``.

        Gradio updates components from the handler's return values, which
        are mapped onto the ``outputs`` list given to ``click``; assigning
        to ``component.value`` inside a handler does not refresh the page.
        """
        results, final_score = run_hle_test()
        # Show only the first few examples to keep the preview readable.
        output_preview = "\n\n".join(
            f"Q: {r['question']}\nA: {r['model_answer']}\n"
            f"Ref: {r['reference']}\nCorrect: {r['correct']}"
            for r in results[:10]
        )
        return output_preview, f"{final_score:.2f}%"

    with gr.Blocks() as demo:
        gr.Markdown("# HLE Benchmark Tester")
        with gr.Row():
            output_text = gr.Textbox(label="Live Output", lines=20)
            score_box = gr.Textbox(label="Final Score")
        run_btn = gr.Button("Run Full Test")
        # The handler takes no inputs; its two return values fill the boxes.
        run_btn.click(on_click, inputs=None, outputs=[output_text, score_box])

    demo.launch()
# Script entry point: start the UI only when this file is run directly,
# not when it is imported as a module.
if __name__ == "__main__":
    run_interface()