# app.py
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
import gradio as gr

# 1️⃣ Load the dataset
# (HLE is a gated dataset on the Hub, so you may need to log in and accept its terms first)
ds = load_dataset("cais/hle")

# 2️⃣ Load a model (I picked 'gpt2' for simplicity; you can replace it with a bigger HF model)
model_name = "gpt2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

# 3️⃣ Create a pipeline for text generation
generator = pipeline("text-generation", model=model, tokenizer=tokenizer)

# 4️⃣ Function to test all questions
def run_hle_test():
    results = []
    score = 0
    total = len(ds["test"])

    for example in ds["test"]:
        question = example["question"]
        reference = example["answer"]  # gold answer

        # max_new_tokens bounds only the continuation; max_length also counts
        # the prompt and can fail on long questions.
        generated = generator(question, max_new_tokens=100, do_sample=False)[0]["generated_text"]
        # The pipeline returns prompt + continuation, so strip the prompt
        # before comparing against the reference.
        response = generated[len(question):].strip()

        # Simple scoring: exact match (can customize to official scoring)
        is_correct = int(response.lower() == reference.strip().lower())
        score += is_correct

        results.append({
            "question": question,
            "model_answer": response,
            "reference": reference,
            "correct": bool(is_correct),
        })

    final_score = (score / total) * 100
    return results, final_score

# 5️⃣ Gradio interface
def run_interface():
    with gr.Blocks() as demo:
        gr.Markdown("# HLE Benchmark Tester")
        with gr.Row():
            output_text = gr.Textbox(label="Live Output", lines=20)
            score_box = gr.Textbox(label="Final Score")

        def on_click():
            results, final_score = run_hle_test()
            # Show results for the first few examples as live output
            output_preview = "\n\n".join(
                f"Q: {r['question']}\nA: {r['model_answer']}\nRef: {r['reference']}\nCorrect: {r['correct']}"
                for r in results[:10]
            )
            # Gradio handlers must return new values and declare outputs;
            # assigning to component.value does not update the UI.
            return output_preview, f"{final_score:.2f}%"

        run_btn = gr.Button("Run Full Test")
        run_btn.click(on_click, outputs=[output_text, score_box])

    demo.launch()

if __name__ == "__main__":
    run_interface()
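
# --- Optional: looser scoring sketch (not the official HLE metric) ---
# Exact string match will score near zero for a free-form generator like gpt2.
# Below is a minimal, hypothetical normalization helper you could swap into
# run_hle_test() in place of the raw comparison; the name normalize_answer and
# the normalization rules it applies are assumptions, not an official scorer.
import re
import string

def normalize_answer(text: str) -> str:
    """Lowercase, strip punctuation, drop articles, collapse whitespace."""
    text = text.lower()
    text = text.translate(str.maketrans("", "", string.punctuation))
    text = re.sub(r"\b(a|an|the)\b", " ", text)
    return " ".join(text.split())

# Usage inside the loop in run_hle_test():
#   is_correct = int(normalize_answer(response) == normalize_answer(reference))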
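
# --- Optional: live-progress sketch (relies on Gradio's generator-handler support) ---
# Gradio event handlers can be Python generators: each yield pushes fresh values
# to the bound outputs, so a long benchmark run updates the UI as it goes
# instead of blocking until the end. on_click_streaming is a hypothetical
# variant of on_click above, not part of the original script.
def on_click_streaming():
    score = 0
    test_split = ds["test"]
    for i, example in enumerate(test_split, start=1):
        question = example["question"]
        reference = example["answer"]
        generated = generator(question, max_new_tokens=100, do_sample=False)[0]["generated_text"]
        response = generated[len(question):].strip()
        score += int(response.lower() == reference.strip().lower())
        preview = f"Q: {question}\nA: {response}\nRef: {reference}"
        # Each yield streams the latest example and a running score to the UI.
        yield preview, f"{(score / i) * 100:.2f}% ({i}/{len(test_split)})"

# Wire it up the same way as on_click:
#   run_btn.click(on_click_streaming, outputs=[output_text, score_box])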