Shinglouwei committed
Commit 4a522cf · verified · 1 parent: d772dc5

Create app.py

Files changed (1):
app.py +73 -0
app.py ADDED
@@ -0,0 +1,73 @@
+ # app.py
+ from datasets import load_dataset
+ from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
+ import gradio as gr
+
+ # 1️⃣ Load the dataset
+ ds = load_dataset("cais/hle")
+
+ # 2️⃣ Load a model ('gpt2' for simplicity; you can replace it with a bigger HF model)
+ model_name = "gpt2"
+ tokenizer = AutoTokenizer.from_pretrained(model_name)
+ model = AutoModelForCausalLM.from_pretrained(model_name)
+
+ # 3️⃣ Create a pipeline for text generation
+ generator = pipeline("text-generation", model=model, tokenizer=tokenizer)
+
+ # 4️⃣ Run the model over every question in the test split
+ def run_hle_test():
+     results = []
+     score = 0
+     total = len(ds["test"])
+
+     for example in ds["test"]:
+         question = example["question"]
+         reference = example["answer"]  # gold answer
+         # max_new_tokens (not max_length) caps only the generated tokens, and
+         # return_full_text=False keeps the prompt out of the returned text.
+         response = generator(
+             question,
+             max_new_tokens=100,
+             do_sample=False,
+             return_full_text=False,
+         )[0]["generated_text"]
+
+         # Simple scoring: exact match (can be swapped for the official HLE scoring)
+         is_correct = int(response.strip().lower() == reference.strip().lower())
+         score += is_correct
+         results.append({
+             "question": question,
+             "model_answer": response,
+             "reference": reference,
+             "correct": bool(is_correct),
+         })
+
+     final_score = (score / total) * 100
+     return results, final_score
+
+ # 5️⃣ Gradio interface
+ def run_interface():
+     with gr.Blocks() as demo:
+         gr.Markdown("# HLE Benchmark Tester")
+         with gr.Row():
+             output_text = gr.Textbox(label="Live Output", lines=20)
+             score_box = gr.Textbox(label="Final Score")
+
+         def on_click():
+             results, final_score = run_hle_test()
+             # Show results for the first few examples as live output
+             output_preview = "\n\n".join(
+                 f"Q: {r['question']}\nA: {r['model_answer']}\n"
+                 f"Ref: {r['reference']}\nCorrect: {r['correct']}"
+                 for r in results[:10]
+             )
+             # Gradio updates components through return values, not by setting .value
+             return output_preview, f"{final_score:.2f}%"
+
+         run_btn = gr.Button("Run Full Test")
+         run_btn.click(on_click, outputs=[output_text, score_box])
+
+     demo.launch()
+
+ if __name__ == "__main__":
+     run_interface()
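
Two follow-up notes, neither of which is part of the commit itself.

First, the exact-match check above is deliberately naive and will under-count answers that differ from the reference only in case, punctuation, or spacing; the comment in the diff already notes it can be swapped for the official HLE scoring. A minimal normalization sketch (the helper names and rules here are illustrative, not from this repo):

    # scoring_sketch.py -- illustrative only, not part of the commit
    import re
    import string

    def normalize(text: str) -> str:
        # Lowercase, drop ASCII punctuation, collapse whitespace.
        text = text.lower().translate(str.maketrans("", "", string.punctuation))
        return re.sub(r"\s+", " ", text).strip()

    def exact_match(prediction: str, reference: str) -> bool:
        return normalize(prediction) == normalize(reference)

Second, if app.py is meant to run as a Hugging Face Space, the Space also needs a requirements file covering the script's imports; a plausible unpinned sketch (assumed, not included in this commit):

    # requirements.txt
    datasets
    transformers
    torch
    gradio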