File size: 2,153 Bytes
4a522cf
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
# app.py
# Benchmarks a causal LM against the "cais/hle" (Humanity's Last Exam) dataset
# and exposes the run through a small Gradio UI.
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
import gradio as gr

# 1️⃣ Load the dataset
# NOTE(review): downloads on first run; assumes a 'test' split with
# 'question'/'answer' columns — run_hle_test below relies on both.
ds = load_dataset("cais/hle")

# 2️⃣ Load a model (I picked 'gpt2' for simplicity; you can replace with a bigger HF model)
model_name = "gpt2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

# 3️⃣ Create a pipeline for text generation
# Module-level so both run_hle_test and the Gradio handler share one instance.
generator = pipeline("text-generation", model=model, tokenizer=tokenizer)

# 4️⃣ Function to test all questions
def run_hle_test(dataset=None, gen=None):
    """Run the generator over every example and score by exact match.

    Args:
        dataset: Iterable of dicts with 'question' and 'answer' keys.
            Defaults to the module-level ``ds['test']`` split.
        gen: Text-generation callable with the HF pipeline interface
            (returns ``[{'generated_text': ...}]``). Defaults to the
            module-level ``generator``.

    Returns:
        (results, final_score) where results is a list of per-example
        dicts and final_score is the accuracy as a percentage (0.0 for
        an empty dataset).
    """
    if dataset is None:
        dataset = ds['test']
    if gen is None:
        gen = generator
    dataset = list(dataset)  # allow any iterable, and len() below

    results = []
    score = 0
    total = len(dataset)

    for example in dataset:
        question = example['question']
        reference = example['answer']  # gold answer
        generated = gen(question, max_length=100, do_sample=False)[0]['generated_text']
        # The HF text-generation pipeline echoes the prompt at the start of
        # 'generated_text'; strip it so we compare only the model's new text,
        # otherwise an exact match against the reference is impossible.
        if generated.startswith(question):
            response = generated[len(question):]
        else:
            response = generated

        # Simple scoring: exact match (can customize to official scoring)
        is_correct = int(response.strip().lower() == reference.strip().lower())
        score += is_correct
        results.append({
            "question": question,
            "model_answer": response,
            "reference": reference,
            "correct": bool(is_correct)
        })

    # Guard against an empty split instead of raising ZeroDivisionError.
    final_score = (score / total) * 100 if total else 0.0
    return results, final_score

# 5️⃣ Gradio interface
def run_interface():
    """Build and launch the Gradio UI for running the full benchmark.

    One button triggers a complete pass over the dataset; the first ten
    per-question results and the final accuracy are shown in textboxes.
    Blocks until the Gradio server is stopped.
    """
    with gr.Blocks() as demo:
        gr.Markdown("# HLE Benchmark Tester")
        with gr.Row():
            output_text = gr.Textbox(label="Live Output", lines=20)
            score_box = gr.Textbox(label="Final Score")

        def on_click():
            results, final_score = run_hle_test()
            # Show results for the first few examples as live output
            output_preview = "\n\n".join(
                f"Q: {r['question']}\nA: {r['model_answer']}\nRef: {r['reference']}\nCorrect: {r['correct']}"
                for r in results[:10]
            )
            # Gradio updates components from the handler's return values;
            # assigning to component .value attributes has no effect on the
            # rendered UI, so we return the strings instead.
            return output_preview, f"{final_score:.2f}%"

        run_btn = gr.Button("Run Full Test")
        # outputs=[...] routes the returned tuple into the two textboxes.
        run_btn.click(on_click, outputs=[output_text, score_box])
    demo.launch()

if __name__ == "__main__":
    run_interface()