File size: 2,174 Bytes
21be703
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
"""Compare a backed-up GAIA submission against the ground-truth answers.

Fetches the question list from the scoring space, downloads the GAIA
validation metadata from the Hugging Face Hub (gated dataset — needs a
token), then prints a per-question comparison and the final exact-match
score for the local ``backup_submission.json`` file.
"""
import pyarrow.parquet as pq
import requests
import json
import os
import sys
from dotenv import load_dotenv
from huggingface_hub import hf_hub_download

load_dotenv()

# Fetch questions from scoring space
print("Fetching questions...")
resp = requests.get('https://agents-course-unit4-scoring.hf.space/questions', timeout=30)
# Fail fast with a clear HTTPError instead of a confusing JSONDecodeError
# later when the body is an HTML error page.
resp.raise_for_status()
questions = resp.json()
print(f"Fetched {len(questions)} questions")

# Get ground truth from HF (gated dataset: the token must have been granted access)
print("Fetching ground truth...")
token = os.getenv('HF_TOKEN') or os.getenv('HUGGINGFACEHUB_API_TOKEN')
path = hf_hub_download(repo_id='gaia-benchmark/GAIA', filename='2023/validation/metadata.parquet', repo_type='dataset', token=token)
df = pq.read_table(path).to_pandas()

# Create mapping of task_id -> ground truth answer
answer_map = dict(zip(df['task_id'], df['Final answer']))
print(f"Loaded {len(answer_map)} ground truth answers")

# Build task_id -> question text once, instead of an O(n) scan per answer.
question_map = {x['task_id']: x['question'] for x in questions}

# Load submission
submission_path = 'backup_submission.json'
if not os.path.exists(submission_path):
    print(f"\nError: {submission_path} not found!")
    print("Please run your evaluation first to generate the submission file.")
    # sys.exit is always available; bare exit() comes from the site module
    # and is absent under `python -S`.
    sys.exit(1)

with open(submission_path, 'r') as f:
    submission = json.load(f)

answers = submission['answers']
print(f"Loaded submission with {len(answers)} answers")

# Detailed comparison
print('\n' + '='*70)
print('DETAILED COMPARISON: Ground Truth vs Submitted Answers')
print('='*70 + '\n')

correct = 0
for i, ans in enumerate(answers):
    task_id = ans['task_id']
    submitted = str(ans['submitted_answer']).strip()
    ground_truth = str(answer_map.get(task_id, 'NOT FOUND')).strip()

    # Case-insensitive exact match, mirroring the scoring space's comparison.
    is_correct = submitted.lower() == ground_truth.lower()
    if is_correct:
        correct += 1
        status = '✅'
    else:
        status = '❌'

    q = question_map.get(task_id, 'N/A')

    print(f"{status} [{i+1}] Task: {task_id[:30]}...")
    print(f"   Q: {q[:60]}...")
    print(f"   Submitted: {submitted[:50]}")
    print(f"   Ground:   {ground_truth[:50]}")
    print()

print('='*70)
# Guard the percentage against an empty answer list (avoids ZeroDivisionError).
pct = (correct / len(answers) * 100) if answers else 0.0
print(f'FINAL SCORE: {correct}/{len(answers)} = {pct:.0f}%')
print('='*70)