"""Compare a saved GAIA submission against the official ground-truth answers."""
import json
import os

import pyarrow.parquet as pq
import requests
from dotenv import load_dotenv

# Pull HF_TOKEN / HUGGINGFACEHUB_API_TOKEN from a local .env file, if present.
load_dotenv()

print("Fetching questions...")
# timeout= keeps the script from hanging forever if the scoring space is down;
# raise_for_status() surfaces HTTP errors here instead of as a confusing
# JSON-decode failure on the next line.
resp = requests.get(
    'https://agents-course-unit4-scoring.hf.space/questions',
    timeout=30,
)
resp.raise_for_status()
questions = resp.json()
print(f"Fetched {len(questions)} questions")
|
|
| |
# Download the GAIA validation metadata (which holds the ground-truth
# answers) from the Hugging Face Hub and load it into a DataFrame.
print("Fetching ground truth...")
from huggingface_hub import hf_hub_download

# Accept either of the two commonly used env-var names for the Hub token.
hub_token = os.getenv('HF_TOKEN') or os.getenv('HUGGINGFACEHUB_API_TOKEN')
path = hf_hub_download(
    repo_id='gaia-benchmark/GAIA',
    filename='2023/validation/metadata.parquet',
    repo_type='dataset',
    token=hub_token,
)
df = pq.read_table(path).to_pandas()
|
|
| |
# Index the ground truth by task_id for O(1) lookup during the comparison.
answer_map = {tid: final for tid, final in zip(df['task_id'], df['Final answer'])}
print(f"Loaded {len(answer_map)} ground truth answers")
|
|
| |
submission_path = 'backup_submission.json'
# EAFP: try to open the file and handle its absence, rather than a racy
# os.path.exists() pre-check. SystemExit is the script-safe replacement for
# the site-module `exit()` convenience.
try:
    with open(submission_path, 'r', encoding='utf-8') as f:
        submission = json.load(f)
except FileNotFoundError:
    print(f"\nError: {submission_path} not found!")
    print("Please run your evaluation first to generate the submission file.")
    raise SystemExit(1)

print(f"Loaded submission with {len(submission['answers'])} answers")
|
|
| |
print('\n' + '='*70)
print('DETAILED COMPARISON: Ground Truth vs Submitted Answers')
print('='*70 + '\n')

# Hoist the question lookup out of the loop: the original `next(...)` scan was
# O(len(questions)) per answer. Assumes task_id is unique within `questions`
# (a duplicate would previously resolve to the first match) — TODO confirm.
question_by_id = {item['task_id']: item['question'] for item in questions}

correct = 0
for i, ans in enumerate(submission['answers']):
    task_id = ans['task_id']
    submitted = str(ans['submitted_answer']).strip()
    ground_truth = str(answer_map.get(task_id, 'NOT FOUND')).strip()

    # Case-insensitive exact match between submitted and official answer.
    is_correct = submitted.lower() == ground_truth.lower()
    if is_correct:
        correct += 1
        status = '✅'
    else:
        status = '❌'

    q = question_by_id.get(task_id, 'N/A')

    print(f"{status} [{i+1}] Task: {task_id[:30]}...")
    print(f" Q: {q[:60]}...")
    print(f" Submitted: {submitted[:50]}")
    print(f" Ground: {ground_truth[:50]}")
    print()
|
|
total = len(submission['answers'])
print('='*70)
# Guard the percentage so an empty answers list prints 0% instead of raising
# ZeroDivisionError.
pct = (correct / total * 100) if total else 0.0
print(f'FINAL SCORE: {correct}/{total} = {pct:.0f}%')
print('='*70)
|
|