Spaces:
Sleeping
Sleeping
claude fix
Browse files- .codex/config.toml +8 -0
- __pycache__/agent.cpython-312.pyc +0 -0
- agent.py +25 -10
- app.py +7 -4
- compare_answers.py +66 -0
- run_local.py +124 -0
.codex/config.toml
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[shell_environment_policy]
|
| 2 |
+
inherit = "core"
|
| 3 |
+
|
| 4 |
+
[shell_environment_policy.set]
|
| 5 |
+
ANTHROPIC_API_KEY = ""
|
| 6 |
+
ANTHROPIC_AUTH_TOKEN = ""  # SECURITY: a live OpenRouter key was hard-coded here — revoke it and inject via environment instead of committing it
|
| 7 |
+
ANTHROPIC_BASE_URL = "https://openrouter.ai/api"
|
| 8 |
+
ANTHROPIC_MODEL = "qwen/qwen3-coder:free"
|
__pycache__/agent.cpython-312.pyc
CHANGED
|
Binary files a/__pycache__/agent.cpython-312.pyc and b/__pycache__/agent.cpython-312.pyc differ
|
|
|
agent.py
CHANGED
|
@@ -61,11 +61,11 @@ def smart_invoke(msgs, use_tools=False, start_tier=0):
|
|
| 61 |
gemini_alternatives = ["gemini-2.5-flash", "gemini-2.0-flash", "gemini-flash-latest", "gemini-pro-latest"]
|
| 62 |
|
| 63 |
tiers_config = [
|
| 64 |
-
{"name": "OpenRouter-FreeRouter", "key": "OPENROUTER_API_KEY", "provider": "openai", "model_name": "openrouter/free", "base_url": "https://openrouter.ai/api/v1"},
|
| 65 |
-
{"name": "DeepSeek-R1", "key": "OPENROUTER_API_KEY", "provider": "openai", "model_name": "deepseek/deepseek-r1:free", "base_url": "https://openrouter.ai/api/v1"},
|
| 66 |
{"name": "Qwen3-Next-80B", "key": "OPENROUTER_API_KEY", "provider": "openai", "model_name": "qwen/qwen3-next-80b-a3b-instruct:free", "base_url": "https://openrouter.ai/api/v1"},
|
| 67 |
-
{"name": "NVIDIA-Nemotron-Super", "key": "OPENROUTER_API_KEY", "provider": "openai", "model_name": "nvidia/nemotron-3-super-120b-a12b:free", "base_url": "https://openrouter.ai/api/v1"},
|
| 68 |
{"name": "Gemma-3-27B", "key": "OPENROUTER_API_KEY", "provider": "openai", "model_name": "google/gemma-3-27b-it:free", "base_url": "https://openrouter.ai/api/v1"},
|
|
|
|
|
|
|
|
|
|
| 69 |
{"name": "Gemini-Flash", "key": "GOOGLE_API_KEY", "provider": "google", "model_name": "gemini-2.0-flash", "alternatives": gemini_alternatives},
|
| 70 |
{"name": "Groq", "key": "GROQ_API_KEY", "provider": "groq", "model_name": "llama-3.3-70b-versatile"},
|
| 71 |
]
|
|
@@ -523,15 +523,30 @@ CRITICAL RULES:
|
|
| 523 |
if isinstance(msg, HumanMessage) and "[Attached File Local Path:" in msg.content:
|
| 524 |
messages.append(HumanMessage(content="IMPORTANT: I see an image path in the message. I MUST call the analyze_image tool IMMEDIATELY in my next step to see it."))
|
| 525 |
|
| 526 |
-
#
|
|
|
|
| 527 |
draft_response = None
|
| 528 |
current_tier = 0
|
| 529 |
-
|
| 530 |
-
|
| 531 |
-
|
| 532 |
-
|
| 533 |
-
|
| 534 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 535 |
|
| 536 |
# Execute requested tools and append their text output into the conversation
|
| 537 |
for tool_call in tool_calls:
|
|
|
|
| 61 |
gemini_alternatives = ["gemini-2.5-flash", "gemini-2.0-flash", "gemini-flash-latest", "gemini-pro-latest"]
|
| 62 |
|
| 63 |
tiers_config = [
|
|
|
|
|
|
|
| 64 |
{"name": "Qwen3-Next-80B", "key": "OPENROUTER_API_KEY", "provider": "openai", "model_name": "qwen/qwen3-next-80b-a3b-instruct:free", "base_url": "https://openrouter.ai/api/v1"},
|
|
|
|
| 65 |
{"name": "Gemma-3-27B", "key": "OPENROUTER_API_KEY", "provider": "openai", "model_name": "google/gemma-3-27b-it:free", "base_url": "https://openrouter.ai/api/v1"},
|
| 66 |
+
{"name": "NVIDIA-Nemotron-Super", "key": "OPENROUTER_API_KEY", "provider": "openai", "model_name": "nvidia/nemotron-3-super-120b-a12b:free", "base_url": "https://openrouter.ai/api/v1"},
|
| 67 |
+
{"name": "OpenRouter-FreeRouter", "key": "OPENROUTER_API_KEY", "provider": "openai", "model_name": "openrouter/free", "base_url": "https://openrouter.ai/api/v1"},
|
| 68 |
+
{"name": "DeepSeek-R1", "key": "OPENROUTER_API_KEY", "provider": "openai", "model_name": "deepseek/deepseek-r1:free", "base_url": "https://openrouter.ai/api/v1"},
|
| 69 |
{"name": "Gemini-Flash", "key": "GOOGLE_API_KEY", "provider": "google", "model_name": "gemini-2.0-flash", "alternatives": gemini_alternatives},
|
| 70 |
{"name": "Groq", "key": "GROQ_API_KEY", "provider": "groq", "model_name": "llama-3.3-70b-versatile"},
|
| 71 |
]
|
|
|
|
| 523 |
if isinstance(msg, HumanMessage) and "[Attached File Local Path:" in msg.content:
|
| 524 |
messages.append(HumanMessage(content="IMPORTANT: I see an image path in the message. I MUST call the analyze_image tool IMMEDIATELY in my next step to see it."))
|
| 525 |
|
| 526 |
+
# Multi-step ReAct Loop (Up to 12 reasoning steps)
|
| 527 |
+
max_steps = 12
|
| 528 |
draft_response = None
|
| 529 |
current_tier = 0
|
| 530 |
+
|
| 531 |
+
for step in range(max_steps):
|
| 532 |
+
if step > 0:
|
| 533 |
+
time.sleep(3)
|
| 534 |
+
|
| 535 |
+
print(f"--- ReAct Step {step + 1} ---")
|
| 536 |
+
|
| 537 |
+
# Max history truncation to avoid 413 Request Too Large errors
|
| 538 |
+
safe_messages = messages[:2] + messages[-6:] if len(messages) > 10 else messages
|
| 539 |
+
|
| 540 |
+
ai_msg, current_tier = smart_invoke(safe_messages, use_tools=True, start_tier=current_tier)
|
| 541 |
+
messages.append(ai_msg)
|
| 542 |
+
|
| 543 |
+
# Check if the model requested tools
|
| 544 |
+
tool_calls = getattr(ai_msg, "tool_calls", None) or []
|
| 545 |
+
if not tool_calls:
|
| 546 |
+
# Model decided it has enough info to answer
|
| 547 |
+
draft_response = ai_msg
|
| 548 |
+
print(f"Model found answer or stopped tools: {ai_msg.content}")
|
| 549 |
+
break
|
| 550 |
|
| 551 |
# Execute requested tools and append their text output into the conversation
|
| 552 |
for tool_call in tool_calls:
|
app.py
CHANGED
|
@@ -142,17 +142,20 @@ def run_and_submit_all(profile: Optional[gr.OAuthProfile] = None):
|
|
| 142 |
|
| 143 |
import concurrent.futures
|
| 144 |
import time
|
| 145 |
-
|
| 146 |
-
|
| 147 |
-
max_workers = min(8, len(questions_data)) if questions_data else 1
|
| 148 |
with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
|
| 149 |
-
futures = {
|
|
|
|
|
|
|
|
|
|
| 150 |
|
| 151 |
for future in concurrent.futures.as_completed(futures):
|
| 152 |
res = future.result()
|
| 153 |
if res:
|
| 154 |
answers_payload.append({"task_id": res["task_id"], "submitted_answer": res["submitted_answer"]})
|
| 155 |
results_log.append({"Task ID": res["task_id"], "Question": res["question"], "Submitted Answer": res["submitted_answer"]})
|
|
|
|
| 156 |
|
| 157 |
if not answers_payload:
|
| 158 |
print("Agent did not produce any answers to submit.")
|
|
|
|
| 142 |
|
| 143 |
import concurrent.futures
|
| 144 |
import time
|
| 145 |
+
# Use 2 workers to avoid rate limits - free tier has strict limits
|
| 146 |
+
max_workers = 2
|
|
|
|
| 147 |
with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
|
| 148 |
+
futures = {}
|
| 149 |
+
for item in questions_data[:2]:
|
| 150 |
+
futures[executor.submit(process_item, item)] = item
|
| 151 |
+
time.sleep(1.5) # Stagger to avoid rate limits
|
| 152 |
|
| 153 |
for future in concurrent.futures.as_completed(futures):
|
| 154 |
res = future.result()
|
| 155 |
if res:
|
| 156 |
answers_payload.append({"task_id": res["task_id"], "submitted_answer": res["submitted_answer"]})
|
| 157 |
results_log.append({"Task ID": res["task_id"], "Question": res["question"], "Submitted Answer": res["submitted_answer"]})
|
| 158 |
+
time.sleep(0.5) # Small delay between completions
|
| 159 |
|
| 160 |
if not answers_payload:
|
| 161 |
print("Agent did not produce any answers to submit.")
|
compare_answers.py
ADDED
|
@@ -0,0 +1,66 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
"""Compare locally produced answers against GAIA ground truth.

Fetches the question list from the scoring space, downloads the GAIA
validation metadata from the Hugging Face Hub, then prints a per-task
comparison between ``backup_submission.json`` and the reference answers.
"""
import json
import os
import sys

import pyarrow.parquet as pq
import requests
from dotenv import load_dotenv
from huggingface_hub import hf_hub_download

load_dotenv()


def main() -> None:
    """Run the comparison and print a final score summary."""
    # Fetch questions from the scoring space.
    print("Fetching questions...")
    resp = requests.get(
        'https://agents-course-unit4-scoring.hf.space/questions', timeout=30
    )
    resp.raise_for_status()  # fail fast on a bad HTTP status
    questions = resp.json()
    print(f"Fetched {len(questions)} questions")

    # Ground truth lives in the gated GAIA dataset on the Hub (token required).
    print("Fetching ground truth...")
    token = os.getenv('HF_TOKEN') or os.getenv('HUGGINGFACEHUB_API_TOKEN')
    path = hf_hub_download(
        repo_id='gaia-benchmark/GAIA',
        filename='2023/validation/metadata.parquet',
        repo_type='dataset',
        token=token,
    )
    df = pq.read_table(path).to_pandas()

    # Mapping of task_id -> reference final answer.
    answer_map = dict(zip(df['task_id'], df['Final answer']))
    print(f"Loaded {len(answer_map)} ground truth answers")

    # Load the locally saved submission.
    submission_path = 'backup_submission.json'
    if not os.path.exists(submission_path):
        print(f"\nError: {submission_path} not found!")
        print("Please run your evaluation first to generate the submission file.")
        sys.exit(1)  # sys.exit, not the interactive-only builtin exit()

    with open(submission_path, 'r') as f:
        submission = json.load(f)

    answers = submission['answers']
    print(f"Loaded submission with {len(answers)} answers")

    # Detailed comparison.
    print('\n' + '=' * 70)
    print('DETAILED COMPARISON: Ground Truth vs Submitted Answers')
    print('=' * 70 + '\n')

    correct = 0
    for i, ans in enumerate(answers):
        task_id = ans['task_id']
        submitted = str(ans['submitted_answer']).strip()
        ground_truth = str(answer_map.get(task_id, 'NOT FOUND')).strip()

        # GAIA scoring is a case-insensitive exact match.
        is_correct = submitted.lower() == ground_truth.lower()
        if is_correct:
            correct += 1
        status = '✅' if is_correct else '❌'

        # Look up the question text for context (may be absent).
        q = next((x['question'] for x in questions if x['task_id'] == task_id), 'N/A')

        print(f"{status} [{i+1}] Task: {task_id[:30]}...")
        print(f"   Q: {q[:60]}...")
        print(f"   Submitted: {submitted[:50]}")
        print(f"   Ground:    {ground_truth[:50]}")
        print()

    print('=' * 70)
    # Guard against an empty submission to avoid ZeroDivisionError.
    total = len(answers)
    pct = correct / total * 100 if total else 0.0
    print(f'FINAL SCORE: {correct}/{total} = {pct:.0f}%')
    print('=' * 70)


if __name__ == "__main__":
    main()
|
run_local.py
ADDED
|
@@ -0,0 +1,124 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import requests
|
| 3 |
+
import pandas as pd
|
| 4 |
+
import pyarrow.parquet as pq
|
| 5 |
+
import json
|
| 6 |
+
import time
|
| 7 |
+
from langchain_core.messages import HumanMessage
|
| 8 |
+
from agent import build_graph
|
| 9 |
+
from huggingface_hub import hf_hub_download
|
| 10 |
+
from dotenv import load_dotenv
|
| 11 |
+
|
| 12 |
+
load_dotenv(override=True)
|
| 13 |
+
|
| 14 |
+
DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
|
| 15 |
+
|
| 16 |
+
class BasicAgent:
    """Thin wrapper that runs one question through the LangGraph agent."""

    def __init__(self):
        print("BasicAgent initialized.")
        # Build the agent graph once; reused for every question.
        self.graph = build_graph()

    def __call__(self, question: str) -> str:
        """Invoke the graph on a single question and return its final answer."""
        state = self.graph.invoke({"messages": [HumanMessage(content=question)]})
        return state['messages'][-1].content
|
| 26 |
+
|
| 27 |
+
def file_extract(local_file_path, task_id):
    """Resolve a GAIA attachment name to a local path via the HF Hub.

    Probes the validation, test, and train folders (then the repo root) in
    turn and returns the first download that succeeds, or None when the file
    cannot be found anywhere. ``task_id`` is accepted for interface
    compatibility with callers but is not used here.
    """
    if not local_file_path:
        return None

    auth = os.getenv("HUGGINGFACEHUB_API_TOKEN") or os.getenv("HF_TOKEN")
    for folder in ("2023/validation/", "2023/test/", "2023/train/", ""):
        try:
            return hf_hub_download(
                repo_id="gaia-benchmark/GAIA",
                filename=f"{folder}{local_file_path}",
                repo_type="dataset",
                token=auth,
            )
        except Exception:
            # File not under this folder (or gated/missing) — try the next one.
            continue
    return None
|
| 46 |
+
|
| 47 |
+
def main():
    """Run the agent locally over every GAIA question and score the answers.

    Steps: fetch the question list from the scoring space, load ground-truth
    answers from the GAIA dataset, run the agent on each question (attaching
    any referenced file), then print the score and save JSON/CSV results.
    """
    # 1. Fetch questions
    print("Fetching questions...")
    questions_url = f"{DEFAULT_API_URL}/questions"
    response = requests.get(questions_url, timeout=15)
    response.raise_for_status()  # fail fast instead of crashing inside .json()
    questions_data = response.json()
    print(f"Fetched {len(questions_data)} questions")

    # 2. Load ground truth (gated dataset — token required)
    print("Loading ground truth...")
    token = os.getenv("HF_TOKEN") or os.getenv("HUGGINGFACEHUB_API_TOKEN")
    path = hf_hub_download(repo_id='gaia-benchmark/GAIA', filename='2023/validation/metadata.parquet', repo_type='dataset', token=token)
    df = pq.read_table(path).to_pandas()
    answer_map = dict(zip(df['task_id'], df['Final answer']))

    # 3. Initialize agent
    agent = BasicAgent()

    # 4. Run on ALL questions
    results = []
    for i, item in enumerate(questions_data):
        task_id = item.get("task_id")
        question_text = item.get("question")
        file_name = item.get("file_name")

        # Skip malformed entries rather than crash mid-run.
        if not task_id or question_text is None:
            continue

        if file_name:
            resolved_path = file_extract(file_name, task_id)
            if resolved_path:
                question_text += f"\n\n[Attached File Local Path: {resolved_path}]"

        print(f"\n[{i+1}/{len(questions_data)}] Task: {task_id[:20]}...")

        try:
            answer = agent(question_text)
        except Exception as e:
            # Record the failure so one bad question doesn't abort the run.
            answer = f"ERROR: {e}"

        ground_truth = answer_map.get(task_id, "NOT FOUND")
        # GAIA scoring is a case-insensitive exact match.
        is_correct = str(answer).strip().lower() == str(ground_truth).strip().lower()

        results.append({
            "task_id": task_id,
            "question": item.get("question"),
            "submitted_answer": answer,
            "ground_truth": ground_truth,
            "correct": is_correct,
        })

        status = "✅" if is_correct else "❌"
        print(f"  {status} Submitted: {str(answer)[:40]}")
        print(f"     Ground: {str(ground_truth)[:40]}")

        time.sleep(1.5)  # stay under free-tier rate limits

    # 5. Calculate score (guarded against zero processed questions)
    correct_count = sum(1 for r in results if r["correct"])
    total = len(results)
    score_pct = correct_count / total * 100 if total > 0 else 0

    print("\n" + "=" * 60)
    print(f"FINAL SCORE: {correct_count}/{total} = {score_pct:.0f}%")
    print("=" * 60)

    # 6. Save results (JSON for reload, CSV for eyeballing)
    output = {"score": score_pct, "correct": correct_count, "total": total, "results": results}
    with open("gaia_results.json", "w") as f:
        json.dump(output, f, indent=2)
    pd.DataFrame(results).to_csv("gaia_results.csv", index=False)
    print("Results saved!")


if __name__ == "__main__":
    main()
|