Paperbag committed on
Commit
21be703
·
1 Parent(s): be57dce

claude fix

Browse files
.codex/config.toml ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ [shell_environment_policy]
2
+ inherit = "core"
3
+
4
+ [shell_environment_policy.set]
5
+ ANTHROPIC_API_KEY = ""
6
+ ANTHROPIC_AUTH_TOKEN = ""  # SECURITY: a live API key was committed here — it must be rotated immediately and loaded from an environment variable or secrets manager, never hard-coded in the repo
7
+ ANTHROPIC_BASE_URL = "https://openrouter.ai/api"
8
+ ANTHROPIC_MODEL = "qwen/qwen3-coder:free"
__pycache__/agent.cpython-312.pyc CHANGED
Binary files a/__pycache__/agent.cpython-312.pyc and b/__pycache__/agent.cpython-312.pyc differ
 
agent.py CHANGED
@@ -61,11 +61,11 @@ def smart_invoke(msgs, use_tools=False, start_tier=0):
61
  gemini_alternatives = ["gemini-2.5-flash", "gemini-2.0-flash", "gemini-flash-latest", "gemini-pro-latest"]
62
 
63
  tiers_config = [
64
- {"name": "OpenRouter-FreeRouter", "key": "OPENROUTER_API_KEY", "provider": "openai", "model_name": "openrouter/free", "base_url": "https://openrouter.ai/api/v1"},
65
- {"name": "DeepSeek-R1", "key": "OPENROUTER_API_KEY", "provider": "openai", "model_name": "deepseek/deepseek-r1:free", "base_url": "https://openrouter.ai/api/v1"},
66
  {"name": "Qwen3-Next-80B", "key": "OPENROUTER_API_KEY", "provider": "openai", "model_name": "qwen/qwen3-next-80b-a3b-instruct:free", "base_url": "https://openrouter.ai/api/v1"},
67
- {"name": "NVIDIA-Nemotron-Super", "key": "OPENROUTER_API_KEY", "provider": "openai", "model_name": "nvidia/nemotron-3-super-120b-a12b:free", "base_url": "https://openrouter.ai/api/v1"},
68
  {"name": "Gemma-3-27B", "key": "OPENROUTER_API_KEY", "provider": "openai", "model_name": "google/gemma-3-27b-it:free", "base_url": "https://openrouter.ai/api/v1"},
 
 
 
69
  {"name": "Gemini-Flash", "key": "GOOGLE_API_KEY", "provider": "google", "model_name": "gemini-2.0-flash", "alternatives": gemini_alternatives},
70
  {"name": "Groq", "key": "GROQ_API_KEY", "provider": "groq", "model_name": "llama-3.3-70b-versatile"},
71
  ]
@@ -523,15 +523,30 @@ CRITICAL RULES:
523
  if isinstance(msg, HumanMessage) and "[Attached File Local Path:" in msg.content:
524
  messages.append(HumanMessage(content="IMPORTANT: I see an image path in the message. I MUST call the analyze_image tool IMMEDIATELY in my next step to see it."))
525
 
526
- # One-shot reasoning for better latency and more consistent accuracy on short QA.
 
527
  draft_response = None
528
  current_tier = 0
529
-
530
- print("--- One-shot response invocation ---")
531
- ai_msg, current_tier = smart_invoke(messages, use_tools=False, start_tier=current_tier)
532
- messages.append(ai_msg)
533
- draft_response = ai_msg
534
- print(f"Model returned answer: {ai_msg.content}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
535
 
536
  # Execute requested tools and append their text output into the conversation
537
  for tool_call in tool_calls:
 
61
  gemini_alternatives = ["gemini-2.5-flash", "gemini-2.0-flash", "gemini-flash-latest", "gemini-pro-latest"]
62
 
63
  tiers_config = [
 
 
64
  {"name": "Qwen3-Next-80B", "key": "OPENROUTER_API_KEY", "provider": "openai", "model_name": "qwen/qwen3-next-80b-a3b-instruct:free", "base_url": "https://openrouter.ai/api/v1"},
 
65
  {"name": "Gemma-3-27B", "key": "OPENROUTER_API_KEY", "provider": "openai", "model_name": "google/gemma-3-27b-it:free", "base_url": "https://openrouter.ai/api/v1"},
66
+ {"name": "NVIDIA-Nemotron-Super", "key": "OPENROUTER_API_KEY", "provider": "openai", "model_name": "nvidia/nemotron-3-super-120b-a12b:free", "base_url": "https://openrouter.ai/api/v1"},
67
+ {"name": "OpenRouter-FreeRouter", "key": "OPENROUTER_API_KEY", "provider": "openai", "model_name": "openrouter/free", "base_url": "https://openrouter.ai/api/v1"},
68
+ {"name": "DeepSeek-R1", "key": "OPENROUTER_API_KEY", "provider": "openai", "model_name": "deepseek/deepseek-r1:free", "base_url": "https://openrouter.ai/api/v1"},
69
  {"name": "Gemini-Flash", "key": "GOOGLE_API_KEY", "provider": "google", "model_name": "gemini-2.0-flash", "alternatives": gemini_alternatives},
70
  {"name": "Groq", "key": "GROQ_API_KEY", "provider": "groq", "model_name": "llama-3.3-70b-versatile"},
71
  ]
 
523
  if isinstance(msg, HumanMessage) and "[Attached File Local Path:" in msg.content:
524
  messages.append(HumanMessage(content="IMPORTANT: I see an image path in the message. I MUST call the analyze_image tool IMMEDIATELY in my next step to see it."))
525
 
526
+ # Multi-step ReAct Loop (Up to 12 reasoning steps)
527
+ max_steps = 12
528
  draft_response = None
529
  current_tier = 0
530
+
531
+ for step in range(max_steps):
532
+ if step > 0:
533
+ time.sleep(3)
534
+
535
+ print(f"--- ReAct Step {step + 1} ---")
536
+
537
+ # Max history truncation to avoid 413 Request Too Large errors
538
+ safe_messages = messages[:2] + messages[-6:] if len(messages) > 10 else messages
539
+
540
+ ai_msg, current_tier = smart_invoke(safe_messages, use_tools=True, start_tier=current_tier)
541
+ messages.append(ai_msg)
542
+
543
+ # Check if the model requested tools
544
+ tool_calls = getattr(ai_msg, "tool_calls", None) or []
545
+ if not tool_calls:
546
+ # Model decided it has enough info to answer
547
+ draft_response = ai_msg
548
+ print(f"Model found answer or stopped tools: {ai_msg.content}")
549
+ break
550
 
551
  # Execute requested tools and append their text output into the conversation
552
  for tool_call in tool_calls:
app.py CHANGED
@@ -142,17 +142,20 @@ def run_and_submit_all(profile: Optional[gr.OAuthProfile] = None):
142
 
143
  import concurrent.futures
144
  import time
145
-
146
- # Improve throughput while respecting rate limits; avoid fixed sleep delays that slow down the entire run.
147
- max_workers = min(8, len(questions_data)) if questions_data else 1
148
  with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
149
- futures = {executor.submit(process_item, item): item for item in questions_data}
 
 
 
150
 
151
  for future in concurrent.futures.as_completed(futures):
152
  res = future.result()
153
  if res:
154
  answers_payload.append({"task_id": res["task_id"], "submitted_answer": res["submitted_answer"]})
155
  results_log.append({"Task ID": res["task_id"], "Question": res["question"], "Submitted Answer": res["submitted_answer"]})
 
156
 
157
  if not answers_payload:
158
  print("Agent did not produce any answers to submit.")
 
142
 
143
  import concurrent.futures
144
  import time
145
+ # Use 2 workers to avoid rate limits - free tier has strict limits
146
+ max_workers = 2
 
147
  with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
148
+ futures = {}
149
+ for item in questions_data[:2]:
150
+ futures[executor.submit(process_item, item)] = item
151
+ time.sleep(1.5) # Stagger to avoid rate limits
152
 
153
  for future in concurrent.futures.as_completed(futures):
154
  res = future.result()
155
  if res:
156
  answers_payload.append({"task_id": res["task_id"], "submitted_answer": res["submitted_answer"]})
157
  results_log.append({"Task ID": res["task_id"], "Question": res["question"], "Submitted Answer": res["submitted_answer"]})
158
+ time.sleep(0.5) # Small delay between completions
159
 
160
  if not answers_payload:
161
  print("Agent did not produce any answers to submit.")
compare_answers.py ADDED
@@ -0,0 +1,66 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""Compare a saved submission against GAIA ground-truth answers and print a score.

Fixes over the previous version:
- ``sys.exit`` instead of the interactive-only ``exit`` builtin.
- HTTP fetch gets a timeout and ``raise_for_status`` so a server error fails
  fast instead of producing a confusing JSON decode error.
- Question lookup uses a dict (O(1) per answer) instead of a linear scan.
- Final-score print guards against an empty answers list (ZeroDivisionError).
"""
import json
import os
import sys

import pyarrow.parquet as pq
import requests
from dotenv import load_dotenv
from huggingface_hub import hf_hub_download

load_dotenv()

# Fetch questions from scoring space
print("Fetching questions...")
resp = requests.get('https://agents-course-unit4-scoring.hf.space/questions', timeout=15)
resp.raise_for_status()
questions = resp.json()
print(f"Fetched {len(questions)} questions")

# Get ground truth from HF
print("Fetching ground truth...")
token = os.getenv('HF_TOKEN') or os.getenv('HUGGINGFACEHUB_API_TOKEN')
path = hf_hub_download(repo_id='gaia-benchmark/GAIA', filename='2023/validation/metadata.parquet', repo_type='dataset', token=token)
df = pq.read_table(path).to_pandas()

# Create mapping of task_id -> ground truth answer
answer_map = dict(zip(df['task_id'], df['Final answer']))
print(f"Loaded {len(answer_map)} ground truth answers")

# Load submission produced by a previous evaluation run
submission_path = 'backup_submission.json'
if not os.path.exists(submission_path):
    print(f"\nError: {submission_path} not found!")
    print("Please run your evaluation first to generate the submission file.")
    sys.exit(1)

with open(submission_path, 'r') as f:
    submission = json.load(f)

answers = submission['answers']
print(f"Loaded submission with {len(answers)} answers")

# Map task_id -> question text once, instead of scanning per answer.
question_map = {x['task_id']: x['question'] for x in questions}

# Detailed comparison
print('\n' + '=' * 70)
print('DETAILED COMPARISON: Ground Truth vs Submitted Answers')
print('=' * 70 + '\n')

correct = 0
for i, ans in enumerate(answers):
    task_id = ans['task_id']
    submitted = str(ans['submitted_answer']).strip()
    ground_truth = str(answer_map.get(task_id, 'NOT FOUND')).strip()

    # Case-insensitive exact match, mirroring the GAIA scoring convention.
    is_correct = submitted.lower() == ground_truth.lower()
    if is_correct:
        correct += 1
        status = '✅'
    else:
        status = '❌'

    q = question_map.get(task_id, 'N/A')

    print(f"{status} [{i+1}] Task: {task_id[:30]}...")
    print(f"   Q: {q[:60]}...")
    print(f"   Submitted: {submitted[:50]}")
    print(f"   Ground:    {ground_truth[:50]}")
    print()

print('=' * 70)
if answers:
    print(f'FINAL SCORE: {correct}/{len(answers)} = {correct/len(answers)*100:.0f}%')
else:
    # Avoid ZeroDivisionError when the submission file holds no answers.
    print('FINAL SCORE: 0/0 (no answers in submission)')
print('=' * 70)
run_local.py ADDED
@@ -0,0 +1,124 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import requests
3
+ import pandas as pd
4
+ import pyarrow.parquet as pq
5
+ import json
6
+ import time
7
+ from langchain_core.messages import HumanMessage
8
+ from agent import build_graph
9
+ from huggingface_hub import hf_hub_download
10
+ from dotenv import load_dotenv
11
+
12
+ load_dotenv(override=True)
13
+
14
+ DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
15
+
16
class BasicAgent:
    """Thin callable wrapper around the LangGraph agent from ``build_graph``."""

    def __init__(self):
        print("BasicAgent initialized.")
        self.graph = build_graph()

    def __call__(self, question: str) -> str:
        """Run the graph on one question and return the final message content."""
        outcome = self.graph.invoke({"messages": [HumanMessage(content=question)]})
        return outcome["messages"][-1].content
26
+
27
def file_extract(local_file_path, task_id):
    """Resolve a GAIA attachment filename to a local path via the HF Hub.

    Tries the known dataset split prefixes in order and returns the first
    path that downloads successfully; returns ``None`` when the filename is
    empty or no prefix matches. ``task_id`` is kept for interface
    compatibility with callers but is not used here.
    """
    if not local_file_path:
        return None

    auth = os.getenv("HUGGINGFACEHUB_API_TOKEN") or os.getenv("HF_TOKEN")
    for split_prefix in ("2023/validation/", "2023/test/", "2023/train/", ""):
        try:
            return hf_hub_download(
                repo_id="gaia-benchmark/GAIA",
                filename=f"{split_prefix}{local_file_path}",
                repo_type="dataset",
                token=auth,
            )
        except Exception:
            # File not under this prefix (or a transient hub error): try the next.
            continue
    return None
46
+
47
def main():
    """Run the agent locally on every GAIA question and score the answers.

    Workflow: fetch the question list from the scoring space, load the
    ground-truth metadata parquet from the HF Hub, run the agent on each
    question (attaching a resolved local file path when the question has an
    attachment), compare answers case-insensitively against ground truth,
    and persist the results to ``gaia_results.json`` / ``gaia_results.csv``.
    """
    # 1. Fetch questions
    print("Fetching questions...")
    questions_url = f"{DEFAULT_API_URL}/questions"
    response = requests.get(questions_url, timeout=15)
    # Fail fast on HTTP errors instead of failing later on a non-JSON body.
    response.raise_for_status()
    questions_data = response.json()
    print(f"Fetched {len(questions_data)} questions")

    # 2. Load ground truth
    print("Loading ground truth...")
    token = os.getenv("HF_TOKEN") or os.getenv("HUGGINGFACEHUB_API_TOKEN")
    path = hf_hub_download(repo_id='gaia-benchmark/GAIA', filename='2023/validation/metadata.parquet', repo_type='dataset', token=token)
    df = pq.read_table(path).to_pandas()
    answer_map = dict(zip(df['task_id'], df['Final answer']))

    # 3. Initialize agent
    agent = BasicAgent()

    # 4. Run on all questions (can slice questions_data for testing)
    results = []

    for i, item in enumerate(questions_data):
        task_id = item.get("task_id")
        question_text = item.get("question")
        file_name = item.get("file_name")

        # Skip malformed entries rather than crashing mid-run.
        if not task_id or question_text is None:
            continue

        if file_name:
            resolved_path = file_extract(file_name, task_id)
            if resolved_path:
                question_text += f"\n\n[Attached File Local Path: {resolved_path}]"

        print(f"\n[{i+1}/{len(questions_data)}] Task: {task_id[:20]}...")

        # Best-effort: one failing question must not abort the whole run.
        try:
            answer = agent(question_text)
        except Exception as e:
            answer = f"ERROR: {e}"

        ground_truth = answer_map.get(task_id, "NOT FOUND")
        # Case-insensitive exact match mirrors the GAIA scoring convention.
        is_correct = str(answer).strip().lower() == str(ground_truth).strip().lower()

        results.append({
            "task_id": task_id,
            "question": item.get("question"),
            "submitted_answer": answer,
            "ground_truth": ground_truth,
            "correct": is_correct
        })

        status = "✅" if is_correct else "❌"
        print(f" {status} Submitted: {str(answer)[:40]}")
        print(f" Ground: {str(ground_truth)[:40]}")

        # Throttle between questions to stay under free-tier rate limits.
        time.sleep(1.5)

    # 5. Calculate score (guard against a run that produced no results)
    correct_count = sum(1 for r in results if r["correct"])
    total = len(results)
    score_pct = correct_count / total * 100 if total > 0 else 0

    print("\n" + "="*60)
    print(f"FINAL SCORE: {correct_count}/{total} = {score_pct:.0f}%")
    print("="*60)

    # 6. Save results
    output = {"score": score_pct, "correct": correct_count, "total": total, "results": results}

    with open("gaia_results.json", "w") as f:
        json.dump(output, f, indent=2)
    pd.DataFrame(results).to_csv("gaia_results.csv", index=False)
    print("Results saved!")
123
+ if __name__ == "__main__":
124
+ main()