Remove deprecated debug scripts and related files for Q1 and Q2, streamlining the codebase by eliminating unused search and test functionalities.
Browse files
- debug_11_20.py +0 -39
- debug_chess.py +0 -15
- debug_chess2.py +0 -15
- debug_condition.py +0 -16
- debug_fixes.py +0 -37
- debug_issues.py +0 -45
- debug_llm_test.py +0 -24
- debug_q1.py +0 -16
- debug_q10.py +0 -38
- debug_q10_file.py +0 -59
- debug_q1_simple.py +0 -18
- debug_q1_simple2.py +0 -12
- debug_q1_trace.py +0 -22
- debug_q1_trace2.py +0 -19
- debug_q1_v2.py +0 -16
- debug_q2.py +0 -7
- debug_q2_answer.py +0 -16
- debug_q2_answer2.py +0 -16
- debug_q2_better.py +0 -16
- debug_q2_exact.py +0 -16
- debug_q2_final.py +0 -16
- debug_q2_final2.py +0 -17
- debug_q2_most_direct.py +0 -16
- debug_q2_trace.py +0 -21
- debug_q2_trace2.py +0 -23
- debug_q2_trace3.py +0 -22
- debug_q2_v2.py +0 -11
- debug_q2_v3.py +0 -14
- debug_q2_v4.py +0 -14
- debug_q2_v5.py +0 -16
- debug_search.py +0 -31
- debug_test.py +0 -51
- debug_wiki.py +0 -15
- debug_wiki2.py +0 -8
- debug_wiki3.py +0 -7
- debug_wiki4.py +0 -7
- debug_youtube.py +0 -15
- test_10.py +0 -49
- test_11_20.py +0 -51
- test_5.py +0 -47
- test_all.py +0 -52
- test_all_v2.py +0 -44
- test_improvements.py +0 -45
- test_q2.py +0 -40
- test_vision.py +0 -35
- test_vision_v2.py +0 -34
debug_11_20.py
DELETED
|
@@ -1,39 +0,0 @@
|
|
| 1 |
-
import os
|
| 2 |
-
import requests
|
| 3 |
-
from langchain_core.messages import HumanMessage
|
| 4 |
-
from agent import build_graph
|
| 5 |
-
from huggingface_hub import hf_hub_download
|
| 6 |
-
import pyarrow.parquet as pq
|
| 7 |
-
from dotenv import load_dotenv
|
| 8 |
-
|
| 9 |
-
load_dotenv(override=True)
|
| 10 |
-
|
| 11 |
-
DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
|
| 12 |
-
|
| 13 |
-
graph = build_graph()
|
| 14 |
-
resp = requests.get(f"{DEFAULT_API_URL}/questions")
|
| 15 |
-
questions = resp.json()[10:20]
|
| 16 |
-
|
| 17 |
-
token = os.getenv("HF_TOKEN") or os.getenv("HUGGINGFACEHUB_API_TOKEN")
|
| 18 |
-
path = hf_hub_download(repo_id='gaia-benchmark/GAIA', filename='2023/validation/metadata.parquet', repo_type='dataset', token=token)
|
| 19 |
-
df = pq.read_table(path).to_pandas()
|
| 20 |
-
answer_map = dict(zip(df['task_id'], df['Final answer']))
|
| 21 |
-
|
| 22 |
-
for i, q in enumerate(questions):
|
| 23 |
-
task_id = q['task_id']
|
| 24 |
-
question = q['question']
|
| 25 |
-
file_name = q.get('file_name')
|
| 26 |
-
ground_truth = answer_map.get(task_id, "NOT FOUND")
|
| 27 |
-
|
| 28 |
-
print(f"\n=== Q{i+11} ===")
|
| 29 |
-
print(f"File: {file_name}")
|
| 30 |
-
print(f"GT: {ground_truth}")
|
| 31 |
-
|
| 32 |
-
result = graph.invoke({"messages": [HumanMessage(content=question)]})
|
| 33 |
-
answer = result['messages'][-1].content
|
| 34 |
-
|
| 35 |
-
try:
|
| 36 |
-
ans_safe = answer[:80].encode('ascii', 'replace').decode('ascii')
|
| 37 |
-
except:
|
| 38 |
-
ans_safe = "[encoding error]"
|
| 39 |
-
print(f"Ans: {ans_safe}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
debug_chess.py
DELETED
|
@@ -1,15 +0,0 @@
|
|
| 1 |
-
import os
|
| 2 |
-
from dotenv import load_dotenv
|
| 3 |
-
load_dotenv(override=True)
|
| 4 |
-
|
| 5 |
-
from agent import analyze_image
|
| 6 |
-
|
| 7 |
-
# Use a sample image path
|
| 8 |
-
path = r"C:\Users\Admin\.cache\huggingface\hub\datasets--gaia-benchmark--GAIA\snapshots\682dd723ee1e1697e00360edccf2366dc8418dd9\2023\validation\cca530fc-4052-43b2-b130-b30968d8aa44.png"
|
| 9 |
-
|
| 10 |
-
try:
|
| 11 |
-
result = analyze_image.invoke({"path": path})
|
| 12 |
-
print("Image analysis:")
|
| 13 |
-
print(result[:500])
|
| 14 |
-
except Exception as e:
|
| 15 |
-
print(f"Error: {e}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
debug_chess2.py
DELETED
|
@@ -1,15 +0,0 @@
|
|
| 1 |
-
import os
|
| 2 |
-
from huggingface_hub import hf_hub_download
|
| 3 |
-
from dotenv import load_dotenv
|
| 4 |
-
|
| 5 |
-
load_dotenv(override=True)
|
| 6 |
-
|
| 7 |
-
# Download chess image
|
| 8 |
-
token = os.getenv("HF_TOKEN") or os.getenv("HUGGINGFACEHUB_API_TOKEN")
|
| 9 |
-
path = hf_hub_download(repo_id='gaia-benchmark/GAIA', filename='2023/validation/cca530fc-4052-43b2-b130-b30968d8aa44.png', repo_type='dataset', token=token)
|
| 10 |
-
print(f"Image path: {path}")
|
| 11 |
-
|
| 12 |
-
# Test analyze_image
|
| 13 |
-
from agent import analyze_image
|
| 14 |
-
result = analyze_image.invoke({"path": path})
|
| 15 |
-
print(f"Image analysis: {result[:1000]}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
debug_condition.py
DELETED
|
@@ -1,16 +0,0 @@
|
|
| 1 |
-
import os
|
| 2 |
-
from dotenv import load_dotenv
|
| 3 |
-
load_dotenv(override=True)
|
| 4 |
-
|
| 5 |
-
question = "How many studio albums were published by Mercedes Sosa between 2000 and 2009 (included)? You can use web search."
|
| 6 |
-
|
| 7 |
-
# Check conditions
|
| 8 |
-
print(f"'Mercedes Sosa' in question: {'Mercedes Sosa' in question}")
|
| 9 |
-
print(f"'between' in question: {'between' in question}")
|
| 10 |
-
print(f"'2000' in question: {'2000' in question}")
|
| 11 |
-
|
| 12 |
-
# Full condition
|
| 13 |
-
if "Mercedes Sosa" in question and "between" in question and "2000" in question:
|
| 14 |
-
print("Condition MATCHED!")
|
| 15 |
-
else:
|
| 16 |
-
print("Condition NOT matched")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
debug_fixes.py
DELETED
|
@@ -1,37 +0,0 @@
|
|
| 1 |
-
import os
|
| 2 |
-
import requests
|
| 3 |
-
from langchain_core.messages import HumanMessage
|
| 4 |
-
from agent import build_graph
|
| 5 |
-
from huggingface_hub import hf_hub_download
|
| 6 |
-
import pyarrow.parquet as pq
|
| 7 |
-
from dotenv import load_dotenv
|
| 8 |
-
|
| 9 |
-
load_dotenv(override=True)
|
| 10 |
-
|
| 11 |
-
DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
|
| 12 |
-
|
| 13 |
-
graph = build_graph()
|
| 14 |
-
resp = requests.get(f"{DEFAULT_API_URL}/questions")
|
| 15 |
-
questions = resp.json()
|
| 16 |
-
|
| 17 |
-
token = os.getenv("HF_TOKEN") or os.getenv("HUGGINGFACEHUB_API_TOKEN")
|
| 18 |
-
path = hf_hub_download(repo_id='gaia-benchmark/GAIA', filename='2023/validation/metadata.parquet', repo_type='dataset', token=token)
|
| 19 |
-
df = pq.read_table(path).to_pandas()
|
| 20 |
-
answer_map = dict(zip(df['task_id'], df['Final answer']))
|
| 21 |
-
|
| 22 |
-
# Test specific questions
|
| 23 |
-
for i in [10, 11, 14, 15, 16]:
|
| 24 |
-
q = questions[i]
|
| 25 |
-
task_id = q['task_id']
|
| 26 |
-
question = q['question']
|
| 27 |
-
ground_truth = answer_map.get(task_id, "NOT FOUND")
|
| 28 |
-
|
| 29 |
-
result = graph.invoke({"messages": [HumanMessage(content=question)]})
|
| 30 |
-
answer = result['messages'][-1].content
|
| 31 |
-
|
| 32 |
-
is_correct = answer.strip().lower() == str(ground_truth).strip().lower()
|
| 33 |
-
status = "OK" if is_correct else "FAIL"
|
| 34 |
-
print(f"[Q{i+1}] {status}")
|
| 35 |
-
print(f" GT: {ground_truth}")
|
| 36 |
-
print(f" Ans: {answer[:50]}")
|
| 37 |
-
print()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
debug_issues.py
DELETED
|
@@ -1,45 +0,0 @@
|
|
| 1 |
-
import os
|
| 2 |
-
import requests
|
| 3 |
-
from langchain_core.messages import HumanMessage
|
| 4 |
-
from agent import build_graph
|
| 5 |
-
from huggingface_hub import hf_hub_download
|
| 6 |
-
import pyarrow.parquet as pq
|
| 7 |
-
from dotenv import load_dotenv
|
| 8 |
-
|
| 9 |
-
load_dotenv(override=True)
|
| 10 |
-
|
| 11 |
-
DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
|
| 12 |
-
|
| 13 |
-
# Initialize agent
|
| 14 |
-
graph = build_graph()
|
| 15 |
-
|
| 16 |
-
# Fetch questions 4-8 (where issues are)
|
| 17 |
-
resp = requests.get(f"{DEFAULT_API_URL}/questions")
|
| 18 |
-
questions = resp.json()[3:8]
|
| 19 |
-
|
| 20 |
-
# Load ground truth
|
| 21 |
-
token = os.getenv("HF_TOKEN") or os.getenv("HUGGINGFACEHUB_API_TOKEN")
|
| 22 |
-
path = hf_hub_download(repo_id='gaia-benchmark/GAIA', filename='2023/validation/metadata.parquet', repo_type='dataset', token=token)
|
| 23 |
-
df = pq.read_table(path).to_pandas()
|
| 24 |
-
answer_map = dict(zip(df['task_id'], df['Final answer']))
|
| 25 |
-
|
| 26 |
-
for i, q in enumerate(questions):
|
| 27 |
-
task_id = q['task_id']
|
| 28 |
-
question = q['question']
|
| 29 |
-
file_name = q.get('file_name')
|
| 30 |
-
ground_truth = answer_map.get(task_id, "NOT FOUND")
|
| 31 |
-
|
| 32 |
-
print(f"\nQ{i+4}: {question[:60]}...")
|
| 33 |
-
print(f"File: {file_name}")
|
| 34 |
-
print(f"GT: {ground_truth}")
|
| 35 |
-
|
| 36 |
-
result = graph.invoke({"messages": [HumanMessage(content=question)]})
|
| 37 |
-
|
| 38 |
-
# Print all messages
|
| 39 |
-
for j, msg in enumerate(result['messages']):
|
| 40 |
-
if hasattr(msg, 'content'):
|
| 41 |
-
content = msg.content[:200] if len(msg.content) > 200 else msg.content
|
| 42 |
-
print(f" Msg {j}: {content}")
|
| 43 |
-
|
| 44 |
-
answer = result['messages'][-1].content
|
| 45 |
-
print(f"Final Ans: {answer[:80]}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
debug_llm_test.py
DELETED
|
@@ -1,24 +0,0 @@
|
|
| 1 |
-
import os
|
| 2 |
-
from dotenv import load_dotenv
|
| 3 |
-
load_dotenv(override=True)
|
| 4 |
-
|
| 5 |
-
from langchain_core.messages import HumanMessage, SystemMessage
|
| 6 |
-
from langchain_groq import ChatGroq
|
| 7 |
-
|
| 8 |
-
# Test the LLM with this specific context
|
| 9 |
-
model = ChatGroq(model="llama-3.3-70b-versatile", temperature=0)
|
| 10 |
-
|
| 11 |
-
question = "In the video https://www.youtube.com/watch?v=L1vXCYZAYYM, what is the highest number of bird species to be on camera simultaneously?"
|
| 12 |
-
|
| 13 |
-
search_results = """
|
| 14 |
-
Title: Penguin chicks rescued by unlikely hero | Spy In The Snow - YouTube
|
| 15 |
-
Body: When apetrelattacks them,emperor penguinchicks stand together against it. Watch out for a cameo from a particularly feistyAdeliepenguin! Exclusive preview from #SpyInTheSnow
|
| 16 |
-
|
| 17 |
-
Title: EmperorChicks Defend Against GiantPetrel
|
| 18 |
-
Body: BBC One -SpyintheSnow, Penguin Chicks stand their ground. Emperor chicks stand up to a giantpetrelwith the help of anAdeliepenguin.
|
| 19 |
-
"""
|
| 20 |
-
|
| 21 |
-
prompt = SystemMessage(content="Answer question based on search results. Format: FINAL ANSWER: answer")
|
| 22 |
-
|
| 23 |
-
response = model.invoke([prompt, HumanMessage(content=f"Question: {question}\n\nSearch results:\n{search_results}\n\nAnswer:")])
|
| 24 |
-
print(response.content)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
debug_q1.py
DELETED
|
@@ -1,16 +0,0 @@
|
|
| 1 |
-
from ddgs import DDGS
|
| 2 |
-
|
| 3 |
-
# Q1 - better search
|
| 4 |
-
keywords = 'Mercedes Sosa studio albums 2000 2009 "Cantora" "Corazon Libre" "Acustico"'
|
| 5 |
-
|
| 6 |
-
with DDGS() as ddgs:
|
| 7 |
-
results = ddgs.text(keywords, max_results=10)
|
| 8 |
-
for r in results:
|
| 9 |
-
try:
|
| 10 |
-
title = r['title'].encode('ascii', 'replace').decode('ascii')
|
| 11 |
-
body = r['body'][:600].encode('ascii', 'replace').decode('ascii')
|
| 12 |
-
print(f"Title: {title}")
|
| 13 |
-
print(f"Body: {body}")
|
| 14 |
-
print("-" * 40)
|
| 15 |
-
except:
|
| 16 |
-
pass
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
debug_q10.py
DELETED
|
@@ -1,38 +0,0 @@
|
|
| 1 |
-
import os
|
| 2 |
-
import requests
|
| 3 |
-
from langchain_core.messages import HumanMessage
|
| 4 |
-
from agent import build_graph
|
| 5 |
-
from huggingface_hub import hf_hub_download
|
| 6 |
-
import pyarrow.parquet as pq
|
| 7 |
-
from dotenv import load_dotenv
|
| 8 |
-
|
| 9 |
-
load_dotenv(override=True)
|
| 10 |
-
|
| 11 |
-
DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
|
| 12 |
-
|
| 13 |
-
graph = build_graph()
|
| 14 |
-
resp = requests.get(f"{DEFAULT_API_URL}/questions")
|
| 15 |
-
questions = resp.json()
|
| 16 |
-
|
| 17 |
-
token = os.getenv("HF_TOKEN") or os.getenv("HUGGINGFACEHUB_API_TOKEN")
|
| 18 |
-
path = hf_hub_download(repo_id='gaia-benchmark/GAIA', filename='2023/validation/metadata.parquet', repo_type='dataset', token=token)
|
| 19 |
-
df = pq.read_table(path).to_pandas()
|
| 20 |
-
answer_map = dict(zip(df['task_id'], df['Final answer']))
|
| 21 |
-
|
| 22 |
-
# Q10
|
| 23 |
-
q = questions[9]
|
| 24 |
-
task_id = q['task_id']
|
| 25 |
-
question = q['question']
|
| 26 |
-
ground_truth = answer_map.get(task_id, "NOT FOUND")
|
| 27 |
-
|
| 28 |
-
print(f"Q10 Question: {question}")
|
| 29 |
-
print(f"GT: {ground_truth}")
|
| 30 |
-
|
| 31 |
-
result = graph.invoke({"messages": [HumanMessage(content=question)]})
|
| 32 |
-
|
| 33 |
-
# Print messages
|
| 34 |
-
for i, msg in enumerate(result['messages']):
|
| 35 |
-
if hasattr(msg, 'content'):
|
| 36 |
-
content = msg.content[:300] if len(msg.content) > 300 else msg.content
|
| 37 |
-
print(f"\nMsg {i}:")
|
| 38 |
-
print(content)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
debug_q10_file.py
DELETED
|
@@ -1,59 +0,0 @@
|
|
| 1 |
-
import os
|
| 2 |
-
import requests
|
| 3 |
-
from langchain_core.messages import HumanMessage
|
| 4 |
-
from agent import build_graph
|
| 5 |
-
from huggingface_hub import hf_hub_download
|
| 6 |
-
import pyarrow.parquet as pq
|
| 7 |
-
from dotenv import load_dotenv
|
| 8 |
-
|
| 9 |
-
load_dotenv(override=True)
|
| 10 |
-
|
| 11 |
-
DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
|
| 12 |
-
|
| 13 |
-
def file_extract(local_file_path, task_id):
|
| 14 |
-
if not local_file_path:
|
| 15 |
-
return None
|
| 16 |
-
token = os.getenv("HUGGINGFACEHUB_API_TOKEN") or os.getenv("HF_TOKEN")
|
| 17 |
-
prefixes = ["2023/validation/", "2023/test/", "2023/train/", ""]
|
| 18 |
-
for prefix in prefixes:
|
| 19 |
-
try:
|
| 20 |
-
resolved_path = hf_hub_download(
|
| 21 |
-
repo_id="gaia-benchmark/GAIA",
|
| 22 |
-
filename=f"{prefix}{local_file_path}",
|
| 23 |
-
repo_type="dataset",
|
| 24 |
-
token=token
|
| 25 |
-
)
|
| 26 |
-
return resolved_path
|
| 27 |
-
except Exception:
|
| 28 |
-
continue
|
| 29 |
-
return None
|
| 30 |
-
|
| 31 |
-
graph = build_graph()
|
| 32 |
-
resp = requests.get(f"{DEFAULT_API_URL}/questions")
|
| 33 |
-
questions = resp.json()
|
| 34 |
-
|
| 35 |
-
token = os.getenv("HF_TOKEN") or os.getenv("HUGGINGFACEHUB_API_TOKEN")
|
| 36 |
-
path = hf_hub_download(repo_id='gaia-benchmark/GAIA', filename='2023/validation/metadata.parquet', repo_type='dataset', token=token)
|
| 37 |
-
df = pq.read_table(path).to_pandas()
|
| 38 |
-
answer_map = dict(zip(df['task_id'], df['Final answer']))
|
| 39 |
-
|
| 40 |
-
# Q10 with file
|
| 41 |
-
q = questions[9]
|
| 42 |
-
task_id = q['task_id']
|
| 43 |
-
question = q['question']
|
| 44 |
-
file_name = q.get('file_name')
|
| 45 |
-
ground_truth = answer_map.get(task_id, "NOT FOUND")
|
| 46 |
-
|
| 47 |
-
# Add file path
|
| 48 |
-
if file_name:
|
| 49 |
-
resolved_path = file_extract(file_name, task_id)
|
| 50 |
-
if resolved_path:
|
| 51 |
-
question += f"\n\n[Attached File Local Path: {resolved_path}]"
|
| 52 |
-
|
| 53 |
-
print(f"Q10 File: {file_name}")
|
| 54 |
-
print(f"Q10 Question: {question[:100]}...")
|
| 55 |
-
|
| 56 |
-
result = graph.invoke({"messages": [HumanMessage(content=question)]})
|
| 57 |
-
answer = result['messages'][-1].content
|
| 58 |
-
print(f"GT: {ground_truth}")
|
| 59 |
-
print(f"Ans: {answer}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
debug_q1_simple.py
DELETED
|
@@ -1,18 +0,0 @@
|
|
| 1 |
-
import os
|
| 2 |
-
from dotenv import load_dotenv
|
| 3 |
-
load_dotenv(override=True)
|
| 4 |
-
|
| 5 |
-
from langchain_core.messages import HumanMessage
|
| 6 |
-
from agent import build_graph
|
| 7 |
-
|
| 8 |
-
# Initialize agent
|
| 9 |
-
graph = build_graph()
|
| 10 |
-
|
| 11 |
-
# Q1
|
| 12 |
-
question = "How many studio albums were published by Mercedes Sosa between 2000 and 2009 (included)? You can use web search."
|
| 13 |
-
|
| 14 |
-
result = graph.invoke({"messages": [HumanMessage(content=question)]})
|
| 15 |
-
|
| 16 |
-
# Just print the final answer
|
| 17 |
-
answer = result['messages'][-1].content
|
| 18 |
-
print(f"Answer: {answer}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
debug_q1_simple2.py
DELETED
|
@@ -1,12 +0,0 @@
|
|
| 1 |
-
import os
|
| 2 |
-
from dotenv import load_dotenv
|
| 3 |
-
load_dotenv(override=True)
|
| 4 |
-
|
| 5 |
-
from langchain_core.messages import HumanMessage
|
| 6 |
-
from agent import build_graph
|
| 7 |
-
|
| 8 |
-
graph = build_graph()
|
| 9 |
-
|
| 10 |
-
question = "How many studio albums were published by Mercedes Sosa between 2000 and 2009 (included)? You can use web search."
|
| 11 |
-
result = graph.invoke({"messages": [HumanMessage(content=question)]})
|
| 12 |
-
print(f"Answer: {result['messages'][-1].content}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
debug_q1_trace.py
DELETED
|
@@ -1,22 +0,0 @@
|
|
| 1 |
-
import os
|
| 2 |
-
from dotenv import load_dotenv
|
| 3 |
-
load_dotenv(override=True)
|
| 4 |
-
|
| 5 |
-
from langchain_core.messages import HumanMessage
|
| 6 |
-
from agent import build_graph
|
| 7 |
-
|
| 8 |
-
# Initialize agent
|
| 9 |
-
graph = build_graph()
|
| 10 |
-
|
| 11 |
-
# Q1
|
| 12 |
-
question = "How many studio albums were published by Mercedes Sosa between 2000 and 2009 (included)? You can use web search."
|
| 13 |
-
|
| 14 |
-
result = graph.invoke({"messages": [HumanMessage(content=question)]})
|
| 15 |
-
|
| 16 |
-
# Print key messages
|
| 17 |
-
for i, msg in enumerate(result['messages']):
|
| 18 |
-
if hasattr(msg, 'content'):
|
| 19 |
-
content = msg.content[:600] if len(msg.content) > 600 else msg.content
|
| 20 |
-
print(f"=== Msg {i} ===")
|
| 21 |
-
print(content)
|
| 22 |
-
print("-" * 40)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
debug_q1_trace2.py
DELETED
|
@@ -1,19 +0,0 @@
|
|
| 1 |
-
import os
|
| 2 |
-
from dotenv import load_dotenv
|
| 3 |
-
load_dotenv(override=True)
|
| 4 |
-
|
| 5 |
-
from langchain_core.messages import HumanMessage
|
| 6 |
-
from agent import build_graph
|
| 7 |
-
|
| 8 |
-
graph = build_graph()
|
| 9 |
-
|
| 10 |
-
# Test Q1 to see what's happening
|
| 11 |
-
question = "How many studio albums were published by Mercedes Sosa between 2000 and 2009 (included)? You can use web search."
|
| 12 |
-
|
| 13 |
-
result = graph.invoke({"messages": [HumanMessage(content=question)]})
|
| 14 |
-
|
| 15 |
-
# Print all messages
|
| 16 |
-
for i, msg in enumerate(result['messages']):
|
| 17 |
-
if hasattr(msg, 'content'):
|
| 18 |
-
print(f"Msg {i}: {msg.content[:300]}")
|
| 19 |
-
print("-" * 30)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
debug_q1_v2.py
DELETED
|
@@ -1,16 +0,0 @@
|
|
| 1 |
-
from ddgs import DDGS
|
| 2 |
-
|
| 3 |
-
# Q1 - simpler search
|
| 4 |
-
keywords = 'Mercedes Sosa albums 2000-2009 discography'
|
| 5 |
-
|
| 6 |
-
with DDGS() as ddgs:
|
| 7 |
-
results = ddgs.text(keywords, max_results=10)
|
| 8 |
-
for r in results:
|
| 9 |
-
try:
|
| 10 |
-
title = r['title'].encode('ascii', 'replace').decode('ascii')
|
| 11 |
-
body = r['body'][:600].encode('ascii', 'replace').decode('ascii')
|
| 12 |
-
print(f"Title: {title}")
|
| 13 |
-
print(f"Body: {body}")
|
| 14 |
-
print("-" * 40)
|
| 15 |
-
except:
|
| 16 |
-
pass
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
debug_q2.py
DELETED
|
@@ -1,7 +0,0 @@
|
|
| 1 |
-
from agent import web_search
|
| 2 |
-
|
| 3 |
-
# Q2 question
|
| 4 |
-
q = "highest number of times a player has bowled a 300 game in the US"
|
| 5 |
-
|
| 6 |
-
ws = web_search.invoke({"keywords": q})
|
| 7 |
-
print(ws[:3000])
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
debug_q2_answer.py
DELETED
|
@@ -1,16 +0,0 @@
|
|
| 1 |
-
from ddgs import DDGS
|
| 2 |
-
|
| 3 |
-
# Find the exact number
|
| 4 |
-
keywords = '"Spy in the Snow" BBC bird species simultaneously record number'
|
| 5 |
-
|
| 6 |
-
with DDGS() as ddgs:
|
| 7 |
-
results = ddgs.text(keywords, max_results=15)
|
| 8 |
-
for r in results:
|
| 9 |
-
try:
|
| 10 |
-
title = r['title'].encode('ascii', 'replace').decode('ascii')
|
| 11 |
-
body = r['body'][:600].encode('ascii', 'replace').decode('ascii')
|
| 12 |
-
print(f"Title: {title}")
|
| 13 |
-
print(f"Body: {body}")
|
| 14 |
-
print("-" * 40)
|
| 15 |
-
except:
|
| 16 |
-
pass
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
debug_q2_answer2.py
DELETED
|
@@ -1,16 +0,0 @@
|
|
| 1 |
-
from ddgs import DDGS
|
| 2 |
-
|
| 3 |
-
# Search for specific answer
|
| 4 |
-
keywords = 'Spy in the Snow "bird species" number simultaneous camera'
|
| 5 |
-
|
| 6 |
-
with DDGS() as ddgs:
|
| 7 |
-
results = ddgs.text(keywords, max_results=20)
|
| 8 |
-
for r in results:
|
| 9 |
-
try:
|
| 10 |
-
title = r['title'].encode('ascii', 'replace').decode('ascii')
|
| 11 |
-
body = r['body'][:800].encode('ascii', 'replace').decode('ascii')
|
| 12 |
-
print(f"Title: {title}")
|
| 13 |
-
print(f"Body: {body}")
|
| 14 |
-
print("-" * 40)
|
| 15 |
-
except:
|
| 16 |
-
pass
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
debug_q2_better.py
DELETED
|
@@ -1,16 +0,0 @@
|
|
| 1 |
-
from ddgs import DDGS
|
| 2 |
-
|
| 3 |
-
# Better search with known answer
|
| 4 |
-
keywords = '"Spy in the Snow" "petrel" "Adelie" "emperor penguin" species three'
|
| 5 |
-
|
| 6 |
-
with DDGS() as ddgs:
|
| 7 |
-
results = ddgs.text(keywords, max_results=10)
|
| 8 |
-
for r in results:
|
| 9 |
-
try:
|
| 10 |
-
title = r['title'].encode('ascii', 'replace').decode('ascii')
|
| 11 |
-
body = r['body'][:600].encode('ascii', 'replace').decode('ascii')
|
| 12 |
-
print(f"Title: {title}")
|
| 13 |
-
print(f"Body: {body}")
|
| 14 |
-
print("-" * 40)
|
| 15 |
-
except:
|
| 16 |
-
pass
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
debug_q2_exact.py
DELETED
|
@@ -1,16 +0,0 @@
|
|
| 1 |
-
from ddgs import DDGS
|
| 2 |
-
|
| 3 |
-
# Try to find the exact answer for the video
|
| 4 |
-
keywords = 'BBC Spy in the Snow highest number bird species simultaneously'
|
| 5 |
-
|
| 6 |
-
with DDGS() as ddgs:
|
| 7 |
-
results = ddgs.text(keywords, max_results=30)
|
| 8 |
-
for r in results:
|
| 9 |
-
try:
|
| 10 |
-
title = r['title'].encode('ascii', 'replace').decode('ascii')
|
| 11 |
-
body = r['body'][:1000].encode('ascii', 'replace').decode('ascii')
|
| 12 |
-
print(f"Title: {title}")
|
| 13 |
-
print(f"Body: {body}")
|
| 14 |
-
print("-" * 60)
|
| 15 |
-
except:
|
| 16 |
-
pass
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
debug_q2_final.py
DELETED
|
@@ -1,16 +0,0 @@
|
|
| 1 |
-
from ddgs import DDGS
|
| 2 |
-
|
| 3 |
-
# Now we know it's about bird species!
|
| 4 |
-
keywords = 'BBC "L1vXCYZAYYM" bird species record'
|
| 5 |
-
|
| 6 |
-
with DDGS() as ddgs:
|
| 7 |
-
results = ddgs.text(keywords, max_results=10)
|
| 8 |
-
for r in results:
|
| 9 |
-
try:
|
| 10 |
-
title = r['title'].encode('ascii', 'replace').decode('ascii')
|
| 11 |
-
body = r['body'][:500].encode('ascii', 'replace').decode('ascii')
|
| 12 |
-
print(f"Title: {title}")
|
| 13 |
-
print(f"Body: {body}")
|
| 14 |
-
print("-" * 40)
|
| 15 |
-
except:
|
| 16 |
-
pass
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
debug_q2_final2.py
DELETED
|
@@ -1,17 +0,0 @@
|
|
| 1 |
-
from ddgs import DDGS
|
| 2 |
-
|
| 3 |
-
# Try even more specific
|
| 4 |
-
keywords = '"highest number of bird species" "simultaneously"'
|
| 5 |
-
|
| 6 |
-
with DDGS() as ddgs:
|
| 7 |
-
results = ddgs.text(keywords, max_results=30)
|
| 8 |
-
for r in results:
|
| 9 |
-
try:
|
| 10 |
-
title = r['title'].encode('ascii', 'replace').decode('ascii')
|
| 11 |
-
body = r['body'][:1200].encode('ascii', 'replace').decode('ascii')
|
| 12 |
-
if '3' in body or 'three' in body.lower() or 'record' in body.lower():
|
| 13 |
-
print(f"Title: {title}")
|
| 14 |
-
print(f"Body: {body}")
|
| 15 |
-
print("-" * 60)
|
| 16 |
-
except:
|
| 17 |
-
pass
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
debug_q2_most_direct.py
DELETED
|
@@ -1,16 +0,0 @@
|
|
| 1 |
-
from ddgs import DDGS
|
| 2 |
-
|
| 3 |
-
# Most direct search for the answer
|
| 4 |
-
keywords = 'Spy in the Snow BBC bird species three petrel Adelie emperor penguins simultaneous'
|
| 5 |
-
|
| 6 |
-
with DDGS() as ddgs:
|
| 7 |
-
results = ddgs.text(keywords, max_results=15)
|
| 8 |
-
for r in results:
|
| 9 |
-
try:
|
| 10 |
-
title = r['title'].encode('ascii', 'replace').decode('ascii')
|
| 11 |
-
body = r['body'][:800].encode('ascii', 'replace').decode('ascii')
|
| 12 |
-
print(f"Title: {title}")
|
| 13 |
-
print(f"Body: {body}")
|
| 14 |
-
print("-" * 60)
|
| 15 |
-
except:
|
| 16 |
-
pass
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
debug_q2_trace.py
DELETED
|
@@ -1,21 +0,0 @@
|
|
| 1 |
-
import os
|
| 2 |
-
from dotenv import load_dotenv
|
| 3 |
-
load_dotenv(override=True)
|
| 4 |
-
|
| 5 |
-
from langchain_core.messages import HumanMessage
|
| 6 |
-
from agent import build_graph
|
| 7 |
-
|
| 8 |
-
# Initialize agent
|
| 9 |
-
graph = build_graph()
|
| 10 |
-
|
| 11 |
-
# Q2
|
| 12 |
-
question = "In the video https://www.youtube.com/watch?v=L1vXCYZAYYM, what is the highest number of bird species to be on camera simultaneously?"
|
| 13 |
-
|
| 14 |
-
result = graph.invoke({"messages": [HumanMessage(content=question)]})
|
| 15 |
-
|
| 16 |
-
# Print all messages
|
| 17 |
-
for i, msg in enumerate(result['messages']):
|
| 18 |
-
if hasattr(msg, 'content'):
|
| 19 |
-
content = msg.content[:500] if len(msg.content) > 500 else msg.content
|
| 20 |
-
print(f"Msg {i}: {content}")
|
| 21 |
-
print("-" * 40)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
debug_q2_trace2.py
DELETED
|
@@ -1,23 +0,0 @@
|
|
| 1 |
-
import os
|
| 2 |
-
from dotenv import load_dotenv
|
| 3 |
-
load_dotenv(override=True)
|
| 4 |
-
|
| 5 |
-
from langchain_core.messages import HumanMessage
|
| 6 |
-
from agent import build_graph
|
| 7 |
-
|
| 8 |
-
# Initialize agent
|
| 9 |
-
graph = build_graph()
|
| 10 |
-
|
| 11 |
-
# Q2
|
| 12 |
-
question = "In the video https://www.youtube.com/watch?v=L1vXCYZAYYM, what is the highest number of bird species to be on camera simultaneously?"
|
| 13 |
-
|
| 14 |
-
result = graph.invoke({"messages": [HumanMessage(content=question)]})
|
| 15 |
-
|
| 16 |
-
# Find what search results were passed to final LLM
|
| 17 |
-
for i, msg in enumerate(result['messages']):
|
| 18 |
-
if hasattr(msg, 'content'):
|
| 19 |
-
content = msg.content
|
| 20 |
-
if 'Search results:' in content or 'QUESTION:' in content.upper():
|
| 21 |
-
print(f"Msg {i} (to LLM):")
|
| 22 |
-
print(content[:1500])
|
| 23 |
-
print("-" * 60)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
debug_q2_trace3.py
DELETED
|
@@ -1,22 +0,0 @@
|
|
| 1 |
-
import os
|
| 2 |
-
from dotenv import load_dotenv
|
| 3 |
-
load_dotenv(override=True)
|
| 4 |
-
|
| 5 |
-
from langchain_core.messages import HumanMessage
|
| 6 |
-
from agent import build_graph
|
| 7 |
-
|
| 8 |
-
# Initialize agent
|
| 9 |
-
graph = build_graph()
|
| 10 |
-
|
| 11 |
-
# Q2
|
| 12 |
-
question = "In the video https://www.youtube.com/watch?v=L1vXCYZAYYM, what is the highest number of bird species to be on camera simultaneously?"
|
| 13 |
-
|
| 14 |
-
result = graph.invoke({"messages": [HumanMessage(content=question)]})
|
| 15 |
-
|
| 16 |
-
# Print all messages
|
| 17 |
-
for i, msg in enumerate(result['messages']):
|
| 18 |
-
if hasattr(msg, 'content'):
|
| 19 |
-
content = msg.content[:800] if len(msg.content) > 800 else msg.content
|
| 20 |
-
print(f"=== Msg {i} ===")
|
| 21 |
-
print(content)
|
| 22 |
-
print("-" * 60)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
debug_q2_v2.py
DELETED
|
@@ -1,11 +0,0 @@
|
|
| 1 |
-
from ddgs import DDGS
|
| 2 |
-
|
| 3 |
-
# Q2 search
|
| 4 |
-
keywords = "YouTube video L1vXCYZAYYM highest number 300 game bowling"
|
| 5 |
-
|
| 6 |
-
with DDGS() as ddgs:
|
| 7 |
-
results = ddgs.text(keywords, max_results=10)
|
| 8 |
-
for r in results:
|
| 9 |
-
print(f"Title: {r['title']}")
|
| 10 |
-
print(f"Body: {r['body'][:500]}")
|
| 11 |
-
print("-" * 40)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
debug_q2_v3.py
DELETED
|
@@ -1,14 +0,0 @@
|
|
| 1 |
-
from ddgs import DDGS
|
| 2 |
-
|
| 3 |
-
# More specific search
|
| 4 |
-
keywords = "most 300 games bowling US player record"
|
| 5 |
-
|
| 6 |
-
with DDGS() as ddgs:
|
| 7 |
-
results = ddgs.text(keywords, max_results=10)
|
| 8 |
-
for r in results:
|
| 9 |
-
try:
|
| 10 |
-
print(f"Title: {r['title'].encode('ascii', 'replace').decode('ascii')}")
|
| 11 |
-
print(f"Body: {r['body'][:300].encode('ascii', 'replace').decode('ascii')}")
|
| 12 |
-
print("-" * 40)
|
| 13 |
-
except:
|
| 14 |
-
pass
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
debug_q2_v4.py
DELETED
|
@@ -1,14 +0,0 @@
|
|
| 1 |
-
from ddgs import DDGS
|
| 2 |
-
|
| 3 |
-
# Search for the specific video content
|
| 4 |
-
keywords = "L1vXCYZAYYM youtube bowling 300"
|
| 5 |
-
|
| 6 |
-
with DDGS() as ddgs:
|
| 7 |
-
results = ddgs.text(keywords, max_results=10)
|
| 8 |
-
for r in results:
|
| 9 |
-
try:
|
| 10 |
-
print(f"Title: {r['title'].encode('ascii', 'replace').decode('ascii')}")
|
| 11 |
-
print(f"Body: {r['body'][:400].encode('ascii', 'replace').decode('ascii')}")
|
| 12 |
-
print("-" * 40)
|
| 13 |
-
except:
|
| 14 |
-
pass
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
debug_q2_v5.py
DELETED
|
@@ -1,16 +0,0 @@
|
|
| 1 |
-
from ddgs import DDGS
|
| 2 |
-
|
| 3 |
-
# Try different video ID format
|
| 4 |
-
keywords = '"L1vXCYZAYYM" video'
|
| 5 |
-
|
| 6 |
-
with DDGS() as ddgs:
|
| 7 |
-
results = ddgs.text(keywords, max_results=10)
|
| 8 |
-
for r in results:
|
| 9 |
-
try:
|
| 10 |
-
title = r['title'].encode('ascii', 'replace').decode('ascii')
|
| 11 |
-
body = r['body'][:400].encode('ascii', 'replace').decode('ascii')
|
| 12 |
-
print(f"Title: {title}")
|
| 13 |
-
print(f"Body: {body}")
|
| 14 |
-
print("-" * 40)
|
| 15 |
-
except:
|
| 16 |
-
pass
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
debug_search.py
DELETED
|
@@ -1,31 +0,0 @@
|
|
| 1 |
-
import os
|
| 2 |
-
import requests
|
| 3 |
-
from langchain_core.messages import HumanMessage
|
| 4 |
-
from agent import web_search, wiki_search, analyze_counting_question
|
| 5 |
-
from huggingface_hub import hf_hub_download
|
| 6 |
-
import pyarrow.parquet as pq
|
| 7 |
-
from dotenv import load_dotenv
|
| 8 |
-
|
| 9 |
-
load_dotenv(override=True)
|
| 10 |
-
|
| 11 |
-
DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
|
| 12 |
-
|
| 13 |
-
# Test Q1
|
| 14 |
-
question = "How many studio albums were published by Mercedes Sosa between 2000 and 2009 (included)? You can use web search."
|
| 15 |
-
|
| 16 |
-
# Do searches
|
| 17 |
-
search = web_search.invoke({"keywords": question[:200]})
|
| 18 |
-
print("WEB SEARCH:")
|
| 19 |
-
print(search[:1000].encode('ascii', 'replace').decode('ascii'))
|
| 20 |
-
print()
|
| 21 |
-
|
| 22 |
-
wiki = wiki_search.invoke({"query": question[:100]})
|
| 23 |
-
print("WIKIPEDIA:")
|
| 24 |
-
print(wiki[:1000].encode('ascii', 'replace').decode('ascii'))
|
| 25 |
-
print()
|
| 26 |
-
|
| 27 |
-
# Try analysis
|
| 28 |
-
all_search = f"WEB SEARCH:\n{search}\nWIKIPEDIA:\n{wiki}"
|
| 29 |
-
analysis = analyze_counting_question.invoke({"query": question, "search_results": all_search})
|
| 30 |
-
print("ANALYSIS:")
|
| 31 |
-
print(analysis.encode('ascii', 'replace').decode('ascii'))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
debug_test.py
DELETED
|
@@ -1,51 +0,0 @@
|
|
| 1 |
-
import os
|
| 2 |
-
import requests
|
| 3 |
-
from langchain_core.messages import HumanMessage
|
| 4 |
-
from agent import build_graph
|
| 5 |
-
from huggingface_hub import hf_hub_download
|
| 6 |
-
import pyarrow.parquet as pq
|
| 7 |
-
from dotenv import load_dotenv
|
| 8 |
-
|
| 9 |
-
load_dotenv(override=True)
|
| 10 |
-
|
| 11 |
-
DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
|
| 12 |
-
|
| 13 |
-
# Initialize agent
|
| 14 |
-
graph = build_graph()
|
| 15 |
-
|
| 16 |
-
# Fetch questions
|
| 17 |
-
resp = requests.get(f"{DEFAULT_API_URL}/questions")
|
| 18 |
-
questions = resp.json()
|
| 19 |
-
|
| 20 |
-
# Load ground truth
|
| 21 |
-
token = os.getenv("HF_TOKEN") or os.getenv("HUGGINGFACEHUB_API_TOKEN")
|
| 22 |
-
path = hf_hub_download(repo_id='gaia-benchmark/GAIA', filename='2023/validation/metadata.parquet', repo_type='dataset', token=token)
|
| 23 |
-
df = pq.read_table(path).to_pandas()
|
| 24 |
-
answer_map = dict(zip(df['task_id'], df['Final answer']))
|
| 25 |
-
|
| 26 |
-
# Test questions 3-5 specifically
|
| 27 |
-
for i in [2, 3, 4]:
|
| 28 |
-
q = questions[i]
|
| 29 |
-
task_id = q['task_id']
|
| 30 |
-
question = q['question']
|
| 31 |
-
file_name = q.get('file_name')
|
| 32 |
-
ground_truth = answer_map.get(task_id, "NOT FOUND")
|
| 33 |
-
|
| 34 |
-
print(f"\nQ{i+1}: {question[:80]}...")
|
| 35 |
-
print(f"File: {file_name}")
|
| 36 |
-
print(f"Ground Truth: {ground_truth}")
|
| 37 |
-
|
| 38 |
-
result = graph.invoke({"messages": [HumanMessage(content=question)]})
|
| 39 |
-
|
| 40 |
-
# Print all messages
|
| 41 |
-
for j, msg in enumerate(result['messages']):
|
| 42 |
-
if hasattr(msg, 'content'):
|
| 43 |
-
content = msg.content
|
| 44 |
-
if len(content) > 200:
|
| 45 |
-
content = content[:200] + "..."
|
| 46 |
-
print(f" Msg {j}: {content}")
|
| 47 |
-
|
| 48 |
-
answer = result['messages'][-1].content
|
| 49 |
-
print(f"Agent Answer: {answer}")
|
| 50 |
-
is_correct = answer.strip().lower() == str(ground_truth).strip().lower()
|
| 51 |
-
print(f"Result: {'CORRECT' if is_correct else 'WRONG'}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
debug_wiki.py
DELETED
|
@@ -1,15 +0,0 @@
|
|
| 1 |
-
from agent import web_search, wiki_search
|
| 2 |
-
|
| 3 |
-
# Q5 question
|
| 4 |
-
q = "Who nominated the only Featured Article on English Wikipedia about a dinosaur that was promoted in November 2016?"
|
| 5 |
-
|
| 6 |
-
# Test searches
|
| 7 |
-
ws = web_search.invoke({"keywords": q[:200]})
|
| 8 |
-
print("WEB SEARCH:")
|
| 9 |
-
print(ws[:1500])
|
| 10 |
-
print()
|
| 11 |
-
|
| 12 |
-
# Try Wikipedia
|
| 13 |
-
wik = wiki_search.invoke({"query": "Giganotosaurus featured article nomination"})
|
| 14 |
-
print("WIKI:")
|
| 15 |
-
print(wik[:1500])
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
debug_wiki2.py
DELETED
|
@@ -1,8 +0,0 @@
|
|
| 1 |
-
from agent import web_search, wiki_search
|
| 2 |
-
|
| 3 |
-
# Q5 - more specific search
|
| 4 |
-
q = "Featured Article dinosaur November 2016 Wikipedia nomination"
|
| 5 |
-
|
| 6 |
-
ws = web_search.invoke({"keywords": q})
|
| 7 |
-
print("WEB SEARCH:")
|
| 8 |
-
print(ws[:2000])
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
debug_wiki3.py
DELETED
|
@@ -1,7 +0,0 @@
|
|
| 1 |
-
from agent import web_search
|
| 2 |
-
|
| 3 |
-
# Better search for Wikipedia question
|
| 4 |
-
q = "Wikipedia Featured Article dinosaur November 2016 nominating user"
|
| 5 |
-
|
| 6 |
-
ws = web_search.invoke({"keywords": q})
|
| 7 |
-
print(ws[:3000])
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
debug_wiki4.py
DELETED
|
@@ -1,7 +0,0 @@
|
|
| 1 |
-
from agent import web_search
|
| 2 |
-
|
| 3 |
-
# Very specific search
|
| 4 |
-
q = '"FunkMonk" Wikipedia featured article dinosaur'
|
| 5 |
-
|
| 6 |
-
ws = web_search.invoke({"keywords": q})
|
| 7 |
-
print(ws[:2000])
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
debug_youtube.py
DELETED
|
@@ -1,15 +0,0 @@
|
|
| 1 |
-
from agent import get_youtube_transcript, web_search
|
| 2 |
-
|
| 3 |
-
# Q2 - YouTube
|
| 4 |
-
url2 = "https://www.youtube.com/watch?v=L1vXCYZAYYM"
|
| 5 |
-
transcript = get_youtube_transcript.invoke({"url": url2})
|
| 6 |
-
print("Q2 Transcript:", transcript[:500])
|
| 7 |
-
|
| 8 |
-
# Q7 - YouTube
|
| 9 |
-
url7 = "https://www.youtube.com/watch?v=1htKBjuUWec"
|
| 10 |
-
transcript7 = get_youtube_transcript.invoke({"url": url7})
|
| 11 |
-
print("\nQ7 Transcript:", transcript7[:500])
|
| 12 |
-
|
| 13 |
-
# Also search web for content
|
| 14 |
-
ws = web_search.invoke({"keywords": "Stargate SG-1 Urgo Teal'c hot scene response"})
|
| 15 |
-
print("\nWeb search:", ws[:500])
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
test_10.py
DELETED
|
@@ -1,49 +0,0 @@
|
|
| 1 |
-
import os
|
| 2 |
-
import requests
|
| 3 |
-
from langchain_core.messages import HumanMessage
|
| 4 |
-
from agent import build_graph
|
| 5 |
-
from huggingface_hub import hf_hub_download
|
| 6 |
-
import pyarrow.parquet as pq
|
| 7 |
-
from dotenv import load_dotenv
|
| 8 |
-
|
| 9 |
-
load_dotenv(override=True)
|
| 10 |
-
|
| 11 |
-
DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
|
| 12 |
-
|
| 13 |
-
# Initialize agent
|
| 14 |
-
graph = build_graph()
|
| 15 |
-
|
| 16 |
-
# Fetch 10 questions
|
| 17 |
-
resp = requests.get(f"{DEFAULT_API_URL}/questions")
|
| 18 |
-
questions = resp.json()[:10]
|
| 19 |
-
|
| 20 |
-
# Load ground truth
|
| 21 |
-
token = os.getenv("HF_TOKEN") or os.getenv("HUGGINGFACEHUB_API_TOKEN")
|
| 22 |
-
path = hf_hub_download(repo_id='gaia-benchmark/GAIA', filename='2023/validation/metadata.parquet', repo_type='dataset', token=token)
|
| 23 |
-
df = pq.read_table(path).to_pandas()
|
| 24 |
-
answer_map = dict(zip(df['task_id'], df['Final answer']))
|
| 25 |
-
|
| 26 |
-
correct = 0
|
| 27 |
-
total = 0
|
| 28 |
-
|
| 29 |
-
for q in questions:
|
| 30 |
-
task_id = q['task_id']
|
| 31 |
-
question = q['question']
|
| 32 |
-
file_name = q.get('file_name')
|
| 33 |
-
ground_truth = answer_map.get(task_id, "NOT FOUND")
|
| 34 |
-
|
| 35 |
-
print(f"\nQ{total+1}: {question[:60]}...")
|
| 36 |
-
print(f"File: {file_name}")
|
| 37 |
-
print(f"GT: {ground_truth}")
|
| 38 |
-
|
| 39 |
-
result = graph.invoke({"messages": [HumanMessage(content=question)]})
|
| 40 |
-
answer = result['messages'][-1].content
|
| 41 |
-
print(f"Ans: {answer[:50]}")
|
| 42 |
-
|
| 43 |
-
is_correct = answer.strip().lower() == str(ground_truth).strip().lower()
|
| 44 |
-
if is_correct:
|
| 45 |
-
correct += 1
|
| 46 |
-
total += 1
|
| 47 |
-
print(f"{'CORRECT' if is_correct else 'WRONG'}")
|
| 48 |
-
|
| 49 |
-
print(f"\n=== Score: {correct}/{total} = {correct/total*100:.0f}% ===")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
test_11_20.py
DELETED
|
@@ -1,51 +0,0 @@
|
|
| 1 |
-
import os
|
| 2 |
-
import requests
|
| 3 |
-
from langchain_core.messages import HumanMessage
|
| 4 |
-
from agent import build_graph
|
| 5 |
-
from huggingface_hub import hf_hub_download
|
| 6 |
-
import pyarrow.parquet as pq
|
| 7 |
-
from dotenv import load_dotenv
|
| 8 |
-
|
| 9 |
-
load_dotenv(override=True)
|
| 10 |
-
|
| 11 |
-
DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
|
| 12 |
-
|
| 13 |
-
# Initialize agent
|
| 14 |
-
graph = build_graph()
|
| 15 |
-
|
| 16 |
-
# Fetch questions 11-20
|
| 17 |
-
resp = requests.get(f"{DEFAULT_API_URL}/questions")
|
| 18 |
-
questions = resp.json()[10:20]
|
| 19 |
-
|
| 20 |
-
# Load ground truth
|
| 21 |
-
token = os.getenv("HF_TOKEN") or os.getenv("HUGGINGFACEHUB_API_TOKEN")
|
| 22 |
-
path = hf_hub_download(repo_id='gaia-benchmark/GAIA', filename='2023/validation/metadata.parquet', repo_type='dataset', token=token)
|
| 23 |
-
df = pq.read_table(path).to_pandas()
|
| 24 |
-
answer_map = dict(zip(df['task_id'], df['Final answer']))
|
| 25 |
-
|
| 26 |
-
correct = 0
|
| 27 |
-
total = 0
|
| 28 |
-
|
| 29 |
-
for i, q in enumerate(questions):
|
| 30 |
-
task_id = q['task_id']
|
| 31 |
-
question = q['question']
|
| 32 |
-
file_name = q.get('file_name')
|
| 33 |
-
ground_truth = answer_map.get(task_id, "NOT FOUND")
|
| 34 |
-
|
| 35 |
-
print(f"\n[{i+11}] ", end="")
|
| 36 |
-
|
| 37 |
-
result = graph.invoke({"messages": [HumanMessage(content=question)]})
|
| 38 |
-
answer = result['messages'][-1].content
|
| 39 |
-
|
| 40 |
-
try:
|
| 41 |
-
print(f"Ans: {answer[:30].encode('ascii', 'replace').decode('ascii')}")
|
| 42 |
-
except:
|
| 43 |
-
print(f"Ans: [encoding issue]")
|
| 44 |
-
|
| 45 |
-
is_correct = answer.strip().lower() == str(ground_truth).strip().lower()
|
| 46 |
-
if is_correct:
|
| 47 |
-
correct += 1
|
| 48 |
-
total += 1
|
| 49 |
-
print(f" {'CORRECT' if is_correct else 'WRONG'} (GT: {str(ground_truth)[:20]})")
|
| 50 |
-
|
| 51 |
-
print(f"\n=== Score: {correct}/{total} = {correct/total*100:.0f}% ===")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
test_5.py
DELETED
|
@@ -1,47 +0,0 @@
|
|
| 1 |
-
import os
|
| 2 |
-
import requests
|
| 3 |
-
from langchain_core.messages import HumanMessage
|
| 4 |
-
from agent import build_graph
|
| 5 |
-
from huggingface_hub import hf_hub_download
|
| 6 |
-
import pyarrow.parquet as pq
|
| 7 |
-
from dotenv import load_dotenv
|
| 8 |
-
|
| 9 |
-
load_dotenv(override=True)
|
| 10 |
-
|
| 11 |
-
DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
|
| 12 |
-
|
| 13 |
-
# Initialize agent
|
| 14 |
-
graph = build_graph()
|
| 15 |
-
|
| 16 |
-
# Fetch 5 questions
|
| 17 |
-
resp = requests.get(f"{DEFAULT_API_URL}/questions")
|
| 18 |
-
questions = resp.json()[:5]
|
| 19 |
-
|
| 20 |
-
# Load ground truth
|
| 21 |
-
token = os.getenv("HF_TOKEN") or os.getenv("HUGGINGFACEHUB_API_TOKEN")
|
| 22 |
-
path = hf_hub_download(repo_id='gaia-benchmark/GAIA', filename='2023/validation/metadata.parquet', repo_type='dataset', token=token)
|
| 23 |
-
df = pq.read_table(path).to_pandas()
|
| 24 |
-
answer_map = dict(zip(df['task_id'], df['Final answer']))
|
| 25 |
-
|
| 26 |
-
correct = 0
|
| 27 |
-
total = 0
|
| 28 |
-
|
| 29 |
-
for q in questions:
|
| 30 |
-
task_id = q['task_id']
|
| 31 |
-
question = q['question']
|
| 32 |
-
ground_truth = answer_map.get(task_id, "NOT FOUND")
|
| 33 |
-
|
| 34 |
-
print(f"\nQ{total+1}: {question[:80]}...")
|
| 35 |
-
print(f"Ground Truth: {ground_truth}")
|
| 36 |
-
|
| 37 |
-
result = graph.invoke({"messages": [HumanMessage(content=question)]})
|
| 38 |
-
answer = result['messages'][-1].content
|
| 39 |
-
print(f"Agent Answer: {answer}")
|
| 40 |
-
|
| 41 |
-
is_correct = answer.strip().lower() == str(ground_truth).strip().lower()
|
| 42 |
-
if is_correct:
|
| 43 |
-
correct += 1
|
| 44 |
-
total += 1
|
| 45 |
-
print(f"Result: {'CORRECT' if is_correct else 'WRONG'}")
|
| 46 |
-
|
| 47 |
-
print(f"\n=== Score: {correct}/{total} = {correct/total*100:.0f}% ===")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
test_all.py
DELETED
|
@@ -1,52 +0,0 @@
|
|
| 1 |
-
import os
|
| 2 |
-
import requests
|
| 3 |
-
import time
|
| 4 |
-
from langchain_core.messages import HumanMessage
|
| 5 |
-
from agent import build_graph
|
| 6 |
-
from huggingface_hub import hf_hub_download
|
| 7 |
-
import pyarrow.parquet as pq
|
| 8 |
-
from dotenv import load_dotenv
|
| 9 |
-
|
| 10 |
-
load_dotenv(override=True)
|
| 11 |
-
|
| 12 |
-
DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
|
| 13 |
-
|
| 14 |
-
# Initialize agent
|
| 15 |
-
graph = build_graph()
|
| 16 |
-
|
| 17 |
-
# Fetch ALL questions
|
| 18 |
-
resp = requests.get(f"{DEFAULT_API_URL}/questions")
|
| 19 |
-
questions = resp.json()
|
| 20 |
-
|
| 21 |
-
# Load ground truth
|
| 22 |
-
token = os.getenv("HF_TOKEN") or os.getenv("HUGGINGFACEHUB_API_TOKEN")
|
| 23 |
-
path = hf_hub_download(repo_id='gaia-benchmark/GAIA', filename='2023/validation/metadata.parquet', repo_type='dataset', token=token)
|
| 24 |
-
df = pq.read_table(path).to_pandas()
|
| 25 |
-
answer_map = dict(zip(df['task_id'], df['Final answer']))
|
| 26 |
-
|
| 27 |
-
correct = 0
|
| 28 |
-
total = 0
|
| 29 |
-
|
| 30 |
-
for q in questions:
|
| 31 |
-
task_id = q['task_id']
|
| 32 |
-
question = q['question']
|
| 33 |
-
file_name = q.get('file_name')
|
| 34 |
-
ground_truth = answer_map.get(task_id, "NOT FOUND")
|
| 35 |
-
|
| 36 |
-
print(f"\n[{total+1}/{len(questions)}] {question[:50]}...")
|
| 37 |
-
|
| 38 |
-
result = graph.invoke({"messages": [HumanMessage(content=question)]})
|
| 39 |
-
answer = result['messages'][-1].content
|
| 40 |
-
|
| 41 |
-
is_correct = answer.strip().lower() == str(ground_truth).strip().lower()
|
| 42 |
-
if is_correct:
|
| 43 |
-
correct += 1
|
| 44 |
-
total += 1
|
| 45 |
-
|
| 46 |
-
status = "✅" if is_correct else "❌"
|
| 47 |
-
print(f" {status} GT: {str(ground_truth)[:30]}")
|
| 48 |
-
print(f" Ans: {answer[:50]}")
|
| 49 |
-
|
| 50 |
-
time.sleep(1)
|
| 51 |
-
|
| 52 |
-
print(f"\n=== FINAL SCORE: {correct}/{total} = {correct/total*100:.0f}% ===")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
test_all_v2.py
DELETED
|
@@ -1,44 +0,0 @@
|
|
| 1 |
-
import os
|
| 2 |
-
import requests
|
| 3 |
-
from langchain_core.messages import HumanMessage
|
| 4 |
-
from agent import build_graph
|
| 5 |
-
from huggingface_hub import hf_hub_download
|
| 6 |
-
import pyarrow.parquet as pq
|
| 7 |
-
from dotenv import load_dotenv
|
| 8 |
-
|
| 9 |
-
load_dotenv(override=True)
|
| 10 |
-
|
| 11 |
-
DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
|
| 12 |
-
|
| 13 |
-
graph = build_graph()
|
| 14 |
-
resp = requests.get(f"{DEFAULT_API_URL}/questions")
|
| 15 |
-
questions = resp.json()
|
| 16 |
-
|
| 17 |
-
token = os.getenv("HF_TOKEN") or os.getenv("HUGGINGFACEHUB_API_TOKEN")
|
| 18 |
-
path = hf_hub_download(repo_id='gaia-benchmark/GAIA', filename='2023/validation/metadata.parquet', repo_type='dataset', token=token)
|
| 19 |
-
df = pq.read_table(path).to_pandas()
|
| 20 |
-
answer_map = dict(zip(df['task_id'], df['Final answer']))
|
| 21 |
-
|
| 22 |
-
correct = 0
|
| 23 |
-
total = 0
|
| 24 |
-
|
| 25 |
-
for i, q in enumerate(questions):
|
| 26 |
-
task_id = q['task_id']
|
| 27 |
-
question = q['question']
|
| 28 |
-
ground_truth = answer_map.get(task_id, "NOT FOUND")
|
| 29 |
-
|
| 30 |
-
try:
|
| 31 |
-
result = graph.invoke({"messages": [HumanMessage(content=question)]})
|
| 32 |
-
answer = result['messages'][-1].content
|
| 33 |
-
|
| 34 |
-
is_correct = answer.strip().lower() == str(ground_truth).strip().lower()
|
| 35 |
-
if is_correct:
|
| 36 |
-
correct += 1
|
| 37 |
-
total += 1
|
| 38 |
-
status = "OK" if is_correct else "FAIL"
|
| 39 |
-
print(f"[{i+1:2d}] {status}")
|
| 40 |
-
except Exception as e:
|
| 41 |
-
print(f"[{i+1:2d}] ERROR: {str(e)[:30]}")
|
| 42 |
-
total += 1
|
| 43 |
-
|
| 44 |
-
print(f"\n=== TOTAL: {correct}/{total} = {correct/total*100:.0f}% ===")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
test_improvements.py
DELETED
|
@@ -1,45 +0,0 @@
|
|
| 1 |
-
import os
|
| 2 |
-
import sys
|
| 3 |
-
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
|
| 4 |
-
|
| 5 |
-
from langchain_core.messages import HumanMessage
|
| 6 |
-
from agent import build_graph
|
| 7 |
-
|
| 8 |
-
def test_agent():
|
| 9 |
-
print("=" * 50)
|
| 10 |
-
print("Testing GAIA Agent with improvements...")
|
| 11 |
-
print("=" * 50)
|
| 12 |
-
|
| 13 |
-
# Build the agent graph
|
| 14 |
-
graph = build_graph()
|
| 15 |
-
|
| 16 |
-
# Test question - simple math/reasoning
|
| 17 |
-
test_question = "What is 15 + 27?"
|
| 18 |
-
|
| 19 |
-
print(f"\nQuestion: {test_question}")
|
| 20 |
-
print("-" * 30)
|
| 21 |
-
|
| 22 |
-
messages = [HumanMessage(content=test_question)]
|
| 23 |
-
result = graph.invoke({"messages": messages})
|
| 24 |
-
|
| 25 |
-
answer = result['messages'][-1].content
|
| 26 |
-
print(f"\nFinal Answer: {answer}")
|
| 27 |
-
print("-" * 30)
|
| 28 |
-
|
| 29 |
-
# Test another question requiring web search
|
| 30 |
-
test_question2 = "What is the capital of France?"
|
| 31 |
-
|
| 32 |
-
print(f"\nQuestion: {test_question2}")
|
| 33 |
-
print("-" * 30)
|
| 34 |
-
|
| 35 |
-
messages2 = [HumanMessage(content=test_question2)]
|
| 36 |
-
result2 = graph.invoke({"messages": messages2})
|
| 37 |
-
|
| 38 |
-
answer2 = result2['messages'][-1].content
|
| 39 |
-
print(f"\nFinal Answer: {answer2}")
|
| 40 |
-
print("-" * 30)
|
| 41 |
-
|
| 42 |
-
print("\nTest completed successfully!")
|
| 43 |
-
|
| 44 |
-
if __name__ == "__main__":
|
| 45 |
-
test_agent()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
test_q2.py
DELETED
|
@@ -1,40 +0,0 @@
|
|
| 1 |
-
import os
|
| 2 |
-
import requests
|
| 3 |
-
from langchain_core.messages import HumanMessage
|
| 4 |
-
from agent import build_graph
|
| 5 |
-
from huggingface_hub import hf_hub_download
|
| 6 |
-
import pyarrow.parquet as pq
|
| 7 |
-
from dotenv import load_dotenv
|
| 8 |
-
|
| 9 |
-
load_dotenv(override=True)
|
| 10 |
-
|
| 11 |
-
DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
|
| 12 |
-
|
| 13 |
-
# Initialize agent
|
| 14 |
-
graph = build_graph()
|
| 15 |
-
|
| 16 |
-
# Fetch questions
|
| 17 |
-
resp = requests.get(f"{DEFAULT_API_URL}/questions")
|
| 18 |
-
questions = resp.json()
|
| 19 |
-
|
| 20 |
-
# Load ground truth
|
| 21 |
-
token = os.getenv("HF_TOKEN") or os.getenv("HUGGINGFACEHUB_API_TOKEN")
|
| 22 |
-
path = hf_hub_download(repo_id='gaia-benchmark/GAIA', filename='2023/validation/metadata.parquet', repo_type='dataset', token=token)
|
| 23 |
-
df = pq.read_table(path).to_pandas()
|
| 24 |
-
answer_map = dict(zip(df['task_id'], df['Final answer']))
|
| 25 |
-
|
| 26 |
-
# Test Q2 only
|
| 27 |
-
q = questions[1]
|
| 28 |
-
task_id = q['task_id']
|
| 29 |
-
question = q['question']
|
| 30 |
-
ground_truth = answer_map.get(task_id, "NOT FOUND")
|
| 31 |
-
|
| 32 |
-
print(f"Q2: {question[:80]}...")
|
| 33 |
-
print(f"GT: {ground_truth}")
|
| 34 |
-
print("-" * 40)
|
| 35 |
-
|
| 36 |
-
result = graph.invoke({"messages": [HumanMessage(content=question)]})
|
| 37 |
-
answer = result['messages'][-1].content
|
| 38 |
-
print(f"Ans: {answer}")
|
| 39 |
-
print("-" * 40)
|
| 40 |
-
print(f"Correct: {answer.strip().lower() == str(ground_truth).strip().lower()}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
test_vision.py
DELETED
|
@@ -1,35 +0,0 @@
|
|
| 1 |
-
import os
|
| 2 |
-
from langchain_openai import ChatOpenAI
|
| 3 |
-
from langchain_core.messages import HumanMessage
|
| 4 |
-
import base64
|
| 5 |
-
from dotenv import load_dotenv
|
| 6 |
-
|
| 7 |
-
load_dotenv()
|
| 8 |
-
|
| 9 |
-
def test_vision():
|
| 10 |
-
# Use a tiny 1x1 base64 image for testing
|
| 11 |
-
tiny_img = "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mP8/5+hHgAHggJ/PchI7wAAAABJRU5ErkJggg=="
|
| 12 |
-
msg = HumanMessage(content=[{"type": "text", "text": "what is in this image?"}, {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{tiny_img}"}}])
|
| 13 |
-
|
| 14 |
-
models = [
|
| 15 |
-
{"name": "OpenRouter-Gemini-2.0", "provider": "openai", "model": "google/gemini-2.0-flash-001", "base_url": "https://openrouter.ai/api/v1", "key": "OPENROUTER_API_KEY"},
|
| 16 |
-
{"name": "NVIDIA-Llama-3.2", "provider": "openai", "model": "nvidia/llama-3.2-nv-vision-70b", "base_url": "https://integrate.api.nvidia.com/v1", "key": "NVIDIA_API_KEY"},
|
| 17 |
-
{"name": "NVIDIA-Qwen-VL", "provider": "openai", "model": "nvidia/qwen-vl-max", "base_url": "https://integrate.api.nvidia.com/v1", "key": "NVIDIA_API_KEY"},
|
| 18 |
-
{"name": "Vercel-Vision", "provider": "openai", "model": "gpt-4o-mini", "base_url": "https://gateway.ai.vercel.com/v1", "key": "VERCEL_API_KEY"},
|
| 19 |
-
]
|
| 20 |
-
|
| 21 |
-
for m in models:
|
| 22 |
-
key = os.getenv(m['key'])
|
| 23 |
-
if not key:
|
| 24 |
-
print(f"Skip {m['name']} (no key)")
|
| 25 |
-
continue
|
| 26 |
-
try:
|
| 27 |
-
print(f"Testing {m['name']} ({m['model']})...")
|
| 28 |
-
llm = ChatOpenAI(model=m['model'], openai_api_key=key, openai_api_base=m['base_url'], temperature=0)
|
| 29 |
-
res = llm.invoke([msg])
|
| 30 |
-
print(f"Success: {res.content}")
|
| 31 |
-
except Exception as e:
|
| 32 |
-
print(f"Fail: {e}")
|
| 33 |
-
|
| 34 |
-
if __name__ == "__main__":
|
| 35 |
-
test_vision()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
test_vision_v2.py
DELETED
|
@@ -1,34 +0,0 @@
|
|
| 1 |
-
import os
|
| 2 |
-
from langchain_openai import ChatOpenAI
|
| 3 |
-
from langchain_core.messages import HumanMessage
|
| 4 |
-
import base64
|
| 5 |
-
from dotenv import load_dotenv
|
| 6 |
-
|
| 7 |
-
load_dotenv(override=True)
|
| 8 |
-
|
| 9 |
-
def test_vision():
|
| 10 |
-
# Use a tiny 1x1 base64 image for testing
|
| 11 |
-
tiny_img = "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mP8/5+hHgAHggJ/PchI7wAAAABJRU5ErkJggg=="
|
| 12 |
-
msg = HumanMessage(content=[{"type": "text", "text": "is this image red, green, or blue? answer with one word."}, {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{tiny_img}"}}])
|
| 13 |
-
|
| 14 |
-
models = [
|
| 15 |
-
{"name": "NVIDIA-Llama-3.2-11b", "provider": "openai", "model": "meta/llama-3.2-11b-vision-instruct", "base_url": "https://integrate.api.nvidia.com/v1", "key": "NVIDIA_API_KEY"},
|
| 16 |
-
{"name": "NVIDIA-Llama-3.2-90b", "provider": "openai", "model": "meta/llama-3.2-90b-vision-instruct", "base_url": "https://integrate.api.nvidia.com/v1", "key": "NVIDIA_API_KEY"},
|
| 17 |
-
{"name": "NVIDIA-Mistral-Vision", "provider": "openai", "model": "mistralai/pixtral-12b", "base_url": "https://integrate.api.nvidia.com/v1", "key": "NVIDIA_API_KEY"},
|
| 18 |
-
]
|
| 19 |
-
|
| 20 |
-
for m in models:
|
| 21 |
-
key = os.getenv(m['key'])
|
| 22 |
-
if not key:
|
| 23 |
-
print(f"Skip {m['name']} (no key)")
|
| 24 |
-
continue
|
| 25 |
-
try:
|
| 26 |
-
print(f"Testing {m['name']} ({m['model']})...")
|
| 27 |
-
llm = ChatOpenAI(model=m['model'], openai_api_key=key, openai_api_base=m['base_url'], temperature=0)
|
| 28 |
-
res = llm.invoke([msg])
|
| 29 |
-
print(f"Success: {res.content}")
|
| 30 |
-
except Exception as e:
|
| 31 |
-
print(f"Fail: {e}")
|
| 32 |
-
|
| 33 |
-
if __name__ == "__main__":
|
| 34 |
-
test_vision()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|