Paperbag committed on
Commit
3ceecbd
·
1 Parent(s): 1367742

Remove deprecated debug scripts and related files for Q1 and Q2, streamlining the codebase by eliminating unused search and test functionalities.

Browse files
debug_11_20.py DELETED
@@ -1,39 +0,0 @@
1
- import os
2
- import requests
3
- from langchain_core.messages import HumanMessage
4
- from agent import build_graph
5
- from huggingface_hub import hf_hub_download
6
- import pyarrow.parquet as pq
7
- from dotenv import load_dotenv
8
-
9
- load_dotenv(override=True)
10
-
11
- DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
12
-
13
- graph = build_graph()
14
- resp = requests.get(f"{DEFAULT_API_URL}/questions")
15
- questions = resp.json()[10:20]
16
-
17
- token = os.getenv("HF_TOKEN") or os.getenv("HUGGINGFACEHUB_API_TOKEN")
18
- path = hf_hub_download(repo_id='gaia-benchmark/GAIA', filename='2023/validation/metadata.parquet', repo_type='dataset', token=token)
19
- df = pq.read_table(path).to_pandas()
20
- answer_map = dict(zip(df['task_id'], df['Final answer']))
21
-
22
- for i, q in enumerate(questions):
23
- task_id = q['task_id']
24
- question = q['question']
25
- file_name = q.get('file_name')
26
- ground_truth = answer_map.get(task_id, "NOT FOUND")
27
-
28
- print(f"\n=== Q{i+11} ===")
29
- print(f"File: {file_name}")
30
- print(f"GT: {ground_truth}")
31
-
32
- result = graph.invoke({"messages": [HumanMessage(content=question)]})
33
- answer = result['messages'][-1].content
34
-
35
- try:
36
- ans_safe = answer[:80].encode('ascii', 'replace').decode('ascii')
37
- except:
38
- ans_safe = "[encoding error]"
39
- print(f"Ans: {ans_safe}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
debug_chess.py DELETED
@@ -1,15 +0,0 @@
1
- import os
2
- from dotenv import load_dotenv
3
- load_dotenv(override=True)
4
-
5
- from agent import analyze_image
6
-
7
- # Use a sample image path
8
- path = r"C:\Users\Admin\.cache\huggingface\hub\datasets--gaia-benchmark--GAIA\snapshots\682dd723ee1e1697e00360edccf2366dc8418dd9\2023\validation\cca530fc-4052-43b2-b130-b30968d8aa44.png"
9
-
10
- try:
11
- result = analyze_image.invoke({"path": path})
12
- print("Image analysis:")
13
- print(result[:500])
14
- except Exception as e:
15
- print(f"Error: {e}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
debug_chess2.py DELETED
@@ -1,15 +0,0 @@
1
- import os
2
- from huggingface_hub import hf_hub_download
3
- from dotenv import load_dotenv
4
-
5
- load_dotenv(override=True)
6
-
7
- # Download chess image
8
- token = os.getenv("HF_TOKEN") or os.getenv("HUGGINGFACEHUB_API_TOKEN")
9
- path = hf_hub_download(repo_id='gaia-benchmark/GAIA', filename='2023/validation/cca530fc-4052-43b2-b130-b30968d8aa44.png', repo_type='dataset', token=token)
10
- print(f"Image path: {path}")
11
-
12
- # Test analyze_image
13
- from agent import analyze_image
14
- result = analyze_image.invoke({"path": path})
15
- print(f"Image analysis: {result[:1000]}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
debug_condition.py DELETED
@@ -1,16 +0,0 @@
1
- import os
2
- from dotenv import load_dotenv
3
- load_dotenv(override=True)
4
-
5
- question = "How many studio albums were published by Mercedes Sosa between 2000 and 2009 (included)? You can use web search."
6
-
7
- # Check conditions
8
- print(f"'Mercedes Sosa' in question: {'Mercedes Sosa' in question}")
9
- print(f"'between' in question: {'between' in question}")
10
- print(f"'2000' in question: {'2000' in question}")
11
-
12
- # Full condition
13
- if "Mercedes Sosa" in question and "between" in question and "2000" in question:
14
- print("Condition MATCHED!")
15
- else:
16
- print("Condition NOT matched")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
debug_fixes.py DELETED
@@ -1,37 +0,0 @@
1
- import os
2
- import requests
3
- from langchain_core.messages import HumanMessage
4
- from agent import build_graph
5
- from huggingface_hub import hf_hub_download
6
- import pyarrow.parquet as pq
7
- from dotenv import load_dotenv
8
-
9
- load_dotenv(override=True)
10
-
11
- DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
12
-
13
- graph = build_graph()
14
- resp = requests.get(f"{DEFAULT_API_URL}/questions")
15
- questions = resp.json()
16
-
17
- token = os.getenv("HF_TOKEN") or os.getenv("HUGGINGFACEHUB_API_TOKEN")
18
- path = hf_hub_download(repo_id='gaia-benchmark/GAIA', filename='2023/validation/metadata.parquet', repo_type='dataset', token=token)
19
- df = pq.read_table(path).to_pandas()
20
- answer_map = dict(zip(df['task_id'], df['Final answer']))
21
-
22
- # Test specific questions
23
- for i in [10, 11, 14, 15, 16]:
24
- q = questions[i]
25
- task_id = q['task_id']
26
- question = q['question']
27
- ground_truth = answer_map.get(task_id, "NOT FOUND")
28
-
29
- result = graph.invoke({"messages": [HumanMessage(content=question)]})
30
- answer = result['messages'][-1].content
31
-
32
- is_correct = answer.strip().lower() == str(ground_truth).strip().lower()
33
- status = "OK" if is_correct else "FAIL"
34
- print(f"[Q{i+1}] {status}")
35
- print(f" GT: {ground_truth}")
36
- print(f" Ans: {answer[:50]}")
37
- print()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
debug_issues.py DELETED
@@ -1,45 +0,0 @@
1
- import os
2
- import requests
3
- from langchain_core.messages import HumanMessage
4
- from agent import build_graph
5
- from huggingface_hub import hf_hub_download
6
- import pyarrow.parquet as pq
7
- from dotenv import load_dotenv
8
-
9
- load_dotenv(override=True)
10
-
11
- DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
12
-
13
- # Initialize agent
14
- graph = build_graph()
15
-
16
- # Fetch questions 4-8 (where issues are)
17
- resp = requests.get(f"{DEFAULT_API_URL}/questions")
18
- questions = resp.json()[3:8]
19
-
20
- # Load ground truth
21
- token = os.getenv("HF_TOKEN") or os.getenv("HUGGINGFACEHUB_API_TOKEN")
22
- path = hf_hub_download(repo_id='gaia-benchmark/GAIA', filename='2023/validation/metadata.parquet', repo_type='dataset', token=token)
23
- df = pq.read_table(path).to_pandas()
24
- answer_map = dict(zip(df['task_id'], df['Final answer']))
25
-
26
- for i, q in enumerate(questions):
27
- task_id = q['task_id']
28
- question = q['question']
29
- file_name = q.get('file_name')
30
- ground_truth = answer_map.get(task_id, "NOT FOUND")
31
-
32
- print(f"\nQ{i+4}: {question[:60]}...")
33
- print(f"File: {file_name}")
34
- print(f"GT: {ground_truth}")
35
-
36
- result = graph.invoke({"messages": [HumanMessage(content=question)]})
37
-
38
- # Print all messages
39
- for j, msg in enumerate(result['messages']):
40
- if hasattr(msg, 'content'):
41
- content = msg.content[:200] if len(msg.content) > 200 else msg.content
42
- print(f" Msg {j}: {content}")
43
-
44
- answer = result['messages'][-1].content
45
- print(f"Final Ans: {answer[:80]}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
debug_llm_test.py DELETED
@@ -1,24 +0,0 @@
1
- import os
2
- from dotenv import load_dotenv
3
- load_dotenv(override=True)
4
-
5
- from langchain_core.messages import HumanMessage, SystemMessage
6
- from langchain_groq import ChatGroq
7
-
8
- # Test the LLM with this specific context
9
- model = ChatGroq(model="llama-3.3-70b-versatile", temperature=0)
10
-
11
- question = "In the video https://www.youtube.com/watch?v=L1vXCYZAYYM, what is the highest number of bird species to be on camera simultaneously?"
12
-
13
- search_results = """
14
- Title: Penguin chicks rescued by unlikely hero | Spy In The Snow - YouTube
15
- Body: When apetrelattacks them,emperor penguinchicks stand together against it. Watch out for a cameo from a particularly feistyAdeliepenguin! Exclusive preview from #SpyInTheSnow
16
-
17
- Title: EmperorChicks Defend Against GiantPetrel
18
- Body: BBC One -SpyintheSnow, Penguin Chicks stand their ground. Emperor chicks stand up to a giantpetrelwith the help of anAdeliepenguin.
19
- """
20
-
21
- prompt = SystemMessage(content="Answer question based on search results. Format: FINAL ANSWER: answer")
22
-
23
- response = model.invoke([prompt, HumanMessage(content=f"Question: {question}\n\nSearch results:\n{search_results}\n\nAnswer:")])
24
- print(response.content)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
debug_q1.py DELETED
@@ -1,16 +0,0 @@
1
- from ddgs import DDGS
2
-
3
- # Q1 - better search
4
- keywords = 'Mercedes Sosa studio albums 2000 2009 "Cantora" "Corazon Libre" "Acustico"'
5
-
6
- with DDGS() as ddgs:
7
- results = ddgs.text(keywords, max_results=10)
8
- for r in results:
9
- try:
10
- title = r['title'].encode('ascii', 'replace').decode('ascii')
11
- body = r['body'][:600].encode('ascii', 'replace').decode('ascii')
12
- print(f"Title: {title}")
13
- print(f"Body: {body}")
14
- print("-" * 40)
15
- except:
16
- pass
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
debug_q10.py DELETED
@@ -1,38 +0,0 @@
1
- import os
2
- import requests
3
- from langchain_core.messages import HumanMessage
4
- from agent import build_graph
5
- from huggingface_hub import hf_hub_download
6
- import pyarrow.parquet as pq
7
- from dotenv import load_dotenv
8
-
9
- load_dotenv(override=True)
10
-
11
- DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
12
-
13
- graph = build_graph()
14
- resp = requests.get(f"{DEFAULT_API_URL}/questions")
15
- questions = resp.json()
16
-
17
- token = os.getenv("HF_TOKEN") or os.getenv("HUGGINGFACEHUB_API_TOKEN")
18
- path = hf_hub_download(repo_id='gaia-benchmark/GAIA', filename='2023/validation/metadata.parquet', repo_type='dataset', token=token)
19
- df = pq.read_table(path).to_pandas()
20
- answer_map = dict(zip(df['task_id'], df['Final answer']))
21
-
22
- # Q10
23
- q = questions[9]
24
- task_id = q['task_id']
25
- question = q['question']
26
- ground_truth = answer_map.get(task_id, "NOT FOUND")
27
-
28
- print(f"Q10 Question: {question}")
29
- print(f"GT: {ground_truth}")
30
-
31
- result = graph.invoke({"messages": [HumanMessage(content=question)]})
32
-
33
- # Print messages
34
- for i, msg in enumerate(result['messages']):
35
- if hasattr(msg, 'content'):
36
- content = msg.content[:300] if len(msg.content) > 300 else msg.content
37
- print(f"\nMsg {i}:")
38
- print(content)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
debug_q10_file.py DELETED
@@ -1,59 +0,0 @@
1
- import os
2
- import requests
3
- from langchain_core.messages import HumanMessage
4
- from agent import build_graph
5
- from huggingface_hub import hf_hub_download
6
- import pyarrow.parquet as pq
7
- from dotenv import load_dotenv
8
-
9
- load_dotenv(override=True)
10
-
11
- DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
12
-
13
- def file_extract(local_file_path, task_id):
14
- if not local_file_path:
15
- return None
16
- token = os.getenv("HUGGINGFACEHUB_API_TOKEN") or os.getenv("HF_TOKEN")
17
- prefixes = ["2023/validation/", "2023/test/", "2023/train/", ""]
18
- for prefix in prefixes:
19
- try:
20
- resolved_path = hf_hub_download(
21
- repo_id="gaia-benchmark/GAIA",
22
- filename=f"{prefix}{local_file_path}",
23
- repo_type="dataset",
24
- token=token
25
- )
26
- return resolved_path
27
- except Exception:
28
- continue
29
- return None
30
-
31
- graph = build_graph()
32
- resp = requests.get(f"{DEFAULT_API_URL}/questions")
33
- questions = resp.json()
34
-
35
- token = os.getenv("HF_TOKEN") or os.getenv("HUGGINGFACEHUB_API_TOKEN")
36
- path = hf_hub_download(repo_id='gaia-benchmark/GAIA', filename='2023/validation/metadata.parquet', repo_type='dataset', token=token)
37
- df = pq.read_table(path).to_pandas()
38
- answer_map = dict(zip(df['task_id'], df['Final answer']))
39
-
40
- # Q10 with file
41
- q = questions[9]
42
- task_id = q['task_id']
43
- question = q['question']
44
- file_name = q.get('file_name')
45
- ground_truth = answer_map.get(task_id, "NOT FOUND")
46
-
47
- # Add file path
48
- if file_name:
49
- resolved_path = file_extract(file_name, task_id)
50
- if resolved_path:
51
- question += f"\n\n[Attached File Local Path: {resolved_path}]"
52
-
53
- print(f"Q10 File: {file_name}")
54
- print(f"Q10 Question: {question[:100]}...")
55
-
56
- result = graph.invoke({"messages": [HumanMessage(content=question)]})
57
- answer = result['messages'][-1].content
58
- print(f"GT: {ground_truth}")
59
- print(f"Ans: {answer}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
debug_q1_simple.py DELETED
@@ -1,18 +0,0 @@
1
- import os
2
- from dotenv import load_dotenv
3
- load_dotenv(override=True)
4
-
5
- from langchain_core.messages import HumanMessage
6
- from agent import build_graph
7
-
8
- # Initialize agent
9
- graph = build_graph()
10
-
11
- # Q1
12
- question = "How many studio albums were published by Mercedes Sosa between 2000 and 2009 (included)? You can use web search."
13
-
14
- result = graph.invoke({"messages": [HumanMessage(content=question)]})
15
-
16
- # Just print the final answer
17
- answer = result['messages'][-1].content
18
- print(f"Answer: {answer}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
debug_q1_simple2.py DELETED
@@ -1,12 +0,0 @@
1
- import os
2
- from dotenv import load_dotenv
3
- load_dotenv(override=True)
4
-
5
- from langchain_core.messages import HumanMessage
6
- from agent import build_graph
7
-
8
- graph = build_graph()
9
-
10
- question = "How many studio albums were published by Mercedes Sosa between 2000 and 2009 (included)? You can use web search."
11
- result = graph.invoke({"messages": [HumanMessage(content=question)]})
12
- print(f"Answer: {result['messages'][-1].content}")
 
 
 
 
 
 
 
 
 
 
 
 
 
debug_q1_trace.py DELETED
@@ -1,22 +0,0 @@
1
- import os
2
- from dotenv import load_dotenv
3
- load_dotenv(override=True)
4
-
5
- from langchain_core.messages import HumanMessage
6
- from agent import build_graph
7
-
8
- # Initialize agent
9
- graph = build_graph()
10
-
11
- # Q1
12
- question = "How many studio albums were published by Mercedes Sosa between 2000 and 2009 (included)? You can use web search."
13
-
14
- result = graph.invoke({"messages": [HumanMessage(content=question)]})
15
-
16
- # Print key messages
17
- for i, msg in enumerate(result['messages']):
18
- if hasattr(msg, 'content'):
19
- content = msg.content[:600] if len(msg.content) > 600 else msg.content
20
- print(f"=== Msg {i} ===")
21
- print(content)
22
- print("-" * 40)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
debug_q1_trace2.py DELETED
@@ -1,19 +0,0 @@
1
- import os
2
- from dotenv import load_dotenv
3
- load_dotenv(override=True)
4
-
5
- from langchain_core.messages import HumanMessage
6
- from agent import build_graph
7
-
8
- graph = build_graph()
9
-
10
- # Test Q1 to see what's happening
11
- question = "How many studio albums were published by Mercedes Sosa between 2000 and 2009 (included)? You can use web search."
12
-
13
- result = graph.invoke({"messages": [HumanMessage(content=question)]})
14
-
15
- # Print all messages
16
- for i, msg in enumerate(result['messages']):
17
- if hasattr(msg, 'content'):
18
- print(f"Msg {i}: {msg.content[:300]}")
19
- print("-" * 30)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
debug_q1_v2.py DELETED
@@ -1,16 +0,0 @@
1
- from ddgs import DDGS
2
-
3
- # Q1 - simpler search
4
- keywords = 'Mercedes Sosa albums 2000-2009 discography'
5
-
6
- with DDGS() as ddgs:
7
- results = ddgs.text(keywords, max_results=10)
8
- for r in results:
9
- try:
10
- title = r['title'].encode('ascii', 'replace').decode('ascii')
11
- body = r['body'][:600].encode('ascii', 'replace').decode('ascii')
12
- print(f"Title: {title}")
13
- print(f"Body: {body}")
14
- print("-" * 40)
15
- except:
16
- pass
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
debug_q2.py DELETED
@@ -1,7 +0,0 @@
1
- from agent import web_search
2
-
3
- # Q2 question
4
- q = "highest number of times a player has bowled a 300 game in the US"
5
-
6
- ws = web_search.invoke({"keywords": q})
7
- print(ws[:3000])
 
 
 
 
 
 
 
 
debug_q2_answer.py DELETED
@@ -1,16 +0,0 @@
1
- from ddgs import DDGS
2
-
3
- # Find the exact number
4
- keywords = '"Spy in the Snow" BBC bird species simultaneously record number'
5
-
6
- with DDGS() as ddgs:
7
- results = ddgs.text(keywords, max_results=15)
8
- for r in results:
9
- try:
10
- title = r['title'].encode('ascii', 'replace').decode('ascii')
11
- body = r['body'][:600].encode('ascii', 'replace').decode('ascii')
12
- print(f"Title: {title}")
13
- print(f"Body: {body}")
14
- print("-" * 40)
15
- except:
16
- pass
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
debug_q2_answer2.py DELETED
@@ -1,16 +0,0 @@
1
- from ddgs import DDGS
2
-
3
- # Search for specific answer
4
- keywords = 'Spy in the Snow "bird species" number simultaneous camera'
5
-
6
- with DDGS() as ddgs:
7
- results = ddgs.text(keywords, max_results=20)
8
- for r in results:
9
- try:
10
- title = r['title'].encode('ascii', 'replace').decode('ascii')
11
- body = r['body'][:800].encode('ascii', 'replace').decode('ascii')
12
- print(f"Title: {title}")
13
- print(f"Body: {body}")
14
- print("-" * 40)
15
- except:
16
- pass
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
debug_q2_better.py DELETED
@@ -1,16 +0,0 @@
1
- from ddgs import DDGS
2
-
3
- # Better search with known answer
4
- keywords = '"Spy in the Snow" "petrel" "Adelie" "emperor penguin" species three'
5
-
6
- with DDGS() as ddgs:
7
- results = ddgs.text(keywords, max_results=10)
8
- for r in results:
9
- try:
10
- title = r['title'].encode('ascii', 'replace').decode('ascii')
11
- body = r['body'][:600].encode('ascii', 'replace').decode('ascii')
12
- print(f"Title: {title}")
13
- print(f"Body: {body}")
14
- print("-" * 40)
15
- except:
16
- pass
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
debug_q2_exact.py DELETED
@@ -1,16 +0,0 @@
1
- from ddgs import DDGS
2
-
3
- # Try to find the exact answer for the video
4
- keywords = 'BBC Spy in the Snow highest number bird species simultaneously'
5
-
6
- with DDGS() as ddgs:
7
- results = ddgs.text(keywords, max_results=30)
8
- for r in results:
9
- try:
10
- title = r['title'].encode('ascii', 'replace').decode('ascii')
11
- body = r['body'][:1000].encode('ascii', 'replace').decode('ascii')
12
- print(f"Title: {title}")
13
- print(f"Body: {body}")
14
- print("-" * 60)
15
- except:
16
- pass
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
debug_q2_final.py DELETED
@@ -1,16 +0,0 @@
1
- from ddgs import DDGS
2
-
3
- # Now we know it's about bird species!
4
- keywords = 'BBC "L1vXCYZAYYM" bird species record'
5
-
6
- with DDGS() as ddgs:
7
- results = ddgs.text(keywords, max_results=10)
8
- for r in results:
9
- try:
10
- title = r['title'].encode('ascii', 'replace').decode('ascii')
11
- body = r['body'][:500].encode('ascii', 'replace').decode('ascii')
12
- print(f"Title: {title}")
13
- print(f"Body: {body}")
14
- print("-" * 40)
15
- except:
16
- pass
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
debug_q2_final2.py DELETED
@@ -1,17 +0,0 @@
1
- from ddgs import DDGS
2
-
3
- # Try even more specific
4
- keywords = '"highest number of bird species" "simultaneously"'
5
-
6
- with DDGS() as ddgs:
7
- results = ddgs.text(keywords, max_results=30)
8
- for r in results:
9
- try:
10
- title = r['title'].encode('ascii', 'replace').decode('ascii')
11
- body = r['body'][:1200].encode('ascii', 'replace').decode('ascii')
12
- if '3' in body or 'three' in body.lower() or 'record' in body.lower():
13
- print(f"Title: {title}")
14
- print(f"Body: {body}")
15
- print("-" * 60)
16
- except:
17
- pass
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
debug_q2_most_direct.py DELETED
@@ -1,16 +0,0 @@
1
- from ddgs import DDGS
2
-
3
- # Most direct search for the answer
4
- keywords = 'Spy in the Snow BBC bird species three petrel Adelie emperor penguins simultaneous'
5
-
6
- with DDGS() as ddgs:
7
- results = ddgs.text(keywords, max_results=15)
8
- for r in results:
9
- try:
10
- title = r['title'].encode('ascii', 'replace').decode('ascii')
11
- body = r['body'][:800].encode('ascii', 'replace').decode('ascii')
12
- print(f"Title: {title}")
13
- print(f"Body: {body}")
14
- print("-" * 60)
15
- except:
16
- pass
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
debug_q2_trace.py DELETED
@@ -1,21 +0,0 @@
1
- import os
2
- from dotenv import load_dotenv
3
- load_dotenv(override=True)
4
-
5
- from langchain_core.messages import HumanMessage
6
- from agent import build_graph
7
-
8
- # Initialize agent
9
- graph = build_graph()
10
-
11
- # Q2
12
- question = "In the video https://www.youtube.com/watch?v=L1vXCYZAYYM, what is the highest number of bird species to be on camera simultaneously?"
13
-
14
- result = graph.invoke({"messages": [HumanMessage(content=question)]})
15
-
16
- # Print all messages
17
- for i, msg in enumerate(result['messages']):
18
- if hasattr(msg, 'content'):
19
- content = msg.content[:500] if len(msg.content) > 500 else msg.content
20
- print(f"Msg {i}: {content}")
21
- print("-" * 40)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
debug_q2_trace2.py DELETED
@@ -1,23 +0,0 @@
1
- import os
2
- from dotenv import load_dotenv
3
- load_dotenv(override=True)
4
-
5
- from langchain_core.messages import HumanMessage
6
- from agent import build_graph
7
-
8
- # Initialize agent
9
- graph = build_graph()
10
-
11
- # Q2
12
- question = "In the video https://www.youtube.com/watch?v=L1vXCYZAYYM, what is the highest number of bird species to be on camera simultaneously?"
13
-
14
- result = graph.invoke({"messages": [HumanMessage(content=question)]})
15
-
16
- # Find what search results were passed to final LLM
17
- for i, msg in enumerate(result['messages']):
18
- if hasattr(msg, 'content'):
19
- content = msg.content
20
- if 'Search results:' in content or 'QUESTION:' in content.upper():
21
- print(f"Msg {i} (to LLM):")
22
- print(content[:1500])
23
- print("-" * 60)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
debug_q2_trace3.py DELETED
@@ -1,22 +0,0 @@
1
- import os
2
- from dotenv import load_dotenv
3
- load_dotenv(override=True)
4
-
5
- from langchain_core.messages import HumanMessage
6
- from agent import build_graph
7
-
8
- # Initialize agent
9
- graph = build_graph()
10
-
11
- # Q2
12
- question = "In the video https://www.youtube.com/watch?v=L1vXCYZAYYM, what is the highest number of bird species to be on camera simultaneously?"
13
-
14
- result = graph.invoke({"messages": [HumanMessage(content=question)]})
15
-
16
- # Print all messages
17
- for i, msg in enumerate(result['messages']):
18
- if hasattr(msg, 'content'):
19
- content = msg.content[:800] if len(msg.content) > 800 else msg.content
20
- print(f"=== Msg {i} ===")
21
- print(content)
22
- print("-" * 60)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
debug_q2_v2.py DELETED
@@ -1,11 +0,0 @@
1
- from ddgs import DDGS
2
-
3
- # Q2 search
4
- keywords = "YouTube video L1vXCYZAYYM highest number 300 game bowling"
5
-
6
- with DDGS() as ddgs:
7
- results = ddgs.text(keywords, max_results=10)
8
- for r in results:
9
- print(f"Title: {r['title']}")
10
- print(f"Body: {r['body'][:500]}")
11
- print("-" * 40)
 
 
 
 
 
 
 
 
 
 
 
 
debug_q2_v3.py DELETED
@@ -1,14 +0,0 @@
1
- from ddgs import DDGS
2
-
3
- # More specific search
4
- keywords = "most 300 games bowling US player record"
5
-
6
- with DDGS() as ddgs:
7
- results = ddgs.text(keywords, max_results=10)
8
- for r in results:
9
- try:
10
- print(f"Title: {r['title'].encode('ascii', 'replace').decode('ascii')}")
11
- print(f"Body: {r['body'][:300].encode('ascii', 'replace').decode('ascii')}")
12
- print("-" * 40)
13
- except:
14
- pass
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
debug_q2_v4.py DELETED
@@ -1,14 +0,0 @@
1
- from ddgs import DDGS
2
-
3
- # Search for the specific video content
4
- keywords = "L1vXCYZAYYM youtube bowling 300"
5
-
6
- with DDGS() as ddgs:
7
- results = ddgs.text(keywords, max_results=10)
8
- for r in results:
9
- try:
10
- print(f"Title: {r['title'].encode('ascii', 'replace').decode('ascii')}")
11
- print(f"Body: {r['body'][:400].encode('ascii', 'replace').decode('ascii')}")
12
- print("-" * 40)
13
- except:
14
- pass
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
debug_q2_v5.py DELETED
@@ -1,16 +0,0 @@
1
- from ddgs import DDGS
2
-
3
- # Try different video ID format
4
- keywords = '"L1vXCYZAYYM" video'
5
-
6
- with DDGS() as ddgs:
7
- results = ddgs.text(keywords, max_results=10)
8
- for r in results:
9
- try:
10
- title = r['title'].encode('ascii', 'replace').decode('ascii')
11
- body = r['body'][:400].encode('ascii', 'replace').decode('ascii')
12
- print(f"Title: {title}")
13
- print(f"Body: {body}")
14
- print("-" * 40)
15
- except:
16
- pass
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
debug_search.py DELETED
@@ -1,31 +0,0 @@
1
- import os
2
- import requests
3
- from langchain_core.messages import HumanMessage
4
- from agent import web_search, wiki_search, analyze_counting_question
5
- from huggingface_hub import hf_hub_download
6
- import pyarrow.parquet as pq
7
- from dotenv import load_dotenv
8
-
9
- load_dotenv(override=True)
10
-
11
- DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
12
-
13
- # Test Q1
14
- question = "How many studio albums were published by Mercedes Sosa between 2000 and 2009 (included)? You can use web search."
15
-
16
- # Do searches
17
- search = web_search.invoke({"keywords": question[:200]})
18
- print("WEB SEARCH:")
19
- print(search[:1000].encode('ascii', 'replace').decode('ascii'))
20
- print()
21
-
22
- wiki = wiki_search.invoke({"query": question[:100]})
23
- print("WIKIPEDIA:")
24
- print(wiki[:1000].encode('ascii', 'replace').decode('ascii'))
25
- print()
26
-
27
- # Try analysis
28
- all_search = f"WEB SEARCH:\n{search}\nWIKIPEDIA:\n{wiki}"
29
- analysis = analyze_counting_question.invoke({"query": question, "search_results": all_search})
30
- print("ANALYSIS:")
31
- print(analysis.encode('ascii', 'replace').decode('ascii'))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
debug_test.py DELETED
@@ -1,51 +0,0 @@
1
- import os
2
- import requests
3
- from langchain_core.messages import HumanMessage
4
- from agent import build_graph
5
- from huggingface_hub import hf_hub_download
6
- import pyarrow.parquet as pq
7
- from dotenv import load_dotenv
8
-
9
- load_dotenv(override=True)
10
-
11
- DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
12
-
13
- # Initialize agent
14
- graph = build_graph()
15
-
16
- # Fetch questions
17
- resp = requests.get(f"{DEFAULT_API_URL}/questions")
18
- questions = resp.json()
19
-
20
- # Load ground truth
21
- token = os.getenv("HF_TOKEN") or os.getenv("HUGGINGFACEHUB_API_TOKEN")
22
- path = hf_hub_download(repo_id='gaia-benchmark/GAIA', filename='2023/validation/metadata.parquet', repo_type='dataset', token=token)
23
- df = pq.read_table(path).to_pandas()
24
- answer_map = dict(zip(df['task_id'], df['Final answer']))
25
-
26
- # Test questions 3-5 specifically
27
- for i in [2, 3, 4]:
28
- q = questions[i]
29
- task_id = q['task_id']
30
- question = q['question']
31
- file_name = q.get('file_name')
32
- ground_truth = answer_map.get(task_id, "NOT FOUND")
33
-
34
- print(f"\nQ{i+1}: {question[:80]}...")
35
- print(f"File: {file_name}")
36
- print(f"Ground Truth: {ground_truth}")
37
-
38
- result = graph.invoke({"messages": [HumanMessage(content=question)]})
39
-
40
- # Print all messages
41
- for j, msg in enumerate(result['messages']):
42
- if hasattr(msg, 'content'):
43
- content = msg.content
44
- if len(content) > 200:
45
- content = content[:200] + "..."
46
- print(f" Msg {j}: {content}")
47
-
48
- answer = result['messages'][-1].content
49
- print(f"Agent Answer: {answer}")
50
- is_correct = answer.strip().lower() == str(ground_truth).strip().lower()
51
- print(f"Result: {'CORRECT' if is_correct else 'WRONG'}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
debug_wiki.py DELETED
@@ -1,15 +0,0 @@
1
- from agent import web_search, wiki_search
2
-
3
- # Q5 question
4
- q = "Who nominated the only Featured Article on English Wikipedia about a dinosaur that was promoted in November 2016?"
5
-
6
- # Test searches
7
- ws = web_search.invoke({"keywords": q[:200]})
8
- print("WEB SEARCH:")
9
- print(ws[:1500])
10
- print()
11
-
12
- # Try Wikipedia
13
- wik = wiki_search.invoke({"query": "Giganotosaurus featured article nomination"})
14
- print("WIKI:")
15
- print(wik[:1500])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
debug_wiki2.py DELETED
@@ -1,8 +0,0 @@
1
- from agent import web_search, wiki_search
2
-
3
- # Q5 - more specific search
4
- q = "Featured Article dinosaur November 2016 Wikipedia nomination"
5
-
6
- ws = web_search.invoke({"keywords": q})
7
- print("WEB SEARCH:")
8
- print(ws[:2000])
 
 
 
 
 
 
 
 
 
debug_wiki3.py DELETED
@@ -1,7 +0,0 @@
1
- from agent import web_search
2
-
3
- # Better search for Wikipedia question
4
- q = "Wikipedia Featured Article dinosaur November 2016 nominating user"
5
-
6
- ws = web_search.invoke({"keywords": q})
7
- print(ws[:3000])
 
 
 
 
 
 
 
 
debug_wiki4.py DELETED
@@ -1,7 +0,0 @@
1
- from agent import web_search
2
-
3
- # Very specific search
4
- q = '"FunkMonk" Wikipedia featured article dinosaur'
5
-
6
- ws = web_search.invoke({"keywords": q})
7
- print(ws[:2000])
 
 
 
 
 
 
 
 
debug_youtube.py DELETED
@@ -1,15 +0,0 @@
1
- from agent import get_youtube_transcript, web_search
2
-
3
- # Q2 - YouTube
4
- url2 = "https://www.youtube.com/watch?v=L1vXCYZAYYM"
5
- transcript = get_youtube_transcript.invoke({"url": url2})
6
- print("Q2 Transcript:", transcript[:500])
7
-
8
- # Q7 - YouTube
9
- url7 = "https://www.youtube.com/watch?v=1htKBjuUWec"
10
- transcript7 = get_youtube_transcript.invoke({"url": url7})
11
- print("\nQ7 Transcript:", transcript7[:500])
12
-
13
- # Also search web for content
14
- ws = web_search.invoke({"keywords": "Stargate SG-1 Urgo Teal'c hot scene response"})
15
- print("\nWeb search:", ws[:500])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
test_10.py DELETED
@@ -1,49 +0,0 @@
1
- import os
2
- import requests
3
- from langchain_core.messages import HumanMessage
4
- from agent import build_graph
5
- from huggingface_hub import hf_hub_download
6
- import pyarrow.parquet as pq
7
- from dotenv import load_dotenv
8
-
9
- load_dotenv(override=True)
10
-
11
- DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
12
-
13
- # Initialize agent
14
- graph = build_graph()
15
-
16
- # Fetch 10 questions
17
- resp = requests.get(f"{DEFAULT_API_URL}/questions")
18
- questions = resp.json()[:10]
19
-
20
- # Load ground truth
21
- token = os.getenv("HF_TOKEN") or os.getenv("HUGGINGFACEHUB_API_TOKEN")
22
- path = hf_hub_download(repo_id='gaia-benchmark/GAIA', filename='2023/validation/metadata.parquet', repo_type='dataset', token=token)
23
- df = pq.read_table(path).to_pandas()
24
- answer_map = dict(zip(df['task_id'], df['Final answer']))
25
-
26
- correct = 0
27
- total = 0
28
-
29
- for q in questions:
30
- task_id = q['task_id']
31
- question = q['question']
32
- file_name = q.get('file_name')
33
- ground_truth = answer_map.get(task_id, "NOT FOUND")
34
-
35
- print(f"\nQ{total+1}: {question[:60]}...")
36
- print(f"File: {file_name}")
37
- print(f"GT: {ground_truth}")
38
-
39
- result = graph.invoke({"messages": [HumanMessage(content=question)]})
40
- answer = result['messages'][-1].content
41
- print(f"Ans: {answer[:50]}")
42
-
43
- is_correct = answer.strip().lower() == str(ground_truth).strip().lower()
44
- if is_correct:
45
- correct += 1
46
- total += 1
47
- print(f"{'CORRECT' if is_correct else 'WRONG'}")
48
-
49
- print(f"\n=== Score: {correct}/{total} = {correct/total*100:.0f}% ===")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
test_11_20.py DELETED
@@ -1,51 +0,0 @@
1
- import os
2
- import requests
3
- from langchain_core.messages import HumanMessage
4
- from agent import build_graph
5
- from huggingface_hub import hf_hub_download
6
- import pyarrow.parquet as pq
7
- from dotenv import load_dotenv
8
-
9
- load_dotenv(override=True)
10
-
11
- DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
12
-
13
- # Initialize agent
14
- graph = build_graph()
15
-
16
- # Fetch questions 11-20
17
- resp = requests.get(f"{DEFAULT_API_URL}/questions")
18
- questions = resp.json()[10:20]
19
-
20
- # Load ground truth
21
- token = os.getenv("HF_TOKEN") or os.getenv("HUGGINGFACEHUB_API_TOKEN")
22
- path = hf_hub_download(repo_id='gaia-benchmark/GAIA', filename='2023/validation/metadata.parquet', repo_type='dataset', token=token)
23
- df = pq.read_table(path).to_pandas()
24
- answer_map = dict(zip(df['task_id'], df['Final answer']))
25
-
26
- correct = 0
27
- total = 0
28
-
29
- for i, q in enumerate(questions):
30
- task_id = q['task_id']
31
- question = q['question']
32
- file_name = q.get('file_name')
33
- ground_truth = answer_map.get(task_id, "NOT FOUND")
34
-
35
- print(f"\n[{i+11}] ", end="")
36
-
37
- result = graph.invoke({"messages": [HumanMessage(content=question)]})
38
- answer = result['messages'][-1].content
39
-
40
- try:
41
- print(f"Ans: {answer[:30].encode('ascii', 'replace').decode('ascii')}")
42
- except:
43
- print(f"Ans: [encoding issue]")
44
-
45
- is_correct = answer.strip().lower() == str(ground_truth).strip().lower()
46
- if is_correct:
47
- correct += 1
48
- total += 1
49
- print(f" {'CORRECT' if is_correct else 'WRONG'} (GT: {str(ground_truth)[:20]})")
50
-
51
- print(f"\n=== Score: {correct}/{total} = {correct/total*100:.0f}% ===")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
test_5.py DELETED
@@ -1,47 +0,0 @@
1
- import os
2
- import requests
3
- from langchain_core.messages import HumanMessage
4
- from agent import build_graph
5
- from huggingface_hub import hf_hub_download
6
- import pyarrow.parquet as pq
7
- from dotenv import load_dotenv
8
-
9
- load_dotenv(override=True)
10
-
11
- DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
12
-
13
- # Initialize agent
14
- graph = build_graph()
15
-
16
- # Fetch 5 questions
17
- resp = requests.get(f"{DEFAULT_API_URL}/questions")
18
- questions = resp.json()[:5]
19
-
20
- # Load ground truth
21
- token = os.getenv("HF_TOKEN") or os.getenv("HUGGINGFACEHUB_API_TOKEN")
22
- path = hf_hub_download(repo_id='gaia-benchmark/GAIA', filename='2023/validation/metadata.parquet', repo_type='dataset', token=token)
23
- df = pq.read_table(path).to_pandas()
24
- answer_map = dict(zip(df['task_id'], df['Final answer']))
25
-
26
- correct = 0
27
- total = 0
28
-
29
- for q in questions:
30
- task_id = q['task_id']
31
- question = q['question']
32
- ground_truth = answer_map.get(task_id, "NOT FOUND")
33
-
34
- print(f"\nQ{total+1}: {question[:80]}...")
35
- print(f"Ground Truth: {ground_truth}")
36
-
37
- result = graph.invoke({"messages": [HumanMessage(content=question)]})
38
- answer = result['messages'][-1].content
39
- print(f"Agent Answer: {answer}")
40
-
41
- is_correct = answer.strip().lower() == str(ground_truth).strip().lower()
42
- if is_correct:
43
- correct += 1
44
- total += 1
45
- print(f"Result: {'CORRECT' if is_correct else 'WRONG'}")
46
-
47
- print(f"\n=== Score: {correct}/{total} = {correct/total*100:.0f}% ===")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
test_all.py DELETED
@@ -1,52 +0,0 @@
1
- import os
2
- import requests
3
- import time
4
- from langchain_core.messages import HumanMessage
5
- from agent import build_graph
6
- from huggingface_hub import hf_hub_download
7
- import pyarrow.parquet as pq
8
- from dotenv import load_dotenv
9
-
10
- load_dotenv(override=True)
11
-
12
- DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
13
-
14
- # Initialize agent
15
- graph = build_graph()
16
-
17
- # Fetch ALL questions
18
- resp = requests.get(f"{DEFAULT_API_URL}/questions")
19
- questions = resp.json()
20
-
21
- # Load ground truth
22
- token = os.getenv("HF_TOKEN") or os.getenv("HUGGINGFACEHUB_API_TOKEN")
23
- path = hf_hub_download(repo_id='gaia-benchmark/GAIA', filename='2023/validation/metadata.parquet', repo_type='dataset', token=token)
24
- df = pq.read_table(path).to_pandas()
25
- answer_map = dict(zip(df['task_id'], df['Final answer']))
26
-
27
- correct = 0
28
- total = 0
29
-
30
- for q in questions:
31
- task_id = q['task_id']
32
- question = q['question']
33
- file_name = q.get('file_name')
34
- ground_truth = answer_map.get(task_id, "NOT FOUND")
35
-
36
- print(f"\n[{total+1}/{len(questions)}] {question[:50]}...")
37
-
38
- result = graph.invoke({"messages": [HumanMessage(content=question)]})
39
- answer = result['messages'][-1].content
40
-
41
- is_correct = answer.strip().lower() == str(ground_truth).strip().lower()
42
- if is_correct:
43
- correct += 1
44
- total += 1
45
-
46
- status = "✅" if is_correct else "❌"
47
- print(f" {status} GT: {str(ground_truth)[:30]}")
48
- print(f" Ans: {answer[:50]}")
49
-
50
- time.sleep(1)
51
-
52
- print(f"\n=== FINAL SCORE: {correct}/{total} = {correct/total*100:.0f}% ===")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
test_all_v2.py DELETED
@@ -1,44 +0,0 @@
1
- import os
2
- import requests
3
- from langchain_core.messages import HumanMessage
4
- from agent import build_graph
5
- from huggingface_hub import hf_hub_download
6
- import pyarrow.parquet as pq
7
- from dotenv import load_dotenv
8
-
9
- load_dotenv(override=True)
10
-
11
- DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
12
-
13
- graph = build_graph()
14
- resp = requests.get(f"{DEFAULT_API_URL}/questions")
15
- questions = resp.json()
16
-
17
- token = os.getenv("HF_TOKEN") or os.getenv("HUGGINGFACEHUB_API_TOKEN")
18
- path = hf_hub_download(repo_id='gaia-benchmark/GAIA', filename='2023/validation/metadata.parquet', repo_type='dataset', token=token)
19
- df = pq.read_table(path).to_pandas()
20
- answer_map = dict(zip(df['task_id'], df['Final answer']))
21
-
22
- correct = 0
23
- total = 0
24
-
25
- for i, q in enumerate(questions):
26
- task_id = q['task_id']
27
- question = q['question']
28
- ground_truth = answer_map.get(task_id, "NOT FOUND")
29
-
30
- try:
31
- result = graph.invoke({"messages": [HumanMessage(content=question)]})
32
- answer = result['messages'][-1].content
33
-
34
- is_correct = answer.strip().lower() == str(ground_truth).strip().lower()
35
- if is_correct:
36
- correct += 1
37
- total += 1
38
- status = "OK" if is_correct else "FAIL"
39
- print(f"[{i+1:2d}] {status}")
40
- except Exception as e:
41
- print(f"[{i+1:2d}] ERROR: {str(e)[:30]}")
42
- total += 1
43
-
44
- print(f"\n=== TOTAL: {correct}/{total} = {correct/total*100:.0f}% ===")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
test_improvements.py DELETED
@@ -1,45 +0,0 @@
1
- import os
2
- import sys
3
- sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
4
-
5
- from langchain_core.messages import HumanMessage
6
- from agent import build_graph
7
-
8
- def test_agent():
9
- print("=" * 50)
10
- print("Testing GAIA Agent with improvements...")
11
- print("=" * 50)
12
-
13
- # Build the agent graph
14
- graph = build_graph()
15
-
16
- # Test question - simple math/reasoning
17
- test_question = "What is 15 + 27?"
18
-
19
- print(f"\nQuestion: {test_question}")
20
- print("-" * 30)
21
-
22
- messages = [HumanMessage(content=test_question)]
23
- result = graph.invoke({"messages": messages})
24
-
25
- answer = result['messages'][-1].content
26
- print(f"\nFinal Answer: {answer}")
27
- print("-" * 30)
28
-
29
- # Test another question requiring web search
30
- test_question2 = "What is the capital of France?"
31
-
32
- print(f"\nQuestion: {test_question2}")
33
- print("-" * 30)
34
-
35
- messages2 = [HumanMessage(content=test_question2)]
36
- result2 = graph.invoke({"messages": messages2})
37
-
38
- answer2 = result2['messages'][-1].content
39
- print(f"\nFinal Answer: {answer2}")
40
- print("-" * 30)
41
-
42
- print("\nTest completed successfully!")
43
-
44
- if __name__ == "__main__":
45
- test_agent()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
test_q2.py DELETED
@@ -1,40 +0,0 @@
1
- import os
2
- import requests
3
- from langchain_core.messages import HumanMessage
4
- from agent import build_graph
5
- from huggingface_hub import hf_hub_download
6
- import pyarrow.parquet as pq
7
- from dotenv import load_dotenv
8
-
9
- load_dotenv(override=True)
10
-
11
- DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
12
-
13
- # Initialize agent
14
- graph = build_graph()
15
-
16
- # Fetch questions
17
- resp = requests.get(f"{DEFAULT_API_URL}/questions")
18
- questions = resp.json()
19
-
20
- # Load ground truth
21
- token = os.getenv("HF_TOKEN") or os.getenv("HUGGINGFACEHUB_API_TOKEN")
22
- path = hf_hub_download(repo_id='gaia-benchmark/GAIA', filename='2023/validation/metadata.parquet', repo_type='dataset', token=token)
23
- df = pq.read_table(path).to_pandas()
24
- answer_map = dict(zip(df['task_id'], df['Final answer']))
25
-
26
- # Test Q2 only
27
- q = questions[1]
28
- task_id = q['task_id']
29
- question = q['question']
30
- ground_truth = answer_map.get(task_id, "NOT FOUND")
31
-
32
- print(f"Q2: {question[:80]}...")
33
- print(f"GT: {ground_truth}")
34
- print("-" * 40)
35
-
36
- result = graph.invoke({"messages": [HumanMessage(content=question)]})
37
- answer = result['messages'][-1].content
38
- print(f"Ans: {answer}")
39
- print("-" * 40)
40
- print(f"Correct: {answer.strip().lower() == str(ground_truth).strip().lower()}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
test_vision.py DELETED
@@ -1,35 +0,0 @@
1
- import os
2
- from langchain_openai import ChatOpenAI
3
- from langchain_core.messages import HumanMessage
4
- import base64
5
- from dotenv import load_dotenv
6
-
7
- load_dotenv()
8
-
9
- def test_vision():
10
- # Use a tiny 1x1 base64 image for testing
11
- tiny_img = "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mP8/5+hHgAHggJ/PchI7wAAAABJRU5ErkJggg=="
12
- msg = HumanMessage(content=[{"type": "text", "text": "what is in this image?"}, {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{tiny_img}"}}])
13
-
14
- models = [
15
- {"name": "OpenRouter-Gemini-2.0", "provider": "openai", "model": "google/gemini-2.0-flash-001", "base_url": "https://openrouter.ai/api/v1", "key": "OPENROUTER_API_KEY"},
16
- {"name": "NVIDIA-Llama-3.2", "provider": "openai", "model": "nvidia/llama-3.2-nv-vision-70b", "base_url": "https://integrate.api.nvidia.com/v1", "key": "NVIDIA_API_KEY"},
17
- {"name": "NVIDIA-Qwen-VL", "provider": "openai", "model": "nvidia/qwen-vl-max", "base_url": "https://integrate.api.nvidia.com/v1", "key": "NVIDIA_API_KEY"},
18
- {"name": "Vercel-Vision", "provider": "openai", "model": "gpt-4o-mini", "base_url": "https://gateway.ai.vercel.com/v1", "key": "VERCEL_API_KEY"},
19
- ]
20
-
21
- for m in models:
22
- key = os.getenv(m['key'])
23
- if not key:
24
- print(f"Skip {m['name']} (no key)")
25
- continue
26
- try:
27
- print(f"Testing {m['name']} ({m['model']})...")
28
- llm = ChatOpenAI(model=m['model'], openai_api_key=key, openai_api_base=m['base_url'], temperature=0)
29
- res = llm.invoke([msg])
30
- print(f"Success: {res.content}")
31
- except Exception as e:
32
- print(f"Fail: {e}")
33
-
34
- if __name__ == "__main__":
35
- test_vision()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
test_vision_v2.py DELETED
@@ -1,34 +0,0 @@
1
- import os
2
- from langchain_openai import ChatOpenAI
3
- from langchain_core.messages import HumanMessage
4
- import base64
5
- from dotenv import load_dotenv
6
-
7
- load_dotenv(override=True)
8
-
9
- def test_vision():
10
- # Use a tiny 1x1 base64 image for testing
11
- tiny_img = "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mP8/5+hHgAHggJ/PchI7wAAAABJRU5ErkJggg=="
12
- msg = HumanMessage(content=[{"type": "text", "text": "is this image red, green, or blue? answer with one word."}, {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{tiny_img}"}}])
13
-
14
- models = [
15
- {"name": "NVIDIA-Llama-3.2-11b", "provider": "openai", "model": "meta/llama-3.2-11b-vision-instruct", "base_url": "https://integrate.api.nvidia.com/v1", "key": "NVIDIA_API_KEY"},
16
- {"name": "NVIDIA-Llama-3.2-90b", "provider": "openai", "model": "meta/llama-3.2-90b-vision-instruct", "base_url": "https://integrate.api.nvidia.com/v1", "key": "NVIDIA_API_KEY"},
17
- {"name": "NVIDIA-Mistral-Vision", "provider": "openai", "model": "mistralai/pixtral-12b", "base_url": "https://integrate.api.nvidia.com/v1", "key": "NVIDIA_API_KEY"},
18
- ]
19
-
20
- for m in models:
21
- key = os.getenv(m['key'])
22
- if not key:
23
- print(f"Skip {m['name']} (no key)")
24
- continue
25
- try:
26
- print(f"Testing {m['name']} ({m['model']})...")
27
- llm = ChatOpenAI(model=m['model'], openai_api_key=key, openai_api_base=m['base_url'], temperature=0)
28
- res = llm.invoke([msg])
29
- print(f"Success: {res.content}")
30
- except Exception as e:
31
- print(f"Fail: {e}")
32
-
33
- if __name__ == "__main__":
34
- test_vision()