Paperbag committed on
Commit
04e7101
·
1 Parent(s): 98022e3

Add vision-related tests and tools for image description and analysis

Browse files

- Introduced _test_network.py, _test_other_vision.py, _test_vision.py, _test_vqa.py, _test_wiki.py, _test_zen_vision.py, and _test_zen_vision_all.py for testing various vision models and APIs.
- Implemented describe_image tool in tools/vision/describe_image.py to generate descriptions for images using a local vision model.
- Updated agent.py and app.py to include describe_image tool in the workflow.
- Modified gaia_results.csv and gaia_results.json to reflect correct answers and improve data accuracy.
- Enhanced web search functionality in tools/web/wiki.py to fetch Wikipedia snippets using the MediaWiki API.
- Improved error handling and logging across various modules for better debugging and user feedback.

__pycache__/agent.cpython-39.pyc CHANGED
Binary files a/__pycache__/agent.cpython-39.pyc and b/__pycache__/agent.cpython-39.pyc differ
 
_test_blip.py ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""Time two local image-captioning pipelines on one GAIA validation image."""
import os
import time
from dotenv import load_dotenv
load_dotenv()  # read .env before HF_TOKEN is looked up below
from PIL import Image
from huggingface_hub import hf_hub_download

IMAGE_NAME = 'cca530fc-4052-43b2-b130-b30968d8aa44.png'
hf_token = os.getenv('HF_TOKEN')
image_path = hf_hub_download(
    repo_id='gaia-benchmark/GAIA',
    filename=f'2023/validation/{IMAGE_NAME}',
    repo_type='dataset',
    token=hf_token,
)

picture = Image.open(image_path)
print(f'Image size: {picture.size}, mode: {picture.mode}')

from transformers import pipeline


def _caption_and_time(model_id):
    """Load an image-to-text pipeline for *model_id*, caption the image, print timings.

    Any exception from loading or inference is reported on stdout rather
    than propagated, so the next model is still attempted.
    """
    started = time.time()
    try:
        captioner = pipeline("image-to-text", model=model_id)
        print(f'Model loaded in {time.time()-started:.1f}s')
        started = time.time()
        caption = captioner(picture)
        print(f'Result in {time.time()-started:.1f}s: {caption}')
    except Exception as exc:
        print(f'Error: {type(exc).__name__}: {exc}')


# First candidate: the base BLIP captioning checkpoint.
print('Loading BLIP image captioning...')
_caption_and_time("Salesforce/blip-image-captioning-base")

# Second candidate: the ViT-GPT2 captioner.
print('\nTrying tiny model...')
_caption_and_time("nlpconnect/vit-gpt2-image-captioning")
_test_client_attrs.py ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""Print the public InferenceClient attributes whose names hint at images or endpoints."""
import os
from dotenv import load_dotenv
load_dotenv()
from huggingface_hub import InferenceClient

client = InferenceClient(token=os.getenv('HF_TOKEN'))

# Substrings (matched case-insensitively) that make an attribute name interesting.
KEYWORDS = ('image', 'vision', 'base', 'url', 'endpoint')

public_names = [name for name in dir(client) if not name.startswith('_')]
print('Public methods:')
for name in public_names:
    lowered = name.lower()
    if any(keyword in lowered for keyword in KEYWORDS):
        print(f' {name}')
_test_hf_chat.py ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""Send one GAIA validation image to a vision chat model via the hf-inference provider."""
import os
import base64
from dotenv import load_dotenv
load_dotenv()
from huggingface_hub import InferenceClient, hf_hub_download

hf_token = os.getenv('HF_TOKEN')
client = InferenceClient(token=hf_token, provider='hf-inference')

image_name = 'cca530fc-4052-43b2-b130-b30968d8aa44.png'
image_path = hf_hub_download(
    repo_id='gaia-benchmark/GAIA',
    filename=f'2023/validation/{image_name}',
    repo_type='dataset',
    token=hf_token,
)

with open(image_path, 'rb') as handle:
    raw = handle.read()
# The image travels inline as a base64 data URI inside the chat message.
data_uri = 'data:image/png;base64,' + base64.b64encode(raw).decode('utf-8')

user_message = {
    'role': 'user',
    'content': [
        {'type': 'text', 'text': 'Describe this image in detail.'},
        {'type': 'image_url', 'image_url': {'url': data_uri}},
    ],
}

print('Trying chat_completion with hf-inference provider...')
try:
    reply = client.chat_completion(
        model='meta-llama/Llama-3.2-11B-Vision-Instruct',
        messages=[user_message],
        max_tokens=1024,
        temperature=0,
    )
    print('SUCCESS!')
    print(f'Response: {reply.choices[0].message.content[:500]}')
except Exception as exc:
    print(f'Error: {type(exc).__name__}: {exc}')
    import traceback
    traceback.print_exc()
_test_hf_constants.py ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
"""Print the Hub endpoint settings seen by huggingface_hub and by a default client."""
import os
from huggingface_hub import constants, InferenceClient

# NOTE(review): confirm `HUGGINGFACE_HUB_ENDPOINT` exists in the installed
# huggingface_hub version; this access is kept exactly as in the original.
print(f'HUGGINGFACE_HUB_ENDPOINT: {constants.HUGGINGFACE_HUB_ENDPOINT}')

# (label shown, environment variable read)
for label, env_name in (
    ('INFERENCE_ENDPOINT', 'HF_INFERENCE_ENDPOINT'),
    ('HF_ENDPOINT', 'HF_ENDPOINT'),
):
    print(f'{label} env: {os.getenv(env_name, "not set")}')

default_client = InferenceClient()
print(f'Client base_url: {default_client.base_url}')
_test_hf_endpoints.py ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""POST one chat-completions payload to several candidate Hugging Face endpoint URLs."""
import os
import base64
import requests
import json
from dotenv import load_dotenv
load_dotenv()

hf_token = os.getenv('HF_TOKEN')

from huggingface_hub import hf_hub_download
image_name = 'cca530fc-4052-43b2-b130-b30968d8aa44.png'
image_path = hf_hub_download(
    repo_id='gaia-benchmark/GAIA',
    filename=f'2023/validation/{image_name}',
    repo_type='dataset',
    token=hf_token,
)

with open(image_path, 'rb') as handle:
    raw = handle.read()
data_uri = 'data:image/png;base64,' + base64.b64encode(raw).decode('utf-8')

# The same request body is sent to every URL, including the ones that name other models.
user_content = [
    {"type": "text", "text": "Describe this image briefly."},
    {"type": "image_url", "image_url": {"url": data_uri}},
]
body = {
    "model": "meta-llama/Llama-3.2-11B-Vision-Instruct",
    "messages": [{"role": "user", "content": user_content}],
    "max_tokens": 512,
    "temperature": 0,
}

candidate_urls = (
    "https://router.huggingface.co/hf-inference/models/meta-llama/Llama-3.2-11B-Vision-Instruct/v1/chat/completions",
    "https://router.huggingface.co/hf-inference/models/meta-llama/Llama-3.2-11B-Vision-Instruct",
    "https://api-inference.huggingface.co/models/meta-llama/Llama-3.2-11B-Vision-Instruct",
    "https://router.huggingface.co/hf-inference/models/Qwen/Qwen2-VL-7B-Instruct/v1/chat/completions",
    "https://router.huggingface.co/hf-inference/models/microsoft/Florence-2-large/v1/chat/completions",
)

auth_headers = {
    "Authorization": f"Bearer {hf_token}",
    "Content-Type": "application/json",
}

for url in candidate_urls:
    print(f'\n--- Trying: {url}')
    try:
        reply = requests.post(url, headers=auth_headers, json=body, timeout=30)
    except Exception as exc:
        print(f'Exception: {exc}')
        continue
    print(f'Status: {reply.status_code}')
    # Successful bodies get a slightly longer preview than error bodies.
    if reply.status_code == 200:
        print(f'Response: {reply.text[:300]}')
    else:
        print(f'Error: {reply.text[:200]}')
_test_hf_final.py ADDED
@@ -0,0 +1,65 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""Try four InferenceClient call styles against one GAIA validation image."""
import os
from dotenv import load_dotenv
load_dotenv()
from huggingface_hub import InferenceClient, hf_hub_download

hf_token = os.getenv('HF_TOKEN')
client = InferenceClient(token=hf_token)

image_name = 'cca530fc-4052-43b2-b130-b30968d8aa44.png'
image_path = hf_hub_download(
    repo_id='gaia-benchmark/GAIA',
    filename=f'2023/validation/{image_name}',
    repo_type='dataset',
    token=hf_token,
)

with open(image_path, 'rb') as handle:
    img_bytes = handle.read()

VISION_MODEL = 'meta-llama/Llama-3.2-11B-Vision-Instruct'


def _attempt(call, render):
    """Run *call*, print ``render(result)`` as the result, or print the error.

    Both the call and the rendering happen inside the try, so a result that
    cannot be rendered is reported the same way as a failed request.
    """
    try:
        print(f'Result: {render(call())}')
    except Exception as exc:
        print(f'Error: {type(exc).__name__}: {exc}')


# Method 1: image_to_text with no model argument.
print('=== image_to_text without model ===')
_attempt(lambda: client.image_to_text(img_bytes), lambda out: out[:300])

# Method 2: image_to_text with the model given as a full api-inference URL.
print('\n=== image_to_text with explicit URL ===')
_attempt(
    lambda: client.image_to_text(
        img_bytes,
        model='https://api-inference.huggingface.co/models/' + VISION_MODEL,
    ),
    lambda out: out[:300],
)

# Method 3: visual_question_answering with no model argument.
print('\n=== visual_question_answering without model ===')
_attempt(
    lambda: client.visual_question_answering(
        img_bytes,
        question='Describe this image in detail.',
    ),
    lambda out: out,
)

# Method 4: chat_completion with the image inlined as a base64 data URI.
print('\n=== chat_completion with provider and model as URL ===')
import base64
data_uri = 'data:image/png;base64,' + base64.b64encode(img_bytes).decode('utf-8')
chat_messages = [{
    'role': 'user',
    'content': [
        {'type': 'text', 'text': 'Describe this image in detail.'},
        {'type': 'image_url', 'image_url': {'url': data_uri}},
    ],
}]
_attempt(
    lambda: client.chat_completion(
        model=VISION_MODEL,
        messages=chat_messages,
        max_tokens=1024,
        temperature=0,
    ),
    lambda out: out.choices[0].message.content[:300],
)
_test_hf_inference_client.py ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""Exercise InferenceClient.image_to_text and visual_question_answering on one image."""
import os
import base64
from dotenv import load_dotenv
load_dotenv()

hf_token = os.getenv('HF_TOKEN')

from huggingface_hub import hf_hub_download
from huggingface_hub import InferenceClient

client = InferenceClient(token=hf_token)

image_name = 'cca530fc-4052-43b2-b130-b30968d8aa44.png'
image_path = hf_hub_download(
    repo_id='gaia-benchmark/GAIA',
    filename=f'2023/validation/{image_name}',
    repo_type='dataset',
    token=hf_token,
)

with open(image_path, 'rb') as handle:
    img_bytes = handle.read()


def _report(call, render):
    """Print ``render(call())`` as the result, or the exception if anything fails."""
    try:
        print(f'Result: {render(call())}')
    except Exception as exc:
        print(f'Exception: {type(exc).__name__}: {exc}')


def _first_300(value):
    """Return the leading slice used for previewing image_to_text output."""
    return value[:300]


# Method 1: image_to_text against the Llama vision model.
print("=== image_to_text ===")
_report(
    lambda: client.image_to_text(img_bytes, model="meta-llama/Llama-3.2-11B-Vision-Instruct"),
    _first_300,
)

# Method 2: image_to_text against BLIP-2.
print("\n=== image_to_text (blip2) ===")
_report(
    lambda: client.image_to_text(img_bytes, model="Salesforce/blip2-flan-t5-xl"),
    _first_300,
)

# Method 3: visual_question_answering (document_question_answering is for documents).
print("\n=== visual_question_answering ===")
_report(
    lambda: client.visual_question_answering(
        img_bytes,
        question="Describe this image in detail. Include any visible text, numbers, labels, diagrams, chess piece positions.",
        model="meta-llama/Llama-3.2-11B-Vision-Instruct",
    ),
    lambda value: value,
)
_test_hf_methods.py ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""Probe ways to reach a hosted vision model: Hub metadata, raw router POSTs, DNS lookup."""
import os, base64, requests
from dotenv import load_dotenv
load_dotenv()

token = os.getenv('HF_TOKEN')

from huggingface_hub import hf_hub_download, HfApi
fn = 'cca530fc-4052-43b2-b130-b30968d8aa44.png'
path = hf_hub_download(repo_id='gaia-benchmark/GAIA', filename=f'2023/validation/{fn}', repo_type='dataset', token=token)

with open(path, 'rb') as f:
    img_bytes = f.read()

model_id = 'meta-llama/Llama-3.2-11B-Vision-Instruct'

# Method: fetch the model's Hub metadata and print inference-related fields.
api = HfApi(token=token)
try:
    # HfApi names this lookup `model_info`; calling `get_model_info` only
    # raises AttributeError, so the probe never reached the Hub.
    info = api.model_info(model_id)
    # Print only attributes whose names suggest inference/pipeline/widget data.
    for attr in dir(info):
        if 'infer' in attr.lower() or 'pipeline' in attr.lower() or 'widget' in attr.lower():
            print(f'{attr}: {getattr(info, attr)}')
except Exception as e:
    print(f'get_model_info error: {e}')

# Method: send the raw image bytes to candidate router URLs for the model.
print('\n=== Trying raw image to text task ===')

urls_to_try = [
    ('router huggingface.co direct', f'https://router.huggingface.co/hf-inference/models/{model_id}'),
    ('router with tasks prefix', f'https://router.huggingface.co/hf-inference/models/{model_id}/image-to-text'),
]

for name, url in urls_to_try:
    print(f'\n--- {name} ---')
    try:
        resp = requests.post(
            url,
            headers={"Authorization": f"Bearer {token}", "Content-Type": "application/octet-stream"},
            data=img_bytes,
            timeout=90
        )
        print(f'Status: {resp.status_code}')
        print(f'Response: {resp.text[:300]}')
    except Exception as e:
        print(f'Exception: {e}')

# Method: check whether api-inference.huggingface.co resolves, using nslookup.
print('\n=== Trying api-inference.huggingface.co with nslookup ===')
import subprocess
try:
    result = subprocess.run(['nslookup', 'api-inference.huggingface.co'], capture_output=True, text=True, timeout=10)
    print(f'nslookup stdout: {result.stdout}')
    print(f'nslookup stderr: {result.stderr}')
except Exception as e:
    print(f'nslookup failed: {e}')
_test_hf_providers.py ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""Call image_to_text on one GAIA image through each candidate inference provider."""
import os
from dotenv import load_dotenv
load_dotenv()
from huggingface_hub import InferenceClient
from huggingface_hub import hf_hub_download

token = os.getenv('HF_TOKEN')

PROVIDERS = ['hf-inference', 'together', 'fal-ai', 'replicate', 'novita', 'nebius']
MODEL_ID = 'meta-llama/Llama-3.2-11B-Vision-Instruct'

# The image is the same for every provider, so fetch and read it once up
# front. A download failure now surfaces directly instead of being reported
# once per provider as if the provider had failed.
fn = 'cca530fc-4052-43b2-b130-b30968d8aa44.png'
path = hf_hub_download(repo_id='gaia-benchmark/GAIA', filename=f'2023/validation/{fn}', repo_type='dataset', token=token)
with open(path, 'rb') as f:
    img_bytes = f.read()

# Try with explicit provider
for provider_name in PROVIDERS:
    try:
        client = InferenceClient(token=token, provider=provider_name)
        result = client.image_to_text(img_bytes, model=MODEL_ID)
        print(f'Provider {provider_name}: OK - {result[:200]}')
    except Exception as e:
        # Keep the output to one line per provider: truncate long messages.
        err_str = str(e)[:100]
        print(f'Provider {provider_name}: {type(e).__name__} - {err_str}')
_test_hf_tasks.py ADDED
@@ -0,0 +1,68 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""POST one GAIA image to the hf-inference router in several request shapes."""
import os
import base64
import requests
from dotenv import load_dotenv
load_dotenv()

hf_token = os.getenv('HF_TOKEN')

from huggingface_hub import hf_hub_download
image_name = 'cca530fc-4052-43b2-b130-b30968d8aa44.png'
image_path = hf_hub_download(
    repo_id='gaia-benchmark/GAIA',
    filename=f'2023/validation/{image_name}',
    repo_type='dataset',
    token=hf_token,
)

with open(image_path, 'rb') as handle:
    img_bytes = handle.read()

b64 = base64.b64encode(img_bytes).decode('utf-8')

auth = {"Authorization": f"Bearer {hf_token}"}
ROUTER_MODELS = "https://router.huggingface.co/hf-inference/models/"
LLAMA_URL = ROUTER_MODELS + "meta-llama/Llama-3.2-11B-Vision-Instruct"


def _post_and_print(url, *, indent='', preview=300, **request_kwargs):
    """POST to *url* and print the status plus the first *preview* body characters.

    Request errors are printed (with the same *indent*) instead of raised.
    """
    try:
        reply = requests.post(url, **request_kwargs)
        print(f'{indent}Status: {reply.status_code}')
        print(f'{indent}Response: {reply.text[:preview]}')
    except Exception as exc:
        print(f'{indent}Exception: {exc}')


# Option 1: raw image bytes with the x-use-cache header set to "false".
print("=== Option 1: router.huggingface.co with raw image ===")
_post_and_print(
    LLAMA_URL,
    headers={**auth, "Content-Type": "application/octet-stream", "x-use-cache": "false"},
    data=img_bytes,
    timeout=60,
)

# Option 2: JSON body carrying the image as a data URI under "inputs".
print("\n=== Option 2: router with image-to-text task ===")
_post_and_print(
    LLAMA_URL,
    headers={**auth, "Content-Type": "application/json"},
    json={"inputs": f"data:image/png;base64,{b64}", "parameters": {"max_new_tokens": 500}},
    timeout=60,
)

# Option 3: raw image bytes against more than one model.
print("\n=== Option 3: try alternative model ===")
for model_name in (
    "meta-llama/Llama-3.2-11B-Vision-Instruct",
    "Salesforce/blip2-flan-t5-xl",
):
    print(f'Trying {model_name}')
    _post_and_print(
        f"https://router.huggingface.co/hf-inference/models/{model_name}",
        indent=' ',
        preview=200,
        headers={**auth, "Content-Type": "application/octet-stream"},
        data=img_bytes,
        timeout=30,
    )
_test_hf_tasks2.py ADDED
@@ -0,0 +1,72 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""More router probes: raw image POSTs, DNS resolution checks, and a text-only request."""
import os, base64, requests
from dotenv import load_dotenv
load_dotenv()

token = os.getenv('HF_TOKEN')
from huggingface_hub import hf_hub_download
fn = 'cca530fc-4052-43b2-b130-b30968d8aa44.png'
path = hf_hub_download(repo_id='gaia-benchmark/GAIA', filename=f'2023/validation/{fn}', repo_type='dataset', token=token)

with open(path, 'rb') as f:
    img_bytes = f.read()

headers = {"Authorization": f"Bearer {token}"}

# Method 1: POST the raw image bytes to the model endpoint on the router.
print("=== Method 1: Direct model endpoint via router ===")
for url in [
    "https://router.huggingface.co/hf-inference/models/meta-llama/Llama-3.2-11B-Vision-Instruct",
    "https://router.huggingface.co/hf-inference/models/meta-llama/Llama-3.2-11B-Vision-Instruct/image-to-text",
]:
    try:
        resp = requests.post(
            url,
            headers={**headers, "Content-Type": "application/octet-stream"},
            data=img_bytes,
            timeout=60
        )
        print(f' {url}: Status {resp.status_code}')
        if resp.status_code == 200:
            print(f' OK: {resp.text[:500]}')
        else:
            print(f' Error: {resp.text[:200]}')
    except Exception as e:
        print(f' Exception: {e}')

# Method 2: check whether api-inference.huggingface.co resolves at all.
print("\n=== Method 2: Direct HF endpoint via HTTPS ===")
import socket
try:
    ips = socket.getaddrinfo('api-inference.huggingface.co', 443)
    print(f' api-inference.huggingface.co resolved to: {ips}')
except Exception as e:
    print(f' api-inference.huggingface.co DNS: {e}')
    # Retry restricted to IPv4 in case only that address family resolves.
    try:
        ips = socket.getaddrinfo('api-inference.huggingface.co', 443, socket.AF_INET)
        print(f' api-inference.huggingface.co (IPv4) resolved to: {ips}')
    except Exception as e2:
        print(f' api-inference.huggingface.co (IPv4) DNS: {e2}')

# Method 3: text-only request against the same model endpoint.
# The previous version also built a `payload` dict holding the base64-encoded
# image, but never sent it; that dead encoding step has been removed. The
# request below is exactly what was sent before.
print("\n=== Method 3: Text generation inference via router ===")
import json
try:
    resp = requests.post(
        "https://router.huggingface.co/hf-inference/models/meta-llama/Llama-3.2-11B-Vision-Instruct",
        headers={**headers, "Content-Type": "application/json"},
        json={"inputs": "Describe this image", "parameters": {"max_new_tokens": 500}},
        timeout=30
    )
    print(f' Status: {resp.status_code}')
    print(f' Response: {resp.text[:200]}')
except Exception as e:
    print(f' Exception: {e}')
_test_hf_vision.py ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""Send one GAIA image to a vision chat model and dump the full response object."""
import os
import base64
from dotenv import load_dotenv
load_dotenv()
from huggingface_hub import InferenceClient

hf_token = os.getenv('HF_TOKEN')
client = InferenceClient(token=hf_token)

image_name = 'cca530fc-4052-43b2-b130-b30968d8aa44.png'
from huggingface_hub import hf_hub_download
image_path = hf_hub_download(
    repo_id='gaia-benchmark/GAIA',
    filename=f'2023/validation/{image_name}',
    repo_type='dataset',
    token=hf_token,
)

with open(image_path, 'rb') as handle:
    raw = handle.read()
encoded = base64.b64encode(raw).decode('utf-8')
data_uri = f'data:image/png;base64,{encoded}'
print(f'Image size: {len(raw)} bytes, b64 len: {len(encoded)}')

chat_messages = [{
    'role': 'user',
    'content': [
        {'type': 'text', 'text': 'Describe this image in detail.'},
        {'type': 'image_url', 'image_url': {'url': data_uri}},
    ],
}]

try:
    reply = client.chat_completion(
        model='meta-llama/Llama-3.2-11B-Vision-Instruct',
        messages=chat_messages,
        max_tokens=512,
        temperature=0,
    )
    print(f'Response: {reply}')
    print(f'Choices: {reply.choices}')
    # Only read the first choice when the list is non-empty.
    if reply.choices:
        print(f'Content: {reply.choices[0].message.content}')
except Exception as exc:
    print(f'Exception type: {type(exc).__name__}')
    print(f'Exception: {exc}')
    import traceback
    traceback.print_exc()
_test_images.py ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
"""List the scoring-API questions whose attached file is an image."""

QUESTIONS_URL = 'https://agents-course-unit4-scoring.hf.space/questions'
IMAGE_EXTS = ('.png', '.jpg', '.jpeg')


def image_question_lines(items, image_exts=IMAGE_EXTS):
    """Return one summary line per item whose ``file_name`` ends with an image extension.

    Args:
        items: iterable of dicts as returned by the questions endpoint.
        image_exts: tuple of lowercase suffixes that count as images.

    Returns:
        A list of ``'<task_id>: file=<file_name> question=<first 80 chars>'``
        strings, in input order. A missing ``task_id`` is shown as ``'?'``.
    """
    lines = []
    for item in items:
        # `or ''` covers both a missing key and an explicit null value.
        file_name = item.get('file_name', '') or ''
        if not file_name.lower().endswith(image_exts):
            continue
        task_id = item.get('task_id', '?')
        # Same null guard as file_name: a null question must not crash the slice.
        question = (item.get('question', '') or '')[:80]
        lines.append(f'{task_id}: file={file_name} question={question}')
    return lines


def main():
    """Fetch the question list and print the image-backed entries."""
    # Imported here so the pure helper above stays importable without requests.
    import requests

    r = requests.get(QUESTIONS_URL, timeout=15)
    # Fail loudly on an HTTP error instead of trying to parse an error page as JSON.
    r.raise_for_status()
    for line in image_question_lines(r.json()):
        print(line)


if __name__ == '__main__':
    main()
_test_models.py ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""Compare load and inference times of several local captioning and VQA pipelines."""
import os
import time
from dotenv import load_dotenv
load_dotenv()
from PIL import Image
from huggingface_hub import hf_hub_download
from transformers import pipeline

image_name = 'cca530fc-4052-43b2-b130-b30968d8aa44.png'
hf_token = os.getenv('HF_TOKEN')
image_path = hf_hub_download(
    repo_id='gaia-benchmark/GAIA',
    filename=f'2023/validation/{image_name}',
    repo_type='dataset',
    token=hf_token,
)
picture = Image.open(image_path)


def _time_captioner(model_id):
    """Load an image-to-text pipeline, caption the image with max_new_tokens=30, print timings.

    Failures are printed and swallowed so the remaining models still run.
    """
    started = time.time()
    try:
        captioner = pipeline("image-to-text", model=model_id)
        print(f'Load: {time.time()-started:.1f}s')
        started = time.time()
        caption = captioner(picture, max_new_tokens=30)
        print(f'Inference: {time.time()-started:.1f}s')
        print(f'Result: {caption}')
    except Exception as exc:
        print(f'Error: {exc}')


# (banner printed before the run, checkpoint to load)
CAPTIONERS = (
    ('=== BLIP with max_new_tokens=30 ===', "Salesforce/blip-image-captioning-base"),
    ('\n=== BLIP-large ===', "Salesforce/blip-image-captioning-large"),
    ('\n=== vit-gpt2 with max_new_tokens=30 ===', "nlpconnect/vit-gpt2-image-captioning"),
)
for banner, checkpoint in CAPTIONERS:
    print(banner)
    _time_captioner(checkpoint)

# Ask a VQA model several questions, keeping only the top answer for each.
print('\n=== VQA multiple questions ===')
started = time.time()
try:
    answerer = pipeline("visual-question-answering", model="dandelin/vilt-b32-finetuned-vqa")
    print(f'Load: {time.time()-started:.1f}s')
    questions = ('What is in this image?', 'What color is the background?', 'What shapes do you see?')
    for question in questions:
        started = time.time()
        answer = answerer(picture, question, top_k=1)
        print(f'Q: {question} -> {answer} ({time.time()-started:.1f}s)')
except Exception as exc:
    print(f'Error: {exc}')
_test_network.py ADDED
@@ -0,0 +1,84 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""Network diagnostics: DNS/ping checks for Hugging Face hosts and calls to a chess-recognizer Space."""
import os, base64, http.client, ssl
import platform
import socket
import subprocess

import requests
from dotenv import load_dotenv
load_dotenv()

token = os.getenv('HF_TOKEN')
from huggingface_hub import hf_hub_download

fn = 'cca530fc-4052-43b2-b130-b30968d8aa44.png'
path = hf_hub_download(repo_id='gaia-benchmark/GAIA', filename=f'2023/validation/{fn}', repo_type='dataset', token=token)
with open(path, 'rb') as f:
    img_bytes = f.read()

# Send a single ping to trigger DNS resolution for the inference host.
# The packet-count flag differs by platform: `-n` on Windows, `-c` elsewhere
# (on Linux `-n` is not a count flag, so the original command did not send
# exactly one packet there).
ping_count_flag = '-n' if platform.system() == 'Windows' else '-c'
try:
    result = subprocess.run(['ping', ping_count_flag, '1', 'api-inference.huggingface.co'], capture_output=True, text=True, timeout=10)
    print(f'ping stdout:\n{result.stdout}')
    print(f'ping stderr:\n{result.stderr}')
except Exception as e:
    print(f'ping failed: {e}')

print('\n=== Attempting direct connection to known HF IPs ===')
# Resolve the hosts that are expected to be reachable.
for domain in ['router.huggingface.co', 'huggingface.co']:
    try:
        ip = socket.gethostbyname(domain)
        print(f'{domain} -> {ip}')
    except OSError:
        # socket.gaierror is an OSError; a bare `except:` would also have
        # swallowed KeyboardInterrupt.
        print(f'{domain}: DNS failed')

# Fetch the model page to see what the Hub returns for this token.
print('\n=== Testing model endpoint via HTTP GET ===')
try:
    # `requests` is now imported at the top of the script; previously only
    # the alias `req` was bound (and only further down), so this call always
    # failed with NameError.
    resp = requests.get(
        'https://huggingface.co/meta-llama/Llama-3.2-11B-Vision-Instruct',
        headers={'Authorization': f'Bearer {token}'},
        timeout=15,
        allow_redirects=True
    )
    print(f'Status: {resp.status_code}')
except Exception as e:
    print(f'Error: {e}')

# Try the HF Spaces API for chess recognition (multipart upload).
print('\n=== Trying HF Spaces chess recognizer ===')
try:
    resp = requests.post(
        'https://salominavina-chessboard-recognizer.hf.space/api/predict',
        headers={'Authorization': f'Bearer {token}'},
        files={'image': img_bytes},
        timeout=30
    )
    print(f'Status: {resp.status_code}, Response: {resp.text[:300]}')
except Exception as e:
    print(f'Error: {e}')

# Try the same Space with a JSON body carrying the base64-encoded image.
try:
    resp = requests.post(
        'https://salominavina-chessboard-recognizer.hf.space/run/predict',
        json={
            'data': [base64.b64encode(img_bytes).decode('utf-8')]
        },
        timeout=30
    )
    print(f'Gradio API Status: {resp.status_code}, Response: {resp.text[:300]}')
except Exception as e:
    print(f'Gradio API Error: {e}')
_test_other_vision.py ADDED
@@ -0,0 +1,108 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""Try describing one GAIA image through OpenRouter and the OpenCode Zen chat endpoints."""
import os
import base64
import requests
import json
from dotenv import load_dotenv
load_dotenv()

hf_token = os.getenv('HF_TOKEN')

from huggingface_hub import hf_hub_download
image_name = 'cca530fc-4052-43b2-b130-b30968d8aa44.png'
image_path = hf_hub_download(
    repo_id='gaia-benchmark/GAIA',
    filename=f'2023/validation/{image_name}',
    repo_type='dataset',
    token=hf_token,
)

with open(image_path, 'rb') as handle:
    raw = handle.read()
data_uri = 'data:image/png;base64,' + base64.b64encode(raw).decode('utf-8')


def _vision_request(model_name, prompt, max_tokens):
    """Build a chat-completions body with one user turn: *prompt* plus the image."""
    return {
        'model': model_name,
        'messages': [{
            'role': 'user',
            'content': [
                {'type': 'text', 'text': prompt},
                {'type': 'image_url', 'image_url': {'url': data_uri}},
            ],
        }],
        'max_tokens': max_tokens,
    }


def _bearer_json(api_key):
    """Return the Authorization and Content-Type headers used for both providers."""
    return {
        'Authorization': f'Bearer {api_key}',
        'Content-Type': 'application/json',
    }


# Report whether a Zen key is present in the environment (it is only reported here).
zen_key = os.getenv('ZEN_API_KEY') or os.getenv('OPENCODE_ZEN_API_KEY')
print(f'ZEN_KEY present: {bool(zen_key)}')

# Candidate Zen model names; listed for reference, not iterated below.
zen_models = [
    'gpt-4o-mini',
    'gpt-4o',
    'claude-3-5-sonnet-latest',
    'gemini-2.0-flash-exp',
    'qwen-vl-plus',
]

openrouter_key = os.getenv('OPENROUTER_API_KEY')
print(f'OpenRouter key present: {bool(openrouter_key)}')

if openrouter_key:
    print('\n=== Trying OpenRouter vision models ===')
    detailed_prompt = 'Describe this image in detail. Include any visible text, numbers, labels, diagrams, chess piece positions, charts, graphs, or specific visual elements. Be precise and thorough.'
    for candidate in ('openai/gpt-4o-mini', 'google/gemini-2.0-flash-exp:free', 'qwen/qwen-vl-plus:free'):
        body = _vision_request(candidate, detailed_prompt, 1024)
        print(f' Trying {candidate}...')
        try:
            reply = requests.post(
                'https://openrouter.ai/api/v1/chat/completions',
                headers=_bearer_json(openrouter_key),
                json=body,
                timeout=120,
            )
            if reply.status_code != 200:
                print(f' Status {reply.status_code}: {reply.text[:150]}')
            else:
                text = reply.json()['choices'][0]['message']['content']
                print(f' SUCCESS! Response: {text[:500]}')
        except Exception as exc:
            print(f' Exception: {exc}')

# Look for a Zen API key in the opencode auth file and try one model with it.
print('\n=== Checking opencode_zen models ===')
auth_file = os.path.expanduser('~/.local/share/opencode/auth.json')
if os.path.exists(auth_file):
    with open(auth_file) as handle:
        auth = json.load(handle)
    zen_api_key = auth.get('providers', {}).get('opencode_zen', {}).get('api_key', '')
    print(f'ZEN key from auth.json: {bool(zen_api_key)}')
    if zen_api_key:
        body = _vision_request('deepseek-v4-flash-free', 'Describe this image briefly.', 512)
        try:
            reply = requests.post(
                'https://opencode.ai/zen/v1/chat/completions',
                headers=_bearer_json(zen_api_key),
                json=body,
                timeout=60,
            )
            print(f' deepseek-v4-flash-free: Status {reply.status_code}')
            if reply.status_code != 200:
                print(f' Error: {reply.text[:200]}')
            else:
                print(f' Response: {reply.json()["choices"][0]["message"]["content"][:300]}')
        except Exception as exc:
            print(f' Exception: {exc}')
_test_vision.py ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Smoke-check the describe_image tool on one GAIA validation image."""
import os
from dotenv import load_dotenv

# load_dotenv() deliberately runs before the remaining imports, matching the
# original ordering (presumably so they see values from .env -- unverified).
load_dotenv()
from tools.vision.describe_image import describe_image
from huggingface_hub import hf_hub_download

hf_token = os.getenv('HF_TOKEN')
image_name = 'cca530fc-4052-43b2-b130-b30968d8aa44.png'

# Fetch the image from the GAIA dataset repo; the local file path is returned.
local_path = hf_hub_download(
    repo_id='gaia-benchmark/GAIA',
    filename='2023/validation/{}'.format(image_name),
    repo_type='dataset',
    token=hf_token,
)
print('Image path: {}'.format(local_path))

# The tool is called through .invoke() with a dict carrying the 'path' key.
description = describe_image.invoke({'path': local_path})
print('Result length: {}'.format(len(description)))
# Only the first 1200 characters are echoed to keep console output short.
print('Result:\n{}'.format(description[:1200]))
_test_vqa.py ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Try two local Hugging Face pipelines on one GAIA validation image."""
import os, time
from dotenv import load_dotenv
load_dotenv()
from PIL import Image
from huggingface_hub import hf_hub_download
from transformers import pipeline

image_name = 'cca530fc-4052-43b2-b130-b30968d8aa44.png'
hf_token = os.getenv('HF_TOKEN')
local_path = hf_hub_download(
    repo_id='gaia-benchmark/GAIA',
    filename=f'2023/validation/{image_name}',
    repo_type='dataset',
    token=hf_token,
)

picture = Image.open(local_path)


def _timed_trial(task, model_id, run):
    """Load a pipeline, run it once, and print the timing of both steps.

    Any exception is reported and swallowed on purpose so that one failing
    model does not stop the next trial from running.
    """
    started = time.time()
    try:
        pipe = pipeline(task, model=model_id)
        print(f'Model loaded in {time.time()-started:.1f}s')
        started = time.time()
        outcome = run(pipe)
        print(f'Result in {time.time()-started:.1f}s: {outcome}')
    except Exception as exc:
        print(f'Error: {type(exc).__name__}: {exc}')


# Try VQA model
print('Loading VQA model...')
_timed_trial(
    "visual-question-answering",
    "dandelin/vilt-b32-finetuned-vqa",
    lambda pipe: pipe(picture, "What is in this image?"),
)

# Try Moondream2 (small, good for detailed captioning)
print('\nTrying Moondream2...')
_timed_trial(
    "image-to-text",
    "vikhyatk/moondream2",
    lambda pipe: pipe(picture),
)
_test_wiki.py ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
"""Check that WikipediaLoader returns documents for a sample query."""
from langchain_community.document_loaders import WikipediaLoader

try:
    loader = WikipediaLoader(query='Mercedes Sosa', load_max_docs=2)
    for page in loader.load():
        # Compute both fields before printing, so a failure prints neither.
        heading = page.metadata.get("title", "?")
        preview = page.page_content[:100]
        print('Title: {}'.format(heading))
        print('Content: {}'.format(preview))
except Exception as exc:
    # Best-effort probe: report the failure instead of crashing.
    print('Error: {}'.format(exc))
_test_zen_vision.py ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Probe the opencode zen chat endpoint with one image from the GAIA dataset.

The script reports which API keys are present in the environment and then,
if ZEN_API_KEY is set, sends a single vision request and prints the outcome.
"""
import base64
import json  # retained from the original script; not used below
import os

import requests
from dotenv import load_dotenv

# Keep load_dotenv() ahead of the huggingface_hub import, as in the original
# ordering, so values from .env are already in the environment at import time.
load_dotenv()
from huggingface_hub import hf_hub_download

# Check all relevant env vars
ENV_KEYS = ['ZEN_API_KEY', 'OPENCODE_ZEN_API_KEY', 'OPENROUTER_API_KEY', 'HF_TOKEN', 'OPENAI_API_KEY']
for name in ENV_KEYS:
    value = os.getenv(name, '')
    print(f'{name}: {"present" if value else "MISSING"}')

# Try the opencode zen endpoint with vision
zen_key = os.getenv('ZEN_API_KEY')
if not zen_key:
    # Previously the image was downloaded and the script then ended silently;
    # say why nothing is being tested and skip the needless download.
    print('\nZEN_API_KEY is not set; skipping the zen vision request.')
else:
    fn = 'cca530fc-4052-43b2-b130-b30968d8aa44.png'
    token = os.getenv('HF_TOKEN')
    path = hf_hub_download(
        repo_id='gaia-benchmark/GAIA',
        filename=f'2023/validation/{fn}',
        repo_type='dataset',
        token=token,
    )

    # Embed the image in the request as a base64 data URI.
    with open(path, 'rb') as f:
        img_bytes = f.read()
    b64 = base64.b64encode(img_bytes).decode('utf-8')
    data_uri = f'data:image/png;base64,{b64}'

    payload = {
        'model': 'deepseek-v4-flash-free',
        'messages': [{
            'role': 'user',
            'content': [
                {'type': 'text', 'text': 'Describe this image briefly.'},
                {'type': 'image_url', 'image_url': {'url': data_uri}},
            ]
        }],
        'max_tokens': 512,
    }
    try:
        resp = requests.post(
            'https://opencode.ai/zen/v1/chat/completions',
            headers={
                'Authorization': f'Bearer {zen_key}',
                'Content-Type': 'application/json',
            },
            json=payload,
            timeout=60
        )
        print(f'\nZEN deepseek-v4-flash-free vision: Status {resp.status_code}')
        if resp.status_code == 200:
            print(f'Response: {resp.json()["choices"][0]["message"]["content"][:500]}')
        else:
            print(f'Error: {resp.text[:300]}')
    except Exception as e:
        # Diagnostic script: report any network or parsing failure and finish.
        print(f'Exception: {e}')
_test_zen_vision_all.py ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Check which opencode zen models accept an image in a chat request.

One image from the GAIA dataset is sent to each model in turn and the HTTP
status plus a short excerpt of the reply (or error) is printed per model.
"""
import base64
import os

import requests
from dotenv import load_dotenv

# Keep load_dotenv() ahead of the huggingface_hub import, as in the original
# ordering, so values from .env are already in the environment at import time.
load_dotenv()
from huggingface_hub import hf_hub_download


def _error_summary(resp, limit=150):
    """Return a short description of a non-200 response.

    Prefers the JSON ``error.message`` field when it exists and falls back to
    the raw body text, so a non-JSON or unexpectedly shaped error body is
    still reported instead of raising.
    """
    try:
        body = resp.json()
    except ValueError:
        # Body is not JSON (e.g. an HTML error page).
        return resp.text[:limit]
    error = body.get('error') if isinstance(body, dict) else None
    # Accept either {"error": {"message": ...}} or {"error": "..."}.
    message = error.get('message') if isinstance(error, dict) else error
    return str(message or resp.text)[:limit]


zen_key = os.getenv('ZEN_API_KEY')
models = ['deepseek-v4-flash-free', 'nemotron-3-super-free', 'big-pickle']

if not zen_key:
    # Without a key every request would be sent as "Bearer None".
    print('ZEN_API_KEY is not set; skipping zen vision checks.')
else:
    fn = 'cca530fc-4052-43b2-b130-b30968d8aa44.png'
    token = os.getenv('HF_TOKEN')
    path = hf_hub_download(
        repo_id='gaia-benchmark/GAIA',
        filename=f'2023/validation/{fn}',
        repo_type='dataset',
        token=token,
    )

    # Embed the image in each request as a base64 data URI.
    with open(path, 'rb') as f:
        img_bytes = f.read()
    b64 = base64.b64encode(img_bytes).decode('utf-8')
    data_uri = f'data:image/png;base64,{b64}'

    for model in models:
        payload = {
            'model': model,
            'messages': [{
                'role': 'user',
                'content': [
                    {'type': 'text', 'text': 'Describe this image briefly.'},
                    {'type': 'image_url', 'image_url': {'url': data_uri}},
                ]
            }],
            'max_tokens': 512,
        }
        try:
            resp = requests.post(
                'https://opencode.ai/zen/v1/chat/completions',
                headers={
                    'Authorization': f'Bearer {zen_key}',
                    'Content-Type': 'application/json',
                },
                json=payload,
                timeout=60
            )
            print(f'{model}: Status {resp.status_code}')
            if resp.status_code == 200:
                print(f'  OK: {resp.json()["choices"][0]["message"]["content"][:300]}')
            else:
                print(f'  Error: {_error_summary(resp)}')
        except Exception as e:
            # Keep going so one failing model does not hide the others.
            print(f'{model}: Exception: {e}')
agent.py CHANGED
@@ -52,15 +52,16 @@ TOOL SELECTION:
52
  - parse_spreadsheet: Excel (.xlsx) and CSV files. Always use this for spreadsheets.
53
  - transcribe_audio: Audio files (.mp3, .wav). Call once, use the returned text immediately.
54
  - python_repl: Calculations, data analysis, file processing. Variables persist between calls.
55
- - get_youtube_transcript: YouTube video transcripts only.
 
56
 
57
  EFFICIENCY RULES (save tool calls):
58
- - After transcribe_audio returns text, USE IT. Do NOT call read_file on audio, do NOT search for audio files with python_repl.
59
- - After read_file shows a .py script, run it with exec(open(path).read()) — do NOT rewrite the code.
60
- - After getting a tool result, analyze it. Do not search again with slightly different queries.
61
- - Never call the same tool with the same arguments twice.
62
- - Images (.png/.jpg) cannot be analyzed no vision available. Skip them completely.
63
- - Don't install packages in python_repl — use what's already available."""
64
 
65
 
66
  def call_model(state: AgentState):
 
52
  - parse_spreadsheet: Excel (.xlsx) and CSV files. Always use this for spreadsheets.
53
  - transcribe_audio: Audio files (.mp3, .wav). Call once, use the returned text immediately.
54
  - python_repl: Calculations, data analysis, file processing. Variables persist between calls.
55
+ - get_youtube_transcript: YouTube video transcripts only.
56
+ - describe_image: Images (.png/.jpg/.jpeg). Use this to describe what's VISIBLE in an image (text, numbers, diagrams, chess pieces, charts). Call once and use the description.
57
 
58
  EFFICIENCY RULES (save tool calls):
59
+ - After transcribe_audio returns text, USE IT. Do NOT call read_file on audio, do NOT search for audio files with python_repl.
60
+ - After read_file shows a .py script, run it with exec(open(path).read()) — do NOT rewrite the code.
61
+ - After getting a tool result, analyze it. Do not search again with slightly different queries.
62
+ - Never call the same tool with the same arguments twice.
63
+ - After describe_image returns a description, USE IT. Do not call describe_image again on the same file.
64
+ - Don't install packages in python_repl — use what's already available."""
65
 
66
 
67
  def call_model(state: AgentState):
app.py CHANGED
@@ -181,9 +181,11 @@ def run_and_submit_all(profile: Optional[gr.OAuthProfile] = None):
181
  ".csv": "Tip: This is a CSV file. Use parse_spreadsheet to read it.",
182
  ".mp3": "Tip: This is an audio file. Use transcribe_audio to transcribe it.",
183
  ".wav": "Tip: This is an audio file. Use transcribe_audio to transcribe it.",
184
- ".png": "Tip: This is an image no vision model available. Skip file-based analysis.",
185
- ".jpg": "Tip: This is an image — no vision model available. Skip file-based analysis.",
186
- ".jpeg": "Tip: This is an image no vision model available. Skip file-based analysis.",
 
 
187
  ".py": "Tip: This is a Python script. Use read_file to view, then python_repl with exec(open(path).read()) to run it directly.",
188
  ".pdf": "Tip: This is a PDF. Use read_file or python_repl with PyPDF2 to read it.",
189
  }
 
181
  ".csv": "Tip: This is a CSV file. Use parse_spreadsheet to read it.",
182
  ".mp3": "Tip: This is an audio file. Use transcribe_audio to transcribe it.",
183
  ".wav": "Tip: This is an audio file. Use transcribe_audio to transcribe it.",
184
+ ".png": "Tip: This is an image. Use describe_image to describe what is visible.",
185
+
186
+ ".jpg": "Tip: This is an image. Use describe_image to describe what is visible.",
187
+
188
+ ".jpeg": "Tip: This is an image. Use describe_image to describe what is visible.",
189
  ".py": "Tip: This is a Python script. Use read_file to view, then python_repl with exec(open(path).read()) to run it directly.",
190
  ".pdf": "Tip: This is a PDF. Use read_file or python_repl with PyPDF2 to read it.",
191
  }
gaia_results.csv CHANGED
@@ -1,9 +1,9 @@
1
  task_id,question,submitted_answer,ground_truth,correct
2
- 8e867cd7-cff9-4e6c-867a-ff5ddc2550be,How many studio albums were published by Mercedes Sosa between 2000 and 2009 (included)? You can use the latest 2022 version of english wikipedia.,We'll take the text we have from wiki_page (which we stored? Not stored as variable). We need to,3,False
3
  a1e91b78-d3d8-4675-bb8d-62741b4b68a6,"In the video https://www.youtube.com/watch?v=L1vXCYZAYYM, what is the highest number of bird species to be on camera simultaneously?",,3,False
4
  2d83110e-a098-4ebb-9987-066c06fa42d0,".rewsna eht sa ""tfel"" drow eht fo etisoppo eht etirw ,ecnetnes siht dnatsrednu uoy fI",right,Right,True
5
  cca530fc-4052-43b2-b130-b30968d8aa44,Review the chess position provided in the image. It is black's turn. Provide the correct next move for black which guarantees a win. Please provide your response in algebraic notation.,,Rd5,False
6
- 4fc2f1ae-8625-45b5-ab34-ad4433bc21f8,Who nominated the only Featured Article on English Wikipedia about a dinosaur that was promoted in November 2016?,,FunkMonk,False
7
  6f37996b-2ac7-44b0-8e68-6d28256631b4,"Given this table defining * on the set S = {a, b, c, d, e}
8
 
9
  |*|a|b|c|d|e|
@@ -14,30 +14,30 @@ cca530fc-4052-43b2-b130-b30968d8aa44,Review the chess position provided in the i
14
  |d|b|e|b|e|d|
15
  |e|d|b|a|d|c|
16
 
17
- provide the subset of S involved in any possible counter-examples that prove * is not commutative. Provide your answer as a comma separated list of the elements in the set in alphabetical order.","b,e","b, e",True
18
  9d191bce-651d-4746-be2d-7ef8ecadb9c2,"Examine the video at https://www.youtube.com/watch?v=1htKBjuUWec.
19
 
20
  What does Teal'c say in response to the question ""Isn't that hot?""",,Extremely,False
21
- cabe07ed-9eca-40ea-8ead-410ef5e83f91,What is the surname of the equine veterinarian mentioned in 1.E Exercises from the chemistry materials licensed by Marisa Alviar-Agnew & Henry Agnew under the CK-12 license in LibreText's Introductory Chemistry materials as compiled 08/21/2023?,,Louvrier,False
22
  3cef3a44-215e-4aed-8e3b-b1e3f08063b7,"I'm making a grocery list for my mom, but she's a professor of botany and she's a real stickler when it comes to categorizing things. I need to add different foods to different categories on the grocery list, but if I make a mistake, she won't buy anything inserted in the wrong category. Here's the list I have so far:
23
 
24
  milk, eggs, flour, whole bean coffee, Oreos, sweet potatoes, fresh basil, plums, green beans, rice, corn, bell pepper, whole allspice, acorns, broccoli, celery, zucchini, lettuce, peanuts
25
 
26
- I need to make headings for the fruits and vegetables. Could you please create a list of just the vegetables from my list? If you could do that, then I can figure out how to categorize the rest of the list into the appropriate categories. But remember that my mom is a real stickler, so make sure that no botanical fruits end up on the vegetable list, or she won't get them when she's at the store. Please alphabetize the list of vegetables, and place each item in a comma separated list.","broccoli, celery, fresh basil, lettuce, sweet potato","broccoli, celery, fresh basil, lettuce, sweet potatoes",False
27
  99c9cc74-fdc8-46c6-8f8d-3ce2d3bfeea3,"Hi, I'm making a pie but I could use some help with my shopping list. I have everything I need for the crust, but I'm not sure about the filling. I got the recipe from my friend Aditi, but she left it as a voice memo and the speaker on my phone is buzzing so I can't quite make out what she's saying. Could you please listen to the recipe and list all of the ingredients that my friend described? I only want the ingredients for the filling, as I have everything I need to make my favorite pie crust. I've attached the recipe as Strawberry pie.mp3.
28
 
29
  In your response, please only list the ingredients, not any measurements. So if the recipe calls for ""a pinch of salt"" or ""two cups of ripe strawberries"" the ingredients on the list would be ""salt"" and ""ripe strawberries"".
30
 
31
  Please format your response as a comma separated list of ingredients. Also, please alphabetize the ingredients.",,"cornstarch, freshly squeezed lemon juice, granulated sugar, pure vanilla extract, ripe strawberries",False
32
  305ac316-eef6-4446-960a-92d80d542f82,Who did the actor who played Ray in the Polish-language version of Everybody Loves Raymond play in Magda M.? Give only the first name.,,Wojciech,False
33
- f918266a-b3e0-4914-865d-4faa564f1aef,What is the final numeric output from the attached Python code?,0,0,True
34
  3f57289b-8c60-48be-bd80-01f8099ca449,How many at bats did the Yankee with the most walks in the 1977 regular season have that same season?,,519,False
35
  1f975693-876d-457b-a649-393859e79bf3,"Hi, I was out sick from my classes on Friday, so I'm trying to figure out what I need to study for my Calculus mid-term next week. My friend from class sent me an audio recording of Professor Willowbrook giving out the recommended reading for the test, but my headphones are broken :(
36
 
37
  Could you please listen to the recording for me and tell me the page numbers I'm supposed to go over? I've attached a file called Homework.mp3 that has the recording. Please provide just the page numbers as a comma-delimited list. And please provide the list in ascending order.",,"132, 133, 134, 197, 245",False
38
  840bfca7-4f7b-481a-8794-c560c340185d,"On June 6, 2023, an article by Carolyn Collins Petersen was published in Universe Today. This article mentions a team that produced a paper about their observations, linked at the bottom of the article. Find this paper. Under what NASA award number was the work performed by R. G. Arendt supported by?",,80GSFC21M0002,False
39
- bda648d7-d618-4883-88f4-3466eabd860e,Where were the Vietnamese specimens described by Kuznetzov in Nedoshivina's 2010 paper eventually deposited? Just give me the city name without abbreviations.,,Saint Petersburg,False
40
- cf106601-ab4f-4af9-b045-5295fe67b37d,"What country had the least number of athletes at the 1928 Summer Olympics? If there's a tie for a number of athletes, return the first in alphabetical order. Give the IOC country code as your answer.",CUB,CUB,True
41
- a0c07678-e491-4bbc-8f0b-07405144218f,"Who are the pitchers with the number before and after Taishō Tamai's number as of July 2023? Give them to me in the form Pitcher Before, Pitcher After, use their last names only, in Roman characters.",ERROR: 'charmap' codec can't encode character '\u014d' in position 55: character maps to <undefined>,"Yoshida, Uehara",False
42
  7bd855d8-463d-4ed5-93ca-5fe35145f733,The attached Excel file contains the sales of menu items for a local fast-food chain. What were the total sales that the chain made from food (not including drinks)? Express your answer in USD with two decimal places.,89706.00,89706.00,True
43
- 5a0c1adf-205e-4841-a666-7c3ef95def9d,What is the first name of the only Malko Competition recipient from the 20th Century (after 1977) whose nationality on record is a country that no longer exists?,Claus,Claus,True
 
1
  task_id,question,submitted_answer,ground_truth,correct
2
+ 8e867cd7-cff9-4e6c-867a-ff5ddc2550be,How many studio albums were published by Mercedes Sosa between 2000 and 2009 (included)? You can use the latest 2022 version of english wikipedia.,3,3,True
3
  a1e91b78-d3d8-4675-bb8d-62741b4b68a6,"In the video https://www.youtube.com/watch?v=L1vXCYZAYYM, what is the highest number of bird species to be on camera simultaneously?",,3,False
4
  2d83110e-a098-4ebb-9987-066c06fa42d0,".rewsna eht sa ""tfel"" drow eht fo etisoppo eht etirw ,ecnetnes siht dnatsrednu uoy fI",right,Right,True
5
  cca530fc-4052-43b2-b130-b30968d8aa44,Review the chess position provided in the image. It is black's turn. Provide the correct next move for black which guarantees a win. Please provide your response in algebraic notation.,,Rd5,False
6
+ 4fc2f1ae-8625-45b5-ab34-ad4433bc21f8,Who nominated the only Featured Article on English Wikipedia about a dinosaur that was promoted in November 2016?,FunkMonk,FunkMonk,True
7
  6f37996b-2ac7-44b0-8e68-6d28256631b4,"Given this table defining * on the set S = {a, b, c, d, e}
8
 
9
  |*|a|b|c|d|e|
 
14
  |d|b|e|b|e|d|
15
  |e|d|b|a|d|c|
16
 
17
+ provide the subset of S involved in any possible counter-examples that prove * is not commutative. Provide your answer as a comma separated list of the elements in the set in alphabetical order.","b, e","b, e",True
18
  9d191bce-651d-4746-be2d-7ef8ecadb9c2,"Examine the video at https://www.youtube.com/watch?v=1htKBjuUWec.
19
 
20
  What does Teal'c say in response to the question ""Isn't that hot?""",,Extremely,False
21
+ cabe07ed-9eca-40ea-8ead-410ef5e83f91,What is the surname of the equine veterinarian mentioned in 1.E Exercises from the chemistry materials licensed by Marisa Alviar-Agnew & Henry Agnew under the CK-12 license in LibreText's Introductory Chemistry materials as compiled 08/21/2023?,Louvrier,Louvrier,True
22
  3cef3a44-215e-4aed-8e3b-b1e3f08063b7,"I'm making a grocery list for my mom, but she's a professor of botany and she's a real stickler when it comes to categorizing things. I need to add different foods to different categories on the grocery list, but if I make a mistake, she won't buy anything inserted in the wrong category. Here's the list I have so far:
23
 
24
  milk, eggs, flour, whole bean coffee, Oreos, sweet potatoes, fresh basil, plums, green beans, rice, corn, bell pepper, whole allspice, acorns, broccoli, celery, zucchini, lettuce, peanuts
25
 
26
+ I need to make headings for the fruits and vegetables. Could you please create a list of just the vegetables from my list? If you could do that, then I can figure out how to categorize the rest of the list into the appropriate categories. But remember that my mom is a real stickler, so make sure that no botanical fruits end up on the vegetable list, or she won't get them when she's at the store. Please alphabetize the list of vegetables, and place each item in a comma separated list.","broccoli, celery, fresh basil, lettuce, sweet potatoes","broccoli, celery, fresh basil, lettuce, sweet potatoes",True
27
  99c9cc74-fdc8-46c6-8f8d-3ce2d3bfeea3,"Hi, I'm making a pie but I could use some help with my shopping list. I have everything I need for the crust, but I'm not sure about the filling. I got the recipe from my friend Aditi, but she left it as a voice memo and the speaker on my phone is buzzing so I can't quite make out what she's saying. Could you please listen to the recipe and list all of the ingredients that my friend described? I only want the ingredients for the filling, as I have everything I need to make my favorite pie crust. I've attached the recipe as Strawberry pie.mp3.
28
 
29
  In your response, please only list the ingredients, not any measurements. So if the recipe calls for ""a pinch of salt"" or ""two cups of ripe strawberries"" the ingredients on the list would be ""salt"" and ""ripe strawberries"".
30
 
31
  Please format your response as a comma separated list of ingredients. Also, please alphabetize the ingredients.",,"cornstarch, freshly squeezed lemon juice, granulated sugar, pure vanilla extract, ripe strawberries",False
32
  305ac316-eef6-4446-960a-92d80d542f82,Who did the actor who played Ray in the Polish-language version of Everybody Loves Raymond play in Magda M.? Give only the first name.,,Wojciech,False
33
+ f918266a-b3e0-4914-865d-4faa564f1aef,What is the final numeric output from the attached Python code?,,0,False
34
  3f57289b-8c60-48be-bd80-01f8099ca449,How many at bats did the Yankee with the most walks in the 1977 regular season have that same season?,,519,False
35
  1f975693-876d-457b-a649-393859e79bf3,"Hi, I was out sick from my classes on Friday, so I'm trying to figure out what I need to study for my Calculus mid-term next week. My friend from class sent me an audio recording of Professor Willowbrook giving out the recommended reading for the test, but my headphones are broken :(
36
 
37
  Could you please listen to the recording for me and tell me the page numbers I'm supposed to go over? I've attached a file called Homework.mp3 that has the recording. Please provide just the page numbers as a comma-delimited list. And please provide the list in ascending order.",,"132, 133, 134, 197, 245",False
38
  840bfca7-4f7b-481a-8794-c560c340185d,"On June 6, 2023, an article by Carolyn Collins Petersen was published in Universe Today. This article mentions a team that produced a paper about their observations, linked at the bottom of the article. Find this paper. Under what NASA award number was the work performed by R. G. Arendt supported by?",,80GSFC21M0002,False
39
+ bda648d7-d618-4883-88f4-3466eabd860e,Where were the Vietnamese specimens described by Kuznetzov in Nedoshivina's 2010 paper eventually deposited? Just give me the city name without abbreviations.,Saint Petersburg,Saint Petersburg,True
40
+ cf106601-ab4f-4af9-b045-5295fe67b37d,"What country had the least number of athletes at the 1928 Summer Olympics? If there's a tie for a number of athletes, return the first in alphabetical order. Give the IOC country code as your answer.",,CUB,False
41
+ a0c07678-e491-4bbc-8f0b-07405144218f,"Who are the pitchers with the number before and after Taishō Tamai's number as of July 2023? Give them to me in the form Pitcher Before, Pitcher After, use their last names only, in Roman characters.",,"Yoshida, Uehara",False
42
  7bd855d8-463d-4ed5-93ca-5fe35145f733,The attached Excel file contains the sales of menu items for a local fast-food chain. What were the total sales that the chain made from food (not including drinks)? Express your answer in USD with two decimal places.,89706.00,89706.00,True
43
+ 5a0c1adf-205e-4841-a666-7c3ef95def9d,What is the first name of the only Malko Competition recipient from the 20th Century (after 1977) whose nationality on record is a country that no longer exists?,,Claus,False
gaia_results.json CHANGED
@@ -1,14 +1,14 @@
1
  {
2
- "score": 30.0,
3
- "correct": 6,
4
  "total": 20,
5
  "results": [
6
  {
7
  "task_id": "8e867cd7-cff9-4e6c-867a-ff5ddc2550be",
8
  "question": "How many studio albums were published by Mercedes Sosa between 2000 and 2009 (included)? You can use the latest 2022 version of english wikipedia.",
9
- "submitted_answer": "We'll take the text we have from wiki_page (which we stored? Not stored as variable). We need to",
10
  "ground_truth": "3",
11
- "correct": false
12
  },
13
  {
14
  "task_id": "a1e91b78-d3d8-4675-bb8d-62741b4b68a6",
@@ -34,14 +34,14 @@
34
  {
35
  "task_id": "4fc2f1ae-8625-45b5-ab34-ad4433bc21f8",
36
  "question": "Who nominated the only Featured Article on English Wikipedia about a dinosaur that was promoted in November 2016?",
37
- "submitted_answer": "",
38
  "ground_truth": "FunkMonk",
39
- "correct": false
40
  },
41
  {
42
  "task_id": "6f37996b-2ac7-44b0-8e68-6d28256631b4",
43
  "question": "Given this table defining * on the set S = {a, b, c, d, e}\n\n|*|a|b|c|d|e|\n|---|---|---|---|---|---|\n|a|a|b|c|b|d|\n|b|b|c|a|e|c|\n|c|c|a|b|b|a|\n|d|b|e|b|e|d|\n|e|d|b|a|d|c|\n\nprovide the subset of S involved in any possible counter-examples that prove * is not commutative. Provide your answer as a comma separated list of the elements in the set in alphabetical order.",
44
- "submitted_answer": "b,e",
45
  "ground_truth": "b, e",
46
  "correct": true
47
  },
@@ -55,16 +55,16 @@
55
  {
56
  "task_id": "cabe07ed-9eca-40ea-8ead-410ef5e83f91",
57
  "question": "What is the surname of the equine veterinarian mentioned in 1.E Exercises from the chemistry materials licensed by Marisa Alviar-Agnew & Henry Agnew under the CK-12 license in LibreText's Introductory Chemistry materials as compiled 08/21/2023?",
58
- "submitted_answer": "",
59
  "ground_truth": "Louvrier",
60
- "correct": false
61
  },
62
  {
63
  "task_id": "3cef3a44-215e-4aed-8e3b-b1e3f08063b7",
64
  "question": "I'm making a grocery list for my mom, but she's a professor of botany and she's a real stickler when it comes to categorizing things. I need to add different foods to different categories on the grocery list, but if I make a mistake, she won't buy anything inserted in the wrong category. Here's the list I have so far:\n\nmilk, eggs, flour, whole bean coffee, Oreos, sweet potatoes, fresh basil, plums, green beans, rice, corn, bell pepper, whole allspice, acorns, broccoli, celery, zucchini, lettuce, peanuts\n\nI need to make headings for the fruits and vegetables. Could you please create a list of just the vegetables from my list? If you could do that, then I can figure out how to categorize the rest of the list into the appropriate categories. But remember that my mom is a real stickler, so make sure that no botanical fruits end up on the vegetable list, or she won't get them when she's at the store. Please alphabetize the list of vegetables, and place each item in a comma separated list.",
65
- "submitted_answer": "broccoli, celery, fresh basil, lettuce, sweet potato",
66
  "ground_truth": "broccoli, celery, fresh basil, lettuce, sweet potatoes",
67
- "correct": false
68
  },
69
  {
70
  "task_id": "99c9cc74-fdc8-46c6-8f8d-3ce2d3bfeea3",
@@ -83,9 +83,9 @@
83
  {
84
  "task_id": "f918266a-b3e0-4914-865d-4faa564f1aef",
85
  "question": "What is the final numeric output from the attached Python code?",
86
- "submitted_answer": "0",
87
  "ground_truth": "0",
88
- "correct": true
89
  },
90
  {
91
  "task_id": "3f57289b-8c60-48be-bd80-01f8099ca449",
@@ -111,21 +111,21 @@
111
  {
112
  "task_id": "bda648d7-d618-4883-88f4-3466eabd860e",
113
  "question": "Where were the Vietnamese specimens described by Kuznetzov in Nedoshivina's 2010 paper eventually deposited? Just give me the city name without abbreviations.",
114
- "submitted_answer": "",
115
  "ground_truth": "Saint Petersburg",
116
- "correct": false
117
  },
118
  {
119
  "task_id": "cf106601-ab4f-4af9-b045-5295fe67b37d",
120
  "question": "What country had the least number of athletes at the 1928 Summer Olympics? If there's a tie for a number of athletes, return the first in alphabetical order. Give the IOC country code as your answer.",
121
- "submitted_answer": "CUB",
122
  "ground_truth": "CUB",
123
- "correct": true
124
  },
125
  {
126
  "task_id": "a0c07678-e491-4bbc-8f0b-07405144218f",
127
  "question": "Who are the pitchers with the number before and after Taish\u014d Tamai's number as of July 2023? Give them to me in the form Pitcher Before, Pitcher After, use their last names only, in Roman characters.",
128
- "submitted_answer": "ERROR: 'charmap' codec can't encode character '\\u014d' in position 55: character maps to <undefined>",
129
  "ground_truth": "Yoshida, Uehara",
130
  "correct": false
131
  },
@@ -139,9 +139,9 @@
139
  {
140
  "task_id": "5a0c1adf-205e-4841-a666-7c3ef95def9d",
141
  "question": "What is the first name of the only Malko Competition recipient from the 20th Century (after 1977) whose nationality on record is a country that no longer exists?",
142
- "submitted_answer": "Claus",
143
  "ground_truth": "Claus",
144
- "correct": true
145
  }
146
  ]
147
  }
 
1
  {
2
+ "score": 40.0,
3
+ "correct": 8,
4
  "total": 20,
5
  "results": [
6
  {
7
  "task_id": "8e867cd7-cff9-4e6c-867a-ff5ddc2550be",
8
  "question": "How many studio albums were published by Mercedes Sosa between 2000 and 2009 (included)? You can use the latest 2022 version of english wikipedia.",
9
+ "submitted_answer": "3",
10
  "ground_truth": "3",
11
+ "correct": true
12
  },
13
  {
14
  "task_id": "a1e91b78-d3d8-4675-bb8d-62741b4b68a6",
 
34
  {
35
  "task_id": "4fc2f1ae-8625-45b5-ab34-ad4433bc21f8",
36
  "question": "Who nominated the only Featured Article on English Wikipedia about a dinosaur that was promoted in November 2016?",
37
+ "submitted_answer": "FunkMonk",
38
  "ground_truth": "FunkMonk",
39
+ "correct": true
40
  },
41
  {
42
  "task_id": "6f37996b-2ac7-44b0-8e68-6d28256631b4",
43
  "question": "Given this table defining * on the set S = {a, b, c, d, e}\n\n|*|a|b|c|d|e|\n|---|---|---|---|---|---|\n|a|a|b|c|b|d|\n|b|b|c|a|e|c|\n|c|c|a|b|b|a|\n|d|b|e|b|e|d|\n|e|d|b|a|d|c|\n\nprovide the subset of S involved in any possible counter-examples that prove * is not commutative. Provide your answer as a comma separated list of the elements in the set in alphabetical order.",
44
+ "submitted_answer": "b, e",
45
  "ground_truth": "b, e",
46
  "correct": true
47
  },
 
55
  {
56
  "task_id": "cabe07ed-9eca-40ea-8ead-410ef5e83f91",
57
  "question": "What is the surname of the equine veterinarian mentioned in 1.E Exercises from the chemistry materials licensed by Marisa Alviar-Agnew & Henry Agnew under the CK-12 license in LibreText's Introductory Chemistry materials as compiled 08/21/2023?",
58
+ "submitted_answer": "Louvrier",
59
  "ground_truth": "Louvrier",
60
+ "correct": true
61
  },
62
  {
63
  "task_id": "3cef3a44-215e-4aed-8e3b-b1e3f08063b7",
64
  "question": "I'm making a grocery list for my mom, but she's a professor of botany and she's a real stickler when it comes to categorizing things. I need to add different foods to different categories on the grocery list, but if I make a mistake, she won't buy anything inserted in the wrong category. Here's the list I have so far:\n\nmilk, eggs, flour, whole bean coffee, Oreos, sweet potatoes, fresh basil, plums, green beans, rice, corn, bell pepper, whole allspice, acorns, broccoli, celery, zucchini, lettuce, peanuts\n\nI need to make headings for the fruits and vegetables. Could you please create a list of just the vegetables from my list? If you could do that, then I can figure out how to categorize the rest of the list into the appropriate categories. But remember that my mom is a real stickler, so make sure that no botanical fruits end up on the vegetable list, or she won't get them when she's at the store. Please alphabetize the list of vegetables, and place each item in a comma separated list.",
65
+ "submitted_answer": "broccoli, celery, fresh basil, lettuce, sweet potatoes",
66
  "ground_truth": "broccoli, celery, fresh basil, lettuce, sweet potatoes",
67
+ "correct": true
68
  },
69
  {
70
  "task_id": "99c9cc74-fdc8-46c6-8f8d-3ce2d3bfeea3",
 
83
  {
84
  "task_id": "f918266a-b3e0-4914-865d-4faa564f1aef",
85
  "question": "What is the final numeric output from the attached Python code?",
86
+ "submitted_answer": "",
87
  "ground_truth": "0",
88
+ "correct": false
89
  },
90
  {
91
  "task_id": "3f57289b-8c60-48be-bd80-01f8099ca449",
 
111
  {
112
  "task_id": "bda648d7-d618-4883-88f4-3466eabd860e",
113
  "question": "Where were the Vietnamese specimens described by Kuznetzov in Nedoshivina's 2010 paper eventually deposited? Just give me the city name without abbreviations.",
114
+ "submitted_answer": "Saint Petersburg",
115
  "ground_truth": "Saint Petersburg",
116
+ "correct": true
117
  },
118
  {
119
  "task_id": "cf106601-ab4f-4af9-b045-5295fe67b37d",
120
  "question": "What country had the least number of athletes at the 1928 Summer Olympics? If there's a tie for a number of athletes, return the first in alphabetical order. Give the IOC country code as your answer.",
121
+ "submitted_answer": "",
122
  "ground_truth": "CUB",
123
+ "correct": false
124
  },
125
  {
126
  "task_id": "a0c07678-e491-4bbc-8f0b-07405144218f",
127
  "question": "Who are the pitchers with the number before and after Taish\u014d Tamai's number as of July 2023? Give them to me in the form Pitcher Before, Pitcher After, use their last names only, in Roman characters.",
128
+ "submitted_answer": "",
129
  "ground_truth": "Yoshida, Uehara",
130
  "correct": false
131
  },
 
139
  {
140
  "task_id": "5a0c1adf-205e-4841-a666-7c3ef95def9d",
141
  "question": "What is the first name of the only Malko Competition recipient from the 20th Century (after 1977) whose nationality on record is a country that no longer exists?",
142
+ "submitted_answer": "",
143
  "ground_truth": "Claus",
144
+ "correct": false
145
  }
146
  ]
147
  }
run_local.py CHANGED
@@ -137,9 +137,9 @@ def main():
137
  ".csv": "Tip: This is a CSV file. Use parse_spreadsheet to read it.",
138
  ".mp3": "Tip: This is an audio file. Use transcribe_audio to transcribe it.",
139
  ".wav": "Tip: This is an audio file. Use transcribe_audio to transcribe it.",
140
- ".png": "Tip: This is an image no vision model available. Skip file-based analysis.",
141
- ".jpg": "Tip: This is an image no vision model available. Skip file-based analysis.",
142
- ".jpeg": "Tip: This is an image no vision model available. Skip file-based analysis.",
143
  ".py": "Tip: This is a Python script. Use read_file to view, then python_repl with exec(open(path).read()) to run it directly.",
144
  ".pdf": "Tip: This is a PDF. Use read_file or python_repl with PyPDF2 to read it.",
145
  }
 
137
  ".csv": "Tip: This is a CSV file. Use parse_spreadsheet to read it.",
138
  ".mp3": "Tip: This is an audio file. Use transcribe_audio to transcribe it.",
139
  ".wav": "Tip: This is an audio file. Use transcribe_audio to transcribe it.",
140
+ ".png": "Tip: This is an image. Use describe_image to describe what is visible.",
141
+ ".jpg": "Tip: This is an image. Use describe_image to describe what is visible.",
142
+ ".jpeg": "Tip: This is an image. Use describe_image to describe what is visible.",
143
  ".py": "Tip: This is a Python script. Use read_file to view, then python_repl with exec(open(path).read()) to run it directly.",
144
  ".pdf": "Tip: This is a PDF. Use read_file or python_repl with PyPDF2 to read it.",
145
  }
tools/__init__.py CHANGED
@@ -7,6 +7,7 @@ from tools.file.spreadsheet import parse_spreadsheet
7
  from tools.python import python_repl
8
  from tools.youtube import get_youtube_transcript
9
  from tools.audio import transcribe_audio
 
10
 
11
  __all__ = [
12
  web_search,
@@ -18,6 +19,7 @@ __all__ = [
18
  python_repl,
19
  get_youtube_transcript,
20
  transcribe_audio,
 
21
  ]
22
 
23
  tools_by_name = {t.name: t for t in __all__}
 
7
  from tools.python import python_repl
8
  from tools.youtube import get_youtube_transcript
9
  from tools.audio import transcribe_audio
10
+ from tools.vision.describe_image import describe_image
11
 
12
  __all__ = [
13
  web_search,
 
19
  python_repl,
20
  get_youtube_transcript,
21
  transcribe_audio,
22
+ describe_image,
23
  ]
24
 
25
  tools_by_name = {t.name: t for t in __all__}
tools/vision/__init__.py ADDED
File without changes
tools/vision/describe_image.py ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from langchain_core.tools import tool
3
+
4
# Module-level cache for the captioning pipeline; filled in on first use.
_pipe = None


def _get_pipe():
    """Return the shared image-to-text pipeline, creating it on first call.

    The ``transformers`` import is deferred into this function, so importing
    the module itself does not pull in the library or build the pipeline.
    Later calls return the object cached in the module-level ``_pipe``.
    """
    global _pipe
    if _pipe is not None:
        return _pipe

    from transformers import pipeline

    _pipe = pipeline(
        "image-to-text",
        model="Salesforce/blip-image-captioning-large",
    )
    return _pipe
16
+
17
+
18
@tool
def describe_image(path: str) -> str:
    """Describe an image file (.png, .jpg, .jpeg) using a local vision model. Returns a text description of what is visible. Use this for any image (chess boards, diagrams, charts, screenshots, photos). Call ONCE per image and use the description directly."""
    # NOTE: the docstring above is kept verbatim on purpose -- the @tool
    # decorator uses it as the tool description, so editing it would change
    # what the agent is told about this tool.
    #
    # Returns one of:
    #   "ERROR: Image file not found"            -- empty path or not a file
    #   "Image: WxHpx. Description: <caption>"   -- caption was generated
    #   "Image: WxHpx. No description generated." -- pipeline gave no text
    #   "VISION_DESCRIPTION_ERROR: <exc>"        -- any failure while loading
    #                                               or captioning the image
    # Errors are returned as strings rather than raised, matching the other
    # outcomes of this function.

    # isfile() rather than exists(): a directory path is not an image file.
    if not path or not os.path.isfile(path):
        return "ERROR: Image file not found"

    abs_path = os.path.abspath(path)

    try:
        from PIL import Image

        # The context manager closes the underlying file handle once the
        # caption has been produced, instead of leaking it per call.
        with Image.open(abs_path) as img:
            width, height = img.size
            result = _get_pipe()(img, max_new_tokens=30)

        info = f"Image: {width}x{height}px"

        # Strip before the emptiness check so a whitespace-only caption is
        # reported as "no description" rather than as an empty description.
        desc = (result[0]["generated_text"] if result else "") or ""
        desc = desc.strip()
        if not desc:
            return f"{info}. No description generated."

        return f"{info}. Description: {desc}"
    except Exception as e:
        return f"VISION_DESCRIPTION_ERROR: {e}"
43
+
44
+
tools/web/search.py CHANGED
@@ -5,12 +5,11 @@ from langchain_core.tools import tool
5
  @tool
6
  def web_search(keywords: str) -> str:
7
  """Search the web. Use this for finding information, facts, URLs, and general web content. Returns title, URL, and snippet for each result."""
8
- headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36"}
9
  last_err = None
10
  for attempt in range(3):
11
  try:
12
  from ddgs import DDGS
13
- results = list(DDGS(headers=headers).text(keywords, max_results=5, backend="html"))
14
  if not results:
15
  return "NO_RESULTS"
16
  formatted = []
 
5
  @tool
6
  def web_search(keywords: str) -> str:
7
  """Search the web. Use this for finding information, facts, URLs, and general web content. Returns title, URL, and snippet for each result."""
 
8
  last_err = None
9
  for attempt in range(3):
10
  try:
11
  from ddgs import DDGS
12
+ results = list(DDGS().text(keywords, max_results=5, backend="html"))
13
  if not results:
14
  return "NO_RESULTS"
15
  formatted = []
tools/web/wiki.py CHANGED
@@ -1,12 +1,38 @@
1
- from langchain_community.document_loaders import WikipediaLoader
2
  from langchain_core.tools import tool
3
 
4
 
5
  @tool
6
  def wiki_search(query: str) -> str:
7
- """Search Wikipedia."""
8
  try:
9
- docs = WikipediaLoader(query=query, load_max_docs=2).load()
10
- return "\n".join([f"{d.metadata.get('title', 'Unknown')}: {d.page_content[:500]}" for d in docs]) or "NO_RESULTS"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
11
  except Exception as e:
12
  return f"WIKI_ERROR: {e}"
 
 
1
  from langchain_core.tools import tool
2
 
3
 
4
@tool
def wiki_search(query: str) -> str:
    """Search Wikipedia pages by keyword. Returns up to 5 matching page titles and short snippets. Use wiki_page after this to get full article text."""
    # NOTE: the docstring above is kept verbatim -- the @tool decorator uses
    # it as the tool description.
    #
    # Returns "NO_RESULTS" when nothing matches, "WIKI_ERROR: <exc>" on any
    # failure (network, HTTP status, JSON decoding), otherwise blocks of
    # "Title: ...\nSnippet: ..." separated by blank lines.

    # An empty query cannot match anything; skip the network round trip.
    if not query or not query.strip():
        return "NO_RESULTS"

    try:
        import html
        import re

        import requests

        params = {
            "action": "query",
            "format": "json",
            "list": "search",
            "srsearch": query,
            "srlimit": 5,
            "srprop": "snippet",
        }
        resp = requests.get(
            "https://en.wikipedia.org/w/api.php",
            params=params,
            headers={"User-Agent": "GAIA-Benchmark-Agent/1.0"},
            timeout=15,
        )
        resp.raise_for_status()
        results = resp.json().get("query", {}).get("search", [])
        if not results:
            return "NO_RESULTS"

        formatted = []
        for r in results:
            title = r.get("title", "")
            # Remove the highlight <span> markup BEFORE decoding entities:
            # decoding first would turn escaped article text such as
            # "&lt;/span&gt;" into a real-looking tag and then delete it.
            snippet = re.sub(r"</?span\b[^>]*>", "", r.get("snippet", ""))
            snippet = html.unescape(snippet)
            # Cap each snippet at 300 characters to keep the output short.
            formatted.append(f"Title: {title}\nSnippet: {snippet[:300]}")
        return "\n\n".join(formatted)
    except Exception as e:
        return f"WIKI_ERROR: {e}"