Spaces:
Sleeping
Sleeping
fixing errors
Browse files- README.md +13 -7
- __pycache__/agent.cpython-312.pyc +0 -0
- __pycache__/llm_backends.cpython-312.pyc +0 -0
- agent.py +70 -32
- llm_backends.py +61 -0
- requirements.txt +1 -0
- run_local_eval.py +4 -2
- tools/__pycache__/gaia_deterministic.cpython-312.pyc +0 -0
- tools/__pycache__/media_tools.cpython-312.pyc +0 -0
- tools/media_tools.py +108 -8
README.md
CHANGED
|
@@ -21,14 +21,20 @@ This folder is a **drop-in replacement** for the course Space
|
|
| 21 |
|
| 22 |
1. On Hugging Face, **Duplicate** the template Space above (or create a new Gradio Space and copy these files into the repo root).
|
| 23 |
2. The repo **`README.md` frontmatter must include `hf_oauth: true`** so Hugging Face injects `OAUTH_CLIENT_ID` / OAuth for `gr.LoginButton()` (required by Gradio 5.10+).
|
| 24 |
-
3. In the Space **Settings → Repository secrets**, add:
|
| 25 |
-
- **`
|
| 26 |
-
-
|
|
|
|
| 27 |
4. Optional **Variables** (or secrets) to tune models:
|
| 28 |
- `HF_INFERENCE_PROVIDER` — **omit by default** so the client uses **`auto`**: the first [inference provider](https://hf.co/settings/inference-providers) that supports your **chosen model** on the Hub. Do **not** set `hf-inference` unless that model lists it — many chat models (e.g. Qwen2.5-7B-Instruct) only support **together** / **featherless-ai**, and forcing `hf-inference` yields **404**. If the auto order hits a provider that returns **401** (e.g. Novita), reorder providers in HF settings or pin e.g. `HF_INFERENCE_PROVIDER=together`.
|
| 29 |
-
- `
|
| 30 |
-
- `
|
| 31 |
-
- `
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 32 |
- `GAIA_API_URL` — default `https://agents-course-unit4-scoring.hf.space`
|
| 33 |
- `GAIA_USE_CACHE` — `1` (default) or `0` to disable `gaia_answers_cache.json`
|
| 34 |
|
|
@@ -40,7 +46,7 @@ Keep the Space **public** so `agent_code` (`…/tree/main`) verifies for the lea
|
|
| 40 |
cd gaia_unit4_space
|
| 41 |
python -m venv .venv && source .venv/bin/activate
|
| 42 |
pip install -r requirements.txt
|
| 43 |
-
export HF_TOKEN=hf_...
|
| 44 |
python run_local_eval.py
|
| 45 |
```
|
| 46 |
|
|
|
|
| 21 |
|
| 22 |
1. On Hugging Face, **Duplicate** the template Space above (or create a new Gradio Space and copy these files into the repo root).
|
| 23 |
2. The repo **`README.md` frontmatter must include `hf_oauth: true`** so Hugging Face injects `OAUTH_CLIENT_ID` / OAuth for `gr.LoginButton()` (required by Gradio 5.10+).
|
| 24 |
+
3. In the Space **Settings → Repository secrets**, add **at least one** LLM credential:
|
| 25 |
+
- **`GROQ_API_KEY`** (recommended, [free tier](https://console.groq.com)): the agent uses **Groq’s OpenAI-compatible API** for chat, Whisper-class ASR, and vision when this is set. You do **not** need Hugging Face Inference credits or a paid Hub plan for the LLM.
|
| 26 |
+
- **`OPENAI_API_KEY`** (optional): same wiring via the OpenAI SDK; used only if `GROQ_API_KEY` is **not** set.
|
| 27 |
+
- **`HF_TOKEN`**: Hugging Face token with **read** permission — used only when neither Groq nor OpenAI keys are set (Hub **Inference** / serverless). If you see **402 Payment Required**, your **Inference Provider credits** are exhausted: set **`GROQ_API_KEY`** (preferred) or add billing, otherwise LLM-heavy tasks will fail. Several tasks are solved deterministically, **without** the LLM, to save quota.
|
| 28 |
4. Optional **Variables** (or secrets) to tune models:
|
| 29 |
- `HF_INFERENCE_PROVIDER` — **omit by default** so the client uses **`auto`**: the first [inference provider](https://hf.co/settings/inference-providers) that supports your **chosen model** on the Hub. Do **not** set `hf-inference` unless that model lists it — many chat models (e.g. Qwen2.5-7B-Instruct) only support **together** / **featherless-ai**, and forcing `hf-inference` yields **404**. If the auto order hits a provider that returns **401** (e.g. Novita), reorder providers in HF settings or pin e.g. `HF_INFERENCE_PROVIDER=together`.
|
| 30 |
+
- `GAIA_GROQ_CHAT_MODEL` — default `llama-3.1-8b-instant` (Groq chat + tool calls).
|
| 31 |
+
- `GAIA_OPENAI_CHAT_MODEL` — default `gpt-4o-mini` (when using `OPENAI_API_KEY`).
|
| 32 |
+
- `GAIA_TEXT_MODEL` — default `Qwen/Qwen2.5-7B-Instruct` (HF Inference only).
|
| 33 |
+
- `GAIA_GROQ_ASR_MODEL` / `GAIA_OPENAI_ASR_MODEL` — Groq default `whisper-large-v3`; OpenAI default `whisper-1`.
|
| 34 |
+
- `GAIA_GROQ_VISION_MODEL` — default `llama-3.2-11b-vision-preview`.
|
| 35 |
+
- `GAIA_OPENAI_VISION_MODEL` — default `gpt-4o-mini`.
|
| 36 |
+
- `GAIA_ASR_MODEL` — HF-only default `openai/whisper-large-v3`
|
| 37 |
+
- `GAIA_VISION_MODEL` — HF-only default `meta-llama/Llama-3.2-11B-Vision-Instruct`
|
| 38 |
- `GAIA_API_URL` — default `https://agents-course-unit4-scoring.hf.space`
|
| 39 |
- `GAIA_USE_CACHE` — `1` (default) or `0` to disable `gaia_answers_cache.json`
|
| 40 |
|
|
|
|
| 46 |
cd gaia_unit4_space
|
| 47 |
python -m venv .venv && source .venv/bin/activate
|
| 48 |
pip install -r requirements.txt
|
| 49 |
+
export GROQ_API_KEY=gsk_... # or: export HF_TOKEN=hf_...
|
| 50 |
python run_local_eval.py
|
| 51 |
```
|
| 52 |
|
__pycache__/agent.cpython-312.pyc
CHANGED
|
Binary files a/__pycache__/agent.cpython-312.pyc and b/__pycache__/agent.cpython-312.pyc differ
|
|
|
__pycache__/llm_backends.cpython-312.pyc
ADDED
|
Binary file (3.11 kB). View file
|
|
|
agent.py
CHANGED
|
@@ -1,16 +1,27 @@
|
|
| 1 |
-
"""GAIA Unit 4 agent: tool-calling loop via Hugging Face Inference
|
| 2 |
|
| 3 |
from __future__ import annotations
|
| 4 |
|
| 5 |
import os
|
| 6 |
from typing import Any, Optional
|
| 7 |
|
| 8 |
-
from huggingface_hub import InferenceClient
|
| 9 |
-
|
| 10 |
from answer_normalize import normalize_answer
|
| 11 |
from inference_client_factory import inference_client_kwargs
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 12 |
from tools.registry import TOOL_DEFINITIONS, deterministic_attempt, dispatch_tool
|
| 13 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 14 |
SYSTEM_PROMPT = """You solve GAIA benchmark questions for the Hugging Face Agents Course.
|
| 15 |
|
| 16 |
Hard rules:
|
|
@@ -36,21 +47,34 @@ class GaiaAgent:
|
|
| 36 |
or os.environ.get("HF_TOKEN")
|
| 37 |
or os.environ.get("HUGGINGFACEHUB_API_TOKEN")
|
| 38 |
)
|
| 39 |
-
self.
|
| 40 |
-
|
| 41 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 42 |
self.max_iterations = max_iterations
|
| 43 |
-
self._client: Optional[InferenceClient] = None
|
| 44 |
|
| 45 |
-
def
|
| 46 |
-
if
|
|
|
|
|
|
|
| 47 |
if not self.hf_token:
|
| 48 |
raise RuntimeError(
|
| 49 |
-
"HF_TOKEN or HUGGINGFACEHUB_API_TOKEN is required
|
|
|
|
| 50 |
)
|
| 51 |
kw = inference_client_kwargs(self.hf_token)
|
| 52 |
-
self.
|
| 53 |
-
return self.
|
| 54 |
|
| 55 |
def __call__(
|
| 56 |
self,
|
|
@@ -62,9 +86,10 @@ class GaiaAgent:
|
|
| 62 |
if det is not None:
|
| 63 |
return normalize_answer(det)
|
| 64 |
|
| 65 |
-
if not self.hf_token:
|
| 66 |
return normalize_answer(
|
| 67 |
-
"Error:
|
|
|
|
| 68 |
)
|
| 69 |
|
| 70 |
user_text = _build_user_payload(question, attachment_path, task_id)
|
|
@@ -73,36 +98,48 @@ class GaiaAgent:
|
|
| 73 |
{"role": "user", "content": user_text},
|
| 74 |
]
|
| 75 |
|
| 76 |
-
client = self._get_client()
|
| 77 |
last_text = ""
|
| 78 |
|
| 79 |
for _ in range(self.max_iterations):
|
| 80 |
try:
|
| 81 |
-
|
| 82 |
-
|
| 83 |
-
|
| 84 |
-
|
| 85 |
-
|
| 86 |
-
|
| 87 |
-
|
| 88 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 89 |
except Exception as e:
|
| 90 |
es = str(e)
|
| 91 |
if "402" in es or "Payment Required" in es or "depleted" in es.lower():
|
| 92 |
last_text = (
|
| 93 |
"Error: Hugging Face Inference credits exhausted (402). "
|
| 94 |
-
"
|
| 95 |
-
"
|
| 96 |
)
|
| 97 |
else:
|
| 98 |
last_text = f"Inference error: {e}"
|
| 99 |
break
|
| 100 |
|
| 101 |
-
choice = completion.choices[0]
|
| 102 |
-
msg = choice.message
|
| 103 |
last_text = (msg.content or "").strip()
|
|
|
|
| 104 |
|
| 105 |
-
if
|
| 106 |
messages.append(
|
| 107 |
{
|
| 108 |
"role": "assistant",
|
|
@@ -113,14 +150,14 @@ class GaiaAgent:
|
|
| 113 |
"type": "function",
|
| 114 |
"function": {
|
| 115 |
"name": tc.function.name,
|
| 116 |
-
"arguments": tc.function.arguments,
|
| 117 |
},
|
| 118 |
}
|
| 119 |
-
for tc in
|
| 120 |
],
|
| 121 |
}
|
| 122 |
)
|
| 123 |
-
for tc in
|
| 124 |
name = tc.function.name
|
| 125 |
args = tc.function.arguments or "{}"
|
| 126 |
result = dispatch_tool(name, args, hf_token=self.hf_token)
|
|
@@ -136,7 +173,8 @@ class GaiaAgent:
|
|
| 136 |
if last_text:
|
| 137 |
break
|
| 138 |
|
| 139 |
-
|
|
|
|
| 140 |
last_text = "Error: model hit max length without an answer."
|
| 141 |
break
|
| 142 |
|
|
|
|
| 1 |
+
"""GAIA Unit 4 agent: tool-calling loop via Groq, OpenAI, or Hugging Face Inference."""
|
| 2 |
|
| 3 |
from __future__ import annotations
|
| 4 |
|
| 5 |
import os
|
| 6 |
from typing import Any, Optional
|
| 7 |
|
|
|
|
|
|
|
| 8 |
from answer_normalize import normalize_answer
|
| 9 |
from inference_client_factory import inference_client_kwargs
|
| 10 |
+
from llm_backends import (
|
| 11 |
+
chat_complete_openai,
|
| 12 |
+
detect_llm_backend,
|
| 13 |
+
groq_chat_model,
|
| 14 |
+
hf_chat_model,
|
| 15 |
+
make_openai_sdk_client,
|
| 16 |
+
openai_chat_model,
|
| 17 |
+
)
|
| 18 |
from tools.registry import TOOL_DEFINITIONS, deterministic_attempt, dispatch_tool
|
| 19 |
|
| 20 |
+
try:
|
| 21 |
+
from huggingface_hub import InferenceClient
|
| 22 |
+
except ImportError:
|
| 23 |
+
InferenceClient = None # type: ignore
|
| 24 |
+
|
| 25 |
SYSTEM_PROMPT = """You solve GAIA benchmark questions for the Hugging Face Agents Course.
|
| 26 |
|
| 27 |
Hard rules:
|
|
|
|
| 47 |
or os.environ.get("HF_TOKEN")
|
| 48 |
or os.environ.get("HUGGINGFACEHUB_API_TOKEN")
|
| 49 |
)
|
| 50 |
+
self.backend = detect_llm_backend()
|
| 51 |
+
if self.backend == "groq":
|
| 52 |
+
self.text_model = text_model or groq_chat_model()
|
| 53 |
+
self._oa_client, _ = make_openai_sdk_client("groq")
|
| 54 |
+
self._hf_client = None
|
| 55 |
+
elif self.backend == "openai":
|
| 56 |
+
self.text_model = text_model or openai_chat_model()
|
| 57 |
+
self._oa_client, _ = make_openai_sdk_client("openai")
|
| 58 |
+
self._hf_client = None
|
| 59 |
+
else:
|
| 60 |
+
self.text_model = text_model or hf_chat_model()
|
| 61 |
+
self._oa_client = None
|
| 62 |
+
self._hf_client: Optional[InferenceClient] = None
|
| 63 |
+
|
| 64 |
self.max_iterations = max_iterations
|
|
|
|
| 65 |
|
| 66 |
+
def _get_hf_client(self) -> InferenceClient:
    """Return the cached Hugging Face ``InferenceClient``, creating it on first use.

    Raises:
        RuntimeError: if ``huggingface_hub`` could not be imported, or if no
            Hugging Face token is available when the client has to be built.
    """
    if InferenceClient is None:
        raise RuntimeError("huggingface_hub is not installed.")

    # Fast path: the client was already built by an earlier call.
    if self._hf_client is not None:
        return self._hf_client

    if not self.hf_token:
        raise RuntimeError(
            "HF_TOKEN or HUGGINGFACEHUB_API_TOKEN is required when using "
            "Hugging Face Inference (no GROQ_API_KEY / OPENAI_API_KEY set)."
        )

    client_kwargs = inference_client_kwargs(self.hf_token)
    self._hf_client = InferenceClient(**client_kwargs)
    return self._hf_client
|
| 78 |
|
| 79 |
def __call__(
|
| 80 |
self,
|
|
|
|
| 86 |
if det is not None:
|
| 87 |
return normalize_answer(det)
|
| 88 |
|
| 89 |
+
if self.backend == "hf" and not self.hf_token:
|
| 90 |
return normalize_answer(
|
| 91 |
+
"Error: set GROQ_API_KEY (free Groq tier), OPENAI_API_KEY, or HF_TOKEN "
|
| 92 |
+
"for LLM inference."
|
| 93 |
)
|
| 94 |
|
| 95 |
user_text = _build_user_payload(question, attachment_path, task_id)
|
|
|
|
| 98 |
{"role": "user", "content": user_text},
|
| 99 |
]
|
| 100 |
|
|
|
|
| 101 |
last_text = ""
|
| 102 |
|
| 103 |
for _ in range(self.max_iterations):
|
| 104 |
try:
|
| 105 |
+
if self.backend in ("groq", "openai"):
|
| 106 |
+
assert self._oa_client is not None
|
| 107 |
+
completion = chat_complete_openai(
|
| 108 |
+
self._oa_client,
|
| 109 |
+
model=self.text_model,
|
| 110 |
+
messages=messages,
|
| 111 |
+
tools=TOOL_DEFINITIONS,
|
| 112 |
+
max_tokens=1024,
|
| 113 |
+
temperature=0.15,
|
| 114 |
+
)
|
| 115 |
+
msg = completion.choices[0].message
|
| 116 |
+
else:
|
| 117 |
+
client = self._get_hf_client()
|
| 118 |
+
completion = client.chat_completion(
|
| 119 |
+
messages=messages,
|
| 120 |
+
model=self.text_model,
|
| 121 |
+
tools=TOOL_DEFINITIONS,
|
| 122 |
+
tool_choice="auto",
|
| 123 |
+
max_tokens=1024,
|
| 124 |
+
temperature=0.15,
|
| 125 |
+
)
|
| 126 |
+
msg = completion.choices[0].message
|
| 127 |
except Exception as e:
|
| 128 |
es = str(e)
|
| 129 |
if "402" in es or "Payment Required" in es or "depleted" in es.lower():
|
| 130 |
last_text = (
|
| 131 |
"Error: Hugging Face Inference credits exhausted (402). "
|
| 132 |
+
"Set Space secret GROQ_API_KEY (free at https://console.groq.com) "
|
| 133 |
+
"to use Groq instead, or add HF billing."
|
| 134 |
)
|
| 135 |
else:
|
| 136 |
last_text = f"Inference error: {e}"
|
| 137 |
break
|
| 138 |
|
|
|
|
|
|
|
| 139 |
last_text = (msg.content or "").strip()
|
| 140 |
+
tool_calls = getattr(msg, "tool_calls", None)
|
| 141 |
|
| 142 |
+
if tool_calls:
|
| 143 |
messages.append(
|
| 144 |
{
|
| 145 |
"role": "assistant",
|
|
|
|
| 150 |
"type": "function",
|
| 151 |
"function": {
|
| 152 |
"name": tc.function.name,
|
| 153 |
+
"arguments": tc.function.arguments or "{}",
|
| 154 |
},
|
| 155 |
}
|
| 156 |
+
for tc in tool_calls
|
| 157 |
],
|
| 158 |
}
|
| 159 |
)
|
| 160 |
+
for tc in tool_calls:
|
| 161 |
name = tc.function.name
|
| 162 |
args = tc.function.arguments or "{}"
|
| 163 |
result = dispatch_tool(name, args, hf_token=self.hf_token)
|
|
|
|
| 173 |
if last_text:
|
| 174 |
break
|
| 175 |
|
| 176 |
+
fr = getattr(completion.choices[0], "finish_reason", None)
|
| 177 |
+
if fr == "length":
|
| 178 |
last_text = "Error: model hit max length without an answer."
|
| 179 |
break
|
| 180 |
|
llm_backends.py
ADDED
|
@@ -0,0 +1,61 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Optional OpenAI-compatible backends (Groq free tier, OpenAI) to avoid HF Inference credits."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
import os
|
| 6 |
+
from typing import Any, Literal, Optional
|
| 7 |
+
|
| 8 |
+
Backend = Literal["groq", "openai", "hf"]
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
def detect_llm_backend() -> Backend:
    """Pick the LLM backend from the API keys present in the environment.

    ``GROQ_API_KEY`` takes precedence over ``OPENAI_API_KEY``; a key that is
    unset, empty, or whitespace-only does not count. When neither key is
    usable the result is ``"hf"``.
    """
    # Ordered by priority: the first non-blank key wins.
    candidates = (
        ("GROQ_API_KEY", "groq"),
        ("OPENAI_API_KEY", "openai"),
    )
    for env_name, backend in candidates:
        if os.environ.get(env_name, "").strip():
            return backend
    return "hf"
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
def groq_chat_model() -> str:
    """Return the chat model id to use with the Groq backend.

    Reads ``GAIA_GROQ_CHAT_MODEL``. When the variable is unset, empty, or
    whitespace-only, the default ``llama-3.1-8b-instant`` is returned.
    Surrounding whitespace on a configured value is removed.
    """
    # A blank override would otherwise be handed to the API as the model name.
    return os.environ.get("GAIA_GROQ_CHAT_MODEL", "").strip() or "llama-3.1-8b-instant"
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
def openai_chat_model() -> str:
    """Return the chat model id to use with the OpenAI backend.

    Reads ``GAIA_OPENAI_CHAT_MODEL``. When the variable is unset, empty, or
    whitespace-only, the default ``gpt-4o-mini`` is returned. Surrounding
    whitespace on a configured value is removed.
    """
    # A blank override would otherwise be handed to the API as the model name.
    return os.environ.get("GAIA_OPENAI_CHAT_MODEL", "").strip() or "gpt-4o-mini"
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
def hf_chat_model() -> str:
    """Return the chat model id to use with Hugging Face Inference.

    Reads ``GAIA_TEXT_MODEL``. When the variable is unset, empty, or
    whitespace-only, the default ``Qwen/Qwen2.5-7B-Instruct`` is returned.
    Surrounding whitespace on a configured value is removed.
    """
    # A blank override would otherwise be handed to the API as the model name.
    return os.environ.get("GAIA_TEXT_MODEL", "").strip() or "Qwen/Qwen2.5-7B-Instruct"
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
def make_openai_sdk_client(backend: Backend):
    """Return ``(client, label)`` for Groq or direct OpenAI.

    Args:
        backend: ``"groq"`` or ``"openai"``. Any other value (including
            ``"hf"``) is rejected, since no OpenAI-SDK client exists for it.

    Returns:
        A tuple of an ``openai.OpenAI`` client and the backend label
        (``"groq"`` or ``"openai"``).

    Raises:
        ValueError: if ``backend`` is neither ``"groq"`` nor ``"openai"``.
        KeyError: if the API key variable for the chosen backend is not set.
    """
    # Validate first so a wrong backend is reported as such, instead of
    # silently falling through to the OpenAI branch.
    if backend not in ("groq", "openai"):
        raise ValueError(
            f"Unsupported backend for the OpenAI SDK client: {backend!r} "
            "(expected 'groq' or 'openai')."
        )

    from openai import OpenAI

    if backend == "groq":
        key = os.environ["GROQ_API_KEY"].strip()
        return OpenAI(api_key=key, base_url="https://api.groq.com/openai/v1"), "groq"

    key = os.environ["OPENAI_API_KEY"].strip()
    # A blank OPENAI_BASE_URL must not become an empty base_url.
    base = os.environ.get("OPENAI_BASE_URL", "").strip() or "https://api.openai.com/v1"
    return OpenAI(api_key=key, base_url=base), "openai"
|
| 41 |
+
|
| 42 |
+
|
| 43 |
+
def chat_complete_openai(
    client: Any,
    *,
    model: str,
    messages: list[dict[str, Any]],
    tools: Optional[list[dict[str, Any]]],
    max_tokens: int,
    temperature: float,
) -> Any:
    """Run one chat completion through an OpenAI-SDK-style client.

    Args:
        client: object exposing ``chat.completions.create(**kwargs)``.
        model: model id forwarded as ``model``.
        messages: chat messages forwarded as ``messages``.
        tools: tool definitions; when non-empty they are forwarded together
            with ``tool_choice="auto"``, otherwise both keys are omitted.
        max_tokens: forwarded as ``max_tokens``.
        temperature: forwarded as ``temperature``.

    Returns:
        Whatever ``client.chat.completions.create`` returns.
    """
    # Tool options are only sent when there is at least one tool.
    tool_options: dict[str, Any] = (
        {"tools": tools, "tool_choice": "auto"} if tools else {}
    )
    return client.chat.completions.create(
        model=model,
        messages=messages,
        max_tokens=max_tokens,
        temperature=temperature,
        **tool_options,
    )
|
requirements.txt
CHANGED
|
@@ -8,5 +8,6 @@ lxml>=5.0.0
|
|
| 8 |
duckduckgo-search>=6.0.0
|
| 9 |
wikipedia>=1.4.0
|
| 10 |
huggingface_hub>=0.26.0
|
|
|
|
| 11 |
youtube-transcript-api>=0.6.0
|
| 12 |
Pillow>=10.0.0
|
|
|
|
| 8 |
duckduckgo-search>=6.0.0
|
| 9 |
wikipedia>=1.4.0
|
| 10 |
huggingface_hub>=0.26.0
|
| 11 |
+
openai>=1.40.0
|
| 12 |
youtube-transcript-api>=0.6.0
|
| 13 |
Pillow>=10.0.0
|
run_local_eval.py
CHANGED
|
@@ -66,7 +66,9 @@ def main() -> None:
|
|
| 66 |
print(f"{len(items)} questions")
|
| 67 |
|
| 68 |
token = os.environ.get("HF_TOKEN") or os.environ.get("HUGGINGFACEHUB_API_TOKEN")
|
| 69 |
-
|
|
|
|
|
|
|
| 70 |
|
| 71 |
out: list[dict] = []
|
| 72 |
for item in items:
|
|
@@ -85,7 +87,7 @@ def main() -> None:
|
|
| 85 |
from tools.registry import deterministic_attempt
|
| 86 |
|
| 87 |
d = deterministic_attempt(str(q), local)
|
| 88 |
-
ans = d if d is not None else "
|
| 89 |
finally:
|
| 90 |
if local and Path(local).is_file():
|
| 91 |
Path(local).unlink(missing_ok=True)
|
|
|
|
| 66 |
print(f"{len(items)} questions")
|
| 67 |
|
| 68 |
token = os.environ.get("HF_TOKEN") or os.environ.get("HUGGINGFACEHUB_API_TOKEN")
|
| 69 |
+
has_groq = bool(os.environ.get("GROQ_API_KEY", "").strip())
|
| 70 |
+
has_openai = bool(os.environ.get("OPENAI_API_KEY", "").strip())
|
| 71 |
+
agent = GaiaAgent(hf_token=token) if (token or has_groq or has_openai) else None
|
| 72 |
|
| 73 |
out: list[dict] = []
|
| 74 |
for item in items:
|
|
|
|
| 87 |
from tools.registry import deterministic_attempt
|
| 88 |
|
| 89 |
d = deterministic_attempt(str(q), local)
|
| 90 |
+
ans = d if d is not None else "NO_LLM_KEYS"
|
| 91 |
finally:
|
| 92 |
if local and Path(local).is_file():
|
| 93 |
Path(local).unlink(missing_ok=True)
|
tools/__pycache__/gaia_deterministic.cpython-312.pyc
CHANGED
|
Binary files a/tools/__pycache__/gaia_deterministic.cpython-312.pyc and b/tools/__pycache__/gaia_deterministic.cpython-312.pyc differ
|
|
|
tools/__pycache__/media_tools.cpython-312.pyc
CHANGED
|
Binary files a/tools/__pycache__/media_tools.cpython-312.pyc and b/tools/__pycache__/media_tools.cpython-312.pyc differ
|
|
|
tools/media_tools.py
CHANGED
|
@@ -1,22 +1,66 @@
|
|
| 1 |
import base64
|
| 2 |
import os
|
| 3 |
from pathlib import Path
|
| 4 |
-
from typing import Optional
|
| 5 |
|
| 6 |
from inference_client_factory import make_inference_client
|
| 7 |
|
| 8 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 9 |
def transcribe_audio(
|
| 10 |
file_path: str,
|
| 11 |
*,
|
| 12 |
hf_token: Optional[str] = None,
|
| 13 |
model: Optional[str] = None,
|
| 14 |
) -> str:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 15 |
token = hf_token or os.environ.get("HF_TOKEN") or os.environ.get(
|
| 16 |
"HUGGINGFACEHUB_API_TOKEN"
|
| 17 |
)
|
| 18 |
if not token:
|
| 19 |
-
return
|
|
|
|
|
|
|
| 20 |
mid = model or os.environ.get("GAIA_ASR_MODEL", "openai/whisper-large-v3")
|
| 21 |
client = make_inference_client(token)
|
| 22 |
try:
|
|
@@ -26,6 +70,37 @@ def transcribe_audio(
|
|
| 26 |
return f"ASR error: {e}"
|
| 27 |
|
| 28 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 29 |
def analyze_image_with_vlm(
|
| 30 |
file_path: str,
|
| 31 |
question: str,
|
|
@@ -33,18 +108,39 @@ def analyze_image_with_vlm(
|
|
| 33 |
hf_token: Optional[str] = None,
|
| 34 |
model: Optional[str] = None,
|
| 35 |
) -> str:
|
| 36 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 37 |
token = hf_token or os.environ.get("HF_TOKEN") or os.environ.get(
|
| 38 |
"HUGGINGFACEHUB_API_TOKEN"
|
| 39 |
)
|
| 40 |
if not token:
|
| 41 |
-
return
|
|
|
|
|
|
|
| 42 |
mid = model or os.environ.get(
|
| 43 |
"GAIA_VISION_MODEL", "meta-llama/Llama-3.2-11B-Vision-Instruct"
|
| 44 |
)
|
| 45 |
-
path = Path(file_path)
|
| 46 |
-
if not path.is_file():
|
| 47 |
-
return f"Error: image not found: {file_path}"
|
| 48 |
raw = path.read_bytes()
|
| 49 |
b64 = base64.b64encode(raw).decode("ascii")
|
| 50 |
mime = "image/png" if path.suffix.lower() == ".png" else "image/jpeg"
|
|
@@ -79,7 +175,11 @@ def visual_question_short(
|
|
| 79 |
hf_token: Optional[str] = None,
|
| 80 |
model: Optional[str] = None,
|
| 81 |
) -> str:
|
| 82 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 83 |
token = hf_token or os.environ.get("HF_TOKEN") or os.environ.get(
|
| 84 |
"HUGGINGFACEHUB_API_TOKEN"
|
| 85 |
)
|
|
|
|
| 1 |
import base64
|
| 2 |
import os
|
| 3 |
from pathlib import Path
|
| 4 |
+
from typing import Any, Optional
|
| 5 |
|
| 6 |
from inference_client_factory import make_inference_client
|
| 7 |
|
| 8 |
|
| 9 |
+
def _groq_openai_client():
|
| 10 |
+
k = os.environ.get("GROQ_API_KEY", "").strip()
|
| 11 |
+
if not k:
|
| 12 |
+
return None
|
| 13 |
+
from openai import OpenAI
|
| 14 |
+
|
| 15 |
+
return OpenAI(api_key=k, base_url="https://api.groq.com/openai/v1")
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
def _openai_platform_client():
|
| 19 |
+
k = os.environ.get("OPENAI_API_KEY", "").strip()
|
| 20 |
+
if not k:
|
| 21 |
+
return None
|
| 22 |
+
from openai import OpenAI
|
| 23 |
+
|
| 24 |
+
base = os.environ.get("OPENAI_BASE_URL", "https://api.openai.com/v1").strip()
|
| 25 |
+
return OpenAI(api_key=k, base_url=base)
|
| 26 |
+
|
| 27 |
+
|
| 28 |
def transcribe_audio(
|
| 29 |
file_path: str,
|
| 30 |
*,
|
| 31 |
hf_token: Optional[str] = None,
|
| 32 |
model: Optional[str] = None,
|
| 33 |
) -> str:
|
| 34 |
+
gc = _groq_openai_client()
|
| 35 |
+
if gc:
|
| 36 |
+
mid = model or os.environ.get("GAIA_GROQ_ASR_MODEL", "whisper-large-v3")
|
| 37 |
+
try:
|
| 38 |
+
with open(file_path, "rb") as audio_f:
|
| 39 |
+
tr = gc.audio.transcriptions.create(
|
| 40 |
+
model=mid,
|
| 41 |
+
file=audio_f,
|
| 42 |
+
)
|
| 43 |
+
return (tr.text or "").strip()
|
| 44 |
+
except Exception as e:
|
| 45 |
+
return f"ASR error (Groq): {e}"
|
| 46 |
+
|
| 47 |
+
oc = _openai_platform_client()
|
| 48 |
+
if oc:
|
| 49 |
+
mid = model or os.environ.get("GAIA_OPENAI_ASR_MODEL", "whisper-1")
|
| 50 |
+
try:
|
| 51 |
+
with open(file_path, "rb") as audio_f:
|
| 52 |
+
tr = oc.audio.transcriptions.create(model=mid, file=audio_f)
|
| 53 |
+
return (tr.text or "").strip()
|
| 54 |
+
except Exception as e:
|
| 55 |
+
return f"ASR error (OpenAI): {e}"
|
| 56 |
+
|
| 57 |
token = hf_token or os.environ.get("HF_TOKEN") or os.environ.get(
|
| 58 |
"HUGGINGFACEHUB_API_TOKEN"
|
| 59 |
)
|
| 60 |
if not token:
|
| 61 |
+
return (
|
| 62 |
+
"Error: set GROQ_API_KEY (free), OPENAI_API_KEY, or HF_TOKEN for speech."
|
| 63 |
+
)
|
| 64 |
mid = model or os.environ.get("GAIA_ASR_MODEL", "openai/whisper-large-v3")
|
| 65 |
client = make_inference_client(token)
|
| 66 |
try:
|
|
|
|
| 70 |
return f"ASR error: {e}"
|
| 71 |
|
| 72 |
|
| 73 |
+
def _vision_chat_openai(
|
| 74 |
+
client: Any,
|
| 75 |
+
*,
|
| 76 |
+
model: str,
|
| 77 |
+
file_path: Path,
|
| 78 |
+
question: str,
|
| 79 |
+
) -> str:
|
| 80 |
+
raw = file_path.read_bytes()
|
| 81 |
+
b64 = base64.b64encode(raw).decode("ascii")
|
| 82 |
+
mime = "image/png" if file_path.suffix.lower() == ".png" else "image/jpeg"
|
| 83 |
+
data_url = f"data:{mime};base64,{b64}"
|
| 84 |
+
comp = client.chat.completions.create(
|
| 85 |
+
model=model,
|
| 86 |
+
messages=[
|
| 87 |
+
{
|
| 88 |
+
"role": "user",
|
| 89 |
+
"content": [
|
| 90 |
+
{"type": "text", "text": question},
|
| 91 |
+
{
|
| 92 |
+
"type": "image_url",
|
| 93 |
+
"image_url": {"url": data_url},
|
| 94 |
+
},
|
| 95 |
+
],
|
| 96 |
+
}
|
| 97 |
+
],
|
| 98 |
+
max_tokens=512,
|
| 99 |
+
temperature=0.2,
|
| 100 |
+
)
|
| 101 |
+
return (comp.choices[0].message.content or "").strip()
|
| 102 |
+
|
| 103 |
+
|
| 104 |
def analyze_image_with_vlm(
|
| 105 |
file_path: str,
|
| 106 |
question: str,
|
|
|
|
| 108 |
hf_token: Optional[str] = None,
|
| 109 |
model: Optional[str] = None,
|
| 110 |
) -> str:
|
| 111 |
+
path = Path(file_path)
|
| 112 |
+
if not path.is_file():
|
| 113 |
+
return f"Error: image not found: {file_path}"
|
| 114 |
+
|
| 115 |
+
gc = _groq_openai_client()
|
| 116 |
+
if gc:
|
| 117 |
+
mid = model or os.environ.get(
|
| 118 |
+
"GAIA_GROQ_VISION_MODEL",
|
| 119 |
+
"llama-3.2-11b-vision-preview",
|
| 120 |
+
)
|
| 121 |
+
try:
|
| 122 |
+
return _vision_chat_openai(gc, model=mid, file_path=path, question=question)
|
| 123 |
+
except Exception as e:
|
| 124 |
+
return f"Vision error (Groq): {e}"
|
| 125 |
+
|
| 126 |
+
oc = _openai_platform_client()
|
| 127 |
+
if oc:
|
| 128 |
+
mid = model or os.environ.get("GAIA_OPENAI_VISION_MODEL", "gpt-4o-mini")
|
| 129 |
+
try:
|
| 130 |
+
return _vision_chat_openai(oc, model=mid, file_path=path, question=question)
|
| 131 |
+
except Exception as e:
|
| 132 |
+
return f"Vision error (OpenAI): {e}"
|
| 133 |
+
|
| 134 |
token = hf_token or os.environ.get("HF_TOKEN") or os.environ.get(
|
| 135 |
"HUGGINGFACEHUB_API_TOKEN"
|
| 136 |
)
|
| 137 |
if not token:
|
| 138 |
+
return (
|
| 139 |
+
"Error: set GROQ_API_KEY, OPENAI_API_KEY, or HF_TOKEN for vision."
|
| 140 |
+
)
|
| 141 |
mid = model or os.environ.get(
|
| 142 |
"GAIA_VISION_MODEL", "meta-llama/Llama-3.2-11B-Vision-Instruct"
|
| 143 |
)
|
|
|
|
|
|
|
|
|
|
| 144 |
raw = path.read_bytes()
|
| 145 |
b64 = base64.b64encode(raw).decode("ascii")
|
| 146 |
mime = "image/png" if path.suffix.lower() == ".png" else "image/jpeg"
|
|
|
|
| 175 |
hf_token: Optional[str] = None,
|
| 176 |
model: Optional[str] = None,
|
| 177 |
) -> str:
|
| 178 |
+
if _groq_openai_client() or _openai_platform_client():
|
| 179 |
+
return analyze_image_with_vlm(
|
| 180 |
+
file_path, question, hf_token=hf_token, model=model
|
| 181 |
+
)
|
| 182 |
+
|
| 183 |
token = hf_token or os.environ.get("HF_TOKEN") or os.environ.get(
|
| 184 |
"HUGGINGFACEHUB_API_TOKEN"
|
| 185 |
)
|