Update app.py
Browse files
app.py
CHANGED
|
@@ -31,18 +31,37 @@ MODEL_PRIORITY = [
|
|
| 31 |
|
| 32 |
SYSTEM_PROMPT = """You are a precise AI assistant solving GAIA benchmark questions.
|
| 33 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 34 |
CRITICAL RULES:
|
| 35 |
-
1.
|
| 36 |
-
2.
|
| 37 |
-
3.
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 46 |
"""
|
| 47 |
|
| 48 |
# ── Tools ──────────────────────────────────────────────────────────────────────
|
|
@@ -176,27 +195,79 @@ def make_llm(model_id: str):
|
|
| 176 |
def agent_node(state: AgentState):
|
| 177 |
"""
|
| 178 |
Essaie les modèles dans l'ordre MODEL_PRIORITY.
|
| 179 |
-
|
|
|
|
| 180 |
"""
|
| 181 |
last_error = None
|
|
|
|
| 182 |
for model_id in MODEL_PRIORITY:
|
| 183 |
try:
|
| 184 |
print(f" [agent] Essai modèle : {model_id}")
|
| 185 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 186 |
return {"messages": [response]}
|
|
|
|
| 187 |
except Exception as e:
|
| 188 |
print(f" [agent] Modèle {model_id} échoué : {e}")
|
| 189 |
last_error = e
|
| 190 |
continue
|
| 191 |
|
|
|
|
| 192 |
raise RuntimeError(f"Tous les modèles Groq ont échoué. Dernière erreur : {last_error}")
|
| 193 |
|
| 194 |
|
| 195 |
def should_continue(state: AgentState):
|
| 196 |
-
"""Décide si on appelle des outils ou si on termine."""
|
| 197 |
last = state["messages"][-1]
|
|
|
|
|
|
|
| 198 |
if hasattr(last, "tool_calls") and last.tool_calls:
|
| 199 |
return "tools"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 200 |
return END
|
| 201 |
|
| 202 |
|
|
|
|
| 31 |
|
| 32 |
SYSTEM_PROMPT = """You are a precise AI assistant solving GAIA benchmark questions.
|
| 33 |
|
| 34 |
+
AVAILABLE TOOLS (USE ONLY THESE EXACT NAMES):
|
| 35 |
+
- web_search
|
| 36 |
+
- wikipedia_search
|
| 37 |
+
- visit_webpage
|
| 38 |
+
- download_task_file
|
| 39 |
+
- calculator
|
| 40 |
+
|
| 41 |
CRITICAL RULES:
|
| 42 |
+
1. NEVER call any tool outside this list.
|
| 43 |
+
2. DO NOT use brave_search or browser.search.
|
| 44 |
+
3. Always use the exact tool names provided.
|
| 45 |
+
|
| 46 |
+
4. Answer ONLY the exact question asked.
|
| 47 |
+
5. Use tools whenever you are not 100% certain.
|
| 48 |
+
6. Think step by step before answering.
|
| 49 |
+
|
| 50 |
+
7. Final answer must be:
|
| 51 |
+
- SHORT
|
| 52 |
+
- EXACT format
|
| 53 |
+
- NO explanation
|
| 54 |
+
|
| 55 |
+
8. If a file is mentioned → ALWAYS call download_task_file.
|
| 56 |
+
9. If file content is provided in the question, you MUST use it.
|
| 57 |
+
DO NOT ask for the file again.
|
| 58 |
+
|
| 59 |
+
10. Never hallucinate.
|
| 60 |
+
11. When using web_search, ALWAYS follow by visit_webpage on a relevant result to confirm the answer.
|
| 61 |
+
|
| 62 |
+
12. Prefer exact facts from webpages over search snippets.
|
| 63 |
+
|
| 64 |
+
13. When possible, verify the answer using at least two sources.
|
| 65 |
"""
|
| 66 |
|
| 67 |
# ── Tools ──────────────────────────────────────────────────────────────────────
|
|
|
|
| 195 |
# Substrings (matched case-insensitively) that mark a final answer as a
# refusal or non-answer.
_WEAK_PATTERNS = (
    "unable",
    "not sure",
    "i don't know",
    "cannot find",
    "no information",
    "insufficient information",
    "not available",
    "i could not",
    "i cannot",
    "unknown",
)

# Final answers with more words than this are rejected as too verbose.
_MAX_ANSWER_WORDS = 50


def _check_answer_quality(content):
    """Raise ValueError if the final-answer text fails the quality filter.

    An answer is rejected when it is empty, contains one of _WEAK_PATTERNS,
    or has more than _MAX_ANSWER_WORDS whitespace-separated words.
    """
    lowered = content.lower()
    if not content or any(pattern in lowered for pattern in _WEAK_PATTERNS):
        print(" [agent] Réponse faible détectée → retry modèle suivant")
        raise ValueError("Weak or uncertain answer")

    if len(content.split()) > _MAX_ANSWER_WORDS:
        print(" [agent] Réponse trop longue → probablement incorrecte")
        raise ValueError("Answer too verbose")


def agent_node(state: AgentState):
    """Invoke the models in MODEL_PRIORITY order and return the first usable reply.

    A reply that requests tool calls is accepted as-is. A reply without tool
    calls is treated as a final answer and must pass the quality filter
    (non-empty, no refusal phrase, at most _MAX_ANSWER_WORDS words);
    otherwise the next model is tried.

    Args:
        state: Graph state; its "messages" entry is passed to the model.

    Returns:
        A dict {"messages": [response]} holding the accepted model response.

    Raises:
        RuntimeError: If every model either raised or produced a rejected
            answer. The message includes the last error encountered.
    """
    last_error = None

    for model_id in MODEL_PRIORITY:
        try:
            print(f" [agent] Essai modèle : {model_id}")

            llm = make_llm(model_id)
            response = llm.invoke(state["messages"])

            content = str(response.content).strip()
            print(f" [agent] Réponse brute : {content[:120]}")

            # A tool request usually carries no text content. Running the
            # final-answer filter on it would reject every tool call as
            # "empty", so the agent could never use its tools.
            if not getattr(response, "tool_calls", None):
                _check_answer_quality(content)

            print(" [agent] Réponse acceptée ✅")
            return {"messages": [response]}

        except Exception as e:
            # Deliberately broad: any provider failure or rejected answer
            # falls through to the next model in the priority list.
            print(f" [agent] Modèle {model_id} échoué : {e}")
            last_error = e
            continue

    # Every model failed or was rejected.
    raise RuntimeError(f"Tous les modèles Groq ont échoué. Dernière erreur : {last_error}")
|
| 253 |
|
| 254 |
|
| 255 |
# How many times the agent is sent back to try again when it answered
# without ever calling a tool.
_MAX_FORCED_AGENT_RETRIES = 1


def should_continue(state: AgentState):
    """Route the graph after an agent turn.

    Args:
        state: Graph state; its "messages" entry is the conversation so far.

    Returns:
        "tools" when the last message requests tool calls; "agent" to force
        another attempt when no tool has been used yet and the retry budget
        is not exhausted; END otherwise.
    """
    messages = state["messages"]
    last = messages[-1]

    # Pending tool calls -> execute them.
    if hasattr(last, "tool_calls") and last.tool_calls:
        return "tools"

    # Messages exposing `tool_calls` are the agent's own turns; this is the
    # same attribute test used above to detect tool requests.
    agent_turns = [m for m in messages if hasattr(m, "tool_calls")]
    used_tools = any(m.tool_calls for m in agent_turns)

    # Force another attempt when the agent answered without any tool, but
    # only a bounded number of times. An unconditional "agent" here loops
    # until the graph's recursion limit aborts the run, because nothing in
    # the state changes the outcome of this check between turns.
    if not used_tools and len(agent_turns) <= _MAX_FORCED_AGENT_RETRIES:
        return "agent"

    return END
|
| 272 |
|
| 273 |
|