Spaces:
Sleeping
Sleeping
Ajout web_search et web_get
Browse files- quick_deploy_agent.py +103 -11
quick_deploy_agent.py
CHANGED
|
@@ -142,7 +142,7 @@ class RegexCOICOP(Tool):
|
|
| 142 |
output_type = "object"
|
| 143 |
|
| 144 |
import re as _re
|
| 145 |
-
SOFT = _re.compile(r"(?:\b|^)(?:CAMEMB(?:ERT)?|BRIE|COULOMMI(?:ERS?)?|BLEU|ROQUEFORT|
|
| 146 |
PRESS = _re.compile(r"(?:\b|^)(EMMENTAL|COMTE|CANTAL|MIMOLETTE|GOUDA|EDAM|BEAUFORT|ABONDANCE|SALERS|TOMME|TOME)(?:\b|$)")
|
| 147 |
GOAT = _re.compile(r"(?:\b|^)(CHEVRE|STE MAURE|CROTTIN|BUCHE|PICODON|PELARDON|BANON)(?:\b|$)")
|
| 148 |
PROC = _re.compile(r"(?:\b|^)(FONDU(?:ES?)?|FROMAGE FONDU|TOASTINETTES?|VACHE QUI RIT|KIRI|CARRE FRAIS|CARR[ÉE] FRAIS|PORTIONS?)(?:\b|$)|\bRAP[ÉE]?\b")
|
|
@@ -262,6 +262,74 @@ class SemSim(Tool):
|
|
| 262 |
)
|
| 263 |
return {"candidates": ranked[:max(1,int(topk))]}
|
| 264 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 265 |
# ---- Resolve ----
|
| 266 |
class Resolve(Tool):
|
| 267 |
name, description = "resolve_coicop_candidates", "Fusionne candidats → choix final + alternatives + explication."
|
|
@@ -312,10 +380,19 @@ def build_agent(model_id: str | None = None) -> CodeAgent:
|
|
| 312 |
top_p=0.95,
|
| 313 |
)
|
| 314 |
agent = CodeAgent(
|
| 315 |
-
tools=[
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 316 |
model=model,
|
| 317 |
add_base_tools=False,
|
| 318 |
-
max_steps=
|
| 319 |
verbosity_level=2,
|
| 320 |
)
|
| 321 |
return agent
|
|
@@ -334,16 +411,31 @@ if __name__ == "__main__":
|
|
| 334 |
Classe ce produit en COICOP:
|
| 335 |
EAN: {ean}
|
| 336 |
Libellé: {label}
|
| 337 |
-
|
| 338 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 339 |
|
| 340 |
Pipeline :
|
| 341 |
-
|
| 342 |
-
|
| 343 |
-
|
| 344 |
-
|
| 345 |
-
|
| 346 |
-
|
|
|
|
|
|
|
| 347 |
|
| 348 |
Retourne uniquement un JSON valide (objet), sans backticks.
|
| 349 |
"""
|
|
|
|
| 142 |
output_type = "object"
|
| 143 |
|
| 144 |
import re as _re
|
| 145 |
+
SOFT = _re.compile(r"(?:\b|^)(?:CAMEMB(?:ERT)?|BRIE|COULOMMI(?:ERS?)?|BLEU|ROQUEFORT|GORGONZOLA|REBLOCHON|MUNSTER)(?:\b|$)")
|
| 146 |
PRESS = _re.compile(r"(?:\b|^)(EMMENTAL|COMTE|CANTAL|MIMOLETTE|GOUDA|EDAM|BEAUFORT|ABONDANCE|SALERS|TOMME|TOME)(?:\b|$)")
|
| 147 |
GOAT = _re.compile(r"(?:\b|^)(CHEVRE|STE MAURE|CROTTIN|BUCHE|PICODON|PELARDON|BANON)(?:\b|$)")
|
| 148 |
PROC = _re.compile(r"(?:\b|^)(FONDU(?:ES?)?|FROMAGE FONDU|TOASTINETTES?|VACHE QUI RIT|KIRI|CARRE FRAIS|CARR[ÉE] FRAIS|PORTIONS?)(?:\b|$)|\bRAP[ÉE]?\b")
|
|
|
|
| 262 |
)
|
| 263 |
return {"candidates": ranked[:max(1,int(topk))]}
|
| 264 |
|
| 265 |
+
# ---- Web tools (recherche & lecture) ----
|
| 266 |
+
class WebSearch(Tool):
    """Lightweight web search tool backed by the DuckDuckGo HTML endpoint.

    The result page is parsed with regular expressions only (no HTML parser
    dependency). At most 8 results are returned, each as a dict with the
    keys ``title``, ``url`` and ``snippet``.
    """

    name = "web_search"
    description = "Recherche web légère (DuckDuckGo HTML). Entrée: query (fr/en). Retour: top résultats avec titre, url, snippet."
    inputs = {"query": {"type":"string","description":"Requête de recherche web."}}
    output_type = "object"
    requirements = ["requests"]

    def forward(self, query: str):
        """Run a web search and return the parsed results.

        Args:
            query: Free-text search query.

        Returns:
            ``{"ok": True, "query": query, "results": [...]}`` on success, or
            ``{"ok": False, "error": <message>, "results": []}`` when the HTTP
            request fails. This method does not raise on network errors.
        """
        # Local imports keep the tool self-contained: forward() does not rely
        # on these names being present in the enclosing module's namespace.
        import html
        import re
        from urllib.parse import parse_qs, urlparse

        import requests

        max_results = 8

        try:
            # Context manager releases the session's pooled connections.
            with requests.Session() as sess:
                sess.headers.update({"User-Agent":"insee-coicop-agent/1.0"})
                r = sess.get("https://duckduckgo.com/html/", params={"q": query, "kl":"fr-fr"}, timeout=15)
                r.raise_for_status()
                text = r.text
        except Exception as e:
            # Deliberate best-effort boundary: report the failure to the agent
            # as data instead of raising.
            return {"ok": False, "error": str(e), "results": []}

        # Result links look like <a class="result__a" href="...">Title</a>;
        # snippets look like <a class="result__snippet" ...>text</a>.
        link_re = re.compile(r'<a[^>]+class="result__a"[^>]+href="([^"]+)"[^>]*>(.*?)</a>', re.I | re.S)
        snippet_re = re.compile(r'<a[^>]+class="result__snippet"[^>]*>(.*?)</a>', re.I | re.S)
        tag_re = re.compile(r"<[^>]+>")

        def clean(fragment):
            # Strip markup first, then decode entities, so escaped text such
            # as "&lt;b&gt;" survives instead of being removed as a tag.
            return html.unescape(tag_re.sub("", fragment)).strip()

        def resolve(href):
            # The endpoint may emit scheme-relative links and may wrap the
            # target in a "/l/?uddg=<encoded url>" redirect; return a URL that
            # can be fetched directly.
            if href.startswith("//"):
                href = "https:" + href
            parsed = urlparse(href)
            if parsed.netloc.endswith("duckduckgo.com") and parsed.path.rstrip("/") == "/l":
                target = parse_qs(parsed.query).get("uddg")
                if target:
                    return target[0]
            return href

        matches = list(link_re.finditer(text))
        results = []
        for idx, m in enumerate(matches):
            # Look for the snippet only between this result link and the next
            # one, so a result without a snippet cannot borrow its neighbour's.
            segment_end = matches[idx + 1].start() if idx + 1 < len(matches) else len(text)
            snip_m = snippet_re.search(text, m.end(), segment_end)

            url = resolve(html.unescape(m.group(1)))
            title = clean(m.group(2))
            snippet = clean(snip_m.group(1)) if snip_m else ""

            if title and url:
                results.append({"title": title, "url": url, "snippet": snippet})
            if len(results) >= max_results:
                break
        return {"ok": True, "query": query, "results": results}
|
| 299 |
+
|
| 300 |
+
class WebGet(Tool):
    """Download a web page and return its visible text.

    The page is cleaned with BeautifulSoup when it can be imported and used;
    otherwise a regex-based fallback strips the markup. The returned text is
    whitespace-normalised and truncated to 50,000 characters.
    """

    name = "web_get"
    description = "Télécharge une page web et renvoie un texte brut nettoyé (limité à ~50k chars)."
    inputs = {"url": {"type":"string","description":"URL http(s) à lire."}}
    output_type = "object"
    requirements = ["requests", "beautifulsoup4"]

    def forward(self, url: str):
        """Fetch ``url`` and return its cleaned text content.

        Args:
            url: http(s) URL to read. A scheme-relative URL (``//host/path``)
                is fetched over https.

        Returns:
            ``{"ok": True, "url": url, "text": <text>}`` on success;
            ``{"ok": False, "status": <code>, "url": url, "text": ""}`` when
            the server answers with an error status;
            ``{"ok": False, "url": url, "error": <message>, "text": ""}`` for
            an unsupported URL or any other failure. This method does not
            raise.
        """
        # Local imports keep the tool self-contained: forward() does not rely
        # on these names being present in the enclosing module's namespace.
        import html
        import re

        import requests

        max_chars = 50000

        fetch_url = (url or "").strip()
        if fetch_url.startswith("//"):
            # Scheme-relative link: default to https.
            fetch_url = "https:" + fetch_url
        if not fetch_url.lower().startswith(("http://", "https://")):
            return {"ok": False, "url": url, "error": "unsupported URL: expected http:// or https://", "text": ""}

        try:
            r = requests.get(fetch_url, headers={"User-Agent":"insee-coicop-agent/1.0"}, timeout=20)
            if not r.ok:
                return {"ok": False, "status": r.status_code, "url": url, "text": ""}
            content = r.text
            try:
                from bs4 import BeautifulSoup
                soup = BeautifulSoup(content, "html.parser")
                # Drop non-content elements (scripts, styles, navigation...).
                for tag in soup(["script","style","noscript","header","footer","nav","form","aside"]):
                    tag.decompose()
                text_out = soup.get_text(separator=" ")
            except Exception:
                # Fallback when BeautifulSoup is unavailable or fails: strip
                # script/style bodies, then all remaining tags, then decode
                # the HTML entities left in the text.
                text_out = re.sub(r"<script.*?</script>|<style.*?</style>", " ", content, flags=re.S|re.I)
                text_out = re.sub(r"<[^>]+>", " ", text_out)
                text_out = html.unescape(text_out)
            # Collapse whitespace runs and cap the payload size.
            text_out = re.sub(r"\s+", " ", text_out).strip()
            return {"ok": True, "url": url, "text": text_out[:max_chars]}
        except Exception as e:
            # Deliberate best-effort boundary: report the failure to the agent
            # as data instead of raising.
            return {"ok": False, "url": url, "error": str(e), "text": ""}
|
| 332 |
+
|
| 333 |
# ---- Resolve ----
|
| 334 |
class Resolve(Tool):
|
| 335 |
name, description = "resolve_coicop_candidates", "Fusionne candidats → choix final + alternatives + explication."
|
|
|
|
| 380 |
top_p=0.95,
|
| 381 |
)
|
| 382 |
agent = CodeAgent(
|
| 383 |
+
tools=[
|
| 384 |
+
ValidateEANTool(),
|
| 385 |
+
OFFByEAN(),
|
| 386 |
+
RegexCOICOP(),
|
| 387 |
+
OFFtoCOICOP(),
|
| 388 |
+
SemSim(),
|
| 389 |
+
WebSearch(), # <-- autorise recherche web
|
| 390 |
+
WebGet(), # <-- autorise lecture de pages
|
| 391 |
+
Resolve(),
|
| 392 |
+
],
|
| 393 |
model=model,
|
| 394 |
add_base_tools=False,
|
| 395 |
+
max_steps=8, # un peu plus de marge si web utilisé
|
| 396 |
verbosity_level=2,
|
| 397 |
)
|
| 398 |
return agent
|
|
|
|
| 411 |
Classe ce produit en COICOP:
|
| 412 |
EAN: {ean}
|
| 413 |
Libellé: {label}
|
| 414 |
+
|
| 415 |
+
Outils autorisés UNIQUEMENT :
|
| 416 |
+
- validate_ean
|
| 417 |
+
- openfoodfacts_product_by_ean
|
| 418 |
+
- map_off_to_coicop
|
| 419 |
+
- coicop_regex_rules
|
| 420 |
+
- coicop_semantic_similarity
|
| 421 |
+
- web_search
|
| 422 |
+
- web_get
|
| 423 |
+
- resolve_coicop_candidates
|
| 424 |
+
|
| 425 |
+
RÈGLES:
|
| 426 |
+
- TU PEUX interroger Internet via web_search puis web_get pour récupérer infos produit (fiche marque, page drive, comparateurs, etc.).
|
| 427 |
+
- N'UTILISE PAS python_interpreter. N'ÉCRIS PAS DE CODE.
|
| 428 |
+
- N'INDEXE JAMAIS la sortie d'un tool (copie-colle uniquement ce qui est utile).
|
| 429 |
|
| 430 |
Pipeline :
|
| 431 |
+
1) validate_ean(ean)
|
| 432 |
+
2) openfoodfacts_product_by_ean(ean)
|
| 433 |
+
3) map_off_to_coicop(off_payload=<sortie brute de (2)>) ou, si nécessaire, map_off_to_coicop(product_name, categories_tags, ingredients_text)
|
| 434 |
+
3bis) SI doute (peu d'infos ou contradictions), web_search(query = "EAN + libellé + marque" ou libellé seul) → choisir 1–2 urls pertinentes → web_get(url)
|
| 435 |
+
4) coicop_regex_rules(text = LIBELLÉ UTILISATEUR)
|
| 436 |
+
4bis) coicop_regex_rules(text = TEXTE DES PAGES WEB RÉCUPÉRÉES) # pour capter des mots-clés comme camembert/brie/emmental/etc.
|
| 437 |
+
5) coicop_semantic_similarity(text = LIBELLÉ UTILISATEUR, topk = 5)
|
| 438 |
+
6) resolve_coicop_candidates(json_lists = [candidats de 3, 4, 4bis, 5], topn = 3)
|
| 439 |
|
| 440 |
Retourne uniquement un JSON valide (objet), sans backticks.
|
| 441 |
"""
|