Spaces:
Sleeping
Sleeping
Initial commit for Hugging Face Space
Browse files- .gitignore +49 -0
- app.py +63 -0
- config/prompts.py +466 -0
- config/settings.py +127 -0
- langgraphe_app.py +219 -0
- requirements.txt +36 -0
- src/__init__.py +14 -0
- src/agents/__init__.py +14 -0
- src/agents/base_agent.py +235 -0
- src/agents/content_extractor_agent.py +626 -0
- src/agents/global_synthesizer_agent.py +826 -0
- src/agents/researcher_agent.py +642 -0
- src/agents/summarizer_agent.py +669 -0
- src/core/__init__.py +7 -0
- src/core/logging.py +73 -0
- src/enhanced_system_prompt.py +159 -0
- src/graph.py +294 -0
- src/graph/__init__.py +0 -0
- src/graph/nodes.py +0 -0
- src/graph/notebook.ipynb +0 -0
- src/memory_integration.py +285 -0
- src/memory_system.py +547 -0
- src/models/__init__.py +64 -0
- src/models/agent_models.py +0 -0
- src/models/document_models.py +232 -0
- src/models/report_models.py +221 -0
- src/models/research_models.py +86 -0
- src/models/state_models.py +212 -0
- src/models/synthesis_models.py +306 -0
- src/services/__init__.py +19 -0
- src/services/content_extraction.py +462 -0
- src/services/llm_service.py +488 -0
- src/services/search_api.py +347 -0
- src/services/text_chunking.py +404 -0
.gitignore
ADDED
|
@@ -0,0 +1,49 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Python
|
| 2 |
+
__pycache__/
|
| 3 |
+
*.py[cod]
|
| 4 |
+
*.pyo
|
| 5 |
+
*.pyd
|
| 6 |
+
|
| 7 |
+
# Virtual environments
|
| 8 |
+
.env/
|
| 9 |
+
venv/
|
| 10 |
+
ENV/
|
| 11 |
+
env/
|
| 12 |
+
.venv/
|
| 13 |
+
|
| 14 |
+
# Distribution / packaging
|
| 15 |
+
build/
|
| 16 |
+
dist/
|
| 17 |
+
*.egg-info/
|
| 18 |
+
|
| 19 |
+
# Pytest
|
| 20 |
+
.cache/
|
| 21 |
+
.pytest_cache/
|
| 22 |
+
|
| 23 |
+
# Data and outputs
|
| 24 |
+
data/
|
| 25 |
+
output/
|
| 26 |
+
logs/
|
| 27 |
+
docs/
|
| 28 |
+
|
| 29 |
+
#file
|
| 30 |
+
prompt.md
|
| 31 |
+
|
| 32 |
+
# Database files
|
| 33 |
+
*.db
|
| 34 |
+
|
| 35 |
+
# IDEs
|
| 36 |
+
.vscode/
|
| 37 |
+
.idea/
|
| 38 |
+
|
| 39 |
+
# OS files
|
| 40 |
+
.DS_Store
|
| 41 |
+
Thumbs.db
|
| 42 |
+
|
| 43 |
+
# dotenv
|
| 44 |
+
.env
|
| 45 |
+
.env.*
|
| 46 |
+
.specstory
|
| 47 |
+
|
| 48 |
+
# Optional: ignore local config
|
| 49 |
+
config/*.local
|
app.py
ADDED
|
@@ -0,0 +1,63 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import gradio as gr
|
| 2 |
+
from langgraphe_app import app # importe ton graphe déjà compilé
|
| 3 |
+
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
def print_stream(stream):
    """Print the latest message of every streamed state in a readable form.

    Args:
        stream: Iterable of state mappings, each exposing a ``"messages"``
            sequence.

    A ``=`` banner is printed once before the first state, and a ``-``
    separator after each printed message. States whose ``"messages"``
    sequence is empty are skipped instead of raising ``IndexError``.
    """
    print("\n" + "=" * 60)
    for state in stream:
        messages = state["messages"]
        if not messages:
            # Nothing to display for this step; indexing [-1] would fail.
            continue

        latest = messages[-1]
        # Message objects that know how to render themselves get to do so;
        # anything else (e.g. a plain tuple or string) is printed as-is.
        if hasattr(latest, "pretty_print"):
            latest.pretty_print()
        else:
            print(latest)
        print("-" * 60)
|
| 16 |
+
|
| 17 |
+
# def run_research(user_query: str):
|
| 18 |
+
# """Fonction helper pour lancer une recherche"""
|
| 19 |
+
# inputs = {"messages": [("user", user_query)]}
|
| 20 |
+
# print_stream(app.stream(inputs, stream_mode="values"))
|
| 21 |
+
|
| 22 |
+
def run_research(user_query: str) -> str:
    """Run the compiled graph on a query and return the final text for Gradio.

    Args:
        user_query: Research topic typed by the user.

    Returns:
        The ``content`` attribute of the last message of the last streamed
        state, or ``str(message)`` when that message has no ``content``
        attribute. A short explanatory message is returned instead when the
        query is blank or when the graph produced no message at all.
    """
    if not user_query or not user_query.strip():
        # Do not start the pipeline for an empty submission.
        return "Veuillez saisir un sujet de recherche."

    inputs = {"messages": [("user", user_query)]}

    # Consume the whole stream without printing; only the last state matters.
    last_state = None
    for state in app.stream(inputs, stream_mode="values"):
        last_state = state

    messages = last_state["messages"] if last_state is not None else None
    if not messages:
        # The stream yielded nothing (or an empty message list): there is no
        # final message to index, so report that instead of crashing.
        return "Aucun résultat n'a été produit par le pipeline de recherche."

    final_message = messages[-1]
    try:
        return final_message.content
    except AttributeError:
        # Plain tuples/strings have no ``content``; fall back to their repr.
        return str(final_message)
|
| 42 |
+
|
| 43 |
+
|
| 44 |
+
# Gradio front end: a topic input, a report area, and a button that feeds the
# input to run_research and shows its return value in the report area.
with gr.Blocks(title="AI Research Assistant") as demo:
    gr.Markdown("# 🔍 AI Research Assistant\nPipeline LangGraph pour la recherche automatisée")

    input_box = gr.Textbox(
        placeholder="Ex : Impact de l'IA sur le marché du travail",
        label="Votre sujet de recherche",
    )
    output_box = gr.TextArea(lines=20, label="Rapport généré")

    run_button = gr.Button("Lancer la recherche")
    run_button.click(fn=run_research, inputs=input_box, outputs=output_box)


if __name__ == "__main__":
    # Listen on all interfaces so the app is reachable from outside a container.
    # NOTE(review): Hugging Face Gradio Spaces conventionally serve on port
    # 7860 — confirm that 8000 is what the hosting setup expects.
    demo.launch(server_name="0.0.0.0", server_port=8000)
|
| 62 |
+
|
| 63 |
+
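# NOTE(review): a stray token that looked like a credential was removed from this line; if it was a real secret, rotate it.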
|
config/prompts.py
ADDED
|
@@ -0,0 +1,466 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Configuration des prompts pour chaque agent du système
|
| 2 |
+
|
| 3 |
+
# Agent Researcher - Prompt de base
|
| 4 |
+
RESEARCHER_PROMPT = """
|
| 5 |
+
Tu es un agent de recherche expert. Ta mission est de trouver des informations pertinentes sur internet
|
| 6 |
+
concernant le sujet suivant: {topic}.
|
| 7 |
+
|
| 8 |
+
Recherche des sources fiables et récentes. Analyse le sujet et décompose-le en sous-sujets pertinents
|
| 9 |
+
si nécessaire. Pour chaque source, récupère les informations suivantes:
|
| 10 |
+
- L'URL complète
|
| 11 |
+
- Le titre
|
| 12 |
+
- Un résumé court du contenu
|
| 13 |
+
- La date de publication (si disponible)
|
| 14 |
+
- L'auteur ou la source (si disponible)
|
| 15 |
+
|
| 16 |
+
Concentre-toi sur les informations factuelles et évite les sources d'opinion non fondée.
|
| 17 |
+
Retourne une liste structurée des meilleures sources que tu trouves.
|
| 18 |
+
"""
|
| 19 |
+
|
| 20 |
+
# Prompt pour l'extraction de mots-clés
|
| 21 |
+
KEYWORD_EXTRACTION_PROMPT = """
|
| 22 |
+
Tu es un expert en analyse sémantique. Analyse le sujet de recherche suivant et extrais 3-7 mots-clés pertinents qui amélioreront la recherche web.
|
| 23 |
+
|
| 24 |
+
Sujet: {topic}
|
| 25 |
+
|
| 26 |
+
Règles:
|
| 27 |
+
1. Extrais des mots-clés spécifiques et techniques liés au sujet
|
| 28 |
+
2. Évite les mots trop génériques (comme "analyse", "étude", "recherche")
|
| 29 |
+
3. Privilégie les synonymes et termes alternatifs qui enrichiront la recherche
|
| 30 |
+
4. Inclus des termes en français et leurs équivalents anglais si pertinents
|
| 31 |
+
5. Évite de répéter les mots déjà présents dans le sujet principal
|
| 32 |
+
|
| 33 |
+
Format de réponse: Retourne uniquement une liste de mots-clés séparés par des virgules, sans numérotation.
|
| 34 |
+
Exemple: intelligence artificielle, machine learning, automatisation, emploi, marché du travail
|
| 35 |
+
|
| 36 |
+
Mots-clés pour "{topic}":"""
|
| 37 |
+
|
| 38 |
+
# Agent Reader/Summarizer - Prompt de base
|
| 39 |
+
READER_PROMPT = """
|
| 40 |
+
Tu es un expert en analyse et synthèse de documents. Tu dois lire et résumer le contenu suivant:
|
| 41 |
+
|
| 42 |
+
{document_content}
|
| 43 |
+
|
| 44 |
+
Source: {source_url}
|
| 45 |
+
Titre: {title}
|
| 46 |
+
Date: {date}
|
| 47 |
+
Auteur: {author}
|
| 48 |
+
|
| 49 |
+
Crée un résumé structuré qui:
|
| 50 |
+
1. Identifie les points clés et arguments principaux (max 5)
|
| 51 |
+
2. Extrait les données et statistiques importantes
|
| 52 |
+
3. Note les méthodologies utilisées (si pertinent)
|
| 53 |
+
4. Identifie les limitations ou biais potentiels
|
| 54 |
+
5. Inclut les citations importantes (avec guillemets)
|
| 55 |
+
|
| 56 |
+
Format ton résumé de manière claire avec des sections et des puces pour faciliter la lecture.
|
| 57 |
+
Limite-toi à l'essentiel, le résumé ne doit pas dépasser 30% de la longueur du texte original.
|
| 58 |
+
"""
|
| 59 |
+
|
| 60 |
+
# Agent Writer/Reporter - Prompt de base
|
| 61 |
+
WRITER_PROMPT = """
|
| 62 |
+
Tu es un rédacteur expert. Ta mission est de créer un rapport de recherche structuré et professionnel
|
| 63 |
+
sur le sujet: {topic}.
|
| 64 |
+
|
| 65 |
+
Utilise les résumés de sources suivants pour rédiger ton rapport:
|
| 66 |
+
|
| 67 |
+
{source_summaries}
|
| 68 |
+
|
| 69 |
+
Ton rapport doit:
|
| 70 |
+
1. Commencer par une introduction claire qui présente le sujet et son importance
|
| 71 |
+
2. Organiser le contenu en sections logiques avec des titres et sous-titres
|
| 72 |
+
3. Synthétiser les informations de toutes les sources de manière cohérente
|
| 73 |
+
4. Présenter différentes perspectives sur le sujet quand elles existent
|
| 74 |
+
5. Inclure des citations directes importantes (avec guillemets et références)
|
| 75 |
+
6. Se terminer par une conclusion qui résume les points clés
|
| 76 |
+
7. Inclure une bibliographie complète des sources utilisées
|
| 77 |
+
|
| 78 |
+
Format du rapport: {format} (Markdown ou PDF)
|
| 79 |
+
Utilise un ton professionnel et objectif. Assure-toi que toutes les informations sont correctement citées.
|
| 80 |
+
"""
|
| 81 |
+
|
| 82 |
+
# Prompts pour l'agent Summarizer/Reader
|
| 83 |
+
SUMMARIZER_PROMPTS = {
|
| 84 |
+
"executive_summary": """
|
| 85 |
+
Tu es un expert en synthèse de documents. Crée un résumé exécutif concis et percutant du document suivant.
|
| 86 |
+
|
| 87 |
+
DOCUMENT:
|
| 88 |
+
Titre: {title}
|
| 89 |
+
Auteur: {author}
|
| 90 |
+
URL: {url}
|
| 91 |
+
|
| 92 |
+
CONTENU:
|
| 93 |
+
{content}
|
| 94 |
+
|
| 95 |
+
INSTRUCTIONS:
|
| 96 |
+
1. Rédige un résumé exécutif de 2-3 phrases maximum
|
| 97 |
+
2. Capture l'essence et les points les plus importants du document
|
| 98 |
+
3. Utilise un langage clair et professionnel
|
| 99 |
+
4. Évite les détails techniques superflus
|
| 100 |
+
5. Focus sur les conclusions et impacts principaux
|
| 101 |
+
|
| 102 |
+
RÉSUMÉ EXÉCUTIF:""",
|
| 103 |
+
|
| 104 |
+
"detailed_analysis": """
|
| 105 |
+
Tu es un analyste expert. Effectue une analyse détaillée du document suivant.
|
| 106 |
+
|
| 107 |
+
DOCUMENT:
|
| 108 |
+
Titre: {title}
|
| 109 |
+
Auteur: {author}
|
| 110 |
+
URL: {url}
|
| 111 |
+
|
| 112 |
+
CONTENU:
|
| 113 |
+
{content}
|
| 114 |
+
|
| 115 |
+
ANALYSE DEMANDÉE:
|
| 116 |
+
1. **RÉSUMÉ DÉTAILLÉ** (2-3 paragraphes): Synthèse approfondie du contenu
|
| 117 |
+
2. **POINTS CLÉS** (3-5 points): Arguments et idées principales (format: - Point clé)
|
| 118 |
+
3. **ARGUMENTS PRINCIPAUX**: Thèses soutenues par l'auteur
|
| 119 |
+
4. **DONNÉES ET STATISTIQUES**: Chiffres importants mentionnés
|
| 120 |
+
5. **MÉTHODOLOGIE**: Approche utilisée (si applicable)
|
| 121 |
+
6. **LIMITATIONS**: Biais ou limites identifiés
|
| 122 |
+
|
| 123 |
+
Structure ta réponse avec des sections claires et des listes à puces.
|
| 124 |
+
|
| 125 |
+
ANALYSE DÉTAILLÉE:""",
|
| 126 |
+
|
| 127 |
+
"sentiment_analysis": """
|
| 128 |
+
Tu es un expert en analyse de sentiment et crédibilité. Évalue le document suivant.
|
| 129 |
+
|
| 130 |
+
DOCUMENT:
|
| 131 |
+
Titre: {title}
|
| 132 |
+
Contenu: {content}
|
| 133 |
+
|
| 134 |
+
ÉVALUATION DEMANDÉE:
|
| 135 |
+
1. **SENTIMENT GÉNÉRAL**: Positif, Neutre, ou Négatif (justifie brièvement)
|
| 136 |
+
2. **CRÉDIBILITÉ**: Score sur 10 (justifie ton évaluation)
|
| 137 |
+
3. **BIAIS POTENTIELS**: Identifie les biais éventuels
|
| 138 |
+
4. **QUALITÉ DES SOURCES**: Évalue la fiabilité des références
|
| 139 |
+
|
| 140 |
+
Critères de crédibilité:
|
| 141 |
+
- Qualité des sources citées
|
| 142 |
+
- Objectivité du ton
|
| 143 |
+
- Présence de données factuelles
|
| 144 |
+
- Expertise apparente de l'auteur
|
| 145 |
+
- Cohérence argumentative
|
| 146 |
+
|
| 147 |
+
Format de réponse:
|
| 148 |
+
SENTIMENT: [Positif/Neutre/Négatif] - [Justification]
|
| 149 |
+
CRÉDIBILITÉ: [Score]/10 - [Justification]
|
| 150 |
+
BIAIS: [Description des biais identifiés]
|
| 151 |
+
|
| 152 |
+
ÉVALUATION:""",
|
| 153 |
+
|
| 154 |
+
"key_points_extraction": """
|
| 155 |
+
Tu es un expert en extraction d'informations clés. Identifie les points les plus importants du document.
|
| 156 |
+
|
| 157 |
+
DOCUMENT:
|
| 158 |
+
{content}
|
| 159 |
+
|
| 160 |
+
INSTRUCTIONS:
|
| 161 |
+
1. Extrais 3-7 points clés maximum
|
| 162 |
+
2. Chaque point doit être autonome et informatif
|
| 163 |
+
3. Priorise par ordre d'importance
|
| 164 |
+
4. Utilise des phrases courtes et claires
|
| 165 |
+
5. Évite la redondance
|
| 166 |
+
|
| 167 |
+
Format souhaité:
|
| 168 |
+
- Point clé 1 (le plus important)
|
| 169 |
+
- Point clé 2
|
| 170 |
+
- Point clé 3
|
| 171 |
+
etc.
|
| 172 |
+
|
| 173 |
+
POINTS CLÉS:""",
|
| 174 |
+
|
| 175 |
+
"citations_extraction": """
|
| 176 |
+
Tu es un expert en extraction de citations importantes. Identifie les citations les plus significatives du document.
|
| 177 |
+
|
| 178 |
+
DOCUMENT:
|
| 179 |
+
{content}
|
| 180 |
+
|
| 181 |
+
INSTRUCTIONS:
|
| 182 |
+
1. Extrais 2-5 citations maximum
|
| 183 |
+
2. Privilégie les citations d'experts ou d'autorités
|
| 184 |
+
3. Sélectionne les phrases les plus impactantes
|
| 185 |
+
4. Inclus le contexte si nécessaire
|
| 186 |
+
5. Évite les citations trop longues
|
| 187 |
+
|
| 188 |
+
Format souhaité:
|
| 189 |
+
"Citation exacte" - [Contexte/Auteur si mentionné]
|
| 190 |
+
|
| 191 |
+
CITATIONS IMPORTANTES:""",
|
| 192 |
+
|
| 193 |
+
"chunked_summary": """
|
| 194 |
+
Tu es un expert en synthèse de texte. Résume le chunk suivant du document.
|
| 195 |
+
|
| 196 |
+
CHUNK {chunk_index}/{total_chunks} du document \"{title}\" :
|
| 197 |
+
|
| 198 |
+
{chunk_content}
|
| 199 |
+
|
| 200 |
+
INSTRUCTIONS:
|
| 201 |
+
1. Résume ce chunk en 5-7 phrases claires et informatives
|
| 202 |
+
2. Garde uniquement les informations essentielles
|
| 203 |
+
3. Ne fais pas de répétition avec les autres chunks
|
| 204 |
+
4. Utilise un style neutre et professionnel
|
| 205 |
+
|
| 206 |
+
RÉSUMÉ DU CHUNK:
|
| 207 |
+
""",
|
| 208 |
+
|
| 209 |
+
"synthesis": """
|
| 210 |
+
Tu es un expert en synthèse documentaire. Crée un résumé unifié à partir des analyses partielles suivantes.
|
| 211 |
+
|
| 212 |
+
ANALYSES PARTIELLES:
|
| 213 |
+
{partial_summaries}
|
| 214 |
+
|
| 215 |
+
DOCUMENT ORIGINAL:
|
| 216 |
+
Titre: {title}
|
| 217 |
+
URL: {url}
|
| 218 |
+
|
| 219 |
+
INSTRUCTIONS:
|
| 220 |
+
1. Synthétise toutes les analyses partielles en un résumé cohérent
|
| 221 |
+
2. Élimine les redondances
|
| 222 |
+
3. Préserve les informations essentielles
|
| 223 |
+
4. Maintiens la logique et la continuité
|
| 224 |
+
5. Assure-toi que le résumé final est compréhensible de manière autonome
|
| 225 |
+
|
| 226 |
+
Structure attendue:
|
| 227 |
+
- Résumé exécutif (2-3 phrases)
|
| 228 |
+
- Analyse détaillée (2-3 paragraphes)
|
| 229 |
+
- Points clés principaux
|
| 230 |
+
- Sentiment et crédibilité globale
|
| 231 |
+
|
| 232 |
+
SYNTHÈSE FINALE:""",
|
| 233 |
+
|
| 234 |
+
"global_analysis": """
|
| 235 |
+
Tu es un expert en analyse comparative de documents. Analyse l'ensemble des résumés suivants pour identifier les patterns globaux.
|
| 236 |
+
|
| 237 |
+
RÉSUMÉS DE DOCUMENTS:
|
| 238 |
+
{all_summaries}
|
| 239 |
+
|
| 240 |
+
ANALYSE GLOBALE DEMANDÉE:
|
| 241 |
+
1. **THÈMES COMMUNS**: Sujets récurrents dans plusieurs documents
|
| 242 |
+
2. **POINTS DE CONSENSUS**: Idées sur lesquelles les sources s'accordent
|
| 243 |
+
3. **POINTS CONFLICTUELS**: Contradictions ou désaccords entre sources
|
| 244 |
+
4. **TENDANCES**: Évolutions ou patterns identifiés
|
| 245 |
+
5. **LACUNES**: Aspects peu couverts ou manquants
|
| 246 |
+
|
| 247 |
+
Format ta réponse avec des sections claires et des listes à puces.
|
| 248 |
+
Sois objectif et factuel dans ton analyse.
|
| 249 |
+
|
| 250 |
+
ANALYSE COMPARATIVE:"""
|
| 251 |
+
}
|
| 252 |
+
|
| 253 |
+
# Prompts pour l'agent Global Synthesizer
|
| 254 |
+
GLOBAL_SYNTHESIZER_PROMPTS = {
|
| 255 |
+
"final_synthesis": """
|
| 256 |
+
Tu es un expert en synthèse de recherche et rédaction de rapports. Crée un rapport final complet basé sur les résumés de documents suivants.
|
| 257 |
+
|
| 258 |
+
SUJET DE RECHERCHE: {topic}
|
| 259 |
+
|
| 260 |
+
RÉSUMÉS DE DOCUMENTS ANALYSÉS:
|
| 261 |
+
{document_summaries}
|
| 262 |
+
|
| 263 |
+
ANALYSE GLOBALE EXISTANTE:
|
| 264 |
+
- Thèmes communs: {common_themes}
|
| 265 |
+
- Points de consensus: {consensus_points}
|
| 266 |
+
- Points conflictuels: {conflicting_views}
|
| 267 |
+
|
| 268 |
+
INSTRUCTIONS POUR LE RAPPORT FINAL:
|
| 269 |
+
|
| 270 |
+
1. **INTRODUCTION** (1-2 paragraphes):
|
| 271 |
+
- Présente le sujet et son importance
|
| 272 |
+
- Contextualise l'analyse menée
|
| 273 |
+
- Annonce la structure du rapport
|
| 274 |
+
|
| 275 |
+
2. **SYNTHÈSE EXÉCUTIVE** (3-5 points clés):
|
| 276 |
+
- Identifie les 3-5 conclusions principales
|
| 277 |
+
- Présente les insights les plus importants
|
| 278 |
+
- Formule des recommandations concrètes
|
| 279 |
+
|
| 280 |
+
3. **ANALYSE DÉTAILLÉE** (sections thématiques):
|
| 281 |
+
- Organise le contenu par thèmes principaux
|
| 282 |
+
- Synthétise les informations de manière cohérente
|
| 283 |
+
- Présente différentes perspectives quand elles existent
|
| 284 |
+
- Utilise des données et citations pertinentes
|
| 285 |
+
|
| 286 |
+
4. **TENDANCES ET IMPLICATIONS**:
|
| 287 |
+
- Identifie les tendances émergentes
|
| 288 |
+
- Analyse les implications futures
|
| 289 |
+
- Discute les défis et opportunités
|
| 290 |
+
|
| 291 |
+
5. **CONCLUSION**:
|
| 292 |
+
- Résume les points essentiels
|
| 293 |
+
- Propose des pistes d'action ou réflexion
|
| 294 |
+
|
| 295 |
+
STYLE ET FORMAT:
|
| 296 |
+
- Utilise un ton professionnel et objectif
|
| 297 |
+
- Structure claire avec titres et sous-titres
|
| 298 |
+
- Citations avec références aux sources
|
| 299 |
+
- Format Markdown avec mise en forme appropriée
|
| 300 |
+
|
| 301 |
+
RAPPORT FINAL:""",
|
| 302 |
+
|
| 303 |
+
"executive_summary": """
|
| 304 |
+
Tu es un expert en communication exécutive. Crée un résumé exécutif percutant basé sur les analyses suivantes.
|
| 305 |
+
|
| 306 |
+
SUJET: {topic}
|
| 307 |
+
|
| 308 |
+
DONNÉES D'ANALYSE:
|
| 309 |
+
{analysis_data}
|
| 310 |
+
|
| 311 |
+
INSTRUCTIONS:
|
| 312 |
+
1. **CONCLUSIONS PRINCIPALES** (3-5 points maximum):
|
| 313 |
+
- Identifie les découvertes les plus importantes
|
| 314 |
+
- Utilise des données concrètes quand disponibles
|
| 315 |
+
- Sois concis et impactant
|
| 316 |
+
|
| 317 |
+
2. **INSIGHTS CLÉS**:
|
| 318 |
+
- Révèle les patterns et tendances importantes
|
| 319 |
+
- Connecte les informations de différentes sources
|
| 320 |
+
- Identifie ce qui est nouveau ou surprenant
|
| 321 |
+
|
| 322 |
+
3. **RECOMMANDATIONS**:
|
| 323 |
+
- Propose 2-4 actions concrètes
|
| 324 |
+
- Base-toi sur l'analyse réalisée
|
| 325 |
+
- Sois pragmatique et réalisable
|
| 326 |
+
|
| 327 |
+
4. **SYNTHÈSE NARRATIVE** (2-3 paragraphes):
|
| 328 |
+
- Raconte l'histoire principale qui émerge des données
|
| 329 |
+
- Connecte logiquement les différents éléments
|
| 330 |
+
- Termine par l'implication la plus importante
|
| 331 |
+
|
| 332 |
+
Format: Structure claire avec sections distinctes.
|
| 333 |
+
Ton: Professionnel, confiant, basé sur les faits.
|
| 334 |
+
|
| 335 |
+
RÉSUMÉ EXÉCUTIF:""",
|
| 336 |
+
|
| 337 |
+
"thematic_analysis": """
|
| 338 |
+
Tu es un analyste expert. Organise et analyse les informations suivantes par thèmes cohérents.
|
| 339 |
+
|
| 340 |
+
SUJET: {topic}
|
| 341 |
+
RÉSUMÉS: {summaries}
|
| 342 |
+
|
| 343 |
+
INSTRUCTIONS:
|
| 344 |
+
1. **IDENTIFICATION DES THÈMES**:
|
| 345 |
+
- Identifie 3-6 thèmes principaux qui émergent des résumés
|
| 346 |
+
- Chaque thème doit être substantiel et distinct
|
| 347 |
+
- Nomme chaque thème de manière claire et descriptive
|
| 348 |
+
|
| 349 |
+
2. **ANALYSE THÉMATIQUE**:
|
| 350 |
+
Pour chaque thème identifié:
|
| 351 |
+
- Synthétise les informations pertinentes de toutes les sources
|
| 352 |
+
- Identifie les points de convergence et divergence
|
| 353 |
+
- Présente les données et exemples les plus significatifs
|
| 354 |
+
- Note les implications et enjeux associés
|
| 355 |
+
|
| 356 |
+
3. **HIÉRARCHISATION**:
|
| 357 |
+
- Classe les thèmes par ordre d'importance/impact
|
| 358 |
+
- Explique brièvement pourquoi chaque thème est important
|
| 359 |
+
- Identifie les liens entre les différents thèmes
|
| 360 |
+
|
| 361 |
+
FORMAT:
|
| 362 |
+
```
|
| 363 |
+
## THÈME 1: [Nom du thème]
|
| 364 |
+
### Synthèse
|
| 365 |
+
[Analyse détaillée]
|
| 366 |
+
### Points clés
|
| 367 |
+
- Point 1
|
| 368 |
+
- Point 2
|
| 369 |
+
### Implications
|
| 370 |
+
[Discussion]
|
| 371 |
+
|
| 372 |
+
## THÈME 2: [Nom du thème]
|
| 373 |
+
[etc.]
|
| 374 |
+
```
|
| 375 |
+
|
| 376 |
+
ANALYSE THÉMATIQUE:""",
|
| 377 |
+
|
| 378 |
+
"methodology_description": """
|
| 379 |
+
Tu es un méthodologue expert. Décris la méthodologie utilisée pour cette recherche de manière claire et professionnelle.
|
| 380 |
+
|
| 381 |
+
PARAMÈTRES DE RECHERCHE:
|
| 382 |
+
- Sujet original: {topic}
|
| 383 |
+
- Nombre de sources analysées: {sources_count}
|
| 384 |
+
- Méthodes d'extraction: {extraction_methods}
|
| 385 |
+
- Critères de sélection: {selection_criteria}
|
| 386 |
+
|
| 387 |
+
PROCESSUS D'ANALYSE:
|
| 388 |
+
{analysis_process}
|
| 389 |
+
|
| 390 |
+
INSTRUCTIONS:
|
| 391 |
+
1. **APPROCHE DE RECHERCHE**:
|
| 392 |
+
- Décris la stratégie de recherche adoptée
|
| 393 |
+
- Explique les critères de sélection des sources
|
| 394 |
+
- Justifie les choix méthodologiques
|
| 395 |
+
|
| 396 |
+
2. **MÉTHODES D'ANALYSE**:
|
| 397 |
+
- Détaille les techniques d'analyse utilisées
|
| 398 |
+
- Explique le processus de synthèse
|
| 399 |
+
- Décris l'approche d'évaluation de la crédibilité
|
| 400 |
+
|
| 401 |
+
3. **LIMITATIONS**:
|
| 402 |
+
- Identifie les limites de la méthodologie
|
| 403 |
+
- Reconnaît les biais potentiels
|
| 404 |
+
- Suggère des améliorations possibles
|
| 405 |
+
|
| 406 |
+
4. **QUALITÉ DES DONNÉES**:
|
| 407 |
+
- Évalue la qualité globale des sources
|
| 408 |
+
- Discute la représentativité de l'échantillon
|
| 409 |
+
- Commente la fiabilité des conclusions
|
| 410 |
+
|
| 411 |
+
Style: Académique mais accessible, précis et honnête.
|
| 412 |
+
|
| 413 |
+
DESCRIPTION MÉTHODOLOGIQUE:""",
|
| 414 |
+
|
| 415 |
+
"quality_assessment": """
|
| 416 |
+
Tu es un expert en évaluation de la qualité de recherche. Évalue la qualité et la fiabilité de cette analyse.
|
| 417 |
+
|
| 418 |
+
DONNÉES D'ÉVALUATION:
|
| 419 |
+
- Résumés analysés: {summaries_count}
|
| 420 |
+
- Sources utilisées: {sources_info}
|
| 421 |
+
- Scores de crédibilité: {credibility_scores}
|
| 422 |
+
- Couverture thématique: {thematic_coverage}
|
| 423 |
+
|
| 424 |
+
CRITÈRES D'ÉVALUATION:
|
| 425 |
+
1. **COMPLÉTUDE**: L'analyse couvre-t-elle tous les aspects importants du sujet?
|
| 426 |
+
2. **FIABILITÉ**: Les sources sont-elles crédibles et diversifiées?
|
| 427 |
+
3. **COHÉRENCE**: Les conclusions sont-elles logiques et bien étayées?
|
| 428 |
+
4. **OBJECTIVITÉ**: L'analyse évite-t-elle les biais évidents?
|
| 429 |
+
5. **ACTUALITÉ**: Les informations sont-elles récentes et pertinentes?
|
| 430 |
+
|
| 431 |
+
INSTRUCTIONS:
|
| 432 |
+
- Attribue un score de 0 à 1 pour chaque critère
|
| 433 |
+
- Justifie chaque score avec des éléments concrets
|
| 434 |
+
- Identifie les points forts et les points faibles
|
| 435 |
+
- Calcule un score de confiance global
|
| 436 |
+
- Propose des recommandations d'amélioration
|
| 437 |
+
|
| 438 |
+
Format:
|
| 439 |
+
```
|
| 440 |
+
## ÉVALUATION DE QUALITÉ
|
| 441 |
+
|
| 442 |
+
### Complétude: X.X/1.0
|
| 443 |
+
[Justification]
|
| 444 |
+
|
| 445 |
+
### Fiabilité: X.X/1.0
|
| 446 |
+
[Justification]
|
| 447 |
+
|
| 448 |
+
[etc.]
|
| 449 |
+
|
| 450 |
+
### SCORE GLOBAL: X.X/1.0
|
| 451 |
+
### RECOMMANDATIONS:
|
| 452 |
+
- [Recommandation 1]
|
| 453 |
+
- [Recommandation 2]
|
| 454 |
+
```
|
| 455 |
+
|
| 456 |
+
ÉVALUATION QUALITÉ:"""
|
| 457 |
+
}
|
| 458 |
+
|
| 459 |
+
# Prompts système pour définir le comportement général des agents
|
| 460 |
+
SYSTEM_PROMPTS = {
|
| 461 |
+
"researcher": "Tu es un agent de recherche IA spécialisé dans la recherche d'information pertinente et fiable.",
|
| 462 |
+
"reader": "Tu es un agent d'analyse IA spécialisé dans la lecture et la synthèse de documents complexes.",
|
| 463 |
+
"writer": "Tu es un agent rédacteur IA spécialisé dans la création de rapports de recherche structurés et professionnels.",
|
| 464 |
+
"summarizer": "Tu es un agent d'analyse IA expert en synthèse de documents, extraction de points clés et évaluation de crédibilité.",
|
| 465 |
+
"global_synthesizer": "Tu es un expert en synthèse de recherche et rédaction de rapports finaux. Tu excelles dans la création de documents structurés, professionnels et basés sur des analyses multiples."
|
| 466 |
+
}
|
config/settings.py
ADDED
|
@@ -0,0 +1,127 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Configuration du projet AI Research Assistant.
|
| 3 |
+
Ce fichier contient les configurations par défaut qui peuvent être surchargées
|
| 4 |
+
par les variables d'environnement.
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
from typing import Dict, Optional, List

from pydantic_settings import BaseSettings, SettingsConfigDict
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
class APIConfig(BaseSettings):
    """API keys and the parameters that go with them.

    Values come from the environment or the ``.env`` file and fall back to
    the defaults below; unknown variables are ignored.
    """

    # pydantic v2 style configuration (replaces the deprecated nested
    # ``class Config``; same keys, same values).
    model_config = SettingsConfigDict(
        env_file=".env",
        env_file_encoding="utf-8",
        extra="ignore",
    )

    # LLM API key. Marked as required by the original author, but not
    # enforced here: it defaults to an empty string.
    GROQ_API_KEY: str = ""

    # Search API keys. The original author notes at least one is required;
    # not enforced here: all default to an empty string.
    SERPER_API_KEY: str = ""
    TAVILY_API_KEY: str = ""
    BRAVE_API_KEY: str = ""

    # Model settings
    LLM_MODEL: str = "llama-3.1-8b-instant"
    LLM_TEMPERATURE: float = 0.1
    LLM_MAX_TOKENS: int = 4000
    EMBEDDING_MODEL: str = "all-MiniLM-L6-v2"

    # Search limits
    MAX_SOURCES: int = 20
    MAX_SUMMARY_LENGTH: int = 500
    SEARCH_TIMEOUT: int = 30  # presumably seconds — TODO confirm

    # Performance and safety
    API_RATE_LIMIT: int = 100
    MAX_CONCURRENT_REQUESTS: int = 10
|
| 40 |
+
|
| 41 |
+
|
| 42 |
+
class DatabaseConfig(BaseSettings):
    """Database settings.

    Values come from the environment or the ``.env`` file and fall back to
    the defaults below; unknown variables are ignored.
    """

    # pydantic v2 style configuration (replaces the deprecated nested
    # ``class Config``; same keys, same values).
    model_config = SettingsConfigDict(
        env_file=".env",
        env_file_encoding="utf-8",
        extra="ignore",
    )

    DATABASE_URL: str = "sqlite:///data/research.db"
    CHROMA_PERSIST_DIRECTORY: str = "data/chroma"
    CHROMA_COLLECTION_NAME: str = "research_documents"
|
| 52 |
+
|
| 53 |
+
|
| 54 |
+
class PathConfig(BaseSettings):
    """Paths and directories used by the project.

    Values come from the environment or the ``.env`` file and fall back to
    the defaults below; unknown variables are ignored.
    """

    # pydantic v2 style configuration (replaces the deprecated nested
    # ``class Config``; same keys, same values).
    model_config = SettingsConfigDict(
        env_file=".env",
        env_file_encoding="utf-8",
        extra="ignore",
    )

    DATA_DIR: str = "data"
    REPORTS_DIR: str = "data/reports"
    CACHE_DIR: str = "data/cache"
    LOGS_DIR: str = "logs"
|
| 65 |
+
|
| 66 |
+
|
| 67 |
+
class FeatureConfig(BaseSettings):
    """Feature toggles.

    Values come from the environment or the ``.env`` file and fall back to
    the defaults below; unknown variables are ignored.
    """

    # pydantic v2 style configuration (replaces the deprecated nested
    # ``class Config``; same keys, same values).
    model_config = SettingsConfigDict(
        env_file=".env",
        env_file_encoding="utf-8",
        extra="ignore",
    )

    ENABLE_CACHING: bool = True
    ENABLE_VECTOR_STORE: bool = True
    ENABLE_RATE_LIMITING: bool = True
    CACHE_TTL: int = 3600  # presumably seconds — TODO confirm
|
| 78 |
+
|
| 79 |
+
|
| 80 |
+
class LoggingConfig(BaseSettings):
    """Logging settings.

    Values come from the environment or the ``.env`` file and fall back to
    the defaults below; unknown variables are ignored.
    """

    # pydantic v2 style configuration (replaces the deprecated nested
    # ``class Config``; same keys, same values).
    model_config = SettingsConfigDict(
        env_file=".env",
        env_file_encoding="utf-8",
        extra="ignore",
    )

    LOG_LEVEL: str = "INFO"
    ENABLE_FILE_LOGGING: bool = True
|
| 89 |
+
|
| 90 |
+
|
| 91 |
+
class ExportConfig(BaseSettings):
    """Export and report settings.

    Values come from the environment or the ``.env`` file and fall back to
    the defaults below; unknown variables are ignored.
    """

    # pydantic v2 style configuration (replaces the deprecated nested
    # ``class Config``; same keys, same values).
    model_config = SettingsConfigDict(
        env_file=".env",
        env_file_encoding="utf-8",
        extra="ignore",
    )

    DEFAULT_EXPORT_FORMAT: str = "markdown"
    PDF_PAGE_SIZE: str = "A4"
    INCLUDE_CITATIONS: bool = True
|
| 101 |
+
|
| 102 |
+
|
| 103 |
+
class DevelopmentConfig(BaseSettings):
    """Development-time settings (debug switches and worker count)."""

    DEBUG: bool = False
    DEVELOPMENT_MODE: bool = False
    WORKER_THREADS: int = 4

    class Config:
        # Values are read from .env (UTF-8); unknown keys in that file are
        # ignored rather than rejected.
        # NOTE(review): with pydantic v2 the inner Config class is the legacy
        # spelling of model_config - confirm before migrating.
        extra = "ignore"
        env_file = ".env"
        env_file_encoding = "utf-8"
|
| 113 |
+
|
| 114 |
+
|
| 115 |
+
# Instantiate every configuration section once, at import time.
try:
    api_config = APIConfig()
    database_config = DatabaseConfig()
    path_config = PathConfig()
    feature_config = FeatureConfig()
    logging_config = LoggingConfig()
    export_config = ExportConfig()
    development_config = DevelopmentConfig()
except Exception as e:
    print(f"Erreur lors du chargement de la configuration: {e}")
    # Fallback on error: bind *every* section name to None. Previously only
    # api_config was reset, so any section that had not been built yet stayed
    # undefined and `from config.settings import <section>` failed at import.
    api_config = None
    database_config = None
    path_config = None
    feature_config = None
    logging_config = None
    export_config = None
    development_config = None
|
langgraphe_app.py
ADDED
|
@@ -0,0 +1,219 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from langchain_groq import ChatGroq
|
| 2 |
+
from langchain_core.messages import HumanMessage, AIMessage, SystemMessage
|
| 3 |
+
from langgraph.graph import StateGraph, END
|
| 4 |
+
from typing import TypedDict, Sequence, Annotated, Union
|
| 5 |
+
from langchain_core.messages import BaseMessage
|
| 6 |
+
from dotenv import load_dotenv
|
| 7 |
+
from langchain_core.tools import tool
|
| 8 |
+
import os
|
| 9 |
+
from langgraph.graph.message import add_messages
|
| 10 |
+
from langgraph.prebuilt import ToolNode
|
| 11 |
+
import asyncio
|
| 12 |
+
|
| 13 |
+
from src.agents.researcher_agent import ResearcherAgent
|
| 14 |
+
from src.agents.content_extractor_agent import ContentExtractorAgent
|
| 15 |
+
from src.agents.summarizer_agent import SummarizerAgent
|
| 16 |
+
from src.agents.global_synthesizer_agent import GlobalSynthesizerAgent
|
| 17 |
+
from src.models.research_models import ResearchQuery
|
| 18 |
+
|
| 19 |
+
# ============================================================================
# EXISTING AGENTS (do not modify)
# ============================================================================
# One shared instance per pipeline stage, created when this module is imported
# and reused by every call to the research tool below.
researcher_agent = ResearcherAgent()
content_extractor_agent = ContentExtractorAgent()
summarizer_agent = SummarizerAgent()
global_synthesizer_agent = GlobalSynthesizerAgent()
|
| 26 |
+
|
| 27 |
+
# ============================================================================
|
| 28 |
+
# OUTIL QUI ENCAPSULE VOTRE PIPELINE COMPLET
|
| 29 |
+
# ============================================================================
|
| 30 |
+
@tool
def research_complete_pipeline(topic: str, max_results: Union[int, str] = 2) -> str:
    """Exécute un pipeline de recherche complet sur un sujet donné.

    Ce tool encapsule 4 agents qui travaillent ensemble :
    1. ResearcherAgent : recherche web et extraction de mots-clés
    2. ContentExtractorAgent : extraction du contenu des pages
    3. SummarizerAgent : création de résumés détaillés
    4. GlobalSynthesizerAgent : synthèse globale finale

    Args:
        topic: Le sujet de recherche (ex: "impact de l'IA sur l'emploi")
        max_results: Nombre de sources à analyser (2-10, défaut: 2)

    Returns:
        Un rapport complet au format texte avec résumé exécutif et analyse détaillée
    """
    # NOTE: the docstring above is deliberately left in French and untouched:
    # LangChain's @tool uses the docstring as the tool description shown to
    # the LLM, so it is runtime text rather than documentation.

    # Normalise max_results. The LLM may send an int, a numeric string, a
    # float or even null; anything that cannot be converted falls back to 2.
    # The value is then clamped to the supported 2..10 range.
    try:
        max_results = int(max_results)
    except (TypeError, ValueError, OverflowError):
        max_results = 2
    max_results = max(2, min(max_results, 10))

    async def run_pipeline():
        # Runs the four stages in order; each stage consumes the previous
        # stage's output object.
        print(f"\n{'='*60}")
        print(f"🚀 DÉMARRAGE DU PIPELINE DE RECHERCHE")
        print(f"📋 Sujet: {topic}")
        print(f"📊 Sources à analyser: {max_results}")
        print(f"{'='*60}\n")

        # STEP 1: web search (keywords are extracted by the LLM first)
        print("🔍 [1/4] Recherche web en cours...")
        query = ResearchQuery(
            topic=topic,
            keywords=await researcher_agent.extract_keywords_with_llm(topic),
            max_results=max_results,
            search_depth="basic"
        )
        research_data = await researcher_agent.process(query)
        print(f"✅ Trouvé {research_data.total_found} sources")

        # STEP 2: content extraction
        print("\n📄 [2/4] Extraction du contenu...")
        extraction_data = await content_extractor_agent.process_from_research_output(
            research_output=research_data
        )
        print(f"✅ Extrait {extraction_data.successful_extractions} documents")

        # STEP 3: per-document summaries
        print("\n📝 [3/4] Création des résumés...")
        summarization_data = await summarizer_agent.process_from_extraction_result(
            extraction_result=extraction_data
        )
        print(f"✅ Généré {summarization_data.total_documents} résumés")

        # STEP 4: global synthesis
        print("\n🎯 [4/4] Synthèse globale...")
        global_synthesis = await global_synthesizer_agent.process_from_summarization_output(
            summarization_output=summarization_data
        )
        print(f"✅ Rapport final généré ({global_synthesis.final_report.word_count} mots)")

        print(f"\n{'='*60}")
        print("✨ PIPELINE TERMINÉ AVEC SUCCÈS")
        print(f"{'='*60}\n")

        # Prefer the markdown rendering, then plain text, and only stringify
        # the whole synthesis object as a last resort (the fallback is now
        # computed lazily instead of on every call).
        outputs = global_synthesis.formatted_outputs
        report = outputs.get('markdown')
        if report is None:
            report = outputs.get('text')
        if report is None:
            report = str(global_synthesis)
        return report

    # The tool itself is synchronous, so the coroutine is driven to completion
    # on a fresh event loop.
    return asyncio.run(run_pipeline())
|
| 105 |
+
|
| 106 |
+
# ============================================================================
|
| 107 |
+
# CONFIGURATION DU LLM ET DU GRAPHE
|
| 108 |
+
# ============================================================================
|
| 109 |
+
|
| 110 |
+
# État du graphe
|
| 111 |
+
class AgentState(TypedDict):
    """State carried through the graph: the running list of chat messages."""

    # add_messages is attached as the reducer for this key (per LangGraph's
    # documented behaviour it merges node output into the existing list
    # instead of replacing it).
    messages: Annotated[Sequence[BaseMessage], add_messages]
|
| 113 |
+
|
| 114 |
+
# Load environment variables (including a local .env file) and fail fast when
# the Groq API key is missing.
load_dotenv()
api_key = os.getenv("GROQ_API_KEY")
if not api_key:
    raise ValueError("GROQ_API_KEY non définie dans .env")

# Chat model with the research tool bound to it.
tools = [research_complete_pipeline]
model = ChatGroq(
    model="llama-3.1-8b-instant",
    temperature=0.3,  # kept low for more consistent answers
    max_tokens=2048 * 2,
    api_key=api_key,
).bind_tools(tools)
|
| 128 |
+
|
| 129 |
+
# ============================================================================
|
| 130 |
+
# NŒUDS DU GRAPHE
|
| 131 |
+
# ============================================================================
|
| 132 |
+
|
| 133 |
+
def model_call(state: AgentState) -> AgentState:
    """Graph node: ask the LLM what to do next.

    The system prompt (runtime text, kept in French) is prepended to the
    conversation held in ``state["messages"]`` and the tool-bound model is
    invoked once.

    Args:
        state: Current graph state; only ``state["messages"]`` is read.

    Returns:
        A partial state update containing the model's reply as the only
        element of ``messages``.
    """
    # The prompt lines are intentionally flush-left: everything between the
    # triple quotes is sent to the model verbatim.
    system_prompt = SystemMessage(content="""Tu es un assistant de recherche intelligent.

🎯 TON RÔLE:
Tu aides les utilisateurs à obtenir des résumés et analyses sur n'importe quel sujet.

🔧 TON OUTIL:
Tu as accès à un outil puissant appelé 'research_complete_pipeline' qui :
- Effectue des recherches web automatiques
- Extrait et analyse le contenu
- Génère des résumés détaillés
- Produit une synthèse globale complète

📋 QUAND L'UTILISER:
Utilise cet outil quand l'utilisateur demande :
- Un résumé sur un sujet
- Des informations sur un topic
- Une analyse d'un domaine
- Une recherche documentée

💡 COMMENT L'UTILISER:
- Identifie le sujet principal de la demande
- Appelle research_complete_pipeline avec le sujet en français clair
- Utilise max_results=2 pour une recherche standard

✅ EXEMPLES:
User: "Résume l'impact de l'IA sur l'emploi"
→ Appelle: research_complete_pipeline(topic="impact de l'intelligence artificielle sur le marché de l'emploi", max_results=2)
User: "Fais-moi une analyse complète sur le changement climatique"
→ Appelle: research_complete_pipeline(topic="changement climatique", max_results=3)

⚠️ IMPORTANT:
- N'essaie PAS de faire la recherche toi-même
- Utilise TOUJOURS l'outil pour les demandes de recherche
- Le résultat de l'outil est déjà un rapport complet formaté
- Tu peux présenter le résultat directement à l'utilisateur
"""

    )

    conversation = [system_prompt] + state["messages"]
    reply = model.invoke(conversation)
    return {"messages": [reply]}
|
| 178 |
+
|
| 179 |
+
def should_continue(state: AgentState) -> str:
    """Route after the LLM node: run tools or stop.

    Args:
        state: Current graph state; the last entry of ``state["messages"]``
            is inspected.

    Returns:
        ``"continue"`` when the last message carries a non-empty
        ``tool_calls`` attribute, otherwise ``"end"``.
    """
    latest = state["messages"][-1]
    # A missing attribute and an empty list are treated the same way.
    pending_calls = getattr(latest, 'tool_calls', None)
    return "continue" if pending_calls else "end"
|
| 189 |
+
|
| 190 |
+
# ============================================================================
|
| 191 |
+
# CONSTRUCTION DU GRAPHE LANGGRAPH
|
| 192 |
+
# ============================================================================
|
| 193 |
+
|
| 194 |
+
# Build the graph over the shared AgentState.
graph = StateGraph(AgentState)

# Nodes: the LLM decision step and the tool executor.
graph.add_node("llm", model_call)
tool_node = ToolNode(tools=tools)
graph.add_node("tools", tool_node)

# Execution starts at the LLM node.
graph.set_entry_point("llm")

# After the LLM node, should_continue picks the next hop: run the tools when
# it returns "continue", stop when it returns "end".
graph.add_conditional_edges(
    "llm",
    should_continue,
    {
        "continue": "tools",
        "end": END,
    },
)

# Once the tools have run, go back to the LLM so it can present the results.
graph.add_edge("tools", "llm")

# Compile the graph into the runnable application object.
app = graph.compile()
|
requirements.txt
ADDED
|
@@ -0,0 +1,36 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# AI Research Assistant - Configuration requise
|
| 2 |
+
|
| 3 |
+
# Python
|
| 4 |
+
# python>=3.10
|
| 5 |
+
|
| 6 |
+
# LangGraph
|
| 7 |
+
langgraph>=0.0.20
|
| 8 |
+
|
| 9 |
+
# LangChain
|
| 10 |
+
langchain>=0.0.310
|
| 11 |
+
langchain-core>=0.1.4
|
| 12 |
+
langchain-groq>=0.1.0
|
| 13 |
+
langchain-text-splitters>=0.0.1
|
| 14 |
+
|
| 15 |
+
# ChromaDB
|
| 16 |
+
# chromadb>=0.4.18
|
| 17 |
+
|
| 18 |
+
# API Clients
|
| 19 |
+
tavily-python>=0.2.6
|
| 20 |
+
serper-python>=0.1.3
|
| 21 |
+
|
| 22 |
+
# Utilitaires
|
| 23 |
+
python-dotenv>=1.0.0
|
| 24 |
+
requests>=2.31.0
|
| 25 |
+
aiohttp>=3.8.0
|
| 26 |
+
pydantic>=2.5.0
|
| 27 |
+
pydantic-settings>=2.0.0
|
| 28 |
+
markdown>=3.5.1
|
| 29 |
+
fpdf2>=2.7.5
|
| 30 |
+
|
| 31 |
+
# Streamlit (optionnel, pour l'interface utilisateur)
|
| 32 |
+
# streamlit>=1.28.0
|
| 33 |
+
|
| 34 |
+
# Tests
|
| 35 |
+
pytest>=7.0.0
|
| 36 |
+
pytest-asyncio>=0.21.0
|
src/__init__.py
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Fichier d'initialisation du package principal.
|
| 3 |
+
"""
|
| 4 |
+
|
| 5 |
+
# Vide pour l'instant, sera complété plus tard avec les imports et exports nécessaires
|
| 6 |
+
|
| 7 |
+
from src.utils import *
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
__all__ = [
|
| 11 |
+
"memory_system",
|
| 12 |
+
"tools_with_memory",
|
| 13 |
+
"create_enhanced_model_call",
|
| 14 |
+
]
|
src/agents/__init__.py
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Package des agents du système multi-agents.
|
| 3 |
+
"""
|
| 4 |
+
|
| 5 |
+
from .base_agent import BaseAgent, AgentError, AgentTimeoutError, AgentValidationError
|
| 6 |
+
from .researcher_agent import ResearcherAgent
|
| 7 |
+
|
| 8 |
+
__all__ = [
|
| 9 |
+
"BaseAgent",
|
| 10 |
+
"AgentError",
|
| 11 |
+
"AgentTimeoutError",
|
| 12 |
+
"AgentValidationError",
|
| 13 |
+
"ResearcherAgent"
|
| 14 |
+
]
|
src/agents/base_agent.py
ADDED
|
@@ -0,0 +1,235 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Classe de base pour tous les agents du système.
|
| 3 |
+
Définit l'interface commune et les fonctionnalités partagées.
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
from abc import ABC, abstractmethod
|
| 7 |
+
from typing import Any, Dict, Optional, TypeVar, Generic
|
| 8 |
+
from datetime import datetime
|
| 9 |
+
import asyncio
|
| 10 |
+
import uuid
|
| 11 |
+
|
| 12 |
+
from src.core.logging import setup_logger
|
| 13 |
+
from src.models.state_models import AgentState, AgentStatus, AgentType
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
# Generic type parameters for an agent's input and output payloads.
InputType = TypeVar('InputType')
OutputType = TypeVar('OutputType')


class BaseAgent(ABC, Generic[InputType, OutputType]):
    """Abstract base class shared by every agent in the system.

    Provides the common plumbing:
    - execution state tracking (``self.state``)
    - a per-agent logger
    - error handling with retries, backoff and a timeout (``execute``)
    - simple performance metrics (``self.metrics``)

    Subclasses implement ``process`` and ``validate_input``.
    """

    def __init__(
        self,
        agent_type: AgentType,
        name: Optional[str] = None,
        max_retries: int = 3,
        timeout: float = 300.0  # five minutes by default
    ):
        """Initialise the base agent.

        Args:
            agent_type: Kind of agent; also used to derive the default name.
            name: Custom agent name; defaults to ``"<agent_type.value>_agent"``.
            max_retries: Number of retries after the first failed attempt.
            timeout: Per-attempt timeout, in seconds.
        """
        self.agent_type = agent_type
        self.name = name or f"{agent_type.value}_agent"
        self.agent_id = str(uuid.uuid4())

        # Configuration
        self.max_retries = max_retries
        self.timeout = timeout

        # Execution state
        self.state = AgentState(
            agent_type=agent_type,
            max_retries=max_retries
        )

        # Agent-specific logger
        self.logger = setup_logger(f"agent_{self.name}")

        # Metrics, updated by execute()
        self.metrics = {
            "total_executions": 0,
            "successful_executions": 0,
            "failed_executions": 0,
            "total_processing_time": 0.0,
            "average_processing_time": 0.0
        }

        self.logger.info(f"Agent {self.name} initialisé (ID: {self.agent_id})")

    @abstractmethod
    async def process(self, input_data: InputType) -> OutputType:
        """Run the agent's actual work; implemented by each concrete agent.

        Args:
            input_data: Agent-specific input.

        Returns:
            Agent-specific output.
        """
        pass

    @abstractmethod
    def validate_input(self, input_data: InputType) -> bool:
        """Check the input before any attempt is made.

        Args:
            input_data: Data to validate.

        Returns:
            True when the data is acceptable.
        """
        pass

    async def execute(self, input_data: InputType) -> OutputType:
        """Run ``process`` with validation, timeout and retries.

        Up to ``max_retries + 1`` attempts are made. Each attempt is bounded
        by ``self.timeout`` seconds; after a failed attempt (except the last)
        the method sleeps ``2 ** attempt`` seconds before trying again.

        Args:
            input_data: Input forwarded to ``validate_input`` and ``process``.

        Returns:
            The value returned by the first successful ``process`` call.

        Raises:
            ValueError: If ``validate_input`` rejects the input.
            AgentTimeoutError: If every attempt failed and the last failure
                was a timeout.
            AgentError: If every attempt failed for any other reason. Both
                are ``Exception`` subclasses and chain the last underlying
                error as ``__cause__``.
        """
        self.logger.info(f"Début d'exécution de l'agent {self.name}")
        self.state.start_execution()
        self.metrics["total_executions"] += 1

        # Input validation: invalid input is never retried.
        if not self.validate_input(input_data):
            error_msg = f"Données d'entrée invalides pour l'agent {self.name}"
            self.logger.error(error_msg)
            self.state.mark_error(error_msg)
            self.metrics["failed_executions"] += 1
            raise ValueError(error_msg)

        # Attempt loop
        last_exception = None

        for attempt in range(self.max_retries + 1):
            try:
                self.logger.info(f"Tentative {attempt + 1}/{self.max_retries + 1}")

                # Bounded execution of the concrete agent
                result = await asyncio.wait_for(
                    self.process(input_data),
                    timeout=self.timeout
                )

                # Success
                self.state.complete_execution()
                self.metrics["successful_executions"] += 1
                self._update_processing_time()

                self.logger.info(f"Agent {self.name} terminé avec succès")
                return result

            except asyncio.TimeoutError as e:
                error_msg = f"Timeout atteint pour l'agent {self.name} (>{self.timeout}s)"
                self.logger.warning(error_msg)
                last_exception = e
                self.state.retry_count += 1

            except Exception as e:
                error_msg = f"Erreur dans l'agent {self.name}: {str(e)}"
                self.logger.warning(error_msg)
                last_exception = e
                self.state.retry_count += 1

            # Exponential backoff before the next attempt: 1s, 2s, 4s, ...
            if attempt < self.max_retries:
                wait_time = 2 ** attempt
                self.logger.info(f"Attente de {wait_time}s avant la prochaine tentative")
                await asyncio.sleep(wait_time)

        # Every attempt failed
        final_error = f"Agent {self.name} a échoué après {self.max_retries + 1} tentatives"
        self.logger.error(final_error)
        self.state.mark_error(final_error)
        self.metrics["failed_executions"] += 1

        # Use the module's own exception hierarchy instead of a bare
        # Exception so callers can tell timeouts from other failures; the
        # original message is kept inside the new one.
        if isinstance(last_exception, asyncio.TimeoutError):
            error_cls = AgentTimeoutError
        else:
            error_cls = AgentError
        raise error_cls(final_error, self.name, self.agent_id) from last_exception

    def _update_processing_time(self):
        """Fold the last run's duration into the total and the average."""
        if self.state.duration:
            self.metrics["total_processing_time"] += self.state.duration
            self.metrics["average_processing_time"] = (
                self.metrics["total_processing_time"] /
                self.metrics["successful_executions"]
            )

    def get_status(self) -> Dict[str, Any]:
        """Return a snapshot of the agent's identity, state and metrics.

        Returns:
            Dictionary with the status information; ``last_execution`` is the
            ISO-formatted ``state.end_time`` or None when it is not set.
        """
        return {
            "agent_id": self.agent_id,
            "name": self.name,
            "type": self.agent_type.value,
            "status": self.state.status.value,
            "retry_count": self.state.retry_count,
            "duration": self.state.duration,
            "error_message": self.state.error_message,
            "metrics": self.metrics,
            "last_execution": self.state.end_time.isoformat() if self.state.end_time else None
        }

    def reset(self):
        """Replace the execution state with a fresh one (metrics are kept)."""
        self.state = AgentState(
            agent_type=self.agent_type,
            max_retries=self.max_retries
        )
        self.logger.info(f"Agent {self.name} remis à zéro")

    def __str__(self) -> str:
        return f"{self.__class__.__name__}(name={self.name}, status={self.state.status.value})"

    def __repr__(self) -> str:
        return (f"{self.__class__.__name__}(agent_id={self.agent_id}, "
                f"type={self.agent_type.value}, status={self.state.status.value})")


class AgentError(Exception):
    """Base exception for agent failures; records which agent failed."""

    def __init__(self, message: str, agent_name: str, agent_id: str):
        self.agent_name = agent_name
        self.agent_id = agent_id
        super().__init__(f"Agent {agent_name} ({agent_id}): {message}")


class AgentTimeoutError(AgentError):
    """Raised when an agent fails because of a timeout."""
    pass


class AgentValidationError(AgentError):
    """Raised for agent input-validation errors."""
    pass
|
src/agents/content_extractor_agent.py
ADDED
|
@@ -0,0 +1,626 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Agent Content Extractor - Extraction et nettoyage de contenu web.
|
| 3 |
+
Extrait le contenu de pages web, PDFs et autres documents.
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
import asyncio
|
| 7 |
+
from typing import List, Optional
|
| 8 |
+
from datetime import datetime
|
| 9 |
+
|
| 10 |
+
from src.agents.base_agent import BaseAgent
|
| 11 |
+
from src.models.document_models import Document, ExtractionInput, ExtractionResult
|
| 12 |
+
from src.models.research_models import ResearchOutput
|
| 13 |
+
from src.models.state_models import AgentState, AgentType
|
| 14 |
+
from src.services.content_extraction import ContentExtractionManager, ContentExtractionError
|
| 15 |
+
from src.core.logging import setup_logger
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
class ContentExtractorAgent(BaseAgent[ExtractionInput, ExtractionResult]):
|
| 19 |
+
"""
|
| 20 |
+
Agent responsable de l'extraction de contenu depuis des URLs.
|
| 21 |
+
|
| 22 |
+
Fonctionnalités:
|
| 23 |
+
- Extraction de contenu HTML avec nettoyage intelligent
|
| 24 |
+
- Support des PDFs et autres formats
|
| 25 |
+
- Traitement parallèle de plusieurs URLs
|
| 26 |
+
- Gestion des erreurs et retry automatique
|
| 27 |
+
- Structuration et nettoyage du contenu
|
| 28 |
+
"""
|
| 29 |
+
|
| 30 |
+
def __init__(self, max_concurrent_extractions: int = 5, max_retries: int = 2):
    """Set up the agent and its extraction manager.

    Args:
        max_concurrent_extractions: Passed to the extraction manager as
            ``max_concurrent``.
        max_retries: Retry budget, given both to the base agent and to the
            extraction manager.
    """
    base_settings = dict(
        agent_type=AgentType.CONTENT_EXTRACTOR,
        name="content_extractor",
        max_retries=max_retries,
        timeout=300.0,  # five minutes
    )
    super().__init__(**base_settings)

    self.extraction_manager = ContentExtractionManager(
        max_concurrent=max_concurrent_extractions,
        max_retries=max_retries,
    )
|
| 41 |
+
|
| 42 |
+
def validate_input(self, input_data: ExtractionInput) -> bool:
    """Check that the extraction input carries a usable set of URLs.

    Args:
        input_data: Input whose ``urls`` are to be extracted.

    Returns:
        False (after logging an error) when there are no URLs, more than 50
        URLs, or no URL survives ``_filter_valid_urls``; True otherwise.
    """
    urls = input_data.urls

    if not urls:
        self.logger.error("Aucune URL fournie pour l'extraction")
        return False

    # Reasonable upper bound on the size of a single batch.
    if len(urls) > 50:
        self.logger.error(f"Trop d'URLs ({len(urls)}), maximum 50")
        return False

    # At least one URL must pass the validity filter.
    if not self._filter_valid_urls(urls):
        self.logger.error("Aucune URL valide trouvée")
        return False

    return True
|
| 67 |
+
|
| 68 |
+
async def process_from_research_output(self, research_output: ResearchOutput) -> ExtractionResult:
    """Extract content for every URL found in a research result.

    Builds an ``ExtractionInput`` from the search results and the original
    query, then delegates to ``process``.

    Args:
        research_output: Search results whose URLs are to be extracted.

    Returns:
        The ``ExtractionResult`` produced by ``process``.
    """
    # URLs are converted to plain strings before being handed on.
    urls = [str(hit.url) for hit in research_output.results]

    self.logger.info(f"Extraction de contenu depuis ResearchOutput: {len(urls)} URLs")
    self.logger.info(f"Sujet de recherche: {research_output.query.topic}")

    query = research_output.query
    content_filters = {
        'min_content_length': 200,  # lower bound on content size
        'max_content_length': 50000,  # upper bound, to avoid overly long texts
        'required_keywords': query.keywords  # filter on the search keywords
    }
    extraction_options = {
        'source_query': query.topic,
        'search_keywords': query.keywords
    }
    extraction_input = ExtractionInput(
        urls=urls,
        content_filters=content_filters,
        extraction_options=extraction_options
    )

    # Hand over to the regular processing path.
    return await self.process(extraction_input)
|
| 100 |
+
|
| 101 |
+
async def process(self, input_data: ExtractionInput) -> ExtractionResult:
    """Run content extraction for the URLs carried by ``input_data``.

    Args:
        input_data: Extraction input holding the URLs, content filters and options.

    Returns:
        ExtractionResult with the retained documents, counters, the list of
        valid URLs that produced no document, timing and statistics.

    Raises:
        ContentExtractionError: If extraction or post-processing fails. The
            original exception is attached as ``__cause__``.
    """
    start_time = datetime.now()
    self.logger.info(f"Début extraction de contenu pour {len(input_data.urls)} URLs")

    # Re-filter here as well: this method can be reached without validate_input.
    valid_urls = self._filter_valid_urls(input_data.urls)
    self.logger.info(f"URLs valides à traiter: {len(valid_urls)}/{len(input_data.urls)}")

    try:
        documents = await self._extract_all_content(valid_urls, input_data)
        processed_documents = self._post_process_documents(documents, input_data)

        execution_time = (datetime.now() - start_time).total_seconds()

        # A valid URL counts as failed when no retained document carries it.
        successful_urls = {str(doc.url) for doc in processed_documents}
        failed_urls = [url for url in valid_urls if url not in successful_urls]

        result = ExtractionResult(
            documents=processed_documents,
            total_urls=len(input_data.urls),
            successful_extractions=len(processed_documents),
            # Counted against all submitted URLs, so rejected (invalid) URLs
            # are included here even though they are not listed in failed_urls.
            failed_extractions=len(input_data.urls) - len(processed_documents),
            failed_urls=failed_urls,
            execution_time=execution_time,
            extraction_stats=self._calculate_stats(processed_documents)
        )

        self.logger.info(
            f"Extraction terminée: {result.successful_extractions}/{result.total_urls} "
            f"succès en {execution_time:.2f}s"
        )

        return result

    except ContentExtractionError as e:
        # Already the domain error callers expect: do not wrap it a second time.
        self.logger.error(f"Erreur lors de l'extraction: {str(e)}")
        raise
    except Exception as e:
        self.logger.error(f"Erreur lors de l'extraction: {str(e)}")
        # Chain explicitly so the root cause stays visible in tracebacks.
        raise ContentExtractionError(f"Échec de l'extraction de contenu: {str(e)}") from e
|
| 157 |
+
|
| 158 |
+
def _filter_valid_urls(self, urls: List[str]) -> List[str]:
|
| 159 |
+
"""Filtre et valide les URLs."""
|
| 160 |
+
import re
|
| 161 |
+
from urllib.parse import urlparse
|
| 162 |
+
|
| 163 |
+
valid_urls = []
|
| 164 |
+
url_pattern = re.compile(
|
| 165 |
+
r'^https?://' # http:// ou https://
|
| 166 |
+
r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+[A-Z]{2,6}\.?|' # domain
|
| 167 |
+
r'localhost|' # localhost
|
| 168 |
+
r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})' # IP
|
| 169 |
+
r'(?::\d+)?' # port
|
| 170 |
+
r'(?:/?|[/?]\S+)$', re.IGNORECASE)
|
| 171 |
+
|
| 172 |
+
for url in urls:
|
| 173 |
+
if not url or not isinstance(url, str):
|
| 174 |
+
self.logger.warning(f"URL invalide ignorée: {url}")
|
| 175 |
+
continue
|
| 176 |
+
|
| 177 |
+
url = url.strip()
|
| 178 |
+
if not url:
|
| 179 |
+
continue
|
| 180 |
+
|
| 181 |
+
# Validation du format
|
| 182 |
+
if not url_pattern.match(url):
|
| 183 |
+
self.logger.warning(f"Format URL invalide: {url}")
|
| 184 |
+
continue
|
| 185 |
+
|
| 186 |
+
# Validation avec urlparse
|
| 187 |
+
try:
|
| 188 |
+
parsed = urlparse(url)
|
| 189 |
+
if not parsed.netloc:
|
| 190 |
+
self.logger.warning(f"URL sans domaine: {url}")
|
| 191 |
+
continue
|
| 192 |
+
|
| 193 |
+
valid_urls.append(url)
|
| 194 |
+
|
| 195 |
+
except Exception as e:
|
| 196 |
+
self.logger.warning(f"Erreur de parsing URL {url}: {e}")
|
| 197 |
+
continue
|
| 198 |
+
|
| 199 |
+
return valid_urls
|
| 200 |
+
|
| 201 |
+
async def _extract_all_content(self, urls: List[str], input_data: ExtractionInput) -> List[Document]:
    """Fetch documents for ``urls`` and apply the optional content filters.

    Args:
        urls: URLs handed to the extraction manager.
        input_data: Extraction input; its ``content_filters`` are applied when truthy.

    Returns:
        The extracted documents, filtered if filters were supplied.

    Raises:
        Exception: Any error from extraction or filtering is logged and re-raised unchanged.
    """
    try:
        extracted = await self.extraction_manager.extract_multiple(urls)

        # Without filters the raw extraction result is returned as-is.
        if not input_data.content_filters:
            return extracted

        return self._apply_content_filters(extracted, input_data.content_filters)

    except Exception as e:
        self.logger.error(f"Erreur lors de l'extraction multiple: {str(e)}")
        raise
|
| 216 |
+
|
| 217 |
+
def _apply_content_filters(self, documents: List[Document], filters: dict) -> List[Document]:
|
| 218 |
+
"""Applique les filtres de contenu aux documents."""
|
| 219 |
+
filtered_documents = []
|
| 220 |
+
|
| 221 |
+
for doc in documents:
|
| 222 |
+
# Filtrer par longueur minimale
|
| 223 |
+
min_length = filters.get('min_content_length', 100)
|
| 224 |
+
if len(doc.content) < min_length:
|
| 225 |
+
self.logger.debug(f"Document {doc.title} trop court: {len(doc.content)} caractères")
|
| 226 |
+
continue
|
| 227 |
+
|
| 228 |
+
# Filtrer par longueur maximale
|
| 229 |
+
max_length = filters.get('max_content_length', 100000)
|
| 230 |
+
if len(doc.content) > max_length:
|
| 231 |
+
self.logger.debug(f"Document {doc.title} trop long, troncature")
|
| 232 |
+
doc.content = doc.content[:max_length] + "... [Contenu tronqué]"
|
| 233 |
+
|
| 234 |
+
# Filtrer par langue si spécifiée
|
| 235 |
+
required_language = filters.get('language')
|
| 236 |
+
if required_language and doc.language != required_language:
|
| 237 |
+
self.logger.debug(f"Document {doc.title} ignoré: langue {doc.language}")
|
| 238 |
+
continue
|
| 239 |
+
|
| 240 |
+
# Filtrer par mots-clés si spécifiés
|
| 241 |
+
required_keywords = filters.get('required_keywords', [])
|
| 242 |
+
if required_keywords:
|
| 243 |
+
content_lower = doc.content.lower()
|
| 244 |
+
if not any(keyword.lower() in content_lower for keyword in required_keywords):
|
| 245 |
+
self.logger.debug(f"Document {doc.title} ignoré: mots-clés manquants")
|
| 246 |
+
continue
|
| 247 |
+
|
| 248 |
+
filtered_documents.append(doc)
|
| 249 |
+
|
| 250 |
+
self.logger.info(f"Filtres appliqués: {len(filtered_documents)}/{len(documents)} documents retenus")
|
| 251 |
+
return filtered_documents
|
| 252 |
+
|
| 253 |
+
def _post_process_documents(self, documents: List[Document], input_data: ExtractionInput) -> List[Document]:
|
| 254 |
+
"""Post-traitement des documents extraits."""
|
| 255 |
+
processed_docs = []
|
| 256 |
+
|
| 257 |
+
for doc in documents:
|
| 258 |
+
# Nettoyage supplémentaire du contenu
|
| 259 |
+
doc.content = self._clean_content(doc.content)
|
| 260 |
+
|
| 261 |
+
# Recalcul du nombre de mots après nettoyage
|
| 262 |
+
doc.word_count = len(doc.content.split())
|
| 263 |
+
|
| 264 |
+
# Validation finale
|
| 265 |
+
if self._is_valid_document(doc, input_data):
|
| 266 |
+
processed_docs.append(doc)
|
| 267 |
+
else:
|
| 268 |
+
self.logger.debug(f"Document {doc.title} rejeté lors de la validation finale")
|
| 269 |
+
|
| 270 |
+
return processed_docs
|
| 271 |
+
|
| 272 |
+
def _clean_content(self, content: str) -> str:
|
| 273 |
+
"""Nettoyage avancé du contenu."""
|
| 274 |
+
import re
|
| 275 |
+
|
| 276 |
+
if not content:
|
| 277 |
+
return ""
|
| 278 |
+
|
| 279 |
+
# Supprimer les caractères de contrôle
|
| 280 |
+
content = re.sub(r'[\x00-\x08\x0B\x0C\x0E-\x1F\x7F]', '', content)
|
| 281 |
+
|
| 282 |
+
# Normaliser les espaces
|
| 283 |
+
content = re.sub(r'[ \t]+', ' ', content)
|
| 284 |
+
|
| 285 |
+
# Normaliser les sauts de ligne
|
| 286 |
+
content = re.sub(r'\n\s*\n\s*\n+', '\n\n', content)
|
| 287 |
+
|
| 288 |
+
# Supprimer les espaces en début et fin de lignes
|
| 289 |
+
lines = content.split('\n')
|
| 290 |
+
lines = [line.strip() for line in lines]
|
| 291 |
+
content = '\n'.join(lines)
|
| 292 |
+
|
| 293 |
+
# Supprimer les lignes vides multiples
|
| 294 |
+
content = re.sub(r'\n{3,}', '\n\n', content)
|
| 295 |
+
|
| 296 |
+
return content.strip()
|
| 297 |
+
|
| 298 |
+
def _is_valid_document(self, doc: Document, input_data: ExtractionInput) -> bool:
|
| 299 |
+
"""Valide un document extrait."""
|
| 300 |
+
# Vérifications de base
|
| 301 |
+
if not doc.content or not doc.content.strip():
|
| 302 |
+
return False
|
| 303 |
+
|
| 304 |
+
if len(doc.content) < 50: # Contenu trop court
|
| 305 |
+
return False
|
| 306 |
+
|
| 307 |
+
# Vérification du ratio texte/contenu (détecter les pages avec peu de contenu)
|
| 308 |
+
if doc.word_count < 20:
|
| 309 |
+
return False
|
| 310 |
+
|
| 311 |
+
# Vérifications spécifiques aux options d'entrée
|
| 312 |
+
if hasattr(input_data, 'min_quality_score'):
|
| 313 |
+
quality_score = self._calculate_content_quality(doc)
|
| 314 |
+
if quality_score < input_data.min_quality_score:
|
| 315 |
+
return False
|
| 316 |
+
|
| 317 |
+
return True
|
| 318 |
+
|
| 319 |
+
def _calculate_content_quality(self, doc: Document) -> float:
|
| 320 |
+
"""Calcule un score de qualité pour le contenu (0-1)."""
|
| 321 |
+
score = 0.0
|
| 322 |
+
|
| 323 |
+
# Points pour la longueur
|
| 324 |
+
if doc.word_count > 100:
|
| 325 |
+
score += 0.3
|
| 326 |
+
elif doc.word_count > 50:
|
| 327 |
+
score += 0.1
|
| 328 |
+
|
| 329 |
+
# Points pour la structure
|
| 330 |
+
if doc.title and len(doc.title) > 10:
|
| 331 |
+
score += 0.2
|
| 332 |
+
|
| 333 |
+
if doc.author:
|
| 334 |
+
score += 0.1
|
| 335 |
+
|
| 336 |
+
if doc.published_date:
|
| 337 |
+
score += 0.1
|
| 338 |
+
|
| 339 |
+
# Points pour la richesse du contenu
|
| 340 |
+
content = doc.content.lower()
|
| 341 |
+
if any(marker in content for marker in ['conclusion', 'introduction', 'sommaire']):
|
| 342 |
+
score += 0.2
|
| 343 |
+
|
| 344 |
+
# Pénalité pour contenu répétitif
|
| 345 |
+
lines = doc.content.split('\n')
|
| 346 |
+
unique_lines = set(line.strip() for line in lines if line.strip())
|
| 347 |
+
if len(lines) > 0:
|
| 348 |
+
uniqueness_ratio = len(unique_lines) / len(lines)
|
| 349 |
+
if uniqueness_ratio < 0.5:
|
| 350 |
+
score -= 0.2
|
| 351 |
+
|
| 352 |
+
return max(0.0, min(1.0, score))
|
| 353 |
+
|
| 354 |
+
def _calculate_stats(self, documents: List[Document]) -> dict:
|
| 355 |
+
"""Calcule les statistiques d'extraction."""
|
| 356 |
+
if not documents:
|
| 357 |
+
return {
|
| 358 |
+
'total_words': 0,
|
| 359 |
+
'average_words_per_doc': 0,
|
| 360 |
+
'doc_types': {},
|
| 361 |
+
'languages': {},
|
| 362 |
+
'has_authors': 0,
|
| 363 |
+
'has_dates': 0
|
| 364 |
+
}
|
| 365 |
+
|
| 366 |
+
total_words = sum(doc.word_count for doc in documents)
|
| 367 |
+
|
| 368 |
+
# Compter les types de documents
|
| 369 |
+
doc_types = {}
|
| 370 |
+
for doc in documents:
|
| 371 |
+
doc_type = doc.doc_type.value if doc.doc_type else 'unknown'
|
| 372 |
+
doc_types[doc_type] = doc_types.get(doc_type, 0) + 1
|
| 373 |
+
|
| 374 |
+
# Compter les langues
|
| 375 |
+
languages = {}
|
| 376 |
+
for doc in documents:
|
| 377 |
+
lang = doc.language or 'unknown'
|
| 378 |
+
languages[lang] = languages.get(lang, 0) + 1
|
| 379 |
+
|
| 380 |
+
# Compter les métadonnées
|
| 381 |
+
has_authors = sum(1 for doc in documents if doc.author)
|
| 382 |
+
has_dates = sum(1 for doc in documents if doc.published_date)
|
| 383 |
+
|
| 384 |
+
return {
|
| 385 |
+
'total_words': total_words,
|
| 386 |
+
'average_words_per_doc': total_words // len(documents),
|
| 387 |
+
'doc_types': doc_types,
|
| 388 |
+
'languages': languages,
|
| 389 |
+
'has_authors': has_authors,
|
| 390 |
+
'has_dates': has_dates
|
| 391 |
+
}
|
| 392 |
+
|
| 393 |
+
|
| 394 |
+
# Fonction utilitaire pour les tests
|
| 395 |
+
async def extract_content_from_urls(urls: List[str], **options) -> List[Document]:
    """Extract documents from a list of URLs with a throwaway agent.

    Args:
        urls: URLs to extract.
        **options: Optional ``content_filters`` and ``extraction_options``
            dicts; each defaults to an empty dict. Other keys are ignored.

    Returns:
        The documents of the resulting ExtractionResult.
    """
    extractor = ContentExtractorAgent()

    request = ExtractionInput(
        urls=urls,
        content_filters=options.get('content_filters', {}),
        extraction_options=options.get('extraction_options', {})
    )

    # Goes through the agent's execute() entry point rather than process().
    outcome = await extractor.execute(request)
    return outcome.documents
|
| 416 |
+
|
| 417 |
+
|
| 418 |
+
# Fonction utilitaire pour l'intégration avec le Researcher
|
| 419 |
+
async def extract_from_search_results(search_results: List[dict]) -> List[Document]:
    """Extract documents for the URLs found in search results.

    Each result may be a dict with a ``'url'`` key or an object with a
    ``url`` attribute. URLs are converted to ``str`` because the extractor's
    URL filter rejects any non-string value (e.g. a URL object from a
    validated model). Results without a usable URL are skipped.

    Args:
        search_results: Search results carrying URLs.

    Returns:
        The extracted documents, or an empty list when no URL was found.
    """
    urls = []
    for result in search_results:
        if isinstance(result, dict) and 'url' in result:
            url = result['url']
        elif hasattr(result, 'url'):
            url = result.url
        else:
            continue

        # Skip empty values instead of forwarding "None" or "".
        if url:
            urls.append(str(url))

    if not urls:
        return []

    return await extract_content_from_urls(urls)
|
| 440 |
+
|
| 441 |
+
|
| 442 |
+
# Fonctions utilitaires pour la sauvegarde
|
| 443 |
+
def save_extraction_result(result: ExtractionResult, filename: str = None) -> str:
    """Write an ExtractionResult to a JSON file.

    Args:
        result: Extraction result to persist.
        filename: Target path. When omitted, a name built from the number of
            successful extractions and the current timestamp is generated.

    Returns:
        The name of the file that was written.

    Raises:
        Exception: If serialisation or writing fails; the underlying error is
            attached as ``__cause__``.
    """
    import json
    from datetime import datetime

    if not filename:
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        filename = f"extraction_result_{result.successful_extractions}docs_{timestamp}.json"

    try:
        # mode='json' asks the model for JSON-compatible values.
        result_dict = result.model_dump(mode='json')

        with open(filename, 'w', encoding='utf-8') as f:
            json.dump(result_dict, f, indent=2, ensure_ascii=False)

        return filename

    except Exception as e:
        # Same exception type as before, now chained to keep the root cause.
        raise Exception(f"Erreur lors de la sauvegarde: {e}") from e
|
| 474 |
+
|
| 475 |
+
|
| 476 |
+
def load_extraction_result(filename: str) -> ExtractionResult:
    """Load an ExtractionResult from a JSON file.

    Args:
        filename: Path of the JSON file to read.

    Returns:
        The ExtractionResult rebuilt from the file's top-level JSON object.

    Raises:
        Exception: If reading, parsing or model construction fails; the
            underlying error is attached as ``__cause__``.
    """
    import json

    try:
        with open(filename, 'r', encoding='utf-8') as f:
            data = json.load(f)

        return ExtractionResult(**data)

    except Exception as e:
        # Same exception type as before, now chained to keep the root cause.
        raise Exception(f"Erreur lors du chargement: {e}") from e
|
| 497 |
+
|
| 498 |
+
|
| 499 |
+
# Configuration du logger pour l'agent
|
| 500 |
+
# Module-level logger used by the manual test routines below.
logger = setup_logger("ContentExtractorAgent")

# Manual smoke tests, executed only when the module is run as a script.
if __name__ == "__main__":
    import asyncio
    import json
    import sys
    from src.models.research_models import ResearchOutput

    async def test_with_direct_urls():
        """Extract a fixed list of URLs and log a one-line summary per document."""
        urls = [
            'https://www.iana.org/help/example-domains',
        ]

        logger.info("=== TEST AVEC URLS DIRECTES ===")
        for doc in await extract_content_from_urls(urls):
            logger.info(f"Title: {doc.title}, URL: {doc.url}, Word Count: {doc.word_count}, Language: {doc.language}, Content Length: {len(doc.content)}")

    async def test_with_research_output():
        """Run the extractor on a saved ResearchOutput, then save and reload the result.

        Falls back to ``test_with_direct_urls`` when a file is not found
        while loading or extracting.
        """
        research_file = "research_output_impact_de_lintelligence_artifi_20251116_141136.json"

        try:
            # Load the saved ResearchOutput.
            with open(research_file, 'r', encoding='utf-8') as f:
                research_output = ResearchOutput(**json.load(f))

            logger.info("=== CHARGEMENT DU RESEARCH OUTPUT ===")
            logger.info(f"Sujet: {research_output.query.topic}")
            logger.info(f"URLs à extraire: {len(research_output.results)}")

            agent = ContentExtractorAgent()

            logger.info("=== DÉBUT DE L'EXTRACTION DE CONTENU ===")
            extraction_result = await agent.process_from_research_output(research_output)

            logger.info("=== RÉSULTATS D'EXTRACTION ===")
            logger.info(f"URLs traitées: {extraction_result.total_urls}")
            logger.info(f"Extractions réussies: {extraction_result.successful_extractions}")
            logger.info(f"Extractions échouées: {extraction_result.failed_extractions}")
            logger.info(f"Temps d'exécution: {extraction_result.execution_time:.2f}s")

            # Per-document details.
            for position, doc in enumerate(extraction_result.documents, 1):
                logger.info(f"\n{position}. {doc.title}")
                logger.info(f" URL: {doc.url}")
                logger.info(f" Mots: {doc.word_count}")
                logger.info(f" Langue: {doc.language}")
                logger.info(f" Type: {doc.doc_type}")
                logger.info(f" Contenu (aperçu): {doc.content[:200]}...")

            # URLs that produced no document.
            if extraction_result.failed_urls:
                logger.info("\n❌ URLs en échec:")
                for failed_url in extraction_result.failed_urls:
                    logger.info(f" • {failed_url}")

            logger.info("\n=== SAUVEGARDE DE L'EXTRACTION RESULT ===")

            try:
                filename = save_extraction_result(extraction_result)
                logger.info(f"✅ ExtractionResult sauvegardé dans: {filename}")

                logger.info("📄 Contenu sauvegardé:")
                logger.info(f" • Documents extraits: {len(extraction_result.documents)}")
                logger.info(f" • Temps d'extraction: {extraction_result.execution_time:.2f}s")
                logger.info(f" • Statistiques: {extraction_result.extraction_stats}")

                # Round trip: reload the file and compare the success counter.
                logger.info("=== Test de chargement ===")
                loaded_result = load_extraction_result(filename)
                logger.info("✅ ExtractionResult rechargé avec succès")
                logger.info(f" • Vérification: {len(loaded_result.documents)} documents chargés")

                if loaded_result.successful_extractions != extraction_result.successful_extractions:
                    logger.error("❌ Erreur d'intégrité des données")
                else:
                    logger.info("✅ Intégrité des données vérifiée")

                logger.info("\n📋 EXEMPLE DE FORMAT JSON SAUVEGARDÉ:")
                logger.info("-" * 50)

                # Compact preview: first two documents, content cut to 200 characters.
                preview_documents = [
                    {
                        "title": item.title,
                        "url": str(item.url),
                        "content": item.content[:200] + "...",
                        "word_count": item.word_count,
                        "language": item.language,
                        "doc_type": item.doc_type.value if item.doc_type else None
                    }
                    for item in extraction_result.documents[:2]
                ]
                example_result = {
                    "documents": preview_documents,
                    "total_urls": extraction_result.total_urls,
                    "successful_extractions": extraction_result.successful_extractions,
                    "failed_extractions": extraction_result.failed_extractions,
                    "failed_urls": extraction_result.failed_urls,
                    "execution_time": extraction_result.execution_time,
                    "extraction_stats": extraction_result.extraction_stats
                }

                print(json.dumps(example_result, indent=2, ensure_ascii=False))

            except Exception as save_error:
                logger.error(f"❌ Erreur lors de la sauvegarde: {save_error}")

        except FileNotFoundError:
            logger.error(f"❌ Fichier ResearchOutput non trouvé: {research_file}")
            logger.info("Utilisation de l'exemple avec URLs directes...")
            await test_with_direct_urls()
        except Exception as e:
            logger.error(f"❌ Erreur lors du traitement: {e}")

    # "--direct" as first argument selects the direct-URL test.
    if len(sys.argv) > 1 and sys.argv[1] == "--direct":
        selected_test = test_with_direct_urls
    else:
        selected_test = test_with_research_output
    asyncio.run(selected_test())
|
src/agents/global_synthesizer_agent.py
ADDED
|
@@ -0,0 +1,826 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Agent Global Synthesizer - Synthèse finale et génération de rapport.
|
| 3 |
+
Prend les résumés de l'agent Summarizer et génère un rapport final structuré.
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
import asyncio
|
| 7 |
+
import hashlib
|
| 8 |
+
import re
|
| 9 |
+
from typing import List, Dict, Any, Optional
|
| 10 |
+
from datetime import datetime
|
| 11 |
+
|
| 12 |
+
from src.agents.base_agent import BaseAgent
|
| 13 |
+
from src.models.synthesis_models import (
|
| 14 |
+
GlobalSynthesisInput, GlobalSynthesisOutput, FinalReport,
|
| 15 |
+
ExecutiveSummary, ReportSection, SourceReference, Methodology,
|
| 16 |
+
ReportType, ReportFormat
|
| 17 |
+
)
|
| 18 |
+
from src.models.document_models import DocumentSummary, SummarizationOutput
|
| 19 |
+
from src.models.state_models import AgentType
|
| 20 |
+
from src.services.llm_service import LLMManager, LLMError
|
| 21 |
+
from src.core.logging import setup_logger
|
| 22 |
+
from config.prompts import GLOBAL_SYNTHESIZER_PROMPTS, SYSTEM_PROMPTS
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
class GlobalSynthesizerAgent(BaseAgent[GlobalSynthesisInput, GlobalSynthesisOutput]):
|
| 26 |
+
"""
|
| 27 |
+
Agent responsable de la synthèse finale et de la génération de rapport.
|
| 28 |
+
|
| 29 |
+
Fonctionnalités:
|
| 30 |
+
- Synthèse de multiples résumés de documents
|
| 31 |
+
- Génération de rapport final structuré
|
| 32 |
+
- Analyse transversale et identification de patterns
|
| 33 |
+
- Évaluation de qualité et méthodologie
|
| 34 |
+
- Support de différents formats de rapport
|
| 35 |
+
- Génération de résumé exécutif
|
| 36 |
+
"""
|
| 37 |
+
|
| 38 |
+
def __init__(self, max_retries: int = 2, timeout: float = 300.0):
    """Set up the global synthesizer agent.

    Args:
        max_retries: Retry budget forwarded to the base agent.
        timeout: Timeout forwarded to the base agent (default 300.0;
            presumably seconds, i.e. five minutes for the final synthesis).
    """
    base_settings = {
        "agent_type": AgentType.WRITER,
        "name": "global_synthesizer",
        "max_retries": max_retries,
        "timeout": timeout,
    }
    super().__init__(**base_settings)

    # Services
    self.llm_manager = LLMManager()

    # Configuration
    self.max_concurrent_synthesis = 3  # maximum number of parallel tasks
    self.min_sources_for_analysis = 1  # minimum number of summaries required
|
| 56 |
+
|
| 57 |
+
def validate_input(self, input_data: GlobalSynthesisInput) -> bool:
    """Check that the synthesis input is usable.

    The input is rejected (with an error log) when the summarization output
    is missing, has no summaries, has fewer summaries than
    ``self.min_sources_for_analysis``, or when the original topic is missing
    or shorter than 3 characters once stripped.

    Args:
        input_data: Input holding the summaries to synthesise.

    Returns:
        True if the input is valid, False otherwise.
    """
    def reject(message: str) -> bool:
        self.logger.error(message)
        return False

    summarization = input_data.summarization_output
    if not summarization:
        return reject("Aucune sortie de summarization fournie")

    if not summarization.summaries:
        return reject("Aucun résumé disponible pour la synthèse")

    if len(summarization.summaries) < self.min_sources_for_analysis:
        return reject(f"Minimum {self.min_sources_for_analysis} résumé(s) requis")

    topic = input_data.original_topic
    if not topic or len(topic.strip()) < 3:
        return reject("Sujet original manquant ou trop court")

    return True
|
| 84 |
+
|
| 85 |
+
async def process(self, input_data: GlobalSynthesisInput) -> GlobalSynthesisOutput:
    """Run the global synthesis and build the final report.

    Steps, in order: prepare the data, generate the report sections,
    generate the executive summary, build the methodology and the source
    references, assess quality, assemble the final report, and render the
    formatted outputs.

    Args:
        input_data: Input holding the summaries and synthesis options.

    Returns:
        GlobalSynthesisOutput with the final report, synthesis metadata,
        processing statistics and formatted outputs.

    Raises:
        Exception: Any error raised by a step is logged and re-raised unchanged.
    """
    started_at = datetime.now()
    self.logger.info(f"Début synthèse globale pour: '{input_data.original_topic}'")
    self.logger.info(f"Nombre de résumés à synthétiser: {len(input_data.summarization_output.summaries)}")

    try:
        # Steps 1-3: data preparation, report sections, executive summary.
        prepared_data = self._prepare_synthesis_data(input_data)
        report_sections = await self._generate_report_sections(prepared_data, input_data)
        executive_summary = await self._generate_executive_summary(prepared_data, input_data)

        # Steps 4-6: methodology, source references, quality assessment.
        methodology = self._create_methodology(input_data)
        source_references = self._create_source_references(input_data.summarization_output.summaries)
        quality_scores = await self._assess_quality(input_data, report_sections)

        # Step 7: assemble the final report.
        final_report = self._assemble_final_report(
            input_data,
            executive_summary,
            report_sections,
            methodology,
            source_references,
            quality_scores
        )

        # Step 8: alternative output formats.
        formatted_outputs = await self._generate_formatted_outputs(final_report, input_data)

        processing_time = (datetime.now() - started_at).total_seconds()

        synthesis_metadata = {
            "synthesis_approach": "comprehensive",
            "llm_model_used": "groq/llama-3.1-8b-instant",
            # Quality gate: confidence must exceed 0.6.
            "quality_checks_passed": quality_scores["confidence_score"] > 0.6
        }
        processing_stats = {
            "input_summaries": len(input_data.summarization_output.summaries),
            "synthesis_time": processing_time,
            "final_report_words": final_report.word_count,
            "sections_generated": len(report_sections)
        }
        result = GlobalSynthesisOutput(
            final_report=final_report,
            synthesis_metadata=synthesis_metadata,
            processing_stats=processing_stats,
            formatted_outputs=formatted_outputs
        )

        self.logger.info(f"Synthèse globale terminée en {processing_time:.2f}s")
        self.logger.info(f"Rapport final: {final_report.word_count} mots, {len(report_sections)} sections")

        return result

    except Exception as e:
        self.logger.error(f"Erreur lors de la synthèse globale: {str(e)}")
        raise
|
| 159 |
+
|
| 160 |
+
def _prepare_synthesis_data(self, input_data: GlobalSynthesisInput) -> Dict[str, Any]:
    """Gather the values consumed by the synthesis prompts.

    Args:
        input_data: Synthesis input; its ``summarization_output`` and
            ``original_topic`` attributes are read.

    Returns:
        A dictionary holding the topic, every document summary rendered as
        a text block (blocks separated by a blank line), the common themes,
        consensus points and conflicting views of the summarization output,
        the number of summaries and the average credibility.
    """
    summarization = input_data.summarization_output
    docs = summarization.summaries

    def describe(rank, doc):
        # One labelled text block per document. The labels end up in the
        # prompt, so they are kept exactly as they are.
        return "\n".join((
            f"Document {rank}: {doc.title}",
            f"URL: {doc.url}",
            f"Résumé exécutif: {doc.executive_summary}",
            f"Résumé détaillé: {doc.detailed_summary}",
            f"Sentiment: {doc.sentiment}",
            f"Crédibilité: {doc.credibility_score}",
            f"Points clés: {[kp.title for kp in doc.key_points]}",
        )).strip()

    rendered_docs = [describe(rank, doc) for rank, doc in enumerate(docs, 1)]

    return {
        "topic": input_data.original_topic,
        "document_summaries": "\n\n".join(rendered_docs),
        "common_themes": summarization.common_themes,
        "consensus_points": summarization.consensus_points,
        "conflicting_views": summarization.conflicting_views,
        "summaries_count": len(docs),
        "average_credibility": summarization.average_credibility
    }
|
| 187 |
+
|
| 188 |
+
async def _generate_report_sections(
    self,
    prepared_data: Dict[str, Any],
    input_data: GlobalSynthesisInput
) -> List[ReportSection]:
    """Generate the main report sections with two concurrent LLM calls.

    Args:
        prepared_data: Prompt values; all keys feed the final-synthesis
            prompt, ``topic`` and ``document_summaries`` feed the thematic
            prompt.
        input_data: Original synthesis input (not read here).

    Returns:
        The sections parsed from the main synthesis plus the thematic
        analysis section, sorted by their ``order`` attribute.
    """
    synthesis_prompt = GLOBAL_SYNTHESIZER_PROMPTS['final_synthesis'].format(**prepared_data)
    thematic_prompt = GLOBAL_SYNTHESIZER_PROMPTS['thematic_analysis'].format(
        topic=prepared_data["topic"],
        summaries=prepared_data["document_summaries"]
    )

    # Both requests run concurrently; a failure is returned as an exception
    # object instead of cancelling the other request.
    outcomes = await asyncio.gather(
        self._get_llm_response(synthesis_prompt, "main_synthesis"),
        self._get_llm_response(thematic_prompt, "thematic_analysis"),
        return_exceptions=True
    )

    collected = []
    for outcome in outcomes:
        if isinstance(outcome, Exception):
            self.logger.error(f"Erreur génération section: {outcome}")
            continue

        kind, text = outcome
        if kind == "main_synthesis":
            # The main synthesis is split into one section per heading.
            collected.extend(self._parse_main_synthesis(text))
        elif kind == "thematic_analysis":
            # The thematic analysis is kept whole, as a single section.
            collected.append(
                ReportSection(
                    title="Analyse Thématique Détaillée",
                    content=text,
                    order=2
                )
            )

    # Stable sort: sections sharing an order keep their insertion order.
    return sorted(collected, key=lambda section: section.order)
|
| 240 |
+
|
| 241 |
+
async def _generate_executive_summary(
    self,
    prepared_data: Dict[str, Any],
    input_data: GlobalSynthesisInput
) -> ExecutiveSummary:
    """Ask the LLM for an executive summary and parse its answer.

    Args:
        prepared_data: Prompt values produced by the data-preparation step.
        input_data: Original synthesis input (not read here).

    Returns:
        The parsed executive summary, or the canned fallback summary when
        the LLM call or the parsing raises.
    """
    # Label used in the prompt -> key of ``prepared_data`` providing it.
    field_sources = {
        "summaries": "document_summaries",
        "themes": "common_themes",
        "consensus": "consensus_points",
        "conflicts": "conflicting_views",
        "credibility": "average_credibility",
    }
    analysis_data = {label: prepared_data[key] for label, key in field_sources.items()}

    prompt = GLOBAL_SYNTHESIZER_PROMPTS['executive_summary'].format(
        topic=prepared_data["topic"],
        analysis_data=str(analysis_data)
    )

    try:
        completion_options = dict(
            system_prompt=SYSTEM_PROMPTS['global_synthesizer'],
            temperature=0.3,
            max_tokens=1500,
        )
        raw_summary = await self.llm_manager.get_completion(prompt, **completion_options)
        # Split the free-form answer into findings / insights / recommendations.
        return self._parse_executive_summary(raw_summary)
    except Exception as e:
        self.logger.error(f"Erreur génération résumé exécutif: {e}")
        # Best effort: fall back to a simple generic summary.
        return self._create_fallback_executive_summary(prepared_data)
|
| 277 |
+
|
| 278 |
+
def _create_methodology(self, input_data: GlobalSynthesisInput) -> Methodology:
    """Describe the methodology used to produce the report.

    Args:
        input_data: Synthesis input; the summaries and the average
            credibility of its ``summarization_output`` are read.

    Returns:
        A ``Methodology`` with the fixed research approach, analysis
        methods and limitations, the number of analysed sources and a
        short data-quality statement.
    """
    analysis_methods = [
        "Extraction automatique de contenu web",
        "Analyse et résumé par intelligence artificielle",
        "Synthèse thématique transversale",
        "Évaluation de crédibilité des sources"
    ]

    limitations = [
        "Analyse limitée aux sources web accessibles publiquement",
        "Évaluation de crédibilité basée sur des critères automatisés",
        "Synthèse générée par IA - vérification humaine recommandée"
    ]

    summarization = input_data.summarization_output
    summaries_count = len(summarization.summaries)

    # Use the neutral 0.5 only when no average is available. The previous
    # ``or 0.5`` also replaced a genuine average of 0.0, which overstated
    # the quality of the sources.
    avg_credibility = summarization.average_credibility
    if avg_credibility is None:
        avg_credibility = 0.5

    quality_assessment = "\n".join((
        f"Qualité des données évaluée sur {summaries_count} sources analysées.",
        f"Score de crédibilité moyen: {avg_credibility:.2f}/1.0.",
        "Sources diversifiées avec analyse automatisée de sentiment et biais.",
    ))

    return Methodology(
        research_approach="Recherche web automatisée avec synthèse par IA",
        sources_count=summaries_count,
        analysis_methods=analysis_methods,
        limitations=limitations,
        data_quality_assessment=quality_assessment
    )
|
| 310 |
+
|
| 311 |
+
def _create_source_references(self, summaries: List[DocumentSummary]) -> List[SourceReference]:
    """Build one bibliographic reference per document summary.

    Args:
        summaries: Document summaries to reference.

    Returns:
        A list of ``SourceReference`` objects in the same order as
        ``summaries``. ``author`` and ``publication_date`` are ``None``
        when the summary has no ``author`` / ``published_date`` attribute.
    """
    return [
        SourceReference(
            title=doc.title,
            url=str(doc.url),
            author=getattr(doc, 'author', None),
            publication_date=getattr(doc, 'published_date', None),
            credibility_score=doc.credibility_score,
            # By default every source counts as cited once.
            citation_count=1
        )
        for doc in summaries
    ]
|
| 328 |
+
|
| 329 |
+
async def _assess_quality(
    self,
    input_data: GlobalSynthesisInput,
    sections: List[ReportSection]
) -> Dict[str, float]:
    """Score the quality of the analysis and of the generated report.

    Args:
        input_data: Synthesis input; the summaries of its
            ``summarization_output`` are read.
        sections: Report sections generated so far (only counted).

    Returns:
        A dictionary with ``confidence_score``, ``completeness_score``,
        ``reliability_score`` and ``coherence_score``.
    """
    summaries = input_data.summarization_output.summaries

    # Skip only missing scores. A score of 0.0 is a legitimate (very low)
    # rating; filtering on truthiness dropped it and inflated the average.
    credibility_scores = [
        s.credibility_score for s in summaries if s.credibility_score is not None
    ]

    # Completeness saturates at 5 sources.
    completeness_score = min(len(summaries) / 5.0, 1.0)

    # Mean credibility, or a neutral 0.5 when no source carries a score.
    if credibility_scores:
        reliability_score = sum(credibility_scores) / len(credibility_scores)
    else:
        reliability_score = 0.5

    # Coherence saturates at 3 sections.
    coherence_score = min(len(sections) / 3.0, 1.0)

    # Overall confidence: weighted mix of the three metrics above.
    confidence_score = (completeness_score * 0.4 +
                        reliability_score * 0.4 +
                        coherence_score * 0.2)

    return {
        "confidence_score": confidence_score,
        "completeness_score": completeness_score,
        "reliability_score": reliability_score,
        "coherence_score": coherence_score
    }
|
| 360 |
+
|
| 361 |
+
def _assemble_final_report(
    self,
    input_data: GlobalSynthesisInput,
    executive_summary: ExecutiveSummary,
    sections: List[ReportSection],
    methodology: Methodology,
    source_references: List[SourceReference],
    quality_scores: Dict[str, float]
) -> FinalReport:
    """Assemble the complete final report from its already-built parts.

    Args:
        input_data: Synthesis input (topic, report type/format and the
            summarization output are read).
        executive_summary: Executive summary of the report.
        sections: Main report sections.
        methodology: Methodology description.
        source_references: References of the analysed sources.
        quality_scores: Mapping providing ``confidence_score`` and
            ``completeness_score``.

    Returns:
        The populated ``FinalReport``. ``processing_time`` is set to 0.0
        here and ``emerging_trends`` is left empty.
    """
    topic = input_data.original_topic
    report_id = self._generate_report_id(topic)
    title = f"Analyse de Recherche: {topic.title()}"
    source_total = len(source_references)

    # Generic introduction.
    introduction = (
        f'Ce rapport présente une analyse complète du sujet "{topic}"\n'
        f"basée sur l'analyse de {source_total} sources documentaires.\n"
        "\n"
        "L'analyse a été réalisée par un système d'intelligence artificielle utilisant des\n"
        "méthodes d'extraction automatique de contenu, de résumé intelligent et de synthèse\n"
        "thématique transversale."
    )

    # Generic conclusion, quoting the overall confidence score.
    conclusion = (
        f'Cette analyse de "{topic}" révèle des insights importants\n'
        f"basés sur {source_total} sources analysées.\n"
        "\n"
        "Les résultats présentés dans ce rapport offrent une perspective complète sur\n"
        "les différents aspects du sujet, avec un score de confiance global de\n"
        f"{quality_scores['confidence_score']:.2f}/1.0.\n"
        "\n"
        "Pour des décisions importantes, il est recommandé de compléter cette analyse\n"
        "par une vérification humaine et des sources supplémentaires si nécessaire."
    )

    # Approximate word count: whitespace-separated tokens of every text part.
    counted_texts = [introduction, conclusion, executive_summary.summary_text]
    counted_texts.extend(section.content for section in sections)
    word_count = sum(len(text.split()) for text in counted_texts)

    summarization_output = input_data.summarization_output

    return FinalReport(
        report_id=report_id,
        title=title,
        topic=topic,
        report_type=input_data.report_type,
        report_format=input_data.report_format,

        executive_summary=executive_summary,
        introduction=introduction,
        main_sections=sections,
        conclusion=conclusion,

        # At most ten entries are kept for each list.
        key_themes=summarization_output.common_themes[:10],
        consensus_points=summarization_output.consensus_points[:10],
        conflicting_viewpoints=summarization_output.conflicting_views[:10],
        emerging_trends=[],  # TODO: fill in with a dedicated analysis

        methodology=methodology,
        sources=source_references,

        confidence_score=quality_scores["confidence_score"],
        completeness_score=quality_scores["completeness_score"],

        total_sources_analyzed=source_total,
        processing_time=0.0,  # placeholder; meant to be updated by the caller
        word_count=word_count
    )
|
| 439 |
+
|
| 440 |
+
async def _generate_formatted_outputs(
    self,
    final_report: FinalReport,
    input_data: GlobalSynthesisInput
) -> Dict[str, str]:
    """Render the report in every supported output format.

    Args:
        final_report: Report to render.
        input_data: Original synthesis input (not read here).

    Returns:
        A dictionary with the keys ``"markdown"``, ``"text"`` and
        ``"html"``, each mapped to the rendered report.
    """
    # Rendered in this order: Markdown (default), plain text, basic HTML.
    return {
        "markdown": self._format_as_markdown(final_report),
        "text": self._format_as_text(final_report),
        "html": self._format_as_html(final_report),
    }
|
| 462 |
+
|
| 463 |
+
def _format_as_markdown(self, report: FinalReport) -> str:
    """Render the report as a Markdown document.

    The document contains, in order: header metadata, executive summary,
    introduction, main sections, up to five key themes (when any),
    conclusion, methodology and the numbered source list.

    Args:
        report: Report to render.

    Returns:
        The Markdown text.
    """
    def bullets(items):
        # One "- item" line per entry.
        return "\n".join(f"- {item}" for item in items)

    summary = report.executive_summary
    methodology = report.methodology

    # Pieces are collected and joined once instead of growing a string.
    chunks = [f"""# {report.title}

**Sujet:** {report.topic}
**Date de génération:** {report.generated_at.strftime('%d/%m/%Y %H:%M')}
**ID du rapport:** {report.report_id}

---

## Résumé Exécutif

{summary.summary_text}

### Conclusions Principales
{bullets(summary.key_findings)}

### Insights Clés
{bullets(summary.main_insights)}

### Recommandations
{bullets(summary.recommendations)}

---

## Introduction

{report.introduction}

---

"""]

    # Main sections.
    chunks.extend(
        f"## {section.title}\n\n{section.content}\n\n---\n\n"
        for section in report.main_sections
    )

    # Key themes (first five only).
    if report.key_themes:
        chunks.append("## Thèmes Principaux\n\n")
        chunks.append(bullets(report.key_themes[:5]))
        chunks.append("\n\n---\n\n")

    # Conclusion.
    chunks.append(f"## Conclusion\n\n{report.conclusion}\n\n---\n\n")

    # Methodology (the heading literal was mis-encoded; restored here).
    chunks.append(f"""## Méthodologie

**Approche:** {methodology.research_approach}
**Sources analysées:** {methodology.sources_count}
**Score de confiance:** {report.confidence_score:.2f}/1.0

### Méthodes d'Analyse
{bullets(methodology.analysis_methods)}

### Limitations
{bullets(methodology.limitations)}

---

## Sources

""")

    # Numbered source list.
    for i, source in enumerate(report.sources, 1):
        chunks.append(f"{i}. **{source.title}** \n")
        chunks.append(f" URL: {source.url} \n")
        # A score of 0.0 is a real rating and must be shown; only a missing
        # score (None) is omitted.
        if source.credibility_score is not None:
            chunks.append(f" Crédibilité: {source.credibility_score:.2f}/1.0 \n")
        chunks.append("\n")

    return "".join(chunks)
|
| 538 |
+
|
| 539 |
+
def _format_as_text(self, report: FinalReport) -> str:
    """Render the report as plain text.

    The output contains the title block, the executive summary with its
    key findings, the introduction, every main section under an
    upper-cased heading, and the conclusion.

    Args:
        report: Report to render.

    Returns:
        The plain-text rendering.
    """
    summary = report.executive_summary
    title_rule = '=' * len(report.title)
    generated_on = report.generated_at.strftime('%d/%m/%Y %H:%M')
    findings = chr(10).join(f"• {finding}" for finding in summary.key_findings)

    header = f"""
{report.title}
{title_rule}

Sujet: {report.topic}
Date: {generated_on}
ID: {report.report_id}

RÉSUMÉ EXÉCUTIF
{'-' * 20}

{summary.summary_text}

CONCLUSIONS PRINCIPALES:
{findings}

INTRODUCTION
{'-' * 15}

{report.introduction}

"""

    # Each section: upper-cased title, a rule as long as the title, the body.
    body = "".join(
        f"\n{section.title.upper()}\n"
        + "-" * len(section.title) + "\n\n"
        + section.content + "\n\n"
        for section in report.main_sections
    )

    footer = f"CONCLUSION\n{'-' * 10}\n\n{report.conclusion}\n\n"

    return header + body + footer
|
| 574 |
+
|
| 575 |
+
def _format_as_html(self, report: FinalReport) -> str:
    """Render the report as a basic, self-contained HTML page.

    Every piece of report text is HTML-escaped before being inserted: the
    titles, summaries, section bodies and source data originate from web
    pages and LLM output, so unescaped insertion would let a source inject
    markup or scripts into the page.

    Args:
        report: Report to render.

    Returns:
        The HTML document as a string.
    """
    # Local import: the function previously used ``html`` as a variable
    # name, so the module is not assumed to be imported at file level.
    from html import escape

    def esc(value):
        # Escapes &, <, > and both quote characters (safe in attributes).
        return escape(str(value))

    title = esc(report.title)
    findings_items = ''.join(
        f"<li>{esc(finding)}</li>"
        for finding in report.executive_summary.key_findings
    )

    parts = [f"""
<!DOCTYPE html>
<html>
<head>
<title>{title}</title>
<meta charset="utf-8">
<style>
body {{ font-family: Arial, sans-serif; margin: 40px; line-height: 1.6; }}
h1 {{ color: #333; border-bottom: 2px solid #333; }}
h2 {{ color: #666; border-bottom: 1px solid #ccc; }}
.metadata {{ background: #f5f5f5; padding: 15px; border-radius: 5px; }}
.section {{ margin: 20px 0; }}
ul {{ margin: 10px 0; }}
</style>
</head>
<body>
<h1>{title}</h1>

<div class="metadata">
<strong>Sujet:</strong> {esc(report.topic)}<br>
<strong>Date:</strong> {report.generated_at.strftime('%d/%m/%Y %H:%M')}<br>
<strong>Score de confiance:</strong> {report.confidence_score:.2f}/1.0
</div>

<h2>Résumé Exécutif</h2>
<p>{esc(report.executive_summary.summary_text)}</p>

<h3>Conclusions Principales</h3>
<ul>
{findings_items}
</ul>

<h2>Introduction</h2>
<p>{esc(report.introduction)}</p>
"""]

    # Main sections: escape first, then turn newlines into <br> so the
    # inserted tags are the only markup in the body.
    for section in report.main_sections:
        section_body = esc(section.content).replace(chr(10), '<br>')
        parts.append(f"""
<h2>{esc(section.title)}</h2>
<div class="section">
{section_body}
</div>
""")

    # Conclusion and start of the source list.
    parts.append(f"""
<h2>Conclusion</h2>
<p>{esc(report.conclusion)}</p>

<h2>Sources</h2>
<ol>
""")

    for source in report.sources:
        url = esc(source.url)
        # A score of 0.0 is a real rating; only a missing score is omitted.
        if source.credibility_score is not None:
            credibility = f"<br>Crédibilité: {source.credibility_score:.2f}/1.0"
        else:
            credibility = ""
        parts.append(f"""
<li>
<strong>{esc(source.title)}</strong><br>
<a href="{url}">{url}</a>
{credibility}
</li>
""")

    parts.append("""
</ol>
</body>
</html>
""")

    return "".join(parts)
|
| 648 |
+
|
| 649 |
+
# Méthodes utilitaires
|
| 650 |
+
|
| 651 |
+
async def _get_llm_response(self, prompt: str, task_type: str) -> tuple:
    """Run one LLM completion and tag the answer with its task type.

    Args:
        prompt: Prompt sent to the LLM.
        task_type: Label identifying the task; returned with the answer so
            that results gathered concurrently can be told apart.

    Returns:
        A ``(task_type, response)`` tuple.

    Raises:
        Exception: Any error raised while requesting the completion is
            logged and re-raised. Previously the error was converted into
            an ``"Erreur: ..."`` string returned as the response, which
            the section builder then published as report content; its
            ``asyncio.gather(..., return_exceptions=True)`` handling
            expects a real exception so the failed section is skipped.
    """
    try:
        response = await self.llm_manager.get_completion(
            prompt,
            system_prompt=SYSTEM_PROMPTS['global_synthesizer'],
            temperature=0.3,
            max_tokens=3000
        )
    except Exception as e:
        self.logger.error(f"Erreur LLM pour {task_type}: {e}")
        raise
    return task_type, response
|
| 664 |
+
|
| 665 |
+
def _parse_main_synthesis(self, content: str) -> List[ReportSection]:
    """Split the main synthesis into one section per ``##`` heading.

    Only level-2 headings that start a line open a new section. Deeper
    headings (``###`` and below) and ``##`` sequences in the middle of a
    line stay inside the current section's content. The previous,
    unanchored pattern cut a section at the first ``##`` it met, so every
    ``###`` sub-heading was promoted to a top-level section.

    Args:
        content: Markdown text returned by the LLM.

    Returns:
        The sections, numbered from 1 in order of appearance. When no
        level-2 heading is found, a single "Analyse Générale" section
        holding the whole text is returned.
    """
    # Group 1: heading text (single line). Group 2: everything up to the
    # next line-initial level-2 heading or the end of the text.
    section_pattern = r'^[ \t]*##[ \t]+([^\n]+?)[ \t]*\n(.*?)(?=^[ \t]*##[ \t]|\Z)'
    matches = re.findall(section_pattern, content, re.DOTALL | re.MULTILINE)

    sections = [
        ReportSection(
            title=title.strip(),
            content=section_content.strip(),
            order=position
        )
        for position, (title, section_content) in enumerate(matches, 1)
    ]

    # No heading found: keep the whole text as one general section.
    if not sections:
        sections.append(ReportSection(
            title="Analyse Générale",
            content=content,
            order=1
        ))

    return sections
|
| 691 |
+
|
| 692 |
+
def _parse_executive_summary(self, content: str) -> ExecutiveSummary:
    """Extract findings, insights and recommendations from the LLM answer.

    The text is scanned line by line. A line containing one of the section
    keywords selects the list that following bullet points are added to.
    Bullet lines (``-``, ``•`` or ``* ``) longer than 10 characters are
    added to the selected list.

    Bullets are recognised before keywords are looked for. Previously a
    bullet that merely mentioned "conclusion", "insight",
    "recommandation", ... was taken for a heading: the bullet was lost and
    the following bullets were filed under the wrong list. A bullet is
    still treated as a heading when it contains a keyword and ends with a
    colon (e.g. ``- **Recommandations:**``).

    Args:
        content: Raw executive-summary text returned by the LLM.

    Returns:
        An ``ExecutiveSummary`` with at most five entries per list and the
        raw text truncated to 500 characters (plus "...") as summary text.
        When no finding was extracted, the first sentences of the text are
        used as findings.
    """
    # Target list -> lowercase keywords announcing it, in priority order.
    section_keywords = (
        ('findings', ('conclusion', 'finding')),
        ('insights', ('insight', 'découverte')),
        ('recommendations', ('recommandation', 'recommendation')),
    )
    # "-" and "•" need no following space (as before); "*" does, so that
    # bold text such as "**Titre**" is not mistaken for a bullet.
    bullet_pattern = re.compile(r'^(?:[-•]\s*|\*\s+)(.*)$')

    collected = {label: [] for label, _ in section_keywords}
    current_section = None

    for raw_line in content.split('\n'):
        line = raw_line.strip()
        lowered = line.lower()
        announced = next(
            (label for label, keywords in section_keywords
             if any(keyword in lowered for keyword in keywords)),
            None,
        )

        bullet = bullet_pattern.match(line)
        if bullet is not None:
            point = bullet.group(1).strip()
            heading_like = announced is not None and point.rstrip('*_ ').endswith(':')
            if not heading_like:
                if len(point) > 10 and current_section is not None:
                    collected[current_section].append(point)
                continue

        if announced is not None:
            current_section = announced

    key_findings = collected['findings']

    # Fallback: use the first sentences of the text as findings.
    if not key_findings:
        sentences = content.split('.')[:3]
        key_findings = [s.strip() + '.' for s in sentences if len(s.strip()) > 20]

    summary_text = content[:500] + "..." if len(content) > 500 else content

    return ExecutiveSummary(
        key_findings=key_findings[:5],
        main_insights=collected['insights'][:5],
        recommendations=collected['recommendations'][:5],
        summary_text=summary_text
    )
|
| 736 |
+
|
| 737 |
+
def _create_fallback_executive_summary(self, prepared_data: Dict[str, Any]) -> ExecutiveSummary:
    """Build a generic executive summary used when the LLM path fails.

    Args:
        prepared_data: Prompt values; ``summaries_count`` and ``topic``
            are required, ``average_credibility`` is optional.

    Returns:
        An ``ExecutiveSummary`` made of canned findings, insights and
        recommendations mentioning the topic and the number of sources.
    """
    # The key can be present with a None value, in which case the ``.get``
    # default does not apply and formatting None with ":.2f" would raise
    # inside the fallback itself. Treat None like a missing value.
    credibility = prepared_data.get('average_credibility')
    if credibility is None:
        credibility = 0.5

    return ExecutiveSummary(
        key_findings=[
            f"Analyse basée sur {prepared_data['summaries_count']} sources documentaires",
            f"Score de crédibilité moyen: {credibility:.2f}/1.0",
            "Synthèse générée automatiquement par IA"
        ],
        main_insights=[
            "Analyse transversale des différentes perspectives sur le sujet",
            "Identification des thèmes récurrents et des consensus",
            "Évaluation critique des sources et de leur fiabilité"
        ],
        recommendations=[
            "Vérification humaine recommandée pour les décisions importantes",
            "Complément par des sources supplémentaires si nécessaire",
            "Mise à jour régulière de l'analyse"
        ],
        summary_text=f"Cette analyse du sujet '{prepared_data['topic']}' synthétise {prepared_data['summaries_count']} sources documentaires pour fournir une vue d'ensemble complète et objective."
    )
|
| 758 |
+
|
| 759 |
+
def _generate_report_id(self, topic: str) -> str:
    """Build a report identifier from the topic and the current time.

    Args:
        topic: Report topic; its MD5 digest provides the ID suffix.

    Returns:
        A string of the form ``rpt_<YYYYMMDD>_<HHMM>_<8 hex digits>``.
        The timestamp has minute resolution, so two reports on the same
        topic generated within the same minute share the same ID.
    """
    # MD5 is used only as a short fingerprint of the topic.
    fingerprint = hashlib.md5(topic.encode()).hexdigest()
    stamp = f"{datetime.now():%Y%m%d_%H%M}"
    return "_".join(("rpt", stamp, fingerprint[:8]))
|
| 767 |
+
|
| 768 |
+
# Convenience entry point: run the global synthesis directly from a SummarizationOutput.
|
| 769 |
+
async def process_from_summarization_output(self, summarization_output: SummarizationOutput) -> GlobalSynthesisOutput:
    """Run the global synthesis directly from the summarizer's output.

    Args:
        summarization_output: Output of the summarization step. Its
            ``topic`` attribute, when present and non-empty, is used as
            the report topic; otherwise a generic topic is used.

    Returns:
        The result of ``process`` for the synthesis input built here.

    Raises:
        ValueError: If ``validate_input`` rejects the synthesis input.
    """
    topic_val = getattr(summarization_output, "topic", None)
    if not topic_val:
        # No usable topic on the summarizer output: use a generic one.
        topic_val = "Sujet de synthèse"

    synthesis_input = GlobalSynthesisInput(
        summarization_output=summarization_output,
        original_topic=topic_val
    )

    input_is_valid = self.validate_input(synthesis_input)
    if not input_is_valid:
        self.logger.error("Entrée de synthèse invalide. Abandon.")
        raise ValueError("Invalid synthesis input")

    self.logger.info(f"Démarrage de la synthèse globale pour '{synthesis_input.original_topic}'...")
    return await self.process(synthesis_input)
|
| 784 |
+
|
| 785 |
+
|
| 786 |
+
|
| 787 |
+
# Exemple d'utilisation
|
| 788 |
+
# Command-line usage example.
if __name__ == "__main__":
    import argparse
    import asyncio
    import json
    import os
    import sys
    from pathlib import Path

    from src.models.document_models import Document, DocumentSummary, SummarizationOutput, KeyPoint

    logger = setup_logger("global_synthesizer_cli")

    def load_summarization_output(json_path):
        """Load a ``SummarizationOutput`` from a JSON file."""
        with open(json_path, "r", encoding="utf-8") as f:
            data = json.load(f)
        return SummarizationOutput(**data)

    async def run_synthesis(input_json, topic=None, output_json=None):
        """Run the global synthesis on a summarizer JSON file and save it.

        Args:
            input_json: Path of the summarizer output (JSON).
            topic: Optional research topic. When given it is used as the
                report topic; previously the ``--topic`` option was
                accepted but silently ignored.
            output_json: Optional output path; a timestamped file name is
                generated when omitted.
        """
        summarization_output = load_summarization_output(input_json)

        agent = GlobalSynthesizerAgent()
        if topic:
            # Explicit topic: build the synthesis input ourselves, with the
            # same validation as ``process_from_summarization_output``.
            synthesis_input = GlobalSynthesisInput(
                summarization_output=summarization_output,
                original_topic=topic
            )
            if not agent.validate_input(synthesis_input):
                logger.error("Entrée de synthèse invalide. Abandon.")
                raise ValueError("Invalid synthesis input")
            output = await agent.process(synthesis_input)
        else:
            output = await agent.process_from_summarization_output(summarization_output)

        # Generate the output file name when none was provided.
        if not output_json:
            ts = datetime.now().strftime("%Y%m%d_%H%M%S")
            output_json = f"synthesis_output_{ts}.json"

        # Note: model_dump_json does not accept ensure_ascii.
        with open(output_json, "w", encoding="utf-8") as f:
            f.write(output.model_dump_json(indent=2))
        logger.info(f"Synthèse sauvegardée dans {output_json}")
        print(f"\nSynthèse globale terminée. Rapport sauvegardé dans: {output_json}")

    # The guard used to be repeated here, nested inside itself.
    parser = argparse.ArgumentParser(description="Global Synthesizer Agent CLI")
    parser.add_argument("--input", required=True, help="Chemin du fichier JSON de sortie du summarizer")
    parser.add_argument("--topic", required=False, help="Sujet de recherche (optionnel)")
    parser.add_argument("--output", required=False, help="Chemin du fichier de sortie JSON (optionnel)")
    args = parser.parse_args()
    asyncio.run(run_synthesis(args.input, args.topic, args.output))
|
src/agents/researcher_agent.py
ADDED
|
@@ -0,0 +1,642 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Agent Researcher - Premier agent du pipeline.
|
| 3 |
+
Effectue la recherche web sur un sujet donné et retourne des sources pertinentes.
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
from typing import List, Dict, Any, Optional
|
| 7 |
+
import asyncio
|
| 8 |
+
from datetime import datetime
|
| 9 |
+
|
| 10 |
+
from src.agents.base_agent import BaseAgent
|
| 11 |
+
from src.models.research_models import ResearchQuery, ResearchOutput, SearchResult
|
| 12 |
+
from src.models.state_models import AgentType
|
| 13 |
+
from src.services.search_api import SearchAPIManager, SearchAPIError
|
| 14 |
+
from src.services.llm_service import LLMService, LLMError
|
| 15 |
+
from src.core.logging import setup_logger
|
| 16 |
+
from config.prompts import RESEARCHER_PROMPT, SYSTEM_PROMPTS, KEYWORD_EXTRACTION_PROMPT
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
class ResearcherAgent(BaseAgent[ResearchQuery, ResearchOutput]):
|
| 20 |
+
"""
|
| 21 |
+
Agent de recherche web.
|
| 22 |
+
|
| 23 |
+
Responsabilités:
|
| 24 |
+
- Recevoir une requête de recherche
|
| 25 |
+
- Effectuer des recherches sur le web via des APIs
|
| 26 |
+
- Analyser et filtrer les résultats
|
| 27 |
+
- Retourner une liste de sources pertinentes
|
| 28 |
+
"""
|
| 29 |
+
|
| 30 |
+
def __init__(
    self,
    name: str = "researcher",
    max_retries: int = 3,
    timeout: float = 120.0,  # two minutes are allotted to a search
):
    """Create the researcher agent and the services it depends on.

    Args:
        name: Agent name forwarded to the base agent.
        max_retries: Retry budget forwarded to the base agent.
        timeout: Timeout in seconds forwarded to the base agent.

    Raises:
        Exception: Anything raised while building ``SearchAPIManager`` or
            ``LLMService``; the error is logged and re-raised unchanged.
    """
    super().__init__(
        agent_type=AgentType.RESEARCHER,
        name=name,
        max_retries=max_retries,
        timeout=timeout,
    )

    # Search backend: log the reason, then let the failure propagate.
    try:
        self.search_manager = SearchAPIManager()
        self.logger.info(
            f"APIs disponibles: {self.search_manager.get_available_apis()}"
        )
    except Exception as init_error:
        self.logger.error(
            f"Impossible d'initialiser le gestionnaire de recherche: {init_error}"
        )
        raise

    # LLM service, used for keyword extraction.
    try:
        self.llm_service = LLMService()
        self.logger.info("Service LLM initialisé pour l'extraction de mots-clés")
    except Exception as init_error:
        self.logger.error(f"Impossible d'initialiser le service LLM: {init_error}")
        raise

    # Defaults merged into every search request issued by ``process``.
    self.default_search_params = dict(
        preferred_api="tavily",
        search_depth="basic",
        include_answer=True,
    )
|
| 65 |
+
|
| 66 |
+
def validate_input(self, input_data: ResearchQuery) -> bool:
    """Check that a research query can be processed.

    Args:
        input_data: Query whose ``topic`` and ``max_results`` are inspected.

    Returns:
        ``False`` (after logging an error) when the topic is empty or shorter
        than 3 characters once stripped, or when ``max_results`` is not in
        the 1..20 range; ``True`` otherwise.
    """
    topic = input_data.topic
    if not topic or len(topic.strip()) < 3:
        self.logger.error("Le sujet de recherche doit contenir au moins 3 caractères")
        return False

    wanted = input_data.max_results
    if wanted <= 0 or wanted > 20:
        self.logger.error("Le nombre de résultats doit être entre 1 et 20")
        return False

    return True
|
| 85 |
+
|
| 86 |
+
async def process(self, input_data: ResearchQuery) -> ResearchOutput:
    """Run a web search for the query and package the ranked results.

    Steps: build the query string, merge the default search parameters with
    the query's depth and result count, call the search manager, score and
    filter the hits, then truncate to ``max_results``.

    Args:
        input_data: The research query to execute.

    Returns:
        A ``ResearchOutput`` holding the retained results, the number of raw
        hits, the elapsed time in seconds and the preferred API name.

    Raises:
        SearchAPIError: Logged and re-raised when the search backend fails.
        Exception: Any other failure is logged and re-raised unchanged.
    """
    started_at = datetime.now()
    self.logger.info(f"Début de recherche pour: '{input_data.topic}'")

    search_query = self._prepare_search_query(input_data)
    self.logger.info(f"Requête préparée: '{search_query}'")

    # Per-request values override the agent-level defaults.
    search_params = dict(self.default_search_params)
    search_params["search_depth"] = input_data.search_depth
    search_params["max_results"] = input_data.max_results

    try:
        raw_hits = await self.search_manager.search(query=search_query, **search_params)

        ranked_hits = self._filter_and_rank_results(
            raw_hits, input_data.topic, input_data.keywords
        )
        kept_hits = ranked_hits[:input_data.max_results]

        elapsed = (datetime.now() - started_at).total_seconds()

        research_output = ResearchOutput(
            query=input_data,
            results=kept_hits,
            total_found=len(raw_hits),
            search_time=elapsed,
            search_engine=search_params["preferred_api"],
            timestamp=datetime.now(),
        )

        self.logger.info(
            f"Recherche terminée: {len(kept_hits)} résultats finaux "
            f"sur {len(raw_hits)} trouvés en {elapsed:.2f}s"
        )
        return research_output

    except SearchAPIError as api_error:
        self.logger.error(f"Erreur de recherche: {api_error}")
        raise
    except Exception as unexpected:
        self.logger.error(f"Erreur inattendue lors de la recherche: {unexpected}")
        raise
|
| 153 |
+
|
| 154 |
+
def _prepare_search_query(self, query: ResearchQuery) -> str:
|
| 155 |
+
"""
|
| 156 |
+
Prépare la requête de recherche en optimisant les mots-clés.
|
| 157 |
+
|
| 158 |
+
Args:
|
| 159 |
+
query: Requête originale
|
| 160 |
+
|
| 161 |
+
Returns:
|
| 162 |
+
Requête optimisée pour la recherche
|
| 163 |
+
"""
|
| 164 |
+
# Commencer par le sujet principal
|
| 165 |
+
search_terms = [query.topic]
|
| 166 |
+
|
| 167 |
+
# Ajouter les mots-clés s'ils existent
|
| 168 |
+
if query.keywords:
|
| 169 |
+
# Éviter la redondance avec le sujet principal
|
| 170 |
+
unique_keywords = [
|
| 171 |
+
kw for kw in query.keywords
|
| 172 |
+
if kw.lower() not in query.topic.lower()
|
| 173 |
+
]
|
| 174 |
+
search_terms.extend(unique_keywords)
|
| 175 |
+
|
| 176 |
+
# Joindre avec des espaces
|
| 177 |
+
search_query = " ".join(search_terms)
|
| 178 |
+
|
| 179 |
+
##################### A Améliorer selon ce qu'on veut rechercher #################################
|
| 180 |
+
# Optimisations spécifiques selon la profondeur
|
| 181 |
+
##################################################################################################
|
| 182 |
+
if query.search_depth == "advanced":
|
| 183 |
+
# Pour les recherches avancées, ajouter des termes de contexte
|
| 184 |
+
if "intelligence artificielle" in search_query.lower() or "ia" in search_query.lower():
|
| 185 |
+
search_query += " 2024 2025 récent"
|
| 186 |
+
if "emploi" in search_query.lower() or "travail" in search_query.lower():
|
| 187 |
+
search_query += " marché impact"
|
| 188 |
+
|
| 189 |
+
return search_query.strip()
|
| 190 |
+
|
| 191 |
+
def _filter_and_rank_results(
|
| 192 |
+
self,
|
| 193 |
+
results: List[SearchResult],
|
| 194 |
+
topic: str,
|
| 195 |
+
keywords: List[str]
|
| 196 |
+
) -> List[SearchResult]:
|
| 197 |
+
"""
|
| 198 |
+
Filtre et classe les résultats par pertinence.
|
| 199 |
+
|
| 200 |
+
Args:
|
| 201 |
+
results: Résultats bruts de la recherche
|
| 202 |
+
topic: Sujet de recherche original
|
| 203 |
+
keywords: Mots-clés de recherche
|
| 204 |
+
|
| 205 |
+
Returns:
|
| 206 |
+
Résultats filtrés et classés
|
| 207 |
+
"""
|
| 208 |
+
if not results:
|
| 209 |
+
return []
|
| 210 |
+
|
| 211 |
+
# Mots-clés pour le scoring (topic + keywords)
|
| 212 |
+
scoring_terms = [topic.lower()] + [kw.lower() for kw in keywords]
|
| 213 |
+
|
| 214 |
+
# Calcul du score de pertinence pour chaque résultat
|
| 215 |
+
scored_results = []
|
| 216 |
+
for result in results:
|
| 217 |
+
score = self._calculate_relevance_score(result, scoring_terms)
|
| 218 |
+
|
| 219 |
+
# Mise à jour du score dans le résultat
|
| 220 |
+
result.score = score
|
| 221 |
+
scored_results.append(result)
|
| 222 |
+
|
| 223 |
+
# Tri par score décroissant
|
| 224 |
+
scored_results.sort(key=lambda x: x.score or 0, reverse=True)
|
| 225 |
+
|
| 226 |
+
# Filtrage des résultats de faible qualité
|
| 227 |
+
min_score = 0.1 # Score minimum acceptable
|
| 228 |
+
filtered_results = [r for r in scored_results if (r.score or 0) >= min_score]
|
| 229 |
+
|
| 230 |
+
self.logger.info(f"Filtrage: {len(filtered_results)} résultats conservés sur {len(results)}")
|
| 231 |
+
|
| 232 |
+
return filtered_results
|
| 233 |
+
|
| 234 |
+
# TODO: improve the score according to the source site
|
| 235 |
+
# EX: if result.url.endswith(".edu") or result.url.endswith(".gov"):
|
| 236 |
+
# score += 0.1
|
| 237 |
+
def _calculate_relevance_score(
|
| 238 |
+
self,
|
| 239 |
+
result: SearchResult,
|
| 240 |
+
scoring_terms: List[str]
|
| 241 |
+
) -> float:
|
| 242 |
+
"""
|
| 243 |
+
Calcule un score de pertinence pour un résultat.
|
| 244 |
+
|
| 245 |
+
Args:
|
| 246 |
+
result: Résultat à scorer
|
| 247 |
+
scoring_terms: Termes de référence pour le scoring
|
| 248 |
+
|
| 249 |
+
Returns:
|
| 250 |
+
Score entre 0 et 1
|
| 251 |
+
"""
|
| 252 |
+
score = 0.0
|
| 253 |
+
|
| 254 |
+
# Texte à analyser (titre + snippet)
|
| 255 |
+
text_to_analyze = f"{result.title} {result.snippet}".lower()
|
| 256 |
+
|
| 257 |
+
# Score basé sur la présence des termes de recherche
|
| 258 |
+
term_matches = 0
|
| 259 |
+
for term in scoring_terms:
|
| 260 |
+
if term in text_to_analyze:
|
| 261 |
+
term_matches += 1
|
| 262 |
+
|
| 263 |
+
if scoring_terms:
|
| 264 |
+
term_score = term_matches / len(scoring_terms)
|
| 265 |
+
score += term_score * 0.6 # 60% du score
|
| 266 |
+
|
| 267 |
+
# Bonus pour les titres pertinents
|
| 268 |
+
title_matches = sum(1 for term in scoring_terms if term in result.title.lower())
|
| 269 |
+
if scoring_terms:
|
| 270 |
+
title_score = title_matches / len(scoring_terms)
|
| 271 |
+
score += title_score * 0.3 # 30% du score
|
| 272 |
+
|
| 273 |
+
# Bonus pour les sources récentes (si date disponible)
|
| 274 |
+
if result.published_date:
|
| 275 |
+
days_old = (datetime.now() - result.published_date.replace(tzinfo=None)).days
|
| 276 |
+
if days_old <= 365: # Moins d'un an
|
| 277 |
+
recency_score = max(0, 1 - (days_old / 365))
|
| 278 |
+
score += recency_score * 0.1 # 10% du score
|
| 279 |
+
|
| 280 |
+
# Score existant de l'API (si disponible)
|
| 281 |
+
if result.score and result.score > 0:
|
| 282 |
+
score = (score + result.score) / 2 # Moyenne avec le score API
|
| 283 |
+
|
| 284 |
+
return min(score, 1.0) # Cap à 1.0
|
| 285 |
+
|
| 286 |
+
async def extract_keywords_with_llm(self, topic: str) -> List[str]:
    """Ask the LLM service for search keywords describing a topic.

    Args:
        topic: Research topic.

    Returns:
        The keywords parsed from the LLM answer. On any failure the error is
        logged and the result of ``_extract_keywords_fallback`` is returned
        instead, so this method does not raise.
    """
    try:
        self.logger.info(f"Extraction de mots-clés pour: '{topic}'")

        llm_answer = await self.llm_service.generate_completion(
            prompt=KEYWORD_EXTRACTION_PROMPT.format(topic=topic),
            system_prompt="Tu es un expert en analyse sémantique spécialisé dans l'extraction de mots-clés pour la recherche web.",
            temperature=0.3,  # low temperature for more consistent output
            max_tokens=150,   # a keyword list is short
        )

        extracted = self._parse_keywords_response(llm_answer)
        self.logger.info(f"Mots-clés extraits: {extracted}")
        return extracted

    except LLMError as llm_error:
        self.logger.error(f"Erreur LLM lors de l'extraction de mots-clés: {llm_error}")
        # Degrade to the simple topic-based extraction.
        return self._extract_keywords_fallback(topic)
    except Exception as unexpected:
        self.logger.error(f"Erreur inattendue lors de l'extraction de mots-clés: {unexpected}")
        return self._extract_keywords_fallback(topic)
|
| 324 |
+
|
| 325 |
+
def _parse_keywords_response(self, response: str) -> List[str]:
|
| 326 |
+
"""
|
| 327 |
+
Parse la réponse du LLM pour extraire les mots-clés.
|
| 328 |
+
|
| 329 |
+
Args:
|
| 330 |
+
response: Réponse brute du LLM
|
| 331 |
+
|
| 332 |
+
Returns:
|
| 333 |
+
Liste de mots-clés nettoyés
|
| 334 |
+
"""
|
| 335 |
+
# Nettoyer la réponse
|
| 336 |
+
response = response.strip()
|
| 337 |
+
|
| 338 |
+
# Supprimer les préfixes potentiels
|
| 339 |
+
for prefix in ["mots-clés:", "keywords:", "réponse:", "voici:", "liste:"]:
|
| 340 |
+
if response.lower().startswith(prefix):
|
| 341 |
+
response = response[len(prefix):].strip()
|
| 342 |
+
|
| 343 |
+
# Séparer par virgules
|
| 344 |
+
keywords = [kw.strip() for kw in response.split(",")]
|
| 345 |
+
|
| 346 |
+
# Nettoyer et filtrer
|
| 347 |
+
cleaned_keywords = []
|
| 348 |
+
for kw in keywords:
|
| 349 |
+
# Supprimer les numéros et tirets
|
| 350 |
+
kw = kw.strip("0123456789.-\t\n ")
|
| 351 |
+
|
| 352 |
+
# Filtrer les mots trop courts ou vides
|
| 353 |
+
if len(kw) >= 2 and kw.lower() not in ["et", "ou", "le", "la", "les", "de", "du", "des"]:
|
| 354 |
+
cleaned_keywords.append(kw)
|
| 355 |
+
|
| 356 |
+
# Limiter le nombre de mots-clés
|
| 357 |
+
return cleaned_keywords[:7]
|
| 358 |
+
|
| 359 |
+
def _extract_keywords_fallback(self, topic: str) -> List[str]:
|
| 360 |
+
"""
|
| 361 |
+
Méthode de fallback pour extraire des mots-clés simples.
|
| 362 |
+
|
| 363 |
+
Args:
|
| 364 |
+
topic: Sujet de recherche
|
| 365 |
+
|
| 366 |
+
Returns:
|
| 367 |
+
Liste de mots-clés basiques
|
| 368 |
+
"""
|
| 369 |
+
self.logger.info("Utilisation du fallback pour l'extraction de mots-clés")
|
| 370 |
+
|
| 371 |
+
# Mots communs à ignorer
|
| 372 |
+
stop_words = {
|
| 373 |
+
"le", "la", "les", "de", "du", "des", "et", "ou", "sur", "dans",
|
| 374 |
+
"avec", "pour", "par", "en", "à", "un", "une", "ce", "cette", "ces"
|
| 375 |
+
}
|
| 376 |
+
|
| 377 |
+
# Extraction simple basée sur les mots significatifs
|
| 378 |
+
words = topic.lower().split()
|
| 379 |
+
keywords = [word for word in words if len(word) >= 3 and word not in stop_words]
|
| 380 |
+
|
| 381 |
+
return keywords[:5] # Limiter à 5 mots-clés max
|
| 382 |
+
|
| 383 |
+
async def search_with_fallback(
    self,
    query: str,
    max_results: int = 5
) -> List[SearchResult]:
    """Convenience wrapper: search for a plain query string.

    Wraps ``query`` in a ``ResearchQuery`` and delegates to ``process``.
    NOTE(review): despite the name, this method contains no fallback logic
    of its own; any fallback would have to happen inside ``process`` or the
    search manager -- confirm.

    Args:
        query: Plain search string, used as the research topic.
        max_results: Number of results wanted.

    Returns:
        The ``results`` list of the ``ResearchOutput`` produced by
        ``process``.
    """
    wrapped = ResearchQuery(topic=query, max_results=max_results)
    return (await self.process(wrapped)).results
|
| 405 |
+
|
| 406 |
+
def get_search_stats(self) -> Dict[str, Any]:
    """Return the agent status enriched with search configuration.

    Returns:
        The mapping from ``get_status()`` plus two keys, which override any
        same-named status entry: ``available_apis`` (from the search
        manager) and ``search_params`` (the agent's default parameters, the
        same dict object, not a copy).
    """
    stats = dict(self.get_status())
    stats["available_apis"] = self.search_manager.get_available_apis()
    stats["search_params"] = self.default_search_params
    return stats
|
| 420 |
+
|
| 421 |
+
|
| 422 |
+
# Fonctions utilitaires pour la sauvegarde
|
| 423 |
+
def save_research_output(output: ResearchOutput, filename: Optional[str] = None) -> str:
    """Write a ResearchOutput to a JSON file.

    Args:
        output: Research output to save; it is serialised with
            ``output.model_dump(mode='json')``.
        filename: Target path. When omitted or empty, a name of the form
            ``research_output_<topic>_<YYYYmmdd_HHMMSS>.json`` is generated
            from the sanitised topic (at most 30 characters) and written to
            the current directory.

    Returns:
        The path of the file that was written.

    Raises:
        Exception: If serialisation or writing fails; the original error is
            attached as ``__cause__``.
    """
    import json
    from datetime import datetime

    if not filename:
        # Keep only characters that are safe in a file name.
        clean_topic = "".join(
            c for c in output.query.topic if c.isalnum() or c in (' ', '-', '_')
        ).rstrip()
        clean_topic = clean_topic.replace(' ', '_')[:30]
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        filename = f"research_output_{clean_topic}_{timestamp}.json"

    try:
        # mode='json' converts dates and other rich types to JSON-safe values.
        output_dict = output.model_dump(mode='json')

        with open(filename, 'w', encoding='utf-8') as f:
            json.dump(output_dict, f, indent=2, ensure_ascii=False)
    except Exception as e:
        # Chain explicitly so the root cause stays visible in tracebacks.
        raise Exception(f"Erreur lors de la sauvegarde: {e}") from e

    return filename
|
| 456 |
+
|
| 457 |
+
|
| 458 |
+
def load_research_output(filename: str) -> ResearchOutput:
    """Load a ResearchOutput from a JSON file.

    Args:
        filename: Path of the JSON file to read.

    Returns:
        The ``ResearchOutput`` rebuilt from the file's top-level object.

    Raises:
        Exception: If the file cannot be read, is not valid JSON, or does
            not match the model; the original error is attached as
            ``__cause__``.
    """
    import json

    try:
        with open(filename, 'r', encoding='utf-8') as f:
            data = json.load(f)

        # The JSON object's keys are passed as constructor keyword arguments.
        return ResearchOutput(**data)

    except Exception as e:
        # Chain explicitly so the root cause stays visible in tracebacks.
        raise Exception(f"Erreur lors du chargement: {e}") from e
|
| 479 |
+
|
| 480 |
+
|
| 481 |
+
# Ecrire un main pour tester ici la classe
|
| 482 |
+
if __name__ == "__main__":
    # Manual smoke tests, run only when this file is executed directly.
    # Pass "--test-save" to exercise save/load alone, without any search.
    import asyncio
    import json
    import sys
    from datetime import datetime
    from src.core.logging import setup_logger

    logger = setup_logger("researcher_agent_test")

    async def main():
        """End-to-end run: keyword extraction, search, save and reload."""
        agent = ResearcherAgent()

        # Step 1: automatic keyword extraction through the LLM.
        topic = "impact de l'intelligence artificielle sur le marché de l'emploi"
        logger.info(f"=== Test d'extraction de mots-clés pour: {topic} ===")

        try:
            keywords = await agent.extract_keywords_with_llm(topic)
            logger.info(f"Mots-clés extraits automatiquement: {keywords}")

            # Step 2: build the query from the extracted keywords.
            query = ResearchQuery(
                topic=topic,
                keywords=keywords,
                max_results=2,
                search_depth="basic",
            )

            if not agent.validate_input(query):
                logger.error("Requête invalide.")
                return

            logger.info("=== Début de la recherche avec mots-clés automatiques ===")
            research = await agent.process(query)
            logger.info(f"Résultats obtenus: {len(research.results)}")

            # Step 3: print every hit.
            for rank, hit in enumerate(research.results, 1):
                logger.info(f"{rank}. {hit.title}")
                logger.info(f" URL: {hit.url}")
                logger.info(f" Score: {hit.score:.3f}")
                logger.info(f" Snippet: {hit.snippet[:100]}...")
                logger.info("")

            # Step 4: persist the output, then reload it to check integrity.
            logger.info("=== Sauvegarde du ResearchOutput ===")

            try:
                saved_path = save_research_output(research)
                logger.info(f"✅ ResearchOutput sauvegardé dans: {saved_path}")

                logger.info("📄 Contenu sauvegardé:")
                logger.info(f" • Sujet: {research.query.topic}")
                logger.info(f" • Mots-clés: {research.query.keywords}")
                logger.info(f" • Nombre de résultats: {len(research.results)}")
                logger.info(f" • Temps de recherche: {research.search_time:.2f}s")
                logger.info(f" • Moteur utilisé: {research.search_engine}")
                logger.info(f" • Timestamp: {research.timestamp}")

                logger.info("=== Test de chargement ===")
                reloaded = load_research_output(saved_path)
                logger.info("✅ ResearchOutput rechargé avec succès")
                logger.info(f" • Vérification: {len(reloaded.results)} résultats chargés")

                # Only the topic is compared as an integrity check.
                if reloaded.query.topic == research.query.topic:
                    logger.info("✅ Intégrité des données vérifiée")
                else:
                    logger.error("❌ Erreur d'intégrité des données")

                logger.info("\n📋 EXEMPLE DE FORMAT JSON SAUVEGARDÉ:")
                logger.info("-" * 50)

                # Compact preview: first 3 keywords, first 2 results.
                preview_results = [
                    {
                        "title": hit.title,
                        "url": str(hit.url),
                        "snippet": hit.snippet[:100] + "...",
                        "score": hit.score,
                    }
                    for hit in research.results[:2]
                ]
                preview = {
                    "query": {
                        "topic": research.query.topic,
                        "keywords": research.query.keywords[:3],
                        "max_results": research.query.max_results,
                        "search_depth": research.query.search_depth,
                    },
                    "results": preview_results,
                    "total_found": research.total_found,
                    "search_time": research.search_time,
                    "search_engine": research.search_engine,
                    "timestamp": research.timestamp.isoformat(),
                }

                print(json.dumps(preview, indent=2, ensure_ascii=False))

            except Exception as save_error:
                logger.error(f"❌ Erreur lors de la sauvegarde: {save_error}")

        except Exception as e:
            logger.error(f"Erreur lors du test: {e}")

    async def test_save_load():
        """Save/load round trip on a hand-built ResearchOutput."""
        logger.info("=== TEST SAUVEGARDE/CHARGEMENT SEUL ===")

        # Two fake results that differ only by index and score.
        fake_results = [
            SearchResult(
                title=f"Test Article {index}",
                url=f"https://example.com/test{index}",
                snippet=f"Ceci est un test de snippet pour l'article {index}",
                score=fake_score,
            )
            for index, fake_score in ((1, 0.85), (2, 0.78))
        ]

        fake_query = ResearchQuery(
            topic="test sauvegarde",
            keywords=["test", "sauvegarde", "json"],
            max_results=2,
        )

        fake_output = ResearchOutput(
            query=fake_query,
            results=fake_results,
            total_found=2,
            search_time=1.5,
            search_engine="test",
            timestamp=datetime.now(),
        )

        try:
            saved_path = save_research_output(fake_output, "test_research_output.json")
            logger.info(f"✅ Test sauvegarde réussi: {saved_path}")

            reloaded = load_research_output(saved_path)
            logger.info(f"✅ Test chargement réussi: {len(reloaded.results)} résultats")

        except Exception as e:
            logger.error(f"❌ Test sauvegarde/chargement échoué: {e}")

    # Pick the scenario from the first command-line argument.
    save_only = len(sys.argv) > 1 and sys.argv[1] == "--test-save"
    asyncio.run(test_save_load() if save_only else main())
|
src/agents/summarizer_agent.py
ADDED
|
@@ -0,0 +1,669 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Agent Summarizer - Analyse et résumé de documents.
|
| 3 |
+
Crée des résumés structurés et des analyses approfondies des documents extraits.
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
import asyncio
|
| 7 |
+
from typing import List, Dict, Any, Optional
|
| 8 |
+
from datetime import datetime
|
| 9 |
+
import hashlib
|
| 10 |
+
|
| 11 |
+
from src.agents.base_agent import BaseAgent
|
| 12 |
+
from src.models.document_models import Document, DocumentSummary, SummarizationOutput, KeyPoint, Citation
|
| 13 |
+
from src.models.state_models import AgentType
|
| 14 |
+
from src.services.llm_service import LLMManager, LLMError
|
| 15 |
+
from src.services.text_chunking import ChunkingManager, TextChunk
|
| 16 |
+
from src.core.logging import setup_logger
|
| 17 |
+
from config.prompts import SUMMARIZER_PROMPTS, SYSTEM_PROMPTS
|
| 18 |
+
import hashlib
|
| 19 |
+
import re
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
class SummarizationInput:
    """Input bundle for the Summarizer agent.

    Holds the documents to summarise plus the raw options dict, and exposes
    each known option as an attribute with its default applied.
    """

    def __init__(
        self,
        documents: List[Document],
        summary_options: Optional[Dict[str, Any]] = None
    ):
        """Store the documents and resolve every option to a value.

        Args:
            documents: Documents to summarise.
            summary_options: Optional overrides; a missing or empty value
                means all defaults apply.
        """
        self.documents = documents
        self.summary_options = summary_options or {}

        # Option name -> default used when the caller did not provide it.
        defaults = {
            'include_sentiment': True,
            'include_citations': True,
            'max_key_points': 5,
            'detailed_analysis': True,
            'chunk_large_docs': True,
            'max_doc_size': 8000,  # in characters
        }
        for option, default in defaults.items():
            setattr(self, option, self.summary_options.get(option, default))
|
| 40 |
+
|
| 41 |
+
|
| 42 |
+
class SummarizerAgent(BaseAgent):
|
| 43 |
+
"""
|
| 44 |
+
Agent responsable de l'analyse et du résumé de documents.
|
| 45 |
+
|
| 46 |
+
Fonctionnalités:
|
| 47 |
+
- Résumé exécutif et détaillé
|
| 48 |
+
- Extraction de points clés et arguments
|
| 49 |
+
- Analyse de sentiment et biais
|
| 50 |
+
- Gestion des documents longs via chunking
|
| 51 |
+
- Citations et statistiques importantes
|
| 52 |
+
- Évaluation de crédibilité
|
| 53 |
+
"""
|
| 54 |
+
|
| 55 |
+
def __init__(
    self,
    max_retries: int = 2,
    timeout: float = 600.0,  # ten minutes, to allow for several documents
):
    """Create the summarizer agent and its helper services.

    Args:
        max_retries: Retry budget forwarded to the base agent.
        timeout: Timeout in seconds forwarded to the base agent.
    """
    # NOTE(review): the agent registers as AgentType.READER although it is
    # named "summarizer" -- confirm this is the intended enum member.
    base_settings = dict(
        agent_type=AgentType.READER,
        name="summarizer",
        max_retries=max_retries,
        timeout=timeout,
    )
    super().__init__(**base_settings)

    # Helper services.
    self.llm_manager = LLMManager()
    self.chunking_manager = ChunkingManager()

    # Tuning knobs.
    self.max_concurrent_summaries = 3    # upper bound on parallel summaries
    self.chunk_overlap_threshold = 6000  # chunking threshold, in characters
|
| 74 |
+
|
| 75 |
+
def validate_input(self, input_data: SummarizationInput) -> bool:
    """Check that there is something to summarise.

    Args:
        input_data: Input whose ``documents`` list is inspected.

    Returns:
        ``False`` (after logging an error) when no document is given, when
        more than 20 are given, or when none has non-blank ``content``;
        ``True`` otherwise.
    """
    documents = input_data.documents

    if not documents:
        self.logger.error("Aucun document fourni pour la summarization")
        return False

    if len(documents) > 20:  # reasonable upper bound
        self.logger.error(f"Trop de documents ({len(input_data.documents)}), maximum 20")
        return False

    # At least one document must carry actual text.
    with_content = list(
        filter(lambda doc: doc.content and doc.content.strip(), documents)
    )
    if not with_content:
        self.logger.error("Aucun document avec contenu valide")
        return False

    return True
|
| 100 |
+
|
| 101 |
+
async def process(self, input_data: SummarizationInput) -> SummarizationOutput:
    """Summarize every usable document and aggregate the results.

    Documents whose ``content`` is empty or whitespace-only are skipped.
    The remaining ones are summarized, a cross-document analysis is run
    on the summaries, and everything is packed into the output object.

    Args:
        input_data: Request carrying the documents and the options.

    Returns:
        The summaries plus aggregate metrics. ``total_documents`` counts
        every submitted document, including the skipped ones.

    Raises:
        Exception: Any error raised while summarizing or aggregating is
            logged and re-raised unchanged.
    """
    started = datetime.now()
    self.logger.info(f"Début summarization de {len(input_data.documents)} documents")

    # Keep only the documents that actually have text.
    usable_docs = []
    for doc in input_data.documents:
        if doc.content and doc.content.strip():
            usable_docs.append(doc)
    self.logger.info(f"Documents valides à traiter: {len(usable_docs)}")

    try:
        summaries = await self._summarize_all_documents(usable_docs, input_data)
        analysis = await self._perform_global_analysis(summaries)

        # Metrics
        elapsed = (datetime.now() - started).total_seconds()
        mean_credibility = self._calculate_average_credibility(summaries)

        output = SummarizationOutput(
            summaries=summaries,
            total_documents=len(input_data.documents),
            total_processing_time=elapsed,
            average_credibility=mean_credibility,
            common_themes=analysis.get('common_themes', []),
            consensus_points=analysis.get('consensus_points', []),
            conflicting_views=analysis.get('conflicting_views', [])
        )

        self.logger.info(
            f"Summarization terminée: {len(summaries)} résumés créés en {elapsed:.2f}s"
        )
        return output

    except Exception as exc:
        self.logger.error(f"Erreur lors de la summarization: {str(exc)}")
        raise
|
| 149 |
+
|
| 150 |
+
async def _summarize_all_documents(
    self,
    documents: List[Document],
    input_data: SummarizationInput
) -> List[DocumentSummary]:
    """Summarize all documents concurrently, bounded by a semaphore.

    At most ``self.max_concurrent_summaries`` summaries run at once. A
    document whose summarization raises is replaced by the result of
    ``_create_error_summary`` instead of aborting the batch.

    Args:
        documents: Documents to summarize.
        input_data: Original request, passed through to each summary.

    Returns:
        The gathered results that are ``DocumentSummary`` instances; any
        other result is logged at error level and left out.
    """
    limiter = asyncio.Semaphore(self.max_concurrent_summaries)

    async def guarded_summary(doc: Document) -> DocumentSummary:
        async with limiter:
            try:
                return await self._summarize_document(doc, input_data)
            except Exception as exc:
                self.logger.error(f"Erreur résumé document {doc.title}: {exc}")
                # Fall back to a minimal placeholder summary.
                return self._create_error_summary(doc, str(exc))

    # Start every summary; the semaphore does the throttling.
    pending = [guarded_summary(doc) for doc in documents]
    outcomes = await asyncio.gather(*pending, return_exceptions=True)

    kept = []
    for outcome in outcomes:
        if not isinstance(outcome, DocumentSummary):
            self.logger.error(f"Résumé invalide: {outcome}")
            continue
        kept.append(outcome)

    return kept
|
| 180 |
+
|
| 181 |
+
async def _summarize_document(self, document: Document, input_data: SummarizationInput) -> DocumentSummary:
    """Summarize one document and stamp the result with id and timing.

    The chunking path is taken when ``input_data.chunk_large_docs`` is
    truthy and the content is longer than ``self.chunk_overlap_threshold``
    characters; otherwise the standard path is used.

    Args:
        document: Document to summarize.
        input_data: Request options.

    Returns:
        The summary with ``document_id``, ``processing_time`` (seconds)
        and ``processed_at`` filled in.
    """
    started = datetime.now()
    doc_id = self._generate_document_id(document)

    self.logger.info(f"Résumé document: {document.title} ({len(document.content)} caractères)")

    # Pick the summarization path.
    needs_chunking = (
        input_data.chunk_large_docs
        and len(document.content) > self.chunk_overlap_threshold
    )
    if needs_chunking:
        summarize = self._summarize_large_document
    else:
        summarize = self._summarize_standard_document
    summary = await summarize(document, input_data)

    # Finalize the summary.
    elapsed = (datetime.now() - started).total_seconds()
    summary.document_id = doc_id
    summary.processing_time = elapsed
    summary.processed_at = datetime.now()

    return summary
|
| 202 |
+
|
| 203 |
+
async def _summarize_standard_document(
    self,
    document: Document,
    input_data: SummarizationInput
) -> DocumentSummary:
    """Summarize a document in one pass, without chunking.

    An executive-summary prompt is always issued. A detailed-analysis
    prompt and a sentiment-analysis prompt are added when the matching
    flags on ``input_data`` are truthy. All prompts run concurrently.

    Args:
        document: Document whose title, author, url and content feed the
            prompt templates.
        input_data: Request options (``detailed_analysis``,
            ``include_sentiment``, ``max_key_points``).

    Returns:
        A ``DocumentSummary`` with an empty ``document_id``. Fields whose
        task was not requested, or whose task surfaced as an exception,
        keep their defaults (empty strings, empty list, ``None``).
    """

    # Values substituted into the prompt templates
    context = {
        'title': document.title,
        'author': document.author or "Non spécifié",
        'url': str(document.url),
        'content': document.content
    }

    # Coroutines to run concurrently
    tasks = []

    # 1. Executive summary (always requested)
    exec_prompt = SUMMARIZER_PROMPTS['executive_summary'].format(**context)
    tasks.append(self._get_llm_response(exec_prompt, "executive_summary"))

    # 2. Detailed analysis
    if input_data.detailed_analysis:
        detailed_prompt = SUMMARIZER_PROMPTS['detailed_analysis'].format(**context)
        tasks.append(self._get_llm_response(detailed_prompt, "detailed_analysis"))

    # 3. Sentiment analysis (optional)
    if input_data.include_sentiment:
        sentiment_prompt = SUMMARIZER_PROMPTS['sentiment_analysis'].format(**context)
        tasks.append(self._get_llm_response(sentiment_prompt, "sentiment_analysis"))

    # Run the tasks; exceptions come back as result values
    results = await asyncio.gather(*tasks, return_exceptions=True)

    # Parse the results
    executive_summary = ""
    detailed_summary = ""
    key_points = []
    sentiment = None
    credibility_score = None

    for i, result in enumerate(results):
        if isinstance(result, Exception):
            self.logger.error(f"Erreur tâche {i}: {result}")
            continue

        # Each result is a (task_type, text) pair, so dispatch does not
        # depend on the position of the task in the list.
        task_type, content = result

        if task_type == "executive_summary":
            executive_summary = content
        elif task_type == "detailed_analysis":
            # Parse the detailed analysis; fall back to the raw text
            parsed = self._parse_detailed_analysis(content)
            detailed_summary = parsed.get('summary', content)
            key_points = parsed.get('key_points', [])
        elif task_type == "sentiment_analysis":
            # Parse the sentiment analysis
            parsed = self._parse_sentiment_analysis(content)
            sentiment = parsed.get('sentiment')
            credibility_score = parsed.get('credibility_score')

    # Build the summary
    summary = DocumentSummary(
        document_id="",  # Left empty here
        title=document.title,
        url=document.url,
        executive_summary=executive_summary,
        detailed_summary=detailed_summary,
        key_points=key_points[:input_data.max_key_points],
        sentiment=sentiment,
        credibility_score=credibility_score
    )

    return summary
|
| 278 |
+
|
| 279 |
+
async def _summarize_large_document(
    self,
    document: Document,
    input_data: SummarizationInput
) -> DocumentSummary:
    """Summarize a long document by chunking it first.

    The content is split by the chunking manager, every chunk is
    summarized, and the partial summaries are merged into one summary.

    Args:
        document: Document to summarize.
        input_data: Request options. Not read by this method.

    Returns:
        The summary produced by ``_synthesize_chunk_summaries``.
    """
    self.logger.info(f"Chunking document long: {document.title}")

    # Split the document
    pieces = self.chunking_manager.chunk_document(
        document.content,
        strategy="default",
        preserve_structure=True
    )
    self.logger.info(f"Document découpé en {len(pieces)} chunks")

    # Summarize each piece, then merge the partial summaries
    partials = await self._summarize_chunks(pieces, document)
    return await self._synthesize_chunk_summaries(partials, document)
|
| 303 |
+
|
| 304 |
+
async def _summarize_chunks(self, chunks: List[TextChunk], document: Document) -> List[str]:
    """Summarize every chunk concurrently.

    Args:
        chunks: Chunks of the document; ``chunk_id``, ``total_chunks`` and
            ``content`` are read from each one.
        document: Source document; only its title is used in the prompt.

    Returns:
        One text per chunk, in chunk order. A chunk whose LLM call fails
        yields a placeholder error string instead of raising.
    """
    async def one_chunk(chunk: TextChunk) -> str:
        prompt = SUMMARIZER_PROMPTS['chunked_summary'].format(
            title=document.title,
            chunk_index=chunk.chunk_id,
            total_chunks=chunk.total_chunks,
            chunk_content=chunk.content,
        )
        try:
            return await self.llm_manager.get_completion(
                prompt,
                system_prompt=SYSTEM_PROMPTS['summarizer']
            )
        except Exception as exc:
            self.logger.error(f"Erreur résumé chunk {chunk.chunk_id}: {exc}")
            return f"Erreur résumé chunk {chunk.chunk_id}"

    # Fan out over all chunks at once.
    pending = [one_chunk(chunk) for chunk in chunks]
    return await asyncio.gather(*pending)
|
| 327 |
+
|
| 328 |
+
async def _synthesize_chunk_summaries(
    self,
    chunk_summaries: List[str],
    document: Document
) -> DocumentSummary:
    """Merge per-chunk summaries into a single document summary.

    Args:
        chunk_summaries: Partial summaries, in chunk order.
        document: Source document (title and url are used).

    Returns:
        A ``DocumentSummary`` with an empty ``document_id``. If the LLM
        call or the parsing raises, the result of
        ``_create_basic_summary_from_chunks`` is returned instead.
    """
    # Number the partial summaries and join them into one text.
    numbered_parts = []
    for position, partial in enumerate(chunk_summaries, start=1):
        numbered_parts.append(f"Partie {position}: {partial}")
    combined = "\n\n".join(numbered_parts)

    # Final synthesis prompt
    synthesis_prompt = SUMMARIZER_PROMPTS['synthesis'].format(
        partial_summaries=combined,
        title=document.title,
        url=str(document.url),
    )

    try:
        raw_synthesis = await self.llm_manager.get_completion(
            synthesis_prompt,
            system_prompt=SYSTEM_PROMPTS['summarizer']
        )
        fields = self._parse_synthesis_result(raw_synthesis)

        return DocumentSummary(
            document_id="",
            title=document.title,
            url=document.url,
            executive_summary=fields.get('executive_summary', ''),
            detailed_summary=fields.get('detailed_summary', ''),
            key_points=fields.get('key_points', []),
            sentiment=fields.get('sentiment'),
            credibility_score=fields.get('credibility_score')
        )

    except Exception as exc:
        self.logger.error(f"Erreur synthèse finale: {exc}")
        # Fallback: build a basic summary straight from the chunk texts.
        return self._create_basic_summary_from_chunks(chunk_summaries, document)
|
| 376 |
+
|
| 377 |
+
async def _get_llm_response(self, prompt: str, task_type: str) -> tuple:
    """Run one prompt through the LLM and tag the answer with its task.

    Args:
        prompt: Fully formatted prompt text.
        task_type: Label returned alongside the answer so that callers can
            tell concurrent results apart.

    Returns:
        ``(task_type, answer)``. When the call fails the error is logged
        and the second element is an ``"Erreur: ..."`` string; nothing is
        raised.
    """
    try:
        answer = await self.llm_manager.get_completion(
            prompt,
            system_prompt=SYSTEM_PROMPTS['summarizer'],
            temperature=0.3,
            max_tokens=2000
        )
    except Exception as exc:
        self.logger.error(f"Erreur LLM pour {task_type}: {exc}")
        return task_type, f"Erreur: {str(exc)}"
    return task_type, answer
|
| 390 |
+
|
| 391 |
+
def _parse_detailed_analysis(self, content: str) -> Dict[str, Any]:
|
| 392 |
+
"""Parse l'analyse détaillée pour extraire les composants."""
|
| 393 |
+
# Implémentation simplifiée - à améliorer selon le format de réponse
|
| 394 |
+
result = {'summary': content, 'key_points': []}
|
| 395 |
+
|
| 396 |
+
# Chercher les points clés (format: - Point clé)
|
| 397 |
+
import re
|
| 398 |
+
key_point_pattern = r'^[-•]\s*(.+)$'
|
| 399 |
+
lines = content.split('\n')
|
| 400 |
+
|
| 401 |
+
current_key_points = []
|
| 402 |
+
for line in lines:
|
| 403 |
+
match = re.match(key_point_pattern, line.strip())
|
| 404 |
+
if match:
|
| 405 |
+
point_text = match.group(1).strip()
|
| 406 |
+
if len(point_text) > 10: # Filtrer les points trop courts
|
| 407 |
+
key_point = KeyPoint(
|
| 408 |
+
title=point_text[:50] + "..." if len(point_text) > 50 else point_text,
|
| 409 |
+
content=point_text,
|
| 410 |
+
importance=0.8, # Score par défaut
|
| 411 |
+
category="general"
|
| 412 |
+
)
|
| 413 |
+
current_key_points.append(key_point)
|
| 414 |
+
|
| 415 |
+
result['key_points'] = current_key_points
|
| 416 |
+
return result
|
| 417 |
+
|
| 418 |
+
def _parse_sentiment_analysis(self, content: str) -> Dict[str, Any]:
|
| 419 |
+
"""Parse l'analyse de sentiment."""
|
| 420 |
+
result = {}
|
| 421 |
+
|
| 422 |
+
# Extraction simplifiée
|
| 423 |
+
content_lower = content.lower()
|
| 424 |
+
|
| 425 |
+
if 'positif' in content_lower:
|
| 426 |
+
result['sentiment'] = 'positif'
|
| 427 |
+
elif 'négatif' in content_lower:
|
| 428 |
+
result['sentiment'] = 'négatif'
|
| 429 |
+
else:
|
| 430 |
+
result['sentiment'] = 'neutre'
|
| 431 |
+
|
| 432 |
+
# Chercher un score de crédibilité
|
| 433 |
+
import re
|
| 434 |
+
|
| 435 |
+
# Chercher un pattern comme "Crédibilité: 0.8" ou "0.8"
|
| 436 |
+
credibility_pattern = r'crédibilité\s*:?\s*(\d+(?:\.\d+)?)|(\d+(?:\.\d+)?)\s*\/\s*[1510]|(\d+(?:\.\d+)?)\s*%'
|
| 437 |
+
match = re.search(credibility_pattern, content_lower)
|
| 438 |
+
if match:
|
| 439 |
+
score = float(match.group(1) or match.group(2) or match.group(3))
|
| 440 |
+
if score > 1: # Si en pourcentage
|
| 441 |
+
score = score / 100
|
| 442 |
+
result['credibility_score'] = min(max(score, 0.0), 1.0)
|
| 443 |
+
else:
|
| 444 |
+
result['credibility_score'] = 0.5 # Valeur par défaut
|
| 445 |
+
|
| 446 |
+
return result
|
| 447 |
+
|
| 448 |
+
return result
|
| 449 |
+
|
| 450 |
+
def _parse_synthesis_result(self, content: str) -> Dict[str, Any]:
|
| 451 |
+
"""Parse le résultat de synthèse."""
|
| 452 |
+
# Version simplifiée - à améliorer
|
| 453 |
+
return {
|
| 454 |
+
'executive_summary': content[:200] + "..." if len(content) > 200 else content,
|
| 455 |
+
'detailed_summary': content,
|
| 456 |
+
'key_points': [],
|
| 457 |
+
'sentiment': 'neutre',
|
| 458 |
+
'credibility_score': 0.7
|
| 459 |
+
}
|
| 460 |
+
|
| 461 |
+
def _create_basic_summary_from_chunks(
    self,
    chunk_summaries: List[str],
    document: Document
) -> DocumentSummary:
    """Build a basic summary directly from the per-chunk summaries.

    The chunk texts are joined with single spaces. The joined text is the
    detailed summary, and its first 200 characters (plus "..." when it is
    longer) form the executive summary.

    Args:
        chunk_summaries: Partial summaries, in chunk order.
        document: Source document (title and url are copied over).

    Returns:
        A ``DocumentSummary`` with an empty ``document_id``, no key
        points, sentiment "neutre" and a credibility score of 0.5.
    """
    merged = " ".join(chunk_summaries)
    if len(merged) > 200:
        teaser = merged[:200] + "..."
    else:
        teaser = merged

    return DocumentSummary(
        document_id="",
        title=document.title,
        url=document.url,
        executive_summary=teaser,
        detailed_summary=merged,
        key_points=[],
        sentiment="neutre",
        credibility_score=0.5
    )
|
| 479 |
+
|
| 480 |
+
def _create_error_summary(self, document: Document, error: str) -> DocumentSummary:
    """Build a placeholder summary for a document that failed.

    Args:
        document: Document whose summarization failed.
        error: Error text embedded in both summary fields.

    Returns:
        A ``DocumentSummary`` carrying the document id, title and url, the
        error text in both summaries, no key points, and ``None`` for
        sentiment and credibility score.
    """
    doc_id = self._generate_document_id(document)
    short_text = f"Erreur lors du résumé: {error}"
    long_text = f"Le résumé de ce document n'a pas pu être généré: {error}"

    return DocumentSummary(
        document_id=doc_id,
        title=document.title,
        url=document.url,
        executive_summary=short_text,
        detailed_summary=long_text,
        key_points=[],
        sentiment=None,
        credibility_score=None
    )
|
| 492 |
+
|
| 493 |
+
def _generate_document_id(self, document: Document) -> str:
|
| 494 |
+
"""Génère un ID unique pour un document."""
|
| 495 |
+
content_hash = hashlib.md5(f"{document.url}{document.title}".encode()).hexdigest()
|
| 496 |
+
return f"doc_{content_hash[:8]}"
|
| 497 |
+
|
| 498 |
+
async def _perform_global_analysis(self, summaries: List[DocumentSummary]) -> Dict[str, List[str]]:
    """Ask the LLM what the summaries share and where they disagree.

    Args:
        summaries: Per-document summaries; ``title`` and
            ``detailed_summary`` are read from each one.

    Returns:
        A dict with the keys 'common_themes', 'consensus_points' and
        'conflicting_views'. All three lists are empty when fewer than two
        summaries are given or when the LLM call or parsing raises (the
        error is logged, not propagated).
    """
    # A comparison needs at least two documents
    if len(summaries) < 2:
        return {'common_themes': [], 'consensus_points': [], 'conflicting_views': []}

    # Combine all summaries for the analysis
    all_summaries = "\n\n".join([
        f"Document: {s.title}\nRésumé: {s.detailed_summary}"
        for s in summaries
    ])

    # Global analysis prompt
    global_prompt = f"""
Analyse les résumés de documents suivants et identifie:

1. **Thèmes communs** : Les sujets qui reviennent dans plusieurs documents
2. **Points de consensus** : Les idées sur lesquelles les sources s'accordent
3. **Points conflictuels** : Les idées contradictoires entre les sources

RÉSUMÉS:
{all_summaries}

Format ta réponse avec des sections claires et des listes à puces.
"""

    try:
        response = await self.llm_manager.get_completion(
            global_prompt,
            system_prompt="Tu es un expert en analyse comparative de documents."
        )

        # Parse the answer (simplified implementation)
        return self._parse_global_analysis(response)

    except Exception as e:
        self.logger.error(f"Erreur analyse globale: {e}")
        return {'common_themes': [], 'consensus_points': [], 'conflicting_views': []}
|
| 535 |
+
|
| 536 |
+
def _parse_global_analysis(self, content: str) -> Dict[str, List[str]]:
|
| 537 |
+
"""Parse l'analyse globale."""
|
| 538 |
+
# Implémentation simplifiée
|
| 539 |
+
lines = content.split('\n')
|
| 540 |
+
|
| 541 |
+
result = {
|
| 542 |
+
'common_themes': [],
|
| 543 |
+
'consensus_points': [],
|
| 544 |
+
'conflicting_views': []
|
| 545 |
+
}
|
| 546 |
+
|
| 547 |
+
current_section = None
|
| 548 |
+
|
| 549 |
+
for line in lines:
|
| 550 |
+
line = line.strip()
|
| 551 |
+
if not line:
|
| 552 |
+
continue
|
| 553 |
+
|
| 554 |
+
# Détecter les sections
|
| 555 |
+
if 'thème' in line.lower() or 'theme' in line.lower():
|
| 556 |
+
current_section = 'common_themes'
|
| 557 |
+
elif 'consensus' in line.lower():
|
| 558 |
+
current_section = 'consensus_points'
|
| 559 |
+
elif 'conflict' in line.lower() or 'contradictoire' in line.lower():
|
| 560 |
+
current_section = 'conflicting_views'
|
| 561 |
+
elif line.startswith('-') or line.startswith('•'):
|
| 562 |
+
# Point de liste
|
| 563 |
+
if current_section:
|
| 564 |
+
point = line[1:].strip()
|
| 565 |
+
if len(point) > 5: # Filtrer les points trop courts
|
| 566 |
+
result[current_section].append(point)
|
| 567 |
+
|
| 568 |
+
return result
|
| 569 |
+
|
| 570 |
+
def _calculate_average_credibility(self, summaries: List[DocumentSummary]) -> Optional[float]:
|
| 571 |
+
"""Calcule le score de crédibilité moyen."""
|
| 572 |
+
scores = [s.credibility_score for s in summaries if s.credibility_score is not None]
|
| 573 |
+
|
| 574 |
+
if not scores:
|
| 575 |
+
return None
|
| 576 |
+
|
| 577 |
+
return sum(scores) / len(scores)
|
| 578 |
+
|
| 579 |
+
#fonction summary from content extraction result
|
| 580 |
+
async def process_from_extraction_result(
    self,
    extraction_result: 'ExtractionResult'
) -> SummarizationOutput:
    """Run the summarization on the documents of an extraction result.

    Args:
        extraction_result: Content-extraction result; only its
            ``documents`` attribute is read.

    Returns:
        The output of ``process`` for those documents.

    Raises:
        ValueError: If ``validate_input`` rejects the built request.
    """
    options = {
        'include_sentiment': True,
        'include_citations': True,
        'max_key_points': 5,
        'detailed_analysis': True,
        'chunk_large_docs': True
    }
    # NOTE(review): the other methods read these options as attributes of
    # the request (e.g. ``input_data.include_sentiment``) - confirm that
    # SummarizationInput maps ``summary_options`` onto those attributes.
    request = SummarizationInput(
        documents=extraction_result.documents,
        summary_options=options
    )

    if self.validate_input(request):
        # Hand over to the main summarization entry point.
        return await self.process(request)

    message = "Input ExtractionResult invalide pour la summarization"
    self.logger.error(message)
    raise ValueError(message)
|
| 611 |
+
|
| 612 |
+
|
| 613 |
+
|
| 614 |
+
|
| 615 |
+
# Exemple d'utilisation
|
| 616 |
+
if __name__ == "__main__":
    import asyncio
    import json
    from src.models.document_models import ExtractionResult

    def save_summarization_output(output, filename=None):
        """Save a SummarizationOutput as JSON.

        Args:
            output: Object exposing ``summaries`` and ``model_dump``.
            filename: Target path; when falsy a name is built from the
                number of summaries and the current timestamp.

        Returns:
            The file name written, or ``None`` if saving failed (the error
            is printed, not raised).
        """
        from datetime import datetime
        if not filename:
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            filename = f"summarization_output_{len(output.summaries)}docs_{timestamp}.json"
        try:
            with open(filename, 'w', encoding='utf-8') as f:
                json.dump(output.model_dump(mode='json'), f, indent=2, ensure_ascii=False)
            return filename
        except Exception as e:
            print(f"Erreur lors de la sauvegarde: {e}")
            return None

    async def summarize_from_extraction_file():
        """Load a saved extraction result, summarize it, save and print."""
        # Load the ExtractionResult from a hard-coded file name, resolved
        # against the current working directory
        extraction_file = "extraction_result_2docs_20251116_141527.json"
        try:
            with open(extraction_file, 'r', encoding='utf-8') as f:
                extraction_data = json.load(f)
            extraction_result = ExtractionResult(**extraction_data)
        except Exception as e:
            print(f"Erreur chargement ExtractionResult: {e}")
            return

        summarizer = SummarizerAgent()

        output = await summarizer.process_from_extraction_result(extraction_result)

        # Save the output
        filename = save_summarization_output(output)
        if filename:
            print(f"✅ Résumés sauvegardés dans: {filename}")
        else:
            print("❌ Erreur lors de la sauvegarde du résumé.")

        # Quick display of each summary
        for summary in output.summaries:
            print(f"\nRésumé pour {summary.title}:")
            print(f"Résumé exécutif: {summary.executive_summary[:200]}...")
            print(f"Points clés: {[kp.title for kp in summary.key_points]}")
            print(f"Sentiment: {summary.sentiment}")
            print(f"Score de crédibilité: {summary.credibility_score}")
        print(f"Temps total de traitement: {output.total_processing_time:.2f}s")
        print(f"Score de crédibilité moyen: {output.average_credibility}")

    asyncio.run(summarize_from_extraction_file())
|
src/core/__init__.py
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Fichier d'initialisation du package core.
|
| 3 |
+
"""
|
| 4 |
+
|
| 5 |
+
from src.core.logging import setup_logger, app_logger
|
| 6 |
+
|
| 7 |
+
__all__ = ["setup_logger", "app_logger"]
|
src/core/logging.py
ADDED
|
@@ -0,0 +1,73 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Configuration du système de logging pour l'assistant de recherche.
|
| 3 |
+
Permet de tracer les événements importants (infos, erreurs, avertissements, etc.)
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
import logging
|
| 7 |
+
import sys
|
| 8 |
+
from pathlib import Path
|
| 9 |
+
from logging.handlers import RotatingFileHandler
|
| 10 |
+
|
| 11 |
+
# --- Create the log directory ---
# Relative path: it is resolved against the current working directory when
# this module is imported. mkdir runs as an import-time side effect.
log_directory = Path("logs")
log_directory.mkdir(exist_ok=True)
|
| 14 |
+
|
| 15 |
+
# --- Fonction de configuration du logger ---
|
| 16 |
+
def setup_logger(name: str, level: int = logging.INFO) -> logging.Logger:
    """Configure and return a logger with console and rotating-file output.

    Calling this again for the same name replaces the handlers installed
    by the previous call. The old handlers are closed before being
    dropped, so the log file opened earlier is released instead of
    leaking.

    Args:
        name: Logger name (e.g. 'research_assistant'); also used as the
            log file stem, ``<name>.log`` inside ``log_directory``.
        level: Minimum level accepted by the logger (default: INFO).
            Records below it are discarded before they reach any handler.

    Returns:
        The configured logger.
    """
    logger = logging.getLogger(name)
    logger.setLevel(level)

    # Avoid duplicate output when setup_logger() is called several times:
    # detach and close this logger's own handlers.
    for stale_handler in list(logger.handlers):
        logger.removeHandler(stale_handler)
        stale_handler.close()

    # Human-readable record layout
    formatter = logging.Formatter(
        "%(asctime)s | %(levelname)-8s | %(name)s | %(message)s",
        datefmt="%Y-%m-%d %H:%M:%S"
    )

    # --- Console handler (terminal output) ---
    console_handler = logging.StreamHandler(sys.stdout)
    # Lets everything through; the logger level above is the real filter.
    console_handler.setLevel(logging.DEBUG)
    console_handler.setFormatter(formatter)

    # --- File handler (persistent logs) ---
    # Recreate the directory if it was removed after import.
    log_directory.mkdir(parents=True, exist_ok=True)
    file_handler = RotatingFileHandler(
        log_directory / f"{name}.log",
        maxBytes=5 * 1024 * 1024,  # 5 MB
        backupCount=5,  # keep 5 rotated files
        encoding='utf-8'
    )
    file_handler.setLevel(logging.INFO)  # less noise in the files
    file_handler.setFormatter(formatter)

    # --- Attach the handlers ---
    logger.addHandler(console_handler)
    logger.addHandler(file_handler)

    # Startup message
    logger.info("Logger initialisé avec succès.")
    return logger
|
| 62 |
+
|
| 63 |
+
|
| 64 |
+
# --- Example usage of the logger ---
# Module-level logger: created, with its handlers attached, as a side
# effect of importing this module.
app_logger = setup_logger("research_assistant")

if __name__ == "__main__":
    # The logger uses setup_logger's default level (INFO), so this DEBUG
    # record is filtered out by the logger itself.
    app_logger.debug("Ceci est un message DEBUG (utile pour le débogage).")
    app_logger.info("Démarrage de l'application de recherche...")
    app_logger.warning("Avertissement : connexion lente à la base de données.")
    app_logger.error("Erreur : impossible de charger un fichier de configuration.")
    app_logger.critical("ERREUR CRITIQUE : application arrêtée.")
    app_logger.info("Application terminée.")
|
src/enhanced_system_prompt.py
ADDED
|
@@ -0,0 +1,159 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Prompt système amélioré pour l'agent avec mémoire
|
| 3 |
+
"""
|
| 4 |
+
|
| 5 |
+
ENHANCED_SYSTEM_PROMPT = """Tu es un Assistant de Recherche Intelligent avec Mémoire Contextuelle.
|
| 6 |
+
|
| 7 |
+
🎯 TES CAPACITÉS:
|
| 8 |
+
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
|
| 9 |
+
Tu disposes d'un système de mémoire avancé qui te permet de :
|
| 10 |
+
• Stocker et réutiliser les résultats de recherches précédentes
|
| 11 |
+
• Éviter les doublons et optimiser les recherches
|
| 12 |
+
• Maintenir un contexte conversationnel enrichi
|
| 13 |
+
• Suggérer des recherches similaires déjà effectuées
|
| 14 |
+
|
| 15 |
+
🔧 TES OUTILS:
|
| 16 |
+
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
|
| 17 |
+
|
| 18 |
+
1️⃣ research_complete_pipeline_with_memory(topic, max_results, use_cache)
|
| 19 |
+
→ Pipeline complet de recherche avec cache intelligent
|
| 20 |
+
→ Paramètres:
|
| 21 |
+
- topic (str): Sujet de recherche
|
| 22 |
+
- max_results (int): 2-10 sources (défaut: 3)
|
| 23 |
+
- use_cache (bool): Utiliser le cache si disponible (défaut: True)
|
| 24 |
+
|
| 25 |
+
💡 Utilise cet outil pour:
|
| 26 |
+
- Nouvelles recherches complètes
|
| 27 |
+
- Analyses approfondies sur un sujet
|
| 28 |
+
- Résumés documentés et sourcés
|
| 29 |
+
|
| 30 |
+
2️⃣ search_in_memory(query, top_k)
|
| 31 |
+
→ Recherche rapide dans les données déjà collectées
|
| 32 |
+
→ Parfait pour retrouver des informations sans nouvelle recherche
|
| 33 |
+
|
| 34 |
+
💡 Utilise cet outil pour:
|
| 35 |
+
- Questions sur des sujets déjà explorés
|
| 36 |
+
- Vérifications rapides
|
| 37 |
+
- Références croisées
|
| 38 |
+
|
| 39 |
+
3️⃣ get_research_history(n_last)
|
| 40 |
+
→ Consulte l'historique des recherches
|
| 41 |
+
→ Utile pour voir les sujets déjà traités
|
| 42 |
+
|
| 43 |
+
💡 Utilise cet outil pour:
|
| 44 |
+
- "Qu'ai-je déjà recherché ?"
|
| 45 |
+
- "Quelles sont mes dernières recherches ?"
|
| 46 |
+
- Suggestions de sujets connexes
|
| 47 |
+
|
| 48 |
+
4️⃣ clear_memory(confirm)
|
| 49 |
+
→ Réinitialise la mémoire (avec confirmation)
|
| 50 |
+
→ À utiliser avec précaution
|
| 51 |
+
|
| 52 |
+
📋 STRATÉGIE D'UTILISATION:
|
| 53 |
+
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
|
| 54 |
+
|
| 55 |
+
AVANT de lancer une nouvelle recherche complète:
|
| 56 |
+
1. Vérifie si une recherche similaire existe déjà (use_cache=True par défaut)
|
| 57 |
+
2. Si l'utilisateur demande quelque chose sur un sujet déjà traité,
|
| 58 |
+
utilise search_in_memory d'abord
|
| 59 |
+
3. Pour les nouvelles recherches, utilise research_complete_pipeline_with_memory
|
| 60 |
+
|
| 61 |
+
EXEMPLES DE DÉCISIONS INTELLIGENTES:
|
| 62 |
+
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
|
| 63 |
+
|
| 64 |
+
❓ User: "Résume l'impact de l'IA sur l'emploi"
|
| 65 |
+
✅ Action: research_complete_pipeline_with_memory(
|
| 66 |
+
topic="impact de l'intelligence artificielle sur l'emploi",
|
| 67 |
+
max_results=3,
|
| 68 |
+
use_cache=True
|
| 69 |
+
)
|
| 70 |
+
|
| 71 |
+
❓ User: "Rappelle-moi ce que tu as trouvé sur l'IA dans l'emploi"
|
| 72 |
+
✅ Action: search_in_memory(query="intelligence artificielle emploi", top_k=3)
|
| 73 |
+
|
| 74 |
+
❓ User: "Quelles recherches ai-je faites récemment ?"
|
| 75 |
+
✅ Action: get_research_history(n_last=5)
|
| 76 |
+
|
| 77 |
+
❓ User: "Fais une analyse approfondie sur le climat"
|
| 78 |
+
✅ Action: research_complete_pipeline_with_memory(
|
| 79 |
+
topic="changement climatique analyse complète",
|
| 80 |
+
max_results=7,
|
| 81 |
+
use_cache=True
|
| 82 |
+
)
|
| 83 |
+
|
| 84 |
+
❓ User: "Bonjour, comment vas-tu ?"
|
| 85 |
+
✅ Action: Réponse directe, pas d'outil nécessaire
|
| 86 |
+
|
| 87 |
+
🎨 TON COMPORTEMENT:
|
| 88 |
+
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
|
| 89 |
+
• Toujours privilégier l'efficacité : utilise le cache quand possible
|
| 90 |
+
• Informe l'utilisateur si tu utilises des données en cache
|
| 91 |
+
• Suggère des recherches connexes quand pertinent
|
| 92 |
+
• Sois transparent sur tes sources et méthodes
|
| 93 |
+
• Présente les résultats de manière claire et structurée
|
| 94 |
+
|
| 95 |
+
⚠️ IMPORTANT:
|
| 96 |
+
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
|
| 97 |
+
• N'invente JAMAIS d'informations
|
| 98 |
+
• Cite toujours tes sources
|
| 99 |
+
• Si aucune info n'est disponible, dis-le clairement
|
| 100 |
+
• Le système évite automatiquement les doublons
|
| 101 |
+
• Les résultats en cache sont valides 24h
|
| 102 |
+
"""
|
| 103 |
+
|
| 104 |
+
|
| 105 |
+
|
| 106 |
+
# Chargement des variables d'environnement
|
| 107 |
+
from dotenv import load_dotenv
|
| 108 |
+
from langchain_groq import ChatGroq
|
| 109 |
+
import os
|
| 110 |
+
load_dotenv()
|
| 111 |
+
api_key = os.getenv("GROQ_API_KEY")
|
| 112 |
+
if not api_key:
|
| 113 |
+
raise ValueError("GROQ_API_KEY non définie dans .env")
|
| 114 |
+
|
| 115 |
+
# Model configuration (no tools are bound here; callers that need tool calls must apply bind_tools themselves)
|
| 116 |
+
model = ChatGroq(
|
| 117 |
+
model="llama-3.1-8b-instant",
|
| 118 |
+
temperature=0.3, # Bas pour plus de cohérence
|
| 119 |
+
max_tokens=2048*2,
|
| 120 |
+
api_key=api_key
|
| 121 |
+
)
|
| 122 |
+
|
| 123 |
+
# Fonction helper pour mettre à jour le model_call
|
| 124 |
+
def create_enhanced_model_call(llm=None):
    """Build the LLM graph node that prepends the enhanced system prompt.

    Args:
        llm: Optional chat model to invoke (for example one returned by
            ``bind_tools``). When omitted, the module-level ``model`` is used;
            that model is created without any tools bound.

    Returns:
        A ``model_call_enhanced(state)`` callable. It reads
        ``state["messages"]`` and returns ``{"messages": [response]}``.
    """
    from langchain_core.messages import SystemMessage

    # Substrings that suggest the user is referring to earlier research.
    memory_hints = ('rappelle', 'déjà', 'précédent', 'avant', 'historique', 'recherches')

    def _content_as_text(content):
        """Flatten message content (a str or a list of blocks) to plain text."""
        if isinstance(content, str):
            return content
        if isinstance(content, list):
            parts = []
            for block in content:
                if isinstance(block, str):
                    parts.append(block)
                elif isinstance(block, dict) and isinstance(block.get("text"), str):
                    parts.append(block["text"])
            return " ".join(parts)
        return ""

    def model_call_enhanced(state):
        """LLM node: inject the system prompt (plus a memory hint if relevant)."""
        system_message = SystemMessage(content=ENHANCED_SYSTEM_PROMPT)
        # Copy to a list so concatenation works for any message sequence type.
        messages = list(state["messages"])

        # Find the most recent human message to inspect its wording.
        last_user_msg = ""
        for msg in reversed(messages):
            if getattr(msg, 'type', None) == 'human':
                # Content may be a list of blocks, not only a string.
                last_user_msg = _content_as_text(msg.content).lower()
                break

        prefix = [system_message]
        # Nudge the model towards the memory tools when the user seems to
        # refer to something that was already researched.
        if any(hint in last_user_msg for hint in memory_hints):
            prefix.append(SystemMessage(content=
                "💡 L'utilisateur semble se référer à des informations passées. "
                "Considère utiliser search_in_memory ou get_research_history avant une nouvelle recherche."
            ))

        # Resolve the model at call time so the module-level default is looked
        # up lazily, exactly as before, when no explicit model was supplied.
        active_model = model if llm is None else llm
        response = active_model.invoke(prefix + messages)
        return {"messages": [response]}

    return model_call_enhanced
|
| 157 |
+
|
| 158 |
+
# Exporter
|
| 159 |
+
print("✅ Prompt système amélioré créé")
|
src/graph.py
ADDED
|
@@ -0,0 +1,294 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# """
|
| 2 |
+
# Script de test complet pour le système de mémoire
|
| 3 |
+
# """
|
| 4 |
+
|
| 5 |
+
# from langchain_groq import ChatGroq
|
| 6 |
+
# from langgraph.graph import StateGraph, END
|
| 7 |
+
# from langgraph.prebuilt import ToolNode
|
| 8 |
+
# from typing import TypedDict, Sequence, Annotated
|
| 9 |
+
# from langchain_core.messages import BaseMessage
|
| 10 |
+
# from langgraph.graph.message import add_messages
|
| 11 |
+
# from dotenv import load_dotenv
|
| 12 |
+
# import os
|
| 13 |
+
|
| 14 |
+
# # Importer les composants
|
| 15 |
+
# # from memory_integration import tools_with_memory
|
| 16 |
+
# # from enhanced_system_prompt import create_enhanced_model_call, ENHANCED_SYSTEM_PROMPT
|
| 17 |
+
|
| 18 |
+
# # ============================================================================
|
| 19 |
+
# # CONFIGURATION
|
| 20 |
+
# # ============================================================================
|
| 21 |
+
|
| 22 |
+
# class AgentState(TypedDict):
|
| 23 |
+
# messages: Annotated[Sequence[BaseMessage], add_messages]
|
| 24 |
+
|
| 25 |
+
# load_dotenv()
|
| 26 |
+
# api_key = os.getenv("GROQ_API_KEY")
|
| 27 |
+
# if not api_key:
|
| 28 |
+
# raise ValueError("GROQ_API_KEY non définie")
|
| 29 |
+
|
| 30 |
+
# model = ChatGroq(
|
| 31 |
+
# model="llama-3.1-8b-instant",
|
| 32 |
+
# temperature=0.3,
|
| 33 |
+
# max_tokens=2048,
|
| 34 |
+
# api_key=api_key
|
| 35 |
+
# ).bind_tools(tools_with_memory)
|
| 36 |
+
|
| 37 |
+
# # ============================================================================
|
| 38 |
+
# # CONSTRUCTION DU GRAPHE AMÉLIORÉ
|
| 39 |
+
# # ============================================================================
|
| 40 |
+
|
| 41 |
+
# def should_continue(state: AgentState) -> str:
|
| 42 |
+
# messages = state["messages"]
|
| 43 |
+
# last_message = messages[-1]
|
| 44 |
+
|
| 45 |
+
# if hasattr(last_message, 'tool_calls') and last_message.tool_calls:
|
| 46 |
+
# return "continue"
|
| 47 |
+
# else:
|
| 48 |
+
# return "end"
|
| 49 |
+
|
| 50 |
+
# # Créer le graphe
|
| 51 |
+
# graph = StateGraph(AgentState)
|
| 52 |
+
|
| 53 |
+
# # Ajouter les nœuds
|
| 54 |
+
# model_call_enhanced = create_enhanced_model_call()
|
| 55 |
+
# graph.add_node("llm", model_call_enhanced)
|
| 56 |
+
|
| 57 |
+
# tool_node = ToolNode(tools=tools_with_memory)
|
| 58 |
+
# graph.add_node("tools", tool_node)
|
| 59 |
+
|
| 60 |
+
# # Définir les connexions
|
| 61 |
+
# graph.set_entry_point("llm")
|
| 62 |
+
# graph.add_conditional_edges(
|
| 63 |
+
# "llm",
|
| 64 |
+
# should_continue,
|
| 65 |
+
# {
|
| 66 |
+
# "continue": "tools",
|
| 67 |
+
# "end": END,
|
| 68 |
+
# },
|
| 69 |
+
# )
|
| 70 |
+
# graph.add_edge("tools", "llm")
|
| 71 |
+
|
| 72 |
+
# # Compiler
|
| 73 |
+
# app_with_memory = graph.compile()
|
| 74 |
+
|
| 75 |
+
# # ============================================================================
|
| 76 |
+
# # FONCTIONS UTILITAIRES
|
| 77 |
+
# # ============================================================================
|
| 78 |
+
|
| 79 |
+
# def print_stream(stream, show_tool_calls=True):
|
| 80 |
+
# """Affiche le flux de messages de manière élégante"""
|
| 81 |
+
# print("\n" + "="*70)
|
| 82 |
+
|
| 83 |
+
# for i, s in enumerate(stream):
|
| 84 |
+
# message = s["messages"][-1]
|
| 85 |
+
|
| 86 |
+
# if hasattr(message, 'content') and message.content:
|
| 87 |
+
# print(f"\n{'─'*70}")
|
| 88 |
+
# if hasattr(message, 'type'):
|
| 89 |
+
# if message.type == 'human':
|
| 90 |
+
# print("👤 UTILISATEUR:")
|
| 91 |
+
# elif message.type == 'ai':
|
| 92 |
+
# print("🤖 ASSISTANT:")
|
| 93 |
+
# elif message.type == 'tool':
|
| 94 |
+
# if show_tool_calls:
|
| 95 |
+
# print("🔧 RÉSULTAT OUTIL:")
|
| 96 |
+
|
| 97 |
+
# content = message.content
|
| 98 |
+
# if isinstance(content, str):
|
| 99 |
+
# # Limiter l'affichage si trop long
|
| 100 |
+
# if len(content) > 1000:
|
| 101 |
+
# print(content[:1000] + "\n... (contenu tronqué)")
|
| 102 |
+
# else:
|
| 103 |
+
# print(content)
|
| 104 |
+
# else:
|
| 105 |
+
# print(content)
|
| 106 |
+
|
| 107 |
+
# if hasattr(message, 'tool_calls') and message.tool_calls and show_tool_calls:
|
| 108 |
+
# print("\n🔧 APPELS D'OUTILS:")
|
| 109 |
+
# for tool_call in message.tool_calls:
|
| 110 |
+
# print(f" → {tool_call.get('name', 'unknown')}({tool_call.get('args', {})})")
|
| 111 |
+
|
| 112 |
+
# print("\n" + "="*70)
|
| 113 |
+
|
| 114 |
+
# def run_test(user_query: str, test_name: str = ""):
|
| 115 |
+
# """Exécute un test avec affichage formaté"""
|
| 116 |
+
# if test_name:
|
| 117 |
+
# print(f"\n\n{'#'*70}")
|
| 118 |
+
# print(f"# TEST: {test_name}")
|
| 119 |
+
# print(f"{'#'*70}")
|
| 120 |
+
|
| 121 |
+
# inputs = {"messages": [("user", user_query)]}
|
| 122 |
+
# print_stream(app_with_memory.stream(inputs, stream_mode="values"))
|
| 123 |
+
|
| 124 |
+
# # ============================================================================
|
| 125 |
+
# # SUITE DE TESTS
|
| 126 |
+
# # ============================================================================
|
| 127 |
+
|
| 128 |
+
# def run_all_tests():
|
| 129 |
+
# """Exécute tous les tests du système"""
|
| 130 |
+
|
| 131 |
+
# print("\n" + "="*70)
|
| 132 |
+
# print(" 🧪 SUITE DE TESTS - SYSTÈME DE MÉMOIRE INTELLIGENT")
|
| 133 |
+
# print("="*70)
|
| 134 |
+
|
| 135 |
+
# # Test 1: Première recherche (création du cache)
|
| 136 |
+
# run_test(
|
| 137 |
+
# "Fais-moi un résumé complet sur l'impact de l'intelligence artificielle sur le marché du travail",
|
| 138 |
+
# "Test 1 - Première recherche (cache vide)"
|
| 139 |
+
# )
|
| 140 |
+
|
| 141 |
+
# # Test 2: Même sujet (utilisation du cache)
|
| 142 |
+
# run_test(
|
| 143 |
+
# "Peux-tu me redonner les infos sur l'IA et l'emploi ?",
|
| 144 |
+
# "Test 2 - Recherche dans le cache"
|
| 145 |
+
# )
|
| 146 |
+
|
| 147 |
+
# # Test 3: Recherche dans la mémoire
|
| 148 |
+
# run_test(
|
| 149 |
+
# "Qu'est-ce que tu as trouvé sur l'intelligence artificielle ?",
|
| 150 |
+
# "Test 3 - Recherche sémantique dans la mémoire"
|
| 151 |
+
# )
|
| 152 |
+
|
| 153 |
+
# # Test 4: Historique
|
| 154 |
+
# run_test(
|
| 155 |
+
# "Montre-moi l'historique de mes recherches",
|
| 156 |
+
# "Test 4 - Consultation de l'historique"
|
| 157 |
+
# )
|
| 158 |
+
|
| 159 |
+
# # Test 5: Nouvelle recherche différente
|
| 160 |
+
# run_test(
|
| 161 |
+
# "Fais une analyse sur les énergies renouvelables",
|
| 162 |
+
# "Test 5 - Nouvelle recherche (sujet différent)"
|
| 163 |
+
# )
|
| 164 |
+
|
| 165 |
+
# # Test 6: Question simple (pas de recherche)
|
| 166 |
+
# run_test(
|
| 167 |
+
# "Bonjour, comment ça va ?",
|
| 168 |
+
# "Test 6 - Conversation simple (sans recherche)"
|
| 169 |
+
# )
|
| 170 |
+
|
| 171 |
+
# # Test 7: Recherche croisée
|
| 172 |
+
# run_test(
|
| 173 |
+
# "Compare ce que tu as trouvé sur l'IA et les énergies renouvelables",
|
| 174 |
+
# "Test 7 - Recherche croisée dans la mémoire"
|
| 175 |
+
# )
|
| 176 |
+
|
| 177 |
+
# print("\n\n" + "="*70)
|
| 178 |
+
# print(" ✅ TOUS LES TESTS TERMINÉS")
|
| 179 |
+
# print("="*70)
|
| 180 |
+
|
| 181 |
+
|
| 182 |
+
# def demo_memory_stats():
|
| 183 |
+
# """Affiche les statistiques de la mémoire"""
|
| 184 |
+
# from memory_system import memory_system
|
| 185 |
+
|
| 186 |
+
# print("\n" + "="*70)
|
| 187 |
+
# print(" 📊 STATISTIQUES DU SYSTÈME DE MÉMOIRE")
|
| 188 |
+
# print("="*70)
|
| 189 |
+
|
| 190 |
+
# # Stats vectorielles
|
| 191 |
+
# vector_count = memory_system.vector_memory.collection.count()
|
| 192 |
+
# print(f"\n🗄️ Base Vectorielle:")
|
| 193 |
+
# print(f" Documents stockés: {vector_count}")
|
| 194 |
+
# print(f" Hashes en cache: {len(memory_system.vector_memory.content_hashes)}")
|
| 195 |
+
|
| 196 |
+
# # Stats agent
|
| 197 |
+
# conv_count = len(memory_system.agent_memory.conversation_history)
|
| 198 |
+
# research_count = len(memory_system.agent_memory.research_cache)
|
| 199 |
+
|
| 200 |
+
# print(f"\n🧠 Mémoire Agent:")
|
| 201 |
+
# print(f" Conversations: {conv_count}")
|
| 202 |
+
# print(f" Recherches en cache: {research_count}")
|
| 203 |
+
# print(f" Topics mémorisés: {len(memory_system.agent_memory.topic_keywords)}")
|
| 204 |
+
|
| 205 |
+
# if research_count > 0:
|
| 206 |
+
# print(f"\n📚 Topics en cache:")
|
| 207 |
+
# for topic in list(memory_system.agent_memory.research_cache.keys())[:5]:
|
| 208 |
+
# print(f" • {topic}")
|
| 209 |
+
|
| 210 |
+
# print("\n" + "="*70)
|
| 211 |
+
|
| 212 |
+
|
| 213 |
+
# # ============================================================================
|
| 214 |
+
# # MENU INTERACTIF
|
| 215 |
+
# # ============================================================================
|
| 216 |
+
|
| 217 |
+
# def interactive_menu():
|
| 218 |
+
# """Menu interactif pour tester le système"""
|
| 219 |
+
|
| 220 |
+
# while True:
|
| 221 |
+
# print("\n" + "="*70)
|
| 222 |
+
# print(" 🎯 ASSISTANT DE RECHERCHE INTELLIGENT")
|
| 223 |
+
# print("="*70)
|
| 224 |
+
# print("\n Options:")
|
| 225 |
+
# print(" 1. Poser une question / Lancer une recherche")
|
| 226 |
+
# print(" 2. Rechercher dans la mémoire")
|
| 227 |
+
# print(" 3. Voir l'historique")
|
| 228 |
+
# print(" 4. Statistiques de la mémoire")
|
| 229 |
+
# print(" 5. Lancer la suite de tests")
|
| 230 |
+
# print(" 6. Réinitialiser la mémoire")
|
| 231 |
+
# print(" 0. Quitter")
|
| 232 |
+
|
| 233 |
+
# choice = input("\n👉 Votre choix: ").strip()
|
| 234 |
+
|
| 235 |
+
# if choice == "1":
|
| 236 |
+
# query = input("\n💬 Votre question: ")
|
| 237 |
+
# run_test(query, "Recherche utilisateur")
|
| 238 |
+
|
| 239 |
+
# elif choice == "2":
|
| 240 |
+
# query = input("\n🔍 Recherche dans la mémoire: ")
|
| 241 |
+
# run_test(f"Cherche dans ta mémoire: {query}", "Recherche mémoire")
|
| 242 |
+
|
| 243 |
+
# elif choice == "3":
|
| 244 |
+
# run_test("Montre-moi mon historique de recherches", "Historique")
|
| 245 |
+
|
| 246 |
+
# elif choice == "4":
|
| 247 |
+
# demo_memory_stats()
|
| 248 |
+
|
| 249 |
+
# elif choice == "5":
|
| 250 |
+
# run_all_tests()
|
| 251 |
+
|
| 252 |
+
# elif choice == "6":
|
| 253 |
+
# confirm = input("\n⚠️ Êtes-vous sûr de vouloir réinitialiser ? (oui/non): ")
|
| 254 |
+
# if confirm.lower() == "oui":
|
| 255 |
+
# from memory_system import memory_system
|
| 256 |
+
# memory_system.agent_memory.clear_all()
|
| 257 |
+
# print("✅ Mémoire réinitialisée")
|
| 258 |
+
# else:
|
| 259 |
+
# print("❌ Annulé")
|
| 260 |
+
|
| 261 |
+
# elif choice == "0":
|
| 262 |
+
# print("\n👋 Au revoir!")
|
| 263 |
+
# break
|
| 264 |
+
|
| 265 |
+
# else:
|
| 266 |
+
# print("\n❌ Choix invalide")
|
| 267 |
+
|
| 268 |
+
|
| 269 |
+
# # ============================================================================
|
| 270 |
+
# # POINT D'ENTRÉE
|
| 271 |
+
# # ============================================================================
|
| 272 |
+
|
| 273 |
+
# if __name__ == "__main__":
|
| 274 |
+
# import sys
|
| 275 |
+
|
| 276 |
+
# print("\n" + "🚀"*35)
|
| 277 |
+
# print(" SYSTÈME DE RECHERCHE INTELLIGENT AVEC MÉMOIRE")
|
| 278 |
+
# print("🚀"*35 + "\n")
|
| 279 |
+
|
| 280 |
+
# if len(sys.argv) > 1:
|
| 281 |
+
# if sys.argv[1] == "test":
|
| 282 |
+
# # Mode test automatique
|
| 283 |
+
# run_all_tests()
|
| 284 |
+
# demo_memory_stats()
|
| 285 |
+
# elif sys.argv[1] == "stats":
|
| 286 |
+
# # Afficher uniquement les stats
|
| 287 |
+
# demo_memory_stats()
|
| 288 |
+
# else:
|
| 289 |
+
# # Exécuter une requête directe
|
| 290 |
+
# query = " ".join(sys.argv[1:])
|
| 291 |
+
# run_test(query, "Requête CLI")
|
| 292 |
+
# else:
|
| 293 |
+
# # Mode interactif
|
| 294 |
+
# interactive_menu()
|
src/graph/__init__.py
ADDED
|
File without changes
|
src/graph/nodes.py
ADDED
|
File without changes
|
src/graph/notebook.ipynb
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
src/memory_integration.py
ADDED
|
@@ -0,0 +1,285 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Intégration du système de mémoire dans l'outil de recherche
|
| 3 |
+
"""
|
| 4 |
+
|
| 5 |
+
from langchain_core.tools import tool
|
| 6 |
+
from typing import Union
|
| 7 |
+
import asyncio
|
| 8 |
+
|
| 9 |
+
from src.agents.researcher_agent import ResearcherAgent
|
| 10 |
+
from src.agents.content_extractor_agent import ContentExtractorAgent
|
| 11 |
+
from src.agents.summarizer_agent import SummarizerAgent
|
| 12 |
+
from src.agents.global_synthesizer_agent import GlobalSynthesizerAgent
|
| 13 |
+
|
| 14 |
+
from src.models.research_models import ResearchQuery
|
| 15 |
+
# ------------ AGENTS ------------
|
| 16 |
+
researcher_agent = ResearcherAgent()
|
| 17 |
+
content_extractor_agent = ContentExtractorAgent()
|
| 18 |
+
summarizer_agent = SummarizerAgent()
|
| 19 |
+
global_synthesizer_agent = GlobalSynthesizerAgent()
|
| 20 |
+
|
| 21 |
+
# Importer le système de mémoire
|
| 22 |
+
# The shared memory instance lives in memory_system.py; importing it from this
# very module (memory_integration) would be a failing self-import.
from .memory_system import memory_system
|
| 23 |
+
|
| 24 |
+
# ============================================================================
|
| 25 |
+
# OUTIL AMÉLIORÉ AVEC MÉMOIRE
|
| 26 |
+
# ============================================================================
|
| 27 |
+
|
| 28 |
+
def _coerce_max_results(value, default=3, lower=2, upper=10):
    """Return *value* as an int clamped to [lower, upper]; use *default* if it is not convertible."""
    try:
        number = int(value)
    except (TypeError, ValueError):
        number = default
    return max(lower, min(number, upper))


def _run_coroutine_blocking(coro_factory):
    """Run the coroutine built by *coro_factory* to completion and return its result.

    ``asyncio.run`` cannot be used from a thread that already runs an event
    loop, so in that case the coroutine is executed on a fresh loop in a
    worker thread while the caller waits for the result.
    """
    try:
        asyncio.get_running_loop()
    except RuntimeError:
        # No loop is running in this thread: the plain call is safe.
        return asyncio.run(coro_factory())

    from concurrent.futures import ThreadPoolExecutor

    with ThreadPoolExecutor(max_workers=1) as pool:
        return pool.submit(lambda: asyncio.run(coro_factory())).result()


@tool
def research_complete_pipeline_with_memory(topic: str, max_results: Union[int, str] = 3, use_cache: bool = True) -> str:
    """Exécute un pipeline de recherche complet avec système de mémoire intégré.

    Ce tool intelligent :
    1. Vérifie si des recherches similaires existent en cache
    2. Utilise la mémoire vectorielle pour enrichir le contexte
    3. Exécute le pipeline complet de recherche si nécessaire
    4. Stocke tous les résultats pour réutilisation future
    5. Déduplique automatiquement les documents

    Args:
        topic: Le sujet de recherche
        max_results: Nombre de sources à analyser (2-10, défaut: 3)
        use_cache: Utiliser le cache si disponible (défaut: True)

    Returns:
        Un rapport complet enrichi par la mémoire contextuelle
    """
    # NOTE(review): the docstring above is presumably used by @tool as the
    # description shown to the LLM, so its wording is kept unchanged.

    # Accept ints, numeric strings and other convertible values; anything
    # else falls back to the default. The result is clamped to 2..10.
    max_results = _coerce_max_results(max_results)

    async def run_pipeline_with_memory():
        print(f"\n{'='*60}")
        print(f"🚀 PIPELINE DE RECHERCHE INTELLIGENT")
        print(f"📋 Sujet: {topic}")
        print(f"💾 Cache activé: {use_cache}")
        print(f"{'='*60}\n")

        # ===== PHASE 1: CONTEXT RETRIEVAL =====
        print("🧠 [Phase 1] Récupération du contexte mémoriel...")
        context = memory_system.retrieve_context_for_query(topic, use_cache=use_cache)

        # Short-circuit on a cached result when caching is enabled.
        if context['cached_result'] and use_cache:
            print("✅ Résultat trouvé en cache (< 24h)")
            print("📊 Utilisation du résultat mémorisé")

            cached_report = context['cached_result']
            if hasattr(cached_report, 'final_report'):
                return cached_report.final_report.formatted_outputs.get('markdown', str(cached_report))
            # NOTE(review): a cached object without `final_report` falls
            # through and the full pipeline runs again.

        # Report whatever semantic context the memory returned.
        if context['semantic_context']:
            print(f"📚 Contexte sémantique récupéré ({len(context['semantic_context'])} caractères)")

        if context['related_topics']:
            print(f"🔗 Topics similaires trouvés: {', '.join(context['related_topics'][:3])}")

        # ===== PHASE 2: PIPELINE EXECUTION =====
        print(f"\n{'='*60}")
        print("🔬 [Phase 2] Exécution du pipeline de recherche")
        print(f"{'='*60}\n")

        # STEP 1: web search
        print("🔍 [1/4] Recherche web en cours...")
        query = ResearchQuery(
            topic=topic,
            keywords=await researcher_agent.extract_keywords_with_llm(topic),
            max_results=max_results,
            search_depth="basic"
        )
        research_data = await researcher_agent.process(query)
        print(f"✅ Trouvé {research_data.total_found} sources")

        # STEP 2: content extraction
        print("\n📄 [2/4] Extraction du contenu (avec déduplication)...")
        extraction_data = await content_extractor_agent.process_from_research_output(
            research_output=research_data
        )
        print(f"✅ Extrait {extraction_data.successful_extractions} documents")

        # Drop documents the vector memory reports as duplicates.
        if extraction_data.documents:
            new_docs = []
            duplicates = 0
            for doc in extraction_data.documents:
                if not memory_system.vector_memory.is_duplicate(doc.content):
                    new_docs.append(doc)
                else:
                    duplicates += 1

            if duplicates > 0:
                print(f"ℹ️ {duplicates} documents en doublon ignorés")
                # Keep only the new documents for the following steps.
                extraction_data.documents = new_docs

        # STEP 3: per-document summaries
        print("\n📝 [3/4] Création des résumés...")
        summarization_data = await summarizer_agent.process_from_extraction_result(
            extraction_result=extraction_data
        )
        print(f"✅ Généré {summarization_data.total_documents} résumés")

        # STEP 4: global synthesis
        print("\n🎯 [4/4] Synthèse globale (enrichie par le contexte)...")

        # Only a log line: the semantic context is not passed to the synthesizer here.
        if context['semantic_context']:
            print("📚 Enrichissement avec le contexte mémoriel...")

        global_synthesis = await global_synthesizer_agent.process_from_summarization_output(
            summarization_output=summarization_data
        )
        print(f"✅ Rapport final généré ({global_synthesis.final_report.word_count} mots)")

        # ===== PHASE 3: STORE IN MEMORY =====
        print(f"\n{'='*60}")
        print("💾 [Phase 3] Stockage en mémoire")
        print(f"{'='*60}\n")

        memory_system.process_research_result(
            topic=topic,
            extraction_result=extraction_data,
            summarization_result=summarization_data,
            global_synthesis=global_synthesis
        )

        # Record a short excerpt of the report in the conversation history.
        final_report_text = global_synthesis.final_report.formatted_outputs.get('text', '')[:200]
        memory_system.agent_memory.add_conversation(
            user_message=f"Recherche sur: {topic}",
            assistant_response=final_report_text,
            metadata={'max_results': max_results, 'sources': research_data.total_found}
        )

        print(f"\n{'='*60}")
        print("✨ PIPELINE TERMINÉ AVEC SUCCÈS")
        print(f"📊 Statistiques:")
        print(f" - Sources analysées: {research_data.total_found}")
        print(f" - Documents stockés: {extraction_data.successful_extractions}")
        print(f" - Résumés générés: {summarization_data.total_documents}")
        print(f" - Mots du rapport: {global_synthesis.final_report.word_count}")
        print(f"{'='*60}\n")

        # Prefer the markdown rendering, then plain text, then a str() fallback.
        return global_synthesis.final_report.formatted_outputs.get('markdown',
            global_synthesis.final_report.formatted_outputs.get('text',
            str(global_synthesis)))

    return _run_coroutine_blocking(run_pipeline_with_memory)
|
| 174 |
+
|
| 175 |
+
|
| 176 |
+
# ============================================================================
|
| 177 |
+
# OUTILS SUPPLÉMENTAIRES POUR LA GESTION DE MÉMOIRE
|
| 178 |
+
# ============================================================================
|
| 179 |
+
|
| 180 |
+
@tool
def search_in_memory(query: str, top_k: int = 5) -> str:
    """Recherche sémantique dans la mémoire vectorielle.

    Utile pour retrouver des informations de recherches précédentes
    sans relancer une nouvelle recherche complète.

    Args:
        query: Requête de recherche
        top_k: Nombre de résultats à retourner

    Returns:
        Contexte pertinent trouvé dans la mémoire
    """
    # NOTE(review): the docstring is presumably used by @tool as the tool
    # description, so its wording is kept unchanged.
    print(f"🔍 Recherche dans la mémoire: '{query}'")

    hits = memory_system.vector_memory.semantic_search(query, k=top_k)
    if not hits:
        return "Aucun résultat trouvé dans la mémoire."

    # Collect the header and one formatted entry per hit, then join once.
    sections = [f"📚 {len(hits)} résultats trouvés dans la mémoire:\n\n"]
    for rank, (doc, score) in enumerate(hits, 1):
        sections.append(
            f"[Résultat {rank} - Pertinence: {score:.2%}]\n"
            f"Titre: {doc.metadata.get('title', 'N/A')}\n"
            f"Source: {doc.metadata.get('source', 'N/A')}\n"
            f"Contenu:\n{doc.page_content[:300]}...\n\n"
        )
    return "".join(sections)
|
| 210 |
+
|
| 211 |
+
|
| 212 |
+
@tool
def get_research_history(n_last: int = 5) -> str:
    """Récupère l'historique des dernières recherches effectuées.

    Args:
        n_last: Nombre de conversations récentes à retourner

    Returns:
        Historique formaté des recherches
    """
    # NOTE(review): the docstring is presumably used by @tool as the tool
    # description, so its wording is kept unchanged.
    print(f"📜 Récupération des {n_last} dernières recherches...")

    # A non-positive count must select nothing: `seq[-0:]` would return the
    # whole history and a negative count would drop the oldest entries.
    if n_last > 0:
        history = list(memory_system.agent_memory.conversation_history)[-n_last:]
    else:
        history = []

    if not history:
        return "Aucun historique de recherche disponible."

    lines = [f"📚 Historique des {len(history)} dernières recherches:\n\n"]
    for i, conv in enumerate(history, 1):
        timestamp = conv.get('timestamp', 'N/A')
        user_msg = conv.get('user', '')[:100]  # keep each entry short
        metadata = conv.get('metadata', {})

        lines.append(f"[Recherche {i}] - {timestamp}\n")
        lines.append(f"Topic: {user_msg}\n")
        if metadata:
            lines.append(f"Détails: {metadata}\n")
        lines.append("\n")

    return "".join(lines)
|
| 243 |
+
|
| 244 |
+
|
| 245 |
+
@tool
def clear_memory(confirm: bool = False) -> str:
    """Réinitialise complètement le système de mémoire.

    ⚠️ ATTENTION: Cette action est irréversible!

    Args:
        confirm: Doit être True pour confirmer l'action

    Returns:
        Message de confirmation
    """
    # NOTE(review): the docstring is presumably used by @tool as the tool
    # description, so its wording is kept unchanged.
    if confirm:
        print("🗑️ Réinitialisation de la mémoire...")
        memory_system.agent_memory.clear_all()

        # The vector store is deliberately left intact because it may hold
        # valuable data. If a full wipe is really needed, see
        # memory_system.vector_memory.collection.delete(where={}).
        return "✅ Mémoire de conversation réinitialisée. Base vectorielle préservée."

    # Without explicit confirmation nothing is touched.
    return "⚠️ Action non confirmée. Passez confirm=True pour réinitialiser la mémoire."
|
| 267 |
+
|
| 268 |
+
|
| 269 |
+
# ============================================================================
|
| 270 |
+
# LISTE DES OUTILS MISE À JOUR
|
| 271 |
+
# ============================================================================
|
| 272 |
+
|
| 273 |
+
# Mettre à jour la liste des outils dans votre code principal
|
| 274 |
+
tools_with_memory = [
|
| 275 |
+
research_complete_pipeline_with_memory,
|
| 276 |
+
search_in_memory,
|
| 277 |
+
get_research_history,
|
| 278 |
+
clear_memory
|
| 279 |
+
]
|
| 280 |
+
|
| 281 |
+
print("✅ Outils avec mémoire initialisés:")
|
| 282 |
+
print(" 1. research_complete_pipeline_with_memory - Pipeline complet avec cache")
|
| 283 |
+
print(" 2. search_in_memory - Recherche dans la mémoire vectorielle")
|
| 284 |
+
print(" 3. get_research_history - Historique des recherches")
|
| 285 |
+
print(" 4. clear_memory - Réinitialisation de la mémoire")
|
src/memory_system.py
ADDED
|
@@ -0,0 +1,547 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Système de Mémoire et Stockage Vectoriel pour l'Assistant de Recherche
|
| 3 |
+
Gère : embeddings, recherche sémantique, historique et déduplication
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
import hashlib
import json
import pickle
from collections import deque
from datetime import datetime, timedelta
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple

import chromadb
from chromadb.config import Settings
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma
from langchain_core.documents import Document
|
| 18 |
+
|
| 19 |
+
# ============================================================================
|
| 20 |
+
# GESTIONNAIRE DE MÉMOIRE VECTORIELLE
|
| 21 |
+
# ============================================================================
|
| 22 |
+
|
| 23 |
+
class VectorMemoryManager:
    """Manage vector storage of documents and summaries.

    Writes and maintenance go through a raw ChromaDB collection; similarity
    queries go through a LangChain ``Chroma`` vector store bound to the same
    client and collection name. An in-memory set of content hashes is kept so
    exact duplicates can be rejected without querying the database.
    """

    def __init__(self,
                 persist_directory: str = "./chroma_db",
                 collection_name: str = "research_documents",
                 embedding_model: str = "sentence-transformers/all-MiniLM-L6-v2"):
        """Open (or create) the persistent collection and load known hashes.

        Args:
            persist_directory: Directory where ChromaDB persists its data.
            collection_name: Name of the ChromaDB collection.
            embedding_model: HuggingFace embedding model name.
        """
        self.persist_directory = Path(persist_directory)
        self.persist_directory.mkdir(parents=True, exist_ok=True)

        print("🔧 Initialisation du système de mémoire vectorielle...")

        # Embedding model (CPU, normalised vectors).
        self.embeddings = HuggingFaceEmbeddings(
            model_name=embedding_model,
            model_kwargs={'device': 'cpu'},
            encode_kwargs={'normalize_embeddings': True}
        )

        # ChromaDB client persisted on disk.
        self.client = chromadb.PersistentClient(
            path=str(self.persist_directory),
            settings=Settings(
                anonymized_telemetry=False,
                allow_reset=True
            )
        )

        # Fetch the collection, creating it on first use.
        # NOTE(review): the exception raised for a missing collection appears
        # to depend on the chromadb version, hence `Exception` rather than a
        # narrower type -- tighten once the dependency is pinned. Unlike a
        # bare `except:`, this no longer swallows KeyboardInterrupt/SystemExit.
        try:
            self.collection = self.client.get_collection(collection_name)
        except Exception:
            self.collection = self.client.create_collection(
                name=collection_name,
                metadata={"hnsw:space": "cosine"}
            )
            print(f"✅ Nouvelle collection '{collection_name}' créée")
        else:
            print(f"✅ Collection '{collection_name}' récupérée ({self.collection.count()} documents)")

        # LangChain vector store over the same collection (used for queries).
        self.vectorstore = Chroma(
            client=self.client,
            collection_name=collection_name,
            embedding_function=self.embeddings
        )

        # Cache of content hashes for fast duplicate detection.
        self.content_hashes = set()
        self._load_existing_hashes()

    def _load_existing_hashes(self):
        """Populate the dedup cache from the hashes stored in the collection.

        Failures are reported and otherwise ignored (best effort): the manager
        still works, it just starts with an incomplete dedup cache.
        """
        try:
            results = self.collection.get(include=['metadatas'])
            for metadata in results['metadatas'] or []:
                # Entries stored without metadata come back as None.
                if metadata and 'content_hash' in metadata:
                    self.content_hashes.add(metadata['content_hash'])
            print(f"📋 {len(self.content_hashes)} hashes chargés pour déduplication")
        except Exception as e:
            print(f"⚠️ Erreur lors du chargement des hashes: {e}")

    def _compute_hash(self, content: str) -> str:
        """Return the MD5 hex digest of ``content`` (dedup key, not security)."""
        return hashlib.md5(content.encode('utf-8')).hexdigest()

    def is_duplicate(self, content: str) -> bool:
        """Return True if a document with identical content is already known."""
        return self._compute_hash(content) in self.content_hashes

    def add_documents(self,
                      documents: List[Dict[str, Any]],
                      source: str = "research",
                      check_duplicates: bool = True) -> Dict[str, int]:
        """Add documents to the vector store.

        Documents are embedded with ``self.embeddings`` -- the same model the
        query path uses -- instead of letting the collection fall back to its
        own default embedding function.

        Args:
            documents: Dicts with 'content' and optionally 'title' and 'url'.
            source: Origin of the documents (research, summary, synthesis).
            check_duplicates: Skip documents whose content is already stored
                or appears earlier in the same batch.

        Returns:
            Dict with 'added', 'skipped' (duplicates and empty documents) and
            'total_in_db'.
        """
        print(f"\n📥 Ajout de {len(documents)} documents (source: {source})...")

        skipped = 0
        texts: List[str] = []
        metadatas: List[Dict[str, Any]] = []
        ids: List[str] = []
        # Hashes of this batch; merged into the cache only after the write
        # succeeds so a failed insert does not poison future dedup checks.
        batch_hashes = set()

        for doc in documents:
            content = doc.get('content') or ''

            # Nothing to embed: an empty document carries no information.
            if not content.strip():
                skipped += 1
                continue

            content_hash = self._compute_hash(content)
            if check_duplicates and (content_hash in self.content_hashes
                                     or content_hash in batch_hashes):
                skipped += 1
                continue

            now = datetime.now()
            texts.append(content)
            metadatas.append({
                # `or` also covers explicit None values in the input dict.
                'title': doc.get('title') or 'Sans titre',
                'url': doc.get('url') or '',
                'source': source,
                'timestamp': now.isoformat(),
                'content_hash': content_hash,
                'word_count': len(content.split())
            })
            ids.append(f"{source}_{content_hash[:8]}_{now.timestamp()}")
            batch_hashes.add(content_hash)

        # Single batched write to ChromaDB.
        if texts:
            self.collection.add(
                documents=texts,
                embeddings=self.embeddings.embed_documents(texts),
                metadatas=metadatas,
                ids=ids
            )
            self.content_hashes.update(batch_hashes)

        added = len(texts)
        stats = {
            'added': added,
            'skipped': skipped,
            'total_in_db': self.collection.count()
        }

        print(f"✅ Ajoutés: {added} | Doublons ignorés: {skipped} | Total DB: {stats['total_in_db']}")
        return stats

    def semantic_search(self,
                        query: str,
                        k: int = 5,
                        filter_dict: Optional[Dict] = None) -> List[Tuple[Document, float]]:
        """Run a semantic search against the vector store.

        Args:
            query: Search query.
            k: Number of results to return.
            filter_dict: Metadata filter (e.g. {'source': 'research'}).

        Returns:
            List of (Document, score) tuples as returned by
            ``similarity_search_with_score``.
        """
        print(f"\n🔍 Recherche sémantique: '{query}' (top-{k})")

        results = self.vectorstore.similarity_search_with_score(
            query=query,
            k=k,
            filter=filter_dict
        )

        print(f"✅ {len(results)} résultats trouvés")
        return results

    def get_relevant_context(self,
                             query: str,
                             k: int = 3,
                             source_filter: Optional[str] = None) -> str:
        """Build a formatted context string from the best matches for a query.

        Args:
            query: Query text.
            k: Number of documents to retrieve.
            source_filter: Restrict to one source (research, summary, ...).

        Returns:
            Formatted context, or an empty string when nothing matches.
        """
        filter_dict = {"source": source_filter} if source_filter else None
        results = self.semantic_search(query, k=k, filter_dict=filter_dict)

        if not results:
            return ""

        # NOTE(review): the score is presumably a distance (lower = closer)
        # for this store, while the label reads "Pertinence" -- confirm.
        context_parts = [
            f"[Source {i} - Pertinence: {score:.2f}]\n"
            f"Titre: {doc.metadata.get('title', 'N/A')}\n"
            f"{doc.page_content[:500]}...\n"
            for i, (doc, score) in enumerate(results, 1)
        ]

        return "\n---\n".join(context_parts)

    def clear_old_documents(self, days: int = 30) -> int:
        """Delete documents older than ``days`` days.

        Documents whose timestamp is missing or unparseable are kept.

        Args:
            days: Retention period in days.

        Returns:
            Number of documents deleted.
        """
        print(f"\n🧹 Nettoyage des documents > {days} jours...")

        cutoff_date = datetime.now() - timedelta(days=days)

        results = self.collection.get(include=['metadatas'])
        ids_to_delete = []
        expired_hashes = set()
        surviving_hashes = set()

        for doc_id, metadata in zip(results['ids'], results['metadatas']):
            metadata = metadata or {}
            content_hash = metadata.get('content_hash')
            try:
                # The comparison stays inside the try: comparing a naive and
                # an aware datetime raises TypeError.
                expired = datetime.fromisoformat(metadata.get('timestamp', '')) < cutoff_date
            except (TypeError, ValueError):
                expired = False

            if expired:
                ids_to_delete.append(doc_id)
                if content_hash:
                    expired_hashes.add(content_hash)
            elif content_hash:
                surviving_hashes.add(content_hash)

        if ids_to_delete:
            self.collection.delete(ids=ids_to_delete)
            # Forget a hash only when no remaining document still carries it,
            # and only once the deletion has actually succeeded.
            self.content_hashes -= (expired_hashes - surviving_hashes)

        print(f"✅ {len(ids_to_delete)} documents supprimés")
        return len(ids_to_delete)
|
| 261 |
+
|
| 262 |
+
|
| 263 |
+
# ============================================================================
|
| 264 |
+
# GESTIONNAIRE DE MÉMOIRE D'AGENT
|
| 265 |
+
# ============================================================================
|
| 266 |
+
|
| 267 |
+
class AgentMemoryManager:
    """Manage conversation history and research/summary caches.

    State is kept in memory and persisted to a pickle file after every
    mutation.
    """

    def __init__(self,
                 memory_file: str = "./agent_memory.pkl",
                 max_history: int = 100,
                 compression_threshold: int = 50):
        """Initialise the manager and load any previously saved memory.

        Args:
            memory_file: Path of the file the memory is saved to.
            max_history: Maximum number of entries kept in the history.
            compression_threshold: History length from which cache pruning
                is triggered on each new conversation.
        """
        self.memory_file = Path(memory_file)
        self.max_history = max_history
        self.compression_threshold = compression_threshold

        # Data structures
        self.conversation_history = deque(maxlen=max_history)
        self.research_cache = {}  # topic -> result
        self.summary_cache = {}   # topic -> summary
        self.topic_keywords = {}  # topic -> keywords

        print("🧠 Initialisation du gestionnaire de mémoire d'agent...")
        self._load_memory()

    def _load_memory(self):
        """Load the memory from the pickle file, if it exists.

        Load errors are reported and leave the current (empty) state intact.
        """
        if not self.memory_file.exists():
            print("ℹ️ Nouvelle mémoire initialisée")
            return

        try:
            # SECURITY: unpickling executes arbitrary code if the file has
            # been tampered with. Only point memory_file at trusted locations.
            with open(self.memory_file, 'rb') as f:
                data = pickle.load(f)

            # Rebuild the deque so the *current* max_history applies; an
            # unpickled deque would keep the maxlen it was saved with.
            history = deque(data.get('conversation_history', ()), maxlen=self.max_history)
            research_cache = data.get('research_cache', {})
            summary_cache = data.get('summary_cache', {})
            topic_keywords = data.get('topic_keywords', {})
        except Exception as e:
            print(f"⚠️ Erreur lors du chargement de la mémoire: {e}")
            return

        # Assign only once everything was read, so a failure cannot leave a
        # half-loaded state behind.
        self.conversation_history = history
        self.research_cache = research_cache
        self.summary_cache = summary_cache
        self.topic_keywords = topic_keywords
        print(f"✅ Mémoire chargée: {len(self.conversation_history)} conversations, "
              f"{len(self.research_cache)} recherches en cache")

    def _save_memory(self):
        """Persist the memory to the pickle file (errors are only reported)."""
        try:
            data = {
                'conversation_history': self.conversation_history,
                'research_cache': self.research_cache,
                'summary_cache': self.summary_cache,
                'topic_keywords': self.topic_keywords
            }
            # Write to a sibling temp file and swap it in, so an interrupted
            # write cannot truncate the existing memory file.
            tmp_path = self.memory_file.with_name(self.memory_file.name + '.tmp')
            with open(tmp_path, 'wb') as f:
                pickle.dump(data, f)
            tmp_path.replace(self.memory_file)
        except Exception as e:
            print(f"⚠️ Erreur lors de la sauvegarde de la mémoire: {e}")

    def add_conversation(self, user_message: str, assistant_response: str, metadata: Optional[Dict] = None):
        """Append a user/assistant exchange to the history and persist it."""
        entry = {
            'timestamp': datetime.now().isoformat(),
            'user': user_message,
            'assistant': assistant_response,
            'metadata': metadata or {}
        }
        self.conversation_history.append(entry)

        if len(self.conversation_history) >= self.compression_threshold:
            # _compress_memory saves on its own; no second write needed.
            self._compress_memory()
        else:
            self._save_memory()

    def add_research_result(self, topic: str, result: Any, keywords: List[str]):
        """Cache a research result and its keywords under ``topic``."""
        self.research_cache[topic] = {
            'result': result,
            'timestamp': datetime.now().isoformat()
        }
        self.topic_keywords[topic] = keywords
        self._save_memory()

    def get_research_result(self, topic: str, max_age_hours: int = 24) -> Optional[Any]:
        """Return the cached result for ``topic``.

        Returns None when the topic is not cached or the entry is older than
        ``max_age_hours``.
        """
        cached = self.research_cache.get(topic)
        if cached is None:
            return None

        cached_time = datetime.fromisoformat(cached['timestamp'])
        if datetime.now() - cached_time > timedelta(hours=max_age_hours):
            print(f"ℹ️ Cache expiré pour '{topic}'")
            return None

        print(f"✅ Résultat récupéré du cache pour '{topic}'")
        return cached['result']

    def add_summary(self, topic: str, summary: str):
        """Cache a summary under ``topic`` and persist the memory."""
        self.summary_cache[topic] = {
            'summary': summary,
            'timestamp': datetime.now().isoformat()
        }
        self._save_memory()

    def get_conversation_context(self, n_last: int = 5) -> str:
        """Return a formatted excerpt of the last ``n_last`` conversations.

        Each message is truncated to 100 characters. Returns an empty string
        when there is no history or ``n_last`` is not positive.
        """
        # Guard needed because a slice of [-0:] would return the whole history.
        if n_last <= 0:
            return ""

        recent = list(self.conversation_history)[-n_last:]
        if not recent:
            return ""

        parts = ["Contexte des conversations récentes:\n"]
        for i, conv in enumerate(recent, 1):
            parts.append(f"\n[Conversation {i}]\n")
            parts.append(f"User: {conv['user'][:100]}...\n")
            parts.append(f"Assistant: {conv['assistant'][:100]}...\n")

        return "".join(parts)

    def _compress_memory(self):
        """Prune cached research results older than 7 days, then persist.

        Only the research cache and its keywords are pruned; the conversation
        history is bounded separately by the deque's maxlen.
        """
        print("🗜️ Compression de la mémoire...")

        cutoff = datetime.now() - timedelta(days=7)

        topics_to_remove = [
            topic for topic, data in self.research_cache.items()
            if datetime.fromisoformat(data['timestamp']) < cutoff
        ]

        for topic in topics_to_remove:
            del self.research_cache[topic]
            self.topic_keywords.pop(topic, None)

        print(f"✅ {len(topics_to_remove)} anciennes recherches supprimées")
        self._save_memory()

    def get_related_topics(self, topic: str, threshold: float = 0.5) -> List[str]:
        """Return cached topics textually similar to ``topic``, best first.

        Similarity is the case-insensitive ``SequenceMatcher`` ratio; only
        topics strictly above ``threshold`` are returned.
        """
        from difflib import SequenceMatcher

        wanted = topic.lower()
        related = []
        for cached_topic in self.research_cache:
            similarity = SequenceMatcher(None, wanted, cached_topic.lower()).ratio()
            if similarity > threshold:
                related.append((cached_topic, similarity))

        return [t for t, _ in sorted(related, key=lambda x: x[1], reverse=True)]

    def clear_all(self):
        """Reset the memory completely and persist the empty state."""
        print("🗑️ Réinitialisation complète de la mémoire...")
        self.conversation_history.clear()
        self.research_cache.clear()
        self.summary_cache.clear()
        self.topic_keywords.clear()
        self._save_memory()
        print("✅ Mémoire réinitialisée")
|
| 432 |
+
|
| 433 |
+
|
| 434 |
+
# ============================================================================
|
| 435 |
+
# GESTIONNAIRE INTÉGRÉ
|
| 436 |
+
# ============================================================================
|
| 437 |
+
|
| 438 |
+
class IntegratedMemorySystem:
    """Integrated memory combining the vector store and the agent memory."""

    def __init__(self):
        """Create both managers with their default settings."""
        self.vector_memory = VectorMemoryManager()
        self.agent_memory = AgentMemoryManager()
        print("✨ Système de mémoire intégré initialisé\n")

    def process_research_result(self,
                                topic: str,
                                extraction_result: Any,
                                summarization_result: Any,
                                global_synthesis: Any):
        """Store every artefact produced by one research run.

        Extracted documents, per-document summaries and the global synthesis
        text go to the vector store; the synthesis object itself is cached in
        the agent memory under ``topic``. Missing or empty parts are skipped.

        Args:
            topic: Research topic.
            extraction_result: Extraction output; its ``documents`` are read.
            summarization_result: Summaries output; its ``summaries`` are read.
            global_synthesis: Global synthesis; its
                ``final_report.formatted_outputs['text']`` is read.
        """
        print(f"\n💾 Stockage des résultats pour '{topic}'...")

        # `or []` tolerates both a missing attribute and an explicit None.
        documents = getattr(extraction_result, 'documents', None) or []
        summaries = getattr(summarization_result, 'summaries', None) or []

        # 1. Extracted documents -> vector store
        if documents:
            self.vector_memory.add_documents(
                [{'content': doc.content, 'title': doc.title, 'url': str(doc.url)}
                 for doc in documents],
                source='research'
            )

        # 2. Summaries -> vector store
        if summaries:
            self.vector_memory.add_documents(
                [{'content': summary.detailed_summary,
                  'title': summary.title,
                  'url': str(summary.url)}
                 for summary in summaries],
                source='summary'
            )

        # 3. Global synthesis -> vector store (only if there is text to store)
        report = getattr(global_synthesis, 'final_report', None) if global_synthesis else None
        formatted_outputs = getattr(report, 'formatted_outputs', None) or {}
        synthesis_text = formatted_outputs.get('text', '')
        if synthesis_text:
            self.vector_memory.add_documents([{
                'content': synthesis_text,
                'title': f"Synthèse: {topic}",
                'url': ''
            }], source='synthesis')

        # 4. Cache in the agent memory, with a few naive keywords: the first
        # ten whitespace-separated tokens taken from the opening 100
        # characters of up to three documents, de-duplicated in order.
        all_text = ' '.join((doc.content or '')[:100] for doc in documents[:3])
        keywords = list(dict.fromkeys(all_text.split()[:10]))

        self.agent_memory.add_research_result(topic, global_synthesis, keywords)

        print("✅ Tous les résultats stockés avec succès")

    def retrieve_context_for_query(self, query: str, use_cache: bool = True) -> Dict:
        """Gather every piece of stored context relevant to a query.

        Args:
            query: User query.
            use_cache: Look up a cached research result for the exact query.

        Returns:
            Dict with 'semantic_context', 'conversation_context',
            'cached_result' and 'related_topics'.
        """
        return {
            'semantic_context': self.vector_memory.get_relevant_context(query, k=3),
            'conversation_context': self.agent_memory.get_conversation_context(n_last=3),
            'cached_result': self.agent_memory.get_research_result(query) if use_cache else None,
            'related_topics': self.agent_memory.get_related_topics(query)
        }
|
| 536 |
+
|
| 537 |
+
|
| 538 |
+
# ============================================================================
|
| 539 |
+
# INITIALISATION GLOBALE
|
| 540 |
+
# ============================================================================
|
| 541 |
+
|
| 542 |
+
# Module-wide memory system instance, created at import time.
memory_system = IntegratedMemorySystem()

# Readiness banner: rule, message, rule -- one item per line.
print("=" * 60, "✅ SYSTÈME DE MÉMOIRE PRÊT", "=" * 60, sep="\n")
|
src/models/__init__.py
ADDED
|
@@ -0,0 +1,64 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Modèles de données Pydantic pour le système multi-agents.
|
| 3 |
+
"""
|
| 4 |
+
|
| 5 |
+
from .research_models import (
|
| 6 |
+
ResearchQuery,
|
| 7 |
+
SearchResult,
|
| 8 |
+
ResearchOutput
|
| 9 |
+
)
|
| 10 |
+
from .document_models import (
|
| 11 |
+
Document,
|
| 12 |
+
DocumentSummary,
|
| 13 |
+
SummarizationOutput,
|
| 14 |
+
KeyPoint,
|
| 15 |
+
Citation,
|
| 16 |
+
DocumentType
|
| 17 |
+
)
|
| 18 |
+
from .report_models import (
|
| 19 |
+
ReportSection,
|
| 20 |
+
Report,
|
| 21 |
+
ReportOutput,
|
| 22 |
+
Reference,
|
| 23 |
+
ReportFormat,
|
| 24 |
+
ReportMetadata
|
| 25 |
+
)
|
| 26 |
+
from .state_models import (
|
| 27 |
+
AgentState,
|
| 28 |
+
GraphState,
|
| 29 |
+
AgentType,
|
| 30 |
+
AgentStatus,
|
| 31 |
+
ProcessingStep,
|
| 32 |
+
WorkflowEvent
|
| 33 |
+
)
|
| 34 |
+
|
| 35 |
+
# Public API of the models package: every name listed here is imported
# above from one of the sibling *_models modules.
__all__ = [
    # Research models
    "ResearchQuery",
    "SearchResult",
    "ResearchOutput",

    # Document models
    "Document",
    "DocumentSummary",
    "SummarizationOutput",
    "KeyPoint",
    "Citation",
    "DocumentType",

    # Report models
    "ReportSection",
    "Report",
    "ReportOutput",
    "Reference",
    "ReportFormat",
    "ReportMetadata",

    # State models
    "AgentState",
    "GraphState",
    "AgentType",
    "AgentStatus",
    "ProcessingStep",
    "WorkflowEvent"
]
|
src/models/agent_models.py
ADDED
|
File without changes
|
src/models/document_models.py
ADDED
|
@@ -0,0 +1,232 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Modèles Pydantic pour l'agent Reader/Summarizer.
|
| 3 |
+
Définit les structures de données pour les documents et leurs résumés.
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
from typing import List, Optional, Dict, Any
|
| 7 |
+
from datetime import datetime
|
| 8 |
+
from pydantic import BaseModel, Field, HttpUrl
|
| 9 |
+
from enum import Enum
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
class DocumentType(str, Enum):
    """Supported document types."""
    # The `str` mixin makes each member a string equal to its value.
    ARTICLE = "article"
    BLOG_POST = "blog_post"
    ACADEMIC_PAPER = "academic_paper"
    NEWS = "news"
    REPORT = "report"
    OTHER = "other"
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
class Document(BaseModel):
    """
    Model for a document to be analysed.
    """
    # Required fields: title, source URL and full text.
    title: str = Field(..., description="Titre du document")
    url: HttpUrl = Field(..., description="URL source du document")
    content: str = Field(..., description="Contenu textuel complet du document")
    # Optional descriptive metadata.
    doc_type: DocumentType = Field(default=DocumentType.ARTICLE, description="Type de document")
    author: Optional[str] = Field(default=None, description="Auteur du document")
    published_date: Optional[datetime] = Field(default=None, description="Date de publication")
    source: Optional[str] = Field(default=None, description="Site ou publication source")
    # Non-negative; defaults to 0 and is not derived from `content` here.
    word_count: int = Field(default=0, ge=0, description="Nombre de mots dans le document")
    language: str = Field(default="fr", description="Langue du document (code ISO)")

    class Config:
        # Example payload attached to the generated JSON schema.
        json_schema_extra = {
            "example": {
                "title": "L'impact de l'IA sur le futur du travail",
                "url": "https://example.com/article-ia-travail",
                "content": "L'intelligence artificielle transforme rapidement...",
                "doc_type": "article",
                "author": "Marie Martin",
                "published_date": "2024-01-15T09:30:00Z",
                "source": "TechMag",
                "word_count": 1500,
                "language": "fr"
            }
        }
|
| 50 |
+
|
| 51 |
+
|
| 52 |
+
class ExtractionInput(BaseModel):
    """
    Input for the Content Extractor agent.
    """
    # At least one URL is required. `min_length` is the Pydantic v2 spelling
    # of the list-size constraint (the deprecated v1 name was `min_items`);
    # the models in this module already rely on the v2-only
    # `json_schema_extra` config key.
    urls: List[str] = Field(..., description="Liste des URLs à extraire", min_length=1)
    # Free-form filters applied to the extracted content (empty by default).
    content_filters: Optional[Dict[str, Any]] = Field(
        default_factory=dict,
        description="Filtres à appliquer au contenu extrait"
    )
    # Free-form extraction options (empty by default).
    extraction_options: Optional[Dict[str, Any]] = Field(
        default_factory=dict,
        description="Options d'extraction spécifiques"
    )

    class Config:
        # Example payload attached to the generated JSON schema.
        json_schema_extra = {
            "example": {
                "urls": [
                    "https://example.com/article1",
                    "https://example.com/article2.pdf"
                ],
                "content_filters": {
                    "min_content_length": 100,
                    "max_content_length": 10000,
                    "language": "fr",
                    "required_keywords": ["intelligence artificielle"]
                },
                "extraction_options": {
                    "timeout": 30,
                    "max_retries": 2
                }
            }
        }
|
| 85 |
+
|
| 86 |
+
|
| 87 |
+
class ExtractionResult(BaseModel):
    """
    Result of a content extraction run.
    """
    # Successfully extracted documents.
    documents: List[Document] = Field(..., description="Documents extraits avec succès")
    # Counters; each is validated as non-negative, but their mutual
    # consistency (total = successful + failed) is not enforced here.
    total_urls: int = Field(..., ge=0, description="Nombre total d'URLs traitées")
    successful_extractions: int = Field(..., ge=0, description="Nombre d'extractions réussies")
    failed_extractions: int = Field(..., ge=0, description="Nombre d'extractions échouées")
    failed_urls: List[str] = Field(default_factory=list, description="URLs qui ont échoué lors de l'extraction")
    # Execution time in seconds.
    execution_time: float = Field(..., ge=0, description="Temps d'exécution en secondes")
    # Free-form detailed statistics (empty by default).
    extraction_stats: Dict[str, Any] = Field(
        default_factory=dict,
        description="Statistiques détaillées de l'extraction"
    )

    class Config:
        # Example payload attached to the generated JSON schema
        # (it omits the optional `failed_urls` field).
        json_schema_extra = {
            "example": {
                "documents": [],
                "total_urls": 5,
                "successful_extractions": 4,
                "failed_extractions": 1,
                "execution_time": 12.5,
                "extraction_stats": {
                    "total_words": 5000,
                    "average_words_per_doc": 1250,
                    "doc_types": {"article": 3, "pdf": 1},
                    "languages": {"fr": 4}
                }
            }
        }
|
| 118 |
+
|
| 119 |
+
|
| 120 |
+
class KeyPoint(BaseModel):
    """
    Model for a key point extracted from a document.
    """
    title: str = Field(..., description="Titre du point clé")
    content: str = Field(..., description="Contenu détaillé du point")
    # Importance score, validated to lie within [0, 1].
    importance: float = Field(..., ge=0, le=1, description="Score d'importance (0-1)")
    category: Optional[str] = Field(default=None, description="Catégorie du point clé")

    class Config:
        # Example payload attached to the generated JSON schema.
        json_schema_extra = {
            "example": {
                "title": "Automatisation des tâches répétitives",
                "content": "L'IA permet d'automatiser 30% des tâches actuelles...",
                "importance": 0.9,
                "category": "automatisation"
            }
        }
|
| 138 |
+
|
| 139 |
+
|
| 140 |
+
class Citation(BaseModel):
    """
    Model for an important quotation extracted from a document.
    """
    # Only the quoted text is required; the rest is optional context.
    text: str = Field(..., description="Texte de la citation")
    author: Optional[str] = Field(default=None, description="Auteur de la citation")
    context: Optional[str] = Field(default=None, description="Contexte de la citation")
    page_number: Optional[int] = Field(default=None, description="Numéro de page (si applicable)")

    class Config:
        # Example payload attached to the generated JSON schema.
        json_schema_extra = {
            "example": {
                "text": "L'IA ne remplacera pas les humains, elle augmentera leurs capacités",
                "author": "Dr. Jean Dupont",
                "context": "Conclusion de l'étude sur l'IA et l'emploi",
                "page_number": None
            }
        }
|
| 158 |
+
|
| 159 |
+
|
| 160 |
+
class DocumentSummary(BaseModel):
    """Summary of one document, with key points, citations and analysis metadata."""

    # --- Identification of the source document ---
    document_id: str = Field(..., description="Identifiant unique du document")
    title: str = Field(..., description="Titre du document original")
    url: HttpUrl = Field(..., description="URL du document original")

    # --- Main summary ---
    executive_summary: str = Field(
        ...,
        description="Résumé exécutif (2-3 phrases)",
    )
    detailed_summary: str = Field(
        ...,
        description="Résumé détaillé (1-2 paragraphes)",
    )

    # --- Key points ---
    key_points: List[KeyPoint] = Field(
        default_factory=list,
        description="Points clés extraits",
    )
    main_arguments: List[str] = Field(
        default_factory=list,
        description="Arguments principaux",
    )

    # --- Citations and data ---
    important_citations: List[Citation] = Field(
        default_factory=list,
        description="Citations importantes",
    )
    statistics: List[str] = Field(
        default_factory=list,
        description="Statistiques mentionnées",
    )

    # --- Analysis metadata ---
    sentiment: Optional[str] = Field(
        default=None,
        description="Sentiment général (positif/neutre/négatif)",
    )
    bias_assessment: Optional[str] = Field(
        default=None,
        description="Évaluation des biais potentiels",
    )
    # Credibility score; validation constrains it to [0, 1] when provided.
    credibility_score: Optional[float] = Field(
        default=None,
        ge=0,
        le=1,
        description="Score de crédibilité (0-1)",
    )

    # --- Processing information ---
    # Defaults to the moment the instance is created.
    processed_at: datetime = Field(
        default_factory=datetime.now,
        description="Horodatage du traitement",
    )
    processing_time: float = Field(
        default=0.0,
        ge=0,
        description="Temps de traitement en secondes",
    )

    class Config:
        # Sample payload exposed through the generated JSON schema.
        json_schema_extra = {
            "example": {
                "document_id": "doc_123",
                "title": "L'impact de l'IA sur le futur du travail",
                "url": "https://example.com/article",
                "executive_summary": "L'IA transformera 60% des emplois d'ici 2030...",
                "detailed_summary": "Cette étude approfondie examine...",
                "key_points": [],
                "sentiment": "neutre",
                "credibility_score": 0.8,
                "processed_at": "2024-01-15T10:15:00Z",
                "processing_time": 5.2,
            }
        }
|
| 204 |
+
|
| 205 |
+
|
| 206 |
+
class SummarizationOutput(BaseModel):
    """Complete output of the Reader/Summarizer agent."""

    # --- Per-document results and aggregate counters ---
    summaries: List[DocumentSummary] = Field(
        ...,
        description="Liste des résumés de documents",
    )
    total_documents: int = Field(
        ...,
        ge=0,
        description="Nombre total de documents traités",
    )
    total_processing_time: float = Field(
        ...,
        ge=0,
        description="Temps total de traitement",
    )
    # Mean credibility; validation constrains it to [0, 1] when provided.
    average_credibility: Optional[float] = Field(
        default=None,
        ge=0,
        le=1,
        description="Score de crédibilité moyen",
    )

    # --- Cross-document analysis ---
    common_themes: List[str] = Field(
        default_factory=list,
        description="Thèmes récurrents identifiés",
    )
    consensus_points: List[str] = Field(
        default_factory=list,
        description="Points de consensus entre les sources",
    )
    conflicting_views: List[str] = Field(
        default_factory=list,
        description="Points de vue conflictuels",
    )

    # Defaults to the moment the instance is created.
    timestamp: datetime = Field(
        default_factory=datetime.now,
        description="Horodatage de l'analyse",
    )

    class Config:
        # Sample payload exposed through the generated JSON schema.
        json_schema_extra = {
            "example": {
                "summaries": [],
                "total_documents": 5,
                "total_processing_time": 25.6,
                "average_credibility": 0.75,
                "common_themes": ["automatisation", "formation", "adaptation"],
                "timestamp": "2024-01-15T10:30:00Z",
            }
        }
|
src/models/report_models.py
ADDED
|
@@ -0,0 +1,221 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Modèles Pydantic pour l'agent Writer/Reporter.
|
| 3 |
+
Définit les structures de données pour la génération de rapports.
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
from typing import List, Optional, Dict, Any
|
| 7 |
+
from datetime import datetime
|
| 8 |
+
from pydantic import BaseModel, Field
|
| 9 |
+
from enum import Enum
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
class ReportFormat(str, Enum):
    """Supported report output formats (members are also plain strings)."""

    MARKDOWN = "markdown"
    PDF = "pdf"
    HTML = "html"
    DOCX = "docx"
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
class SectionType(str, Enum):
    """Kinds of sections a report can contain (members are also plain strings)."""

    INTRODUCTION = "introduction"
    EXECUTIVE_SUMMARY = "executive_summary"
    MAIN_FINDINGS = "main_findings"
    DETAILED_ANALYSIS = "detailed_analysis"
    CONCLUSIONS = "conclusions"
    RECOMMENDATIONS = "recommendations"
    BIBLIOGRAPHY = "bibliography"
    APPENDIX = "appendix"
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
class Reference(BaseModel):
    """A bibliographic reference to a source."""

    # Required identification of the source.
    title: str = Field(..., description="Titre de la source")
    url: str = Field(..., description="URL de la source")

    # Optional bibliographic details.
    author: Optional[str] = Field(
        default=None,
        description="Auteur de la source",
    )
    published_date: Optional[datetime] = Field(
        default=None,
        description="Date de publication",
    )
    source: Optional[str] = Field(
        default=None,
        description="Publication ou site source",
    )

    # Defaults to the moment the instance is created.
    accessed_date: datetime = Field(
        default_factory=datetime.now,
        description="Date d'accès",
    )

    def to_citation(self, style: str = "apa") -> str:
        """
        Render this reference as a formatted citation string.

        Args:
            style: Citation style name, compared case-insensitively. Only
                "apa" has a dedicated layout.

        Returns:
            For "apa": the available parts (author, publication year in
            parentheses, title, source, retrieval URL) joined with ". " and
            terminated by a period. For any other style: "<title> - <url>".
        """
        # Any style other than APA falls back to the minimal layout.
        if style.lower() != "apa":
            return f"{self.title} - {self.url}"

        fragments = []
        if self.author:
            fragments.append(f"{self.author}")
        if self.published_date:
            fragments.append(f"({self.published_date.year})")
        fragments.append(f"{self.title}")
        if self.source:
            fragments.append(f"{self.source}")
        fragments.append(f"Récupéré de {self.url}")

        joined = ". ".join(fragments)
        return joined + "."

    class Config:
        # Sample payload exposed through the generated JSON schema.
        json_schema_extra = {
            "example": {
                "title": "L'IA et l'emploi : défis et opportunités",
                "url": "https://example.com/article",
                "author": "Dr. Marie Dubois",
                "published_date": "2024-01-10T00:00:00Z",
                "source": "Revue Technologique",
                "accessed_date": "2024-01-15T10:00:00Z",
            }
        }
|
| 71 |
+
|
| 72 |
+
|
| 73 |
+
class ReportSection(BaseModel):
    """One section of a report; sections can nest through `subsections`."""

    title: str = Field(..., description="Titre de la section")
    content: str = Field(
        ...,
        description="Contenu de la section en markdown",
    )
    section_type: SectionType = Field(..., description="Type de section")

    # Self-referencing field, declared with a string forward reference.
    subsections: List['ReportSection'] = Field(
        default_factory=list,
        description="Sous-sections",
    )
    references: List[Reference] = Field(
        default_factory=list,
        description="Références citées dans cette section",
    )
    order: int = Field(
        default=0,
        description="Ordre d'affichage de la section",
    )

    class Config:
        # Sample payload exposed through the generated JSON schema.
        json_schema_extra = {
            "example": {
                "title": "Introduction",
                "content": "L'intelligence artificielle transforme rapidement...",
                "section_type": "introduction",
                "subsections": [],
                "references": [],
                "order": 1,
            }
        }
|
| 95 |
+
|
| 96 |
+
|
| 97 |
+
class ReportMetadata(BaseModel):
    """Descriptive metadata attached to a report."""

    # --- Identification ---
    title: str = Field(..., description="Titre du rapport")
    subtitle: Optional[str] = Field(
        default=None,
        description="Sous-titre du rapport",
    )
    author: str = Field(
        default="AI Research Assistant",
        description="Auteur du rapport",
    )
    # Defaults to the moment the instance is created.
    creation_date: datetime = Field(
        default_factory=datetime.now,
        description="Date de création",
    )
    version: str = Field(default="1.0", description="Version du rapport")

    # --- Research information ---
    research_topic: str = Field(
        ...,
        description="Sujet de recherche original",
    )
    sources_count: int = Field(
        default=0,
        ge=0,
        description="Nombre de sources utilisées",
    )

    # --- Tags and classification ---
    keywords: List[str] = Field(
        default_factory=list,
        description="Mots-clés du rapport",
    )
    categories: List[str] = Field(
        default_factory=list,
        description="Catégories du rapport",
    )

    class Config:
        # Sample payload exposed through the generated JSON schema.
        json_schema_extra = {
            "example": {
                "title": "Impact de l'Intelligence Artificielle sur l'Emploi",
                "subtitle": "Analyse des tendances actuelles et perspectives d'avenir",
                "author": "AI Research Assistant",
                "research_topic": "impact de l'IA sur l'emploi",
                "sources_count": 8,
                "keywords": ["IA", "emploi", "automatisation"],
                "categories": ["technologie", "économie"],
            }
        }
|
| 127 |
+
|
| 128 |
+
|
| 129 |
+
class Report(BaseModel):
    """
    Complete research report: metadata, sections and bibliography.

    Also carries output-format configuration and two cached statistics
    (`word_count`, `reading_time_minutes`) with helpers to compute them.
    """
    metadata: ReportMetadata = Field(..., description="Métadonnées du rapport")
    sections: List[ReportSection] = Field(..., description="Sections du rapport")
    bibliography: List[Reference] = Field(..., description="Bibliographie complète")

    # Formatting configuration
    format_config: Dict[str, Any] = Field(
        default_factory=dict,
        description="Configuration de formatage spécifique au format de sortie"
    )

    # Report statistics
    word_count: int = Field(default=0, ge=0, description="Nombre de mots total")
    reading_time_minutes: int = Field(default=0, ge=0, description="Temps de lecture estimé en minutes")

    def calculate_word_count(self) -> int:
        """
        Count the whitespace-separated words of the whole report.

        Returns:
            Total number of words in the `content` of every section and of
            all nested subsections, at any depth. The result is returned
            only; `self.word_count` is not modified here.
        """
        def count_words(sections) -> int:
            # Words of each section plus, recursively, of its subsections.
            return sum(
                len(section.content.split()) + count_words(section.subsections)
                for section in sections
            )

        return count_words(self.sections)

    def calculate_reading_time(self, words_per_minute: int = 200) -> int:
        """
        Estimate the reading time of the report in whole minutes.

        If `self.word_count` is 0 it is first populated from
        `calculate_word_count()` (this method therefore mutates the model).

        Args:
            words_per_minute: Assumed reading speed; must be positive.

        Returns:
            `word_count // words_per_minute`, but never less than 1.

        Raises:
            ValueError: If `words_per_minute` is zero or negative.
        """
        # Reject non-positive speeds explicitly: 0 would otherwise raise a
        # ZeroDivisionError and a negative value would silently yield 1.
        if words_per_minute <= 0:
            raise ValueError("words_per_minute must be a positive integer")

        if self.word_count == 0:
            self.word_count = self.calculate_word_count()
        return max(1, self.word_count // words_per_minute)

    class Config:
        # Sample payload exposed through the generated JSON schema.
        json_schema_extra = {
            "example": {
                "metadata": {
                    "title": "Impact de l'IA sur l'Emploi",
                    "research_topic": "impact de l'IA sur l'emploi",
                    "sources_count": 5
                },
                "sections": [],
                "bibliography": [],
                "word_count": 2500,
                "reading_time_minutes": 12
            }
        }
|
| 182 |
+
|
| 183 |
+
|
| 184 |
+
class ReportOutput(BaseModel):
    """Output of the Writer/Reporter agent."""

    # --- Generated artefact ---
    report: Report = Field(..., description="Le rapport généré")
    output_format: ReportFormat = Field(
        ...,
        description="Format de sortie demandé",
    )
    file_path: Optional[str] = Field(
        default=None,
        description="Chemin du fichier généré",
    )

    # --- Generation information ---
    generation_time: float = Field(
        default=0.0,
        ge=0,
        description="Temps de génération en secondes",
    )
    llm_calls: int = Field(
        default=0,
        ge=0,
        description="Nombre d'appels au LLM",
    )

    # --- Report quality (both constrained to [0, 1] when provided) ---
    quality_score: Optional[float] = Field(
        default=None,
        ge=0,
        le=1,
        description="Score de qualité estimé",
    )
    completeness_score: Optional[float] = Field(
        default=None,
        ge=0,
        le=1,
        description="Score de complétude",
    )

    # Defaults to the moment the instance is created.
    timestamp: datetime = Field(
        default_factory=datetime.now,
        description="Horodatage de la génération",
    )

    class Config:
        # Sample payload exposed through the generated JSON schema.
        json_schema_extra = {
            "example": {
                "report": {
                    "metadata": {
                        "title": "Impact de l'IA sur l'Emploi",
                    },
                },
                "output_format": "markdown",
                "file_path": "./output/rapport_ia_emploi.md",
                "generation_time": 15.3,
                "llm_calls": 3,
                "quality_score": 0.85,
                "timestamp": "2024-01-15T11:00:00Z",
            }
        }
|
| 218 |
+
|
| 219 |
+
|
| 220 |
+
# Rebuild the model so the self-referencing 'ReportSection' forward reference is resolved
|
| 221 |
+
ReportSection.model_rebuild()
|
src/models/research_models.py
ADDED
|
@@ -0,0 +1,86 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Modèles Pydantic pour l'agent Researcher.
|
| 3 |
+
Définit les structures de données pour les requêtes de recherche et les résultats.
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
from typing import List, Optional, Dict, Any
|
| 7 |
+
from datetime import datetime
|
| 8 |
+
from pydantic import BaseModel, Field, HttpUrl
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
# TODO: go through the LLM to retrieve the keywords
|
| 12 |
+
class ResearchQuery(BaseModel):
    """A search request handed to the Researcher agent."""

    # Main subject of the research (required).
    topic: str = Field(..., description="Le sujet de recherche principal")
    keywords: List[str] = Field(
        default_factory=list,
        description="Mots-clés spécifiques à rechercher",
    )
    # Result cap; validation constrains it to the range [1, 20].
    max_results: int = Field(
        default=5,
        ge=1,
        le=20,
        description="Nombre maximum de résultats à retourner",
    )
    # Free-form string: the 'basic'/'advanced' values are documented only,
    # not enforced by validation.
    search_depth: str = Field(
        default="basic",
        description="Profondeur de la recherche: 'basic' ou 'advanced'",
    )
    date_range: Optional[str] = Field(
        default=None,
        description="Période de recherche (ex: 'last_year', 'last_month')",
    )

    class Config:
        # Sample payload exposed through the generated JSON schema.
        json_schema_extra = {
            "example": {
                "topic": "impact de l'intelligence artificielle sur l'emploi",
                "keywords": ["IA", "automatisation", "marché du travail"],
                "max_results": 5,
                "search_depth": "basic",
                # TODO: integrate year handling in the research agent as well
                "date_range": "last_year",
            }
        }
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
class SearchResult(BaseModel):
    """A single search hit."""

    # --- Required fields ---
    title: str = Field(..., description="Titre de l'article ou de la page")
    url: HttpUrl = Field(..., description="URL de la source")
    snippet: str = Field(
        ...,
        description="Extrait ou résumé court du contenu",
    )

    # --- Optional details ---
    published_date: Optional[datetime] = Field(
        default=None,
        description="Date de publication",
    )
    author: Optional[str] = Field(
        default=None,
        description="Auteur de l'article",
    )
    source: Optional[str] = Field(
        default=None,
        description="Site source (ex: 'lemonde.fr')",
    )
    # Relevance score; validation constrains it to [0, 1] when provided.
    score: Optional[float] = Field(
        default=None,
        ge=0,
        le=1,
        description="Score de pertinence (0-1)",
    )
    tags: List[str] = Field(
        default_factory=list,
        description="Tags ou catégories associées",
    )

    class Config:
        # Sample payload exposed through the generated JSON schema.
        json_schema_extra = {
            "example": {
                "title": "L'IA transforme le marché de l'emploi",
                "url": "https://example.com/article",
                "snippet": "Une étude récente montre que l'intelligence artificielle...",
                "published_date": "2024-01-15T10:00:00Z",
                "author": "Jean Dupont",
                "source": "example.com",
                "score": 0.85,
                "tags": ["technologie", "emploi"],
            }
        }
|
| 60 |
+
|
| 61 |
+
|
| 62 |
+
class ResearchOutput(BaseModel):
    """Complete output of the Researcher agent."""

    # The query that produced these results.
    query: ResearchQuery = Field(..., description="La requête originale")
    results: List[SearchResult] = Field(
        ...,
        description="Liste des résultats trouvés",
    )
    total_found: int = Field(
        ...,
        ge=0,
        description="Nombre total de résultats trouvés",
    )
    search_time: float = Field(
        ...,
        ge=0,
        description="Temps de recherche en secondes",
    )
    search_engine: str = Field(
        ...,
        description="Moteur de recherche utilisé (ex: 'tavily', 'serper')",
    )
    # Defaults to the moment the instance is created.
    timestamp: datetime = Field(
        default_factory=datetime.now,
        description="Horodatage de la recherche",
    )

    class Config:
        # Sample payload exposed through the generated JSON schema.
        json_schema_extra = {
            "example": {
                "query": {
                    "topic": "impact de l'IA sur l'emploi",
                    "max_results": 5,
                },
                "results": [],
                "total_found": 15,
                "search_time": 2.3,
                "search_engine": "tavily",
                "timestamp": "2024-01-15T10:00:00Z",
            }
        }
|
src/models/state_models.py
ADDED
|
@@ -0,0 +1,212 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Modèles d'état pour l'orchestration LangGraph.
|
| 3 |
+
Définit l'état global du système et les états des agents.
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
from typing import List, Optional, Dict, Any, Union
|
| 7 |
+
from datetime import datetime
|
| 8 |
+
from pydantic import BaseModel, Field
|
| 9 |
+
from enum import Enum
|
| 10 |
+
|
| 11 |
+
from .research_models import ResearchQuery, ResearchOutput
|
| 12 |
+
from .document_models import SummarizationOutput
|
| 13 |
+
from .report_models import ReportOutput
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
class AgentType(str, Enum):
    """Kinds of agents in the system (members are also plain strings)."""

    RESEARCHER = "researcher"
    CONTENT_EXTRACTOR = "content_extractor"
    READER = "reader"
    WRITER = "writer"
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
class AgentStatus(str, Enum):
    """Possible statuses of an agent (members are also plain strings)."""

    IDLE = "idle"
    WORKING = "working"
    COMPLETED = "completed"
    ERROR = "error"
    TIMEOUT = "timeout"
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
class ProcessingStep(str, Enum):
    """Steps of the research process (members are also plain strings)."""

    INIT = "init"
    RESEARCH = "research"
    READING = "reading"
    WRITING = "writing"
    COMPLETED = "completed"
    ERROR = "error"
|
| 41 |
+
|
| 42 |
+
|
| 43 |
+
class AgentState(BaseModel):
    """
    Execution state of a single agent.

    Holds the agent's status, timing information, error/retry bookkeeping
    and free-form metadata, plus helpers that move it between statuses.
    """
    agent_type: AgentType = Field(..., description="Type de l'agent")
    status: AgentStatus = Field(default=AgentStatus.IDLE, description="Statut actuel")

    # Timing information
    start_time: Optional[datetime] = Field(default=None, description="Heure de début d'exécution")
    end_time: Optional[datetime] = Field(default=None, description="Heure de fin d'exécution")
    duration: Optional[float] = Field(default=None, description="Durée d'exécution en secondes")

    # Error handling
    error_message: Optional[str] = Field(default=None, description="Message d'erreur si applicable")
    retry_count: int = Field(default=0, ge=0, description="Nombre de tentatives")
    max_retries: int = Field(default=3, ge=0, description="Nombre maximum de tentatives")

    # Agent-specific metadata
    metadata: Dict[str, Any] = Field(default_factory=dict, description="Données spécifiques à l'agent")

    def start_execution(self):
        """Mark the agent as WORKING and record the start time."""
        self.status = AgentStatus.WORKING
        self.start_time = datetime.now()
        # Clear the timing results of any previous run so that a restarted
        # agent does not expose a stale end time or duration.
        self.end_time = None
        self.duration = None

    def _finish(self, status: AgentStatus) -> None:
        """Set the final status, stamp `end_time` and compute `duration`."""
        self.status = status
        # Use the same timezone awareness as `start_time`: subtracting a
        # naive datetime from an aware one (or vice versa) raises TypeError.
        tzinfo = self.start_time.tzinfo if self.start_time else None
        self.end_time = datetime.now(tzinfo)
        if self.start_time:
            self.duration = (self.end_time - self.start_time).total_seconds()

    def complete_execution(self):
        """Mark the execution as successfully COMPLETED and record timing."""
        self._finish(AgentStatus.COMPLETED)

    def mark_error(self, error_message: str):
        """
        Mark the agent as being in ERROR and record timing.

        Args:
            error_message: Description of the failure, stored on the state.
        """
        self.error_message = error_message
        self._finish(AgentStatus.ERROR)

    class Config:
        # Sample payload exposed through the generated JSON schema.
        json_schema_extra = {
            "example": {
                "agent_type": "researcher",
                "status": "completed",
                "start_time": "2024-01-15T10:00:00Z",
                "end_time": "2024-01-15T10:02:30Z",
                "duration": 150.0,
                "retry_count": 0,
                "metadata": {"search_engine": "tavily"}
            }
        }
|
| 96 |
+
|
| 97 |
+
|
| 98 |
+
class GraphState(BaseModel):
    """
    Global state of the LangGraph graph.

    Contains all the data shared between the agents: the original query,
    per-agent states, each agent's output, timing, configuration and errors.
    """
    # Session identification
    session_id: str = Field(..., description="Identifiant unique de la session")
    current_step: ProcessingStep = Field(default=ProcessingStep.INIT, description="Étape actuelle du processus")

    # Initial query
    original_query: Optional[ResearchQuery] = Field(default=None, description="Requête de recherche originale")

    # Agent states
    # NOTE(review): the default mapping has no entry for
    # AgentType.CONTENT_EXTRACTOR although the enum defines it - confirm
    # whether that agent is meant to be tracked here.
    agents: Dict[AgentType, AgentState] = Field(
        default_factory=lambda: {
            AgentType.RESEARCHER: AgentState(agent_type=AgentType.RESEARCHER),
            AgentType.READER: AgentState(agent_type=AgentType.READER),
            AgentType.WRITER: AgentState(agent_type=AgentType.WRITER)
        },
        description="État de chaque agent"
    )

    # Data shared between agents
    research_output: Optional[ResearchOutput] = Field(default=None, description="Résultats de recherche")
    summarization_output: Optional[SummarizationOutput] = Field(default=None, description="Résultats de synthèse")
    report_output: Optional[ReportOutput] = Field(default=None, description="Rapport final")

    # Global metadata
    start_time: datetime = Field(default_factory=datetime.now, description="Heure de début du processus")
    end_time: Optional[datetime] = Field(default=None, description="Heure de fin du processus")
    total_duration: Optional[float] = Field(default=None, description="Durée totale en secondes")

    # Configuration and parameters
    config: Dict[str, Any] = Field(default_factory=dict, description="Configuration du processus")
    user_preferences: Dict[str, Any] = Field(default_factory=dict, description="Préférences utilisateur")

    # Global error handling
    global_errors: List[str] = Field(default_factory=list, description="Erreurs globales du processus")
    is_successful: bool = Field(default=False, description="Indique si le processus s'est terminé avec succès")

    # Debugging information
    debug_info: Dict[str, Any] = Field(default_factory=dict, description="Informations de débogage")

    def get_current_agent(self) -> Optional[AgentType]:
        """Return the first agent whose status is WORKING, or None."""
        for agent_type, agent_state in self.agents.items():
            if agent_state.status == AgentStatus.WORKING:
                return agent_type
        return None

    def is_agent_completed(self, agent_type: AgentType) -> bool:
        """
        Tell whether the given agent has finished successfully.

        Args:
            agent_type: The agent to look up in `agents`.

        Returns:
            True if the agent is registered and its status is COMPLETED;
            False otherwise, including when the agent has no entry in
            `agents` (instead of raising KeyError).
        """
        agent_state = self.agents.get(agent_type)
        return agent_state is not None and agent_state.status == AgentStatus.COMPLETED

    def all_agents_completed(self) -> bool:
        """Return True if every registered agent has status COMPLETED."""
        return all(
            agent.status == AgentStatus.COMPLETED
            for agent in self.agents.values()
        )

    def has_errors(self) -> bool:
        """Return True if any global error is recorded or any agent is in ERROR."""
        return (
            len(self.global_errors) > 0 or
            any(agent.status == AgentStatus.ERROR for agent in self.agents.values())
        )

    def complete_process(self):
        """
        Mark the whole process as finished.

        Stamps `end_time`, computes `total_duration`, sets `current_step`
        to COMPLETED and sets `is_successful` to the absence of errors.
        """
        # Match the timezone awareness of `start_time`: subtracting a naive
        # datetime from an aware one (or vice versa) raises TypeError.
        self.end_time = datetime.now(self.start_time.tzinfo)
        self.total_duration = (self.end_time - self.start_time).total_seconds()
        self.current_step = ProcessingStep.COMPLETED
        self.is_successful = not self.has_errors()

    def add_global_error(self, error_message: str):
        """
        Record a global error and move the process to the ERROR step.

        Args:
            error_message: Description of the error to append.
        """
        self.global_errors.append(error_message)
        self.current_step = ProcessingStep.ERROR

    class Config:
        # Sample payload exposed through the generated JSON schema.
        json_schema_extra = {
            "example": {
                "session_id": "session_123",
                "current_step": "research",
                "original_query": {
                    "topic": "impact de l'IA sur l'emploi"
                },
                "start_time": "2024-01-15T10:00:00Z",
                "is_successful": False,
                "global_errors": []
            }
        }
|
| 191 |
+
|
| 192 |
+
|
| 193 |
+
class WorkflowEvent(BaseModel):
    """An event emitted within the LangGraph workflow."""

    event_id: str = Field(
        ...,
        description="Identifiant unique de l'événement",
    )
    event_type: str = Field(..., description="Type d'événement")
    # Agent the event relates to, if any.
    agent_type: Optional[AgentType] = Field(
        default=None,
        description="Agent concerné",
    )
    # Defaults to the moment the instance is created.
    timestamp: datetime = Field(
        default_factory=datetime.now,
        description="Horodatage de l'événement",
    )
    # Arbitrary payload attached to the event.
    data: Dict[str, Any] = Field(
        default_factory=dict,
        description="Données associées à l'événement",
    )

    class Config:
        # Sample payload exposed through the generated JSON schema.
        json_schema_extra = {
            "example": {
                "event_id": "evt_001",
                "event_type": "agent_started",
                "agent_type": "researcher",
                "timestamp": "2024-01-15T10:00:00Z",
                "data": {"query": "impact IA emploi"},
            }
        }
|
src/models/synthesis_models.py
ADDED
|
@@ -0,0 +1,306 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Modèles Pydantic pour l'agent Global Synthesizer.
|
| 3 |
+
Définit les structures de données pour la synthèse finale et le rapport global.
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
from typing import List, Optional, Dict, Any
|
| 7 |
+
from datetime import datetime
|
| 8 |
+
from pydantic import BaseModel, Field
|
| 9 |
+
from enum import Enum
|
| 10 |
+
|
| 11 |
+
from src.models.document_models import DocumentSummary, SummarizationOutput
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
class ReportType(str, Enum):
    """Kinds of final synthesis report.

    The ``str`` mix-in makes every member compare equal to its plain string
    value.
    """

    EXECUTIVE = "executive"  # short executive report
    DETAILED = "detailed"  # complete, detailed report
    ACADEMIC = "academic"  # academic-style report
    BUSINESS = "business"  # business-oriented report
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
class ReportFormat(str, Enum):
    """Output formats in which a report can be rendered.

    The ``str`` mix-in makes every member compare equal to its plain string
    value.
    """

    MARKDOWN = "markdown"
    HTML = "html"
    TEXT = "text"
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
class GlobalSynthesisInput(BaseModel):
    """Input payload for the Global Synthesizer agent.

    Report settings may be passed as ordinary keyword arguments or bundled in
    ``synthesis_options``. Recognised keys found in ``synthesis_options`` are
    copied onto the matching top-level fields before validation, overriding a
    keyword argument of the same name.
    """

    summarization_output: SummarizationOutput = Field(
        ...,
        description="Sortie complète de l'agent Summarizer avec tous les résumés"
    )
    original_topic: str = Field(
        ...,
        description="Sujet de recherche original"
    )
    synthesis_options: Optional[Dict[str, Any]] = Field(
        default_factory=dict,
        description="Options de configuration pour la synthèse"
    )

    # Configurable options with their defaults
    report_type: ReportType = Field(
        default=ReportType.DETAILED,
        description="Type de rapport à générer"
    )
    report_format: ReportFormat = Field(
        default=ReportFormat.MARKDOWN,
        description="Format de sortie du rapport"
    )
    include_methodology: bool = Field(
        default=True,
        description="Inclure la section méthodologie"
    )
    include_sources: bool = Field(
        default=True,
        description="Inclure les références des sources"
    )
    include_limitations: bool = Field(
        default=True,
        description="Inclure les limitations de l'analyse"
    )
    max_report_length: int = Field(
        default=5000,
        description="Longueur maximale du rapport en mots"
    )
    target_audience: str = Field(
        default="general",
        description="Audience cible (general, business, academic, policy_makers)"
    )

    def __init__(self, **data):
        """Promote recognised ``synthesis_options`` entries, then validate.

        Args:
            **data: Field values. ``synthesis_options`` may be omitted, ``None``
                or a dict of option overrides.
        """
        options = data.get('synthesis_options')

        # ``synthesis_options`` is Optional, so ``None`` must not be probed
        # with ``in``. Non-dict values are left untouched so that pydantic
        # reports them as a validation error instead of a raw TypeError.
        if isinstance(options, dict):
            promoted_fields = (
                'report_type',
                'report_format',
                'include_methodology',
                'include_sources',
                'include_limitations',
                'max_report_length',
                'target_audience',
            )
            for field_name in promoted_fields:
                if field_name in options:
                    data[field_name] = options[field_name]

        super().__init__(**data)

    class Config:
        json_schema_extra = {
            "example": {
                "original_topic": "impact de l'intelligence artificielle sur l'emploi",
                "synthesis_options": {
                    "report_type": "detailed",
                    "report_format": "markdown",
                    "include_methodology": True,
                    "include_sources": True,
                    "target_audience": "business"
                }
            }
        }
|
| 111 |
+
|
| 112 |
+
|
| 113 |
+
class ExecutiveSummary(BaseModel):
    """Executive summary of the final report.

    Only ``summary_text`` is required; the three list fields default to empty
    lists.
    """

    key_findings: List[str] = Field(default_factory=list, description="3-5 conclusions principales")
    main_insights: List[str] = Field(default_factory=list, description="Insights et découvertes principales")
    recommendations: List[str] = Field(default_factory=list, description="Recommandations basées sur l'analyse")
    summary_text: str = Field(..., description="Texte de synthèse exécutive (2-3 paragraphes)")
|
| 132 |
+
|
| 133 |
+
|
| 134 |
+
class ReportSection(BaseModel):
    """One section of the report, optionally nesting further sections.

    ``subsections`` refers to this same class through a forward reference.
    """

    title: str = Field(
        ...,
        description="Titre de la section",
    )
    content: str = Field(
        ...,
        description="Contenu de la section",
    )
    subsections: List['ReportSection'] = Field(default_factory=list, description="Sous-sections")
    order: int = Field(
        default=0,
        description="Ordre d'affichage",
    )
|
| 144 |
+
|
| 145 |
+
|
| 146 |
+
class SourceReference(BaseModel):
    """Bibliographic reference to one source document.

    ``title`` and ``url`` are required; the remaining fields are optional
    metadata, and ``citation_count`` starts at 0.
    """

    title: str = Field(
        ...,
        description="Titre du document source",
    )
    url: str = Field(
        ...,
        description="URL du document",
    )
    author: Optional[str] = Field(
        default=None,
        description="Auteur",
    )
    publication_date: Optional[datetime] = Field(
        default=None,
        description="Date de publication",
    )
    credibility_score: Optional[float] = Field(
        default=None,
        description="Score de crédibilité",
    )
    citation_count: int = Field(
        default=0,
        description="Nombre de fois citée dans le rapport",
    )
|
| 155 |
+
|
| 156 |
+
|
| 157 |
+
class Methodology(BaseModel):
    """Description of the methodology behind a report.

    ``research_approach``, ``sources_count`` and ``data_quality_assessment``
    are required; the two list fields default to empty lists.
    """

    research_approach: str = Field(
        ...,
        description="Approche de recherche utilisée",
    )
    sources_count: int = Field(
        ...,
        description="Nombre de sources analysées",
    )
    analysis_methods: List[str] = Field(default_factory=list, description="Méthodes d'analyse utilisées")
    limitations: List[str] = Field(default_factory=list, description="Limitations de l'étude")
    data_quality_assessment: str = Field(..., description="Évaluation de la qualité des données")
|
| 174 |
+
|
| 175 |
+
|
| 176 |
+
class FinalReport(BaseModel):
    """Final global-synthesis report.

    Groups report metadata, the main content, cross-source analyses, the
    methodology and sources, two quality scores constrained to [0, 1], and
    processing statistics.
    """

    # --- Report metadata ---
    report_id: str = Field(
        ...,
        description="Identifiant unique du rapport",
    )
    title: str = Field(
        ...,
        description="Titre du rapport",
    )
    topic: str = Field(
        ...,
        description="Sujet de recherche original",
    )
    generated_at: datetime = Field(
        default_factory=datetime.now,
        description="Date de génération",
    )
    report_type: ReportType = Field(
        default=ReportType.DETAILED,
        description="Type de rapport",
    )
    report_format: ReportFormat = Field(
        default=ReportFormat.MARKDOWN,
        description="Format du rapport",
    )

    # --- Main content ---
    executive_summary: ExecutiveSummary = Field(
        ...,
        description="Résumé exécutif",
    )
    introduction: str = Field(
        ...,
        description="Introduction du rapport",
    )
    main_sections: List[ReportSection] = Field(default_factory=list, description="Sections principales du rapport")
    conclusion: str = Field(
        ...,
        description="Conclusion du rapport",
    )

    # --- Cross-source analyses ---
    key_themes: List[str] = Field(default_factory=list, description="Thèmes principaux identifiés")
    consensus_points: List[str] = Field(default_factory=list, description="Points de consensus entre les sources")
    conflicting_viewpoints: List[str] = Field(default_factory=list, description="Points de vue contradictoires")
    emerging_trends: List[str] = Field(default_factory=list, description="Tendances émergentes identifiées")

    # --- Analysis metadata ---
    methodology: Methodology = Field(
        ...,
        description="Méthodologie utilisée",
    )
    sources: List[SourceReference] = Field(default_factory=list, description="Sources utilisées avec références")

    # --- Quality metrics (both bounded to the closed interval [0, 1]) ---
    confidence_score: float = Field(default=0.0, ge=0.0, le=1.0, description="Score de confiance global (0-1)")
    completeness_score: float = Field(default=0.0, ge=0.0, le=1.0, description="Score de complétude de l'analyse (0-1)")

    # --- Processing statistics ---
    total_sources_analyzed: int = Field(
        default=0,
        description="Nombre total de sources analysées",
    )
    processing_time: float = Field(
        default=0.0,
        description="Temps de traitement en secondes",
    )
    word_count: int = Field(
        default=0,
        description="Nombre de mots du rapport",
    )

    class Config:
        # Partial example payload exposed in the generated JSON schema.
        json_schema_extra = {
            "example": {
                "report_id": "rpt_20241115_001",
                "title": "Impact de l'Intelligence Artificielle sur l'Emploi - Rapport de Synthèse",
                "topic": "impact de l'intelligence artificielle sur l'emploi",
                "report_type": "detailed",
                "executive_summary": {
                    "key_findings": [
                        "L'IA transformera 60% des emplois d'ici 2030",
                        "Nouveaux emplois créés dans la tech et supervision IA",
                    ],
                    "summary_text": "Analyse complète de l'impact de l'IA...",
                },
                "confidence_score": 0.85,
                "total_sources_analyzed": 5,
            }
        }
|
| 260 |
+
|
| 261 |
+
|
| 262 |
+
class GlobalSynthesisOutput(BaseModel):
    """Output of the Global Synthesizer agent.

    Wraps the required ``final_report`` together with free-form metadata and
    statistics dictionaries, alternative renderings of the report keyed by
    format name, and a creation timestamp.
    """

    final_report: FinalReport = Field(
        ...,
        description="Rapport final de synthèse",
    )
    synthesis_metadata: Dict[str, Any] = Field(default_factory=dict, description="Métadonnées sur le processus de synthèse")
    processing_stats: Dict[str, Any] = Field(default_factory=dict, description="Statistiques de traitement")

    # Alternative renderings of the report
    formatted_outputs: Dict[str, str] = Field(default_factory=dict, description="Rapport formaté dans différents formats (markdown, html, etc.)")

    timestamp: datetime = Field(default_factory=datetime.now, description="Horodatage de la synthèse")

    class Config:
        # Partial example payload exposed in the generated JSON schema.
        json_schema_extra = {
            "example": {
                "synthesis_metadata": {
                    "llm_model_used": "groq/llama-3.1-8b-instant",
                    "synthesis_strategy": "comprehensive",
                    "quality_checks_passed": True,
                },
                "processing_stats": {
                    "input_summaries": 5,
                    "synthesis_time": 15.3,
                    "final_report_words": 2500,
                },
            }
        }
|
| 303 |
+
|
| 304 |
+
|
| 305 |
+
# Configuration forward reference pour les modèles imbriqués
|
| 306 |
+
ReportSection.model_rebuild()
|
src/services/__init__.py
ADDED
|
@@ -0,0 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Package des services du système.
|
| 3 |
+
"""
|
| 4 |
+
|
| 5 |
+
from .search_api import (
|
| 6 |
+
SearchAPIManager,
|
| 7 |
+
TavilySearchAPI,
|
| 8 |
+
SerperSearchAPI,
|
| 9 |
+
SearchAPIError,
|
| 10 |
+
BaseSearchAPI
|
| 11 |
+
)
|
| 12 |
+
|
| 13 |
+
__all__ = [
|
| 14 |
+
"SearchAPIManager",
|
| 15 |
+
"TavilySearchAPI",
|
| 16 |
+
"SerperSearchAPI",
|
| 17 |
+
"SearchAPIError",
|
| 18 |
+
"BaseSearchAPI"
|
| 19 |
+
]
|
src/services/content_extraction.py
ADDED
|
@@ -0,0 +1,462 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Service d'extraction de contenu web.
|
| 3 |
+
Supporte HTML, PDF et autres formats de documents.
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
import aiohttp
|
| 7 |
+
import asyncio
|
| 8 |
+
from typing import Optional, List, Dict, Any, TYPE_CHECKING
|
| 9 |
+
from urllib.parse import urljoin, urlparse
|
| 10 |
+
from datetime import datetime
|
| 11 |
+
import re
|
| 12 |
+
import mimetypes
|
| 13 |
+
|
| 14 |
+
from asyncssh import logger
|
| 15 |
+
|
| 16 |
+
from src.core.logging import setup_logger
|
| 17 |
+
from src.models.document_models import Document, DocumentType
|
| 18 |
+
|
| 19 |
+
# Import conditionnel des dépendances
|
| 20 |
+
try:
|
| 21 |
+
from bs4 import BeautifulSoup
|
| 22 |
+
BEAUTIFULSOUP_AVAILABLE = True
|
| 23 |
+
except ImportError:
|
| 24 |
+
BEAUTIFULSOUP_AVAILABLE = False
|
| 25 |
+
|
| 26 |
+
if TYPE_CHECKING:
|
| 27 |
+
from bs4 import BeautifulSoup
|
| 28 |
+
|
| 29 |
+
try:
|
| 30 |
+
import PyPDF2
|
| 31 |
+
PDF_AVAILABLE = True
|
| 32 |
+
except ImportError:
|
| 33 |
+
PDF_AVAILABLE = False
|
| 34 |
+
PyPDF2 = None
|
| 35 |
+
|
| 36 |
+
try:
|
| 37 |
+
import requests
|
| 38 |
+
REQUESTS_AVAILABLE = True
|
| 39 |
+
except ImportError:
|
| 40 |
+
REQUESTS_AVAILABLE = False
|
| 41 |
+
requests = None
|
| 42 |
+
|
| 43 |
+
|
| 44 |
+
class ContentExtractionError(Exception):
    """Raised when the content of a URL cannot be extracted."""
|
| 47 |
+
|
| 48 |
+
|
| 49 |
+
class WebContentExtractor:
    """Download a URL and convert what it serves into a ``Document``.

    HTML pages are parsed with BeautifulSoup, PDFs with PyPDF2, and any other
    content type is returned as cleaned raw text.
    """

    def __init__(self, timeout: int = 30, max_content_length: int = 10_000_000):
        """Configure the extractor.

        Args:
            timeout: Total timeout, in seconds, for each download request.
            max_content_length: Size limit applied to HTML pages, both to the
                declared ``Content-Length`` and to the extracted text length.
        """
        self.logger = setup_logger("content_extractor")
        self.timeout = timeout
        self.max_content_length = max_content_length

        # Browser-like headers sent with every request
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': 'fr-FR,fr;q=0.9,en;q=0.8',
            'Accept-Encoding': 'gzip, deflate',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
        }

        # Warn early about missing optional dependencies
        self._check_dependencies()

    def _check_dependencies(self):
        """Log a warning for each optional dependency that failed to import."""
        if not BEAUTIFULSOUP_AVAILABLE:
            self.logger.warning("BeautifulSoup4 non installé - extraction HTML limitée")
        if not PDF_AVAILABLE:
            self.logger.warning("PyPDF2 non installé - extraction PDF non disponible")
        if not REQUESTS_AVAILABLE:
            self.logger.warning("requests non installé - extraction synchrone non disponible")

    async def extract_content(self, url: str) -> "Document":
        """Extract the content of a URL.

        Args:
            url: URL to extract.

        Returns:
            Document holding the extracted content.

        Raises:
            ContentExtractionError: If detection or extraction fails for any
                reason; the original exception is attached as ``__cause__``.
        """
        self.logger.info(f"Extraction de contenu: {url}")

        try:
            # Pick the extractor from the detected content type
            content_type = await self._detect_content_type(url)

            if content_type.startswith('application/pdf'):
                return await self._extract_pdf_content(url)
            elif content_type.startswith('text/html') or 'html' in content_type:
                return await self._extract_html_content(url)
            else:
                # TODO: also dispatch on the file extension and handle
                # JavaScript-rendered pages.
                return await self._extract_generic_content(url)

        except Exception as e:
            self.logger.error(f"Erreur lors de l'extraction de {url}: {str(e)}")
            raise ContentExtractionError(f"Impossible d'extraire le contenu de {url}: {str(e)}") from e

    async def _detect_content_type(self, url: str) -> str:
        """Return the MIME type of ``url`` (without parameters).

        Uses a ``HEAD`` request first, then falls back to guessing from the
        URL path extension, and finally to ``'text/html'``.
        """
        try:
            async with aiohttp.ClientSession(timeout=aiohttp.ClientTimeout(total=10)) as session:
                # aiohttp does not follow redirects on HEAD by default; without
                # this the type of the redirect response itself is reported.
                async with session.head(url, headers=self.headers, allow_redirects=True) as response:
                    content_type = response.headers.get('content-type', '').lower()
                    if content_type:
                        return content_type.split(';')[0].strip()  # drop the charset

            # Fallback: guess from the file extension
            parsed_url = urlparse(url)
            content_type, _ = mimetypes.guess_type(parsed_url.path)
            return content_type or 'text/html'

        except Exception as e:
            self.logger.warning(f"Impossible de détecter le type de contenu pour {url}: {e}")
            return 'text/html'  # default fallback

    async def _extract_html_content(self, url: str) -> "Document":
        """Download an HTML page and extract title, text, author and date.

        Raises:
            ContentExtractionError: If BeautifulSoup is missing, the response
                status is not 200, or the content exceeds
                ``max_content_length``.
        """
        if not BEAUTIFULSOUP_AVAILABLE:
            raise ContentExtractionError("BeautifulSoup4 non installé pour l'extraction HTML")

        async with aiohttp.ClientSession(
            timeout=aiohttp.ClientTimeout(total=self.timeout)
        ) as session:
            async with session.get(url, headers=self.headers) as response:
                if response.status != 200:
                    raise ContentExtractionError(f"Erreur HTTP {response.status} pour {url}")

                # Reject oversized bodies up front; ignore a malformed header
                content_length = response.headers.get('content-length')
                if (
                    content_length
                    and content_length.isdigit()
                    and int(content_length) > self.max_content_length
                ):
                    raise ContentExtractionError(f"Contenu trop volumineux: {content_length} bytes")

                html_content = await response.text()

        from bs4 import BeautifulSoup
        soup = BeautifulSoup(html_content, 'html.parser')

        # Metadata is read before _extract_main_content, which removes
        # <header>/<footer>/... elements from the tree in place.
        title = self._extract_title(soup)
        author = self._extract_author(soup)
        publish_date = self._extract_publish_date(soup)

        content = self._extract_main_content(soup)
        if len(content) > self.max_content_length:
            raise ContentExtractionError(f"Contenu extrait trop volumineux: {len(content)} caractères")

        return Document(
            title=title,
            url=url,
            content=content,
            doc_type=DocumentType.ARTICLE,
            author=author,
            published_date=publish_date,
            word_count=len(content.split()),
            language='fr'  # TODO: implement automatic language detection
        )

    def _extract_title(self, soup: "BeautifulSoup") -> str:
        """Return the page title.

        Priority: ``<title>`` tag, then ``og:title`` meta, then first ``<h1>``,
        then a fixed placeholder.
        """
        title_tag = soup.find('title')
        if title_tag and title_tag.get_text().strip():
            return title_tag.get_text().strip()

        og_title = soup.find('meta', {'property': 'og:title'})
        if og_title and og_title.get('content'):
            return og_title.get('content').strip()

        h1 = soup.find('h1')
        if h1 and h1.get_text().strip():
            return h1.get_text().strip()

        return "Titre non trouvé"

    def _extract_main_content(self, soup: "BeautifulSoup") -> str:
        """Return the cleaned text of the page's main content area.

        Note: mutates ``soup`` by removing boilerplate elements and comments.
        """
        from bs4 import Comment

        # Drop elements that never carry article text
        for element in soup.find_all(['script', 'style', 'nav', 'header', 'footer', 'aside', 'form']):
            element.decompose()

        # Drop HTML comments. A Comment node's text does not include the
        # "<!--" delimiter, so they must be matched by type.
        for comment in soup.find_all(string=lambda text: isinstance(text, Comment)):
            comment.extract()

        # Candidate containers, most specific first
        content_selectors = [
            'article',
            '[role="main"]',
            'main',
            '.content',
            '.post-content',
            '.entry-content',
            '.article-content',
            '#content',
            '.main-content'
        ]

        main_content = None
        for selector in content_selectors:
            element = soup.select_one(selector)
            if element:
                main_content = element
                break

        # Fallback: the whole body (or the whole document)
        if not main_content:
            main_content = soup.find('body') or soup

        return self._clean_text(main_content.get_text())

    def _clean_text(self, text: str) -> str:
        """Normalise whitespace and cap the length of extracted text.

        Runs of spaces/tabs collapse to one space, runs of blank lines collapse
        to a single blank line, and the result is truncated to 50 000
        characters with a marker appended.
        """
        if not text:
            return ""

        # Collapse horizontal whitespace only, so line breaks survive
        text = re.sub(r'[^\S\n]+', ' ', text)
        # Trim the spaces left around line breaks
        text = re.sub(r' ?\n ?', '\n', text)
        # Collapse runs of blank lines into a single paragraph break
        text = re.sub(r'\n{3,}', '\n\n', text)

        text = text.strip()

        if len(text) > 50000:  # 50k characters max
            text = text[:50000] + "... [Contenu tronqué]"

        return text

    def _extract_author(self, soup: "BeautifulSoup") -> Optional[str]:
        """Return the article author, or ``None`` if none is found.

        Checks ``<meta name="author">``, then ``itemprop="author"``, then a few
        common CSS classes (ignoring matches of 100 characters or more).
        """
        author_meta = soup.find('meta', {'name': 'author'})
        if author_meta and author_meta.get('content'):
            return author_meta.get('content').strip()

        author_schema = soup.find(attrs={'itemprop': 'author'})
        if author_schema:
            return author_schema.get_text().strip()

        author_selectors = [
            '.author',
            '.byline',
            '.post-author',
            '.article-author'
        ]

        for selector in author_selectors:
            element = soup.select_one(selector)
            if element:
                author_text = element.get_text().strip()
                if author_text and len(author_text) < 100:  # sanity limit
                    return author_text

        return None

    def _parse_date(self, value) -> Optional[datetime]:
        """Parse ``value`` with dateutil; return ``None`` if that fails."""
        if not value:
            return None
        try:
            from dateutil.parser import parse
            return parse(value)
        except (ImportError, ValueError, OverflowError, TypeError):
            # dateutil missing, or the value is not a parseable date
            return None

    def _extract_publish_date(self, soup: "BeautifulSoup") -> Optional[datetime]:
        """Return the publication date, or ``None`` if none can be parsed.

        Checks the ``article:published_time`` meta tag, then an element with
        ``itemprop="datePublished"`` (its ``datetime`` attribute or its text).
        """
        time_meta = soup.find('meta', {'property': 'article:published_time'})
        if time_meta and time_meta.get('content'):
            parsed = self._parse_date(time_meta.get('content'))
            if parsed is not None:
                return parsed

        date_schema = soup.find(attrs={'itemprop': 'datePublished'})
        if date_schema:
            date_str = date_schema.get('datetime') or date_schema.get_text()
            return self._parse_date(date_str)

        return None

    async def _extract_pdf_content(self, url: str) -> "Document":
        """Download a PDF and extract the text of all its pages.

        Raises:
            ContentExtractionError: If PyPDF2 is missing, the response status
                is not 200, or the PDF cannot be parsed.
        """
        if not PDF_AVAILABLE:
            raise ContentExtractionError("PyPDF2 non installé pour l'extraction PDF")

        async with aiohttp.ClientSession(
            timeout=aiohttp.ClientTimeout(total=self.timeout)
        ) as session:
            async with session.get(url, headers=self.headers) as response:
                if response.status != 200:
                    raise ContentExtractionError(f"Erreur HTTP {response.status} pour {url}")

                pdf_content = await response.read()

        try:
            import io
            pdf_reader = PyPDF2.PdfReader(io.BytesIO(pdf_content))

            title = "Document PDF"

            # One line break after each page; tolerate pages with no text
            content = "".join(
                (page.extract_text() or "") + "\n" for page in pdf_reader.pages
            )
            content = self._clean_text(content)

            return Document(
                title=title,
                url=url,
                content=content,
                doc_type=DocumentType.ACADEMIC_PAPER,
                word_count=len(content.split()),
                language='fr'  # TODO: implement automatic language detection
            )

        except Exception as e:
            raise ContentExtractionError(f"Erreur lors de l'extraction PDF: {str(e)}") from e

    async def _extract_generic_content(self, url: str) -> "Document":
        """Download any other content type and return its cleaned text.

        Raises:
            ContentExtractionError: If the response status is not 200.
        """
        async with aiohttp.ClientSession(
            timeout=aiohttp.ClientTimeout(total=self.timeout)
        ) as session:
            async with session.get(url, headers=self.headers) as response:
                if response.status != 200:
                    raise ContentExtractionError(f"Erreur HTTP {response.status} pour {url}")

                content = await response.text()

        content = self._clean_text(content)

        return Document(
            title=f"Document depuis {urlparse(url).netloc}",
            url=url,
            content=content,
            doc_type=DocumentType.OTHER,
            word_count=len(content.split()),
            language='fr'
        )
|
| 375 |
+
|
| 376 |
+
|
| 377 |
+
class ContentExtractionManager:
    """Run ``WebContentExtractor`` over many URLs with bounded concurrency
    and per-URL retries.
    """

    def __init__(self, max_concurrent: int = 5, max_retries: int = 2):
        """Create the manager.

        Args:
            max_concurrent: Maximum number of URLs processed at the same time.
            max_retries: Number of extra attempts made after a first failure.
        """
        self.logger = setup_logger("extraction_manager")
        self.extractor = WebContentExtractor()
        self.max_concurrent = max_concurrent
        self.max_retries = max_retries
        self.semaphore = asyncio.Semaphore(max_concurrent)

    async def extract_multiple(self, urls: List[str]) -> List[Document]:
        """Extract several URLs concurrently.

        Args:
            urls: URLs to extract.

        Returns:
            The documents that were extracted successfully, in input order;
            URLs that failed are logged and omitted.
        """
        self.logger.info(f"Extraction de contenu pour {len(urls)} URLs")

        # One coroutine per URL; failures come back as exception objects
        pending = [self._extract_with_retry(target) for target in urls]
        outcomes = await asyncio.gather(*pending, return_exceptions=True)

        documents = []
        for target, outcome in zip(urls, outcomes):
            if isinstance(outcome, Document):
                documents.append(outcome)
                continue
            if isinstance(outcome, Exception):
                self.logger.error(f"Échec d'extraction pour {target}: {str(outcome)}")
            else:
                self.logger.warning(f"Résultat inattendu pour {target}: {type(outcome)}")

        self.logger.info(f"Extraction terminée: {len(documents)}/{len(urls)} succès")
        return documents

    async def _extract_with_retry(self, url: str) -> Document:
        """Extract one URL, retrying with exponential back-off.

        The semaphore slot is held for the whole retry sequence, including the
        waits between attempts.

        Raises:
            Exception: The error from the last attempt, or a
                ``ContentExtractionError`` if no attempt recorded an error.
        """
        async with self.semaphore:
            total_attempts = self.max_retries + 1
            failure = None

            for attempt in range(total_attempts):
                is_last = attempt >= self.max_retries
                try:
                    if attempt > 0:
                        # Back off 2, 4, 8... seconds before each retry
                        await asyncio.sleep(2 ** attempt)
                        self.logger.info(f"Tentative {attempt + 1}/{total_attempts} pour {url}")

                    return await self.extractor.extract_content(url)

                except Exception as exc:
                    failure = exc
                    if is_last:
                        self.logger.error(f"Toutes les tentatives ont échoué pour {url}: {str(exc)}")
                    else:
                        self.logger.warning(f"Tentative {attempt + 1} échouée pour {url}: {str(exc)}")

            # Every attempt failed
            raise failure or ContentExtractionError(f"Échec d'extraction pour {url}")
|
| 443 |
+
|
| 444 |
+
|
| 445 |
+
|
| 446 |
+
|
| 447 |
+
|
| 448 |
+
##########################################################
|
| 449 |
+
# Exemple d'utilisation (à exécuter dans un contexte asynchrone)
|
| 450 |
+
async def main():
    """
    Demo: extract two sample URLs and print a short summary of each document.

    Must be run inside an event loop (see the ``__main__`` guard below).
    """
    extractor_manager = ContentExtractionManager(max_concurrent=3, max_retries=2)
    urls = [
        'https://www.iana.org/help/example-domains',
        'https://documents1.worldbank.org/curated/en/691261636143890139/pdf/Taxing-Pollution.pdf',
    ]
    documents = await extractor_manager.extract_multiple(urls)
    for doc in documents:
        print(f"Title: {doc.title}, URL: {doc.url}, Word Count: {doc.word_count}, Language: {doc.language}, Content Length: {len(doc.content)}, \nContenu tronqué: {doc.content[:500]}")
    # A stray, unconditional "pytest is not installed" error log used to follow
    # the loop; it was unrelated to this demo and has been removed.


if __name__ == "__main__":
    asyncio.run(main())
|
src/services/llm_service.py
ADDED
|
@@ -0,0 +1,488 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Service LLM pour l'intégration avec Groq et autres fournisseurs.
|
| 3 |
+
Gère les appels aux modèles de langage pour le résumé et l'analyse.
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
import asyncio
|
| 7 |
+
import aiohttp
|
| 8 |
+
import json
|
| 9 |
+
from typing import List, Dict, Any, Optional, Union
|
| 10 |
+
from datetime import datetime
|
| 11 |
+
import time
|
| 12 |
+
|
| 13 |
+
from config.settings import api_config
|
| 14 |
+
from src.core.logging import setup_logger
|
| 15 |
+
import traceback
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
class LLMError(Exception):
    """Base error raised when a language-model call fails."""
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
class LLMRateLimitError(LLMError):
    """Error raised when the provider's rate limit is exceeded."""
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
class LLMService:
    """
    Client for chat-completion calls against the Groq API.

    Features:
    - Groq chat completions endpoint
    - Client-side request rate limiting (one-minute sliding window)
    - Automatic retry with backoff
    - Batch generation with bounded concurrency
    - Basic response validation and token-budget helpers
    """

    def __init__(self):
        self.config = api_config
        self.logger = setup_logger("llm_service")

        # Groq configuration
        self.groq_api_key = self.config.GROQ_API_KEY
        self.groq_base_url = "https://api.groq.com/openai/v1"
        self.default_model = getattr(self.config, 'GROQ_MODEL', "llama-3.1-8b-instant")

        # Rate limiting state
        self.rate_limit_requests = 30  # Requests per minute
        self.rate_limit_tokens = 6000  # Tokens per minute (not enforced by this class)
        self.request_timestamps = []

        # Default generation parameters (overridable per call via **kwargs)
        self.default_params = {
            "temperature": 0.3,
            "max_tokens": 2000,
            "top_p": 0.9,
            "frequency_penalty": 0.1,
            "presence_penalty": 0.1
        }

        # Request headers
        self.headers = {
            "Authorization": f"Bearer {self.groq_api_key}",
            "Content-Type": "application/json"
        }

    async def generate_completion(
        self,
        prompt: str,
        system_prompt: Optional[str] = None,
        model: Optional[str] = None,
        **kwargs
    ) -> str:
        """
        Generate a single text completion.

        Args:
            prompt: User prompt.
            system_prompt: Optional system prompt.
            model: Model to use (defaults to the configured model).
            **kwargs: Extra API parameters; they override the defaults.

        Returns:
            The text generated by the model.

        Raises:
            LLMError: On API or connection errors.
            LLMRateLimitError: When the API rate limit is still hit after
                all retries.
        """
        # Build the message list
        messages = []
        if system_prompt:
            messages.append({"role": "system", "content": system_prompt})
        messages.append({"role": "user", "content": prompt})

        # Request parameters
        params = {**self.default_params, **kwargs}
        payload = {
            "model": model or self.default_model,
            "messages": messages,
            **params
        }

        # Client-side throttling
        await self._check_rate_limits()

        # API call with retry
        return await self._make_api_call(payload)

    async def generate_batch_completions(
        self,
        prompts: List[str],
        system_prompt: Optional[str] = None,
        model: Optional[str] = None,
        max_concurrent: int = 3,
        **kwargs
    ) -> List[str]:
        """
        Generate several completions concurrently.

        Args:
            prompts: Prompts to complete.
            system_prompt: Optional system prompt shared by all prompts.
            model: Model to use.
            max_concurrent: Maximum number of simultaneous requests.
            **kwargs: Extra API parameters.

        Returns:
            One entry per prompt, in the same order as ``prompts``. A failed
            prompt yields a string starting with ``"ERREUR: "`` (or
            ``"EXCEPTION: "`` for errors that escaped the per-prompt handler)
            instead of raising.
        """
        self.logger.info(f"Génération batch de {len(prompts)} complétions")

        # Semaphore bounding the number of in-flight requests
        semaphore = asyncio.Semaphore(max_concurrent)

        async def generate_single(prompt: str, index: int) -> tuple:
            """Return ``(succeeded, text)`` for one prompt."""
            async with semaphore:
                try:
                    # Stagger requests to reduce the chance of rate limiting
                    await asyncio.sleep(index * 0.5)

                    result = await self.generate_completion(
                        prompt, system_prompt, model, **kwargs
                    )
                    return True, result
                except Exception as e:
                    self.logger.error(f"Erreur completion {index}: {e}")
                    return False, f"ERREUR: {str(e)}"

        tasks = [generate_single(prompt, i) for i, prompt in enumerate(prompts)]
        results = await asyncio.gather(*tasks, return_exceptions=True)

        # gather() preserves task order, so results[i] belongs to prompts[i].
        # Anything that is not a (succeeded, text) tuple is an exception that
        # bypassed generate_single's handler; it keeps its own slot so the
        # output always has exactly len(prompts) entries.
        ordered_results = []
        success_count = 0
        for result in results:
            if isinstance(result, tuple):
                succeeded, content = result
                ordered_results.append(content)
                if succeeded:
                    success_count += 1
            else:
                ordered_results.append(f"EXCEPTION: {str(result)}")

        self.logger.info(f"Batch terminé: {success_count}/{len(prompts)} succès")

        return ordered_results

    @staticmethod
    def _retry_after_seconds(raw_value: Any, default: int = 60) -> Union[int, float]:
        """Parse a ``retry-after`` header value as seconds, or return ``default``.

        Accepts integer or fractional second counts. Missing, non-numeric
        (e.g. an HTTP date), negative or non-finite values fall back to
        ``default``.
        """
        if raw_value is None:
            return default
        for convert in (int, float):
            try:
                seconds = convert(raw_value)
            except (TypeError, ValueError):
                continue
            # NaN fails both comparisons and therefore falls through.
            if 0 <= seconds < float("inf"):
                return seconds
        return default

    async def _make_api_call(self, payload: Dict[str, Any], max_retries: int = 3) -> str:
        """
        Send the chat-completion request, retrying on failure.

        Args:
            payload: JSON body of the request.
            max_retries: Number of extra attempts after the first one.

        Returns:
            The stripped text of the first choice.

        Raises:
            LLMRateLimitError: HTTP 429 on the last attempt.
            LLMError: Any other failure on the last attempt (HTTP error,
                empty response, timeout, connection error).
        """
        url = f"{self.groq_base_url}/chat/completions"

        for attempt in range(max_retries + 1):
            try:
                async with aiohttp.ClientSession(timeout=aiohttp.ClientTimeout(total=60)) as session:
                    async with session.post(url, json=payload, headers=self.headers) as response:

                        # Record the request for client-side rate limiting
                        self.request_timestamps.append(time.time())

                        if response.status == 200:
                            data = await response.json()
                            content = data["choices"][0]["message"]["content"]

                            # Basic validation
                            if not content or content.strip() == "":
                                raise LLMError("Réponse vide du modèle")

                            return content.strip()

                        elif response.status == 429:
                            # Rate limit hit: honour the server's hint
                            retry_after = self._retry_after_seconds(
                                response.headers.get("retry-after")
                            )
                            self.logger.warning(f"Rate limit atteint, attente {retry_after}s")

                            if attempt < max_retries:
                                await asyncio.sleep(retry_after)
                                continue
                            else:
                                raise LLMRateLimitError("Limite de taux API dépassée")

                        else:
                            # Other HTTP errors
                            error_text = await response.text()
                            error_msg = f"Erreur API {response.status}: {error_text}"

                            if attempt < max_retries:
                                self.logger.warning(f"{error_msg} - Tentative {attempt + 1}/{max_retries}")
                                await asyncio.sleep(2 ** attempt)  # Exponential backoff
                                continue
                            else:
                                raise LLMError(error_msg)

            except LLMRateLimitError:
                # Only raised on the last attempt; must reach the caller as-is
                # rather than being rewrapped by the generic handler below.
                raise

            except asyncio.TimeoutError:
                if attempt < max_retries:
                    self.logger.warning(f"Timeout API - Tentative {attempt + 1}/{max_retries}")
                    await asyncio.sleep(2 ** attempt)
                    continue
                else:
                    raise LLMError("Timeout API après plusieurs tentatives")

            except Exception as e:
                if attempt < max_retries:
                    self.logger.warning(f"Erreur réseau: {e} - Tentative {attempt + 1}/{max_retries}")
                    await asyncio.sleep(2 ** attempt)
                    continue
                if isinstance(e, LLMError):
                    # Already a domain error (HTTP error, empty response):
                    # keep its original message.
                    raise
                raise LLMError(f"Erreur de connexion: {str(e)}") from e

        raise LLMError("Toutes les tentatives ont échoué")

    async def _check_rate_limits(self):
        """Sleep if the per-minute request budget is already used up."""
        current_time = time.time()

        # Drop timestamps older than one minute
        self.request_timestamps = [
            ts for ts in self.request_timestamps
            if current_time - ts < 60
        ]

        # Wait until the oldest request leaves the one-minute window
        if len(self.request_timestamps) >= self.rate_limit_requests:
            oldest_request = min(self.request_timestamps)
            wait_time = 60 - (current_time - oldest_request)

            if wait_time > 0:
                self.logger.info(f"Rate limit: attente {wait_time:.1f}s")
                await asyncio.sleep(wait_time)

    def estimate_tokens(self, text: str) -> int:
        """Estimate the number of tokens in ``text``."""
        # Approximation: 1 token is roughly 4 characters
        return len(text) // 4

    def validate_input_length(self, text: str, max_tokens: int = 6000) -> bool:
        """Return True when the estimated token count is within ``max_tokens``."""
        estimated_tokens = self.estimate_tokens(text)
        return estimated_tokens <= max_tokens

    def truncate_text(self, text: str, max_tokens: int = 6000) -> str:
        """
        Shorten ``text`` so its estimated token count fits in ``max_tokens``.

        Whole sentences (split on ``". "``) are kept while they fit. When not
        even the first sentence fits, the text is cut at the character budget
        instead of being dropped entirely.

        Args:
            text: Text to shorten.
            max_tokens: Token budget, using the estimate of ``estimate_tokens``.

        Returns:
            ``text`` unchanged if it already fits, otherwise the stripped
            truncated text.
        """
        estimated_tokens = self.estimate_tokens(text)

        if estimated_tokens <= max_tokens:
            return text

        # Character budget, with a 10% safety margin; never negative so the
        # fallback slice below cannot wrap around.
        ratio = max_tokens / estimated_tokens
        target_length = max(int(len(text) * ratio * 0.9), 0)

        # Keep leading sentences while they fit in the budget
        kept = []
        used = 0
        for sentence in text.split('. '):
            cost = len(sentence) + 2  # sentence plus the re-added ". "
            if used + cost > target_length:
                break
            kept.append(sentence + ". ")
            used += cost

        truncated = "".join(kept)
        if not truncated:
            # No complete sentence fits: fall back to a hard character cut
            truncated = text[:target_length]

        self.logger.info(f"Texte tronqué: {len(text)} → {len(truncated)} caractères")
        return truncated.strip()

    async def test_connection(self) -> bool:
        """Send a tiny prompt and report whether the model answered 'OK'."""
        try:
            result = await self.generate_completion(
                "Test de connexion. Réponds juste 'OK'.",
                system_prompt="Tu es un assistant de test."
            )

            if "ok" in result.lower():
                self.logger.info("Test de connexion LLM réussi")
                return True
            else:
                self.logger.warning(f"Test de connexion étrange: {result}")
                return False

        except Exception as e:
            self.logger.error(f"Test de connexion LLM échoué: {e}")
            return False
|
| 305 |
+
|
| 306 |
+
|
| 307 |
+
class LLMManager:
    """
    Registry of LLM services, addressed by name.

    Only the Groq-backed ``LLMService`` is registered (under ``"groq"``).
    """

    def __init__(self):
        self.logger = setup_logger("llm_manager")
        self.primary_service = LLMService()
        self.services = {"groq": self.primary_service}

    def _resolve(self, service: str):
        """Return the registered service called ``service`` or raise ValueError."""
        if service in self.services:
            return self.services[service]
        raise ValueError(f"Service LLM inconnu: {service}")

    async def get_completion(
        self,
        prompt: str,
        system_prompt: Optional[str] = None,
        service: str = "groq",
        **kwargs
    ) -> str:
        """
        Get a completion from the named service.

        Args:
            prompt: User prompt.
            system_prompt: Optional system prompt.
            service: Name of the LLM service to use.
            **kwargs: Extra parameters forwarded to the service.

        Returns:
            The model's answer.

        Raises:
            ValueError: If ``service`` is not registered.
        """
        backend = self._resolve(service)
        answer = await backend.generate_completion(prompt, system_prompt, **kwargs)
        return answer

    async def get_batch_completions(
        self,
        prompts: List[str],
        system_prompt: Optional[str] = None,
        service: str = "groq",
        **kwargs
    ) -> List[str]:
        """Get batch completions from the named service."""
        backend = self._resolve(service)
        answers = await backend.generate_batch_completions(
            prompts, system_prompt, **kwargs
        )
        return answers

    async def test_all_services(self) -> Dict[str, bool]:
        """Run the connection test of every registered service.

        Returns:
            Mapping of service name to True (reachable) or False.
        """
        status_by_name = {}

        for label, backend in self.services.items():
            try:
                reachable = await backend.test_connection()
            except Exception as exc:
                self.logger.error(f"Test service {label} échoué: {exc}")
                reachable = False
            status_by_name[label] = reachable

        return status_by_name
|
| 372 |
+
|
| 373 |
+
# Exemple d'utilisation du service LLM
|
| 374 |
+
|
| 375 |
+
async def example_usage():
    """Demonstrate LLMService: connection test, completions, batch, helpers."""

    # 1. Simple connection test
    print("=== Test de connexion ===")
    service = LLMService()

    reachable = await service.test_connection()
    status_label = '✓ OK' if reachable else '✗ Échec'
    print(f"Connexion LLM: {status_label}")

    if not reachable:
        print("Impossible de continuer sans connexion")
        return

    # 2. Plain generation with default parameters
    print("\n=== Génération simple ===")
    try:
        answer = await service.generate_completion(
            prompt="Explique-moi en 2 phrases ce qu'est l'intelligence artificielle.",
            system_prompt="Tu es un expert en IA qui explique simplement."
        )
        print(f"Réponse: {answer}")
    except Exception as exc:
        print(f"Erreur: {exc}")

    # 3. Generation with custom parameters
    print("\n=== Génération avec paramètres ===")
    try:
        haiku = await service.generate_completion(
            prompt="Écris un haiku sur la technologie.",
            system_prompt="Tu es un poète spécialisé dans les haikus.",
            temperature=0.8,
            max_tokens=100
        )
        print(f"Haiku: {haiku}")
    except Exception as exc:
        print(f"Erreur: {exc}")

    # 4. Batch generation
    print("\n=== Génération en batch ===")
    questions = [
        "Qu'est-ce que Python?",
        "Qu'est-ce que JavaScript?",
        "Qu'est-ce que Rust?"
    ]

    try:
        answers = await service.generate_batch_completions(
            prompts=questions,
            system_prompt="Réponds en une phrase courte.",
            max_concurrent=2
        )

        for number, (question, reply) in enumerate(zip(questions, answers), start=1):
            print(f"{number}. {question}")
            print(f"   → {reply}\n")
    except Exception as exc:
        print(f"Erreur batch: {exc}")

    # 5. Token-budget helpers
    print("\n=== Test des utilitaires ===")
    long_text = "Ceci est un texte très long. " * 1000
    print(f"Texte original: {len(long_text)} caractères")
    print(f"Tokens estimés: {service.estimate_tokens(long_text)}")

    fits = service.validate_input_length(long_text, max_tokens=7000)
    print(f"Texte valide (7000 tokens max): {fits}")

    if fits:
        return

    shortened = service.truncate_text(long_text, max_tokens=7000)
    print(f"Texte tronqué: {len(shortened)} caractères")
    print(f"Contenu: {shortened[:200]}...")
|
| 447 |
+
|
| 448 |
+
# Test avec le gestionnaire LLM
|
| 449 |
+
async def example_manager_usage():
    """Demonstrate LLMManager: service health check, then one completion."""

    print("\n=== Test du gestionnaire LLM ===")

    llm_manager = LLMManager()

    # Health check of every registered service
    health = await llm_manager.test_all_services()
    print("État des services:")
    for name in health:
        mark = '✓' if health[name] else '✗'
        print(f"  {name}: {mark}")

    # Completion routed through the manager
    try:
        reply = await llm_manager.get_completion(
            prompt="Salut! Comment ça va?",
            system_prompt="Tu es un assistant amical.",
            service="groq"
        )
        print(f"\nRéponse du gestionnaire: {reply}")
    except Exception as exc:
        print(f"Erreur gestionnaire: {exc}")
|
| 472 |
+
|
| 473 |
+
# Fonction principale pour tester
|
| 474 |
+
async def main():
    """Run both demo scenarios, reporting interruptions and unexpected errors."""
    scenarios = (example_usage, example_manager_usage)
    try:
        for scenario in scenarios:
            await scenario()
    except KeyboardInterrupt:
        print("\n\nTest interrompu par l'utilisateur")
    except Exception as exc:
        print(f"\nErreur inattendue: {exc}")
        traceback.print_exc()


# Script entry point: run the demo
if __name__ == "__main__":
    print("🚀 Démarrage du test du service LLM...")
    asyncio.run(main())
|
src/services/search_api.py
ADDED
|
@@ -0,0 +1,347 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Services d'API pour la recherche web.
|
| 3 |
+
Intègre les APIs Tavily et Serper pour la recherche d'informations.
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
from abc import ABC, abstractmethod
|
| 7 |
+
from typing import List, Dict, Any, Optional
|
| 8 |
+
import requests
|
| 9 |
+
import asyncio
|
| 10 |
+
import aiohttp
|
| 11 |
+
from datetime import datetime
|
| 12 |
+
import json
|
| 13 |
+
|
| 14 |
+
from src.core.logging import setup_logger
|
| 15 |
+
from src.models.research_models import SearchResult
|
| 16 |
+
|
| 17 |
+
# Import sécurisé de la configuration
|
| 18 |
+
try:
|
| 19 |
+
from config.settings import api_config
|
| 20 |
+
except Exception as e:
|
| 21 |
+
print(f"Erreur lors de l'import de la configuration: {e}")
|
| 22 |
+
api_config = None
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
class SearchAPIError(Exception):
    """Error raised when a search API call fails or is misconfigured."""
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
class BaseSearchAPI(ABC):
    """Common interface implemented by every search API client."""

    @abstractmethod
    async def search(
        self,
        query: str,
        max_results: int = 5,
        **kwargs
    ) -> List[SearchResult]:
        """
        Run a search.

        Args:
            query: Search query.
            max_results: Maximum number of results.
            **kwargs: API-specific parameters.

        Returns:
            List of search results.
        """
        ...
|
| 52 |
+
|
| 53 |
+
|
| 54 |
+
class TavilySearchAPI(BaseSearchAPI):
    """
    Client for the Tavily search API.

    Documentation: https://docs.tavily.com/
    """

    def __init__(self, api_key: Optional[str] = None):
        """
        Args:
            api_key: Tavily API key; falls back to ``api_config.TAVILY_API_KEY``
                when the configuration was imported successfully.

        Raises:
            SearchAPIError: If no API key is available.
        """
        # api_config is None when the settings import failed
        if api_config:
            self.api_key = api_key or getattr(api_config, 'TAVILY_API_KEY', '')
        else:
            self.api_key = api_key or ''
        self.base_url = "https://api.tavily.com"
        self.logger = setup_logger("tavily_api")

        if not self.api_key:
            raise SearchAPIError("Clé API Tavily manquante")

    async def search(
        self,
        query: str,
        max_results: int = 5,
        search_depth: str = "basic",
        include_images: bool = False,
        include_answer: bool = True,
        **kwargs
    ) -> List[SearchResult]:
        """
        Search with the Tavily API.

        Args:
            query: Search query.
            max_results: Number of results (capped at 20).
            search_depth: "basic" or "advanced".
            include_images: Whether to include images.
            include_answer: Whether to include an AI-generated answer.
            **kwargs: Accepted for interface compatibility; not sent to the API.

        Returns:
            List of parsed results.

        Raises:
            SearchAPIError: On a non-200 response, a timeout or a connection
                error.
        """
        self.logger.info(f"Recherche Tavily: '{query}' (max: {max_results})")

        payload = {
            "api_key": self.api_key,
            "query": query,
            "search_depth": search_depth,
            "max_results": min(max_results, 20),
            "include_images": include_images,
            "include_answer": include_answer,
            "include_raw_content": False
        }

        # 30 s overall budget, expressed as a ClientTimeout object.
        request_timeout = aiohttp.ClientTimeout(total=30)

        async with aiohttp.ClientSession() as session:
            try:
                async with session.post(
                    f"{self.base_url}/search",
                    json=payload,
                    timeout=request_timeout
                ) as response:

                    if response.status != 200:
                        error_text = await response.text()
                        raise SearchAPIError(f"Erreur Tavily {response.status}: {error_text}")

                    data = await response.json()
                    return self._parse_tavily_results(data)

            # aiohttp signals timeouts with asyncio.TimeoutError.
            # aiohttp.ClientTimeout is a configuration class, not an exception,
            # so naming it in an except clause raises TypeError at match time.
            except asyncio.TimeoutError as exc:
                raise SearchAPIError("Timeout lors de la requête Tavily") from exc
            except aiohttp.ClientError as exc:
                raise SearchAPIError(f"Erreur de connexion Tavily: {str(exc)}") from exc

    def _parse_tavily_results(self, data: Dict[str, Any]) -> List[SearchResult]:
        """Convert a Tavily response payload into ``SearchResult`` objects.

        Items that cannot be converted are logged and skipped.
        """
        results = []

        # ``or []`` also covers a payload whose "results" value is null
        for item in data.get("results") or []:
            try:
                # Parse the publication date when present; an unparsable date
                # is simply left as None.
                published_date = None
                if "published_date" in item and item["published_date"]:
                    try:
                        published_date = datetime.fromisoformat(item["published_date"].replace('Z', '+00:00'))
                    except (ValueError, TypeError, AttributeError):
                        pass

                result = SearchResult(
                    title=item.get("title", ""),
                    url=item.get("url", ""),
                    snippet=item.get("content", ""),
                    published_date=published_date,
                    source=item.get("source", ""),
                    score=item.get("score", 0.0)
                )
                results.append(result)

            except Exception as e:
                self.logger.warning(f"Erreur parsing résultat Tavily: {e}")
                continue

        self.logger.info(f"Tavily: {len(results)} résultats parsés")
        return results
|
| 156 |
+
|
| 157 |
+
|
| 158 |
+
class SerperSearchAPI(BaseSearchAPI):
    """
    Client for the Serper API (Google Search).
    Documentation: https://serper.dev/
    """

    def __init__(self, api_key: Optional[str] = None):
        """Store the API key and endpoint settings.

        Args:
            api_key: Explicit key. When omitted, ``SERPER_API_KEY`` is read
                from ``api_config`` if that object is truthy.

        Raises:
            SearchAPIError: If no key is available from either source.
        """
        # api_config is checked for truthiness before use, so a missing
        # configuration object degrades to "no key" rather than an error here.
        if api_config:
            self.api_key = api_key or getattr(api_config, 'SERPER_API_KEY', '')
        else:
            self.api_key = api_key or ''
        self.base_url = "https://google.serper.dev"
        self.logger = setup_logger("serper_api")

        if not self.api_key:
            raise SearchAPIError("Clé API Serper manquante")

    async def search(
        self,
        query: str,
        max_results: int = 5,
        country: str = "fr",
        language: str = "fr",
        search_type: str = "search",
        **kwargs
    ) -> List[SearchResult]:
        """
        Run a search against the Serper API.

        Args:
            query: Search query
            max_results: Number of results requested (capped at 100)
            country: Country code sent as ``gl`` (e.g. "fr", "us")
            language: Language code sent as ``hl`` (e.g. "fr", "en")
            search_type: Endpoint name appended to the base URL
                ("search", "news", "images")
            **kwargs: Accepted for signature compatibility; not forwarded.

        Returns:
            List of parsed results

        Raises:
            SearchAPIError: On a non-200 response, a timeout, or a
                connection-level failure.
        """
        # Local import: the module-level import block is outside this class,
        # so the name is brought into scope where it is needed.
        import asyncio

        self.logger.info(f"Recherche Serper: '{query}' (max: {max_results})")

        payload = {
            "q": query,
            "num": min(max_results, 100),
            "gl": country,
            "hl": language
        }

        headers = {
            "X-API-KEY": self.api_key,
            "Content-Type": "application/json"
        }

        endpoint = f"{self.base_url}/{search_type}"

        async with aiohttp.ClientSession() as session:
            try:
                async with session.post(
                    endpoint,
                    json=payload,
                    headers=headers,
                    # ClientTimeout is the configuration object aiohttp
                    # expects here; a bare number is the deprecated form.
                    timeout=aiohttp.ClientTimeout(total=30)
                ) as response:

                    if response.status != 200:
                        error_text = await response.text()
                        raise SearchAPIError(f"Erreur Serper {response.status}: {error_text}")

                    data = await response.json()
                    return self._parse_serper_results(data, search_type)

            except asyncio.TimeoutError as e:
                # aiohttp.ClientTimeout is not an exception class: listing it
                # in an ``except`` clause raises TypeError as soon as any
                # exception reaches the clause, which masked every real error
                # (including the HTTP status error above).
                raise SearchAPIError("Timeout lors de la requête Serper") from e
            except aiohttp.ClientError as e:
                raise SearchAPIError(f"Erreur de connexion Serper: {str(e)}") from e

    def _parse_serper_results(self, data: Dict[str, Any], search_type: str) -> List[SearchResult]:
        """Convert a decoded Serper response into ``SearchResult`` objects.

        Args:
            data: Decoded JSON body of the response.
            search_type: Search type used for the request; selects which
                top-level key holds the result list.

        Returns:
            One ``SearchResult`` per entry that could be built. Entries that
            raise while being converted are logged as warnings and skipped.
        """
        results = []

        # The result list lives under a different key per search type;
        # anything other than "search" or "news" is read from "images".
        if search_type == "search":
            items_key = "organic"
        elif search_type == "news":
            items_key = "news"
        else:
            items_key = "images"
        items = data.get(items_key, [])

        for item in items:
            try:
                # Dates are parsed on a best-effort basis; values that are
                # not ISO formatted leave the field at None.
                published_date = None
                if "date" in item:
                    try:
                        published_date = datetime.fromisoformat(item["date"])
                    except (TypeError, ValueError):
                        published_date = None

                result = SearchResult(
                    title=item.get("title", ""),
                    url=item.get("link", ""),
                    snippet=item.get("snippet", ""),
                    published_date=published_date,
                    source=item.get("source", ""),
                    # NOTE(review): position / 100 gives better-ranked items
                    # (smaller position) a LOWER score - confirm this is what
                    # consumers of ``score`` expect.
                    score=item.get("position", 0) / 100.0
                )
                results.append(result)

            except Exception as e:
                # Deliberate per-item boundary: one malformed entry must not
                # lose the remaining results.
                self.logger.warning(f"Erreur parsing résultat Serper: {e}")
                continue

        self.logger.info(f"Serper: {len(results)} résultats parsés")
        return results
|
| 269 |
+
|
| 270 |
+
|
| 271 |
+
class SearchAPIManager:
    """
    Registry of the configured search backends.

    Lets callers pick a preferred backend and falls back to the others when
    it fails or returns nothing.
    """

    def __init__(self):
        """Instantiate every backend whose API key is present in ``api_config``.

        Raises:
            SearchAPIError: If no backend could be initialised.
        """
        self.apis = {}
        self.logger = setup_logger("search_manager")

        # Each backend is initialised independently so that a failure in one
        # does not prevent the other from being registered.
        try:
            if api_config and getattr(api_config, 'TAVILY_API_KEY', ''):
                self.apis["tavily"] = TavilySearchAPI()
                self.logger.info("API Tavily initialisée")
        except Exception as e:
            self.logger.warning(f"Impossible d'initialiser Tavily: {e}")

        try:
            if api_config and getattr(api_config, 'SERPER_API_KEY', ''):
                self.apis["serper"] = SerperSearchAPI()
                self.logger.info("API Serper initialisée")
        except Exception as e:
            self.logger.warning(f"Impossible d'initialiser Serper: {e}")

        if not self.apis:
            raise SearchAPIError("Aucune API de recherche disponible")

    async def search(
        self,
        query: str,
        max_results: int = 5,
        preferred_api: str = "tavily",
        **kwargs
    ) -> List[SearchResult]:
        """
        Run a search, falling back across the registered backends.

        Args:
            query: Search query
            max_results: Number of results
            preferred_api: Backend tried first ("tavily" or "serper")
            **kwargs: Forwarded unchanged to each backend's ``search``.

        Returns:
            The results of the first backend that returns a non-empty list.

        Raises:
            SearchAPIError: If every backend failed or returned nothing. The
                last backend exception, if any, is attached as ``__cause__``.
        """
        # Preferred backend first, then the others in registration order.
        api_order = [preferred_api] + [name for name in self.apis if name != preferred_api]
        last_error = None

        for api_name in api_order:
            # The preferred name may not be registered (missing key, failed init).
            if api_name not in self.apis:
                continue

            try:
                self.logger.info(f"Tentative de recherche avec {api_name}")
                results = await self.apis[api_name].search(query, max_results, **kwargs)

                if results:
                    self.logger.info(f"Recherche réussie avec {api_name}: {len(results)} résultats")
                    return results
                else:
                    self.logger.warning(f"Aucun résultat avec {api_name}")

            except Exception as e:
                # Boundary between backends: log, remember the failure and
                # move on to the next one.
                self.logger.warning(f"Erreur avec {api_name}: {e}")
                last_error = e
                continue

        # No backend produced results; keep the underlying failure reachable
        # for whoever handles this error.
        raise SearchAPIError(f"Échec de recherche avec toutes les APIs pour: {query}") from last_error

    def get_available_apis(self) -> List[str]:
        """Return the names of the registered backends."""
        return list(self.apis.keys())

    def is_api_available(self, api_name: str) -> bool:
        """Return True if ``api_name`` is a registered backend."""
        return api_name in self.apis
|
src/services/text_chunking.py
ADDED
|
@@ -0,0 +1,404 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Service de chunking pour la gestion des textes longs.
|
| 3 |
+
Divise intelligemment les documents en chunks pour le traitement par LLM.
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
import re
|
| 7 |
+
from typing import List, Dict, Tuple, Optional
|
| 8 |
+
from dataclasses import dataclass
|
| 9 |
+
|
| 10 |
+
from src.core.logging import setup_logger
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
@dataclass
class TextChunk:
    """A piece of a document plus the bookkeeping needed to locate and order it."""
    content: str  # Text of the chunk
    start_index: int  # Offset of the chunk start in the cleaned text
    end_index: int  # Offset just past the chunk end in the cleaned text
    chunk_id: int  # 1-based position of the chunk in the sequence
    total_chunks: int  # Number of chunks the document was split into
    word_count: int  # Whitespace-separated word count of ``content``
    has_heading: bool = False  # True when the first line looks like a heading
    heading_text: Optional[str] = None  # That first line when has_heading is True


class TextChunker:
    """
    Splits long text into chunks sized for LLM processing.

    Features:
    - Splits on paragraph boundaries, then sentences, then word boundaries
    - Flags chunks whose first line looks like a heading
    - Repeats a short overlap at the start of each following chunk
    - Keeps chunks within ``max_chunk_size`` characters
    """

    def __init__(
        self,
        max_chunk_size: int = 4000,  # In characters
        overlap_size: int = 200,     # Overlap between consecutive chunks
        min_chunk_size: int = 500    # Below this, a chunk may be merged
    ):
        self.max_chunk_size = max_chunk_size
        self.overlap_size = overlap_size
        self.min_chunk_size = min_chunk_size
        self.logger = setup_logger("text_chunker")

        # Patterns used to recognise a heading on the first line of a block
        self.heading_patterns = [
            r'^#{1,6}\s+.+$',       # Markdown headings
            r'^\d+\.\s+.+$',        # Numbered items
            r'^[A-Z\s]{5,}$',       # Upper-case titles
            r'^\w+:$',              # Labels ending with a colon
        ]

        self.sentence_endings = r'[.!?]+(?:\s|$)'
        self.paragraph_breaks = r'\n\s*\n'

    def chunk_text(self, text: str, preserve_structure: bool = True) -> List[TextChunk]:
        """
        Split a text into chunks.

        Args:
            text: Text to split
            preserve_structure: Split on paragraphs first when True,
                on sentences only when False

        Returns:
            The chunks in document order; an empty list for empty or
            whitespace-only input. Indices refer to the cleaned text.
        """
        if not text or not text.strip():
            return []

        text = self._clean_text(text)

        # Short enough: a single chunk, no heading detection (as before).
        if len(text) <= self.max_chunk_size:
            return [TextChunk(
                content=text,
                start_index=0,
                end_index=len(text),
                chunk_id=1,
                total_chunks=1,
                word_count=len(text.split())
            )]

        if preserve_structure:
            chunks = self._chunk_with_structure(text)
        else:
            chunks = self._chunk_simple(text)

        chunks = self._post_process_chunks(chunks)

        average_size = sum(len(c.content) for c in chunks) // len(chunks)
        self.logger.info(f"Texte découpé en {len(chunks)} chunks (taille moyenne: {average_size} caractères)")

        return chunks

    def _clean_text(self, text: str) -> str:
        """Normalise whitespace while keeping line and paragraph breaks.

        Newlines must survive this step: paragraph splitting and heading
        detection both rely on them.
        """
        # Collapse runs of spaces/tabs/carriage returns, but not newlines.
        text = re.sub(r'[^\S\n]+', ' ', text)

        # Drop the single space left next to a newline.
        text = re.sub(r' ?\n ?', '\n', text)

        # Three or more newlines become one paragraph break.
        text = re.sub(r'\n{3,}', '\n\n', text)

        return text.strip()

    def _chunk_with_structure(self, text: str) -> List[TextChunk]:
        """Chunk by paragraphs; oversized paragraphs are split further."""
        spans = []
        for start, end in self._split_spans(text, self.paragraph_breaks, 0, len(text), False):
            if end - start <= self.max_chunk_size:
                spans.append((start, end))
            else:
                # A paragraph that cannot fit on its own is broken down so
                # that no chunk exceeds the limit.
                spans.extend(self._sentence_spans(text, start, end))
        return self._pack_spans(text, spans)

    def _chunk_simple(self, text: str) -> List[TextChunk]:
        """Chunk by sentences, ignoring paragraph structure."""
        return self._pack_spans(text, self._sentence_spans(text, 0, len(text)))

    @staticmethod
    def _trim_span(text: str, start: int, end: int) -> Tuple[int, int]:
        """Shrink ``[start, end)`` so it neither starts nor ends on whitespace."""
        while start < end and text[start].isspace():
            start += 1
        while end > start and text[end - 1].isspace():
            end -= 1
        return start, end

    def _split_spans(
        self, text: str, pattern: str, start: int, end: int, keep_delimiter: bool
    ) -> List[Tuple[int, int]]:
        """Return the non-empty pieces of ``text[start:end]`` delimited by ``pattern``.

        With ``keep_delimiter`` the matched text (e.g. sentence punctuation)
        stays attached to the piece that precedes it.
        """
        spans = []
        cursor = start
        for match in re.compile(pattern).finditer(text, start, end):
            if match.end() == match.start():
                continue  # a zero-width match delimits nothing
            piece_end = match.end() if keep_delimiter else match.start()
            piece = self._trim_span(text, cursor, piece_end)
            if piece[0] < piece[1]:
                spans.append(piece)
            cursor = match.end()
        tail = self._trim_span(text, cursor, end)
        if tail[0] < tail[1]:
            spans.append(tail)
        return spans

    def _sentence_spans(self, text: str, start: int, end: int) -> List[Tuple[int, int]]:
        """Split ``text[start:end]`` into sentence spans that each fit in a chunk."""
        spans = []
        for s_start, s_end in self._split_spans(text, self.sentence_endings, start, end, True):
            if s_end - s_start <= self.max_chunk_size:
                spans.append((s_start, s_end))
            else:
                spans.extend(self._hard_split(text, s_start, s_end))
        return spans

    def _hard_split(self, text: str, start: int, end: int) -> List[Tuple[int, int]]:
        """Cut an oversized span into pieces, preferring whitespace boundaries."""
        limit = max(1, self.max_chunk_size)  # guarantees forward progress
        spans = []
        cursor = start
        while end - cursor > limit:
            window_end = cursor + limit
            cut = max(text.rfind(' ', cursor, window_end), text.rfind('\n', cursor, window_end))
            if cut <= cursor:
                cut = window_end  # no whitespace in the window: cut mid-word
            piece = self._trim_span(text, cursor, cut)
            if piece[0] < piece[1]:
                spans.append(piece)
            cursor = cut
            while cursor < end and text[cursor].isspace():
                cursor += 1
        tail = self._trim_span(text, cursor, end)
        if tail[0] < tail[1]:
            spans.append(tail)
        return spans

    def _pack_spans(self, text: str, spans: List[Tuple[int, int]]) -> List[TextChunk]:
        """Greedily group consecutive spans into chunks.

        Every chunk is an exact slice of ``text``, so ``start_index`` and
        ``end_index`` are true offsets. A new chunk starts with the overlap
        taken from the end of the previous one when that still fits.
        """
        chunks = []
        chunk_start = None
        chunk_end = None

        for seg_start, seg_end in spans:
            if chunk_start is None:
                chunk_start = seg_start
            elif seg_end - chunk_start > self.max_chunk_size:
                closed = text[chunk_start:chunk_end]
                chunks.append(self._create_chunk(closed, chunk_start, chunk_end, len(chunks) + 1))

                overlap = self._get_overlap_text(closed)
                overlap_start = chunk_end - len(overlap)
                usable = (
                    bool(overlap)
                    and text[overlap_start:chunk_end] == overlap
                    and seg_end - overlap_start <= self.max_chunk_size
                )
                # Drop the overlap rather than exceed the size limit.
                chunk_start = overlap_start if usable else seg_start
            chunk_end = seg_end

        if chunk_start is not None:
            chunks.append(
                self._create_chunk(text[chunk_start:chunk_end], chunk_start, chunk_end, len(chunks) + 1)
            )

        return chunks

    def _detect_heading(self, paragraph: str) -> Tuple[bool, Optional[str]]:
        """Tell whether the first line of ``paragraph`` looks like a heading.

        Returns:
            ``(True, first_line)`` when the first line matches one of the
            heading patterns or is short and starts with an upper-case
            letter; ``(False, None)`` otherwise, including for empty input.
        """
        first_line = paragraph.strip().split('\n')[0].strip()
        if not first_line:
            return False, None

        for pattern in self.heading_patterns:
            if re.match(pattern, first_line):
                return True, first_line

        # Heuristic: short line starting with a capital letter
        if (len(first_line) < 100 and
                len(first_line.split()) < 10 and
                first_line[0].isupper()):
            return True, first_line

        return False, None

    def _get_overlap_text(self, chunk: str) -> str:
        """Return the suffix of ``chunk`` to repeat at the start of the next chunk.

        The result is at most ``overlap_size`` characters, is an exact suffix
        of ``chunk``, and starts on a sentence boundary when the tail holds
        one, otherwise on a word boundary. Returns "" when overlap is
        disabled or the chunk is not longer than the overlap.
        """
        if self.overlap_size <= 0 or len(chunk) <= self.overlap_size:
            return ""

        tail = chunk[-self.overlap_size:]

        # Prefer to begin right after the first sentence end inside the tail.
        boundary = re.search(self.sentence_endings, tail)
        if boundary and boundary.end() < len(tail):
            return tail[boundary.end():].lstrip()

        # Otherwise begin at a word boundary so no word is cut in half.
        if tail[0].isspace() or chunk[-self.overlap_size - 1].isspace():
            return tail.lstrip()
        parts = tail.split(None, 1)
        return parts[1] if len(parts) == 2 else ""

    def _create_chunk(self, content: str, start: int, end: int, chunk_id: int) -> TextChunk:
        """Build a ``TextChunk``; ``total_chunks`` is filled in by post-processing."""
        is_heading, heading_text = self._detect_heading(content)

        return TextChunk(
            content=content,
            start_index=start,
            end_index=end,
            chunk_id=chunk_id,
            total_chunks=0,  # Set in _post_process_chunks
            word_count=len(content.split()),
            has_heading=is_heading,
            heading_text=heading_text
        )

    def _post_process_chunks(self, chunks: List[TextChunk]) -> List[TextChunk]:
        """Merge undersized chunks into their successor and renumber.

        A chunk shorter than ``min_chunk_size`` is joined to the next one
        when the result, separator included, still fits in ``max_chunk_size``.
        """
        separator = "\n\n"
        merged_chunks = []
        i = 0

        while i < len(chunks):
            current_chunk = chunks[i]
            next_chunk = chunks[i + 1] if i + 1 < len(chunks) else None

            can_merge = (
                next_chunk is not None
                and len(current_chunk.content) < self.min_chunk_size
                and (len(current_chunk.content) + len(separator) + len(next_chunk.content)
                     <= self.max_chunk_size)
            )

            if can_merge:
                merged_content = current_chunk.content + separator + next_chunk.content
                merged_chunks.append(TextChunk(
                    content=merged_content,
                    start_index=current_chunk.start_index,
                    end_index=next_chunk.end_index,
                    chunk_id=len(merged_chunks) + 1,
                    total_chunks=0,  # Set below
                    word_count=len(merged_content.split()),
                    has_heading=current_chunk.has_heading or next_chunk.has_heading,
                    heading_text=current_chunk.heading_text or next_chunk.heading_text
                ))
                i += 2  # Both chunks were consumed
            else:
                current_chunk.chunk_id = len(merged_chunks) + 1
                merged_chunks.append(current_chunk)
                i += 1

        for chunk in merged_chunks:
            chunk.total_chunks = len(merged_chunks)

        return merged_chunks

    def get_chunking_stats(self, chunks: List[TextChunk]) -> Dict[str, int]:
        """Return size statistics for ``chunks``; an empty dict for no chunks."""
        if not chunks:
            return {}

        chunk_sizes = [len(chunk.content) for chunk in chunks]
        word_counts = [chunk.word_count for chunk in chunks]

        return {
            "total_chunks": len(chunks),
            "total_characters": sum(chunk_sizes),
            "total_words": sum(word_counts),
            "average_chunk_size": sum(chunk_sizes) // len(chunks),
            "average_words_per_chunk": sum(word_counts) // len(chunks),
            "min_chunk_size": min(chunk_sizes),
            "max_chunk_size": max(chunk_sizes),
            "chunks_with_headings": sum(1 for chunk in chunks if chunk.has_heading)
        }
|
| 343 |
+
|
| 344 |
+
|
| 345 |
+
class ChunkingManager:
    """
    Chunking front-end offering several preconfigured strategies.
    """

    def __init__(self):
        """Create the logger and one ``TextChunker`` per named strategy."""
        self.logger = setup_logger("chunking_manager")

        # Preconfigured chunkers, keyed by strategy name
        self.chunkers = {
            "default": TextChunker(max_chunk_size=4000, overlap_size=200),
            "small": TextChunker(max_chunk_size=2000, overlap_size=100),
            "large": TextChunker(max_chunk_size=20000, overlap_size=300),
            "precise": TextChunker(max_chunk_size=3000, overlap_size=150, min_chunk_size=800)
        }

    def chunk_document(
        self,
        content: str,
        strategy: str = "default",
        preserve_structure: bool = True
    ) -> List[TextChunk]:
        """
        Split a document with the requested strategy.

        Args:
            content: Text to split
            strategy: Strategy name (default, small, large, precise);
                an unknown name falls back to "default" with a warning
            preserve_structure: Passed through to ``TextChunker.chunk_text``

        Returns:
            The chunks produced by the selected chunker (possibly empty).
        """
        if strategy not in self.chunkers:
            self.logger.warning(f"Stratégie inconnue '{strategy}', utilisation de 'default'")
            strategy = "default"

        chunker = self.chunkers[strategy]
        chunks = chunker.chunk_text(content, preserve_structure)

        # The count is taken from the list itself: get_chunking_stats()
        # returns {} for an empty result, so indexing it with
        # 'total_chunks' raised KeyError on empty documents.
        self.logger.info(f"Chunking '{strategy}': {len(chunks)} chunks créés")

        return chunks

    def auto_select_strategy(self, content: str) -> str:
        """Pick a strategy name from the length and word count of ``content``.

        Returns:
            "small" under 5000 characters, "large" above 20000 characters,
            "precise" for more than 3000 words in between, else "default".
        """
        content_length = len(content)

        if content_length < 5000:
            return "small"
        if content_length > 20000:
            return "large"

        # Mid-sized text: many words for its length is treated as dense.
        if len(content.split()) > 3000:
            return "precise"
        return "default"
|