| | """ |
| | Agent Researcher - Premier agent du pipeline. |
| | Effectue la recherche web sur un sujet donné et retourne des sources pertinentes. |
| | """ |
| |
|
| | from typing import List, Dict, Any, Optional |
| | import asyncio |
| | from datetime import datetime |
| |
|
| | from src.agents.base_agent import BaseAgent |
| | from src.models.research_models import ResearchQuery, ResearchOutput, SearchResult |
| | from src.models.state_models import AgentType |
| | from src.services.search_api import SearchAPIManager, SearchAPIError |
| | from src.services.llm_service import LLMService, LLMError |
| | from src.core.logging import setup_logger |
| | from config.prompts import RESEARCHER_PROMPT, SYSTEM_PROMPTS, KEYWORD_EXTRACTION_PROMPT |
| |
|
| |
|
| | class ResearcherAgent(BaseAgent[ResearchQuery, ResearchOutput]): |
| | """ |
| | Agent de recherche web. |
| | |
| | Responsabilités: |
| | - Recevoir une requête de recherche |
| | - Effectuer des recherches sur le web via des APIs |
| | - Analyser et filtrer les résultats |
| | - Retourner une liste de sources pertinentes |
| | """ |
| | |
| | def __init__( |
| | self, |
| | name: str = "researcher", |
| | max_retries: int = 3, |
| | timeout: float = 120.0 |
| | ): |
| | super().__init__( |
| | agent_type=AgentType.RESEARCHER, |
| | name=name, |
| | max_retries=max_retries, |
| | timeout=timeout |
| | ) |
| | |
| | |
| | try: |
| | self.search_manager = SearchAPIManager() |
| | self.logger.info(f"APIs disponibles: {self.search_manager.get_available_apis()}") |
| | except Exception as e: |
| | self.logger.error(f"Impossible d'initialiser le gestionnaire de recherche: {e}") |
| | raise |
| | |
| | |
| | try: |
| | self.llm_service = LLMService() |
| | self.logger.info("Service LLM initialisé pour l'extraction de mots-clés") |
| | except Exception as e: |
| | self.logger.error(f"Impossible d'initialiser le service LLM: {e}") |
| | raise |
| | |
| | |
| | self.default_search_params = { |
| | "preferred_api": "tavily", |
| | "search_depth": "basic", |
| | "include_answer": True |
| | } |
| | |
| | def validate_input(self, input_data: ResearchQuery) -> bool: |
| | """ |
| | Valide la requête de recherche. |
| | |
| | Args: |
| | input_data: Requête de recherche à valider |
| | |
| | Returns: |
| | True si la requête est valide |
| | """ |
| | if not input_data.topic or len(input_data.topic.strip()) < 3: |
| | self.logger.error("Le sujet de recherche doit contenir au moins 3 caractères") |
| | return False |
| | |
| | if input_data.max_results <= 0 or input_data.max_results > 20: |
| | self.logger.error("Le nombre de résultats doit être entre 1 et 20") |
| | return False |
| | |
| | return True |
| | |
| | async def process(self, input_data: ResearchQuery) -> ResearchOutput: |
| | """ |
| | Traite la requête de recherche. |
| | |
| | Args: |
| | input_data: Requête de recherche |
| | |
| | Returns: |
| | Résultats de recherche structurés |
| | """ |
| | start_time = datetime.now() |
| | self.logger.info(f"Début de recherche pour: '{input_data.topic}'") |
| | |
| | |
| | search_query = self._prepare_search_query(input_data) |
| | self.logger.info(f"Requête préparée: '{search_query}'") |
| | |
| | |
| | search_params = { |
| | **self.default_search_params, |
| | "search_depth": input_data.search_depth, |
| | "max_results": input_data.max_results |
| | } |
| | |
| | try: |
| | |
| | results = await self.search_manager.search( |
| | query=search_query, |
| | **search_params |
| | ) |
| | |
| | |
| | filtered_results = self._filter_and_rank_results( |
| | results, |
| | input_data.topic, |
| | input_data.keywords |
| | ) |
| | |
| | |
| | final_results = filtered_results[:input_data.max_results] |
| | |
| | |
| | search_time = (datetime.now() - start_time).total_seconds() |
| | |
| | |
| | research_output = ResearchOutput( |
| | query=input_data, |
| | results=final_results, |
| | total_found=len(results), |
| | search_time=search_time, |
| | search_engine=search_params["preferred_api"], |
| | timestamp=datetime.now() |
| | ) |
| | |
| | self.logger.info( |
| | f"Recherche terminée: {len(final_results)} résultats finaux " |
| | f"sur {len(results)} trouvés en {search_time:.2f}s" |
| | ) |
| | |
| | return research_output |
| | |
| | except SearchAPIError as e: |
| | self.logger.error(f"Erreur de recherche: {e}") |
| | raise |
| | except Exception as e: |
| | self.logger.error(f"Erreur inattendue lors de la recherche: {e}") |
| | raise |
| | |
| | def _prepare_search_query(self, query: ResearchQuery) -> str: |
| | """ |
| | Prépare la requête de recherche en optimisant les mots-clés. |
| | |
| | Args: |
| | query: Requête originale |
| | |
| | Returns: |
| | Requête optimisée pour la recherche |
| | """ |
| | |
| | search_terms = [query.topic] |
| | |
| | |
| | if query.keywords: |
| | |
| | unique_keywords = [ |
| | kw for kw in query.keywords |
| | if kw.lower() not in query.topic.lower() |
| | ] |
| | search_terms.extend(unique_keywords) |
| | |
| | |
| | search_query = " ".join(search_terms) |
| | |
| | |
| | |
| | |
| | if query.search_depth == "advanced": |
| | |
| | if "intelligence artificielle" in search_query.lower() or "ia" in search_query.lower(): |
| | search_query += " 2024 2025 récent" |
| | if "emploi" in search_query.lower() or "travail" in search_query.lower(): |
| | search_query += " marché impact" |
| | |
| | return search_query.strip() |
| | |
| | def _filter_and_rank_results( |
| | self, |
| | results: List[SearchResult], |
| | topic: str, |
| | keywords: List[str] |
| | ) -> List[SearchResult]: |
| | """ |
| | Filtre et classe les résultats par pertinence. |
| | |
| | Args: |
| | results: Résultats bruts de la recherche |
| | topic: Sujet de recherche original |
| | keywords: Mots-clés de recherche |
| | |
| | Returns: |
| | Résultats filtrés et classés |
| | """ |
| | if not results: |
| | return [] |
| | |
| | |
| | scoring_terms = [topic.lower()] + [kw.lower() for kw in keywords] |
| | |
| | |
| | scored_results = [] |
| | for result in results: |
| | score = self._calculate_relevance_score(result, scoring_terms) |
| | |
| | |
| | result.score = score |
| | scored_results.append(result) |
| | |
| | |
| | scored_results.sort(key=lambda x: x.score or 0, reverse=True) |
| | |
| | |
| | min_score = 0.1 |
| | filtered_results = [r for r in scored_results if (r.score or 0) >= min_score] |
| | |
| | self.logger.info(f"Filtrage: {len(filtered_results)} résultats conservés sur {len(results)}") |
| | |
| | return filtered_results |
| | |
| | |
| | |
| | |
| | def _calculate_relevance_score( |
| | self, |
| | result: SearchResult, |
| | scoring_terms: List[str] |
| | ) -> float: |
| | """ |
| | Calcule un score de pertinence pour un résultat. |
| | |
| | Args: |
| | result: Résultat à scorer |
| | scoring_terms: Termes de référence pour le scoring |
| | |
| | Returns: |
| | Score entre 0 et 1 |
| | """ |
| | score = 0.0 |
| | |
| | |
| | text_to_analyze = f"{result.title} {result.snippet}".lower() |
| | |
| | |
| | term_matches = 0 |
| | for term in scoring_terms: |
| | if term in text_to_analyze: |
| | term_matches += 1 |
| | |
| | if scoring_terms: |
| | term_score = term_matches / len(scoring_terms) |
| | score += term_score * 0.6 |
| | |
| | |
| | title_matches = sum(1 for term in scoring_terms if term in result.title.lower()) |
| | if scoring_terms: |
| | title_score = title_matches / len(scoring_terms) |
| | score += title_score * 0.3 |
| | |
| | |
| | if result.published_date: |
| | days_old = (datetime.now() - result.published_date.replace(tzinfo=None)).days |
| | if days_old <= 365: |
| | recency_score = max(0, 1 - (days_old / 365)) |
| | score += recency_score * 0.1 |
| | |
| | |
| | if result.score and result.score > 0: |
| | score = (score + result.score) / 2 |
| | |
| | return min(score, 1.0) |
| | |
| | async def extract_keywords_with_llm(self, topic: str) -> List[str]: |
| | """ |
| | Extrait automatiquement des mots-clés pertinents à partir du sujet |
| | en utilisant le service LLM. |
| | |
| | Args: |
| | topic: Sujet de recherche |
| | |
| | Returns: |
| | Liste de mots-clés extraits |
| | """ |
| | try: |
| | self.logger.info(f"Extraction de mots-clés pour: '{topic}'") |
| | |
| | |
| | prompt = KEYWORD_EXTRACTION_PROMPT.format(topic=topic) |
| | |
| | |
| | response = await self.llm_service.generate_completion( |
| | prompt=prompt, |
| | system_prompt="Tu es un expert en analyse sémantique spécialisé dans l'extraction de mots-clés pour la recherche web.", |
| | temperature=0.3, |
| | max_tokens=150 |
| | ) |
| | |
| | |
| | keywords = self._parse_keywords_response(response) |
| | |
| | self.logger.info(f"Mots-clés extraits: {keywords}") |
| | return keywords |
| | |
| | except LLMError as e: |
| | self.logger.error(f"Erreur LLM lors de l'extraction de mots-clés: {e}") |
| | |
| | return self._extract_keywords_fallback(topic) |
| | except Exception as e: |
| | self.logger.error(f"Erreur inattendue lors de l'extraction de mots-clés: {e}") |
| | return self._extract_keywords_fallback(topic) |
| | |
| | def _parse_keywords_response(self, response: str) -> List[str]: |
| | """ |
| | Parse la réponse du LLM pour extraire les mots-clés. |
| | |
| | Args: |
| | response: Réponse brute du LLM |
| | |
| | Returns: |
| | Liste de mots-clés nettoyés |
| | """ |
| | |
| | response = response.strip() |
| | |
| | |
| | for prefix in ["mots-clés:", "keywords:", "réponse:", "voici:", "liste:"]: |
| | if response.lower().startswith(prefix): |
| | response = response[len(prefix):].strip() |
| | |
| | |
| | keywords = [kw.strip() for kw in response.split(",")] |
| | |
| | |
| | cleaned_keywords = [] |
| | for kw in keywords: |
| | |
| | kw = kw.strip("0123456789.-\t\n ") |
| | |
| | |
| | if len(kw) >= 2 and kw.lower() not in ["et", "ou", "le", "la", "les", "de", "du", "des"]: |
| | cleaned_keywords.append(kw) |
| | |
| | |
| | return cleaned_keywords[:7] |
| | |
| | def _extract_keywords_fallback(self, topic: str) -> List[str]: |
| | """ |
| | Méthode de fallback pour extraire des mots-clés simples. |
| | |
| | Args: |
| | topic: Sujet de recherche |
| | |
| | Returns: |
| | Liste de mots-clés basiques |
| | """ |
| | self.logger.info("Utilisation du fallback pour l'extraction de mots-clés") |
| | |
| | |
| | stop_words = { |
| | "le", "la", "les", "de", "du", "des", "et", "ou", "sur", "dans", |
| | "avec", "pour", "par", "en", "à", "un", "une", "ce", "cette", "ces" |
| | } |
| | |
| | |
| | words = topic.lower().split() |
| | keywords = [word for word in words if len(word) >= 3 and word not in stop_words] |
| | |
| | return keywords[:5] |
| | |
| | async def search_with_fallback( |
| | self, |
| | query: str, |
| | max_results: int = 5 |
| | ) -> List[SearchResult]: |
| | """ |
| | Méthode utilitaire pour recherche simple avec fallback. |
| | |
| | Args: |
| | query: Requête de recherche simple |
| | max_results: Nombre de résultats souhaités |
| | |
| | Returns: |
| | Liste des résultats |
| | """ |
| | research_query = ResearchQuery( |
| | topic=query, |
| | max_results=max_results |
| | ) |
| | |
| | output = await self.process(research_query) |
| | return output.results |
| | |
| | def get_search_stats(self) -> Dict[str, Any]: |
| | """ |
| | Retourne les statistiques de recherche de l'agent. |
| | |
| | Returns: |
| | Dictionnaire avec les statistiques |
| | """ |
| | base_stats = self.get_status() |
| | search_stats = { |
| | "available_apis": self.search_manager.get_available_apis(), |
| | "search_params": self.default_search_params |
| | } |
| | |
| | return {**base_stats, **search_stats} |
| | |
| |
|
| | |
| | def save_research_output(output: ResearchOutput, filename: str = None) -> str: |
| | """ |
| | Sauvegarde un ResearchOutput dans un fichier JSON. |
| | |
| | Args: |
| | output: Sortie de recherche à sauvegarder |
| | filename: Nom du fichier (optionnel) |
| | |
| | Returns: |
| | Nom du fichier sauvegardé |
| | """ |
| | import json |
| | from datetime import datetime |
| | |
| | if not filename: |
| | |
| | clean_topic = "".join(c for c in output.query.topic if c.isalnum() or c in (' ', '-', '_')).rstrip() |
| | clean_topic = clean_topic.replace(' ', '_')[:30] |
| | timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") |
| | filename = f"research_output_{clean_topic}_{timestamp}.json" |
| | |
| | try: |
| | |
| | output_dict = output.model_dump(mode='json') |
| | |
| | |
| | with open(filename, 'w', encoding='utf-8') as f: |
| | json.dump(output_dict, f, indent=2, ensure_ascii=False) |
| | |
| | return filename |
| | |
| | except Exception as e: |
| | raise Exception(f"Erreur lors de la sauvegarde: {e}") |
| |
|
| |
|
| | def load_research_output(filename: str) -> ResearchOutput: |
| | """ |
| | Charge un ResearchOutput depuis un fichier JSON. |
| | |
| | Args: |
| | filename: Nom du fichier à charger |
| | |
| | Returns: |
| | ResearchOutput chargé |
| | """ |
| | import json |
| | |
| | try: |
| | with open(filename, 'r', encoding='utf-8') as f: |
| | data = json.load(f) |
| | |
| | |
| | return ResearchOutput(**data) |
| | |
| | except Exception as e: |
| | raise Exception(f"Erreur lors du chargement: {e}") |
| |
|
| |
|
| | |
| | if __name__ == "__main__": |
| | import asyncio |
| | import json |
| | from datetime import datetime |
| | from src.core.logging import setup_logger |
| | logger = setup_logger("researcher_agent_test") |
| | |
| | async def main(): |
| | agent = ResearcherAgent() |
| | |
| | |
| | topic = "impact de l'intelligence artificielle sur le marché de l'emploi" |
| | logger.info(f"=== Test d'extraction de mots-clés pour: {topic} ===") |
| | |
| | try: |
| | |
| | keywords = await agent.extract_keywords_with_llm(topic) |
| | logger.info(f"Mots-clés extraits automatiquement: {keywords}") |
| | |
| | |
| | query = ResearchQuery( |
| | topic=topic, |
| | keywords=keywords, |
| | max_results=2, |
| | search_depth="basic" |
| | ) |
| | |
| | if agent.validate_input(query): |
| | logger.info("=== Début de la recherche avec mots-clés automatiques ===") |
| | output = await agent.process(query) |
| | logger.info(f"Résultats obtenus: {len(output.results)}") |
| | |
| | |
| | for i, res in enumerate(output.results, 1): |
| | logger.info(f"{i}. {res.title}") |
| | logger.info(f" URL: {res.url}") |
| | logger.info(f" Score: {res.score:.3f}") |
| | logger.info(f" Snippet: {res.snippet[:100]}...") |
| | logger.info("") |
| | |
| | |
| | logger.info("=== Sauvegarde du ResearchOutput ===") |
| | |
| | try: |
| | filename = save_research_output(output) |
| | logger.info(f"✅ ResearchOutput sauvegardé dans: {filename}") |
| | |
| | |
| | logger.info("📄 Contenu sauvegardé:") |
| | logger.info(f" • Sujet: {output.query.topic}") |
| | logger.info(f" • Mots-clés: {output.query.keywords}") |
| | logger.info(f" • Nombre de résultats: {len(output.results)}") |
| | logger.info(f" • Temps de recherche: {output.search_time:.2f}s") |
| | logger.info(f" • Moteur utilisé: {output.search_engine}") |
| | logger.info(f" • Timestamp: {output.timestamp}") |
| | |
| | |
| | logger.info("=== Test de chargement ===") |
| | loaded_output = load_research_output(filename) |
| | logger.info(f"✅ ResearchOutput rechargé avec succès") |
| | logger.info(f" • Vérification: {len(loaded_output.results)} résultats chargés") |
| | |
| | |
| | if loaded_output.query.topic == output.query.topic: |
| | logger.info("✅ Intégrité des données vérifiée") |
| | else: |
| | logger.error("❌ Erreur d'intégrité des données") |
| | |
| | |
| | logger.info("\n📋 EXEMPLE DE FORMAT JSON SAUVEGARDÉ:") |
| | logger.info("-" * 50) |
| | |
| | |
| | example_output = { |
| | "query": { |
| | "topic": output.query.topic, |
| | "keywords": output.query.keywords[:3], |
| | "max_results": output.query.max_results, |
| | "search_depth": output.query.search_depth |
| | }, |
| | "results": [ |
| | { |
| | "title": res.title, |
| | "url": str(res.url), |
| | "snippet": res.snippet[:100] + "...", |
| | "score": res.score |
| | } for res in output.results[:2] |
| | ], |
| | "total_found": output.total_found, |
| | "search_time": output.search_time, |
| | "search_engine": output.search_engine, |
| | "timestamp": output.timestamp.isoformat() |
| | } |
| | |
| | print(json.dumps(example_output, indent=2, ensure_ascii=False)) |
| | |
| | except Exception as save_error: |
| | logger.error(f"❌ Erreur lors de la sauvegarde: {save_error}") |
| | |
| | else: |
| | logger.error("Requête invalide.") |
| | |
| | except Exception as e: |
| | logger.error(f"Erreur lors du test: {e}") |
| | |
| | |
| | async def test_save_load(): |
| | """Test spécifique de sauvegarde/chargement.""" |
| | logger.info("=== TEST SAUVEGARDE/CHARGEMENT SEUL ===") |
| | |
| | |
| | from datetime import datetime |
| | |
| | fake_results = [ |
| | SearchResult( |
| | title="Test Article 1", |
| | url="https://example.com/test1", |
| | snippet="Ceci est un test de snippet pour l'article 1", |
| | score=0.85 |
| | ), |
| | SearchResult( |
| | title="Test Article 2", |
| | url="https://example.com/test2", |
| | snippet="Ceci est un test de snippet pour l'article 2", |
| | score=0.78 |
| | ) |
| | ] |
| | |
| | fake_query = ResearchQuery( |
| | topic="test sauvegarde", |
| | keywords=["test", "sauvegarde", "json"], |
| | max_results=2 |
| | ) |
| | |
| | fake_output = ResearchOutput( |
| | query=fake_query, |
| | results=fake_results, |
| | total_found=2, |
| | search_time=1.5, |
| | search_engine="test", |
| | timestamp=datetime.now() |
| | ) |
| | |
| | try: |
| | |
| | filename = save_research_output(fake_output, "test_research_output.json") |
| | logger.info(f"✅ Test sauvegarde réussi: {filename}") |
| | |
| | |
| | loaded = load_research_output(filename) |
| | logger.info(f"✅ Test chargement réussi: {len(loaded.results)} résultats") |
| | |
| | except Exception as e: |
| | logger.error(f"❌ Test sauvegarde/chargement échoué: {e}") |
| | |
| | |
| | import sys |
| | if len(sys.argv) > 1 and sys.argv[1] == "--test-save": |
| | asyncio.run(test_save_load()) |
| | else: |
| | asyncio.run(main()) |