import requests
from bs4 import BeautifulSoup
import torch
from sentence_transformers import SentenceTransformer
import chromadb
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.tools import Tool
import uuid
from typing import List, Dict


class WebSemanticSearchTool:
    """Semantic search over the visible text of a single web page.

    Pipeline: fetch the page, strip markup, split the text into overlapping
    chunks, embed the chunks, store them in a throwaway in-memory Chroma
    collection, then retrieve the chunks most similar to the query.
    """

    def __init__(self) -> None:
        # Sentence-embedding model; weights are downloaded on first use.
        self.embedding_model = SentenceTransformer('all-MiniLM-L12-v2')
        # In-memory Chroma client: collections vanish when the process exits.
        self.chroma_client = chromadb.Client()
        # Character-based splitter; the 50-char overlap preserves context
        # across chunk boundaries.
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=500,
            chunk_overlap=50,
            length_function=len,
        )

    def extract_content(self, url: str) -> str:
        """Fetch *url* and return its visible text, whitespace-normalized.

        On any failure (network error, non-2xx status, parse error) this
        returns a string starting with "Erreur" instead of raising; callers
        test that prefix (see search_semantic).
        """
        try:
            response = requests.get(url, timeout=10)
            response.raise_for_status()

            soup = BeautifulSoup(response.content, 'html.parser')
            # Drop non-visible content before extracting text.
            for tag in soup(["script", "style"]):
                tag.decompose()

            text = soup.get_text()
            # Collapse blank lines and runs of spaces into single spaces.
            lines = (line.strip() for line in text.splitlines())
            pieces = (
                piece.strip() for line in lines for piece in line.split(" ")
            )
            return ' '.join(piece for piece in pieces if piece)
        except Exception as e:
            # String-typed error channel kept for interface compatibility.
            return f"Erreur lors de l'extraction: {str(e)}"

    def create_chunks(self, text: str) -> List[str]:
        """Split *text* into overlapping ~500-character chunks."""
        return self.text_splitter.split_text(text)

    def search_semantic(self, query: str, url: str) -> str:
        """Return the page chunks most semantically similar to *query*.

        Returns up to 3 matching chunks joined by blank lines, or a
        French status/error message when extraction fails or nothing
        matches.
        """
        # 1. Fetch and clean the page; propagate extraction errors as-is.
        content = self.extract_content(url)
        if content.startswith("Erreur"):
            return content

        # 2. Chunk the text.
        chunks = self.create_chunks(content)
        if not chunks:
            return "Aucun contenu trouvé dans la page"

        # 3. Unique collection name avoids collisions between concurrent
        #    searches sharing one client.
        collection_name = f"temp_collection_{uuid.uuid4().hex[:8]}"
        collection = self.chroma_client.create_collection(
            name=collection_name,
            embedding_function=None,  # embeddings are supplied explicitly below
        )

        try:
            # 4. Embed every chunk in one batched call.
            chunk_embeddings = self.embedding_model.encode(chunks)

            # 5. Index the chunks with stable positional ids.
            collection.add(
                documents=chunks,
                embeddings=chunk_embeddings.tolist(),
                ids=[f"chunk_{i}" for i in range(len(chunks))],
            )

            # 6. Embed the query.
            query_embedding = self.embedding_model.encode([query])

            # 7. Retrieve the closest chunks. BUG FIX: never request more
            #    results than indexed documents — Chroma errors on short pages.
            results = collection.query(
                query_embeddings=query_embedding.tolist(),
                n_results=min(3, len(chunks)),
            )

            # 8. Format the hits, best match first. BUG FIX: the original
            #    tested only `results['documents']`, but an empty query result
            #    is `[[]]` (truthy), which produced an empty string instead of
            #    the "no results" message. Also removed a per-chunk similarity
            #    score that was computed but never used.
            documents = results.get('documents') or []
            if documents and documents[0]:
                return "\n\n".join(documents[0])
            return "Aucun résultat trouvé"
        finally:
            # Always drop the temporary collection, even on error.
            self.chroma_client.delete_collection(collection_name)