import requests
from bs4 import BeautifulSoup
import torch
from sentence_transformers import SentenceTransformer
import chromadb
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.tools import Tool
import uuid
from typing import List, Dict


class WebSemanticSearchTool:
    def __init__(self):
        # Initialize the embedding model
        self.embedding_model = SentenceTransformer('all-MiniLM-L12-v2')
        # Initialize Chroma (in-memory)
        self.chroma_client = chromadb.Client()
        # Text splitter used for chunking
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=500,
            chunk_overlap=50,
            length_function=len
        )

    def extract_content(self, url: str) -> str:
        """Extract the textual content of a web page."""
        try:
            response = requests.get(url, timeout=10)
            response.raise_for_status()
            soup = BeautifulSoup(response.content, 'html.parser')
            # Remove scripts and styles
            for script in soup(["script", "style"]):
                script.decompose()
            # Extract the main text
            text = soup.get_text()
            # Clean up whitespace
            lines = (line.strip() for line in text.splitlines())
            chunks = (phrase.strip() for line in lines for phrase in line.split(" "))
            text = ' '.join(chunk for chunk in chunks if chunk)
            return text
        except Exception as e:
            return f"Extraction error: {str(e)}"

    def create_chunks(self, text: str) -> List[str]:
        """Split the text into chunks."""
        return self.text_splitter.split_text(text)

    def search_semantic(self, query: str, url: str) -> str:
        """Semantic search within a web page."""
        # 1. Extract the content
        content = self.extract_content(url)
        if content.startswith("Extraction error"):
            return content
        # 2. Create the chunks
        chunks = self.create_chunks(content)
        if not chunks:
            return "No content found in the page"
        # 3. Create a temporary Chroma collection
        collection_name = f"temp_collection_{uuid.uuid4().hex[:8]}"
        collection = self.chroma_client.create_collection(
            name=collection_name,
            embedding_function=None  # We handle the embeddings manually
        )
        try:
            # 4. Generate embeddings for all chunks
            chunk_embeddings = self.embedding_model.encode(chunks)
            # 5. Add the chunks to the collection
            collection.add(
                documents=chunks,
                embeddings=chunk_embeddings.tolist(),
                ids=[f"chunk_{i}" for i in range(len(chunks))]
            )
            # 6. Generate the embedding of the query
            query_embedding = self.embedding_model.encode([query])
            # 7. Retrieve the 3 most similar chunks
            results = collection.query(
                query_embeddings=query_embedding.tolist(),
                n_results=3
            )
            # 8. Format the results
            if results['documents'] and results['documents'][0]:
                top_chunks = results['documents'][0]
                distances = results['distances'][0] if results['distances'] else []
                formatted_results = []
                for i, chunk in enumerate(top_chunks):
                    if distances:
                        # Convert the distance into a rough similarity score for display
                        similarity = 1 - distances[i]
                        formatted_results.append(f"[similarity: {similarity:.3f}]\n{chunk}")
                    else:
                        formatted_results.append(chunk)
                return "\n\n".join(formatted_results)
            else:
                return "No result found"
        finally:
            # Clean up the temporary collection
            self.chroma_client.delete_collection(collection_name)
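
The `Tool` import suggests the class is meant to be exposed to a LangChain agent. Below is a minimal sketch of that wiring, not part of the original listing: the tool name, description, the `_run_web_semantic_search` helper, and the `"<query> || <url>"` input convention are all illustrative assumptions made so that `search_semantic` fits the single-string-input form that a plain `Tool` expects.

    from langchain.tools import Tool

    # Instantiate the semantic search helper defined above
    web_search = WebSemanticSearchTool()

    def _run_web_semantic_search(tool_input: str) -> str:
        """Expect input of the form '<query> || <url>' and run the semantic search."""
        # NOTE: the '||' separator is an illustrative convention, not part of the class above.
        query, _, url = tool_input.partition("||")
        return web_search.search_semantic(query.strip(), url.strip())

    web_semantic_search_tool = Tool(
        name="web_semantic_search",
        func=_run_web_semantic_search,
        description=(
            "Semantic search inside a single web page. "
            "Input format: '<question> || <page URL>'."
        ),
    )

    # Standalone call, without going through the Tool wrapper:
    # print(web_search.search_semantic("What is this page about?", "https://example.com"))

The class can of course also be used directly, as in the commented-out call above; the `Tool` wrapper only matters when the search needs to be handed to an agent alongside other tools.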