# Hf_Agent_Course_Final_Assignment / web_semantic_search_tool.py
# felixmortas's picture
# Improve URL search tool with RAG
# e9b8de1
import requests
from bs4 import BeautifulSoup
import torch
from sentence_transformers import SentenceTransformer
import chromadb
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.tools import Tool
import uuid
from typing import List, Dict
class WebSemanticSearchTool:
    """Semantic (RAG-style) search over the text content of a single web page.

    Pipeline: fetch the page, strip markup, split the visible text into
    overlapping chunks, embed chunks and query with a SentenceTransformer
    model, index the chunks in a throwaway in-memory Chroma collection, and
    return the chunks most similar to the query.
    """

    def __init__(self):
        # Sentence embedding model (MiniLM); weights download on first use.
        self.embedding_model = SentenceTransformer('all-MiniLM-L12-v2')
        # In-memory Chroma client — a fresh collection is created per search
        # and deleted afterwards, so nothing persists between calls.
        self.chroma_client = chromadb.Client()
        # Character-count splitter; the overlap preserves context across
        # chunk boundaries.
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=500,
            chunk_overlap=50,
            length_function=len,
        )

    def extract_content(self, url: str) -> str:
        """Fetch *url* and return its visible text, whitespace-normalized.

        On any failure, an error string prefixed with "Erreur" is returned
        instead of raising; callers (search_semantic) check that prefix.
        """
        try:
            # A browser-like User-Agent avoids trivial 403 blocks from
            # sites that reject the default requests UA.
            headers = {"User-Agent": "Mozilla/5.0 (compatible; WebSemanticSearchTool)"}
            response = requests.get(url, timeout=10, headers=headers)
            response.raise_for_status()
            soup = BeautifulSoup(response.content, 'html.parser')
            # Remove non-content elements before extracting text.
            for element in soup(["script", "style"]):
                element.decompose()
            # Collapse every run of whitespace (newlines, tabs, spaces)
            # into a single space.
            return ' '.join(soup.get_text().split())
        except Exception as e:
            # Sentinel error string; search_semantic() checks this prefix.
            return f"Erreur lors de l'extraction: {str(e)}"

    def create_chunks(self, text: str) -> List[str]:
        """Split *text* into overlapping ~500-character chunks."""
        return self.text_splitter.split_text(text)

    def search_semantic(self, query: str, url: str) -> str:
        """Return the page chunks most semantically similar to *query*.

        Args:
            query: Natural-language search query.
            url: Web page to fetch and search within.

        Returns:
            Up to three matching chunks joined by blank lines, or a
            French error/status message on failure or empty results.
        """
        # 1. Fetch and clean the page text.
        content = self.extract_content(url)
        if content.startswith("Erreur"):
            return content

        # 2. Chunk the text.
        chunks = self.create_chunks(content)
        if not chunks:
            return "Aucun contenu trouvé dans la page"

        # 3. Unique throwaway collection name so repeated/concurrent
        #    searches never collide.
        collection_name = f"temp_collection_{uuid.uuid4().hex[:8]}"
        collection = self.chroma_client.create_collection(
            name=collection_name,
            embedding_function=None,  # embeddings are supplied explicitly
        )
        try:
            # 4. Embed all chunks in one batch.
            chunk_embeddings = self.embedding_model.encode(chunks)

            # 5. Index the chunks with stable ids.
            collection.add(
                documents=chunks,
                embeddings=chunk_embeddings.tolist(),
                ids=[f"chunk_{i}" for i in range(len(chunks))],
            )

            # 6. Embed the query (encode expects a list of sentences).
            query_embedding = self.embedding_model.encode([query])

            # 7. Never request more results than indexed documents:
            #    some Chroma versions raise when n_results > collection size.
            results = collection.query(
                query_embeddings=query_embedding.tolist(),
                n_results=min(3, len(chunks)),
            )

            # 8. Chroma nests results per query: documents == [[...]].
            #    Guard against both a missing outer list and an empty
            #    inner list (the original truthiness check missed [[]]).
            documents = results.get('documents') or []
            if documents and documents[0]:
                return "\n\n".join(documents[0])
            return "Aucun résultat trouvé"
        finally:
            # Always drop the temporary collection, even on error.
            self.chroma_client.delete_collection(collection_name)