Spaces:
Running
Running
| """Gestion du vector store FAISS avec embeddings HuggingFace.""" | |
| from __future__ import annotations | |
| import json | |
| import logging | |
| from pathlib import Path | |
| import faiss | |
| import numpy as np | |
| from sentence_transformers import SentenceTransformer | |
| from src.config import EMBEDDING_MODEL, TOP_K, VECTORSTORE_DIR | |
| from src.document_processor import DocumentChunk, ParsedDocument | |
| logger = logging.getLogger(__name__) | |
class VectorStore:
    """FAISS vector store with on-disk persistence.

    Texts are embedded with a SentenceTransformer model, L2-normalised and
    stored in a flat inner-product index, so the scores returned by
    ``search`` are cosine similarities.

    Invariant: ``chunks[i]`` is the chunk whose embedding is row ``i`` of
    ``index``. Every method that mutates one also mutates the other.
    """

    def __init__(self, model_name: str = EMBEDDING_MODEL):
        """Load the embedding model and create an empty index.

        Args:
            model_name: Model identifier handed to ``SentenceTransformer``.
        """
        logger.info("Chargement du modèle d'embeddings : %s", model_name)
        self.model = SentenceTransformer(model_name)
        self.dimension = self.model.get_sentence_embedding_dimension()

        # Inner product over L2-normalised vectors equals cosine similarity.
        self.index = faiss.IndexFlatIP(self.dimension)

        # Chunks stored in parallel with the index (same order as its rows).
        self.chunks: list[DocumentChunk] = []

        # Registry of already indexed documents (doc hash -> filename).
        self.indexed_docs: dict[str, str] = {}

    # ── Indexing ─────────────────────────────────────────────────────────

    def add_document(self, parsed_doc: ParsedDocument) -> int:
        """Embed and index every chunk of a parsed document.

        A document whose hash is already in the registry is skipped, as is a
        document without chunks.

        Args:
            parsed_doc: Parsed document exposing ``doc_hash``, ``filename``
                and ``chunks``.

        Returns:
            Number of chunks added to the index (0 when skipped).
        """
        if parsed_doc.doc_hash in self.indexed_docs:
            logger.info(
                "Document déjà indexé : %s (hash=%s)",
                parsed_doc.filename,
                parsed_doc.doc_hash,
            )
            return 0

        if not parsed_doc.chunks:
            logger.warning("Aucun chunk à indexer pour %s", parsed_doc.filename)
            return 0

        embeddings = self._encode([chunk.text for chunk in parsed_doc.chunks])

        # Index and chunk list are updated together to keep rows aligned.
        self.index.add(embeddings)
        self.chunks.extend(parsed_doc.chunks)
        self.indexed_docs[parsed_doc.doc_hash] = parsed_doc.filename

        added = len(parsed_doc.chunks)
        logger.info(
            "Indexé %d chunks pour %s (total index : %d)",
            added,
            parsed_doc.filename,
            self.index.ntotal,
        )
        return added

    def add_documents(self, parsed_docs: list[ParsedDocument]) -> int:
        """Index several parsed documents.

        Args:
            parsed_docs: Documents to index, processed in order.

        Returns:
            Total number of chunks added across all documents.
        """
        return sum(self.add_document(doc) for doc in parsed_docs)

    # ── Search ───────────────────────────────────────────────────────────

    def search(self, query: str, top_k: int = TOP_K) -> list[dict]:
        """Return the chunks most similar to ``query``.

        Args:
            query: Free-text query to embed and look up.
            top_k: Maximum number of results; capped at the index size.

        Returns:
            A list of dicts with keys ``"text"``, ``"metadata"`` and
            ``"score"``, in the order FAISS returns them. The list is empty
            when the store is empty or ``top_k`` is not positive.
        """
        if self.index.ntotal == 0:
            logger.warning("Vector store vide — aucune recherche possible.")
            return []

        if top_k <= 0:
            # FAISS asserts k > 0; asking for no results yields no results.
            logger.warning("top_k invalide (%s) — aucun résultat retourné.", top_k)
            return []

        query_embedding = self._encode([query])
        k = min(top_k, self.index.ntotal)
        scores, indices = self.index.search(query_embedding, k)

        results = []
        for score, idx in zip(scores[0], indices[0]):
            if idx < 0:  # FAISS pads missing results with -1
                continue
            chunk = self.chunks[int(idx)]
            results.append(
                {
                    "text": chunk.text,
                    "metadata": chunk.metadata,
                    "score": float(score),
                }
            )
        return results

    # ── Persistence ──────────────────────────────────────────────────────

    def save(self, name: str = "default") -> Path:
        """Write the index, the chunks and the registry to disk.

        Three files are written under ``VECTORSTORE_DIR / name``:
        ``index.faiss``, ``chunks.json`` and ``registry.json``.

        Args:
            name: Sub-directory of ``VECTORSTORE_DIR`` to save into.

        Returns:
            The directory the store was written to.
        """
        store_dir = VECTORSTORE_DIR / name
        store_dir.mkdir(parents=True, exist_ok=True)

        # FAISS index.
        faiss.write_index(self.index, str(store_dir / "index.faiss"))

        # Chunks (text + metadata), in index order.
        chunks_data = [{"text": c.text, "metadata": c.metadata} for c in self.chunks]
        with open(store_dir / "chunks.json", "w", encoding="utf-8") as f:
            json.dump(chunks_data, f, ensure_ascii=False, indent=2)

        # Registry of indexed documents.
        with open(store_dir / "registry.json", "w", encoding="utf-8") as f:
            json.dump(self.indexed_docs, f, ensure_ascii=False, indent=2)

        logger.info("Vector store sauvegardé dans %s", store_dir)
        return store_dir

    def load(self, name: str = "default") -> bool:
        """Load a previously saved store from disk.

        Everything is read and validated into local variables first, and the
        instance is only updated once the data is known to be consistent. A
        failed load therefore leaves the current state untouched.

        Args:
            name: Sub-directory of ``VECTORSTORE_DIR`` to load from.

        Returns:
            True when the store was loaded. False when the index or chunk
            file is missing, when the stored index dimension differs from
            the embedding model's, or when the number of vectors does not
            match the number of chunks.

        Note:
            Errors raised while parsing an existing but corrupt file are not
            caught and propagate to the caller.
        """
        store_dir = VECTORSTORE_DIR / name
        index_path = store_dir / "index.faiss"
        chunks_path = store_dir / "chunks.json"
        registry_path = store_dir / "registry.json"

        if not index_path.exists():
            logger.info("Aucun index sauvegardé trouvé (%s)", store_dir)
            return False

        if not chunks_path.exists():
            # An index without its chunks cannot be mapped back to text.
            logger.error("Store incomplet : %s introuvable", chunks_path)
            return False

        index = faiss.read_index(str(index_path))

        with open(chunks_path, "r", encoding="utf-8") as f:
            chunks_data = json.load(f)
        chunks = [
            DocumentChunk(text=c["text"], metadata=c["metadata"])
            for c in chunks_data
        ]

        if index.d != self.dimension:
            # Queries are embedded with self.model; a different dimension
            # would make every later search fail inside FAISS.
            logger.error(
                "Dimension de l'index (%d) incompatible avec le modèle (%d)",
                index.d,
                self.dimension,
            )
            return False

        if index.ntotal != len(chunks):
            # Row i of the index must correspond to chunks[i].
            logger.error(
                "Store incohérent : %d vecteurs pour %d chunks (%s)",
                index.ntotal,
                len(chunks),
                store_dir,
            )
            return False

        # A missing registry means "no known documents": do not keep the
        # registry of whatever was loaded or indexed before.
        indexed_docs: dict[str, str] = {}
        if registry_path.exists():
            with open(registry_path, "r", encoding="utf-8") as f:
                indexed_docs = json.load(f)

        self.index = index
        self.chunks = chunks
        self.indexed_docs = indexed_docs

        logger.info(
            "Index chargé : %d vecteurs, %d documents",
            self.index.ntotal,
            len(self.indexed_docs),
        )
        return True

    # ── Info ─────────────────────────────────────────────────────────────

    def stats(self) -> dict:
        """Return summary statistics about the store.

        Returns:
            A dict with ``"total_vectors"``, ``"total_chunks"``,
            ``"indexed_documents"`` (list of filenames) and
            ``"embedding_dimension"``.
        """
        return {
            "total_vectors": self.index.ntotal,
            "total_chunks": len(self.chunks),
            "indexed_documents": list(self.indexed_docs.values()),
            "embedding_dimension": self.dimension,
        }

    # ── Private ──────────────────────────────────────────────────────────

    def _encode(self, texts: list[str]) -> np.ndarray:
        """Encode texts into normalised float32 embeddings.

        Args:
            texts: Texts to embed.

        Returns:
            A C-contiguous float32 array of the embeddings, which is the
            memory layout FAISS expects for ``add`` and ``search``.
        """
        embeddings = self.model.encode(
            texts, show_progress_bar=False, normalize_embeddings=True
        )
        return np.ascontiguousarray(embeddings, dtype=np.float32)