NormAssist / src /vector_store.py
Dakoro's picture
Deploy NormAssist — 2026-03-30 16:14
4211d95
Raw
History Blame Contribute Delete
6.89 kB
"""Gestion du vector store FAISS avec embeddings HuggingFace."""
from __future__ import annotations
import json
import logging
from pathlib import Path
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer
from src.config import EMBEDDING_MODEL, TOP_K, VECTORSTORE_DIR
from src.document_processor import DocumentChunk, ParsedDocument
logger = logging.getLogger(__name__)
class VectorStore:
    """FAISS vector store with on-disk persistence.

    Texts are embedded with a SentenceTransformer model and stored in a flat
    inner-product FAISS index. ``self.chunks`` is kept in the same order as
    the index rows, so a row id returned by FAISS is also the position of the
    matching chunk in ``self.chunks``.
    """

    def __init__(self, model_name: str = EMBEDDING_MODEL):
        """Load the embedding model and create an empty index.

        Args:
            model_name: Name passed to ``SentenceTransformer``.
        """
        logger.info("Chargement du modèle d'embeddings : %s", model_name)
        self.model = SentenceTransformer(model_name)
        self.dimension = self.model.get_sentence_embedding_dimension()

        # Inner product over L2-normalised vectors is cosine similarity
        # (normalisation is requested in ``_encode``).
        self.index = faiss.IndexFlatIP(self.dimension)

        # Chunks stored in parallel with the index (same order as its rows).
        self.chunks: list[DocumentChunk] = []

        # Registry of indexed documents (doc hash -> filename).
        self.indexed_docs: dict[str, str] = {}

    # ── Indexing ─────────────────────────────────────────────────────────
    def add_document(self, parsed_doc: ParsedDocument) -> int:
        """Add one parsed document to the store.

        A document whose hash is already registered, or that has no chunks,
        is skipped.

        Args:
            parsed_doc: Document providing ``doc_hash``, ``filename`` and
                ``chunks``.

        Returns:
            The number of chunks added (0 when the document was skipped).
        """
        if parsed_doc.doc_hash in self.indexed_docs:
            logger.info(
                "Document déjà indexé : %s (hash=%s)",
                parsed_doc.filename,
                parsed_doc.doc_hash,
            )
            return 0

        if not parsed_doc.chunks:
            logger.warning("Aucun chunk à indexer pour %s", parsed_doc.filename)
            return 0

        texts = [chunk.text for chunk in parsed_doc.chunks]
        embeddings = self._encode(texts)

        # Update the index first: if it raises, neither the chunk list nor
        # the registry has been touched.
        self.index.add(embeddings)
        self.chunks.extend(parsed_doc.chunks)
        self.indexed_docs[parsed_doc.doc_hash] = parsed_doc.filename

        logger.info(
            "Indexé %d chunks pour %s (total index : %d)",
            len(parsed_doc.chunks),
            parsed_doc.filename,
            self.index.ntotal,
        )
        return len(parsed_doc.chunks)

    def add_documents(self, parsed_docs: list[ParsedDocument]) -> int:
        """Add several parsed documents.

        Returns:
            The total number of chunks added across all documents.
        """
        return sum(self.add_document(doc) for doc in parsed_docs)

    # ── Search ───────────────────────────────────────────────────────────
    def search(self, query: str, top_k: int = TOP_K) -> list[dict]:
        """Return the chunks most similar to ``query``.

        Args:
            query: Text to search for.
            top_k: Maximum number of results; capped at the index size.

        Returns:
            A list of dicts with keys ``"text"``, ``"metadata"`` and
            ``"score"``, in the order returned by FAISS. Empty when the store
            is empty or ``top_k`` is not positive.
        """
        if self.index.ntotal == 0:
            logger.warning("Vector store vide — aucune recherche possible.")
            return []

        # FAISS rejects k <= 0; a non-positive request simply has no results.
        if top_k <= 0:
            return []

        query_embedding = self._encode([query])
        k = min(top_k, self.index.ntotal)
        scores, indices = self.index.search(query_embedding, k)

        results = []
        for score, idx in zip(scores[0], indices[0]):
            if idx < 0:  # FAISS returns -1 for missing results
                continue
            chunk = self.chunks[idx]
            results.append(
                {
                    "text": chunk.text,
                    "metadata": chunk.metadata,
                    "score": float(score),
                }
            )
        return results

    # ── Persistence ──────────────────────────────────────────────────────
    def save(self, name: str = "default") -> Path:
        """Write the index, chunks and document registry to disk.

        Three files are written under ``VECTORSTORE_DIR / name``:
        ``index.faiss``, ``chunks.json`` and ``registry.json``. Only the
        ``text`` and ``metadata`` of each chunk are serialised.

        Args:
            name: Sub-directory name of the store.

        Returns:
            The directory the store was written to.
        """
        store_dir = VECTORSTORE_DIR / name
        store_dir.mkdir(parents=True, exist_ok=True)

        # FAISS index
        index_path = store_dir / "index.faiss"
        faiss.write_index(self.index, str(index_path))

        # Chunks (text + metadata)
        chunks_data = [
            {"text": c.text, "metadata": c.metadata} for c in self.chunks
        ]
        chunks_path = store_dir / "chunks.json"
        with open(chunks_path, "w", encoding="utf-8") as f:
            json.dump(chunks_data, f, ensure_ascii=False, indent=2)

        # Document registry
        registry_path = store_dir / "registry.json"
        with open(registry_path, "w", encoding="utf-8") as f:
            json.dump(self.indexed_docs, f, ensure_ascii=False, indent=2)

        logger.info("Vector store sauvegardé dans %s", store_dir)
        return store_dir

    def load(self, name: str = "default") -> bool:
        """Load a saved store from ``VECTORSTORE_DIR / name``.

        Everything is read and validated into local variables first and only
        then assigned to the instance, so a failed load (``False`` return or
        exception) leaves the current index, chunks and registry untouched.

        Args:
            name: Sub-directory name of the store.

        Returns:
            ``True`` when the store was loaded. ``False`` when the index or
            chunk file is missing, when the saved index dimension differs
            from the current model's, or when the number of saved chunks does
            not match the number of vectors in the index.

        Raises:
            Exception: Whatever ``faiss.read_index`` or ``json.load`` raise
                on an unreadable or corrupt file is propagated.
        """
        store_dir = VECTORSTORE_DIR / name
        index_path = store_dir / "index.faiss"
        chunks_path = store_dir / "chunks.json"
        registry_path = store_dir / "registry.json"

        if not index_path.exists():
            logger.info("Aucun index sauvegardé trouvé (%s)", store_dir)
            return False

        # An index without its chunks is unusable: row ids could not be
        # mapped back to text.
        if not chunks_path.exists():
            logger.error(
                "Fichier de chunks manquant pour l'index sauvegardé (%s)",
                chunks_path,
            )
            return False

        loaded_index = faiss.read_index(str(index_path))

        # A store saved with another embedding model cannot be searched with
        # vectors produced by the current one.
        if loaded_index.d != self.dimension:
            logger.error(
                "Dimension de l'index (%d) incompatible avec le modèle (%s)",
                loaded_index.d,
                self.dimension,
            )
            return False

        with open(chunks_path, "r", encoding="utf-8") as f:
            chunks_data = json.load(f)
        loaded_chunks = [
            DocumentChunk(text=c["text"], metadata=c["metadata"])
            for c in chunks_data
        ]

        # The chunk list must stay aligned with the index rows, otherwise
        # search() would look up the wrong chunk or run past the list end.
        if len(loaded_chunks) != loaded_index.ntotal:
            logger.error(
                "Index incohérent : %d vecteurs pour %d chunks (%s)",
                loaded_index.ntotal,
                len(loaded_chunks),
                store_dir,
            )
            return False

        # Start from an empty registry so hashes from a previously held store
        # do not make add_document() skip documents absent from this index.
        loaded_registry: dict[str, str] = {}
        if registry_path.exists():
            with open(registry_path, "r", encoding="utf-8") as f:
                loaded_registry = json.load(f)

        # Commit only once everything has been read and validated.
        self.index = loaded_index
        self.chunks = loaded_chunks
        self.indexed_docs = loaded_registry

        logger.info(
            "Index chargé : %d vecteurs, %d documents",
            self.index.ntotal,
            len(self.indexed_docs),
        )
        return True

    # ── Info ─────────────────────────────────────────────────────────────
    @property
    def stats(self) -> dict:
        """Summary of the store: vector count, chunk count, filenames, dimension."""
        return {
            "total_vectors": self.index.ntotal,
            "total_chunks": len(self.chunks),
            "indexed_documents": list(self.indexed_docs.values()),
            "embedding_dimension": self.dimension,
        }

    # ── Private ──────────────────────────────────────────────────────────
    def _encode(self, texts: list[str]) -> np.ndarray:
        """Encode ``texts`` into normalised float32 embeddings.

        Normalisation is requested from the model so that the inner-product
        index scores behave as cosine similarity.
        """
        embeddings = self.model.encode(
            texts, show_progress_bar=False, normalize_embeddings=True
        )
        return np.array(embeddings, dtype=np.float32)