Spaces:

Dakoro
/

NormAssist

Running

App Files Files Community

NormAssist / src /vector_store.py

Dakoro

Deploy NormAssist — 2026-03-30 16:14

4211d95 3 months ago

Raw

History Blame Contribute Delete

6.89 kB

	"""Gestion du vector store FAISS avec embeddings HuggingFace."""

	from __future__ import annotations

	import json
	import logging
	from pathlib import Path

	import faiss
	import numpy as np
	from sentence_transformers import SentenceTransformer

	from src.config import EMBEDDING_MODEL, TOP_K, VECTORSTORE_DIR
	from src.document_processor import DocumentChunk, ParsedDocument

	# Module-level logger, named after this module via ``__name__``.
	logger = logging.getLogger(__name__)


	class VectorStore:
	    """FAISS vector store with on-disk persistence.

	    Chunks are embedded with a SentenceTransformer model and stored in a flat
	    inner-product FAISS index. Embeddings are L2-normalised at encoding time,
	    so the inner product is the cosine similarity.

	    Attributes:
	        model: The SentenceTransformer used to embed texts.
	        dimension: Embedding dimension reported by the model.
	        index: The FAISS index holding one vector per chunk.
	        chunks: Chunks in the same order as the vectors in ``index``.
	        indexed_docs: Registry of indexed documents (doc hash -> filename).
	    """

	    def __init__(self, model_name: str = EMBEDDING_MODEL):
	        """Load the embedding model and create an empty index.

	        Args:
	            model_name: Name of the SentenceTransformer model to load.
	        """
	        logger.info("Chargement du modèle d'embeddings : %s", model_name)
	        self.model = SentenceTransformer(model_name)
	        self.dimension = self.model.get_sentence_embedding_dimension()

	        # Inner product over L2-normalised vectors == cosine similarity.
	        self.index = faiss.IndexFlatIP(self.dimension)

	        # Parallel storage: chunks[i] is the chunk behind vector i.
	        self.chunks: list[DocumentChunk] = []

	        # Registry of already indexed documents (hash -> filename).
	        self.indexed_docs: dict[str, str] = {}

	    # ── Indexing ─────────────────────────────────────────────────────────

	    def add_document(self, parsed_doc: ParsedDocument) -> int:
	        """Embed and index every chunk of a parsed document.

	        A document whose hash is already registered, or which has no chunks,
	        is skipped.

	        Args:
	            parsed_doc: The parsed document to index.

	        Returns:
	            The number of chunks added (0 when the document was skipped).
	        """
	        if parsed_doc.doc_hash in self.indexed_docs:
	            logger.info(
	                "Document déjà indexé : %s (hash=%s)",
	                parsed_doc.filename,
	                parsed_doc.doc_hash,
	            )
	            return 0

	        if not parsed_doc.chunks:
	            logger.warning("Aucun chunk à indexer pour %s", parsed_doc.filename)
	            return 0

	        embeddings = self._encode([chunk.text for chunk in parsed_doc.chunks])

	        # Keep the index, the chunk list and the registry in lockstep.
	        self.index.add(embeddings)
	        self.chunks.extend(parsed_doc.chunks)
	        self.indexed_docs[parsed_doc.doc_hash] = parsed_doc.filename

	        logger.info(
	            "Indexé %d chunks pour %s (total index : %d)",
	            len(parsed_doc.chunks),
	            parsed_doc.filename,
	            self.index.ntotal,
	        )
	        return len(parsed_doc.chunks)

	    def add_documents(self, parsed_docs: list[ParsedDocument]) -> int:
	        """Index several parsed documents.

	        Args:
	            parsed_docs: The parsed documents to index, in order.

	        Returns:
	            The total number of chunks added across all documents.
	        """
	        return sum(self.add_document(doc) for doc in parsed_docs)

	    # ── Search ───────────────────────────────────────────────────────────

	    def search(self, query: str, top_k: int = TOP_K) -> list[dict]:
	        """Return the chunks most similar to a query.

	        Args:
	            query: The query text.
	            top_k: Maximum number of results; values <= 0 yield no results.

	        Returns:
	            A list of dicts with keys ``"text"``, ``"metadata"`` and
	            ``"score"``, in the order returned by the FAISS search. Empty
	            when the store is empty or ``top_k`` is not positive.
	        """
	        if self.index.ntotal == 0:
	            logger.warning("Vector store vide — aucune recherche possible.")
	            return []

	        # Never ask for more neighbours than there are vectors, and never
	        # hand a non-positive k to FAISS.
	        k = min(top_k, self.index.ntotal)
	        if k <= 0:
	            return []

	        query_embedding = self._encode([query])
	        scores, indices = self.index.search(query_embedding, k)

	        results = []
	        for score, idx in zip(scores[0], indices[0]):
	            if idx < 0:  # FAISS returns -1 for missing results
	                continue
	            chunk = self.chunks[idx]
	            results.append(
	                {
	                    "text": chunk.text,
	                    "metadata": chunk.metadata,
	                    "score": float(score),
	                }
	            )

	        return results

	    # ── Persistence ──────────────────────────────────────────────────────

	    def save(self, name: str = "default") -> Path:
	        """Write the index, the chunks and the registry to disk.

	        Three files are written under ``VECTORSTORE_DIR / name``:
	        ``index.faiss``, ``chunks.json`` and ``registry.json``.

	        Args:
	            name: Sub-directory name of the store.

	        Returns:
	            The directory the store was written to.
	        """
	        store_dir = VECTORSTORE_DIR / name
	        store_dir.mkdir(parents=True, exist_ok=True)

	        # FAISS index.
	        faiss.write_index(self.index, str(store_dir / "index.faiss"))

	        # Chunks (text + metadata), in index order.
	        chunks_data = [
	            {"text": chunk.text, "metadata": chunk.metadata}
	            for chunk in self.chunks
	        ]
	        with open(store_dir / "chunks.json", "w", encoding="utf-8") as f:
	            json.dump(chunks_data, f, ensure_ascii=False, indent=2)

	        # Document registry.
	        with open(store_dir / "registry.json", "w", encoding="utf-8") as f:
	            json.dump(self.indexed_docs, f, ensure_ascii=False, indent=2)

	        logger.info("Vector store sauvegardé dans %s", store_dir)
	        return store_dir

	    def load(self, name: str = "default") -> bool:
	        """Load a previously saved store from disk.

	        Everything is read and validated into local variables first; the
	        instance is only updated once the saved data is known to be
	        consistent, so a failed load leaves the current store untouched.

	        Args:
	            name: Sub-directory name of the store under ``VECTORSTORE_DIR``.

	        Returns:
	            True when the store was loaded. False when no saved index exists,
	            when ``chunks.json`` is missing, or when the saved data does not
	            match (embedding dimension differs from the current model, or the
	            number of vectors differs from the number of chunks).
	        """
	        store_dir = VECTORSTORE_DIR / name
	        index_path = store_dir / "index.faiss"
	        chunks_path = store_dir / "chunks.json"
	        registry_path = store_dir / "registry.json"

	        if not index_path.exists():
	            logger.info("Aucun index sauvegardé trouvé (%s)", store_dir)
	            return False

	        if not chunks_path.exists():
	            logger.warning(
	                "Fichier de chunks introuvable (%s) — chargement annulé",
	                chunks_path,
	            )
	            return False

	        index = faiss.read_index(str(index_path))

	        with open(chunks_path, "r", encoding="utf-8") as f:
	            chunks_data = json.load(f)
	        chunks = [
	            DocumentChunk(text=c["text"], metadata=c["metadata"])
	            for c in chunks_data
	        ]

	        # An index built with another embedding model cannot be queried with
	        # the vectors produced by the current one.
	        if index.d != self.dimension:
	            logger.error(
	                "Dimension de l'index (%d) différente de celle du modèle (%d)",
	                index.d,
	                self.dimension,
	            )
	            return False

	        # search() maps vector positions to chunks by list index, so both
	        # collections must have exactly the same length.
	        if index.ntotal != len(chunks):
	            logger.error(
	                "Index incohérent : %d vecteurs pour %d chunks",
	                index.ntotal,
	                len(chunks),
	            )
	            return False

	        # Start from an empty registry so that entries from a previously
	        # loaded store never leak into this one when registry.json is absent.
	        indexed_docs: dict[str, str] = {}
	        if registry_path.exists():
	            with open(registry_path, "r", encoding="utf-8") as f:
	                indexed_docs = json.load(f)

	        self.index = index
	        self.chunks = chunks
	        self.indexed_docs = indexed_docs

	        logger.info(
	            "Index chargé : %d vecteurs, %d documents",
	            self.index.ntotal,
	            len(self.indexed_docs),
	        )
	        return True

	    # ── Info ─────────────────────────────────────────────────────────────

	    @property
	    def stats(self) -> dict:
	        """Summary of the store.

	        Returns:
	            A dict with ``"total_vectors"``, ``"total_chunks"``,
	            ``"indexed_documents"`` (list of filenames) and
	            ``"embedding_dimension"``.
	        """
	        return {
	            "total_vectors": self.index.ntotal,
	            "total_chunks": len(self.chunks),
	            "indexed_documents": list(self.indexed_docs.values()),
	            "embedding_dimension": self.dimension,
	        }

	    # ── Private ──────────────────────────────────────────────────────────

	    def _encode(self, texts: list[str]) -> np.ndarray:
	        """Encode texts into normalised float32 embeddings.

	        Args:
	            texts: The texts to embed.

	        Returns:
	            A float32 array built from the model output, encoded with
	            ``normalize_embeddings=True``.
	        """
	        embeddings = self.model.encode(
	            texts, show_progress_bar=False, normalize_embeddings=True
	        )
	        return np.array(embeddings, dtype=np.float32)