NormAssist / src /document_processor.py
Dakoro's picture
Deploy NormAssist — 2026-03-30 16:14
4211d95
Raw
History Blame Contribute Delete
5.6 kB
"""Parsing et chunking de documents PDF techniques."""
from __future__ import annotations

import hashlib
import logging
import re
from dataclasses import dataclass, field
from pathlib import Path

import fitz  # PyMuPDF
from langchain_text_splitters import RecursiveCharacterTextSplitter

from src.config import CHUNK_OVERLAP, CHUNK_SIZE
logger = logging.getLogger(__name__)
@dataclass
class DocumentChunk:
    """A fragment of document text together with its metadata."""

    # Text content of the fragment.
    text: str
    # Free-form metadata; ``page_label`` reads the "page" key from it.
    metadata: dict = field(default_factory=dict)

    @property
    def page_label(self) -> str:
        """Return a short page reference such as ``p.3``.

        Falls back to ``p.?`` when the metadata has no "page" entry.
        """
        page_number = self.metadata.get("page", "?")
        return f"p.{page_number}"
@dataclass
class ParsedDocument:
    """Result of parsing a single PDF."""

    # Base name of the source file.
    filename: str
    # Page count reported by the parser for this document.
    total_pages: int
    # Chunks produced from the document, in reading order.
    chunks: list[DocumentChunk]
    # Short content hash of the file.
    doc_hash: str

    @property
    def summary(self) -> str:
        """Return a one-line, human-readable summary of the parsed document.

        The file name and the page count are separated by a dash so they
        no longer run together (previously rendered as e.g.
        ``norme.pdf12 pages``).
        """
        return (
            f"📄 {self.filename} — {self.total_pages} pages, "
            f"{len(self.chunks)} chunks indexés"
        )
class DocumentProcessor:
    """Parse technical PDFs and split their text into chunks with metadata.

    Text is extracted page by page with PyMuPDF, cleaned, and split with a
    ``RecursiveCharacterTextSplitter`` that tries paragraph, line, sentence
    and word boundaries in that order before falling back to characters.
    """

    # Runs of whitespace other than newlines (spaces, tabs, ...).
    _INLINE_WHITESPACE = re.compile(r"[^\S\n]+")
    # Three or more consecutive newlines, i.e. more than one blank line.
    _EXCESS_NEWLINES = re.compile(r"\n{3,}")
    # Longest digit-only line that is treated as a bare page number.
    _MAX_PAGE_NUMBER_DIGITS = 4
    # Read size (bytes) used when hashing a file.
    _HASH_BLOCK_SIZE = 8192
    # Number of hex characters kept from the SHA-256 digest.
    _HASH_LENGTH = 16

    def __init__(
        self,
        chunk_size: int = CHUNK_SIZE,
        chunk_overlap: int = CHUNK_OVERLAP,
    ):
        """Create the text splitter used for every document.

        Args:
            chunk_size: Maximum chunk length, measured with ``len``.
            chunk_overlap: Overlap between consecutive chunks, measured
                with ``len``.
        """
        self.splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            separators=["\n\n", "\n", ". ", " ", ""],
            length_function=len,
        )

    # ── Public API ───────────────────────────────────────────────────────

    def process_pdf(self, pdf_path: str | Path) -> ParsedDocument:
        """Parse one PDF and return its chunks with metadata.

        Args:
            pdf_path: Path to the PDF file.

        Returns:
            A ``ParsedDocument`` holding the file name, the number of pages
            in the PDF, the chunks and a short content hash.

        Raises:
            FileNotFoundError: If ``pdf_path`` does not exist.
        """
        pdf_path = Path(pdf_path)
        if not pdf_path.exists():
            raise FileNotFoundError(f"Fichier introuvable : {pdf_path}")

        logger.info("Traitement de %s", pdf_path.name)
        doc_hash = self._compute_hash(pdf_path)
        pages = self._extract_pages(pdf_path)
        chunks = self._create_chunks(pages, pdf_path.name)

        parsed = ParsedDocument(
            filename=pdf_path.name,
            # Counted separately: _extract_pages drops pages without
            # extractable text, so len(pages) would under-report the size
            # of PDFs containing blank or image-only pages.
            total_pages=self._count_pages(pdf_path),
            chunks=chunks,
            doc_hash=doc_hash,
        )
        logger.info("%s", parsed.summary)
        return parsed

    def process_multiple(self, pdf_paths: list[str | Path]) -> list[ParsedDocument]:
        """Process several PDFs, skipping the ones that fail.

        Args:
            pdf_paths: Paths of the PDFs to process.

        Returns:
            The parsed documents, in input order, for every file that was
            processed successfully. Failures are logged and omitted.
        """
        results: list[ParsedDocument] = []
        for path in pdf_paths:
            try:
                results.append(self.process_pdf(path))
            except Exception as e:
                # Deliberate best-effort: one bad file must not abort the
                # batch. The traceback is logged so the cause stays visible.
                logger.error("Erreur sur %s : %s", path, e, exc_info=True)
        return results

    # ── Extraction ───────────────────────────────────────────────────────

    @staticmethod
    def _count_pages(pdf_path: Path) -> int:
        """Return the number of pages in the PDF, including empty ones."""
        with fitz.open(str(pdf_path)) as doc:
            return len(doc)

    def _extract_pages(self, pdf_path: Path) -> list[dict]:
        """Extract cleaned text page by page.

        Returns:
            One ``{"page": <1-based page number>, "text": <cleaned text>}``
            dict per page that still has text after cleaning; pages without
            text are skipped.
        """
        pages = []
        with fitz.open(str(pdf_path)) as doc:
            for page_num, page in enumerate(doc, start=1):
                cleaned = self._clean_text(page.get_text("text"))
                # _clean_text already strips, so empty means "no text".
                if cleaned:
                    pages.append({"page": page_num, "text": cleaned})
        return pages

    def _create_chunks(
        self, pages: list[dict], filename: str
    ) -> list[DocumentChunk]:
        """Split page texts into chunks carrying traceability metadata.

        Each chunk records its source file, page number, index within the
        page, length in characters and index across the whole document.
        """
        all_chunks: list[DocumentChunk] = []
        for page_data in pages:
            page_num = page_data["page"]
            splits = self.splitter.split_text(page_data["text"])
            for i, split_text in enumerate(splits):
                all_chunks.append(
                    DocumentChunk(
                        text=split_text,
                        metadata={
                            "source": filename,
                            "page": page_num,
                            "chunk_index": i,
                            "total_chars": len(split_text),
                            # Document-wide position: number of chunks
                            # emitted so far.
                            "global_index": len(all_chunks),
                        },
                    )
                )
        return all_chunks

    # ── Utilities ────────────────────────────────────────────────────────

    @staticmethod
    def _clean_text(text: str) -> str:
        """Normalize text extracted from a PDF page.

        Steps, in order:
          1. Collapse runs of non-newline whitespace into a single space.
          2. Drop lines that are only a short number (page-number
             heuristic) and blank out whitespace-only lines.
          3. Collapse multiple blank lines into one.

        Blank lines are collapsed last because removing a page-number line
        can itself leave several consecutive blank lines behind.
        """
        text = DocumentProcessor._INLINE_WHITESPACE.sub(" ", text)

        max_digits = DocumentProcessor._MAX_PAGE_NUMBER_DIGITS
        kept_lines = []
        for line in text.split("\n"):
            stripped = line.strip()
            # Skip lines that consist solely of a page number.
            if stripped.isdigit() and len(stripped) <= max_digits:
                continue
            # A whitespace-only line counts as blank so it can be collapsed.
            kept_lines.append(line if stripped else "")

        text = "\n".join(kept_lines)
        text = DocumentProcessor._EXCESS_NEWLINES.sub("\n\n", text)
        return text.strip()

    @staticmethod
    def _compute_hash(path: Path) -> str:
        """Return a truncated SHA-256 hex digest of the file's content.

        The file is read in fixed-size blocks so large PDFs are not loaded
        into memory at once. Only the first 16 hex characters are kept.
        """
        sha = hashlib.sha256()
        block_size = DocumentProcessor._HASH_BLOCK_SIZE
        with open(path, "rb") as f:
            for block in iter(lambda: f.read(block_size), b""):
                sha.update(block)
        return sha.hexdigest()[: DocumentProcessor._HASH_LENGTH]