"""Parsing et chunking de documents PDF techniques.""" from __future__ import annotations import hashlib import logging from dataclasses import dataclass, field from pathlib import Path import fitz # PyMuPDF from langchain_text_splitters import RecursiveCharacterTextSplitter from src.config import CHUNK_OVERLAP, CHUNK_SIZE logger = logging.getLogger(__name__) @dataclass class DocumentChunk: """Un fragment de document enrichi de mΓ©tadonnΓ©es.""" text: str metadata: dict = field(default_factory=dict) @property def page_label(self) -> str: return f"p.{self.metadata.get('page', '?')}" @dataclass class ParsedDocument: """RΓ©sultat du parsing d'un PDF.""" filename: str total_pages: int chunks: list[DocumentChunk] doc_hash: str @property def summary(self) -> str: return ( f"πŸ“„ {self.filename} β€” {self.total_pages} pages, " f"{len(self.chunks)} chunks indexΓ©s" ) class DocumentProcessor: """Parse des PDFs techniques et les dΓ©coupe en chunks sΓ©mantiques.""" def __init__( self, chunk_size: int = CHUNK_SIZE, chunk_overlap: int = CHUNK_OVERLAP, ): self.splitter = RecursiveCharacterTextSplitter( chunk_size=chunk_size, chunk_overlap=chunk_overlap, separators=["\n\n", "\n", ". ", " ", ""], length_function=len, ) # ── API publique ───────────────────────────────────────────────────── def process_pdf(self, pdf_path: str | Path) -> ParsedDocument: """Parse un PDF et retourne les chunks avec mΓ©tadonnΓ©es.""" pdf_path = Path(pdf_path) if not pdf_path.exists(): raise FileNotFoundError(f"Fichier introuvable : {pdf_path}") logger.info("Traitement de %s", pdf_path.name) doc_hash = self._compute_hash(pdf_path) pages = self._extract_pages(pdf_path) chunks = self._create_chunks(pages, pdf_path.name) parsed = ParsedDocument( filename=pdf_path.name, total_pages=len(pages), chunks=chunks, doc_hash=doc_hash, ) logger.info(parsed.summary) return parsed def process_multiple(self, pdf_paths: list[str | Path]) -> list[ParsedDocument]: """Traite plusieurs PDFs.""" results = [] for path in pdf_paths: try: results.append(self.process_pdf(path)) except Exception as e: logger.error("Erreur sur %s : %s", path, e) return results # ── Extraction ─────────────────────────────────────────────────────── def _extract_pages(self, pdf_path: Path) -> list[dict]: """Extrait le texte page par page avec nettoyage.""" pages = [] with fitz.open(str(pdf_path)) as doc: for page_num, page in enumerate(doc, start=1): text = page.get_text("text") cleaned = self._clean_text(text) if cleaned.strip(): pages.append({"page": page_num, "text": cleaned}) return pages def _create_chunks( self, pages: list[dict], filename: str ) -> list[DocumentChunk]: """DΓ©coupe les pages en chunks avec mΓ©tadonnΓ©es de traΓ§abilitΓ©.""" all_chunks = [] for page_data in pages: page_num = page_data["page"] text = page_data["text"] splits = self.splitter.split_text(text) for i, split_text in enumerate(splits): chunk = DocumentChunk( text=split_text, metadata={ "source": filename, "page": page_num, "chunk_index": i, "total_chars": len(split_text), }, ) all_chunks.append(chunk) # NumΓ©roter globalement for idx, chunk in enumerate(all_chunks): chunk.metadata["global_index"] = idx return all_chunks # ── Utilitaires ────────────────────────────────────────────────────── @staticmethod def _clean_text(text: str) -> str: """Nettoie le texte extrait d'un PDF.""" import re # Supprimer les caractΓ¨res de contrΓ΄le sauf newlines text = re.sub(r"[^\S\n]+", " ", text) # Supprimer les lignes vides multiples text = re.sub(r"\n{3,}", "\n\n", text) # Supprimer les en-tΓͺtes/pieds de page rΓ©pΓ©titifs (heuristique) lines = text.split("\n") cleaned_lines = [] for line in lines: stripped = line.strip() # Ignorer les lignes qui ne sont que des numΓ©ros de page if stripped.isdigit() and len(stripped) <= 4: continue cleaned_lines.append(line) return "\n".join(cleaned_lines).strip() @staticmethod def _compute_hash(path: Path) -> str: """Hash SHA-256 du fichier pour dΓ©tecter les doublons.""" sha = hashlib.sha256() with open(path, "rb") as f: for block in iter(lambda: f.read(8192), b""): sha.update(block) return sha.hexdigest()[:16]