import os
import re
from typing import Dict, List, Optional

import fitz  # PyMuPDF
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Default minimum number of characters for a page or chunk to be kept.
_MIN_TEXT_LENGTH = 50

# Compiled once: these patterns are applied to every extracted page.
_SPACE_RUN_RE = re.compile(r" +")
_EXCESS_NEWLINES_RE = re.compile(r"\n{3,}")


def clean_text(text: str) -> str:
    """Normalize the whitespace of text extracted from a PDF.

    Runs of spaces are collapsed to a single space, every line is stripped,
    runs of three or more newlines are reduced to two, and the result is
    stripped as a whole.

    Args:
        text: Raw text to clean.

    Returns:
        The cleaned text.
    """
    text = _SPACE_RUN_RE.sub(" ", text)
    # Strip lines BEFORE collapsing newlines: a line holding only spaces
    # would otherwise hide a run of blank lines from the newline pattern.
    text = "\n".join(line.strip() for line in text.split("\n"))
    text = _EXCESS_NEWLINES_RE.sub("\n\n", text)
    return text.strip()


def extract_pages_from_pdf(
    file_path: str,
    filename: Optional[str] = None,
    min_chars: int = _MIN_TEXT_LENGTH,
) -> List[Dict]:
    """Extract the cleaned text of each page of a PDF with its metadata.

    Args:
        file_path: Path of the PDF file to open.
        filename: Name stored with each page. When empty or None, the base
            name of ``file_path`` is used.
        min_chars: A page is kept only when its cleaned text is strictly
            longer than this many characters.

    Returns:
        One dict per kept page, with the keys ``"page_number"`` (1-based),
        ``"content"`` (cleaned text) and ``"filename"``.
    """
    if not filename:
        filename = os.path.basename(file_path)

    pages_content = []
    # The context manager closes the document even if extraction fails.
    with fitz.open(file_path) as doc:
        for page_number, page in enumerate(doc, start=1):
            cleaned_text = clean_text(page.get_text())
            if len(cleaned_text) > min_chars:
                pages_content.append({
                    "page_number": page_number,
                    "content": cleaned_text,
                    "filename": filename,
                })
            else:
                print(f"Page {page_number} ignorée (trop courte)")

    print(f"{len(pages_content)} pages extraites du PDF '{filename}'")
    return pages_content


def split_text_into_chunks(
    pages: List[Dict],
    chunk_size: int = 1000,
    chunk_overlap: int = 200,
    min_chunk_chars: int = _MIN_TEXT_LENGTH,
) -> List[Dict]:
    """Split the text of each page into chunks, keeping the page metadata.

    Args:
        pages: Page dicts as returned by ``extract_pages_from_pdf``. Each
            must hold ``"page_number"`` and ``"content"``; ``"filename"``
            is optional.
        chunk_size: Maximum chunk size passed to the text splitter.
        chunk_overlap: Overlap between chunks passed to the text splitter.
        min_chunk_chars: Chunks whose stripped text is shorter than this
            many characters are dropped.

    Returns:
        One dict per kept chunk, with ``"text"`` (stripped chunk text) and
        ``"metadata"`` (page, filename, index of the chunk in the
        splitter output for its page, global index among kept chunks, and
        length of the stored text).
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        separators=["\n\n\n", "\n\n", "\n", ". ", " ", ""],
    )

    all_chunks = []
    for page in pages:
        page_num = page["page_number"]
        filename = page.get("filename", "Document inconnu")

        chunks_on_page = text_splitter.split_text(page["content"])
        for local_idx, raw_chunk in enumerate(chunks_on_page):
            chunk_text = raw_chunk.strip()
            if len(chunk_text) < min_chunk_chars:
                continue

            all_chunks.append({
                "text": chunk_text,
                "metadata": {
                    "page": page_num,
                    "filename": filename,
                    "chunk_index_on_page": local_idx,
                    # Kept chunks are numbered consecutively from 0.
                    "global_chunk_index": len(all_chunks),
                    # Measured on the stored text so both always agree.
                    "chunk_length": len(chunk_text),
                },
            })

    print(f"{len(all_chunks)} chunks créés au total")
    if all_chunks:
        total_length = sum(c["metadata"]["chunk_length"] for c in all_chunks)
        avg_length = total_length / len(all_chunks)
        print(f"Longueur moyenne des chunks : {avg_length:.0f} caractères")

    return all_chunks


def preview_chunks(chunks: List[Dict], n: int = 3) -> None:
    """Print the first ``n`` chunks to check the quality of the splitting.

    Args:
        chunks: Chunk dicts as returned by ``split_text_into_chunks``.
        n: Maximum number of chunks to show; negative values show none.
    """
    # Clamp so a negative n cannot slice from the end of the list.
    shown = chunks[:max(n, 0)]

    print(f"\n Aperçu des {len(shown)} premiers chunks :\n")
    for i, chunk in enumerate(shown, start=1):
        text = chunk["text"]
        meta = chunk["metadata"]
        print(f"--- Chunk #{i} (Source: {meta.get('filename')}, Page {meta['page']}) ---")
        print(f"Longueur : {meta['chunk_length']} caractères")
        print(f"Texte (100 premiers caractères) : {text[:100]}...")
        print()