Assistant-Web-Educatif / backend /services /document_processor.py
hamba-ho's picture
Mise à jour services backend et ajout streamlit_app
ee966d6
import fitz # PyMuPDF
from langchain_text_splitters import RecursiveCharacterTextSplitter
from typing import List, Dict
import re
import os
def clean_text(text: str) -> str:
    """Normalize whitespace in text extracted from a PDF.

    Runs of spaces are collapsed to a single space, every line is stripped
    of its leading/trailing whitespace, and three or more consecutive
    newlines are reduced to exactly two (i.e. a single blank line).

    Args:
        text: Raw text to clean.

    Returns:
        The cleaned text, without leading or trailing whitespace.
    """
    text = re.sub(r' +', ' ', text)
    # Strip each line BEFORE collapsing blank lines: a line that holds only
    # whitespace (or a stray '\r') becomes empty here, and would otherwise
    # slip past the newline-collapsing pattern and leave 3+ newlines behind.
    text = '\n'.join(line.strip() for line in text.split('\n'))
    text = re.sub(r'\n{3,}', '\n\n', text)
    return text.strip()
def extract_pages_from_pdf(file_path: str, filename: str = None) -> List[Dict]:
    """Extract the cleaned text of each PDF page with its number and file name.

    Pages whose cleaned text is 50 characters or shorter are skipped and a
    message is printed for each of them.

    Args:
        file_path: Path of the PDF file to open.
        filename: Name stored alongside each page. When missing or empty,
            the base name of ``file_path`` is used instead.

    Returns:
        One dict per kept page, with the keys ``"page_number"`` (1-based),
        ``"content"`` (cleaned page text) and ``"filename"``.
    """
    if not filename:
        filename = os.path.basename(file_path)

    pages_content = []
    # Use the document as a context manager so it is closed (and the file
    # handle released) even if extraction fails part-way through.
    with fitz.open(file_path) as doc:
        for page_number, page in enumerate(doc, start=1):
            # clean_text() returns stripped text, so its length can be
            # compared directly.
            cleaned_text = clean_text(page.get_text())
            if len(cleaned_text) > 50:
                pages_content.append({
                    "page_number": page_number,
                    "content": cleaned_text,
                    "filename": filename,
                })
            else:
                print(f"Page {page_number} ignorée (trop courte)")

    print(f"{len(pages_content)} pages extraites du PDF '{filename}'")
    return pages_content
def split_text_into_chunks(
    pages: List[Dict],
    chunk_size: int = 1000,
    chunk_overlap: int = 200,
    min_chunk_length: int = 50,
) -> List[Dict]:
    """Split each page's text into chunks while keeping per-chunk metadata.

    Args:
        pages: Page dicts with the keys ``"page_number"`` and ``"content"``,
            and optionally ``"filename"`` (defaults to "Document inconnu").
        chunk_size: Maximum chunk size passed to the text splitter.
        chunk_overlap: Overlap between consecutive chunks passed to the
            text splitter.
        min_chunk_length: Chunks whose stripped text is shorter than this
            are discarded.

    Returns:
        One dict per kept chunk, with ``"text"`` (stripped chunk text) and
        ``"metadata"`` holding ``page``, ``filename``,
        ``chunk_index_on_page``, ``global_chunk_index`` and ``chunk_length``.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        separators=["\n\n\n", "\n\n", "\n", ". ", " ", ""],
    )

    all_chunks = []
    for page in pages:
        page_num = page["page_number"]
        filename = page.get("filename", "Document inconnu")
        chunks_on_page = text_splitter.split_text(page["content"])
        # local_idx is the position among ALL splitter outputs for the page,
        # so it still counts chunks that are discarded below.
        for local_idx, raw_chunk in enumerate(chunks_on_page):
            chunk_text = raw_chunk.strip()
            if len(chunk_text) < min_chunk_length:
                continue
            all_chunks.append({
                "text": chunk_text,
                "metadata": {
                    "page": page_num,
                    "filename": filename,
                    "chunk_index_on_page": local_idx,
                    # Number of chunks kept so far == index of this one.
                    "global_chunk_index": len(all_chunks),
                    # Measured on the stored (stripped) text so the metadata
                    # always matches the "text" field.
                    "chunk_length": len(chunk_text),
                },
            })

    print(f"{len(all_chunks)} chunks créés au total")
    if all_chunks:
        total_length = sum(c["metadata"]["chunk_length"] for c in all_chunks)
        avg_length = total_length / len(all_chunks)
        print(f"Longueur moyenne des chunks : {avg_length:.0f} caractères")
    return all_chunks
def preview_chunks(chunks: List[Dict], n: int = 3):
    """Print a short summary of the first ``n`` chunks to check the split.

    For each displayed chunk, prints its source file name, page, recorded
    length and the first 100 characters of its text, followed by an empty
    line.

    Args:
        chunks: Chunk dicts with a ``"text"`` entry and a ``"metadata"``
            dict providing ``page``, ``chunk_length`` and optionally
            ``filename``.
        n: Number of leading chunks to display.
    """
    shown_count = min(n, len(chunks))
    print(f"\n Aperçu des {shown_count} premiers chunks :\n")

    for rank, item in enumerate(chunks[:n], start=1):
        body, info = item["text"], item["metadata"]
        source, page = info.get('filename'), info['page']
        print(f"--- Chunk #{rank} (Source: {source}, Page {page}) ---")
        print(f"Longueur : {info['chunk_length']} caractères")
        excerpt = body[:100]
        print(f"Texte (100 premiers caractères) : {excerpt}...")
        print()