# Extracted from a Hugging Face Spaces file view (Space status: Sleeping; file size: 3,246 bytes).
# The per-line commit hashes and the line-number gutter of that page view were extraction residue.
import fitz  # PyMuPDF
from langchain_text_splitters import RecursiveCharacterTextSplitter
from typing import List, Dict
import re
import os
def clean_text(text: str) -> str:
    """Clean text extracted from a PDF.

    Runs of spaces are collapsed to a single space, every line is stripped
    of leading/trailing whitespace, runs of three or more newlines are
    reduced to exactly two (one blank line), and the overall result is
    stripped.

    Args:
        text: Raw text to normalize.

    Returns:
        The normalized text.
    """
    text = re.sub(r' +', ' ', text)
    # Strip each line BEFORE collapsing blank lines: whitespace-only lines
    # turn into empty lines here, and collapsing first would let them
    # survive as runs of three or more newlines.
    text = '\n'.join(line.strip() for line in text.split('\n'))
    text = re.sub(r'\n{3,}', '\n\n', text)
    return text.strip()
def extract_pages_from_pdf(file_path: str, filename: str = None) -> List[Dict]:
    """Extract the cleaned text of each page of a PDF, with page number and file name.

    Pages whose cleaned text is 50 characters or shorter are skipped and a
    message is printed for each of them.

    Args:
        file_path: Path of the PDF file to open.
        filename: Name recorded with every page. When falsy, the base name
            of ``file_path`` is used.

    Returns:
        A list of dicts with the keys ``"page_number"`` (1-based),
        ``"content"`` (cleaned page text) and ``"filename"``.
    """
    if not filename:
        filename = os.path.basename(file_path)

    pages_content = []
    # Open the document in a ``with`` block so its handle is released even
    # when extracting a page raises; the original never closed it.
    with fitz.open(file_path) as doc:
        for page_num, page in enumerate(doc, start=1):
            # clean_text() already returns stripped text, so no extra strip
            # is needed before measuring its length.
            cleaned_text = clean_text(page.get_text())
            if len(cleaned_text) > 50:
                pages_content.append({
                    "page_number": page_num,
                    "content": cleaned_text,
                    "filename": filename,
                })
            else:
                print(f"Page {page_num} ignorée (trop courte)")

    print(f"{len(pages_content)} pages extraites du PDF '{filename}'")
    return pages_content
def split_text_into_chunks(
    pages: List[Dict],
    chunk_size: int = 1000,
    chunk_overlap: int = 200,
    min_chunk_length: int = 50,
) -> List[Dict]:
    """Split the text of each page into chunks while keeping page metadata.

    Each page is split independently, so a chunk never spans two pages.
    Chunks whose stripped text is shorter than ``min_chunk_length`` are
    dropped. A summary (chunk count and average length) is printed.

    Args:
        pages: Page dicts with the keys ``"page_number"`` and ``"content"``
            and, optionally, ``"filename"`` (defaults to
            ``"Document inconnu"``).
        chunk_size: Value passed to the splitter as ``chunk_size``.
        chunk_overlap: Value passed to the splitter as ``chunk_overlap``.
        min_chunk_length: Minimum stripped length for a chunk to be kept.

    Returns:
        A list of dicts with the keys ``"text"`` (stripped chunk text) and
        ``"metadata"`` (``page``, ``filename``, ``chunk_index_on_page``,
        ``global_chunk_index``, ``chunk_length``).
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        separators=["\n\n\n", "\n\n", "\n", ". ", " ", ""],
    )

    all_chunks = []
    chunk_global_index = 0
    for page in pages:
        page_num = page["page_number"]
        filename = page.get("filename", "Document inconnu")
        chunks_on_page = text_splitter.split_text(page["content"])

        # local_idx is the position in the splitter's output for this page,
        # so it still counts chunks that are dropped below; the global index
        # only counts chunks that are kept.
        for local_idx, chunk_text in enumerate(chunks_on_page):
            stripped = chunk_text.strip()
            if len(stripped) < min_chunk_length:
                continue
            all_chunks.append({
                "text": stripped,
                "metadata": {
                    "page": page_num,
                    "filename": filename,
                    "chunk_index_on_page": local_idx,
                    "global_chunk_index": chunk_global_index,
                    # Measure the text that is actually stored, so the
                    # recorded length always matches len(chunk["text"]).
                    "chunk_length": len(stripped),
                },
            })
            chunk_global_index += 1

    print(f"{len(all_chunks)} chunks créés au total")
    if all_chunks:
        avg_length = sum(c["metadata"]["chunk_length"] for c in all_chunks) / len(all_chunks)
        print(f"Longueur moyenne des chunks : {avg_length:.0f} caractères")
    return all_chunks
def preview_chunks(chunks: List[Dict], n: int = 3):
    """Print the first ``n`` chunks to check the quality of the splitting.

    For each previewed chunk, prints its source file name, page, recorded
    length and the first 100 characters of its text.

    Args:
        chunks: Chunk dicts with the keys ``"text"`` and ``"metadata"``;
            the metadata must hold ``"page"`` and ``"chunk_length"`` and
            may hold ``"filename"``.
        n: Maximum number of chunks to print. Negative values are treated
            as 0.

    Returns:
        None. The preview is written to standard output.
    """
    # Clamp n: a negative value would make chunks[:n] print all but the
    # last chunks while the header reports a negative count.
    n = max(n, 0)
    print(f"\n Aperçu des {min(n, len(chunks))} premiers chunks :\n")
    for i, chunk in enumerate(chunks[:n]):
        text = chunk["text"]
        meta = chunk["metadata"]
        print(f"--- Chunk #{i+1} (Source: {meta.get('filename')}, Page {meta['page']}) ---")
        print(f"Longueur : {meta['chunk_length']} caractères")
        print(f"Texte (100 premiers caractères) : {text[:100]}...")
        print()