File size: 3,246 Bytes
7826da3
ee966d6
35b525b
c2f46ba
ee966d6
7826da3
c2f46ba
 
 
 
 
 
7826da3
ee966d6
 
35b525b
 
ee966d6
 
c2f46ba
35b525b
c2f46ba
 
ee966d6
c2f46ba
 
 
ee966d6
 
c2f46ba
 
ee966d6
c2f46ba
ee966d6
35b525b
 
 
 
7826da3
ee966d6
 
c2f46ba
ee966d6
7826da3
35b525b
 
c2f46ba
 
35b525b
c2f46ba
ee966d6
 
35b525b
c2f46ba
 
ee966d6
c2f46ba
ee966d6
35b525b
c2f46ba
 
 
ee966d6
c2f46ba
 
 
 
35b525b
c2f46ba
 
ee966d6
c2f46ba
 
ee966d6
 
c2f46ba
ee966d6
c2f46ba
 
ee966d6
c2f46ba
 
 
 
 
ee966d6
c2f46ba
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
import os
import re
from typing import Dict, List, Optional

import fitz  # PyMuPDF
from langchain_text_splitters import RecursiveCharacterTextSplitter

def clean_text(text: str) -> str:
    """Normalize whitespace in text extracted from a PDF.

    Runs of spaces are collapsed to a single space, every line is stripped
    of leading/trailing whitespace, and any run of three or more newlines
    is reduced to exactly two (one blank line). The result is stripped at
    both ends.

    Args:
        text: Raw text to clean.

    Returns:
        The cleaned text.
    """
    text = re.sub(r' +', ' ', text)
    # Strip each line BEFORE collapsing blank lines: a line holding only
    # whitespace becomes empty here, and would otherwise slip past the
    # newline-collapsing pattern and leave 3+ consecutive newlines behind.
    text = '\n'.join(line.strip() for line in text.split('\n'))
    text = re.sub(r'\n{3,}', '\n\n', text)
    return text.strip()

def extract_pages_from_pdf(file_path: str, filename: Optional[str] = None) -> List[Dict]:
    """Extract the cleaned text of each page of a PDF with its metadata.

    Pages whose cleaned text is 50 characters or shorter are skipped and
    reported on stdout.

    Args:
        file_path: Path of the PDF file to open.
        filename: Name stored alongside each page. When empty or ``None``,
            the base name of ``file_path`` is used.

    Returns:
        A list of dicts with the keys ``"page_number"`` (1-based),
        ``"content"`` (cleaned page text) and ``"filename"``.
    """
    if not filename:
        filename = os.path.basename(file_path)

    pages_content = []
    # Open the document as a context manager so the underlying file handle
    # is released even if text extraction raises on some page.
    with fitz.open(file_path) as doc:
        for page_number, page in enumerate(doc, start=1):
            cleaned_text = clean_text(page.get_text())

            # clean_text already returns stripped text, so its length can
            # be compared directly.
            if len(cleaned_text) > 50:
                pages_content.append({
                    "page_number": page_number,
                    "content": cleaned_text,
                    "filename": filename,
                })
            else:
                print(f"Page {page_number} ignorée (trop courte)")

    print(f"{len(pages_content)} pages extraites du PDF '{filename}'")
    return pages_content

def split_text_into_chunks(
    pages: List[Dict],
    chunk_size: int = 1000,
    chunk_overlap: int = 200,
    min_chunk_length: int = 50,
) -> List[Dict]:
    """Split each page's text into chunks while keeping page metadata.

    Args:
        pages: Page dicts with a ``"page_number"`` and a ``"content"`` key,
            and optionally a ``"filename"`` key (defaults to
            ``"Document inconnu"`` when absent).
        chunk_size: Chunk size handed to the text splitter.
        chunk_overlap: Chunk overlap handed to the text splitter.
        min_chunk_length: Chunks whose stripped text is shorter than this
            are dropped.

    Returns:
        A list of dicts with a ``"text"`` key (stripped chunk text) and a
        ``"metadata"`` dict holding ``"page"``, ``"filename"``,
        ``"chunk_index_on_page"``, ``"global_chunk_index"`` and
        ``"chunk_length"``.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        separators=["\n\n\n", "\n\n", "\n", ". ", " ", ""],
    )

    all_chunks = []

    for page in pages:
        page_num = page["page_number"]
        filename = page.get("filename", "Document inconnu")

        chunks_on_page = text_splitter.split_text(page["content"])

        for local_idx, raw_chunk in enumerate(chunks_on_page):
            chunk_text = raw_chunk.strip()
            if len(chunk_text) < min_chunk_length:
                # local_idx still advances, so "chunk_index_on_page" keeps
                # reflecting the position among ALL chunks of the page.
                continue

            all_chunks.append({
                "text": chunk_text,
                "metadata": {
                    "page": page_num,
                    "filename": filename,
                    "chunk_index_on_page": local_idx,
                    # Number of chunks kept so far == index of this chunk.
                    "global_chunk_index": len(all_chunks),
                    # Measured on the stored (stripped) text so that the
                    # metadata always matches "text".
                    "chunk_length": len(chunk_text),
                },
            })

    print(f"{len(all_chunks)} chunks créés au total")
    if all_chunks:
        avg_length = sum(c["metadata"]["chunk_length"] for c in all_chunks) / len(all_chunks)
        print(f"Longueur moyenne des chunks : {avg_length:.0f} caractères")

    return all_chunks

def preview_chunks(chunks: List[Dict], n: int = 3):
    """Print a short preview of the first ``n`` chunks.

    For each previewed chunk, the source filename, page, recorded length
    and the first 100 characters of its text are written to stdout.

    Args:
        chunks: Chunk dicts with a ``"text"`` key and a ``"metadata"`` dict
            providing ``"page"`` and ``"chunk_length"`` (and optionally
            ``"filename"``).
        n: Maximum number of chunks to preview.
    """
    shown = min(n, len(chunks))
    print(f"\n Aperçu des {shown} premiers chunks :\n")

    for position, entry in enumerate(chunks[:n], start=1):
        body = entry["text"]
        info = entry["metadata"]
        excerpt = body[:100]

        print(f"--- Chunk #{position} (Source: {info.get('filename')}, Page {info['page']}) ---")
        print(f"Longueur : {info['chunk_length']} caractères")
        print(f"Texte (100 premiers caractères) : {excerpt}...")
        print()