File size: 1,454 Bytes
346dab1
a83126f
 
346dab1
 
 
a83126f
 
 
 
 
 
 
 
 
6807123
a83126f
 
29b0b66
a83126f
85a9e7b
a83126f
 
346dab1
 
 
 
a83126f
346dab1
a83126f
 
346dab1
 
 
a83126f
346dab1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
from transformers import AutoTokenizer
from langchain.text_splitter import CharacterTextSplitter
import pdfplumber
from config import *
import re
import os

def get_existing_pdf(filename="La Confession muette.pdf"):
    """Return *filename* if it refers to an existing file, else None.

    Args:
        filename: Candidate PDF path; defaults to the project's source PDF.

    Returns:
        The same path string when the file exists on disk, otherwise None.
    """
    return filename if os.path.isfile(filename) else None

def load_and_preprocess_pdf(pdf_path):
    """Load a PDF and return its concatenated, cleaned page text.

    Args:
        pdf_path: Path to the PDF file to read.

    Returns:
        The full document text with ebook-converter watermark markers removed.
    """
    # Bug fix: the original ignored `pdf_path` and always opened the
    # hard-coded file "La Confession muette.pdf".
    with pdfplumber.open(pdf_path) as pdf:
        # extract_text() may return None for image-only pages; coalesce to "".
        text = "".join(page.extract_text() or "" for page in pdf.pages)

    # Strip "***ebook converter demo watermarks***"-style artifacts.
    text = re.sub(r'\*+ebook converter demo watermarks\*+', '', text, flags=re.IGNORECASE)
    return text

def token_length(text):
    """Compute the length of *text* in tokens via the SentenceTransformer model.

    `model` is assumed to come from `config` through the star import.
    NOTE(review): if `model` is a SentenceTransformer, `model.tokenize(...)`
    returns a dict of tensors, so `len(...)` would count dict keys rather
    than tokens — confirm the actual return type of `model.tokenize`.
    """
    return len(model.tokenize(text)) 

def split_text(text):
    """Break *text* into overlapping chunks whose size is measured in tokens.

    Args:
        text: The full document text to segment.

    Returns:
        A list of chunk strings produced by the splitter.
    """
    splitter = CharacterTextSplitter(
        separator="\n",
        chunk_size=1024,               # chunk size, in tokens
        chunk_overlap=200,             # overlap between consecutive chunks, in tokens
        length_function=token_length,  # measure length in tokens, not characters
    )
    chunks = splitter.split_text(text)
    return chunks









#def split_text(text):
#    """Split text into chunks."""
#    text_splitter = CharacterTextSplitter(
#        separator="\n",
#        chunk_size=2500,
#        chunk_overlap=200,
#        length_function=len
#    )
#    return text_splitter.split_text(text)