from transformers import AutoTokenizer
from langchain.text_splitter import CharacterTextSplitter
import pdfplumber
from config import *
import re
import os


def get_existing_pdf(filename="La Confession muette.pdf"):
    """Return *filename* if it exists on disk, otherwise None.

    Args:
        filename: Path of the PDF to look for.

    Returns:
        The path unchanged when the file exists, else None.
    """
    if os.path.isfile(filename):
        return filename
    return None


def load_and_preprocess_pdf(pdf_path):
    """Extract all page text from the PDF at *pdf_path* and strip watermarks.

    BUG FIX: the original ignored the ``pdf_path`` argument and opened the
    hard-coded file "La Confession muette.pdf" instead.

    Args:
        pdf_path: Path to the PDF file to read.

    Returns:
        Concatenated text of every page, with any
        "***ebook converter demo watermarks***" banner removed
        (case-insensitive).
    """
    with pdfplumber.open(pdf_path) as pdf:
        text = ""
        for page in pdf.pages:
            # extract_text() may return None (e.g. image-only pages);
            # fall back to the empty string so concatenation never fails.
            text += page.extract_text() or ""
    text = re.sub(r'\*+ebook converter demo watermarks\*+', '', text,
                  flags=re.IGNORECASE)
    return text


def token_length(text):
    """Return the length of *text* in tokens as measured by ``model``.

    NOTE(review): ``model`` is expected to come from ``config`` via the
    star import. If it is a SentenceTransformer, ``model.tokenize()``
    returns a dict, and ``len()`` would count its keys rather than the
    tokens — verify against the actual ``config.model`` object.
    """
    return len(model.tokenize(text))


def split_text(text):
    """Split *text* into overlapping chunks measured in tokens.

    Uses a newline separator with a 1024-token chunk size and a
    200-token overlap, where length is computed by :func:`token_length`.

    Args:
        text: Full document text to split.

    Returns:
        List of chunk strings produced by CharacterTextSplitter.
    """
    text_splitter = CharacterTextSplitter(
        separator="\n",
        chunk_size=1024,               # chunk size in tokens
        chunk_overlap=200,             # overlap in tokens
        length_function=token_length,  # measure length in tokens, not chars
    )
    return text_splitter.split_text(text)