# RAG_architectures / pdf_processing.py
# Aidahaouas's picture
# Correctness LLM evaluation
# 346dab1
from transformers import AutoTokenizer
from langchain.text_splitter import CharacterTextSplitter
import pdfplumber
from config import *
import re
import os
def get_existing_pdf(filename="La Confession muette.pdf"):
    """Return ``filename`` when it points to an existing regular file.

    Args:
        filename: Path to check. Defaults to "La Confession muette.pdf",
            resolved against the current working directory.

    Returns:
        ``filename`` unchanged if ``os.path.isfile`` accepts it, otherwise
        ``None`` (directories and missing paths both yield ``None``).
    """
    return filename if os.path.isfile(filename) else None
def load_and_preprocess_pdf(pdf_path):
    """Extract the text of a PDF and strip converter watermark lines.

    Args:
        pdf_path: Path of the PDF to read. When ``None`` is passed, the
            file "La Confession muette.pdf" is opened instead, which is
            what this function always did before it honoured its argument.

    Returns:
        The text of every page concatenated in page order, with any
        "*ebook converter demo watermarks*" marker (case-insensitive, one
        or more asterisks on each side) removed.
    """
    # The argument used to be ignored in favour of a hard-coded filename;
    # keep that file only as the fallback for callers that pass None.
    source = "La Confession muette.pdf" if pdf_path is None else pdf_path
    with pdfplumber.open(source) as pdf:
        # `or ""` guards against a falsy extraction result (e.g. a page
        # with no extractable text) so the join never sees a non-string.
        text = "".join(page.extract_text() or "" for page in pdf.pages)
    return re.sub(r'\*+ebook converter demo watermarks\*+', '', text, flags=re.IGNORECASE)
def token_length(text):
    """Return the length of ``model.tokenize(text)``.

    ``model`` is not defined in this module; it is expected to be provided
    by ``from config import *``.

    NOTE(review): the original docstring states that the length is computed
    with SentenceTransformer. If ``model`` really is a SentenceTransformer,
    its ``tokenize`` may return a feature dict rather than a token list, in
    which case ``len`` would count dict keys instead of tokens -- confirm
    the type of ``model`` in ``config``.

    Args:
        text: The string to measure.

    Returns:
        ``len`` of whatever ``model.tokenize`` returns for ``text``.
    """
    tokens = model.tokenize(text)
    return len(tokens)
def split_text(text, *, chunk_size=1024, chunk_overlap=200):
    """Split text into newline-delimited chunks sized in tokens.

    Args:
        text: The full text to split.
        chunk_size: Maximum chunk length, in the units returned by
            ``token_length``. Defaults to 1024, the previously
            hard-coded value.
        chunk_overlap: Overlap between consecutive chunks, in the same
            units. Defaults to 200, the previously hard-coded value.

    Returns:
        The list of chunks produced by
        ``CharacterTextSplitter.split_text``.
    """
    splitter = CharacterTextSplitter(
        separator="\n",
        chunk_size=chunk_size,  # chunk size, measured by token_length
        chunk_overlap=chunk_overlap,  # overlap, measured by token_length
        length_function=token_length,  # measure with token_length, not len()
    )
    return splitter.split_text(text)
#def split_text(text):
# """Split text into chunks."""
# text_splitter = CharacterTextSplitter(
# separator="\n",
# chunk_size=2500,
# chunk_overlap=200,
# length_function=len
# )
# return text_splitter.split_text(text)