# NOTE: the following lines are web-scrape artifacts (page chrome, commit
# hashes, and a line-number gutter) captured along with the source; they are
# commented out so the module parses as valid Python.
# Spaces: Running / Running — File size: 1,454 Bytes
# 346dab1 a83126f 346dab1 a83126f 6807123 a83126f 29b0b66 a83126f 85a9e7b
# a83126f 346dab1 a83126f 346dab1 a83126f 346dab1 a83126f 346dab1
from transformers import AutoTokenizer
from langchain.text_splitter import CharacterTextSplitter
import pdfplumber
from config import *
import re
import os
def get_existing_pdf(filename="La Confession muette.pdf"):
    """Return *filename* if it exists on disk as a regular file, else None.

    Args:
        filename: Path of the PDF to look for (defaults to the project's
            bundled novel).

    Returns:
        The unchanged path when the file exists, otherwise ``None``.
    """
    return filename if os.path.isfile(filename) else None
def load_and_preprocess_pdf(pdf_path):
    """Load a PDF and return its cleaned, concatenated text.

    Args:
        pdf_path: Path to the PDF file to read.

    Returns:
        The text of all pages joined together, with the ebook-converter
        demo watermark lines stripped out.
    """
    # BUG FIX: previously opened the hard-coded "La Confession muette.pdf"
    # and silently ignored the pdf_path argument.
    with pdfplumber.open(pdf_path) as pdf:
        # extract_text() may return None for image-only pages; substitute "".
        pages = [page.extract_text() or "" for page in pdf.pages]
    text = "".join(pages)
    # Remove the watermark inserted by the ebook conversion tool.
    text = re.sub(r'\*+ebook converter demo watermarks\*+', '', text, flags=re.IGNORECASE)
    return text
def token_length(text):
    """Return the length of *text* measured in tokens.

    NOTE(review): relies on a module-level ``model`` pulled in via
    ``from config import *``; assumes ``model.tokenize(text)`` returns a
    sized collection of tokens — confirm against config.py.
    """
    tokens = model.tokenize(text)
    return len(tokens)
def split_text(text):
    """Split *text* into overlapping chunks, sized in tokens.

    Splits on newlines with a 1024-token chunk size and a 200-token
    overlap between consecutive chunks; lengths are measured by
    ``token_length`` rather than by character count.

    Args:
        text: The full document text to chunk.

    Returns:
        A list of chunk strings produced by CharacterTextSplitter.
    """
    # (A previous character-based variant — chunk_size=2500, length_function=len —
    # was kept here as commented-out dead code; removed per clean-code practice.)
    text_splitter = CharacterTextSplitter(
        separator="\n",
        chunk_size=1024,    # chunk size, in tokens
        chunk_overlap=200,  # overlap between consecutive chunks, in tokens
        length_function=token_length,  # measure length in tokens, not characters
    )
    return text_splitter.split_text(text)