# (Hugging Face Space page header — "Spaces: Running" — not part of the module source.)
| from transformers import AutoTokenizer | |
| from langchain.text_splitter import CharacterTextSplitter | |
| import pdfplumber | |
| from config import * | |
| import re | |
| import os | |
def get_existing_pdf(filename="La Confession muette.pdf"):
    """Return *filename* if it points to an existing regular file, else None."""
    return filename if os.path.isfile(filename) else None
def load_and_preprocess_pdf(pdf_path):
    """Load a PDF and return its full text with converter watermarks removed.

    Args:
        pdf_path: Path to the PDF file to read.

    Returns:
        str: The concatenated text of every page, with any
        "*ebook converter demo watermarks*" banner stripped
        (case-insensitive).
    """
    # BUG FIX: the original opened the hard-coded "La Confession muette.pdf"
    # instead of the pdf_path argument, so the parameter was silently ignored.
    with pdfplumber.open(pdf_path) as pdf:
        # extract_text() may return None for image-only pages; treat as "".
        # join() avoids the quadratic cost of repeated string concatenation.
        text = "".join(page.extract_text() or "" for page in pdf.pages)
    # Strip the ebook-converter watermark wherever it appears.
    text = re.sub(r'\*+ebook converter demo watermarks\*+', '', text,
                  flags=re.IGNORECASE)
    return text
def token_length(text):
    """Return the token count of *text* as measured by the embedding model's tokenizer."""
    # NOTE(review): `model` comes from `from config import *` — presumably a
    # SentenceTransformer instance; confirm its tokenize() return type.
    tokens = model.tokenize(text)
    return len(tokens)
def split_text(text):
    """Split *text* into token-sized chunks on newline boundaries.

    Chunks target 1024 tokens with a 200-token overlap, with size
    measured by token_length() (tokens) rather than character count.
    """
    splitter = CharacterTextSplitter(
        separator="\n",
        chunk_size=1024,       # chunk size, in tokens
        chunk_overlap=200,     # overlap between chunks, in tokens
        length_function=token_length,
    )
    return splitter.split_text(text)
# NOTE: an earlier character-based split_text (chunk_size=2500, chunk_overlap=200,
# length_function=len) was superseded by the token-based version above.