Spaces:
Sleeping
Sleeping
File size: 638 Bytes
22cff0b | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 | #splitting text to chunks from the extracted pdf file, and overlapping chunks to keep some previous context
import re
from langchain_text_splitters import RecursiveCharacterTextSplitter
from config import CHUNK_SIZE, CHUNK_OVERLAP #getting values from configuration file
def clean_text(corpus: str) -> str:
corpus = re.sub(r'\s+', ' ', corpus)
corpus = re.sub(r'([a-z])([A-Z])', r'\1 \2', corpus)
return corpus.lower()
def create_chunks(text: str):
splitter = RecursiveCharacterTextSplitter(
chunk_size=CHUNK_SIZE,
chunk_overlap=CHUNK_OVERLAP
)
return splitter.split_text(text) |