DeepLearningRAGchatbot / langchain_text_splitter.py
murkasad's picture
Upload 10 files
22cff0b verified
raw
history blame contribute delete
638 Bytes
#splitting text to chunks from the extracted pdf file, and overlapping chunks to keep some previous context
import re
from langchain_text_splitters import RecursiveCharacterTextSplitter
from config import CHUNK_SIZE, CHUNK_OVERLAP #getting values from configuration file
def clean_text(corpus: str) -> str:
corpus = re.sub(r'\s+', ' ', corpus)
corpus = re.sub(r'([a-z])([A-Z])', r'\1 \2', corpus)
return corpus.lower()
def create_chunks(text: str):
splitter = RecursiveCharacterTextSplitter(
chunk_size=CHUNK_SIZE,
chunk_overlap=CHUNK_OVERLAP
)
return splitter.split_text(text)