rag-chatbot / ingestion /preprocessor.py
Abeshith's picture
RAG Chatbot with LangChain, FastAPI, and service layer architecture
64d7fdf
import re
from langchain_core.documents import Document
from app.utils.logger import logger
from typing import List
class TextPreprocessor:
    """Whitespace-normalisation helpers for raw text and ``Document`` objects.

    Every method is static, so the class acts as a namespace and keeps no
    per-instance state.
    """

    # Compiled once rather than on each call. ``\s`` matches newlines as
    # well as spaces and tabs, so one substitution handles all whitespace.
    _WHITESPACE_RUN = re.compile(r"\s+")

    @staticmethod
    def clean_text(text: str) -> str:
        """Collapse each run of whitespace to one space and trim both ends.

        Newlines count as whitespace here, so the result is always a
        single line. (An earlier version followed this with a separate
        "collapse repeated newlines" substitution; it could never match
        because no newline survives the first pass, so it was removed.)

        Args:
            text: The raw text to normalise.

        Returns:
            The normalised text; an empty string when ``text`` is empty or
            contains only whitespace.
        """
        return TextPreprocessor._WHITESPACE_RUN.sub(" ", text).strip()

    @staticmethod
    def preprocess_documents(documents: List[Document]) -> List[Document]:
        """Return cleaned copies of ``documents``, skipping empty ones.

        Each document's ``page_content`` is run through ``clean_text``.
        Documents whose cleaned content is empty are dropped; the rest are
        rebuilt as new ``Document`` objects that receive the original
        ``metadata`` value as-is. The number of documents kept is logged
        at info level.

        Args:
            documents: The documents to clean.

        Returns:
            A new list of documents, in input order, with normalised
            content.
        """
        processed_docs = []
        for doc in documents:
            cleaned_content = TextPreprocessor.clean_text(doc.page_content)
            if not cleaned_content:
                # Nothing but whitespace: not worth keeping.
                continue
            processed_docs.append(
                Document(
                    page_content=cleaned_content,
                    metadata=doc.metadata,
                )
            )
        logger.info(f"Preprocessed {len(processed_docs)} documents")
        return processed_docs
# Module-level instance created at import time. TextPreprocessor defines only
# static methods, so this object carries no state of its own.
text_preprocessor = TextPreprocessor()