import re
from typing import List

from langchain_core.documents import Document

from app.utils.logger import logger


class TextPreprocessor:
    """Whitespace normalisation helpers for text and ``Document`` objects."""

    @staticmethod
    def clean_text(text: str) -> str:
        """Return *text* with whitespace normalised onto a single line.

        Every run of whitespace characters (spaces, tabs, newlines, ...) is
        replaced by one space, then leading and trailing whitespace is
        removed.

        Args:
            text: The raw text to normalise.

        Returns:
            The normalised text; an empty string when *text* is empty or
            contains only whitespace.
        """
        # ``\s`` also matches newlines, so no newline survives this
        # substitution; a separate newline-collapsing pass afterwards would
        # never match anything and is therefore not performed.
        collapsed = re.sub(r'\s+', ' ', text)
        return collapsed.strip()

    @staticmethod
    def preprocess_documents(documents: List[Document]) -> List[Document]:
        """Clean the content of each document and drop the empty ones.

        Args:
            documents: Documents whose ``page_content`` should be cleaned
                with :meth:`clean_text`.

        Returns:
            A new list of new ``Document`` objects, in input order, holding
            the cleaned content. Documents whose cleaned content is empty
            are omitted. Each input's ``metadata`` object is passed
            unchanged to the corresponding new ``Document``.
        """
        processed_docs = []

        for doc in documents:
            cleaned_content = TextPreprocessor.clean_text(doc.page_content)

            # Skip documents that contained nothing but whitespace.
            if not cleaned_content:
                continue

            processed_docs.append(
                Document(
                    page_content=cleaned_content,
                    metadata=doc.metadata
                )
            )

        logger.info(f"Preprocessed {len(processed_docs)} documents")
        return processed_docs


# Shared module-level instance for importers of this module.
text_preprocessor = TextPreprocessor()