Spaces:
Sleeping
Sleeping
| import re | |
| from langchain_core.documents import Document | |
| from app.utils.logger import logger | |
| from typing import List | |
class TextPreprocessor:
    """Whitespace-normalisation helpers for raw text and langchain Documents.

    The class holds no state. Both helpers are static methods, so they can
    be called either on the class (``TextPreprocessor.clean_text(...)``) or
    on an instance.
    """

    @staticmethod
    def clean_text(text: str) -> str:
        """Collapse every run of whitespace to one space and trim the ends.

        Newlines and tabs count as whitespace, so multi-line input comes
        back as a single line.

        Args:
            text: The raw text to normalise.

        Returns:
            The normalised text; an empty string if ``text`` was empty or
            contained only whitespace.
        """
        # `\s+` already matches newlines, so no separate newline-collapsing
        # pass is needed after this substitution.
        collapsed = re.sub(r"\s+", " ", text)
        return collapsed.strip()

    @staticmethod
    def preprocess_documents(documents: List["Document"]) -> List["Document"]:
        """Return cleaned copies of ``documents``, skipping empty ones.

        Each document's ``page_content`` is passed through ``clean_text``.
        Documents whose cleaned content is empty are left out of the result.
        The number of documents kept is logged at info level.

        Args:
            documents: The documents to clean. They are not modified.

        Returns:
            A new list of new ``Document`` objects, in the input order, each
            built from the cleaned content and the source document's
            ``metadata``.
        """
        processed_docs = []
        for doc in documents:
            cleaned_content = TextPreprocessor.clean_text(doc.page_content)
            if not cleaned_content:
                # Nothing left after normalisation; skip this document.
                continue
            processed_docs.append(
                Document(page_content=cleaned_content, metadata=doc.metadata)
            )

        logger.info(f"Preprocessed {len(processed_docs)} documents")
        return processed_docs
# Module-level TextPreprocessor instance; the class defines no instance state.
text_preprocessor = TextPreprocessor()