Spaces:
Sleeping
Sleeping
File size: 1,131 Bytes
64d7fdf | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 | import re
from langchain_core.documents import Document
from app.utils.logger import logger
from typing import List
class TextPreprocessor:
    """Whitespace-normalisation helpers for raw text and ``Document`` lists.

    Every method is static; the class is used purely as a namespace.
    """

    # Runs of any whitespace character except "\n" (spaces, tabs, "\r", ...).
    _INLINE_WHITESPACE = re.compile(r'[^\S\n]+')
    # One line break together with the single spaces / extra line breaks
    # surrounding it (valid once inline whitespace has been collapsed).
    _LINE_BREAKS = re.compile(r' ?\n[ \n]*')

    @staticmethod
    def clean_text(text: str) -> str:
        """Return *text* with redundant whitespace removed.

        * Runs of spaces/tabs (any whitespace other than a newline) become a
          single space.
        * Runs of line breaks, including blank or whitespace-only lines,
          become a single ``"\\n"`` with no spaces around it.
        * Leading and trailing whitespace is stripped.

        The previous implementation replaced ``\\s+`` with a space first,
        which also consumed every newline and left the subsequent
        "collapse multiple newlines" substitution with nothing to match.
        The two kinds of whitespace are now handled separately so single
        line breaks are preserved.

        Args:
            text: The raw text to normalise.

        Returns:
            The cleaned text; an empty string if *text* was only whitespace.
        """
        # Collapse inline whitespace first so that at most one space can sit
        # next to a newline; this keeps the second pattern simple and linear.
        text = TextPreprocessor._INLINE_WHITESPACE.sub(' ', text)
        text = TextPreprocessor._LINE_BREAKS.sub('\n', text)
        return text.strip()

    @staticmethod
    def preprocess_documents(documents: List[Document]) -> List[Document]:
        """Clean the content of each document and drop the empty ones.

        Args:
            documents: Documents whose ``page_content`` should be cleaned.

        Returns:
            A new list of new ``Document`` objects, in the original order,
            containing only those whose cleaned content is non-empty. The
            input documents are not modified.
        """
        processed_docs = []
        for doc in documents:
            cleaned_content = TextPreprocessor.clean_text(doc.page_content)
            if not cleaned_content:
                # Nothing but whitespace: skip it entirely.
                continue
            processed_docs.append(
                Document(
                    page_content=cleaned_content,
                    # Shallow copy so that later changes to the processed
                    # document's metadata do not leak into the source one.
                    metadata=dict(doc.metadata),
                )
            )
        logger.info(f"Preprocessed {len(processed_docs)} documents")
        return processed_docs
# Module-level TextPreprocessor instance. All of the class's methods are
# static, so this object carries no state of its own.
text_preprocessor = TextPreprocessor()