File size: 1,131 Bytes
64d7fdf
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
import re
from langchain_core.documents import Document
from app.utils.logger import logger
from typing import List


class TextPreprocessor:
    """Whitespace normalisation helpers for raw text and documents.

    Every method is static; the class only acts as a namespace.
    """

    # Compiled once. ``\s`` matches spaces, tabs and newlines alike.
    _WHITESPACE_RUN = re.compile(r"\s+")

    @staticmethod
    def clean_text(text: str) -> str:
        """Collapse each run of whitespace to one space and trim both ends.

        Newlines count as whitespace, so the result is always a single
        line: ``"a\\n\\n b"`` becomes ``"a b"``.

        Args:
            text: The string to normalise.

        Returns:
            The normalised string; empty when ``text`` is empty or
            contains only whitespace.
        """
        # Because ``\s`` already covers ``\n``, no separate newline pass is
        # needed: after this substitution there are no newlines left.
        return TextPreprocessor._WHITESPACE_RUN.sub(" ", text).strip()

    @staticmethod
    def preprocess_documents(documents: "List[Document]") -> "List[Document]":
        """Return cleaned copies of ``documents``, dropping empty ones.

        Each document's ``page_content`` is passed through ``clean_text``.
        Documents whose cleaned content is empty are skipped. The input
        documents themselves are not modified.

        Args:
            documents: Documents to clean.

        Returns:
            A new list, in input order, holding one new ``Document`` per
            input whose cleaned content is non-empty. Each result carries
            a shallow copy of the source document's metadata.
        """
        processed_docs = []

        for doc in documents:
            cleaned_content = TextPreprocessor.clean_text(doc.page_content)

            # Nothing but whitespace in the source: leave it out.
            if not cleaned_content:
                continue

            processed_docs.append(
                Document(
                    page_content=cleaned_content,
                    # Shallow copy so that adding or removing metadata keys
                    # on one document does not silently change the other.
                    metadata=dict(doc.metadata),
                )
            )

        logger.info(f"Preprocessed {len(processed_docs)} documents")
        return processed_docs


# Module-level TextPreprocessor instance. The class defines only static
# methods, so this object holds no state of its own.
# NOTE(review): presumably imported by other modules as a shared helper —
# confirm against callers.
text_preprocessor = TextPreprocessor()