File size: 4,563 Bytes
bec06d9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
import re
from typing import List, Tuple
import logging

logger = logging.getLogger(__name__)

class TextPreprocessor:
    """
    A utility class for preprocessing text before embedding.
    Includes cleaning, normalization, and chunking methods.

    All methods are static and stateless; the class acts as a namespace.
    """

    @staticmethod
    def clean_text(text: str) -> str:
        """Clean text by collapsing whitespace and removing unusual characters.

        Keeps word characters, whitespace and basic punctuation
        (. , ! ? ; : - ( )); every other character becomes a space.

        Args:
            text: Raw input text.

        Returns:
            Cleaned, single-spaced, stripped text.
        """
        # Collapse every run of whitespace (tabs, newlines, ...) to one space.
        text = re.sub(r'\s+', ' ', text)
        # Replace disallowed characters with a space (not empty) so adjacent
        # words are not accidentally glued together.
        text = re.sub(r'[^\w\s\.\,\!\?\;\:\-\(\)]', ' ', text)
        # The substitution above may introduce new double spaces; collapse again.
        text = re.sub(r'\s+', ' ', text).strip()
        return text

    @staticmethod
    def split_by_sentences(text: str) -> List[str]:
        """Split text into sentences on runs of '.', '!' or '?'.

        Empty fragments are dropped and each sentence is stripped.
        """
        sentences = re.split(r'[.!?]+', text)
        return [s.strip() for s in sentences if s.strip()]

    @staticmethod
    def split_by_paragraphs(text: str) -> List[str]:
        """Split text into paragraphs on blank lines (double newline).

        Empty fragments are dropped and each paragraph is stripped.
        """
        paragraphs = text.split('\n\n')
        return [p.strip() for p in paragraphs if p.strip()]

    @staticmethod
    def chunk_text(text: str, chunk_size: int = 512, overlap: int = 50) -> List[str]:
        """
        Split text into overlapping chunks of specified size.

        Chunks preferentially end at a sentence boundary ('.') in the second
        half of the window, falling back to the last space, then a hard cut.

        Args:
            text: The input text to chunk
            chunk_size: Maximum size of each chunk (in characters)
            overlap: Number of characters to overlap between chunks

        Returns:
            List of non-empty text chunks.

        Raises:
            ValueError: If chunk_size is not positive.
        """
        if chunk_size <= 0:
            raise ValueError("chunk_size must be positive")
        if len(text) <= chunk_size:
            return [text]

        chunks: List[str] = []
        text_len = len(text)
        start = 0

        while start < text_len:
            end = min(start + chunk_size, text_len)

            # Only hunt for a nicer break point when not already at the end.
            if end < text_len:
                # Prefer a sentence boundary in the second half of the window.
                sentence_end = text.rfind('.', start, end)
                if sentence_end != -1 and sentence_end > start + chunk_size // 2:
                    end = sentence_end + 1
                else:
                    # Otherwise fall back to the last space in the window.
                    space_end = text.rfind(' ', start, end)
                    if space_end != -1 and space_end > start + chunk_size // 2:
                        end = space_end

            chunk = text[start:end].strip()
            if chunk:
                chunks.append(chunk)

            if end >= text_len:
                break

            # Step back by `overlap` for the next chunk, but never move the
            # window backwards: the original `end - overlap` could re-enter
            # the same window and loop forever whenever the effective advance
            # was <= overlap (BUG FIX — e.g. chunk_size=20, overlap=18 on
            # space-periodic text). Fall back to `end` to guarantee progress.
            next_start = end - overlap
            start = next_start if next_start > start else end

        return chunks

    @staticmethod
    def extract_key_info(text: str) -> dict:
        """
        Extract key information from text such as headers, titles, etc.
        This is a simple implementation that looks for common patterns.

        Returns:
            Dict with optional keys 'potential_title', 'emails', 'urls'.
        """
        info: dict = {}

        # Potential titles: short-ish lines near the top that are all-caps
        # or title-case.  (BUG FIX: the original mixed `and`/`or` without
        # parentheses, so ANY title-case line matched regardless of length.)
        stripped_lines = (line.strip() for line in text.split('\n')[:10])
        potential_titles = [
            s for s in stripped_lines
            if 10 < len(s) < 100 and (s.isupper() or s.istitle())
        ]
        if potential_titles:
            info['potential_title'] = potential_titles[0]

        # Extract email addresses.  (BUG FIX: the original TLD class
        # `[A-Z|a-z]` also matched a literal '|' character.)
        emails = re.findall(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b', text)
        if emails:
            info['emails'] = emails[:5]  # Limit to first 5 emails

        # Extract URLs.  NOTE(review): `\\(` / `\\)` inside this raw-string
        # character class also match a literal backslash; kept byte-identical
        # to preserve matching behavior — confirm intent before tightening.
        urls = re.findall(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', text)
        if urls:
            info['urls'] = urls[:5]  # Limit to first 5 URLs

        return info