File size: 4,227 Bytes
ae4e2a6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
"""
Text Processor
==============
Text processing utilities (Single Responsibility)
"""

import re
from typing import Any, Dict, List


class TextProcessor:
    """
    Text processing service

    Responsibilities:
    - Split text into sentences
    - Extract words from text
    - Identify stop words
    - Identify punctuation

    All character offsets returned by this class index into the exact
    string that was passed in, so ``text[start:end]`` always yields the
    reported item.
    """

    # Vietnamese function words that ``is_stop_word`` treats as stop words.
    # NOTE(review): 'nhữ' looks like a truncated 'những' -- kept so that
    # behaviour is unchanged; confirm whether it can be dropped.
    STOP_WORDS = {
        'này', 'kia', 'đó', 'ấy', 'nọ', 'đây', 'nào',
        'các', 'những', 'mọi', 'cả',
        'tôi', 'ta', 'mình', 'bạn', 'anh', 'chị', 'em',
        'nó', 'họ', 'chúng', 'ai', 'gì',
        'và', 'hoặc', 'nhưng', 'mà', 'nên', 'vì', 'nếu', 'thì', 'hay',
        'rồi', 'còn', 'cũng', 'luôn', 'đều',
        'thế', 'như',
        'của', 'cho', 'với', 'từ', 'bởi', 'về', 'trong', 'ngoài',
        'là', 'có', 'được', 'bị', 'ở', 'đang', 'sẽ', 'đã',
        'thể', 'phải', 'muốn', 'cần', 'biết',
        'rất', 'quá', 'khá', 'hơi', 'vẫn',
        'chỉ', 'vừa', 'mới',
        'đâu', 'sao',
        'không', 'chẳng', 'chưa',
        'nhiều', 'ít', 'vài', 'một',
        'việc', 'chuyện', 'điều', 'lúc', 'khi',
        'ra', 'vào', 'nhau', 'nhữ',
        'vậy', 'ạ', 'nhé',
    }

    # Characters that ``is_punctuation`` accepts.
    PUNCTUATION = set('.,!?;:()[]{}"\'-/\\@#$%^&*+=<>~`|')

    # A run of sentence terminators (captured) plus any whitespace after it.
    _SENTENCE_END_RE = re.compile(r'([.!?]+)\s*')

    # A run of ASCII letters, Vietnamese letters or underscores.
    _WORD_RE = re.compile(
        r'[a-zA-Zàáảãạăắằẳẵặâấầẩẫậèéẻẽẹêếềểễệìíỉĩịòóỏõọôốồổỗộơớờởỡợùúủũụưứừửữựỳýỷỹỵđ_]+',
        re.IGNORECASE,
    )

    @staticmethod
    def split_into_sentences(text: str) -> List[Dict[str, Any]]:
        """
        Split text into sentences

        A sentence is the text up to and including a run of ``.``, ``!``
        or ``?``. Whitespace following the terminators belongs to no
        sentence. Terminator runs with no preceding content are skipped.
        Trailing text without a terminator is returned as a final sentence.

        Args:
            text: Input text

        Returns:
            List of dicts with keys ``text``, ``start`` and ``end`` where
            ``text[start:end] == sentence['text']``. If no sentence is
            found (empty, blank or punctuation-only input), a single entry
            covering the whole input is returned.
        """
        sentences = []
        cursor = 0  # offset in ``text`` where the next sentence begins

        for terminator in TextProcessor._SENTENCE_END_RE.finditer(text):
            punct_end = terminator.end(1)
            # Only emit when there is real content before the terminators.
            if text[cursor:terminator.start()].strip():
                sentences.append({
                    'text': text[cursor:punct_end],
                    'start': cursor,
                    'end': punct_end,
                })
            # Jump past the trailing whitespace too, so the next sentence's
            # offsets stay aligned with the original string.
            cursor = terminator.end()

        # Remaining text that has no closing terminator.
        if text[cursor:].strip():
            sentences.append({
                'text': text[cursor:],
                'start': cursor,
                'end': len(text),
            })

        if not sentences:
            sentences.append({'text': text, 'start': 0, 'end': len(text)})

        return sentences

    @staticmethod
    def extract_words(text: str) -> List[Dict[str, Any]]:
        """
        Extract words from text

        A word is a maximal run of ASCII letters, Vietnamese letters or
        underscores, matched case-insensitively. Digits and other
        characters act as separators.

        Args:
            text: Input text

        Returns:
            List of dicts with keys ``word``, ``start`` and ``end``
            (offsets into ``text``), in order of appearance.
        """
        return [
            {'word': match.group(), 'start': match.start(), 'end': match.end()}
            for match in TextProcessor._WORD_RE.finditer(text)
        ]

    @classmethod
    def is_stop_word(cls, word: str) -> bool:
        """
        Check if word is a stop word

        The comparison ignores case and surrounding whitespace.

        Args:
            word: Word to check

        Returns:
            True if stop word
        """
        return word.lower().strip() in cls.STOP_WORDS

    @classmethod
    def is_punctuation(cls, token: str) -> bool:
        """
        Check if token is punctuation

        Args:
            token: Token to check

        Returns:
            True if every character of ``token`` is in ``PUNCTUATION``.
            An empty token also yields True.
        """
        # all() over an empty string is True, which covers the empty token.
        return all(char in cls.PUNCTUATION for char in token)