"""
Text Processor
==============
Text processing utilities (Single Responsibility)
"""

import re
from typing import Any, Dict, List

# A run of sentence-ending punctuation (captured as group 1) followed by any
# amount of whitespace. The whitespace separates sentences and belongs to
# neither of them.
_SENTENCE_END_RE = re.compile(r'([.!?]+)\s*')

# A word is a run of ASCII letters, precomposed Vietnamese letters and
# underscores. Only lowercase accented letters are listed; IGNORECASE makes
# the uppercase forms match as well.
_WORD_RE = re.compile(
    r'[a-zA-Zàáảãạăắằẳẵặâấầẩẫậèéẻẽẹêếềểễệìíỉĩịòóỏõọôốồổỗộơớờởỡợùúủũụưứừửữựỳýỷỹỵđ_]+',
    re.IGNORECASE,
)


class TextProcessor:
    """
    Text processing service

    Responsibilities:
    - Split text into sentences
    - Extract words from text
    - Identify stop words
    - Identify punctuation
    """

    # Lowercase Vietnamese stop words; is_stop_word() lowercases and strips
    # its argument before looking it up here.
    # NOTE(review): 'nhữ' looks like a truncated 'những' - confirm whether
    # it is intentional before removing it.
    STOP_WORDS = {
        'này', 'kia', 'đó', 'ấy', 'nọ', 'đây', 'nào', 'các', 'những', 'mọi',
        'cả', 'tôi', 'ta', 'mình', 'bạn', 'anh', 'chị', 'em', 'nó', 'họ',
        'chúng', 'ai', 'gì', 'và', 'hoặc', 'nhưng', 'mà', 'nên', 'vì', 'nếu',
        'thì', 'hay', 'rồi', 'còn', 'cũng', 'luôn', 'đều', 'thế', 'như',
        'của', 'cho', 'với', 'từ', 'bởi', 'về', 'trong', 'ngoài', 'là', 'có',
        'được', 'bị', 'ở', 'đang', 'sẽ', 'đã', 'thể', 'phải', 'muốn', 'cần',
        'biết', 'rất', 'quá', 'khá', 'hơi', 'vẫn', 'chỉ', 'vừa', 'mới',
        'đâu', 'sao', 'không', 'chẳng', 'chưa', 'nhiều', 'ít', 'vài', 'một',
        'việc', 'chuyện', 'điều', 'lúc', 'khi', 'ra', 'vào', 'nhau', 'nhữ',
        'vậy', 'ạ', 'nhé',
    }

    # Single characters treated as punctuation by is_punctuation().
    PUNCTUATION = set('.,!?;:()[]{}"\'-/\\@#$%^&*+=<>~`|')

    @staticmethod
    def split_into_sentences(text: str) -> List[Dict[str, Any]]:
        """
        Split text into sentences

        A sentence is a chunk of text followed by a run of '.', '!' or '?';
        the terminator run is kept as part of the sentence. Whitespace that
        follows a terminator is not part of any sentence. A chunk that is
        empty or whitespace-only is dropped together with its terminator.
        Trailing text without a terminator forms a final sentence.

        Args:
            text: Input text

        Returns:
            List of dicts with keys 'text', 'start' and 'end', where
            text[start:end] == sentence['text']. If no sentence is found
            (e.g. empty or whitespace-only input), a single entry covering
            the whole input is returned.
        """
        sentences = []
        chunk_start = 0

        for match in _SENTENCE_END_RE.finditer(text):
            # Skip terminators that have no real content in front of them.
            if text[chunk_start:match.start()].strip():
                # The sentence ends after the punctuation run (group 1),
                # before the whitespace that separates it from the next one.
                sentence_end = match.end(1)
                sentences.append({
                    'text': text[chunk_start:sentence_end],
                    'start': chunk_start,
                    'end': sentence_end,
                })
            # The next chunk begins after the separating whitespace, so the
            # offsets stay aligned with the original text.
            chunk_start = match.end()

        # Trailing text with no terminator.
        if text[chunk_start:].strip():
            sentences.append({
                'text': text[chunk_start:],
                'start': chunk_start,
                'end': len(text),
            })

        if not sentences:
            sentences.append({'text': text, 'start': 0, 'end': len(text)})

        return sentences

    @staticmethod
    def extract_words(text: str) -> List[Dict[str, Any]]:
        """
        Extract words from text

        Words are maximal runs of ASCII letters, precomposed Vietnamese
        letters and underscores; digits and other characters separate words.

        Args:
            text: Input text

        Returns:
            List of dicts with keys 'word', 'start' and 'end', in order of
            appearance, where text[start:end] == entry['word']
        """
        return [
            {'word': match.group(), 'start': match.start(), 'end': match.end()}
            for match in _WORD_RE.finditer(text)
        ]

    @classmethod
    def is_stop_word(cls, word: str) -> bool:
        """
        Check if word is a stop word

        The word is lowercased and stripped of surrounding whitespace before
        the lookup.

        Args:
            word: Word to check

        Returns:
            True if stop word
        """
        return word.lower().strip() in cls.STOP_WORDS

    @classmethod
    def is_punctuation(cls, token: str) -> bool:
        """
        Check if token is punctuation

        Args:
            token: Token to check

        Returns:
            True if every character of token is in PUNCTUATION. An empty
            (or otherwise falsy) token is also reported as punctuation.
        """
        return not token or all(c in cls.PUNCTUATION for c in token)