Spaces:
Sleeping
Sleeping
| """ | |
| Text Processor | |
| ============== | |
| Text processing utilities (Single Responsibility) | |
| """ | |
import re
from typing import Any, Dict, List
class TextProcessor:
    """
    Text processing service (Single Responsibility).

    Responsibilities:
    - Split text into sentences
    - Extract words from text
    - Identify stop words
    - Identify punctuation
    """

    # Vietnamese stop words, all lowercase (is_stop_word lower-cases before lookup).
    # NOTE(review): 'nhữ' looks like a truncated 'những' (already present) — confirm
    # before removing. 'nên' and 'còn' each appear twice; harmless in a set literal.
    STOP_WORDS = {
        'này', 'kia', 'đó', 'ấy', 'nọ', 'đây', 'nào',
        'các', 'những', 'mọi', 'cả',
        'tôi', 'ta', 'mình', 'bạn', 'anh', 'chị', 'em',
        'nó', 'họ', 'chúng', 'ai', 'gì',
        'và', 'hoặc', 'nhưng', 'mà', 'nên', 'vì', 'nếu', 'thì', 'hay',
        'rồi', 'còn', 'cũng', 'luôn', 'đều',
        'thế', 'như',
        'của', 'cho', 'với', 'từ', 'bởi', 'về', 'trong', 'ngoài',
        'là', 'có', 'được', 'bị', 'ở', 'đang', 'sẽ', 'đã',
        'thể', 'phải', 'nên', 'muốn', 'cần', 'biết',
        'rất', 'quá', 'khá', 'hơi', 'vẫn', 'còn',
        'chỉ', 'vừa', 'mới',
        'đâu', 'sao',
        'không', 'chẳng', 'chưa',
        'nhiều', 'ít', 'vài', 'một',
        'việc', 'chuyện', 'điều', 'lúc', 'khi',
        'ra', 'vào', 'nhau', 'nhữ',
        'vậy', 'ạ', 'nhé',
    }

    # Characters considered punctuation by is_punctuation().
    PUNCTUATION = set('.,!?;:()[]{}"\'-/\\@#$%^&*+=<>~`|')

    # A sentence is a run of non-terminator characters followed by one or
    # more terminators (. ! ?). Compiled once; used by split_into_sentences.
    _SENTENCE_RE = re.compile(r'[^.!?]+[.!?]+')

    # Word characters: ASCII letters, Vietnamese lowercase letters (uppercase
    # forms are covered by IGNORECASE), and underscore.
    _WORD_RE = re.compile(
        r'[a-zA-Zàáảãạăắằẳẵặâấầẩẫậèéẻẽẹêếềểễệìíỉĩịòóỏõọôốồổỗộơớờởỡợùúủũụưứừửữựỳýỷỹỵđ_]+',
        re.IGNORECASE,
    )

    @staticmethod
    def split_into_sentences(text: str) -> List[Dict[str, Any]]:
        """
        Split text into sentences.

        Fixed vs. previous revision: declared ``@staticmethod`` (it had no
        ``self`` parameter, so instance calls were broken), and
        'start'/'end' are now true offsets into ``text`` — the old
        implementation dropped inter-sentence whitespace from its position
        counter, so offsets drifted after the first sentence. Now
        ``text[s['start']:s['end']] == s['text']`` always holds.

        Args:
            text: Input text.

        Returns:
            List of ``{'text', 'start', 'end'}`` dicts, in order. If no
            sentence terminator (. ! ?) occurs, the whole text is returned
            as a single sentence.
        """
        sentences: List[Dict[str, Any]] = []
        for match in TextProcessor._SENTENCE_RE.finditer(text):
            raw = match.group()
            # Leading inter-sentence whitespace belongs to no sentence.
            body = raw.lstrip()
            # Skip stray terminator-only runs (e.g. a free-standing '...').
            if not body or re.fullmatch(r'[.!?]+', body):
                continue
            start = match.start() + (len(raw) - len(body))
            sentences.append({
                'text': body,
                'start': start,
                'end': start + len(body),
            })
        if not sentences:
            # No terminator at all: treat the entire input as one sentence.
            sentences.append({'text': text, 'start': 0, 'end': len(text)})
        return sentences

    @staticmethod
    def extract_words(text: str) -> List[Dict[str, Any]]:
        """
        Extract words (ASCII/Vietnamese letters and underscores) from text.

        Fixed vs. previous revision: declared ``@staticmethod`` (no ``self``
        parameter), annotation corrected from builtin ``any`` to
        ``typing.Any``.

        Args:
            text: Input text.

        Returns:
            List of ``{'word', 'start', 'end'}`` dicts in order of
            appearance; ``text[w['start']:w['end']] == w['word']``.
        """
        return [
            {'word': m.group(), 'start': m.start(), 'end': m.end()}
            for m in TextProcessor._WORD_RE.finditer(text)
        ]

    @classmethod
    def is_stop_word(cls, word: str) -> bool:
        """
        Check whether ``word`` is a Vietnamese stop word.

        Fixed vs. previous revision: declared ``@classmethod`` (the body
        reads ``cls.STOP_WORDS`` but the decorator was missing, so calls on
        the class raised TypeError).

        Comparison is case-insensitive and ignores surrounding whitespace.

        Args:
            word: Word to check.

        Returns:
            True if the normalized word is in STOP_WORDS.
        """
        return word.lower().strip() in cls.STOP_WORDS

    @classmethod
    def is_punctuation(cls, token: str) -> bool:
        """
        Check whether ``token`` consists entirely of punctuation.

        Fixed vs. previous revision: declared ``@classmethod`` (missing
        decorator, same issue as ``is_stop_word``).

        Args:
            token: Token to check.

        Returns:
            True if the token is empty or every character is in
            PUNCTUATION (empty-token-is-True kept from the original
            contract).
        """
        return not token or all(c in cls.PUNCTUATION for c in token)