# toxic-api/app/services/text_processor.py
# Initial deployment - Toxic Detection API (author: handrix, commit ae4e2a6)
"""
Text Processor
==============
Text processing utilities (Single Responsibility)
"""
import re
from typing import Any, Dict, List
class TextProcessor:
    """
    Text processing service (single responsibility per method).

    Responsibilities:
    - Split text into sentences (with character offsets)
    - Extract words from text (with character offsets)
    - Identify stop words
    - Identify punctuation
    """

    # Vietnamese function words that carry little semantic content.
    # NOTE(review): 'nhữ' looks like a truncated 'những' (already present
    # above) — kept byte-identical to preserve behavior; confirm with the
    # data owner. ('nên' and 'còn' appear twice; harmless in a set literal.)
    STOP_WORDS = {
        'này', 'kia', 'đó', 'ấy', 'nọ', 'đây', 'nào',
        'các', 'những', 'mọi', 'cả',
        'tôi', 'ta', 'mình', 'bạn', 'anh', 'chị', 'em',
        'nó', 'họ', 'chúng', 'ai', 'gì',
        'và', 'hoặc', 'nhưng', 'mà', 'nên', 'vì', 'nếu', 'thì', 'hay',
        'rồi', 'còn', 'cũng', 'luôn', 'đều',
        'thế', 'như',
        'của', 'cho', 'với', 'từ', 'bởi', 'về', 'trong', 'ngoài',
        'là', 'có', 'được', 'bị', 'ở', 'đang', 'sẽ', 'đã',
        'thể', 'phải', 'nên', 'muốn', 'cần', 'biết',
        'rất', 'quá', 'khá', 'hơi', 'vẫn', 'còn',
        'chỉ', 'vừa', 'mới',
        'đâu', 'sao',
        'không', 'chẳng', 'chưa',
        'nhiều', 'ít', 'vài', 'một',
        'việc', 'chuyện', 'điều', 'lúc', 'khi',
        'ra', 'vào', 'nhau', 'nhữ',
        'vậy', 'ạ', 'nhé',
    }

    # Characters considered punctuation when classifying a token.
    PUNCTUATION = set('.,!?;:()[]{}"\'-/\\@#$%^&*+=<>~`|')

    # A sentence starts at the first non-space, non-terminator character and
    # runs up to (and including) the next run of terminators (. ! ?).
    # Compiled once at class creation instead of per call.
    _SENTENCE_RE = re.compile(r'[^.!?\s][^.!?]*[.!?]*')

    # A word is one or more ASCII letters, Vietnamese letters (with
    # diacritics), or underscores; IGNORECASE also matches the upper-case
    # forms of the accented characters.
    _WORD_RE = re.compile(
        r'[a-zA-Zàáảãạăắằẳẵặâấầẩẫậèéẻẽẹêếềểễệìíỉĩịòóỏõọ'
        r'ôốồổỗộơớờởỡợùúủũụưứừửữựỳýỷỹỵđ_]+',
        re.IGNORECASE,
    )

    @staticmethod
    def split_into_sentences(text: str) -> List[Dict[str, Any]]:
        """
        Split text into sentences.

        Offsets index into the original ``text``, so
        ``text[s['start']:s['end']] == s['text']`` holds for every entry.
        (The previous implementation dropped the whitespace consumed by its
        split pattern, shifting every offset after the first sentence.)

        Args:
            text: Input text.

        Returns:
            List of ``{'text', 'start', 'end'}`` dicts, in order of
            appearance. If no sentence is found (empty, whitespace-only, or
            punctuation-only input), a single entry covering the whole text
            is returned.
        """
        sentences = [
            {'text': m.group(), 'start': m.start(), 'end': m.end()}
            for m in TextProcessor._SENTENCE_RE.finditer(text)
        ]
        if not sentences:
            # Degenerate input: fall back to one sentence spanning everything.
            sentences.append({'text': text, 'start': 0, 'end': len(text)})
        return sentences

    @staticmethod
    def extract_words(text: str) -> List[Dict[str, Any]]:
        """
        Extract words from text.

        Args:
            text: Input text.

        Returns:
            List of ``{'word', 'start', 'end'}`` dicts, in order of
            appearance; offsets index into ``text``.
        """
        return [
            {'word': m.group(), 'start': m.start(), 'end': m.end()}
            for m in TextProcessor._WORD_RE.finditer(text)
        ]

    @classmethod
    def is_stop_word(cls, word: str) -> bool:
        """
        Check if a word is a stop word.

        Comparison is case-insensitive and ignores surrounding whitespace.

        Args:
            word: Word to check.

        Returns:
            True if the word is a stop word.
        """
        return word.lower().strip() in cls.STOP_WORDS

    @classmethod
    def is_punctuation(cls, token: str) -> bool:
        """
        Check if a token consists solely of punctuation characters.

        Empty (or otherwise falsy) tokens count as punctuation, matching the
        original behavior.

        Args:
            token: Token to check.

        Returns:
            True if the token is punctuation.
        """
        return not token or all(c in cls.PUNCTUATION for c in token)