Spaces:

HoangDaoAI
/

toxic-api

Running

File size: 4,227 Bytes

ae4e2a6

"""
Text Processor
==============
Text processing utilities (Single Responsibility)
"""

import re
from typing import Any, Dict, List


class TextProcessor:
    """
    Text processing service

    Responsibilities:
    - Split text into sentences
    - Extract words from text
    - Identify stop words
    - Identify punctuation

    All methods are static/class methods; no instance state is used.
    """

    # Lower-case stop words; lookups go through is_stop_word(), which
    # lower-cases and strips its argument before testing membership.
    # NOTE(review): 'nhữ' looks like a truncated 'những' - kept so the set
    # is unchanged for existing callers; confirm whether it can be removed.
    STOP_WORDS = {
        'này', 'kia', 'đó', 'ấy', 'nọ', 'đây', 'nào',
        'các', 'những', 'mọi', 'cả',
        'tôi', 'ta', 'mình', 'bạn', 'anh', 'chị', 'em',
        'nó', 'họ', 'chúng', 'ai', 'gì',
        'và', 'hoặc', 'nhưng', 'mà', 'nên', 'vì', 'nếu', 'thì', 'hay',
        'rồi', 'còn', 'cũng', 'luôn', 'đều',
        'thế', 'như',
        'của', 'cho', 'với', 'từ', 'bởi', 'về', 'trong', 'ngoài',
        'là', 'có', 'được', 'bị', 'ở', 'đang', 'sẽ', 'đã',
        'thể', 'phải', 'muốn', 'cần', 'biết',
        'rất', 'quá', 'khá', 'hơi', 'vẫn',
        'chỉ', 'vừa', 'mới',
        'đâu', 'sao',
        'không', 'chẳng', 'chưa',
        'nhiều', 'ít', 'vài', 'một',
        'việc', 'chuyện', 'điều', 'lúc', 'khi',
        'ra', 'vào', 'nhau', 'nhữ',
        'vậy', 'ạ', 'nhé',
    }

    # Individual characters treated as punctuation by is_punctuation().
    PUNCTUATION = set('.,!?;:()[]{}"\'-/\\@#$%^&*+=<>~`|')

    # A run of sentence terminators plus any whitespace that follows it.
    # Group 1 is the terminator run only (it stays part of the sentence).
    _SENTENCE_END_RE = re.compile(r'([.!?]+)\s*')

    # Runs of ASCII letters, the listed Vietnamese letters and underscores.
    # IGNORECASE lets the lower-case letters listed also match upper-case.
    _WORD_RE = re.compile(
        r'[a-zA-Zàáảãạăắằẳẵặâấầẩẫậèéẻẽẹêếềểễệìíỉĩịòóỏõọôốồổỗộơớờởỡợùúủũụưứừửữựỳýỷỹỵđ_]+',
        re.IGNORECASE,
    )

    @staticmethod
    def split_into_sentences(text: str) -> List[Dict[str, Any]]:
        """
        Split text into sentences

        A sentence is the text up to and including a run of terminators
        ('.', '!' or '?'). Whitespace directly after the terminators belongs
        to no sentence. Chunks that are empty or whitespace-only (for
        example terminators at the very start of the text) are dropped.
        Text after the last terminator forms a final sentence.

        Args:
            text: Input text

        Returns:
            List of dicts with keys 'text', 'start' and 'end', where
            text[start:end] equals the sentence text. If no sentence is
            found (including empty input) the whole input is returned as
            a single sentence spanning 0..len(text).
        """
        sentences = []
        # Offset in `text` where the next sentence body begins. Taken from
        # the real match end so skipped whitespace is accounted for.
        cursor = 0

        for terminator in TextProcessor._SENTENCE_END_RE.finditer(text):
            body = text[cursor:terminator.start()]
            if body.strip():
                sentences.append({
                    'text': body + terminator.group(1),
                    'start': cursor,
                    # End right after the terminators, before the whitespace.
                    'end': terminator.end(1),
                })
            cursor = terminator.end()

        # Trailing text that is not closed by a terminator.
        tail = text[cursor:]
        if tail.strip():
            sentences.append({
                'text': tail,
                'start': cursor,
                'end': len(text),
            })

        if not sentences:
            sentences.append({'text': text, 'start': 0, 'end': len(text)})

        return sentences

    @staticmethod
    def extract_words(text: str) -> List[Dict[str, Any]]:
        """
        Extract words from text

        A word is a maximal run of characters matched by the class word
        pattern (letters and underscores; digits and punctuation are not
        matched and therefore separate words).

        Args:
            text: Input text

        Returns:
            List of dicts with keys 'word', 'start' and 'end', in order of
            appearance, where text[start:end] equals the word.
        """
        return [
            {'word': match.group(), 'start': match.start(), 'end': match.end()}
            for match in TextProcessor._WORD_RE.finditer(text)
        ]

    @classmethod
    def is_stop_word(cls, word: str) -> bool:
        """
        Check if word is a stop word

        The word is lower-cased and stripped of surrounding whitespace
        before it is looked up in STOP_WORDS.

        Args:
            word: Word to check

        Returns:
            True if stop word
        """
        return word.lower().strip() in cls.STOP_WORDS

    @classmethod
    def is_punctuation(cls, token: str) -> bool:
        """
        Check if token is punctuation

        Args:
            token: Token to check

        Returns:
            True if every character of the token is in PUNCTUATION.
            A falsy token (empty string or None) also returns True.
        """
        return not token or all(c in cls.PUNCTUATION for c in token)