# toxic-api/app/services/text_processor.py
# Initial deployment - Toxic Detection API (author: handrix, commit ae4e2a6)
"""
Text Processor
==============
Text processing utilities (Single Responsibility)
"""
import re
from typing import Any, Dict, List
class TextProcessor:
    """
    Text processing service (single responsibility per method).

    Responsibilities:
    - Split text into sentences (with character offsets)
    - Extract words from text (with character offsets)
    - Identify stop words
    - Identify punctuation
    """

    # Vietnamese function words that carry little semantic content.
    # NOTE(review): 'nhữ' looks like a truncated 'những' (already present
    # above) — kept byte-identical to preserve behavior; confirm with the
    # data owner. ('nên' and 'còn' appear twice; harmless in a set literal.)
    STOP_WORDS = {
        'này', 'kia', 'đó', 'ấy', 'nọ', 'đây', 'nào',
        'các', 'những', 'mọi', 'cả',
        'tôi', 'ta', 'mình', 'bạn', 'anh', 'chị', 'em',
        'nó', 'họ', 'chúng', 'ai', 'gì',
        'và', 'hoặc', 'nhưng', 'mà', 'nên', 'vì', 'nếu', 'thì', 'hay',
        'rồi', 'còn', 'cũng', 'luôn', 'đều',
        'thế', 'như',
        'của', 'cho', 'với', 'từ', 'bởi', 'về', 'trong', 'ngoài',
        'là', 'có', 'được', 'bị', 'ở', 'đang', 'sẽ', 'đã',
        'thể', 'phải', 'nên', 'muốn', 'cần', 'biết',
        'rất', 'quá', 'khá', 'hơi', 'vẫn', 'còn',
        'chỉ', 'vừa', 'mới',
        'đâu', 'sao',
        'không', 'chẳng', 'chưa',
        'nhiều', 'ít', 'vài', 'một',
        'việc', 'chuyện', 'điều', 'lúc', 'khi',
        'ra', 'vào', 'nhau', 'nhữ',
        'vậy', 'ạ', 'nhé',
    }

    # Characters considered punctuation when classifying a token.
    PUNCTUATION = set('.,!?;:()[]{}"\'-/\\@#$%^&*+=<>~`|')

    # A sentence starts at the first non-space, non-terminator character and
    # runs up to (and including) the next run of terminators (. ! ?).
    # Compiled once at class creation instead of per call.
    _SENTENCE_RE = re.compile(r'[^.!?\s][^.!?]*[.!?]*')

    # A word is one or more ASCII letters, Vietnamese letters (with
    # diacritics), or underscores; IGNORECASE also matches the upper-case
    # forms of the accented characters.
    _WORD_RE = re.compile(
        r'[a-zA-Zàáảãạăắằẳẵặâấầẩẫậèéẻẽẹêếềểễệìíỉĩịòóỏõọ'
        r'ôốồổỗộơớờởỡợùúủũụưứừửữựỳýỷỹỵđ_]+',
        re.IGNORECASE,
    )

    @staticmethod
    def split_into_sentences(text: str) -> List[Dict[str, Any]]:
        """
        Split text into sentences.

        Offsets index into the original ``text``, so
        ``text[s['start']:s['end']] == s['text']`` holds for every entry.
        (The previous implementation dropped the whitespace consumed by its
        split pattern, shifting every offset after the first sentence.)

        Args:
            text: Input text.

        Returns:
            List of ``{'text', 'start', 'end'}`` dicts, in order of
            appearance. If no sentence is found (empty, whitespace-only, or
            punctuation-only input), a single entry covering the whole text
            is returned.
        """
        sentences = [
            {'text': m.group(), 'start': m.start(), 'end': m.end()}
            for m in TextProcessor._SENTENCE_RE.finditer(text)
        ]
        if not sentences:
            # Degenerate input: fall back to one sentence spanning everything.
            sentences.append({'text': text, 'start': 0, 'end': len(text)})
        return sentences

    @staticmethod
    def extract_words(text: str) -> List[Dict[str, Any]]:
        """
        Extract words from text.

        Args:
            text: Input text.

        Returns:
            List of ``{'word', 'start', 'end'}`` dicts, in order of
            appearance; offsets index into ``text``.
        """
        return [
            {'word': m.group(), 'start': m.start(), 'end': m.end()}
            for m in TextProcessor._WORD_RE.finditer(text)
        ]

    @classmethod
    def is_stop_word(cls, word: str) -> bool:
        """
        Check if a word is a stop word.

        Comparison is case-insensitive and ignores surrounding whitespace.

        Args:
            word: Word to check.

        Returns:
            True if the word is a stop word.
        """
        return word.lower().strip() in cls.STOP_WORDS

    @classmethod
    def is_punctuation(cls, token: str) -> bool:
        """
        Check if a token consists solely of punctuation characters.

        Empty (or otherwise falsy) tokens count as punctuation, matching the
        original behavior.

        Args:
            token: Token to check.

        Returns:
            True if the token is punctuation.
        """
        return not token or all(c in cls.PUNCTUATION for c in token)