Spaces:
Sleeping
Sleeping
| """TextContent value object for representing text data with language and encoding validation.""" | |
| from dataclasses import dataclass | |
| from typing import Optional | |
| import re | |
@dataclass(frozen=True)
class TextContent:
    """Immutable value object for text with its language and encoding.

    Every field is validated on construction (see ``_validate``), so an
    instance that exists is always non-empty, within the length limit,
    tagged with a well-formed language code, and encodable with its
    declared encoding.

    Attributes:
        text: The text itself; must contain at least one non-whitespace
            character and be at most 50,000 characters long.
        language: Language code such as ``'en'`` or ``'en-US'`` (two or
            three lowercase letters, optionally followed by a hyphen and a
            two-letter uppercase region).
        encoding: One of ``'utf-8'``, ``'utf-16'``, ``'ascii'`` or
            ``'latin-1'``; defaults to ``'utf-8'``.

    Raises:
        TypeError: If any field is not a ``str``.
        ValueError: If any field fails the checks described above.
    """

    # Un-annotated on purpose: class-level constants, not dataclass fields.
    _MAX_TEXT_LENGTH = 50000  # Reasonable limit for TTS processing
    _SUPPORTED_ENCODINGS = ('utf-8', 'utf-16', 'ascii', 'latin-1')
    _LANGUAGE_PATTERN = r'^[a-z]{2,3}(-[A-Z]{2})?$'

    text: str
    language: str
    encoding: str = 'utf-8'

    def __post_init__(self) -> None:
        """Validate text content after initialization."""
        self._validate()

    def _validate(self) -> None:
        """Check every field, raising on the first violation found.

        Raises:
            TypeError: If ``text``, ``language`` or ``encoding`` is not a
                string.
            ValueError: If the text is blank or too long, the language code
                is blank or malformed, the encoding is unsupported, or the
                text cannot be encoded with the declared encoding.
        """
        if not isinstance(self.text, str):
            raise TypeError("Text must be a string")
        if not self.text.strip():
            raise ValueError("Text content cannot be empty or whitespace only")
        if len(self.text) > self._MAX_TEXT_LENGTH:
            raise ValueError("Text content too long (maximum 50,000 characters)")

        if not isinstance(self.language, str):
            raise TypeError("Language must be a string")
        if not self.language.strip():
            raise ValueError("Language cannot be empty")
        # fullmatch (not match): with match, the trailing ``$`` would also
        # accept a code followed by a newline, e.g. "en\n".
        if not re.fullmatch(self._LANGUAGE_PATTERN, self.language):
            raise ValueError(f"Invalid language code format: {self.language}. Expected format: 'en', 'en-US', etc.")

        if not isinstance(self.encoding, str):
            raise TypeError("Encoding must be a string")
        if self.encoding not in self._SUPPORTED_ENCODINGS:
            raise ValueError(f"Unsupported encoding: {self.encoding}. Supported: utf-8, utf-16, ascii, latin-1")

        # Validate that text can be encoded with specified encoding
        try:
            self.text.encode(self.encoding)
        except UnicodeEncodeError as err:
            # Chain the cause so the offending character position is kept.
            raise ValueError(f"Text cannot be encoded with {self.encoding} encoding") from err

    def word_count(self) -> int:
        """Return the approximate word count (whitespace-separated tokens)."""
        return len(self.text.split())

    def character_count(self) -> int:
        """Return the number of characters in the text."""
        return len(self.text)

    def is_empty(self) -> bool:
        """Return True if the text is empty or whitespace only."""
        return not self.text.strip()

    def truncate(self, max_length: int) -> 'TextContent':
        """Return a TextContent whose text is at most ``max_length`` characters.

        Args:
            max_length: Maximum number of characters to keep; must be
                positive.

        Returns:
            ``self`` when the text already fits; otherwise a new
            ``TextContent`` with the same language and encoding, holding the
            first ``max_length`` characters with trailing whitespace
            stripped.

        Raises:
            ValueError: If ``max_length`` is not positive, or if the
                truncated prefix consists only of whitespace (the new
                instance then fails validation).
        """
        if max_length <= 0:
            raise ValueError("Max length must be positive")
        if len(self.text) <= max_length:
            return self
        truncated_text = self.text[:max_length].rstrip()
        return TextContent(
            text=truncated_text,
            language=self.language,
            encoding=self.encoding,
        )