Spaces:
Runtime error
Runtime error
| from smolagents import Tool | |
| from typing import Dict, Optional, Any | |
| import logging | |
| logger = logging.getLogger(__name__) | |
class TextCleanerTool(Tool):
    """smolagents tool that cleans and normalizes text with ``clean-text``.

    The tool forwards its boolean switches to ``cleantext.clean`` using fixed
    replacement tokens (``<URL>``, ``<EMAIL>``, ``<PHONE>``, ``<NUMBER>``,
    ``0`` for digits, ``<CUR>``, and the empty string for punctuation), then
    applies any caller-supplied literal string replacements to the result.
    """

    name = "clean_text"
    description = (
        "Cleans and normalizes text by removing or replacing unwanted elements"
    )
    # Every argument that has a default value in ``forward`` is declared with
    # ``"nullable": True``. smolagents validates the ``inputs`` schema against
    # the ``forward`` signature and rejects a tool whose defaulted parameters
    # are not marked nullable, which makes instantiation fail.
    inputs = {
        "text": {"type": "string", "description": "The input text to clean"},
        "fix_unicode": {
            "type": "boolean",
            "description": "Fix broken unicode characters and mojibake",
            "default": True,
            "nullable": True,
        },
        "to_ascii": {
            "type": "boolean",
            "description": "Convert non-ASCII characters to their closest ASCII equivalents",
            "default": True,
            "nullable": True,
        },
        "lower": {
            "type": "boolean",
            "description": "Convert text to lowercase",
            "default": True,
            "nullable": True,
        },
        "no_line_breaks": {
            "type": "boolean",
            "description": "Replace line breaks with spaces",
            "default": False,
            "nullable": True,
        },
        "no_urls": {
            "type": "boolean",
            "description": "Replace URLs with a token",
            "default": False,
            "nullable": True,
        },
        "no_emails": {
            "type": "boolean",
            "description": "Replace email addresses with a token",
            "default": False,
            "nullable": True,
        },
        "no_phone_numbers": {
            "type": "boolean",
            "description": "Replace phone numbers with a token",
            "default": False,
            "nullable": True,
        },
        "no_numbers": {
            "type": "boolean",
            "description": "Replace all numbers with a token",
            "default": False,
            "nullable": True,
        },
        "no_digits": {
            "type": "boolean",
            "description": "Replace all digits with 0",
            "default": False,
            "nullable": True,
        },
        "no_currency_symbols": {
            "type": "boolean",
            "description": "Replace currency symbols with a token",
            "default": False,
            "nullable": True,
        },
        "no_punct": {
            "type": "boolean",
            "description": "Remove all punctuation",
            "default": False,
            "nullable": True,
        },
        "no_emoji": {
            "type": "boolean",
            "description": "Remove all emoji characters",
            "default": False,
            "nullable": True,
        },
        "lang": {
            "type": "string",
            "description": "Language code for special handling ('en' or 'de' supported)",
            "default": "en",
            "nullable": True,
        },
        "custom_replacements": {
            "type": "object",
            "description": "Dictionary of custom string replacements to apply",
            "nullable": True,
        },
    }
    output_type = "string"

    def forward(
        self,
        text: str,
        fix_unicode: bool = True,
        to_ascii: bool = True,
        lower: bool = True,
        no_line_breaks: bool = False,
        no_urls: bool = False,
        no_emails: bool = False,
        no_phone_numbers: bool = False,
        no_numbers: bool = False,
        no_digits: bool = False,
        no_currency_symbols: bool = False,
        no_punct: bool = False,
        no_emoji: bool = False,
        lang: str = "en",
        custom_replacements: Optional[Dict[str, str]] = None,
    ) -> str:
        """Clean and normalize text by removing or replacing unwanted elements.

        Args:
            text: The text to clean. Falsy input yields ``""``; non-string
                input is converted with ``str()`` first.
            fix_unicode: Fix broken unicode characters and mojibake.
            to_ascii: Convert non-ASCII characters to ASCII equivalents.
            lower: Convert the text to lowercase.
            no_line_breaks: Replace line breaks with spaces.
            no_urls: Replace URLs with ``<URL>``.
            no_emails: Replace email addresses with ``<EMAIL>``.
            no_phone_numbers: Replace phone numbers with ``<PHONE>``.
            no_numbers: Replace numbers with ``<NUMBER>``.
            no_digits: Replace every digit with ``0``.
            no_currency_symbols: Replace currency symbols with ``<CUR>``.
            no_punct: Remove punctuation.
            no_emoji: Remove emoji characters.
            lang: Language code passed through to ``cleantext.clean``.
            custom_replacements: Mapping of literal substrings to their
                replacements, applied in iteration order after cleaning.

        Any option passed as ``None`` falls back to its default shown in the
        signature.

        Returns:
            The cleaned text, or a message starting with ``"Error"`` when the
            input cannot be converted, ``clean-text`` is not installed, or
            cleaning fails. Errors are reported as strings rather than raised.
        """
        # Input validation
        if not text:
            return ""
        if not isinstance(text, str):
            try:
                text = str(text)
            except Exception as e:
                logger.error("Failed to convert input to string: %s", e)
                return f"Error: Could not process input of type {type(text)}"

        # Imported lazily so a missing dependency produces a readable message
        # instead of breaking the import of this module.
        try:
            from cleantext import clean
        except ImportError:
            logger.error(
                "cleantext package not installed. Install with: pip install clean-text"
            )
            return "Error: Required dependency 'clean-text' is not installed."

        # Inputs are nullable, so a caller may pass None explicitly; map each
        # None back to the default documented in the signature.
        supplied_and_default = {
            "fix_unicode": (fix_unicode, True),
            "to_ascii": (to_ascii, True),
            "lower": (lower, True),
            "no_line_breaks": (no_line_breaks, False),
            "no_urls": (no_urls, False),
            "no_emails": (no_emails, False),
            "no_phone_numbers": (no_phone_numbers, False),
            "no_numbers": (no_numbers, False),
            "no_digits": (no_digits, False),
            "no_currency_symbols": (no_currency_symbols, False),
            "no_punct": (no_punct, False),
            "no_emoji": (no_emoji, False),
            "lang": (lang, "en"),
        }
        clean_options = {
            option: default if supplied is None else supplied
            for option, (supplied, default) in supplied_and_default.items()
        }

        # Fixed replacement tokens handed to cleantext.clean
        replace_params = {
            "replace_with_url": "<URL>",
            "replace_with_email": "<EMAIL>",
            "replace_with_phone_number": "<PHONE>",
            "replace_with_number": "<NUMBER>",
            "replace_with_digit": "0",
            "replace_with_currency_symbol": "<CUR>",
            "replace_with_punct": "",
        }

        try:
            cleaned_text = clean(text, **clean_options, **replace_params)

            # Apply any custom replacements
            if custom_replacements:
                for old, new in custom_replacements.items():
                    old = str(old)
                    if not old:
                        # str.replace("", x) would insert x between every
                        # character, which is never the intended result.
                        continue
                    cleaned_text = cleaned_text.replace(old, str(new))
            return cleaned_text
        except Exception as e:
            # Tool boundary: report the failure to the caller as text.
            logger.error("Error cleaning text: %s", e)
            return f"Error during text cleaning: {str(e)}"