from smolagents import Tool
from typing import Dict, Optional, Any
import logging

logger = logging.getLogger(__name__)


class TextCleanerTool(Tool):
    """Agent tool that normalizes text through the ``clean-text`` package.

    Every cleaning switch is exposed as a separate boolean input. Failures
    are reported as ``"Error..."`` strings rather than raised, so the calling
    agent always receives a string result.
    """

    name = "clean_text"
    description = (
        "Cleans and normalizes text by removing or replacing unwanted elements"
    )
    # Every input except "text" has a default in forward(). smolagents' tool
    # validation expects such inputs to be flagged with "nullable": True;
    # without it the tool is rejected when it is instantiated.
    inputs = {
        "text": {"type": "string", "description": "The input text to clean"},
        "fix_unicode": {
            "type": "boolean",
            "description": "Fix broken unicode characters and mojibake",
            "default": True,
            "nullable": True,
        },
        "to_ascii": {
            "type": "boolean",
            "description": "Convert non-ASCII characters to their closest ASCII equivalents",
            "default": True,
            "nullable": True,
        },
        "lower": {
            "type": "boolean",
            "description": "Convert text to lowercase",
            "default": True,
            "nullable": True,
        },
        "no_line_breaks": {
            "type": "boolean",
            "description": "Replace line breaks with spaces",
            "default": False,
            "nullable": True,
        },
        "no_urls": {
            "type": "boolean",
            "description": "Replace URLs with a token",
            "default": False,
            "nullable": True,
        },
        "no_emails": {
            "type": "boolean",
            "description": "Replace email addresses with a token",
            "default": False,
            "nullable": True,
        },
        "no_phone_numbers": {
            "type": "boolean",
            "description": "Replace phone numbers with a token",
            "default": False,
            "nullable": True,
        },
        "no_numbers": {
            "type": "boolean",
            "description": "Replace all numbers with a token",
            "default": False,
            "nullable": True,
        },
        "no_digits": {
            "type": "boolean",
            "description": "Replace all digits with 0",
            "default": False,
            "nullable": True,
        },
        "no_currency_symbols": {
            "type": "boolean",
            "description": "Replace currency symbols with a token",
            "default": False,
            "nullable": True,
        },
        "no_punct": {
            "type": "boolean",
            "description": "Remove all punctuation",
            "default": False,
            "nullable": True,
        },
        "no_emoji": {
            "type": "boolean",
            "description": "Remove all emoji characters",
            "default": False,
            "nullable": True,
        },
        "lang": {
            "type": "string",
            "description": "Language code for special handling ('en' or 'de' supported)",
            "default": "en",
            "nullable": True,
        },
        "custom_replacements": {
            "type": "object",
            "description": "Dictionary of custom string replacements to apply",
            "optional": True,
            "nullable": True,
        },
    }
    output_type = "string"

    def forward(
        self,
        text: str,
        fix_unicode: bool = True,
        to_ascii: bool = True,
        lower: bool = True,
        no_line_breaks: bool = False,
        no_urls: bool = False,
        no_emails: bool = False,
        no_phone_numbers: bool = False,
        no_numbers: bool = False,
        no_digits: bool = False,
        no_currency_symbols: bool = False,
        no_punct: bool = False,
        no_emoji: bool = False,
        lang: str = "en",
        custom_replacements: Optional[Dict[str, str]] = None,
    ) -> str:
        """Clean and normalize text by removing or replacing unwanted elements.

        Args:
            text: Text to clean. Falsy values yield ``""``; other non-string
                values are converted with ``str()`` first.
            fix_unicode, to_ascii, lower, no_line_breaks, no_urls, no_emails,
            no_phone_numbers, no_numbers, no_digits, no_currency_symbols,
            no_punct, no_emoji, lang: Forwarded unchanged to
                ``cleantext.clean``.
            custom_replacements: Optional mapping of substring -> replacement,
                applied in iteration order after ``clean`` has run.

        Returns:
            The cleaned text, or a message starting with ``"Error"`` when the
            input cannot be stringified, ``clean-text`` is not installed, or
            cleaning fails. No exception is propagated to the caller.
        """
        # Input validation
        if not text:
            return ""

        if not isinstance(text, str):
            try:
                text = str(text)
            except Exception as e:
                logger.error("Failed to convert input to string: %s", e)
                return f"Error: Could not process input of type {type(text)}"

        # Imported lazily so a missing optional dependency yields an error
        # string at call time instead of breaking import of this module.
        try:
            from cleantext import clean
        except ImportError:
            logger.error(
                "cleantext package not installed. Install with: pip install clean-text"
            )
            return "Error: Required dependency 'clean-text' is not installed."

        # Replacement tokens handed to clean(): all empty strings, so matched
        # items are dropped rather than swapped for a placeholder; the one
        # exception is digits, which become "0".
        replace_params = {
            "replace_with_url": "",
            "replace_with_email": "",
            "replace_with_phone_number": "",
            "replace_with_number": "",
            "replace_with_digit": "0",
            "replace_with_currency_symbol": "",
            "replace_with_punct": "",
        }

        try:
            # Apply cleantext with parameters
            cleaned_text = clean(
                text,
                fix_unicode=fix_unicode,
                to_ascii=to_ascii,
                lower=lower,
                no_line_breaks=no_line_breaks,
                no_urls=no_urls,
                no_emails=no_emails,
                no_phone_numbers=no_phone_numbers,
                no_numbers=no_numbers,
                no_digits=no_digits,
                no_currency_symbols=no_currency_symbols,
                no_punct=no_punct,
                no_emoji=no_emoji,
                lang=lang,
                **replace_params,
            )

            # Custom replacements run on the already-cleaned text, so keys
            # must match the post-cleaning form (e.g. lowercased).
            if custom_replacements:
                for old, new in custom_replacements.items():
                    cleaned_text = cleaned_text.replace(old, new)

            return cleaned_text

        except Exception as e:
            logger.error("Error cleaning text: %s", e)
            return f"Error during text cleaning: {str(e)}"