from smolagents import Tool
from typing import Dict, Any, Optional
import logging

logger = logging.getLogger(__name__)


class TextCleanerTool(Tool):
    """A simplified text cleaner tool that avoids typing issues.

    Wraps the third-party ``clean-text`` library (ftfy + unidecode + hand-crafted
    regexes) as a smolagents ``Tool`` so agents can normalize messy
    user-generated text (scraped web content, social-media posts, etc.).
    """

    name = "clean_text"
    description = (
        "Cleans and normalizes text using the cleantext library. "
        "Transforms messy user-generated content into normalized text."
    )
    inputs = {
        "text": {"type": "string", "description": "The input text to clean"},
        "options": {
            "type": "object",
            "description": (
                "Optional parameters for text cleaning. Available options: "
                "fix_unicode, to_ascii, lower, no_line_breaks, no_urls, no_emails, "
                "no_phone_numbers, no_numbers, no_digits, no_currency_symbols, "
                "no_punct, no_emoji, lang"
            ),
            "optional": True,
            "nullable": True,
        },
    }
    output_type = "string"

    def forward(self, text: str, options: Optional[Dict[str, Any]] = None) -> str:
        """Clean ``text`` with the ``clean-text`` library.

        Example: turns ``A bunch of \\u2018new\\u2019 references ... »Yóù àré rïght <3!«``
        into ``a bunch of 'new' references ... "you are right <3!"``.

        Args:
            text: The input text to clean. Non-string inputs are coerced with
                ``str()`` on a best-effort basis.
            options: Optional overrides for ``cleantext.clean`` keyword
                arguments. Boolean switches: ``fix_unicode`` (fix unicode
                errors, default True), ``to_ascii`` (transliterate to closest
                ASCII, default True), ``lower`` (lowercase, default True),
                ``no_line_breaks``, ``no_urls``, ``no_emails``,
                ``no_phone_numbers``, ``no_numbers``, ``no_digits``,
                ``no_currency_symbols``, ``no_punct``, ``no_emoji`` (all
                default False). ``lang`` selects language-specific handling
                (default ``"en"``; ``"de"`` for German). Replacement tokens
                (``replace_with_url``, ``replace_with_email``,
                ``replace_with_phone_number``, ``replace_with_number``,
                ``replace_with_digit``, ``replace_with_currency_symbol``,
                ``replace_with_punct``) may also be overridden here.

        Returns:
            The cleaned text, or a human-readable ``"Error: ..."`` string if
            the input could not be processed or ``clean-text`` is missing.
            (Errors are reported as strings rather than raised so the agent
            receives a usable observation instead of a traceback.)
        """
        # Empty / falsy input short-circuits to an empty result.
        if not text:
            return ""

        # Best-effort coercion for non-string inputs rather than failing hard.
        if not isinstance(text, str):
            try:
                text = str(text)
            except Exception as e:
                logger.error(f"Failed to convert input to string: {e}")
                return f"Error: Could not process input of type {type(text)}"

        # Import lazily so the tool can be instantiated (and report a clear
        # error) even when the optional dependency is absent.
        try:
            from cleantext import clean
        except ImportError:
            logger.error(
                "cleantext package not installed. Install with: pip install clean-text"
            )
            return "Error: Required dependency 'clean-text' is not installed."

        # Default replacement tokens for the no_* switches.
        replacements = {
            "replace_with_url": "",
            "replace_with_email": "",
            "replace_with_phone_number": "",
            "replace_with_number": "",
            "replace_with_digit": "0",
            "replace_with_currency_symbol": "",
            "replace_with_punct": "",
        }

        # Default cleaning options (mirror cleantext.clean's common defaults,
        # except fix_unicode / to_ascii / lower which are enabled here).
        default_options = {
            "fix_unicode": True,
            "to_ascii": True,
            "lower": True,
            "no_line_breaks": False,
            "no_urls": False,
            "no_emails": False,
            "no_phone_numbers": False,
            "no_numbers": False,
            "no_digits": False,
            "no_currency_symbols": False,
            "no_punct": False,
            "no_emoji": False,
            "lang": "en",
        }

        # Fold user options over the defaults.
        if options:
            default_options.update(options)

        # Replacement defaults first, user-merged options second: this lets a
        # caller override replace_with_* tokens via `options`. (Previously the
        # merge order was reversed, silently discarding such overrides.)
        params = {**replacements, **default_options}

        try:
            return clean(text, **params)
        except Exception as e:
            logger.error(f"Error cleaning text: {e}")
            return f"Error during text cleaning: {str(e)}"