""" Text cleaning tool for smolagents. Provides a Tool implementation that wraps the cleantext library for normalizing text content with handling for various text transformation options. """ # Standard library imports import logging from typing import Dict, Any, Optional # Third-party imports from smolagents import Tool # Try to import cleantext - handle gracefully if not installed try: from cleantext import clean CLEANTEXT_AVAILABLE = True except ImportError: CLEANTEXT_AVAILABLE = False # Configure module logger logger = logging.getLogger(__name__) # pylint: disable=too-few-public-methods class TextCleanerTool(Tool): """A simplified text cleaner tool that avoids typing issues.""" name = "clean_text" description = ( "Cleans and normalizes text using the cleantext library. " "Transforms messy user-generated content into normalized text." ) inputs = { "text": {"type": "string", "description": "The input text to clean"}, "options": { "type": "object", "description": ( "Optional parameters for text cleaning. Available options: " "fix_unicode, to_ascii, lower, no_line_breaks, no_urls, " "no_emails, no_phone_numbers, no_numbers, no_digits, " "no_currency_symbols, no_punct, no_emoji, lang" ), "optional": True, "nullable": True, }, } output_type = "string" def forward(self, text: str, options: Optional[Dict[str, Any]] = None) -> str: """ Clean text using the cleantext library with flexible options. User-generated content on the Web and in social media is often dirty. Preprocess your scraped data with `clean-text` to create a normalized text representation. For instance, turn this corrupted input: ``` A bunch of \\u2018new\\u2019 references, including [Moana](https://en.wikipedia.org/wiki/Moana_%282016_film%29). »Yóù àré rïght <3!« ``` into this clean output: ``` A bunch of 'new' references, including [moana](). "you are right <3!" ``` `clean-text` uses ftfy, unidecode and numerous hand-crafted rules, i.e., RegEx. Example API: clean("some input", fix_unicode=True, # fix various unicode errors to_ascii=True, # transliterate to closest ASCII lower=True, # lowercase text no_line_breaks=False, # normalize line breaks no_urls=False, # replace URLs with a token no_emails=False, # replace email addresses with token no_phone_numbers=False, # replace phone numbers with token no_numbers=False, # replace all numbers with token no_digits=False, # replace all digits with 0 no_currency_symbols=False, # replace currency symbols with token no_punct=False, # remove punctuations replace_with_punct="", # replacement for punctuation replace_with_url="", # replacement for URLs replace_with_email="", # replacement for emails replace_with_phone_number="", # replacement for phones replace_with_number="", # replacement for numbers replace_with_digit="0", # replacement for digits replace_with_currency_symbol="", # currency replacement lang="en" # language ('en' or 'de' supported) ) """ # Input validation if not text: return "" if not isinstance(text, str): try: text = str(text) except (ValueError, TypeError) as e: logger.error("Failed to convert input to string: %s", e) return f"Error: Could not process input of type {type(text)}" # Check if cleantext is available if not CLEANTEXT_AVAILABLE: logger.error( "cleantext package not installed. " "Install with: pip install clean-text" ) return "Error: Required dependency 'clean-text' is not installed." # Default replacement tokens replacements = { "replace_with_url": "", "replace_with_email": "", "replace_with_phone_number": "", "replace_with_number": "", "replace_with_digit": "0", "replace_with_currency_symbol": "", "replace_with_punct": "", } # Default options default_options = { "fix_unicode": True, "to_ascii": True, "lower": True, "no_line_breaks": False, "no_urls": False, "no_emails": False, "no_phone_numbers": False, "no_numbers": False, "no_digits": False, "no_currency_symbols": False, "no_punct": False, "no_emoji": False, "lang": "en", } # Merge user options with defaults if options: default_options.update(options) # Merge all parameters params = {**default_options, **replacements} try: # Apply cleantext with parameters return clean(text, **params) except (ValueError, TypeError, AttributeError) as e: logger.error("Error cleaning text: %s", e) return f"Error during text cleaning: {str(e)}"