OpenDeepResearch

Runtime error

File size: 5,230 Bytes

from smolagents import Tool
from typing import Dict, Any, Optional
import logging

# Module-level logger, named after this module; the tool below uses it to
# report conversion, import and cleaning errors.
logger = logging.getLogger(__name__)


class TextCleanerTool(Tool):
    """Tool that normalizes messy text by delegating to ``cleantext.clean``.

    Problems (missing dependency, invalid ``options``, errors raised by the
    library) are logged and reported to the caller as an ``"Error..."``
    string instead of being raised.
    """

    name = "clean_text"
    description = (
        "Cleans and normalizes text using the cleantext library. "
        "Transforms messy user-generated content into normalized text."
    )
    inputs = {
        "text": {"type": "string", "description": "The input text to clean"},
        "options": {
            "type": "object",
            "description": (
                "Optional parameters for text cleaning. Available options: "
                "fix_unicode, to_ascii, lower, no_line_breaks, no_urls, no_emails, "
                "no_phone_numbers, no_numbers, no_digits, no_currency_symbols, "
                "no_punct, no_emoji, lang"
            ),
            "optional": True,
            "nullable": True,
        },
    }
    output_type = "string"

    def forward(self, text: str, options: Optional[Dict[str, Any]] = None) -> str:
        """
        Clean text using the cleantext library with flexible options.

        User-generated content on the Web and in social media is often dirty.
        Preprocess your scraped data with `clean-text` to create a normalized
        text representation. For instance, turn this corrupted input:

        ```
        A bunch of \\u2018new\\u2019 references, including [Moana](https://en.wikipedia.org/wiki/Moana_%282016_film%29).


        »Yóù àré     rïght &lt;3!«
        ```

        into this clean output:

        ```
        A bunch of 'new' references, including [moana](<URL>).

        "you are right <3!"
        ```

        `clean-text` uses ftfy, unidecode and numerous hand-crafted rules, i.e., RegEx.

        Example API:
        clean("some input",
            fix_unicode=True,               # fix various unicode errors
            to_ascii=True,                  # transliterate to closest ASCII representation
            lower=True,                     # lowercase text
            no_line_breaks=False,           # fully strip line breaks as opposed to only normalizing them
            no_urls=False,                  # replace all URLs with a special token
            no_emails=False,                # replace all email addresses with a special token
            no_phone_numbers=False,         # replace all phone numbers with a special token
            no_numbers=False,               # replace all numbers with a special token
            no_digits=False,                # replace all digits with a special token
            no_currency_symbols=False,      # replace all currency symbols with a special token
            no_punct=False,                 # remove punctuations
            replace_with_punct="",          # instead of removing punctuations you may replace them
            replace_with_url="<URL>",
            replace_with_email="<EMAIL>",
            replace_with_phone_number="<PHONE>",
            replace_with_number="<NUMBER>",
            replace_with_digit="0",
            replace_with_currency_symbol="<CUR>",
            lang="en"                       # set to 'de' for German special handling
        )

        Args:
            text: The text to clean. Falsy values yield an empty string;
                other non-string values are converted with ``str()`` first.
            options: Optional mapping of keyword arguments for ``clean``.
                Entries override the defaults shown above, including the
                ``replace_with_*`` tokens.

        Returns:
            The cleaned text, ``""`` for falsy input, or a string starting
            with ``"Error"`` when the input, the options or the library call
            could not be processed.
        """
        # Falsy input (None, "", ...) has nothing to clean.
        if not text:
            return ""

        if not isinstance(text, str):
            try:
                text = str(text)
            except Exception as e:
                logger.error("Failed to convert input to string: %s", e)
                return f"Error: Could not process input of type {type(text)}"

        # Imported lazily so that a missing dependency is reported as an error
        # string from this call rather than as an ImportError at module import.
        try:
            from cleantext import clean
        except ImportError:
            logger.error(
                "cleantext package not installed. Install with: pip install clean-text"
            )
            return "Error: Required dependency 'clean-text' is not installed."

        # Reject non-dict options explicitly: dict.update() on e.g. a JSON
        # string would otherwise raise outside of any error handling.
        if options is not None and not isinstance(options, dict):
            options_type = type(options).__name__
            logger.error("Invalid 'options' of type %s; expected a dict", options_type)
            return f"Error: 'options' must be a dictionary, got {options_type}"

        # Defaults for every keyword forwarded to clean().
        params = {
            # Cleaning switches
            "fix_unicode": True,
            "to_ascii": True,
            "lower": True,
            "no_line_breaks": False,
            "no_urls": False,
            "no_emails": False,
            "no_phone_numbers": False,
            "no_numbers": False,
            "no_digits": False,
            "no_currency_symbols": False,
            "no_punct": False,
            "no_emoji": False,
            "lang": "en",
            # Replacement tokens
            "replace_with_url": "<URL>",
            "replace_with_email": "<EMAIL>",
            "replace_with_phone_number": "<PHONE>",
            "replace_with_number": "<NUMBER>",
            "replace_with_digit": "0",
            "replace_with_currency_symbol": "<CUR>",
            "replace_with_punct": "",
        }

        # User options are applied last so they override every default,
        # including the replacement tokens.
        if options:
            params.update(options)

        try:
            # Unknown or malformed option keys surface here as a TypeError
            # and are reported through the same error path.
            return clean(text, **params)
        except Exception as e:
            logger.error("Error cleaning text: %s", e)
            return f"Error during text cleaning: {str(e)}"