OpenDeepResearch

Runtime error

App Files Files Community

Leonardo committed on Mar 28, 2025

Commit

4c54085

verified ·

1 Parent(s): fe63cd7

Update scripts/text_cleaner_tool.py

Browse files

Files changed (1) hide show

scripts/text_cleaner_tool.py +59 -36

scripts/text_cleaner_tool.py CHANGED Viewed

@@ -1,10 +1,30 @@
-from smolagents import Tool
-from typing import Dict, Any, Optional
 import logging
 logger = logging.getLogger(__name__)
 class TextCleanerTool(Tool):
     """A simplified text cleaner tool that avoids typing issues."""
@@ -19,9 +39,9 @@ class TextCleanerTool(Tool):
             "type": "object",
             "description": (
                 "Optional parameters for text cleaning. Available options: "
-                "fix_unicode, to_ascii, lower, no_line_breaks, no_urls, no_emails, "
-                "no_phone_numbers, no_numbers, no_digits, no_currency_symbols, "
-                "no_punct, no_emoji, lang"
             ),
             "optional": True,
             "nullable": True,
@@ -33,10 +53,13 @@ class TextCleanerTool(Tool):
         """
         Clean text using the cleantext library with flexible options.
-        User-generated content on the Web and in social media is often dirty. Preprocess your scraped data with `clean-text` to create a normalized text representation. For instance, turn this corrupted input:
         ```
-        A bunch of \\u2018new\\u2019 references, including [Moana](https://en.wikipedia.org/wiki/Moana_%282016_film%29).
         »Yóù àré     rïght &lt;3!«
@@ -50,29 +73,30 @@ class TextCleanerTool(Tool):
         "you are right <3!"
         ```
-        `clean-text` uses ftfy, unidecode and numerous hand-crafted rules, i.e., RegEx.
         Example API:
         clean("some input",
-            fix_unicode=True,               # fix various unicode errors
-            to_ascii=True,                  # transliterate to closest ASCII representation
-            lower=True,                     # lowercase text
-            no_line_breaks=False,           # fully strip line breaks as opposed to only normalizing them
-            no_urls=False,                  # replace all URLs with a special token
-            no_emails=False,                # replace all email addresses with a special token
-            no_phone_numbers=False,         # replace all phone numbers with a special token
-            no_numbers=False,               # replace all numbers with a special token
-            no_digits=False,                # replace all digits with a special token
-            no_currency_symbols=False,      # replace all currency symbols with a special token
-            no_punct=False,                 # remove punctuations
-            replace_with_punct="",          # instead of removing punctuations you may replace them
-            replace_with_url="<URL>",
-            replace_with_email="<EMAIL>",
-            replace_with_phone_number="<PHONE>",
-            replace_with_number="<NUMBER>",
-            replace_with_digit="0",
-            replace_with_currency_symbol="<CUR>",
-            lang="en"                       # set to 'de' for German special handling
         )
         """
         # Input validation
@@ -82,16 +106,15 @@ class TextCleanerTool(Tool):
         if not isinstance(text, str):
             try:
                 text = str(text)
-            except Exception as e:
-                logger.error(f"Failed to convert input to string: {e}")
                 return f"Error: Could not process input of type {type(text)}"
-        # Import cleantext safely
-        try:
-            from cleantext import clean
-        except ImportError:
             logger.error(
-                "cleantext package not installed. Install with: pip install clean-text"
             )
             return "Error: Required dependency 'clean-text' is not installed."
@@ -133,6 +156,6 @@ class TextCleanerTool(Tool):
         try:
             # Apply cleantext with parameters
             return clean(text, **params)
-        except Exception as e:
-            logger.error(f"Error cleaning text: {e}")
             return f"Error during text cleaning: {str(e)}"

+"""
+Text cleaning tool for smolagents.
+Provides a Tool implementation that wraps the cleantext library for normalizing
+text content with handling for various text transformation options.
+"""
+# Standard library imports
 import logging
+from typing import Dict, Any, Optional
+# Third-party imports
+from smolagents import Tool
+# Try to import cleantext - handle gracefully if not installed
+try:
+    from cleantext import clean
+    CLEANTEXT_AVAILABLE = True
+except ImportError:
+    CLEANTEXT_AVAILABLE = False
+# Configure module logger
 logger = logging.getLogger(__name__)
+# pylint: disable=too-few-public-methods
 class TextCleanerTool(Tool):
     """A simplified text cleaner tool that avoids typing issues."""
             "type": "object",
             "description": (
                 "Optional parameters for text cleaning. Available options: "
+                "fix_unicode, to_ascii, lower, no_line_breaks, no_urls, "
+                "no_emails, no_phone_numbers, no_numbers, no_digits, "
+                "no_currency_symbols, no_punct, no_emoji, lang"
             ),
             "optional": True,
             "nullable": True,
         """
         Clean text using the cleantext library with flexible options.
+        User-generated content on the Web and in social media is often dirty.
+        Preprocess your scraped data with `clean-text` to create a normalized
+        text representation. For instance, turn this corrupted input:
         ```
+        A bunch of \\u2018new\\u2019 references, including
+        [Moana](https://en.wikipedia.org/wiki/Moana_%282016_film%29).
         »Yóù àré     rïght &lt;3!«
         "you are right <3!"
         ```
+        `clean-text` uses ftfy, unidecode and numerous hand-crafted rules,
+        i.e., RegEx.
         Example API:
         clean("some input",
+            fix_unicode=True,          # fix various unicode errors
+            to_ascii=True,             # transliterate to closest ASCII
+            lower=True,                # lowercase text
+            no_line_breaks=False,      # fully strip line breaks (not just normalize)
+            no_urls=False,             # replace URLs with a token
+            no_emails=False,           # replace email addresses with token
+            no_phone_numbers=False,    # replace phone numbers with token
+            no_numbers=False,          # replace all numbers with token
+            no_digits=False,           # replace all digits with 0
+            no_currency_symbols=False, # replace currency symbols with token
+            no_punct=False,            # remove punctuations
+            replace_with_punct="",     # replacement for punctuation
+            replace_with_url="<URL>",  # replacement for URLs
+            replace_with_email="<EMAIL>", # replacement for emails
+            replace_with_phone_number="<PHONE>", # replacement for phones
+            replace_with_number="<NUMBER>", # replacement for numbers
+            replace_with_digit="0",    # replacement for digits
+            replace_with_currency_symbol="<CUR>", # currency replacement
+            lang="en"                  # language ('en' or 'de' supported)
         )
         """
         # Input validation
         if not isinstance(text, str):
             try:
                 text = str(text)
+            except (ValueError, TypeError) as e:
+                logger.error("Failed to convert input to string: %s", e)
                 return f"Error: Could not process input of type {type(text)}"
+        # Check if cleantext is available
+        if not CLEANTEXT_AVAILABLE:
             logger.error(
+                "cleantext package not installed. "
+                "Install with: pip install clean-text"
             )
             return "Error: Required dependency 'clean-text' is not installed."
         try:
             # Apply cleantext with parameters
             return clean(text, **params)
+        except (ValueError, TypeError, AttributeError) as e:
+            logger.error("Error cleaning text: %s", e)
             return f"Error during text cleaning: {str(e)}"