Spaces:
Runtime error
Runtime error
Leonardo
committed on
Update scripts/text_cleaner_tool.py
Browse files - scripts/text_cleaner_tool.py +59 -36
scripts/text_cleaner_tool.py
CHANGED
|
@@ -1,10 +1,30 @@
|
|
| 1 |
-
|
| 2 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3 |
import logging
|
|
|
|
| 4 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5 |
logger = logging.getLogger(__name__)
|
| 6 |
|
| 7 |
|
|
|
|
| 8 |
class TextCleanerTool(Tool):
|
| 9 |
"""A simplified text cleaner tool that avoids typing issues."""
|
| 10 |
|
|
@@ -19,9 +39,9 @@ class TextCleanerTool(Tool):
|
|
| 19 |
"type": "object",
|
| 20 |
"description": (
|
| 21 |
"Optional parameters for text cleaning. Available options: "
|
| 22 |
-
"fix_unicode, to_ascii, lower, no_line_breaks, no_urls,
|
| 23 |
-
"no_phone_numbers, no_numbers, no_digits,
|
| 24 |
-
"no_punct, no_emoji, lang"
|
| 25 |
),
|
| 26 |
"optional": True,
|
| 27 |
"nullable": True,
|
|
@@ -33,10 +53,13 @@ class TextCleanerTool(Tool):
|
|
| 33 |
"""
|
| 34 |
Clean text using the cleantext library with flexible options.
|
| 35 |
|
| 36 |
-
User-generated content on the Web and in social media is often dirty.
|
|
|
|
|
|
|
| 37 |
|
| 38 |
```
|
| 39 |
-
A bunch of \\u2018new\\u2019 references, including
|
|
|
|
| 40 |
|
| 41 |
|
| 42 |
»Yóù àré rïght <3!«
|
|
@@ -50,29 +73,30 @@ class TextCleanerTool(Tool):
|
|
| 50 |
"you are right <3!"
|
| 51 |
```
|
| 52 |
|
| 53 |
-
`clean-text` uses ftfy, unidecode and numerous hand-crafted rules,
|
|
|
|
| 54 |
|
| 55 |
Example API:
|
| 56 |
clean("some input",
|
| 57 |
-
fix_unicode=True,
|
| 58 |
-
to_ascii=True,
|
| 59 |
-
lower=True,
|
| 60 |
-
no_line_breaks=False,
|
| 61 |
-
no_urls=False,
|
| 62 |
-
no_emails=False,
|
| 63 |
-
no_phone_numbers=False,
|
| 64 |
-
no_numbers=False,
|
| 65 |
-
no_digits=False,
|
| 66 |
-
no_currency_symbols=False,
|
| 67 |
-
no_punct=False,
|
| 68 |
-
replace_with_punct="",
|
| 69 |
-
replace_with_url="<URL>",
|
| 70 |
-
replace_with_email="<EMAIL>",
|
| 71 |
-
replace_with_phone_number="<PHONE>",
|
| 72 |
-
replace_with_number="<NUMBER>",
|
| 73 |
-
replace_with_digit="0",
|
| 74 |
-
replace_with_currency_symbol="<CUR>",
|
| 75 |
-
lang="en"
|
| 76 |
)
|
| 77 |
"""
|
| 78 |
# Input validation
|
|
@@ -82,16 +106,15 @@ class TextCleanerTool(Tool):
|
|
| 82 |
if not isinstance(text, str):
|
| 83 |
try:
|
| 84 |
text = str(text)
|
| 85 |
-
except
|
| 86 |
-
logger.error(
|
| 87 |
return f"Error: Could not process input of type {type(text)}"
|
| 88 |
|
| 89 |
-
#
|
| 90 |
-
|
| 91 |
-
from cleantext import clean
|
| 92 |
-
except ImportError:
|
| 93 |
logger.error(
|
| 94 |
-
"cleantext package not installed.
|
|
|
|
| 95 |
)
|
| 96 |
return "Error: Required dependency 'clean-text' is not installed."
|
| 97 |
|
|
@@ -133,6 +156,6 @@ class TextCleanerTool(Tool):
|
|
| 133 |
try:
|
| 134 |
# Apply cleantext with parameters
|
| 135 |
return clean(text, **params)
|
| 136 |
-
except
|
| 137 |
-
logger.error(
|
| 138 |
return f"Error during text cleaning: {str(e)}"
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Text cleaning tool for smolagents.
|
| 3 |
+
|
| 4 |
+
Provides a Tool implementation that wraps the cleantext library for normalizing
|
| 5 |
+
text content with handling for various text transformation options.
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
# Standard library imports
|
| 9 |
import logging
|
| 10 |
+
from typing import Dict, Any, Optional
|
| 11 |
|
| 12 |
+
# Third-party imports
|
| 13 |
+
from smolagents import Tool
|
| 14 |
+
|
| 15 |
+
# Try to import cleantext - handle gracefully if not installed
|
| 16 |
+
try:
|
| 17 |
+
from cleantext import clean
|
| 18 |
+
|
| 19 |
+
CLEANTEXT_AVAILABLE = True
|
| 20 |
+
except ImportError:
|
| 21 |
+
CLEANTEXT_AVAILABLE = False
|
| 22 |
+
|
| 23 |
+
# Configure module logger
|
| 24 |
logger = logging.getLogger(__name__)
|
| 25 |
|
| 26 |
|
| 27 |
+
# pylint: disable=too-few-public-methods
|
| 28 |
class TextCleanerTool(Tool):
|
| 29 |
"""A simplified text cleaner tool that avoids typing issues."""
|
| 30 |
|
|
|
|
| 39 |
"type": "object",
|
| 40 |
"description": (
|
| 41 |
"Optional parameters for text cleaning. Available options: "
|
| 42 |
+
"fix_unicode, to_ascii, lower, no_line_breaks, no_urls, "
|
| 43 |
+
"no_emails, no_phone_numbers, no_numbers, no_digits, "
|
| 44 |
+
"no_currency_symbols, no_punct, no_emoji, lang"
|
| 45 |
),
|
| 46 |
"optional": True,
|
| 47 |
"nullable": True,
|
|
|
|
| 53 |
"""
|
| 54 |
Clean text using the cleantext library with flexible options.
|
| 55 |
|
| 56 |
+
User-generated content on the Web and in social media is often dirty.
|
| 57 |
+
Preprocess your scraped data with `clean-text` to create a normalized
|
| 58 |
+
text representation. For instance, turn this corrupted input:
|
| 59 |
|
| 60 |
```
|
| 61 |
+
A bunch of \\u2018new\\u2019 references, including
|
| 62 |
+
[Moana](https://en.wikipedia.org/wiki/Moana_%282016_film%29).
|
| 63 |
|
| 64 |
|
| 65 |
»Yóù àré rïght <3!«
|
|
|
|
| 73 |
"you are right <3!"
|
| 74 |
```
|
| 75 |
|
| 76 |
+
`clean-text` uses ftfy, unidecode and numerous hand-crafted rules,
|
| 77 |
+
i.e., RegEx.
|
| 78 |
|
| 79 |
Example API:
|
| 80 |
clean("some input",
|
| 81 |
+
fix_unicode=True, # fix various unicode errors
|
| 82 |
+
to_ascii=True, # transliterate to closest ASCII
|
| 83 |
+
lower=True, # lowercase text
|
| 84 |
+
no_line_breaks=False, # normalize line breaks
|
| 85 |
+
no_urls=False, # replace URLs with a token
|
| 86 |
+
no_emails=False, # replace email addresses with token
|
| 87 |
+
no_phone_numbers=False, # replace phone numbers with token
|
| 88 |
+
no_numbers=False, # replace all numbers with token
|
| 89 |
+
no_digits=False, # replace all digits with 0
|
| 90 |
+
no_currency_symbols=False, # replace currency symbols with token
|
| 91 |
+
no_punct=False, # remove punctuations
|
| 92 |
+
replace_with_punct="", # replacement for punctuation
|
| 93 |
+
replace_with_url="<URL>", # replacement for URLs
|
| 94 |
+
replace_with_email="<EMAIL>", # replacement for emails
|
| 95 |
+
replace_with_phone_number="<PHONE>", # replacement for phones
|
| 96 |
+
replace_with_number="<NUMBER>", # replacement for numbers
|
| 97 |
+
replace_with_digit="0", # replacement for digits
|
| 98 |
+
replace_with_currency_symbol="<CUR>", # currency replacement
|
| 99 |
+
lang="en" # language ('en' or 'de' supported)
|
| 100 |
)
|
| 101 |
"""
|
| 102 |
# Input validation
|
|
|
|
| 106 |
if not isinstance(text, str):
|
| 107 |
try:
|
| 108 |
text = str(text)
|
| 109 |
+
except (ValueError, TypeError) as e:
|
| 110 |
+
logger.error("Failed to convert input to string: %s", e)
|
| 111 |
return f"Error: Could not process input of type {type(text)}"
|
| 112 |
|
| 113 |
+
# Check if cleantext is available
|
| 114 |
+
if not CLEANTEXT_AVAILABLE:
|
|
|
|
|
|
|
| 115 |
logger.error(
|
| 116 |
+
"cleantext package not installed. "
|
| 117 |
+
"Install with: pip install clean-text"
|
| 118 |
)
|
| 119 |
return "Error: Required dependency 'clean-text' is not installed."
|
| 120 |
|
|
|
|
| 156 |
try:
|
| 157 |
# Apply cleantext with parameters
|
| 158 |
return clean(text, **params)
|
| 159 |
+
except (ValueError, TypeError, AttributeError) as e:
|
| 160 |
+
logger.error("Error cleaning text: %s", e)
|
| 161 |
return f"Error during text cleaning: {str(e)}"
|