# scripts/text_cleaner_tool.py
from smolagents import Tool
from typing import Dict, Any, Optional
import logging

logger = logging.getLogger(__name__)


class TextCleanerTool(Tool):
"""A simplified text cleaner tool that avoids typing issues."""
name = "clean_text"
description = (
"Cleans and normalizes text using the cleantext library. "
"Transforms messy user-generated content into normalized text."
)
inputs = {
"text": {"type": "string", "description": "The input text to clean"},
"options": {
"type": "object",
"description": (
"Optional parameters for text cleaning. Available options: "
"fix_unicode, to_ascii, lower, no_line_breaks, no_urls, no_emails, "
"no_phone_numbers, no_numbers, no_digits, no_currency_symbols, "
"no_punct, no_emoji, lang"
),
"optional": True,
"nullable": True,
},
}
output_type = "string"
def forward(self, text: str, options: Optional[Dict[str, Any]] = None) -> str:
"""
        Clean text using the cleantext library with flexible options.

        User-generated content on the Web and in social media is often dirty.
        Preprocessing scraped data with `clean-text` yields a normalized text
        representation. For instance, it turns this corrupted input:
```
A bunch of \\u2018new\\u2019 references, including [Moana](https://en.wikipedia.org/wiki/Moana_%282016_film%29).
»Yóù àré rïght <3!«
```
into this clean output:
```
A bunch of 'new' references, including [moana](<URL>).
"you are right <3!"
```
        `clean-text` uses ftfy, unidecode and numerous hand-crafted rules,
        i.e., regular expressions.

        Example API:
clean("some input",
fix_unicode=True, # fix various unicode errors
to_ascii=True, # transliterate to closest ASCII representation
lower=True, # lowercase text
no_line_breaks=False, # fully strip line breaks as opposed to only normalizing them
no_urls=False, # replace all URLs with a special token
no_emails=False, # replace all email addresses with a special token
no_phone_numbers=False, # replace all phone numbers with a special token
no_numbers=False, # replace all numbers with a special token
no_digits=False, # replace all digits with a special token
no_currency_symbols=False, # replace all currency symbols with a special token
no_punct=False, # remove punctuations
replace_with_punct="", # instead of removing punctuations you may replace them
replace_with_url="<URL>",
replace_with_email="<EMAIL>",
replace_with_phone_number="<PHONE>",
replace_with_number="<NUMBER>",
replace_with_digit="0",
replace_with_currency_symbol="<CUR>",
lang="en" # set to 'de' for German special handling
)
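
        Args:
            text: The raw input text to clean. Non-string input is coerced to
                a string where possible.
            options: Optional mapping of cleaning flags (see `inputs` above),
                e.g. {"no_urls": True, "lower": False}. Unexpected keys are
                forwarded to `clean()`; failures come back as an error message.

        Returns:
            The cleaned string, or an error message string if the input could
            not be processed or the `clean-text` dependency is missing.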
"""
# Input validation
if not text:
return ""
if not isinstance(text, str):
try:
text = str(text)
except Exception as e:
logger.error(f"Failed to convert input to string: {e}")
return f"Error: Could not process input of type {type(text)}"
# Import cleantext safely
try:
from cleantext import clean
except ImportError:
logger.error(
"cleantext package not installed. Install with: pip install clean-text"
)
return "Error: Required dependency 'clean-text' is not installed."
# Default replacement tokens
replacements = {
"replace_with_url": "<URL>",
"replace_with_email": "<EMAIL>",
"replace_with_phone_number": "<PHONE>",
"replace_with_number": "<NUMBER>",
"replace_with_digit": "0",
"replace_with_currency_symbol": "<CUR>",
"replace_with_punct": "",
}
# Default options
default_options = {
"fix_unicode": True,
"to_ascii": True,
"lower": True,
"no_line_breaks": False,
"no_urls": False,
"no_emails": False,
"no_phone_numbers": False,
"no_numbers": False,
"no_digits": False,
"no_currency_symbols": False,
"no_punct": False,
"no_emoji": False,
"lang": "en",
}
        # Merge user options with defaults (user-supplied values win)
        if options:
            default_options.update(options)

        # Combine replacement tokens with the cleaning flags; any replacement
        # token supplied via `options` takes precedence over the default
        params = {**replacements, **default_options}
try:
# Apply cleantext with parameters
return clean(text, **params)
except Exception as e:
logger.error(f"Error cleaning text: {e}")
return f"Error during text cleaning: {str(e)}"