OpenDeepResearch / scripts /text_cleaner_tool.py
Leonardo
Create text_cleaner_tool.py
1e9a96d verified
raw
history blame
5.45 kB
from smolagents import Tool
from typing import Dict, Optional, Any
import logging
logger = logging.getLogger(__name__)
class TextCleanerTool(Tool):
    """Agent tool that cleans and normalizes text with the ``clean-text`` package.

    The tool forwards its boolean switches to ``cleantext.clean`` using fixed
    replacement tokens (``<URL>``, ``<EMAIL>``, ``<PHONE>``, ``<NUMBER>``,
    ``0`` for digits, ``<CUR>``, and the empty string for punctuation), then
    applies optional literal string replacements supplied by the caller.

    Failures are reported as ``"Error..."`` strings instead of exceptions so
    that the calling agent always receives a string result.
    """

    name = "clean_text"
    description = (
        "Cleans and normalizes text by removing or replacing unwanted elements"
    )
    # Every argument that has a default value in ``forward`` is marked
    # ``nullable``: smolagents' Tool argument validation expects defaulted
    # parameters of ``forward`` to be declared nullable in ``inputs``.
    inputs = {
        "text": {"type": "string", "description": "The input text to clean"},
        "fix_unicode": {
            "type": "boolean",
            "description": "Fix broken unicode characters and mojibake",
            "default": True,
            "nullable": True,
        },
        "to_ascii": {
            "type": "boolean",
            "description": "Convert non-ASCII characters to their closest ASCII equivalents",
            "default": True,
            "nullable": True,
        },
        "lower": {
            "type": "boolean",
            "description": "Convert text to lowercase",
            "default": True,
            "nullable": True,
        },
        "no_line_breaks": {
            "type": "boolean",
            "description": "Replace line breaks with spaces",
            "default": False,
            "nullable": True,
        },
        "no_urls": {
            "type": "boolean",
            "description": "Replace URLs with a token",
            "default": False,
            "nullable": True,
        },
        "no_emails": {
            "type": "boolean",
            "description": "Replace email addresses with a token",
            "default": False,
            "nullable": True,
        },
        "no_phone_numbers": {
            "type": "boolean",
            "description": "Replace phone numbers with a token",
            "default": False,
            "nullable": True,
        },
        "no_numbers": {
            "type": "boolean",
            "description": "Replace all numbers with a token",
            "default": False,
            "nullable": True,
        },
        "no_digits": {
            "type": "boolean",
            "description": "Replace all digits with 0",
            "default": False,
            "nullable": True,
        },
        "no_currency_symbols": {
            "type": "boolean",
            "description": "Replace currency symbols with a token",
            "default": False,
            "nullable": True,
        },
        "no_punct": {
            "type": "boolean",
            "description": "Remove all punctuation",
            "default": False,
            "nullable": True,
        },
        "no_emoji": {
            "type": "boolean",
            "description": "Remove all emoji characters",
            "default": False,
            "nullable": True,
        },
        "lang": {
            "type": "string",
            "description": "Language code for special handling ('en' or 'de' supported)",
            "default": "en",
            "nullable": True,
        },
        "custom_replacements": {
            "type": "object",
            "description": "Dictionary of custom string replacements to apply",
            "optional": True,
            "nullable": True,
        },
    }
    output_type = "string"

    def forward(
        self,
        text: str,
        fix_unicode: bool = True,
        to_ascii: bool = True,
        lower: bool = True,
        no_line_breaks: bool = False,
        no_urls: bool = False,
        no_emails: bool = False,
        no_phone_numbers: bool = False,
        no_numbers: bool = False,
        no_digits: bool = False,
        no_currency_symbols: bool = False,
        no_punct: bool = False,
        no_emoji: bool = False,
        lang: str = "en",
        custom_replacements: Optional[Dict[str, str]] = None,
    ) -> str:
        """Clean and normalize text by removing or replacing unwanted elements.

        Args:
            text: Text to clean. Falsy input yields ``""``; other non-string
                input is converted with ``str()`` first.
            fix_unicode: Fix broken unicode characters and mojibake.
            to_ascii: Convert non-ASCII characters to ASCII equivalents.
            lower: Convert the text to lowercase.
            no_line_breaks: Replace line breaks with spaces.
            no_urls: Replace URLs with ``<URL>``.
            no_emails: Replace email addresses with ``<EMAIL>``.
            no_phone_numbers: Replace phone numbers with ``<PHONE>``.
            no_numbers: Replace numbers with ``<NUMBER>``.
            no_digits: Replace every digit with ``0``.
            no_currency_symbols: Replace currency symbols with ``<CUR>``.
            no_punct: Remove punctuation.
            no_emoji: Remove emoji characters.
            lang: Language code passed to ``cleantext.clean``; ``None`` or an
                empty value falls back to ``"en"``.
            custom_replacements: Mapping of literal substrings to their
                replacements, applied in mapping order after cleaning.
                Empty-string keys are ignored.

        Returns:
            The cleaned text, ``""`` for falsy input, or a message starting
            with ``"Error"`` when the dependency is missing or cleaning fails.
        """
        # Input validation
        if not text:
            return ""
        if not isinstance(text, str):
            try:
                text = str(text)
            except Exception as e:
                logger.error("Failed to convert input to string: %s", e)
                return f"Error: Could not process input of type {type(text)}"

        # Imported lazily so a missing optional dependency is reported to the
        # agent as a readable message rather than breaking module import.
        try:
            from cleantext import clean
        except ImportError:
            logger.error(
                "cleantext package not installed. Install with: pip install clean-text"
            )
            return "Error: Required dependency 'clean-text' is not installed."

        # A nullable input may arrive as None; use the documented default.
        lang = lang or "en"

        # Fixed tokens substituted for each element class that gets replaced.
        replace_params = {
            "replace_with_url": "<URL>",
            "replace_with_email": "<EMAIL>",
            "replace_with_phone_number": "<PHONE>",
            "replace_with_number": "<NUMBER>",
            "replace_with_digit": "0",
            "replace_with_currency_symbol": "<CUR>",
            "replace_with_punct": "",
        }

        try:
            # Apply cleantext with parameters
            cleaned_text = clean(
                text,
                fix_unicode=fix_unicode,
                to_ascii=to_ascii,
                lower=lower,
                no_line_breaks=no_line_breaks,
                no_urls=no_urls,
                no_emails=no_emails,
                no_phone_numbers=no_phone_numbers,
                no_numbers=no_numbers,
                no_digits=no_digits,
                no_currency_symbols=no_currency_symbols,
                no_punct=no_punct,
                no_emoji=no_emoji,
                lang=lang,
                **replace_params,
            )

            # Apply any custom replacements
            if custom_replacements:
                for old, new in custom_replacements.items():
                    # str.replace("", new) would insert `new` between every
                    # character of the text, which is never the intent here.
                    if old == "":
                        continue
                    cleaned_text = cleaned_text.replace(old, new)
            return cleaned_text
        except Exception as e:
            # Tool boundary: log with traceback and hand the agent a string.
            logger.exception("Error cleaning text: %s", e)
            return f"Error during text cleaning: {str(e)}"