OpenDeepResearch

Runtime error

OpenDeepResearch / scripts /text_cleaner_tool.py

Leonardo

Create text_cleaner_tool.py

1e9a96d verified 9 months ago

5.45 kB

	from smolagents import Tool
	from typing import Dict, Optional, Any
	import logging

	# Module-level logger named after this module; TextCleanerTool uses it to report failures.
	logger = logging.getLogger(__name__)


	class TextCleanerTool(Tool):
	    """smolagents tool that cleans and normalizes text with the ``clean-text`` package.

	    The tool wraps ``cleantext.clean`` and exposes its switches as tool inputs.
	    Failures are reported as ``"Error..."`` strings rather than raised.
	    """

	    name = "clean_text"
	    description = (
	        "Cleans and normalizes text by removing or replacing unwanted elements"
	    )
	    # smolagents compares this schema with forward()'s signature when the tool
	    # is instantiated and expects every parameter that has a default value to
	    # be declared with "nullable": True; without it the tool fails validation.
	    # The "default" / "optional" keys are kept as informational metadata.
	    inputs = {
	        "text": {"type": "string", "description": "The input text to clean"},
	        "fix_unicode": {
	            "type": "boolean",
	            "description": "Fix broken unicode characters and mojibake",
	            "default": True,
	            "nullable": True,
	        },
	        "to_ascii": {
	            "type": "boolean",
	            "description": "Convert non-ASCII characters to their closest ASCII equivalents",
	            "default": True,
	            "nullable": True,
	        },
	        "lower": {
	            "type": "boolean",
	            "description": "Convert text to lowercase",
	            "default": True,
	            "nullable": True,
	        },
	        "no_line_breaks": {
	            "type": "boolean",
	            "description": "Replace line breaks with spaces",
	            "default": False,
	            "nullable": True,
	        },
	        "no_urls": {
	            "type": "boolean",
	            "description": "Replace URLs with a token",
	            "default": False,
	            "nullable": True,
	        },
	        "no_emails": {
	            "type": "boolean",
	            "description": "Replace email addresses with a token",
	            "default": False,
	            "nullable": True,
	        },
	        "no_phone_numbers": {
	            "type": "boolean",
	            "description": "Replace phone numbers with a token",
	            "default": False,
	            "nullable": True,
	        },
	        "no_numbers": {
	            "type": "boolean",
	            "description": "Replace all numbers with a token",
	            "default": False,
	            "nullable": True,
	        },
	        "no_digits": {
	            "type": "boolean",
	            "description": "Replace all digits with 0",
	            "default": False,
	            "nullable": True,
	        },
	        "no_currency_symbols": {
	            "type": "boolean",
	            "description": "Replace currency symbols with a token",
	            "default": False,
	            "nullable": True,
	        },
	        "no_punct": {
	            "type": "boolean",
	            "description": "Remove all punctuation",
	            "default": False,
	            "nullable": True,
	        },
	        "no_emoji": {
	            "type": "boolean",
	            "description": "Remove all emoji characters",
	            "default": False,
	            "nullable": True,
	        },
	        "lang": {
	            "type": "string",
	            "description": "Language code for special handling ('en' or 'de' supported)",
	            "default": "en",
	            "nullable": True,
	        },
	        "custom_replacements": {
	            "type": "object",
	            "description": "Dictionary of custom string replacements to apply",
	            "optional": True,
	            "nullable": True,
	        },
	    }
	    output_type = "string"

	    def forward(
	        self,
	        text: str,
	        fix_unicode: bool = True,
	        to_ascii: bool = True,
	        lower: bool = True,
	        no_line_breaks: bool = False,
	        no_urls: bool = False,
	        no_emails: bool = False,
	        no_phone_numbers: bool = False,
	        no_numbers: bool = False,
	        no_digits: bool = False,
	        no_currency_symbols: bool = False,
	        no_punct: bool = False,
	        no_emoji: bool = False,
	        lang: str = "en",
	        custom_replacements: Optional[Dict[str, str]] = None,
	    ) -> str:
	        """Clean and normalize text by removing or replacing unwanted elements.

	        Args:
	            text: Text to clean. Falsy input yields ``""``; other non-string
	                input is converted with ``str()``.
	            fix_unicode, to_ascii, lower, no_line_breaks, no_urls, no_emails,
	            no_phone_numbers, no_numbers, no_digits, no_currency_symbols,
	            no_punct, no_emoji, lang: Forwarded to ``cleantext.clean`` under
	                the same keyword names. ``None`` is treated as "use the
	                default declared in ``inputs``".
	            custom_replacements: Optional ``{old: new}`` mapping applied with
	                ``str.replace`` after cleaning, in the mapping's iteration order.

	        Returns:
	            The cleaned text, or a string starting with ``"Error"`` when the
	            input cannot be converted, ``clean-text`` is not installed, or
	            cleaning fails.
	        """
	        # Input validation
	        if not text:
	            return ""

	        if not isinstance(text, str):
	            try:
	                text = str(text)
	            except Exception as e:
	                logger.error("Failed to convert input to string: %s", e)
	                return f"Error: Could not process input of type {type(text)}"

	        # Import lazily so a missing dependency becomes a readable tool error
	        # instead of breaking the import of this module.
	        try:
	            from cleantext import clean
	        except ImportError:
	            logger.error(
	                "cleantext package not installed. Install with: pip install clean-text"
	            )
	            return "Error: Required dependency 'clean-text' is not installed."

	        # Replacement tokens handed to clean() for each masked element kind.
	        replace_params = {
	            "replace_with_url": "<URL>",
	            "replace_with_email": "<EMAIL>",
	            "replace_with_phone_number": "<PHONE>",
	            "replace_with_number": "<NUMBER>",
	            "replace_with_digit": "0",
	            "replace_with_currency_symbol": "<CUR>",
	            "replace_with_punct": "",
	        }

	        requested = {
	            "fix_unicode": fix_unicode,
	            "to_ascii": to_ascii,
	            "lower": lower,
	            "no_line_breaks": no_line_breaks,
	            "no_urls": no_urls,
	            "no_emails": no_emails,
	            "no_phone_numbers": no_phone_numbers,
	            "no_numbers": no_numbers,
	            "no_digits": no_digits,
	            "no_currency_symbols": no_currency_symbols,
	            "no_punct": no_punct,
	            "no_emoji": no_emoji,
	            "lang": lang,
	        }
	        # The inputs are nullable, so a caller may pass None explicitly; fall
	        # back to the declared default rather than letting None act as False.
	        options = {
	            key: self.inputs[key]["default"] if value is None else value
	            for key, value in requested.items()
	        }

	        try:
	            # Apply cleantext with parameters
	            cleaned_text = clean(text, **options, **replace_params)

	            # Apply any custom replacements
	            if custom_replacements:
	                for old, new in custom_replacements.items():
	                    if old == "":
	                        # str.replace("", new) would insert `new` between
	                        # every character of the text; skip such entries.
	                        continue
	                    cleaned_text = cleaned_text.replace(old, new)

	            return cleaned_text

	        except Exception as e:
	            # Tool boundary: keep the traceback in the log and hand the agent
	            # a readable message instead of raising.
	            logger.exception("Error cleaning text: %s", e)
	            return f"Error during text cleaning: {str(e)}"