# scripts/text_cleaner_tool.py
from smolagents import Tool
from typing import Dict, Any, Optional
import logging

logger = logging.getLogger(__name__)


class TextCleanerTool(Tool):
"""A simplified text cleaner tool that avoids typing issues."""
name = "clean_text"
description = (
"Cleans and normalizes text using the cleantext library. "
"Transforms messy user-generated content into normalized text."
)
inputs = {
"text": {"type": "string", "description": "The input text to clean"},
"options": {
"type": "object",
"description": (
"Optional parameters for text cleaning. Available options: "
"fix_unicode, to_ascii, lower, no_line_breaks, no_urls, no_emails, "
"no_phone_numbers, no_numbers, no_digits, no_currency_symbols, "
"no_punct, no_emoji, lang"
),
"optional": True,
"nullable": True,
},
}
output_type = "string"
def forward(self, text: str, options: Optional[Dict[str, Any]] = None) -> str:
"""
        Clean text using the cleantext library with flexible options.

        User-generated content on the Web and in social media is often dirty.
        Preprocessing scraped data with `clean-text` yields a normalized text
        representation. For instance, it turns this corrupted input:
```
A bunch of \\u2018new\\u2019 references, including [Moana](https://en.wikipedia.org/wiki/Moana_%282016_film%29).
»Yóù àré rïght <3!«
```
into this clean output:
```
A bunch of 'new' references, including [moana](<URL>).
"you are right <3!"
```
        `clean-text` uses ftfy, unidecode and numerous hand-crafted rules,
        i.e., regular expressions.

        Example API:
clean("some input",
fix_unicode=True, # fix various unicode errors
to_ascii=True, # transliterate to closest ASCII representation
lower=True, # lowercase text
no_line_breaks=False, # fully strip line breaks as opposed to only normalizing them
no_urls=False, # replace all URLs with a special token
no_emails=False, # replace all email addresses with a special token
no_phone_numbers=False, # replace all phone numbers with a special token
no_numbers=False, # replace all numbers with a special token
no_digits=False, # replace all digits with a special token
no_currency_symbols=False, # replace all currency symbols with a special token
no_punct=False, # remove punctuations
replace_with_punct="", # instead of removing punctuations you may replace them
replace_with_url="<URL>",
replace_with_email="<EMAIL>",
replace_with_phone_number="<PHONE>",
replace_with_number="<NUMBER>",
replace_with_digit="0",
replace_with_currency_symbol="<CUR>",
lang="en" # set to 'de' for German special handling
)
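
        Args:
            text: The raw input text to clean. Non-string input is coerced to
                a string where possible.
            options: Optional mapping of cleaning flags (see `inputs` above),
                e.g. {"no_urls": True, "lower": False}. Unexpected keys are
                forwarded to `clean()`; failures come back as an error message.

        Returns:
            The cleaned string, or an error message string if the input could
            not be processed or the `clean-text` dependency is missing.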
"""
# Input validation
if not text:
return ""
if not isinstance(text, str):
try:
text = str(text)
except Exception as e:
logger.error(f"Failed to convert input to string: {e}")
return f"Error: Could not process input of type {type(text)}"
# Import cleantext safely
try:
from cleantext import clean
except ImportError:
logger.error(
"cleantext package not installed. Install with: pip install clean-text"
)
return "Error: Required dependency 'clean-text' is not installed."
# Default replacement tokens
replacements = {
"replace_with_url": "<URL>",
"replace_with_email": "<EMAIL>",
"replace_with_phone_number": "<PHONE>",
"replace_with_number": "<NUMBER>",
"replace_with_digit": "0",
"replace_with_currency_symbol": "<CUR>",
"replace_with_punct": "",
}
# Default options
default_options = {
"fix_unicode": True,
"to_ascii": True,
"lower": True,
"no_line_breaks": False,
"no_urls": False,
"no_emails": False,
"no_phone_numbers": False,
"no_numbers": False,
"no_digits": False,
"no_currency_symbols": False,
"no_punct": False,
"no_emoji": False,
"lang": "en",
}
        # Merge user options with defaults (user-supplied values win)
        if options:
            default_options.update(options)

        # Combine replacement tokens with the cleaning flags; any replacement
        # token supplied via `options` takes precedence over the default
        params = {**replacements, **default_options}
try:
# Apply cleantext with parameters
return clean(text, **params)
except Exception as e:
logger.error(f"Error cleaning text: {e}")
return f"Error during text cleaning: {str(e)}"