Spaces:
Runtime error
Runtime error
| #!/usr/bin/env python | |
| # coding=utf-8 | |
| # Copyright 2025 The Footscray Coding Collective. All rights reserved. | |
| """ | |
| Text cleaning tool for smolagents. | |
| Provides a Tool implementation that wraps the cleantext library for normalizing | |
| text content with handling for various text transformation options. | |
| """ | |
| # Standard library imports | |
| import logging | |
| from typing import Any, Dict, Optional | |
| # Third-party imports | |
| from cleantext import clean | |
| from smolagents import Tool | |
| # Configure module logger | |
| logger = logging.getLogger(__name__) | |
| # pylint: disable=too-few-public-methods | |
class TextCleanerTool(Tool):
    """smolagents tool that normalizes messy text via ``cleantext.clean``.

    The tool exposes two inputs: the ``text`` to clean and an optional
    ``options`` mapping whose entries are forwarded to ``clean`` as keyword
    arguments on top of a set of defaults (see :meth:`forward`).
    """

    name = "clean_text"
    description = """This tool can be used to process messy user-generated content into
    normalized text. It handles a variety of text transformation options,
    such as fixing unicode errors, transliterating to closest ASCII,
    lowercasing text, normalizing line breaks, removing punctuation,
    replacing numbers with a token, and more."""
    inputs = {
        "text": {"type": "string", "description": "The input text to clean"},
        "options": {
            "type": "object",
            "description": (
                "Optional parameters for text cleaning. Available options: "
                "fix_unicode, to_ascii, lower, no_line_breaks, no_urls, "
                "no_emails, no_phone_numbers, no_numbers, no_digits, "
                "no_currency_symbols, no_punct, no_emoji, lang"
            ),
            "optional": True,
            "nullable": True,
        },
    }
    output_type = "string"

    def forward(self, text: str, options: Optional[Dict[str, Any]] = None) -> str:
        """Clean ``text`` with the cleantext library and return the result.

        User-generated content on the Web and in social media is often dirty.
        Preprocess your scraped data with `clean-text` to create a normalized
        text representation. For instance, turn this corrupted input:

        ```
        A bunch of \\u2018new\\u2019 references, including
        [Moana](https://en.wikipedia.org/wiki/Moana_%282016_film%29).
        »Yóù àré rïght <3!«
        ```

        into this clean output:

        ```
        A bunch of 'new' references, including [moana](<URL>).
        "you are right <3!"
        ```

        `clean-text` uses ftfy, unidecode and numerous hand-crafted rules,
        i.e., RegEx.

        Keyword arguments passed to ``clean`` and their defaults here:

            fix_unicode=True                      # fix various unicode errors
            to_ascii=True                         # transliterate to closest ASCII
            lower=True                            # lowercase text
            no_line_breaks=False                  # normalize line breaks
            no_urls=False                         # replace URLs with a token
            no_emails=False                       # replace email addresses with token
            no_phone_numbers=False                # replace phone numbers with token
            no_numbers=False                      # replace all numbers with token
            no_digits=False                       # replace all digits with 0
            no_currency_symbols=False             # replace currency symbols with token
            no_punct=False                        # remove punctuations
            no_emoji=False
            replace_with_punct=""                 # replacement for punctuation
            replace_with_url="<URL>"              # replacement for URLs
            replace_with_email="<EMAIL>"          # replacement for emails
            replace_with_phone_number="<PHONE>"   # replacement for phones
            replace_with_number="<NUMBER>"        # replacement for numbers
            replace_with_digit="0"                # replacement for digits
            replace_with_currency_symbol="<CUR>"  # currency replacement
            lang="en"                             # language ('en' or 'de' supported)

        Args:
            text: The text to clean. Falsy input yields ``""``; any other
                non-string value is converted with ``str()`` first.
            options: Optional overrides for any of the keyword arguments
                listed above, including the ``replace_with_*`` tokens.

        Returns:
            The cleaned text, or a human-readable ``"Error ..."`` string when
            the input, the options or the cleaning step itself fails. The
            tool reports failures in-band instead of raising.
        """
        # Input validation: nothing to clean for empty/None input.
        if not text:
            return ""

        if not isinstance(text, str):
            try:
                text = str(text)
            except (ValueError, TypeError) as e:
                logger.error("Failed to convert input to string: %s", e)
                return f"Error: Could not process input of type {type(text)}"

        # Defaults for every keyword forwarded to ``clean``: feature flags
        # first, then the replacement tokens.
        params: Dict[str, Any] = {
            "fix_unicode": True,
            "to_ascii": True,
            "lower": True,
            "no_line_breaks": False,
            "no_urls": False,
            "no_emails": False,
            "no_phone_numbers": False,
            "no_numbers": False,
            "no_digits": False,
            "no_currency_symbols": False,
            "no_punct": False,
            "no_emoji": False,
            "lang": "en",
            "replace_with_url": "<URL>",
            "replace_with_email": "<EMAIL>",
            "replace_with_phone_number": "<PHONE>",
            "replace_with_number": "<NUMBER>",
            "replace_with_digit": "0",
            "replace_with_currency_symbol": "<CUR>",
            "replace_with_punct": "",
        }

        # Caller-supplied options are applied LAST so they override every
        # default, replacement tokens included. (Previously the default
        # tokens were merged after the user options and silently won.)
        if options:
            try:
                params.update(options)
            except (ValueError, TypeError) as e:
                # A malformed ``options`` value (e.g. a plain string) must be
                # reported like every other failure, not raised to the agent.
                logger.error("Invalid options for text cleaning: %s", e)
                return f"Error: Invalid options for text cleaning: {e}"

        try:
            # Apply cleantext with the merged parameters. Unknown option
            # names surface here as a TypeError and are reported below.
            return clean(text, **params)
        except (ValueError, TypeError, AttributeError) as e:
            logger.error("Error cleaning text: %s", e)
            return f"Error during text cleaning: {str(e)}"
| __all__ = ["TextCleanerTool"] | |