Spaces:
Runtime error
Runtime error
File size: 5,230 Bytes
1e9a96d 8520a66 1e9a96d 8520a66 1e9a96d 8520a66 fe63cd7 1e9a96d 8520a66 1e9a96d 8520a66 1e9a96d 8520a66 1e9a96d 8520a66 fe63cd7 8520a66 1e9a96d 8520a66 1e9a96d cdb1539 8520a66 1e9a96d |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 |
from smolagents import Tool
from typing import Dict, Any, Optional
import logging
# Module-level logger named after this module; the tool below reports
# conversion, import and cleaning failures through it.
logger = logging.getLogger(__name__)
class TextCleanerTool(Tool):
    """Agent tool that normalizes messy text with the ``clean-text`` library.

    The tool never raises from ``forward``: every failure path (missing
    dependency, unusable ``options``, error inside ``clean``) is logged and
    reported back to the caller as an ``"Error..."`` string.
    """

    name = "clean_text"
    description = (
        "Cleans and normalizes text using the cleantext library. "
        "Transforms messy user-generated content into normalized text."
    )
    inputs = {
        "text": {"type": "string", "description": "The input text to clean"},
        "options": {
            "type": "object",
            "description": (
                "Optional parameters for text cleaning. Available options: "
                "fix_unicode, to_ascii, lower, no_line_breaks, no_urls, no_emails, "
                "no_phone_numbers, no_numbers, no_digits, no_currency_symbols, "
                "no_punct, no_emoji, lang"
            ),
            "optional": True,
            "nullable": True,
        },
    }
    output_type = "string"

    def forward(self, text: str, options: Optional[Dict[str, Any]] = None) -> str:
        """Clean ``text`` with ``cleantext.clean`` and return the result.

        ``clean-text`` combines ftfy, unidecode and hand-crafted regular
        expressions to turn dirty user-generated content into a normalized
        representation, for example::

            »Yóù àré rïght <3!«   ->   "you are right <3!"

        Args:
            text: The text to clean. Falsy values yield ``""``; non-string
                values are converted with ``str()`` first.
            options: Optional overrides forwarded as keyword arguments to
                ``cleantext.clean``. Defaults used when a key is absent::

                    fix_unicode=True          # fix various unicode errors
                    to_ascii=True             # transliterate to closest ASCII
                    lower=True                # lowercase text
                    no_line_breaks=False      # fully strip line breaks
                    no_urls=False             # replace URLs with a token
                    no_emails=False           # replace e-mail addresses
                    no_phone_numbers=False    # replace phone numbers
                    no_numbers=False          # replace numbers
                    no_digits=False           # replace digits
                    no_currency_symbols=False # replace currency symbols
                    no_punct=False            # remove punctuation
                    no_emoji=False            # remove emoji
                    lang="en"                 # 'de' for German handling
                    replace_with_punct=""
                    replace_with_url="<URL>"
                    replace_with_email="<EMAIL>"
                    replace_with_phone_number="<PHONE>"
                    replace_with_number="<NUMBER>"
                    replace_with_digit="0"
                    replace_with_currency_symbol="<CUR>"

        Returns:
            The cleaned text, ``""`` for empty input, or a human-readable
            message starting with ``"Error"`` when cleaning was not possible.
        """
        # Input validation: nothing to clean for empty / falsy input.
        if not text:
            return ""
        if not isinstance(text, str):
            try:
                text = str(text)
            except Exception as e:
                logger.error("Failed to convert input to string: %s", e)
                return f"Error: Could not process input of type {type(text)}"

        # Import lazily so a missing dependency becomes an error message
        # for the agent instead of breaking module import.
        try:
            from cleantext import clean
        except ImportError:
            logger.error(
                "cleantext package not installed. Install with: pip install clean-text"
            )
            return "Error: Required dependency 'clean-text' is not installed."

        # Built-in defaults: replacement tokens plus cleaning switches.
        params = {
            "replace_with_url": "<URL>",
            "replace_with_email": "<EMAIL>",
            "replace_with_phone_number": "<PHONE>",
            "replace_with_number": "<NUMBER>",
            "replace_with_digit": "0",
            "replace_with_currency_symbol": "<CUR>",
            "replace_with_punct": "",
            "fix_unicode": True,
            "to_ascii": True,
            "lower": True,
            "no_line_breaks": False,
            "no_urls": False,
            "no_emails": False,
            "no_phone_numbers": False,
            "no_numbers": False,
            "no_digits": False,
            "no_currency_symbols": False,
            "no_punct": False,
            "no_emoji": False,
            "lang": "en",
        }

        # User options are applied LAST so that caller-supplied values,
        # including replace_with_* tokens, win over the built-in defaults.
        if options:
            try:
                params.update(options)
            except (TypeError, ValueError) as e:
                # e.g. a plain string was passed instead of a mapping.
                logger.error("Invalid options %r: %s", options, e)
                return (
                    "Error: 'options' must be a mapping of option names to "
                    f"values, got {type(options).__name__}."
                )

        try:
            # Apply cleantext with the merged parameters.
            return clean(text, **params)
        except Exception as e:
            # Boundary handler: report any cleaning failure (including
            # unknown option names) to the agent rather than raising.
            logger.error("Error cleaning text: %s", e)
            return f"Error during text cleaning: {str(e)}"
|