Spaces:
Runtime error
Runtime error
File size: 5,708 Bytes
4c54085 1e9a96d 4c54085 1e9a96d 4c54085 1e9a96d 4c54085 1e9a96d 8520a66 1e9a96d 8520a66 fe63cd7 1e9a96d 8520a66 1e9a96d 8520a66 4c54085 8520a66 1e9a96d 8520a66 1e9a96d 8520a66 4c54085 fe63cd7 4c54085 fe63cd7 4c54085 fe63cd7 8520a66 4c54085 8520a66 1e9a96d 4c54085 1e9a96d 4c54085 cdb1539 4c54085 cdb1539 8520a66 4c54085 1e9a96d |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 |
"""
Text cleaning tool for smolagents.
Provides a Tool implementation that wraps the cleantext library to normalize
text content, with configurable options for each text transformation.
"""
# Standard library imports
import logging
from typing import Dict, Any, Optional
# Third-party imports
from smolagents import Tool
# cleantext is an optional dependency: record whether the import worked so
# callers can report a missing package instead of failing at import time.
try:
    from cleantext import clean
except ImportError:
    CLEANTEXT_AVAILABLE = False
else:
    CLEANTEXT_AVAILABLE = True

# Logger named after this module
logger = logging.getLogger(__name__)
# pylint: disable=too-few-public-methods
class TextCleanerTool(Tool):
    """Tool that normalizes text by delegating to ``cleantext.clean``.

    The class attributes ``name``, ``description``, ``inputs`` and
    ``output_type`` describe the tool; ``forward`` does the actual work.
    Failures are logged and reported as human-readable error strings
    rather than raised.
    """

    name = "clean_text"
    description = (
        "Cleans and normalizes text using the cleantext library. "
        "Transforms messy user-generated content into normalized text."
    )
    inputs = {
        "text": {"type": "string", "description": "The input text to clean"},
        "options": {
            "type": "object",
            "description": (
                "Optional parameters for text cleaning. Available options: "
                "fix_unicode, to_ascii, lower, no_line_breaks, no_urls, "
                "no_emails, no_phone_numbers, no_numbers, no_digits, "
                "no_currency_symbols, no_punct, no_emoji, lang"
            ),
            "optional": True,
            "nullable": True,
        },
    }
    output_type = "string"

    def forward(self, text: str, options: Optional[Dict[str, Any]] = None) -> str:
        """Clean ``text`` with ``cleantext.clean`` and return the result.

        For instance, with the default options this corrupted input::

            »Yóù àré rïght <3!«

        is expected to come back as::

            "you are right <3!"

        Args:
            text: The text to clean. Falsy values yield ``""``; other
                non-string values are converted with ``str()`` first.
            options: Optional keyword arguments forwarded to ``clean``.
                Entries override the defaults below; keys that are absent
                keep their default.

                Flags (default):
                    fix_unicode (True), to_ascii (True), lower (True),
                    no_line_breaks (False), no_urls (False),
                    no_emails (False), no_phone_numbers (False),
                    no_numbers (False), no_digits (False),
                    no_currency_symbols (False), no_punct (False),
                    no_emoji (False), lang ("en").

                Replacement tokens (default):
                    replace_with_url ("<URL>"),
                    replace_with_email ("<EMAIL>"),
                    replace_with_phone_number ("<PHONE>"),
                    replace_with_number ("<NUMBER>"),
                    replace_with_digit ("0"),
                    replace_with_currency_symbol ("<CUR>"),
                    replace_with_punct ("").

        Returns:
            The cleaned text, ``""`` for falsy input, or a message starting
            with ``"Error"`` when the dependency is missing, ``options`` is
            not mapping-like, or ``clean`` rejects the arguments.
        """
        # Input validation
        if not text:
            return ""
        if not isinstance(text, str):
            try:
                text = str(text)
            except (ValueError, TypeError) as e:
                logger.error("Failed to convert input to string: %s", e)
                return f"Error: Could not process input of type {type(text)}"

        # Check if cleantext is available
        if not CLEANTEXT_AVAILABLE:
            logger.error(
                "cleantext package not installed. "
                "Install with: pip install clean-text"
            )
            return "Error: Required dependency 'clean-text' is not installed."

        # Defaults for every keyword passed to clean(): behaviour flags
        # first, then the replacement tokens.
        params = {
            "fix_unicode": True,
            "to_ascii": True,
            "lower": True,
            "no_line_breaks": False,
            "no_urls": False,
            "no_emails": False,
            "no_phone_numbers": False,
            "no_numbers": False,
            "no_digits": False,
            "no_currency_symbols": False,
            "no_punct": False,
            "no_emoji": False,
            "lang": "en",
            "replace_with_url": "<URL>",
            "replace_with_email": "<EMAIL>",
            "replace_with_phone_number": "<PHONE>",
            "replace_with_number": "<NUMBER>",
            "replace_with_digit": "0",
            "replace_with_currency_symbol": "<CUR>",
            "replace_with_punct": "",
        }

        if options:
            # dict() accepts the same inputs dict.update() does (mappings and
            # iterables of pairs) but lets a malformed value be reported
            # instead of escaping as an uncaught exception.
            try:
                overrides = dict(options)
            except (ValueError, TypeError) as e:
                logger.error("Invalid options for text cleaning: %s", e)
                return (
                    "Error: 'options' must be a mapping of option names "
                    "to values."
                )
            # Applied last so caller-supplied values, including the
            # replace_with_* tokens, take precedence over the defaults.
            params.update(overrides)

        try:
            # Apply cleantext with parameters
            return clean(text, **params)
        except (ValueError, TypeError, AttributeError) as e:
            # Unknown or non-string option names surface here as TypeError.
            logger.error("Error cleaning text: %s", e)
            return f"Error during text cleaning: {str(e)}"
|