Spaces:
Runtime error
Runtime error
Leonardo
committed on
Update scripts/text_cleaner_tool.py
Browse files - scripts/text_cleaner_tool.py +17 -22
scripts/text_cleaner_tool.py
CHANGED
|
@@ -16,6 +16,7 @@ class TextCleanerTool(Tool):
|
|
| 16 |
"type": "boolean",
|
| 17 |
"description": "Fix broken unicode characters and mojibake",
|
| 18 |
"default": True,
|
|
|
|
| 19 |
},
|
| 20 |
"to_ascii": {
|
| 21 |
"type": "boolean",
|
|
@@ -88,7 +89,7 @@ class TextCleanerTool(Tool):
|
|
| 88 |
def forward(
|
| 89 |
self,
|
| 90 |
text: str,
|
| 91 |
-
fix_unicode: bool = True,
|
| 92 |
to_ascii: bool = True,
|
| 93 |
lower: bool = True,
|
| 94 |
no_line_breaks: bool = False,
|
|
@@ -104,7 +105,7 @@ class TextCleanerTool(Tool):
|
|
| 104 |
custom_replacements: Optional[Dict[str, str]] = None,
|
| 105 |
) -> str:
|
| 106 |
"""Clean and normalize text by removing or replacing unwanted elements."""
|
| 107 |
-
#
|
| 108 |
if not text:
|
| 109 |
return ""
|
| 110 |
|
|
@@ -115,28 +116,11 @@ class TextCleanerTool(Tool):
|
|
| 115 |
logger.error(f"Failed to convert input to string: {e}")
|
| 116 |
return f"Error: Could not process input of type {type(text)}"
|
| 117 |
|
| 118 |
-
# Import cleantext safely
|
| 119 |
try:
|
|
|
|
| 120 |
from cleantext import clean
|
| 121 |
-
except ImportError:
|
| 122 |
-
logger.error(
|
| 123 |
-
"cleantext package not installed. Install with: pip install clean-text"
|
| 124 |
-
)
|
| 125 |
-
return "Error: Required dependency 'clean-text' is not installed."
|
| 126 |
-
|
| 127 |
-
# Special parameter handling
|
| 128 |
-
replace_params = {
|
| 129 |
-
"replace_with_url": "<URL>",
|
| 130 |
-
"replace_with_email": "<EMAIL>",
|
| 131 |
-
"replace_with_phone_number": "<PHONE>",
|
| 132 |
-
"replace_with_number": "<NUMBER>",
|
| 133 |
-
"replace_with_digit": "0",
|
| 134 |
-
"replace_with_currency_symbol": "<CUR>",
|
| 135 |
-
"replace_with_punct": "",
|
| 136 |
-
}
|
| 137 |
|
| 138 |
-
|
| 139 |
-
# Apply cleantext with parameters
|
| 140 |
cleaned_text = clean(
|
| 141 |
text,
|
| 142 |
fix_unicode=fix_unicode,
|
|
@@ -152,7 +136,13 @@ class TextCleanerTool(Tool):
|
|
| 152 |
no_punct=no_punct,
|
| 153 |
no_emoji=no_emoji,
|
| 154 |
lang=lang,
|
| 155 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 156 |
)
|
| 157 |
|
| 158 |
# Apply any custom replacements
|
|
@@ -162,6 +152,11 @@ class TextCleanerTool(Tool):
|
|
| 162 |
|
| 163 |
return cleaned_text
|
| 164 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 165 |
except Exception as e:
|
| 166 |
logger.error(f"Error cleaning text: {e}")
|
| 167 |
return f"Error during text cleaning: {str(e)}"
|
|
|
|
| 16 |
"type": "boolean",
|
| 17 |
"description": "Fix broken unicode characters and mojibake",
|
| 18 |
"default": True,
|
| 19 |
+
# Removed nullable: True - this parameter doesn't accept None
|
| 20 |
},
|
| 21 |
"to_ascii": {
|
| 22 |
"type": "boolean",
|
|
|
|
| 89 |
def forward(
|
| 90 |
self,
|
| 91 |
text: str,
|
| 92 |
+
fix_unicode: bool = True, # No Optional - this doesn't accept None
|
| 93 |
to_ascii: bool = True,
|
| 94 |
lower: bool = True,
|
| 95 |
no_line_breaks: bool = False,
|
|
|
|
| 105 |
custom_replacements: Optional[Dict[str, str]] = None,
|
| 106 |
) -> str:
|
| 107 |
"""Clean and normalize text by removing or replacing unwanted elements."""
|
| 108 |
+
# Basic input validation
|
| 109 |
if not text:
|
| 110 |
return ""
|
| 111 |
|
|
|
|
| 116 |
logger.error(f"Failed to convert input to string: {e}")
|
| 117 |
return f"Error: Could not process input of type {type(text)}"
|
| 118 |
|
|
|
|
| 119 |
try:
|
| 120 |
+
# Import cleantext
|
| 121 |
from cleantext import clean
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 122 |
|
| 123 |
+
# Directly use the clean function with replacement parameters
|
|
|
|
| 124 |
cleaned_text = clean(
|
| 125 |
text,
|
| 126 |
fix_unicode=fix_unicode,
|
|
|
|
| 136 |
no_punct=no_punct,
|
| 137 |
no_emoji=no_emoji,
|
| 138 |
lang=lang,
|
| 139 |
+
replace_with_url="<URL>",
|
| 140 |
+
replace_with_email="<EMAIL>",
|
| 141 |
+
replace_with_phone_number="<PHONE>",
|
| 142 |
+
replace_with_number="<NUMBER>",
|
| 143 |
+
replace_with_digit="0",
|
| 144 |
+
replace_with_currency_symbol="<CUR>",
|
| 145 |
+
replace_with_punct="",
|
| 146 |
)
|
| 147 |
|
| 148 |
# Apply any custom replacements
|
|
|
|
| 152 |
|
| 153 |
return cleaned_text
|
| 154 |
|
| 155 |
+
except ImportError:
|
| 156 |
+
logger.error(
|
| 157 |
+
"cleantext package not installed. Install with: pip install clean-text"
|
| 158 |
+
)
|
| 159 |
+
return "Error: Required dependency 'clean-text' is not installed."
|
| 160 |
except Exception as e:
|
| 161 |
logger.error(f"Error cleaning text: {e}")
|
| 162 |
return f"Error during text cleaning: {str(e)}"
|