Leonardo committed on
Commit
1e9a96d
·
verified ·
1 Parent(s): c453528

Create text_cleaner_tool.py

Browse files
Files changed (1) hide show
  1. scripts/text_cleaner_tool.py +167 -0
scripts/text_cleaner_tool.py ADDED
@@ -0,0 +1,167 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from smolagents import Tool
2
+ from typing import Dict, Optional, Any
3
+ import logging
4
+
5
+ logger = logging.getLogger(__name__)
6
+
7
+
8
class TextCleanerTool(Tool):
    """Agent tool that cleans and normalizes text with the ``clean-text`` package.

    The tool forwards its boolean switches to ``cleantext.clean`` and then
    applies optional literal string replacements supplied by the caller.
    Failures are reported as strings starting with ``"Error"`` rather than
    raised, so the calling agent always receives a string result.
    """

    name = "clean_text"
    description = (
        "Cleans and normalizes text by removing or replacing unwanted elements"
    )
    # Every argument that has a default value in ``forward`` is declared
    # ``nullable`` here: the smolagents Tool argument validation expects the
    # two declarations to agree (see the smolagents Tool reference).
    inputs = {
        "text": {"type": "string", "description": "The input text to clean"},
        "fix_unicode": {
            "type": "boolean",
            "description": "Fix broken unicode characters and mojibake",
            "default": True,
            "nullable": True,
        },
        "to_ascii": {
            "type": "boolean",
            "description": "Convert non-ASCII characters to their closest ASCII equivalents",
            "default": True,
            "nullable": True,
        },
        "lower": {
            "type": "boolean",
            "description": "Convert text to lowercase",
            "default": True,
            "nullable": True,
        },
        "no_line_breaks": {
            "type": "boolean",
            "description": "Replace line breaks with spaces",
            "default": False,
            "nullable": True,
        },
        "no_urls": {
            "type": "boolean",
            "description": "Replace URLs with a token",
            "default": False,
            "nullable": True,
        },
        "no_emails": {
            "type": "boolean",
            "description": "Replace email addresses with a token",
            "default": False,
            "nullable": True,
        },
        "no_phone_numbers": {
            "type": "boolean",
            "description": "Replace phone numbers with a token",
            "default": False,
            "nullable": True,
        },
        "no_numbers": {
            "type": "boolean",
            "description": "Replace all numbers with a token",
            "default": False,
            "nullable": True,
        },
        "no_digits": {
            "type": "boolean",
            "description": "Replace all digits with 0",
            "default": False,
            "nullable": True,
        },
        "no_currency_symbols": {
            "type": "boolean",
            "description": "Replace currency symbols with a token",
            "default": False,
            "nullable": True,
        },
        "no_punct": {
            "type": "boolean",
            "description": "Remove all punctuation",
            "default": False,
            "nullable": True,
        },
        "no_emoji": {
            "type": "boolean",
            "description": "Remove all emoji characters",
            "default": False,
            "nullable": True,
        },
        "lang": {
            "type": "string",
            "description": "Language code for special handling ('en' or 'de' supported)",
            "default": "en",
            "nullable": True,
        },
        "custom_replacements": {
            "type": "object",
            "description": "Dictionary of custom string replacements to apply",
            # Kept for backward compatibility with readers of this schema;
            # ``nullable`` is the key smolagents itself documents.
            "optional": True,
            "nullable": True,
        },
    }
    output_type = "string"

    def forward(
        self,
        text: str,
        fix_unicode: bool = True,
        to_ascii: bool = True,
        lower: bool = True,
        no_line_breaks: bool = False,
        no_urls: bool = False,
        no_emails: bool = False,
        no_phone_numbers: bool = False,
        no_numbers: bool = False,
        no_digits: bool = False,
        no_currency_symbols: bool = False,
        no_punct: bool = False,
        no_emoji: bool = False,
        lang: str = "en",
        custom_replacements: Optional[Dict[str, str]] = None,
    ) -> str:
        """Clean and normalize text by removing or replacing unwanted elements.

        Args:
            text: The text to clean. Non-string values are converted with
                ``str()``; any falsy value yields an empty string.
            fix_unicode: Passed to ``cleantext.clean`` as ``fix_unicode``.
            to_ascii: Passed to ``cleantext.clean`` as ``to_ascii``.
            lower: Passed to ``cleantext.clean`` as ``lower``.
            no_line_breaks: Passed to ``cleantext.clean`` as ``no_line_breaks``.
            no_urls: Replace URLs with ``<URL>``.
            no_emails: Replace email addresses with ``<EMAIL>``.
            no_phone_numbers: Replace phone numbers with ``<PHONE>``.
            no_numbers: Replace numbers with ``<NUMBER>``.
            no_digits: Replace each digit with ``0``.
            no_currency_symbols: Replace currency symbols with ``<CUR>``.
            no_punct: Replace punctuation with the empty string.
            no_emoji: Passed to ``cleantext.clean`` as ``no_emoji``.
            lang: Language code passed to ``cleantext.clean``.
            custom_replacements: Mapping of literal substrings to their
                replacements, applied in iteration order after cleaning.
                Empty-string keys are ignored.

        Returns:
            The cleaned text, ``""`` for falsy input, or a message starting
            with ``"Error"`` when the dependency is missing or cleaning fails.
        """
        # Input validation
        if not text:
            return ""

        if not isinstance(text, str):
            try:
                text = str(text)
            except Exception as exc:
                logger.error("Failed to convert input to string: %s", exc)
                return f"Error: Could not process input of type {type(text)}"

        # Imported lazily so a missing optional dependency is reported to the
        # caller as a message instead of breaking the import of this module.
        try:
            from cleantext import clean
        except ImportError:
            logger.error(
                "cleantext package not installed. Install with: pip install clean-text"
            )
            return "Error: Required dependency 'clean-text' is not installed."

        # Placeholder tokens substituted for each category of removed content.
        replace_params = {
            "replace_with_url": "<URL>",
            "replace_with_email": "<EMAIL>",
            "replace_with_phone_number": "<PHONE>",
            "replace_with_number": "<NUMBER>",
            "replace_with_digit": "0",
            "replace_with_currency_symbol": "<CUR>",
            "replace_with_punct": "",
        }

        try:
            cleaned_text = clean(
                text,
                fix_unicode=fix_unicode,
                to_ascii=to_ascii,
                lower=lower,
                no_line_breaks=no_line_breaks,
                no_urls=no_urls,
                no_emails=no_emails,
                no_phone_numbers=no_phone_numbers,
                no_numbers=no_numbers,
                no_digits=no_digits,
                no_currency_symbols=no_currency_symbols,
                no_punct=no_punct,
                no_emoji=no_emoji,
                lang=lang,
                **replace_params,
            )

            # Apply any custom replacements
            if custom_replacements:
                for old, new in custom_replacements.items():
                    # str.replace("", new) would insert ``new`` between every
                    # character, which is never what a caller intends.
                    if old == "":
                        continue
                    cleaned_text = cleaned_text.replace(old, new)

            return cleaned_text

        except Exception as exc:
            # Tool boundary: report the failure as text and keep the traceback
            # in the log for diagnosis.
            logger.exception("Error cleaning text: %s", exc)
            return f"Error during text cleaning: {str(exc)}"