Leonardo committed on
Commit
8520a66
·
verified ·
1 Parent(s): cdb1539

Update scripts/text_cleaner_tool.py

Browse files
Files changed (1) hide show
  1. scripts/text_cleaner_tool.py +81 -124
scripts/text_cleaner_tool.py CHANGED
@@ -1,111 +1,62 @@
1
  from smolagents import Tool
2
- from typing import Dict, Optional, Any
3
  import logging
4
 
5
  logger = logging.getLogger(__name__)
6
 
7
 
8
  class TextCleanerTool(Tool):
 
 
9
  name = "clean_text"
10
  description = (
11
- "Cleans and normalizes text by removing or replacing unwanted elements"
 
12
  )
13
  inputs = {
14
  "text": {"type": "string", "description": "The input text to clean"},
15
- "fix_unicode": {
16
- "type": "boolean",
17
- "description": "Fix broken unicode characters and mojibake",
18
- "default": True,
19
- # Removed nullable: True - this parameter doesn't accept None
20
- },
21
- "to_ascii": {
22
- "type": "boolean",
23
- "description": "Convert non-ASCII characters to their closest ASCII equivalents",
24
- "default": True,
25
- },
26
- "lower": {
27
- "type": "boolean",
28
- "description": "Convert text to lowercase",
29
- "default": True,
30
- },
31
- "no_line_breaks": {
32
- "type": "boolean",
33
- "description": "Replace line breaks with spaces",
34
- "default": False,
35
- },
36
- "no_urls": {
37
- "type": "boolean",
38
- "description": "Replace URLs with a token",
39
- "default": False,
40
- },
41
- "no_emails": {
42
- "type": "boolean",
43
- "description": "Replace email addresses with a token",
44
- "default": False,
45
- },
46
- "no_phone_numbers": {
47
- "type": "boolean",
48
- "description": "Replace phone numbers with a token",
49
- "default": False,
50
- },
51
- "no_numbers": {
52
- "type": "boolean",
53
- "description": "Replace all numbers with a token",
54
- "default": False,
55
- },
56
- "no_digits": {
57
- "type": "boolean",
58
- "description": "Replace all digits with 0",
59
- "default": False,
60
- },
61
- "no_currency_symbols": {
62
- "type": "boolean",
63
- "description": "Replace currency symbols with a token",
64
- "default": False,
65
- },
66
- "no_punct": {
67
- "type": "boolean",
68
- "description": "Remove all punctuation",
69
- "default": False,
70
- },
71
- "no_emoji": {
72
- "type": "boolean",
73
- "description": "Remove all emoji characters",
74
- "default": False,
75
- },
76
- "lang": {
77
- "type": "string",
78
- "description": "Language code for special handling ('en' or 'de' supported)",
79
- "default": "en",
80
- },
81
- "custom_replacements": {
82
  "type": "object",
83
- "description": "Dictionary of custom string replacements to apply",
 
 
 
 
 
84
  "optional": True,
 
85
  },
86
  }
87
  output_type = "string"
88
 
89
- def forward(
90
- self,
91
- text: str,
92
- fix_unicode: bool = True, # No Optional - this doesn't accept None
93
- to_ascii: bool = True,
94
- lower: bool = True,
95
- no_line_breaks: bool = False,
96
- no_urls: bool = False,
97
- no_emails: bool = False,
98
- no_phone_numbers: bool = False,
99
- no_numbers: bool = False,
100
- no_digits: bool = False,
101
- no_currency_symbols: bool = False,
102
- no_punct: bool = False,
103
- no_emoji: bool = False,
104
- lang: str = "en",
105
- custom_replacements: Optional[Dict[str, str]] = None,
106
- ) -> str:
107
- """Clean and normalize text by removing or replacing unwanted elements."""
108
- # Basic input validation
 
 
 
 
 
 
 
 
109
  if not text:
110
  return ""
111
 
@@ -116,47 +67,53 @@ class TextCleanerTool(Tool):
116
  logger.error(f"Failed to convert input to string: {e}")
117
  return f"Error: Could not process input of type {type(text)}"
118
 
 
119
  try:
120
- # Import cleantext
121
  from cleantext import clean
122
-
123
- # Directly use the clean function with replacement parameters
124
- cleaned_text = clean(
125
- text,
126
- fix_unicode=fix_unicode,
127
- to_ascii=to_ascii,
128
- lower=lower,
129
- no_line_breaks=no_line_breaks,
130
- no_urls=no_urls,
131
- no_emails=no_emails,
132
- no_phone_numbers=no_phone_numbers,
133
- no_numbers=no_numbers,
134
- no_digits=no_digits,
135
- no_currency_symbols=no_currency_symbols,
136
- no_punct=no_punct,
137
- no_emoji=no_emoji,
138
- lang=lang,
139
- replace_with_url="<URL>",
140
- replace_with_email="<EMAIL>",
141
- replace_with_phone_number="<PHONE>",
142
- replace_with_number="<NUMBER>",
143
- replace_with_digit="0",
144
- replace_with_currency_symbol="<CUR>",
145
- replace_with_punct="",
146
- )
147
-
148
- # Apply any custom replacements
149
- if custom_replacements:
150
- for old, new in custom_replacements.items():
151
- cleaned_text = cleaned_text.replace(old, new)
152
-
153
- return cleaned_text
154
-
155
  except ImportError:
156
  logger.error(
157
  "cleantext package not installed. Install with: pip install clean-text"
158
  )
159
  return "Error: Required dependency 'clean-text' is not installed."
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
160
  except Exception as e:
161
  logger.error(f"Error cleaning text: {e}")
162
  return f"Error during text cleaning: {str(e)}"
 
1
  from smolagents import Tool
2
+ from typing import Dict, Any, Optional
3
  import logging
4
 
5
  logger = logging.getLogger(__name__)
6
 
7
 
8
  class TextCleanerTool(Tool):
9
+ """A simplified text cleaner tool that avoids typing issues."""
10
+
11
  name = "clean_text"
12
  description = (
13
+ "Cleans and normalizes text using the cleantext library. "
14
+ "Example usage: clean_text(text='Your text here', options={'lower': True, 'no_urls': True})"
15
  )
16
  inputs = {
17
  "text": {"type": "string", "description": "The input text to clean"},
18
+ "options": {
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
19
  "type": "object",
20
+ "description": (
21
+ "Optional parameters for text cleaning. Available options: "
22
+ "fix_unicode, to_ascii, lower, no_line_breaks, no_urls, no_emails, "
23
+ "no_phone_numbers, no_numbers, no_digits, no_currency_symbols, "
24
+ "no_punct, no_emoji, lang"
25
+ ),
26
  "optional": True,
27
+ "nullable": True,
28
  },
29
  }
30
  output_type = "string"
31
 
32
+ def forward(self, text: str, options: Optional[Dict[str, Any]] = None) -> str:
33
+ """
34
+ Clean text using the cleantext library with flexible options.
35
+
36
+ Example API:
37
+ clean("some input",
38
+ fix_unicode=True, # fix various unicode errors
39
+ to_ascii=True, # transliterate to closest ASCII representation
40
+ lower=True, # lowercase text
41
+ no_line_breaks=False, # fully strip line breaks as opposed to only normalizing them
42
+ no_urls=False, # replace all URLs with a special token
43
+ no_emails=False, # replace all email addresses with a special token
44
+ no_phone_numbers=False, # replace all phone numbers with a special token
45
+ no_numbers=False, # replace all numbers with a special token
46
+ no_digits=False, # replace all digits with a special token
47
+ no_currency_symbols=False, # replace all currency symbols with a special token
48
+ no_punct=False, # remove punctuations
49
+ replace_with_punct="", # instead of removing punctuations you may replace them
50
+ replace_with_url="<URL>",
51
+ replace_with_email="<EMAIL>",
52
+ replace_with_phone_number="<PHONE>",
53
+ replace_with_number="<NUMBER>",
54
+ replace_with_digit="0",
55
+ replace_with_currency_symbol="<CUR>",
56
+ lang="en" # set to 'de' for German special handling
57
+ )
58
+ """
59
+ # Input validation
60
  if not text:
61
  return ""
62
 
 
67
  logger.error(f"Failed to convert input to string: {e}")
68
  return f"Error: Could not process input of type {type(text)}"
69
 
70
+ # Import cleantext safely
71
  try:
 
72
  from cleantext import clean
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
73
  except ImportError:
74
  logger.error(
75
  "cleantext package not installed. Install with: pip install clean-text"
76
  )
77
  return "Error: Required dependency 'clean-text' is not installed."
78
+
79
+ # Default replacement tokens
80
+ replacements = {
81
+ "replace_with_url": "<URL>",
82
+ "replace_with_email": "<EMAIL>",
83
+ "replace_with_phone_number": "<PHONE>",
84
+ "replace_with_number": "<NUMBER>",
85
+ "replace_with_digit": "0",
86
+ "replace_with_currency_symbol": "<CUR>",
87
+ "replace_with_punct": "",
88
+ }
89
+
90
+ # Default options
91
+ default_options = {
92
+ "fix_unicode": True,
93
+ "to_ascii": True,
94
+ "lower": True,
95
+ "no_line_breaks": False,
96
+ "no_urls": False,
97
+ "no_emails": False,
98
+ "no_phone_numbers": False,
99
+ "no_numbers": False,
100
+ "no_digits": False,
101
+ "no_currency_symbols": False,
102
+ "no_punct": False,
103
+ "no_emoji": False,
104
+ "lang": "en",
105
+ }
106
+
107
+ # Merge user options with defaults
108
+ if options:
109
+ default_options.update(options)
110
+
111
+ # Merge all parameters
112
+ params = {**default_options, **replacements}
113
+
114
+ try:
115
+ # Apply cleantext with parameters
116
+ return clean(text, **params)
117
  except Exception as e:
118
  logger.error(f"Error cleaning text: {e}")
119
  return f"Error during text cleaning: {str(e)}"