Leonardo committed on
Commit
cdb1539
·
verified ·
1 Parent(s): c7daeea

Update scripts/text_cleaner_tool.py

Browse files
Files changed (1) hide show
  1. scripts/text_cleaner_tool.py +17 -22
scripts/text_cleaner_tool.py CHANGED
@@ -16,6 +16,7 @@ class TextCleanerTool(Tool):
16
  "type": "boolean",
17
  "description": "Fix broken unicode characters and mojibake",
18
  "default": True,
 
19
  },
20
  "to_ascii": {
21
  "type": "boolean",
@@ -88,7 +89,7 @@ class TextCleanerTool(Tool):
88
  def forward(
89
  self,
90
  text: str,
91
- fix_unicode: bool = True,
92
  to_ascii: bool = True,
93
  lower: bool = True,
94
  no_line_breaks: bool = False,
@@ -104,7 +105,7 @@ class TextCleanerTool(Tool):
104
  custom_replacements: Optional[Dict[str, str]] = None,
105
  ) -> str:
106
  """Clean and normalize text by removing or replacing unwanted elements."""
107
- # Input validation
108
  if not text:
109
  return ""
110
 
@@ -115,28 +116,11 @@ class TextCleanerTool(Tool):
115
  logger.error(f"Failed to convert input to string: {e}")
116
  return f"Error: Could not process input of type {type(text)}"
117
 
118
- # Import cleantext safely
119
  try:
 
120
  from cleantext import clean
121
- except ImportError:
122
- logger.error(
123
- "cleantext package not installed. Install with: pip install clean-text"
124
- )
125
- return "Error: Required dependency 'clean-text' is not installed."
126
-
127
- # Special parameter handling
128
- replace_params = {
129
- "replace_with_url": "<URL>",
130
- "replace_with_email": "<EMAIL>",
131
- "replace_with_phone_number": "<PHONE>",
132
- "replace_with_number": "<NUMBER>",
133
- "replace_with_digit": "0",
134
- "replace_with_currency_symbol": "<CUR>",
135
- "replace_with_punct": "",
136
- }
137
 
138
- try:
139
- # Apply cleantext with parameters
140
  cleaned_text = clean(
141
  text,
142
  fix_unicode=fix_unicode,
@@ -152,7 +136,13 @@ class TextCleanerTool(Tool):
152
  no_punct=no_punct,
153
  no_emoji=no_emoji,
154
  lang=lang,
155
- **replace_params,
 
 
 
 
 
 
156
  )
157
 
158
  # Apply any custom replacements
@@ -162,6 +152,11 @@ class TextCleanerTool(Tool):
162
 
163
  return cleaned_text
164
 
 
 
 
 
 
165
  except Exception as e:
166
  logger.error(f"Error cleaning text: {e}")
167
  return f"Error during text cleaning: {str(e)}"
 
16
  "type": "boolean",
17
  "description": "Fix broken unicode characters and mojibake",
18
  "default": True,
19
+ # Removed nullable: True - this parameter doesn't accept None
20
  },
21
  "to_ascii": {
22
  "type": "boolean",
 
89
  def forward(
90
  self,
91
  text: str,
92
+ fix_unicode: bool = True, # No Optional - this doesn't accept None
93
  to_ascii: bool = True,
94
  lower: bool = True,
95
  no_line_breaks: bool = False,
 
105
  custom_replacements: Optional[Dict[str, str]] = None,
106
  ) -> str:
107
  """Clean and normalize text by removing or replacing unwanted elements."""
108
+ # Basic input validation
109
  if not text:
110
  return ""
111
 
 
116
  logger.error(f"Failed to convert input to string: {e}")
117
  return f"Error: Could not process input of type {type(text)}"
118
 
 
119
  try:
120
+ # Import cleantext
121
  from cleantext import clean
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
122
 
123
+ # Directly use the clean function with replacement parameters
 
124
  cleaned_text = clean(
125
  text,
126
  fix_unicode=fix_unicode,
 
136
  no_punct=no_punct,
137
  no_emoji=no_emoji,
138
  lang=lang,
139
+ replace_with_url="<URL>",
140
+ replace_with_email="<EMAIL>",
141
+ replace_with_phone_number="<PHONE>",
142
+ replace_with_number="<NUMBER>",
143
+ replace_with_digit="0",
144
+ replace_with_currency_symbol="<CUR>",
145
+ replace_with_punct="",
146
  )
147
 
148
  # Apply any custom replacements
 
152
 
153
  return cleaned_text
154
 
155
+ except ImportError:
156
+ logger.error(
157
+ "cleantext package not installed. Install with: pip install clean-text"
158
+ )
159
+ return "Error: Required dependency 'clean-text' is not installed."
160
  except Exception as e:
161
  logger.error(f"Error cleaning text: {e}")
162
  return f"Error during text cleaning: {str(e)}"