Leonardo committed on
Commit
4c54085
·
verified ·
1 Parent(s): fe63cd7

Update scripts/text_cleaner_tool.py

Browse files
Files changed (1) hide show
  1. scripts/text_cleaner_tool.py +59 -36
scripts/text_cleaner_tool.py CHANGED
@@ -1,10 +1,30 @@
1
- from smolagents import Tool
2
- from typing import Dict, Any, Optional
 
 
 
 
 
 
3
  import logging
 
4
 
 
 
 
 
 
 
 
 
 
 
 
 
5
  logger = logging.getLogger(__name__)
6
 
7
 
 
8
  class TextCleanerTool(Tool):
9
  """A simplified text cleaner tool that avoids typing issues."""
10
 
@@ -19,9 +39,9 @@ class TextCleanerTool(Tool):
19
  "type": "object",
20
  "description": (
21
  "Optional parameters for text cleaning. Available options: "
22
- "fix_unicode, to_ascii, lower, no_line_breaks, no_urls, no_emails, "
23
- "no_phone_numbers, no_numbers, no_digits, no_currency_symbols, "
24
- "no_punct, no_emoji, lang"
25
  ),
26
  "optional": True,
27
  "nullable": True,
@@ -33,10 +53,13 @@ class TextCleanerTool(Tool):
33
  """
34
  Clean text using the cleantext library with flexible options.
35
 
36
- User-generated content on the Web and in social media is often dirty. Preprocess your scraped data with `clean-text` to create a normalized text representation. For instance, turn this corrupted input:
 
 
37
 
38
  ```
39
- A bunch of \\u2018new\\u2019 references, including [Moana](https://en.wikipedia.org/wiki/Moana_%282016_film%29).
 
40
 
41
 
42
  »Yóù àré rïght <3!«
@@ -50,29 +73,30 @@ class TextCleanerTool(Tool):
50
  "you are right <3!"
51
  ```
52
 
53
- `clean-text` uses ftfy, unidecode and numerous hand-crafted rules, i.e., RegEx.
 
54
 
55
  Example API:
56
  clean("some input",
57
- fix_unicode=True, # fix various unicode errors
58
- to_ascii=True, # transliterate to closest ASCII representation
59
- lower=True, # lowercase text
60
- no_line_breaks=False, # fully strip line breaks as opposed to only normalizing them
61
- no_urls=False, # replace all URLs with a special token
62
- no_emails=False, # replace all email addresses with a special token
63
- no_phone_numbers=False, # replace all phone numbers with a special token
64
- no_numbers=False, # replace all numbers with a special token
65
- no_digits=False, # replace all digits with a special token
66
- no_currency_symbols=False, # replace all currency symbols with a special token
67
- no_punct=False, # remove punctuations
68
- replace_with_punct="", # instead of removing punctuations you may replace them
69
- replace_with_url="<URL>",
70
- replace_with_email="<EMAIL>",
71
- replace_with_phone_number="<PHONE>",
72
- replace_with_number="<NUMBER>",
73
- replace_with_digit="0",
74
- replace_with_currency_symbol="<CUR>",
75
- lang="en" # set to 'de' for German special handling
76
  )
77
  """
78
  # Input validation
@@ -82,16 +106,15 @@ class TextCleanerTool(Tool):
82
  if not isinstance(text, str):
83
  try:
84
  text = str(text)
85
- except Exception as e:
86
- logger.error(f"Failed to convert input to string: {e}")
87
  return f"Error: Could not process input of type {type(text)}"
88
 
89
- # Import cleantext safely
90
- try:
91
- from cleantext import clean
92
- except ImportError:
93
  logger.error(
94
- "cleantext package not installed. Install with: pip install clean-text"
 
95
  )
96
  return "Error: Required dependency 'clean-text' is not installed."
97
 
@@ -133,6 +156,6 @@ class TextCleanerTool(Tool):
133
  try:
134
  # Apply cleantext with parameters
135
  return clean(text, **params)
136
- except Exception as e:
137
- logger.error(f"Error cleaning text: {e}")
138
  return f"Error during text cleaning: {str(e)}"
 
1
+ """
2
+ Text cleaning tool for smolagents.
3
+
4
+ Provides a Tool implementation that wraps the cleantext library for normalizing
5
+ text content with handling for various text transformation options.
6
+ """
7
+
8
+ # Standard library imports
9
  import logging
10
+ from typing import Dict, Any, Optional
11
 
12
+ # Third-party imports
13
+ from smolagents import Tool
14
+
15
+ # Try to import cleantext - handle gracefully if not installed
16
+ try:
17
+ from cleantext import clean
18
+
19
+ CLEANTEXT_AVAILABLE = True
20
+ except ImportError:
21
+ CLEANTEXT_AVAILABLE = False
22
+
23
+ # Configure module logger
24
  logger = logging.getLogger(__name__)
25
 
26
 
27
+ # pylint: disable=too-few-public-methods
28
  class TextCleanerTool(Tool):
29
  """A simplified text cleaner tool that avoids typing issues."""
30
 
 
39
  "type": "object",
40
  "description": (
41
  "Optional parameters for text cleaning. Available options: "
42
+ "fix_unicode, to_ascii, lower, no_line_breaks, no_urls, "
43
+ "no_emails, no_phone_numbers, no_numbers, no_digits, "
44
+ "no_currency_symbols, no_punct, no_emoji, lang"
45
  ),
46
  "optional": True,
47
  "nullable": True,
 
53
  """
54
  Clean text using the cleantext library with flexible options.
55
 
56
+ User-generated content on the Web and in social media is often dirty.
57
+ Preprocess your scraped data with `clean-text` to create a normalized
58
+ text representation. For instance, turn this corrupted input:
59
 
60
  ```
61
+ A bunch of \\u2018new\\u2019 references, including
62
+ [Moana](https://en.wikipedia.org/wiki/Moana_%282016_film%29).
63
 
64
 
65
  »Yóù àré rïght <3!«
 
73
  "you are right <3!"
74
  ```
75
 
76
+ `clean-text` uses ftfy, unidecode and numerous hand-crafted rules,
77
+ i.e., RegEx.
78
 
79
  Example API:
80
  clean("some input",
81
+ fix_unicode=True, # fix various unicode errors
82
+ to_ascii=True, # transliterate to closest ASCII
83
+ lower=True, # lowercase text
84
+ no_line_breaks=False, # normalize line breaks
85
+ no_urls=False, # replace URLs with a token
86
+ no_emails=False, # replace email addresses with token
87
+ no_phone_numbers=False, # replace phone numbers with token
88
+ no_numbers=False, # replace all numbers with token
89
+ no_digits=False, # replace all digits with 0
90
+ no_currency_symbols=False, # replace currency symbols with token
91
+ no_punct=False, # remove punctuations
92
+ replace_with_punct="", # replacement for punctuation
93
+ replace_with_url="<URL>", # replacement for URLs
94
+ replace_with_email="<EMAIL>", # replacement for emails
95
+ replace_with_phone_number="<PHONE>", # replacement for phones
96
+ replace_with_number="<NUMBER>", # replacement for numbers
97
+ replace_with_digit="0", # replacement for digits
98
+ replace_with_currency_symbol="<CUR>", # currency replacement
99
+ lang="en" # language ('en' or 'de' supported)
100
  )
101
  """
102
  # Input validation
 
106
  if not isinstance(text, str):
107
  try:
108
  text = str(text)
109
+ except (ValueError, TypeError) as e:
110
+ logger.error("Failed to convert input to string: %s", e)
111
  return f"Error: Could not process input of type {type(text)}"
112
 
113
+ # Check if cleantext is available
114
+ if not CLEANTEXT_AVAILABLE:
 
 
115
  logger.error(
116
+ "cleantext package not installed. "
117
+ "Install with: pip install clean-text"
118
  )
119
  return "Error: Required dependency 'clean-text' is not installed."
120
 
 
156
  try:
157
  # Apply cleantext with parameters
158
  return clean(text, **params)
159
+ except (ValueError, TypeError, AttributeError) as e:
160
+ logger.error("Error cleaning text: %s", e)
161
  return f"Error during text cleaning: {str(e)}"