Spaces:
Build error
Build error
Create utils.py
Browse files
utils.py
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import re
|
| 2 |
+
|
| 3 |
+
def clean_text(text: str) -> str:
    """Return *text* reduced to ASCII letters, digits and single spaces.

    The cleaning steps run in this order:

    1. HTML tags (anything between ``<`` and ``>``) are deleted.
    2. ``http://`` / ``https://`` URLs are deleted.
    3. Every character that is not an ASCII letter, a digit or whitespace
       is deleted.
    4. Runs of whitespace (spaces, tabs, newlines, ...) are collapsed to a
       single space and leading/trailing whitespace is dropped.

    Args:
        text: The raw string to clean.

    Returns:
        The cleaned string; ``""`` when nothing survives the filtering.

    Raises:
        TypeError: If *text* is not a string (raised by ``re.sub``).
    """
    # Remove HTML tags.
    text = re.sub(r'<[^>]*?>', '', text)
    # Remove URLs.
    text = re.sub(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', '', text)
    # Remove special characters but keep *all* whitespace: deleting tabs and
    # newlines here would glue together words that were only separated by a
    # line break (e.g. "hello\nworld" -> "helloworld").
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    # split() with no arguments splits on any whitespace run and discards
    # leading/trailing whitespace, so this single step collapses and trims.
    return ' '.join(text.split())
|