Upload preprocess.py
#10
by
robzjgman
- opened
1 _ Data Preprocessing/preprocess.py
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import re
|
| 2 |
+
|
| 3 |
+
def remove_symbols(text):
    """Return *text* with everything except ASCII letters, digits and spaces removed.

    Any run of characters outside ``A-Z``, ``a-z``, ``0-9`` and the plain
    space character is deleted (not replaced), so punctuation, tabs,
    newlines and non-ASCII letters all disappear from the result.
    """
    # Matches one or more consecutive characters that are NOT in the whitelist.
    disallowed_run = re.compile(r'[^A-Za-z0-9 ]+')
    return disallowed_run.sub('', text)
|
| 6 |
+
|
| 7 |
+
def word_tokens(text):
    """Split *text* into a list of whitespace-separated word tokens.

    Runs of consecutive whitespace (spaces, tabs, newlines, ...) are treated
    as a single separator, and leading/trailing whitespace is ignored, so the
    result never contains empty-string tokens. An empty or whitespace-only
    input yields an empty list.

    Args:
        text: The string to tokenize.

    Returns:
        A list of the non-empty tokens, in their original order.
    """
    # str.split() with no separator collapses whitespace runs; splitting on a
    # single whitespace character would emit '' between adjacent separators.
    return text.split()
|
| 10 |
+
|
| 11 |
+
# Test
|
| 12 |
+
# if __name__ == "__main__":
|
| 13 |
+
# sample = 'goo very light lang..... ganda naman pero mas maganda ako!!!! hahahahahabababababababababababahahahahahahahahaha'
|
| 14 |
+
# print("Original:", sample)
|
| 15 |
+
# print("Cleaned:", remove_symbols(sample))
|
| 16 |
+
# print("Tokens:", word_tokens(sample))
|