Upload preprocess.py

#10
by robzjgman - opened
1 _ Data Preprocessing/preprocess.py ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+
3
def remove_symbols(text: str) -> str:
    """Strip every character that is not an ASCII letter, digit, or whitespace.

    Whitespace (spaces, tabs, newlines) is preserved so that words separated
    by a tab or line break are not fused together before tokenization.
    Note that non-ASCII letters (e.g. accented characters) are removed too,
    because only ``A-Z``, ``a-z`` and ``0-9`` are kept.

    Args:
        text: The raw input string.

    Returns:
        The input with all other characters deleted (not replaced).
    """
    # ``\s`` instead of a literal space: deleting "\n" or "\t" would glue
    # neighbouring words into a single token.
    return re.sub(r'[^A-Za-z0-9\s]+', '', text)
6
+
7
def word_tokens(text: str) -> list:
    """Split *text* into a list of whitespace-separated word tokens.

    Runs of consecutive whitespace count as a single separator, and leading
    or trailing whitespace is ignored, so the result never contains empty
    strings. An empty or whitespace-only input yields an empty list.

    Args:
        text: The string to tokenize.

    Returns:
        The non-empty tokens in their original order.
    """
    # str.split() with no argument collapses whitespace runs; splitting on a
    # single whitespace character would emit "" for every extra space.
    return text.split()
10
+
11
+ # Test
12
+ # if __name__ == "__main__":
13
+ # sample = 'goo very light lang..... ganda naman pero mas maganda ako!!!! hahahahahabababababababababababahahahahahahahahaha'
14
+ # print("Original:", sample)
15
+ # print("Cleaned:", remove_symbols(sample))
16
+ # print("Tokens:", word_tokens(sample))