aaron-rae-nicolas's picture
Upload preprocess.py (#10)
d2b597c verified
raw
history blame contribute delete
491 Bytes
import re
def remove_symbols(text):
    """Delete every character that is not an ASCII letter, digit, or space.

    Args:
        text: The string to clean.

    Returns:
        A copy of ``text`` in which each run of characters outside
        ``A-Z``, ``a-z``, ``0-9`` and the plain space has been removed.
        Tabs, newlines, punctuation and non-ASCII letters are all dropped;
        nothing is inserted in their place.
    """
    disallowed_run = re.compile(r'[^A-Za-z0-9 ]+')
    return disallowed_run.sub('', text)
def word_tokens(text):
    """Split ``text`` into a list of whitespace-separated tokens.

    Runs of consecutive whitespace are treated as a single separator and
    leading/trailing whitespace is ignored, so the result never contains
    empty-string tokens.

    Args:
        text: The string to tokenize.

    Returns:
        A list of the non-empty tokens in ``text``, in order. An empty or
        whitespace-only string yields an empty list.
    """
    # str.split() with no separator splits on whitespace runs; splitting on
    # a single r"\s" would emit '' between adjacent whitespace characters.
    return text.split()
# Test
# if __name__ == "__main__":
#     sample = 'goo very light lang..... ganda naman pero mas maganda ako!!!! hahahahahabababababababababababahahahahahahahahaha'
#     print("Original:", sample)
#     print("Cleaned:", remove_symbols(sample))
#     print("Tokens:", word_tokens(sample))