| import re | |
| import string | |
| from pyvi.ViTokenizer import tokenize | |
| import bm25s | |
# Matches one markup tag. A negated character class is used instead of
# ``.*?`` so that tags whose attributes wrap onto another line are removed too
# (``.`` does not match newlines without re.DOTALL).
_TAG_RE = re.compile(r'<[^>]*>')

# Matches a run of whitespace; group 1 ends up holding the LAST character of
# the run, which is what the run gets collapsed to.
_WHITESPACE_RUN_RE = re.compile(r'(\s)+')


def clean_text(text: str) -> str:
    """Strip markup tags from *text* and collapse whitespace runs.

    Processing order:

    1. Every ``<...>`` span is deleted (including tags that span lines).
    2. Leading and trailing whitespace is stripped.
    3. Each run of consecutive whitespace characters is replaced by the
       last character of that run, so ``"a \\n b"`` becomes ``"a b"`` and
       ``"a\\t\\nb"`` becomes ``"a\\nb"``.

    Args:
        text: Raw input string, possibly containing HTML-like tags.

    Returns:
        The cleaned string.
    """
    without_tags = _TAG_RE.sub('', text).strip()
    return _WHITESPACE_RUN_RE.sub(r'\1', without_tags)
# Translation table mapping every ASCII punctuation character except "_" to a
# single space. Built once at import time so each call is a single pass over
# the string instead of one ``str.replace`` per punctuation character.
_PUNCTUATION_TO_SPACE = str.maketrans(
    {char: ' ' for char in string.punctuation if char != '_'}
)


def normalize_text(text: str) -> str:
    """Replace punctuation with spaces, lowercase, and strip *text*.

    Every character in ``string.punctuation`` other than the underscore is
    replaced by one space (the underscore is deliberately left intact).
    Consecutive spaces produced this way are NOT collapsed, so
    ``"Hello, World!"`` becomes ``"hello  world"`` (two spaces).

    Args:
        text: The string to normalize.

    Returns:
        The lowercased string with punctuation blanked out and surrounding
        whitespace removed.
    """
    return text.translate(_PUNCTUATION_TO_SPACE).lower().strip()
def process_text(text):
    """Run the full preprocessing pipeline on *text* and return the result.

    The stages are applied in this order:

    1. ``clean_text`` - remove markup tags and collapse whitespace.
    2. ``tokenize`` (from ``pyvi.ViTokenizer``) - word segmentation.
       NOTE(review): presumably this joins the syllables of a multi-syllable
       word with "_", which would explain why ``normalize_text`` preserves
       underscores - confirm against the pyvi documentation.
    3. ``normalize_text`` - blank out punctuation, lowercase, and strip.
    """
    cleaned = clean_text(text)
    segmented = tokenize(cleaned)
    return normalize_text(segmented)