File size: 1,088 Bytes
1946eb0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
# clean text
import re
from pyvi import ViTokenizer
from typing import List, Set

# Noise to drop before tokenizing: anything starting with "http" (URLs),
# @mentions, #hashtags, and runs of digits.
CLEAN_PATTERN = re.compile(
    r"http\S+|@\S+|#\S+|\d+"
)

# Every character that is NOT an ASCII letter, a character in the À-ỹ range
# (covers Vietnamese accented letters), "!", "?" or whitespace.
# In a raw string the whitespace class is written with a single backslash;
# the doubled form would put a literal backslash and the letter "s" in the
# allow-list, letting backslashes survive cleaning.
SPECIAL_PATTERN = re.compile(r"[^a-zA-ZÀ-ỹ!?\s]")

# One or more consecutive whitespace characters.
WHITESPACE_PATTERN = re.compile(r"\s+")
def clean_text(text: str) -> str:
    """Return a lower-cased, noise-free version of *text*.

    The text is lower-cased, then every match of ``CLEAN_PATTERN`` and
    ``SPECIAL_PATTERN`` is replaced by a single space. Finally runs of
    whitespace are collapsed to one space and the ends are trimmed.
    """
    cleaned = text.lower()
    # Order matters: URLs/mentions/hashtags must go before their punctuation
    # is stripped, and whitespace is collapsed last to absorb the padding
    # spaces introduced by the earlier substitutions.
    for pattern in (CLEAN_PATTERN, SPECIAL_PATTERN, WHITESPACE_PATTERN):
        cleaned = pattern.sub(" ", cleaned)
    return cleaned.strip()

def tokenize(text: str) -> list:
    """Segment *text* with ``ViTokenizer`` and return the tokens as a list.

    The segmenter's output string is split on whitespace.
    NOTE(review): pyvi is expected to join the syllables of a compound word
    with underscores, so a token may contain "_" — confirm against pyvi docs.
    """
    segmented = ViTokenizer.tokenize(text)
    return segmented.split()

def load_stopwords(path: str) -> Set[str]:
    """Load a stopword list from a UTF-8 text file with one entry per line.

    Surrounding whitespace is stripped from each line, and blank lines are
    skipped so the empty string never ends up in the result.

    Args:
        path: Location of the stopword file.

    Returns:
        The set of distinct, non-empty stopwords found in the file.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    # "utf-8-sig" reads plain UTF-8 unchanged but also discards a leading
    # byte-order mark, which would otherwise be glued to the first stopword.
    with open(path, encoding="utf-8-sig") as f:
        entries = (line.strip() for line in f)
        # NOTE(review): if tokens use "_" inside compound words (pyvi
        # convention — confirm), entries written with spaces will never
        # match; make sure the stopword file uses the same convention.
        return {entry for entry in entries if entry}

def remove_stopwords(tokens: List[str], stopwords: Set[str]) -> List[str]:
    """Return the tokens that are not in *stopwords*, in their original order.

    The input list is not modified; duplicates among the kept tokens are
    preserved.
    """
    kept = []
    for token in tokens:
        if token in stopwords:
            continue
        kept.append(token)
    return kept

def preprocess(text: str, stopwords: Set[str] = None) -> List[str]:
    """Run the full pipeline on *text*: clean, tokenize, then filter.

    Args:
        text: Raw input text.
        stopwords: Optional collection of tokens to drop. When ``None``
            (the default) no stopword filtering is applied.

    Returns:
        The list of tokens that remain after processing.
    """
    tokens = tokenize(clean_text(text))

    # Without a stopword collection there is nothing to filter.
    if stopwords is None:
        return tokens

    return remove_stopwords(tokens, stopwords)