File size: 969 Bytes
a90b4b9
45674cb
 
 
a90b4b9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
fd66c88
45674cb
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
import re
from nltk.corpus.reader import pickle
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer


def clean_text(text):
    """Normalize raw text into a space-separated string of stemmed tokens.

    The text is broken into runs of word characters (letters, digits and
    underscore); punctuation, symbols and every kind of whitespace act only
    as separators.  English stop words are dropped and each remaining token
    is reduced to its Snowball stem.

    Args:
        text: The raw input string.

    Returns:
        The surviving stems joined by single spaces, or an empty string
        when no token survives.
    """
    stop_words = set(stopwords.words("english"))
    english_stemmer = SnowballStemmer("english")

    # Extracting maximal runs of word characters discards symbols and
    # collapses repeated/leading/trailing whitespace in a single pass.
    tokens = re.findall(r"\w+", text)

    # The NLTK English stop-word list is lowercase and the Snowball stemmer
    # lowercases its output, so the membership test must be
    # case-insensitive; otherwise "The" would slip through the filter and
    # reappear in the result as the stop word "the".
    return " ".join(
        english_stemmer.stem(token)
        for token in tokens
        if token.lower() not in stop_words
    )

def preprocess_pipeline(text):
    """Run the preprocessing pipeline on ``text`` and return the result.

    The pipeline currently has a single stage: the input is handed to
    ``clean_text`` and that function's return value is passed back
    unchanged.

    Args:
        text: The value to preprocess; forwarded as-is to ``clean_text``.

    Returns:
        Whatever ``clean_text`` returns for ``text``.
    """
    cleaned = clean_text(text)
    return cleaned

def vectorizer(text):
    """Transform ``text`` with the pickled count vectorizer.

    The vectorizer object is loaded from
    ``vectorizers/count_vectorizer.pkl`` (a relative path, so it is
    resolved against the current working directory) on every call.

    Args:
        text: Passed unchanged to the loaded object's ``transform`` method.
            NOTE(review): presumably the pickle holds a scikit-learn
            ``CountVectorizer``, which expects an iterable of documents
            rather than a bare string -- confirm against the callers.

    Returns:
        Whatever the loaded object's ``transform`` returns.

    Raises:
        FileNotFoundError: If the pickle file does not exist.
    """
    # SECURITY: unpickling executes arbitrary code embedded in the file.
    # Only load this artifact from a trusted, access-controlled location.
    # The context manager guarantees the handle is closed even when
    # unpickling fails.
    with open("vectorizers/count_vectorizer.pkl", "rb") as pickle_file:
        count_vectorizer = pickle.load(pickle_file)
    return count_vectorizer.transform(text)