import csv
import joblib
from sklearn.naive_bayes import MultinomialNB
from SEM.text_preprocessing import pre_process_title
from sklearn.feature_extraction.text import TfidfVectorizer
def readtrain():
    """Load the first two columns of ``SEM/training_data/title.csv``.

    The first row of the file is treated as a header and skipped.  Completely
    blank lines (which ``csv.reader`` yields as empty lists) are ignored
    instead of raising ``IndexError``.

    Returns:
        A two-element list ``[first_column, second_column]``.  Each element
        is a list of strings holding column 0 / column 1 of every data row,
        in file order.  Both lists are empty when the file has no data rows.

    Raises:
        OSError: if the CSV file cannot be opened.
        IndexError: if a non-blank data row has fewer than two fields.
    """
    # newline='' hands line-ending handling to the csv module, as its
    # documentation requires; otherwise quoted fields that contain embedded
    # line breaks are not read back faithfully.
    # NOTE(review): no explicit encoding is given, so the platform default is
    # used -- confirm the encoding of the training file.
    with open('SEM/training_data/title.csv', 'rt', newline='') as csvfile:
        reader = csv.reader(csvfile)
        next(reader, None)  # skip the header row; no-op on an empty file
        data_rows = [row for row in reader if row]
    first_column = [row[0] for row in data_rows]
    second_column = [row[1] for row in data_rows]
    return [first_column, second_column]
def segmentWord(cont):
    """Run ``pre_process_title`` over every item of ``cont``.

    Args:
        cont: Iterable of items accepted by ``pre_process_title``
            (presumably raw title strings, judging by the helper's name).

    Returns:
        A new list holding the helper's result for each item, in the same
        order as the input.
    """
    return [pre_process_title(raw_title) for raw_title in cont]
# Module-level pipeline, executed at import time: read the CSV, clean the
# text column, fit the TF-IDF vectorizer, then either load a saved classifier
# or train and save a fresh one.
train = readtrain()
# Column 1 of the CSV is the text that gets cleaned; column 0 is the target
# passed to the classifier.
content = segmentWord(train[1])
textMark = train[0]

# No hold-out split is active: every row is used for fitting.
# test_content = content[450:508]
# test_textMark = textMark[450:508]
train_content = list(content)
train_textMark = list(textMark)

# max_df=0.5: terms whose document frequency exceeds 50% are ignored.
tf = TfidfVectorizer(max_df=0.5)
train_features = tf.fit_transform(train_content)

# Flip to False to retrain from the CSV and overwrite the saved model.
load_pretrain_model = True
if load_pretrain_model:
    # NOTE(review): joblib.load unpickles the file -- only load model files
    # from a trusted source.
    clf = joblib.load('SEM/model/para_model.pkl')
else:
    clf = MultinomialNB(alpha=0.1)
    clf.fit(train_features, train_textMark)
    joblib.dump(clf, 'SEM/model/para_model.pkl')
|