"""Load labelled training text, fit a TF-IDF vectorizer and expose a classifier.

Importing this module has side effects: it reads the training CSV, fits
``tf`` on the first 499 preprocessed texts, and then either loads the pickled
classifier from ``SEM/model/sen_model.pkl`` or trains and saves a new one,
depending on ``load_pretrain_model``.
"""
import csv

import joblib
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import f1_score, recall_score
from sklearn.naive_bayes import MultinomialNB

from SEM.text_preprocessing import pre_process_title


def readtrain():
    """Read the training CSV and return its first two columns.

    The first non-blank row is treated as a header and skipped. Completely
    blank lines are ignored instead of raising ``IndexError``.

    Returns:
        A two-element list ``[first_column, second_column]`` where each
        element is a list of strings, one entry per data row.

    Raises:
        OSError: If the CSV file cannot be opened.
        IndexError: If a non-blank row has fewer than two fields.
    """
    # newline='' lets the csv module do its own newline handling, so quoted
    # fields containing line breaks are parsed correctly.
    # NOTE(review): no encoding is given, so the platform default is used --
    # confirm this matches how the CSV was written.
    with open('SEM/training_data/personal_type.csv', 'rt', newline='') as csvfile:
        rows = [row for row in csv.reader(csvfile) if row]

    data_rows = rows[1:]  # drop the header row
    first_column = [row[0] for row in data_rows]
    second_column = [row[1] for row in data_rows]
    return [first_column, second_column]


def segmentWord(cont):
    """Return a list with ``pre_process_title`` applied to each item of *cont*."""
    return [pre_process_title(item) for item in cont]


train = readtrain()
# The second CSV column is used as the input text (after preprocessing) and
# the first column is used as the classification target.
content = segmentWord(train[1])
textMark = train[0]

# Only the first 499 samples are used for fitting.
train_content = content[:499]
train_textMark = textMark[:499]
# NOTE(review): the disabled evaluation split below overlaps the training
# slice above (indices 400-498 are in both) -- fix before re-enabling.
# test_content = content[400:499]
# test_textMark = textMark[400:499]

# The vectorizer is always fitted, even when the classifier is loaded from
# disk, so that ``tf`` is available for transforming new text.
tf = TfidfVectorizer(max_df=0.5)
train_features = tf.fit_transform(train_content)

load_pretrain_model = True

if load_pretrain_model:
    # joblib.load unpickles the file, so the model file must come from a
    # trusted source.
    # NOTE(review): presumably this model was trained on features produced by
    # the same vectorizer configuration and data -- confirm.
    clf_type = joblib.load('SEM/model/sen_model.pkl')
else:
    clf_type = MultinomialNB(alpha=0.1)
    clf_type.fit(train_features, train_textMark)
    joblib.dump(clf_type, 'SEM/model/sen_model.pkl')
    # test_features = tf.transform(test_content)
    # print("clf test score: ", clf_type.score(test_features, test_textMark))

# print("clf training score: ", clf_type.score(train_features, train_textMark))
# test_features = tf.transform(test_content)
# print("clf test score: ", clf_type.score(test_features, test_textMark))