| """# MODEL BUILDING""" |
| |
| import numpy as np |
| import pandas as pd |
| |
| |
| |
| |
| |
|
|
| import nltk |
| from nltk.corpus import stopwords |
# Fetch the NLTK data used by remove_stopword(): the stop-word lists and the
# Punkt tokenizer models behind nltk.word_tokenize.
nltk.download('stopwords')
nltk.download('punkt')
# NLTK >= 3.8.2 loads the tokenizer from the 'punkt_tab' resource instead of
# the pickled 'punkt' models; without it word_tokenize raises LookupError.
# On older releases this is only an extra download.
nltk.download('punkt_tab')
|
|
# Lazily built cache so the stop-word corpus is read once, not on every call.
_STOPWORDS_KEEPING_NOT = None


def _stopwords_keeping_not():
    """Return the English stop-word set with ``'not'`` left out (built once)."""
    global _STOPWORDS_KEEPING_NOT
    if _STOPWORDS_KEEPING_NOT is None:
        words = set(nltk.corpus.stopwords.words('english'))
        # 'not' is kept in the text, so it must not be treated as a stop word.
        words.discard('not')
        _STOPWORDS_KEEPING_NOT = frozenset(words)
    return _STOPWORDS_KEEPING_NOT


def remove_stopword(text):
    """Return *text* with English stop words removed, keeping ``'not'``.

    The text is split with ``nltk.word_tokenize``; tokens that exactly match
    (case-sensitively) an entry of NLTK's English stop-word list are dropped,
    except ``'not'``, and the remaining tokens are joined with single spaces.

    Args:
        text: The string to filter.

    Returns:
        A space-separated string of the tokens that were kept.
    """
    stop_words = _stopwords_keeping_not()
    kept = [tok for tok in nltk.word_tokenize(text) if tok not in stop_words]
    return ' '.join(kept)
| |
|
|
# Load 'train-cleaned.csv' (path relative to the working directory) into a
# DataFrame; later steps read its 'Extracted text' and 'saliency' columns.
data = pd.read_csv('train-cleaned.csv')
# Bare expression kept from the notebook: it displays the frame in an
# interactive session and has no effect when run as a script.
data
|
|
| import nltk |
def punc_clean(text):
    """Return *text* with every ASCII punctuation character removed.

    Punctuation means the characters in ``string.punctuation``; all other
    characters, including whitespace, are kept in their original order.

    Args:
        text: The string to clean.

    Returns:
        A new string without punctuation characters.
    """
    import string as st

    # One C-level pass over the string instead of a per-character Python loop.
    return text.translate(str.maketrans('', '', st.punctuation))
# Strip punctuation from every entry of the 'Extracted text' column.
# NOTE(review): the result is stored under an empty column name ('') while the
# TF-IDF vectorizer later in this file is fitted on the raw 'Extracted text'
# column, so this cleaned copy looks unused. Confirm which column was intended.
data[''] = data['Extracted text'].apply(punc_clean)
| |
|
|
| from sklearn.feature_extraction.text import TfidfVectorizer |
|
|
# TF-IDF features over unigrams and bigrams; min_df=1 keeps every term that
# appears in at least one document.
vectr = TfidfVectorizer(ngram_range=(1, 2), min_df=1)

# fit_transform learns the vocabulary and IDF weights and builds the document
# matrix in one pass. It gives the same result as fit() followed by
# transform() on the same data, and vectr is left fitted for later transforms.
vect_X = vectr.fit_transform(data['Extracted text'])
|
|
| |
|
|
| from sklearn.svm import SVC |
| from sklearn.linear_model import LogisticRegression |
| from sklearn.ensemble import VotingClassifier |
|
|
# Base estimators for the ensemble.
# NOTE(review): voting='hard' below combines predicted labels only, so the
# probability estimates enabled by probability=True (which add training cost)
# look unused here. Kept in case later code relies on predict_proba; confirm.
svm_classifier = SVC(kernel='linear', probability=True)
logistic_classifier = LogisticRegression()

# Majority-vote ensemble over the two classifiers' predicted labels.
# NOTE(review): with only two voters any disagreement is a tie; check how
# scikit-learn's tie-breaking resolves it and whether that is acceptable.
model = VotingClassifier(
    estimators=[
        ('svm', svm_classifier),
        ('logistic', logistic_classifier),
    ],
    voting='hard',
)
|
|
|
|
# Train the ensemble on the TF-IDF matrix with the 'saliency' column as the
# target. fit() returns the estimator itself, so clf and model refer to the
# same fitted object.
clf = model.fit(vect_X, data['saliency'])
| |
|
|
| |
|
|
| |
|
|
| |
|
|
| |
|
|
|
|