File size: 2,254 Bytes
e8e72fb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.linear_model import LogisticRegression
import sys
import os
# Compute the absolute path two directory levels above the directory that
# contains this file and append it to sys.path. This happens before the
# `src.*` imports below -- presumably so they resolve when this file is run
# directly as a script (confirm against the repository layout).
parent_root = os.path.abspath(os.path.join(os.path.dirname(os.path.abspath(__file__)), os.pardir, os.pardir))
sys.path.append(parent_root)
# NOTE(review): `Dataset`, used in the script section of this file, is not
# imported by name anywhere visible; presumably it is provided by this star
# import -- verify against src/dataset.
from src.dataset import *

from src.config.configs import Params
# Module-level Params instance, created at import time. It is not referenced
# in the visible part of this file (the script section builds its own).
param = Params()

## Naive Bayes, Logistic Regression

class BaseLine(object):
    """Baseline model made of a vectorizer step followed by a model step.

    The two components are chained in a ``Pipeline`` when ``train`` is
    called, so raw inputs are passed through the vectorizer before reaching
    the model.
    """

    def __init__(self, vectorizer, model):
        """Store the two pipeline components.

        Args:
            vectorizer: Object used as the first pipeline step (the script
                section of this file passes a ``TfidfVectorizer``).
            model: Object used as the final pipeline step (the script
                section of this file passes a ``LogisticRegression``).
        """
        self.vectorizer = vectorizer
        self.model = model

    def train(self, X_train, y_train, X_val, y_val, X_test, y_test):
        """Fit the pipeline on the training split and score the other two.

        Args:
            X_train: Training inputs handed to ``Pipeline.fit``.
            y_train: Training targets handed to ``Pipeline.fit``.
            X_val: Validation inputs handed to ``Pipeline.score``.
            y_val: Validation targets handed to ``Pipeline.score``.
            X_test: Test inputs handed to ``Pipeline.score``.
            y_test: Test targets handed to ``Pipeline.score``.

        Returns:
            A ``(val_score, test_score)`` tuple holding the values returned
            by ``Pipeline.score`` for the validation and test splits.
        """
        pipeline = Pipeline([
            ("vectorizer", self.vectorizer),
            ("model", self.model),
        ])
        print("Vectorizer: ", self.vectorizer)
        print("Model: ", self.model)
        pipeline.fit(X_train, y_train)

        # Score each split exactly once and reuse the value for both the
        # printout and the return value; calling score() again would push the
        # whole split through the vectorizer and model a second time.
        val_score = pipeline.score(X_val, y_val)
        test_score = pipeline.score(X_test, y_test)
        print("Model accuracy on val set", val_score)
        print("Model accuracy on test set", test_score)
        return val_score, test_score
    
if __name__ == "__main__":
    print("---------------Baseline ML model------------------------")
    params = Params()

    # Build the dataset from the train/val/test locations held in the config.
    dataset = Dataset(
        train_txt=params.TRAIN_DIR,
        val_txt=params.VAL_DIR,
        test_txt=params.TEST_DIR,
        num_inputs=1,
    )

    # Baseline under test: TF-IDF vectorizer feeding a logistic regression.
    baseline = BaseLine(vectorizer=TfidfVectorizer(), model=LogisticRegression())

    # Fit on the training split, then collect the validation and test scores.
    val_score, test_score = baseline.train(
        X_train=dataset.train_sentences,
        y_train=dataset.y_train,
        X_val=dataset.val_sentences,
        y_val=dataset.y_val,
        X_test=dataset.test_sentences,
        y_test=dataset.y_test,
    )

    print("Baseline Val score: ", val_score)
    print("Baseline Test score: ", test_score)