# NOTE: in the code visible here, the sklearn / nltk imports are referenced
# only by the disabled implementation kept in the string at the bottom of
# this module. They are left in place because other code may rely on them.
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
from nltk.tokenize import word_tokenize
import numpy as np
import logging
from pt_variety_identifier.src.n_grams.model import LanguageIdentifier


class Trainer:
    """Fit a ``LanguageIdentifier`` on a labelled text dataset.

    The trainer keeps a reference to the dataset and builds the model once,
    at construction time; ``train`` then fits that model on the stored data.
    """

    def __init__(self, train_dataset, params) -> None:
        """Store the dataset and build the model.

        Args:
            train_dataset: Container indexed by the keys ``'text'`` and
                ``'label'`` (both are read in ``train``).
            params: Passed unchanged to the ``LanguageIdentifier``
                constructor.
        """
        self.train_dataset = train_dataset
        self.model = LanguageIdentifier(params)

    def train(self):
        """Fit the model on the stored dataset and return the fit result.

        The ``'text'`` and ``'label'`` columns are converted to NumPy arrays
        before being handed to ``self.model.fit``.

        Returns:
            Whatever ``self.model.fit`` returns (presumably the fitted
            model -- confirm against ``LanguageIdentifier.fit``).
        """
        logging.info("Training model...")

        texts = np.array(self.train_dataset['text'])
        labels = np.array(self.train_dataset['label'])
        trained = self.model.fit(texts, labels)

        logging.info("Training finished!")
        return trained


# The string literal below is never executed. It holds an earlier,
# RandomizedSearchCV-based version of ``Trainer`` kept for reference.
"""
class Trainer:
    def __init__(self, train_dataset, params, n_iter=500) -> None:
        nltk.download("stopwords")
        nltk.download("punkt")

        self.pipeline = Pipeline([
            ('tfidf', TfidfVectorizer(
                tokenizer=lambda text: word_tokenize(
                    text, language='portuguese'),
                stop_words=nltk.corpus.stopwords.words('portuguese')
            )),
            ('clf', BernoulliNB())
        ])

        self.params = params
        self.n_iter = n_iter
        self.cv = StratifiedKFold(n_splits=2, random_state=42, shuffle=True)

        self.search = RandomizedSearchCV(
            self.pipeline,
            self.params,
            scoring='f1_macro',
            n_jobs=-1,
            n_iter=self.n_iter,
            cv=self.cv,
            error_score='raise'
        )

        self.train_dataset = train_dataset

    def train(self):
        logging.info("Training model...")

        results = self.search.fit(
            np.array(self.train_dataset['text']),
            np.array(self.train_dataset['label']))

        logging.info("Training finished!")

        return results, results.best_estimator_
"""