| from sklearn.naive_bayes import MultinomialNB, BernoulliNB | |
| from sklearn.model_selection import RandomizedSearchCV | |
| from sklearn.model_selection import StratifiedKFold | |
| from sklearn.pipeline import Pipeline | |
| from sklearn.feature_extraction.text import TfidfVectorizer | |
| import nltk | |
| from nltk.tokenize import word_tokenize | |
| import numpy as np | |
| import logging | |
| from pt_variety_identifier.src.n_grams.model import LanguageIdentifier | |
| class Trainer: | |
| def __init__(self, train_dataset, params) -> None: | |
| self.train_dataset = train_dataset | |
| self.model = LanguageIdentifier(params) | |
| def train(self): | |
| logging.info("Training model...") | |
| fitted_model = self.model.fit( | |
| np.array(self.train_dataset['text']), np.array(self.train_dataset['label'])) | |
| logging.info("Training finished!") | |
| return fitted_model | |
| """ | |
| class Trainer: | |
| def __init__(self, train_dataset, params, n_iter=500) -> None: | |
| nltk.download("stopwords") | |
| nltk.download("punkt") | |
| self.pipeline = Pipeline([ | |
| ('tfidf', TfidfVectorizer( | |
| tokenizer=lambda text: word_tokenize( | |
| text, language='portuguese'), | |
| stop_words=nltk.corpus.stopwords.words('portuguese') | |
| )), | |
| ('clf', BernoulliNB()) | |
| ]) | |
| self.params = params | |
| self.n_iter = n_iter | |
| self.cv = StratifiedKFold(n_splits=2, random_state=42, shuffle=True) | |
| self.search = RandomizedSearchCV( | |
| self.pipeline, | |
| self.params, | |
| scoring='f1_macro', | |
| n_jobs=-1, | |
| n_iter=self.n_iter, | |
| cv=self.cv, | |
| error_score='raise' | |
| ) | |
| self.train_dataset = train_dataset | |
| def train(self): | |
| logging.info("Training model...") | |
| results = self.search.fit( | |
| np.array(self.train_dataset['text']), np.array(self.train_dataset['label'])) | |
| logging.info("Training finished!") | |
| return results, results.best_estimator_ | |
| """ | |