import logging
import math
import os

import nltk
from joblib import load
from nltk.tokenize import word_tokenize
from sklearn.base import BaseEstimator
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import BernoulliNB
from sklearn.pipeline import Pipeline
from tqdm import tqdm


class EnsembleIdentfier(BaseEstimator):
    """Ensemble of pre-trained classifiers loaded from ``*.joblib`` files.

    For every sample each loaded model is asked for its class
    probabilities, and the class with the single highest probability
    reported by any model wins.

    The misspelled class name is kept so existing imports keep working.
    """

    def __init__(self, models_path) -> None:
        """Load every ``*.joblib`` file found directly inside *models_path*.

        Args:
            models_path: Directory containing the serialized models.

        Raises:
            OSError: If *models_path* cannot be listed (from ``os.listdir``).
        """
        super().__init__()
        # BaseEstimator.get_params() / repr() read each __init__ argument
        # back with getattr(), so the argument must be kept as an attribute.
        self.models_path = models_path
        self.models = []
        # Sorted so the load order (and therefore tie-breaking in _bagging)
        # does not depend on the filesystem's listing order.
        for filename in sorted(os.listdir(models_path)):
            if not filename.endswith(".joblib"):
                continue
            logging.info("Loading model %s", filename)
            # NOTE: joblib.load unpickles arbitrary objects; only point
            # models_path at files from a trusted source.
            self.models.append(load(os.path.join(models_path, filename)))
        if not self.models:
            logging.warning("No .joblib models found in %s", models_path)

    def _bagging(self, predictions_proba):
        """Pick the class that any single model is most confident about.

        Args:
            predictions_proba: One ``predict_proba`` result per model, each
                computed for a single sample, so ``result[0]`` is that
                sample's row of per-class probabilities.

        Returns:
            The column index of the highest probability seen across all
            models, or ``None`` when *predictions_proba* is empty.
        """
        # NOTE(review): the result is a column index, not a class label; this
        # presumably relies on all models sharing the same class order.
        best_prediction = None
        best_proba = -math.inf
        for prediction_proba in predictions_proba:
            for class_index, proba in enumerate(prediction_proba[0]):
                # Strict ">" keeps the earliest model / lowest index on ties.
                if proba > best_proba:
                    best_prediction = class_index
                    best_proba = proba
        return best_prediction

    def predict(self, X):
        """Return the ensemble's winning class index for each sample in *X*."""
        return self.predict_proba(X)

    def predict_proba(self, X):
        """Return the ensemble's winning class index for each sample in *X*.

        NOTE: despite its name this method returns class indices (the output
        of ``_bagging``), not probabilities; the behavior is kept unchanged
        for existing callers.

        Args:
            X: Iterable of samples; each sample is passed to every model's
                ``predict_proba`` as a one-element batch.

        Returns:
            A list with one entry per sample, in iteration order.
        """
        final_predictions = []
        # Iterate X itself rather than X[i]: positional indexing breaks on
        # containers with label-based lookup (e.g. a shuffled pandas Series).
        for sample in tqdm(X):
            predictions = [
                model.predict_proba([sample]) for model in self.models
            ]
            final_predictions.append(self._bagging(predictions))
        return final_predictions


def _tokenize_portuguese(text):
    """Tokenize *text* with ``word_tokenize(..., language='portuguese')``.

    Defined at module level (instead of as a lambda) so that a pipeline
    holding a reference to it can be serialized with pickle / joblib.
    """
    return word_tokenize(text, language='portuguese')


class LanguageIdentifier(BaseEstimator):
    """TF-IDF + Bernoulli Naive Bayes text classifier.

    Thin wrapper around a scikit-learn ``Pipeline``; the estimator API
    methods below all delegate to that pipeline.
    """

    def __init__(self, params: dict) -> None:
        """Build the pipeline from *params*.

        Downloads the NLTK ``stopwords`` and ``punkt`` resources on every
        construction.

        Args:
            params: Must provide the keys ``tfidf__ngram_range`` (indexable,
                at least two items), ``tfidf__max_features``,
                ``tfidf__analyzer`` and ``tfidf__lowercase``.

        Raises:
            KeyError: If one of the required keys is missing.
        """
        nltk.download("stopwords")
        nltk.download("punkt")
        ngram_range = params['tfidf__ngram_range']
        vectorizer = TfidfVectorizer(
            tokenizer=_tokenize_portuguese,
            stop_words=nltk.corpus.stopwords.words('portuguese'),
            ngram_range=(ngram_range[0], ngram_range[1]),
            max_features=params['tfidf__max_features'],
            analyzer=params['tfidf__analyzer'],
            lowercase=params['tfidf__lowercase'],
        )
        self.pipeline = Pipeline([
            ('tfidf', vectorizer),
            ('clf', BernoulliNB()),
        ])

    def fit(self, X, y):
        """Fit the pipeline on *X*, *y*; returns what ``Pipeline.fit`` returns."""
        return self.pipeline.fit(X, y)

    def predict(self, X):
        """Return the pipeline's predicted labels for *X*."""
        return self.pipeline.predict(X)

    def predict_proba(self, X):
        """Return the pipeline's class probabilities for *X*."""
        return self.pipeline.predict_proba(X)

    def score(self, X, y):
        """Return the pipeline's score on *X*, *y*."""
        return self.pipeline.score(X, y)

    def get_params(self, deep=True):
        """Return the wrapped pipeline's parameters (not this wrapper's)."""
        return self.pipeline.get_params(deep)

    def set_params(self, **params):
        """Set parameters on the wrapped pipeline; returns its result."""
        return self.pipeline.set_params(**params)

    def __str__(self) -> str:
        """Return the string form of the wrapped pipeline."""
        return str(self.pipeline)