| import nltk |
| from nltk.tokenize import word_tokenize |
| from sklearn.pipeline import Pipeline |
| from sklearn.feature_extraction.text import TfidfVectorizer |
| from sklearn.naive_bayes import BernoulliNB |
| from sklearn.base import BaseEstimator |
| from joblib import load |
| import os |
| import math |
| from tqdm import tqdm |
| import logging |
|
|
|
|
class EnsembleIdentfier(BaseEstimator):
    """Ensemble that answers with its single most confident member model.

    Every ``.joblib`` file found in ``models_path`` is loaded with
    ``joblib.load``. For each sample, every loaded model is asked for class
    probabilities and the class column holding the highest probability seen
    across all models is returned.

    NOTE: ``joblib.load`` unpickles the files, which can execute arbitrary
    code. Only point ``models_path`` at model files you trust.
    """

    def __init__(self, models_path) -> None:
        """Load every ``.joblib`` model stored in ``models_path``.

        Args:
            models_path: Directory scanned (non-recursively) for files whose
                name ends with ``.joblib``.

        Raises:
            OSError: If ``models_path`` cannot be listed.
        """
        super().__init__()
        # BaseEstimator.get_params() (which repr() relies on) reads every
        # constructor argument back from an attribute of the same name.
        self.models_path = models_path
        self.models = []

        # os.listdir() returns entries in arbitrary order; sorting makes the
        # model order, and therefore tie-breaking in _bagging, reproducible.
        for filename in sorted(os.listdir(models_path)):
            if not filename.endswith(".joblib"):
                continue
            logging.info(f"Loading model {filename}")
            self.models.append(load(os.path.join(models_path, filename)))

        if not self.models:
            # Without models every prediction comes out as None.
            logging.warning(f"No .joblib models found in {models_path}")

    def _bagging(self, predictions_proba):
        """Pick the class column with the highest probability over all models.

        Args:
            predictions_proba: One ``predict_proba`` result per model, each
                computed for a single sample, so only row ``0`` is read.

        Returns:
            The column index of the highest probability seen, or ``None``
            when ``predictions_proba`` is empty. On ties the first maximum
            encountered wins (models in order, then columns in order).
        """
        best_prediction = None
        best_proba = -math.inf

        for prediction_proba in predictions_proba:
            # Scan every class column rather than hard-coding columns 0 and 1;
            # for two-class models the outcome is unchanged.
            for label, proba in enumerate(prediction_proba[0]):
                if proba > best_proba:
                    best_prediction = label
                    best_proba = proba

        return best_prediction

    def predict(self, X):
        """Return one predicted class index per sample in ``X``."""
        return self.predict_proba(X)

    def predict_proba(self, X):
        """Return the winning class index for every sample in ``X``.

        NOTE: despite its name this method returns class indices (the
        output of ``_bagging``), not probabilities. That behaviour is kept
        so existing callers keep working.

        Args:
            X: Iterable of samples; each one is passed to every model as a
                single-element batch.

        Returns:
            A list with one class index per sample, in input order.
        """
        final_predictions = []

        # Iterate the samples directly: this is positional, so it also works
        # for containers whose ``X[i]`` lookup is label-based.
        for sample in tqdm(X):
            predictions = [
                model.predict_proba([sample]) for model in self.models
            ]
            final_predictions.append(self._bagging(predictions))

        return final_predictions
|
|
|
|
class LanguageIdentifier(BaseEstimator):
    """TF-IDF + Bernoulli naive Bayes text classifier wrapped in a Pipeline.

    Every estimator method delegates to ``self.pipeline``. ``get_params``
    and ``set_params`` expose the pipeline's parameters, not the ``params``
    dict given to the constructor.
    """

    def __init__(self, params: dict) -> None:
        """Download the NLTK resources and build the pipeline.

        Args:
            params: TF-IDF settings. The keys ``tfidf__ngram_range``
                (indexable; items 0 and 1 are used as min and max n),
                ``tfidf__max_features``, ``tfidf__analyzer`` and
                ``tfidf__lowercase`` are read.

        Raises:
            KeyError: If one of the keys above is missing from ``params``.
        """
        # Function-scope import so this class does not depend on a change to
        # the module's import block.
        from functools import partial

        super().__init__()

        nltk.download("stopwords")
        # NOTE(review): newer NLTK releases may need the "punkt_tab" resource
        # for word_tokenize; confirm against the installed version.
        nltk.download("punkt")

        ngram_range = params['tfidf__ngram_range']

        self.pipeline = Pipeline([
            ('tfidf', TfidfVectorizer(
                # A partial of the module-level word_tokenize can be pickled,
                # unlike a lambda, so the fitted pipeline can be saved with
                # joblib/pickle. It is called exactly like the lambda was.
                tokenizer=partial(word_tokenize, language='portuguese'),
                stop_words=nltk.corpus.stopwords.words('portuguese'),
                ngram_range=(ngram_range[0], ngram_range[1]),
                max_features=params['tfidf__max_features'],
                analyzer=params['tfidf__analyzer'],
                lowercase=params['tfidf__lowercase'],
            )),
            ('clf', BernoulliNB()),
        ])

    def fit(self, X, y):
        """Fit the pipeline on ``X`` and ``y``.

        Returns:
            Whatever ``self.pipeline.fit`` returns (the pipeline, not this
            wrapper).
        """
        return self.pipeline.fit(X, y)

    def predict(self, X):
        """Return the pipeline's predicted labels for ``X``."""
        return self.pipeline.predict(X)

    def predict_proba(self, X):
        """Return the pipeline's class probabilities for ``X``."""
        return self.pipeline.predict_proba(X)

    def score(self, X, y):
        """Return the pipeline's score for ``X`` against ``y``."""
        return self.pipeline.score(X, y)

    def get_params(self, deep=True):
        """Return the parameters of the wrapped pipeline."""
        return self.pipeline.get_params(deep)

    def set_params(self, **params):
        """Set parameters on the wrapped pipeline.

        Returns:
            Whatever ``self.pipeline.set_params`` returns (the pipeline, not
            this wrapper).
        """
        return self.pipeline.set_params(**params)

    def __str__(self) -> str:
        """Return the string form of the wrapped pipeline."""
        return str(self.pipeline)
|
|