# LCA-PORVID's picture
# Upload 34 files
# ebdb5af verified
import nltk
from nltk.tokenize import word_tokenize
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import BernoulliNB
from sklearn.base import BaseEstimator
from joblib import load
import os
import math
from tqdm import tqdm
import logging
class EnsembleIdentfier(BaseEstimator):
    """Ensemble of pre-trained classifiers combined by maximum confidence.

    Every ``*.joblib`` file found directly inside ``models_path`` is loaded
    when the object is constructed. For each sample, every loaded model is
    asked for its class probabilities, and the class index holding the single
    highest probability across all models is the ensemble's answer.
    """

    def __init__(self, models_path) -> None:
        """Load every ``.joblib`` model stored in ``models_path``.

        Args:
            models_path: Directory containing the serialized models.

        Raises:
            OSError: Propagated from ``os.listdir`` when ``models_path``
                cannot be listed.
        """
        super().__init__()
        # Keep the constructor argument under the same attribute name:
        # ``BaseEstimator.get_params`` (used by ``repr``) looks each
        # ``__init__`` parameter up on the instance.
        self.models_path = models_path
        self.models = []
        # ``os.listdir`` returns entries in arbitrary order; sorting keeps the
        # model order, and therefore tie-breaking in ``_bagging``,
        # reproducible across machines.
        for filename in sorted(os.listdir(models_path)):
            if not filename.endswith(".joblib"):
                continue
            logging.info(f"Loading model {filename}")
            # NOTE: joblib files are pickle-based, so loading one can execute
            # arbitrary code. Only point ``models_path`` at trusted files.
            self.models.append(load(os.path.join(models_path, filename)))
        if not self.models:
            # Without models every prediction is ``None``; make that visible.
            logging.warning(f"No .joblib models found in {models_path}")

    def _bagging(self, predictions_proba):
        """Return the class index with the highest probability of any model.

        Args:
            predictions_proba: Iterable with one ``predict_proba`` result per
                model, each computed for a single sample, so row ``0`` holds
                that sample's class probabilities.

        Returns:
            Column index of the largest probability seen, or ``None`` when
            ``predictions_proba`` is empty. Ties keep the first candidate
            encountered (earlier model first, then lower class index).
        """
        best_prediction = None
        best_proba = -math.inf
        for prediction_proba in predictions_proba:
            # Scan every class column instead of hard-coding columns 0 and 1;
            # for two-class models this is the same comparison sequence.
            for label, proba in enumerate(prediction_proba[0]):
                if proba > best_proba:
                    best_prediction = label
                    best_proba = proba
        return best_prediction

    def predict(self, X):
        """Return one predicted class index per sample in ``X``.

        Thin alias of ``predict_proba``, which already returns class indices.
        """
        return self.predict_proba(X)

    def predict_proba(self, X):
        """Return the ensemble's predicted class index for each sample.

        Despite its name, this method returns class indices rather than
        probabilities; that contract is kept for backward compatibility.

        Args:
            X: Sized iterable of samples. Each sample is passed to every
                model wrapped in a one-element list.

        Returns:
            List with one entry per sample: the class index chosen by
            ``_bagging`` (``None`` when no models are loaded).
        """
        final_predictions = []
        # Iterate the samples directly rather than via ``X[i]``: positional
        # indexing is label-based on a pandas Series and fails (or picks the
        # wrong row) when its index is not 0..n-1.
        for sample in tqdm(X):
            predictions = [
                model.predict_proba([sample]) for model in self.models
            ]
            final_predictions.append(self._bagging(predictions))
        return final_predictions
class LanguageIdentifier(BaseEstimator):
    """TF-IDF + Bernoulli naive Bayes text classifier for Portuguese text.

    Wraps a two-step scikit-learn ``Pipeline`` (``'tfidf'`` then ``'clf'``)
    and forwards the usual estimator methods to it.
    """

    def __init__(self, params: dict) -> None:
        """Build the underlying pipeline.

        Downloads the NLTK ``stopwords`` and ``punkt`` resources on every
        construction.

        Args:
            params: Vectorizer settings. Required keys are
                ``'tfidf__ngram_range'`` (indexable, items 0 and 1 are used),
                ``'tfidf__max_features'``, ``'tfidf__analyzer'`` and
                ``'tfidf__lowercase'``.

        Raises:
            KeyError: If one of the required keys is missing from ``params``.
        """
        # Local import keeps this block self-contained.
        from functools import partial

        super().__init__()
        nltk.download("stopwords")
        nltk.download("punkt")
        ngram_range = params['tfidf__ngram_range']
        vectorizer = TfidfVectorizer(
            # ``partial`` instead of a ``lambda``: it calls
            # ``word_tokenize(text, language='portuguese')`` exactly as
            # before, but unlike a lambda it can be pickled, so the fitted
            # pipeline can be saved with pickle/joblib.
            tokenizer=partial(word_tokenize, language='portuguese'),
            stop_words=nltk.corpus.stopwords.words('portuguese'),
            # Rebuilt as a tuple so list-valued settings are accepted too.
            ngram_range=(ngram_range[0], ngram_range[1]),
            max_features=params['tfidf__max_features'],
            analyzer=params['tfidf__analyzer'],
            lowercase=params['tfidf__lowercase'],
        )
        self.pipeline = Pipeline([
            ('tfidf', vectorizer),
            ('clf', BernoulliNB()),
        ])

    def fit(self, X, y):
        """Fit the pipeline on ``X``/``y`` and return ``pipeline.fit``'s result."""
        return self.pipeline.fit(X, y)

    def predict(self, X):
        """Return the pipeline's predicted labels for ``X``."""
        return self.pipeline.predict(X)

    def predict_proba(self, X):
        """Return the pipeline's class probabilities for ``X``."""
        return self.pipeline.predict_proba(X)

    def score(self, X, y):
        """Return the pipeline's score on ``X``/``y``."""
        return self.pipeline.score(X, y)

    def get_params(self, deep=True):
        """Return the parameters of the wrapped pipeline, not of this wrapper."""
        return self.pipeline.get_params(deep)

    def set_params(self, **params):
        """Set parameters on the wrapped pipeline and return its result."""
        return self.pipeline.set_params(**params)

    def __str__(self) -> str:
        """Return the string form of the wrapped pipeline."""
        return str(self.pipeline)