Spaces:
Build error
Build error
| from typing import List | |
| import numpy as np | |
| import pandas as pd | |
| import streamlit as st | |
| from sklearn.linear_model import LogisticRegression | |
| from sklearn.utils import resample | |
| from stqdm import stqdm | |
| from .configs import ModelConfigs | |
| stqdm.pandas() | |
| def wordifier(X, y, X_names: List[str], y_names: List[str], configs=ModelConfigs): | |
| n_instances, n_features = X.shape | |
| n_classes = len(y_names) | |
| # NOTE: the * 10 / 10 trick is to have "nice" round-ups | |
| sample_fraction = np.ceil((n_features / n_instances) * 10) / 10 | |
| sample_size = min( | |
| # this is the maximum supported | |
| configs.MAX_SELECTION.value, | |
| # at minimum you want MIN_SELECTION but in general you want | |
| # n_instances * sample_fraction | |
| max(configs.MIN_SELECTION.value, int(n_instances * sample_fraction)), | |
| # however if previous one is bigger the the available instances take | |
| # the number of available instances | |
| n_instances, | |
| ) | |
| # TODO: might want to try out something to subsample features at each iteration | |
| # initialize coefficient matrices | |
| pos_scores = np.zeros((n_classes, n_features), dtype=int) | |
| neg_scores = np.zeros((n_classes, n_features), dtype=int) | |
| with st.spinner("Wordifying!"): | |
| for _ in stqdm(range(configs.NUM_ITERS.value)): | |
| # run randomized regression | |
| clf = LogisticRegression( | |
| penalty="l1", | |
| C=configs.PENALTIES.value[ | |
| np.random.randint(len(configs.PENALTIES.value)) | |
| ], | |
| solver="liblinear", | |
| multi_class="auto", | |
| max_iter=500, | |
| class_weight="balanced", | |
| ) | |
| # sample indices to subsample matrix | |
| selection = resample( | |
| np.arange(n_instances), replace=True, stratify=y, n_samples=sample_size | |
| ) | |
| # fit | |
| try: | |
| clf.fit(X[selection], y[selection]) | |
| except ValueError: | |
| continue | |
| # record coefficients | |
| if n_classes == 2: | |
| pos_scores[1] = pos_scores[1] + (clf.coef_ > 0.0) | |
| neg_scores[1] = neg_scores[1] + (clf.coef_ < 0.0) | |
| pos_scores[0] = pos_scores[0] + (clf.coef_ < 0.0) | |
| neg_scores[0] = neg_scores[0] + (clf.coef_ > 0.0) | |
| else: | |
| pos_scores += clf.coef_ > 0 | |
| neg_scores += clf.coef_ < 0 | |
| # normalize | |
| pos_scores = pos_scores / configs.NUM_ITERS.value | |
| neg_scores = neg_scores / configs.NUM_ITERS.value | |
| # get only active features | |
| pos_positions = np.where( | |
| pos_scores >= configs.SELECTION_THRESHOLD.value, pos_scores, 0 | |
| ) | |
| neg_positions = np.where( | |
| neg_scores >= configs.SELECTION_THRESHOLD.value, neg_scores, 0 | |
| ) | |
| # prepare DataFrame | |
| pos = [ | |
| (X_names[i], pos_scores[c, i], y_names[c]) | |
| for c, i in zip(*pos_positions.nonzero()) | |
| ] | |
| neg = [ | |
| (X_names[i], neg_scores[c, i], y_names[c]) | |
| for c, i in zip(*neg_positions.nonzero()) | |
| ] | |
| posdf = pd.DataFrame(pos, columns="word score label".split()).sort_values( | |
| ["label", "score"], ascending=False | |
| ) | |
| negdf = pd.DataFrame(neg, columns="word score label".split()).sort_values( | |
| ["label", "score"], ascending=False | |
| ) | |
| return posdf, negdf | |