In [65]:
import sys
sys.path.insert(0, "..")
import vaex
from vaex.ml import LabelEncoder
import spacy
import pandas as pd
from tqdm import tqdm
import os
import multiprocessing as mp
from src.preprocessing import PreprocessingPipeline, encode
from src.wordifier import ModelConfigs
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

In [67]:
# Build the English preprocessing pipeline: `pre_steps` and `post_steps` both
# receive every key of `pipeline_components()`, and the lemmatization step is
# the second key (index 1) of `lemmatization_component()`.
pipe = PreprocessingPipeline(
    language="English",
    pre_steps=[*PreprocessingPipeline.pipeline_components().keys()],
    lemmatization_step=[*PreprocessingPipeline.lemmatization_component().keys()][1],
    post_steps=[*PreprocessingPipeline.pipeline_components().keys()],
)

In [68]:
def fn(t):
    """Run `t` through the four stages of `pipe` in order: pre, nlp, lemma, post."""
    cleaned = pipe.pre(t)
    parsed = pipe.nlp(cleaned)
    lemmatized = pipe.lemma(parsed)
    return pipe.post(lemmatized)

In [69]:
# Convert `df` to a vaex DataFrame and add a "processed_text" column computed by
# `fn` from the "text" column.
# NOTE(review): `df` is not assigned in any cell above this one (the only
# `pd.read_excel` load appears further down), so this cell relies on kernel state.
vdf = vaex.from_pandas(df)
vdf["processed_text"] = vdf.apply(fn, arguments=[vdf["text"]], vectorize=False)
# Rebinds `df` to the pandas export of `vdf`.
df = vdf.to_pandas_df()

In [71]:
import streamlit as st

# Smoke test for a Streamlit progress bar driven from a plain loop.
pbar = st.progress(0)
N = 100
for i in range(N):
    # Advance on every iteration. The previous `i % N == 0` guard was true only
    # for i == 0, so the bar was set to 1 once and never moved again.
    # A float in [0.0, 1.0] keeps the update correct for any N.
    pbar.progress((i + 1) / N)

2021-11-28 17:01:36.883 
  command:

    streamlit run /Users/pietrolesci/miniconda3/envs/wordify/lib/python3.7/site-packages/ipykernel_launcher.py [ARGUMENTS]


In [24]:
configs = ModelConfigs

# Draw one regularisation strength uniformly at random from the configured candidates.
penalty_grid = configs.PENALTIES.value
sampled_C = penalty_grid[np.random.randint(len(penalty_grid))]

# tf-idf features (default settings) feeding an L1-penalised logistic regression.
clf = Pipeline(
    [
        ("tfidf", TfidfVectorizer()),
        (
            "classifier",
            LogisticRegression(
                C=sampled_C,
                penalty="l1",
                solver="liblinear",
                class_weight="balanced",
                max_iter=500,
                multi_class="auto",
            ),
        ),
    ]
)


In [29]:
# Fit the tf-idf + logistic-regression pipeline on the "text" column (not
# "processed_text") with the "label" column as target.
clf.fit(df["text"], df["label"])

Pipeline(steps=[('tfidf', TfidfVectorizer()),
                ('classifier',
                 LogisticRegression(C=1, class_weight='balanced', max_iter=500,
                                    penalty='l1', solver='liblinear'))])

array(['00', '000', '00001', ..., 'ís', 'über', 'überwoman'], dtype=object)

In [40]:
def wordifier(df, text_col, label_col, configs=ModelConfigs):
    """Score how consistently each word is associated with each label.

    The texts are tf-idf encoded once. Then ``configs.NUM_ITERS.value``
    L1-penalised logistic regressions are fitted on stratified bootstrap
    samples, each with a penalty drawn at random from
    ``configs.PENALTIES.value``. A word's score for a label is the fraction of
    iterations in which its coefficient was positive (``posdf``) or negative
    (``negdf``); only scores of at least ``configs.SELECTION_THRESHOLD.value``
    are reported.

    Args:
        df: table holding the data. ``df[text_col]`` and ``df[label_col]`` must
            be convertible with ``np.asarray`` (assumed to be a pandas
            DataFrame - TODO confirm against callers).
        text_col: name of the column containing the documents.
        label_col: name of the column containing the class labels.
        configs: object exposing ``MAX_SELECTION``, ``MIN_SELECTION``,
            ``NUM_ITERS``, ``PENALTIES`` and ``SELECTION_THRESHOLD``, each read
            through ``.value``.

    Returns:
        ``(posdf, negdf)``: two DataFrames with columns ``word``, ``score`` and
        ``label``, sorted by label and then score in descending order.
    """
    # `resample` is not part of the notebook's top import cell, so import it here.
    from sklearn.utils import resample

    texts = np.asarray(df[text_col])
    # `y` holds integer class ids; `y_names[c]` is the original label of class c.
    y_names, y = np.unique(np.asarray(df[label_col]), return_inverse=True)

    # Encode once, outside the loop, so every iteration shares the same
    # vocabulary and the coefficient columns line up across iterations.
    vectorizer = TfidfVectorizer()
    X = vectorizer.fit_transform(texts)
    X_names = vectorizer.get_feature_names_out()

    n_instances, n_features = X.shape
    n_classes = len(y_names)
    n_iters = configs.NUM_ITERS.value

    # NOTE: the * 10 / 10 trick is to have "nice" round-ups
    sample_fraction = np.ceil((n_features / n_instances) * 10) / 10

    sample_size = min(
        # this is the maximum supported
        configs.MAX_SELECTION.value,
        # at minimum you want MIN_SELECTION but in general you want
        # n_instances * sample_fraction
        max(configs.MIN_SELECTION.value, int(n_instances * sample_fraction)),
        # however, if the previous one is bigger than the available instances,
        # take the number of available instances
        n_instances,
    )

    # TODO: might want to try out something to subsample features at each iteration

    # counters of how often each (class, feature) coefficient was > 0 / < 0
    pos_scores = np.zeros((n_classes, n_features), dtype=int)
    neg_scores = np.zeros((n_classes, n_features), dtype=int)

    for _ in range(n_iters):

        # run randomized regression
        clf = LogisticRegression(
            penalty="l1",
            C=configs.PENALTIES.value[np.random.randint(len(configs.PENALTIES.value))],
            solver="liblinear",
            multi_class="auto",
            max_iter=500,
            class_weight="balanced",
        )

        # sample row indices (with replacement, stratified on the label)
        selection = resample(
            np.arange(n_instances), replace=True, stratify=y, n_samples=sample_size
        )

        # fit; a sample the classifier rejects is skipped (best effort)
        try:
            clf.fit(X[selection], y[selection])
        except ValueError:
            continue

        # record coefficient signs
        if n_classes == 2:
            # binary case: one coefficient row; class 0 gets the mirrored signs
            pos_scores[1] = pos_scores[1] + (clf.coef_ > 0.0)
            neg_scores[1] = neg_scores[1] + (clf.coef_ < 0.0)
            pos_scores[0] = pos_scores[0] + (clf.coef_ < 0.0)
            neg_scores[0] = neg_scores[0] + (clf.coef_ > 0.0)
        else:
            pos_scores += clf.coef_ > 0
            neg_scores += clf.coef_ < 0

    # normalize once, after all iterations, to get selection frequencies
    pos_scores = pos_scores / n_iters
    neg_scores = neg_scores / n_iters

    # keep only the features whose frequency reaches the threshold
    pos_positions = np.where(
        pos_scores >= configs.SELECTION_THRESHOLD.value, pos_scores, 0
    )
    neg_positions = np.where(
        neg_scores >= configs.SELECTION_THRESHOLD.value, neg_scores, 0
    )

    # prepare DataFrames: one (word, score, label) row per retained entry
    pos = [
        (X_names[i], pos_scores[c, i], y_names[c])
        for c, i in zip(*pos_positions.nonzero())
    ]
    neg = [
        (X_names[i], neg_scores[c, i], y_names[c])
        for c, i in zip(*neg_positions.nonzero())
    ]

    posdf = pd.DataFrame(pos, columns="word score label".split()).sort_values(
        ["label", "score"], ascending=False
    )
    negdf = pd.DataFrame(neg, columns="word score label".split()).sort_values(
        ["label", "score"], ascending=False
    )

    return posdf, negdf

In [41]:
# NOTE(review): `wordifier(df, text_col, label_col, ...)` takes three required
# arguments but only two expressions are supplied here, and `vdf.encoded_label`
# is not created in any visible cell - confirm before relying on `res`.
res = vdf.apply(wordifier, arguments=[vdf.processed_text, vdf.encoded_label], vectorize=False)

In [45]:
from vaex.ml.sklearn import Predictor

In [60]:
# tf-idf step with every argument spelled out (including the ones left at
# their default), so the configuration is explicit.
tfidf_step = TfidfVectorizer(
    input="content",  # default: documents are already in memory
    encoding="utf-8",  # default
    decode_error="strict",  # default
    strip_accents=None,  # leave accents untouched
    lowercase=False,  # leave casing untouched
    preprocessor=None,  # default: no custom preprocessor
    tokenizer=None,  # default
    stop_words=None,  # keep stop words
    analyzer="word",
    ngram_range=(1, 3),  # unigrams up to 3-grams
    min_df=0.001,
    max_df=0.75,
    sublinear_tf=True,
)

# L1-penalised logistic regression with a penalty drawn at random from the
# configured candidates.
classifier_step = LogisticRegression(
    C=configs.PENALTIES.value[np.random.randint(len(configs.PENALTIES.value))],
    penalty="l1",
    solver="liblinear",
    class_weight="balanced",
    max_iter=500,
    multi_class="auto",
)

clf = Pipeline([("tfidf", tfidf_step), ("classifier", classifier_step)])

# Wrap the scikit-learn pipeline so it can be fitted on vaex columns.
# NOTE(review): the recorded run of `vaex_model.fit(vdf)` in the next cell
# failed with "TypeError: unhashable type: 'list'".
vaex_model = Predictor(
    model=clf,
    features=["processed_text"],
    target="encoded_label",
    prediction_name="prediction",
)


In [61]:
# NOTE(review): the recorded run of this cell failed with
# "TypeError: unhashable type: 'list'" (see the output below).
vaex_model.fit(vdf)

TypeError: unhashable type: 'list'

In [52]:
import pickle
# Check that `wordifier` can be pickled; the recorded output shows it is
# serialised by reference (module `__main__`, name `wordifier`).
pickle.dumps(wordifier)

b'\x80\x03c__main__\nwordifier\nq\x00.'

TypeError: unhashable type: 'list'

In [None]:
# Lemmatise every text with spaCy's batched, multi-process `pipe`, tracking
# progress with a single bar whose total is the number of rows. The previous
# version opened two bars: an outer one updated by hand and an inner one
# wrapping the iterator without a total.
# NOTE(review): `nlp` is loaded in a later cell and `n_cpus` is not assigned in
# any visible cell - both must exist in the kernel before this cell runs.
res = [
    [token.lemma_ for token in doc]
    for doc in tqdm(
        nlp.pipe(df["text"].values, batch_size=500, n_process=n_cpus),
        total=len(df),
    )
]

In [None]:
import pickle

In [None]:
def fn(t):
    """Placeholder: ignores `t` and returns None.

    NOTE(review): running this cell replaces the earlier `fn` that chains the
    `pipe` stages.
    """
    return None

In [None]:
%%timeit
# Time mapping `nlp` over all texts with a pool of `mp.cpu_count()` workers.
# NOTE(review): `nlp` is only loaded in a later cell - confirm it exists in the kernel.
with mp.Pool(mp.cpu_count()) as pool:
    new_s = pool.map(nlp, df["text"].values)

In [None]:
from typing import List
import numpy as np
import pandas as pd
import streamlit as st
from sklearn.linear_model import LogisticRegression
from sklearn.utils import resample

from src.configs import ModelConfigs


def wordifier(X, y, X_names: List[str], y_names: List[str], configs=ModelConfigs):
    """Score how consistently each feature is associated with each label.

    Fits ``configs.NUM_ITERS.value`` L1-penalised logistic regressions on
    stratified bootstrap samples of ``X``, each with a penalty drawn at random
    from ``configs.PENALTIES.value``. It counts how often every coefficient is
    positive or negative, and reports the features whose frequency is at least
    ``configs.SELECTION_THRESHOLD.value``. Progress is shown through a
    Streamlit spinner and progress bar.

    Args:
        X: 2-D feature matrix (``X.shape`` is unpacked into instances and
            features) that supports row selection with an integer index array.
        y: label vector indexable with an integer index array. Class ``c`` is
            reported as ``y_names[c]`` (assumes integer-encoded labels -
            TODO confirm against ``encode``).
        X_names: feature names, indexed by column position of ``X``.
        y_names: label names; its length is taken as the number of classes.
        configs: object exposing ``MAX_SELECTION``, ``MIN_SELECTION``,
            ``NUM_ITERS``, ``PENALTIES`` and ``SELECTION_THRESHOLD``, each read
            through ``.value``.

    Returns:
        ``(posdf, negdf)``: two DataFrames with columns ``word``, ``score`` and
        ``label``, sorted by label and then score in descending order.
    """
    n_instances, n_features = X.shape
    n_classes = len(y_names)
    n_iters = configs.NUM_ITERS.value

    # NOTE: the * 10 / 10 trick is to have "nice" round-ups
    sample_fraction = np.ceil((n_features / n_instances) * 10) / 10

    sample_size = min(
        # this is the maximum supported
        configs.MAX_SELECTION.value,
        # at minimum you want MIN_SELECTION but in general you want
        # n_instances * sample_fraction
        max(configs.MIN_SELECTION.value, int(n_instances * sample_fraction)),
        # however, if the previous one is bigger than the available instances,
        # take the number of available instances
        n_instances,
    )

    # TODO: might want to try out something to subsample features at each iteration

    # counters of how often each (class, feature) coefficient was > 0 / < 0
    pos_scores = np.zeros((n_classes, n_features), dtype=int)
    neg_scores = np.zeros((n_classes, n_features), dtype=int)

    with st.spinner("Wordifying!"):
        pbar = st.progress(0)

        for i in range(n_iters):

            # run randomized regression
            clf = LogisticRegression(
                penalty="l1",
                C=configs.PENALTIES.value[
                    np.random.randint(len(configs.PENALTIES.value))
                ],
                solver="liblinear",
                multi_class="auto",
                max_iter=500,
                class_weight="balanced",
            )

            # sample row indices (with replacement, stratified on the label)
            selection = resample(
                np.arange(n_instances), replace=True, stratify=y, n_samples=sample_size
            )

            try:
                clf.fit(X[selection], y[selection])
            except ValueError:
                # best effort: a sample the classifier rejects contributes nothing
                pass
            else:
                # record coefficient signs
                if n_classes == 2:
                    # binary case: one coefficient row; class 0 gets the mirrored signs
                    pos_scores[1] = pos_scores[1] + (clf.coef_ > 0.0)
                    neg_scores[1] = neg_scores[1] + (clf.coef_ < 0.0)
                    pos_scores[0] = pos_scores[0] + (clf.coef_ < 0.0)
                    neg_scores[0] = neg_scores[0] + (clf.coef_ > 0.0)
                else:
                    pos_scores += clf.coef_ > 0
                    neg_scores += clf.coef_ < 0

            # Always advance, including after a skipped fit, and report a
            # fraction in [0.0, 1.0] so the bar is right for any NUM_ITERS
            # (an integer `i + 1` is only a valid percentage when NUM_ITERS is 100).
            pbar.progress((i + 1) / n_iters)

        # normalize to selection frequencies
        pos_scores = pos_scores / n_iters
        neg_scores = neg_scores / n_iters

        # keep only the features whose frequency reaches the threshold
        pos_positions = np.where(
            pos_scores >= configs.SELECTION_THRESHOLD.value, pos_scores, 0
        )
        neg_positions = np.where(
            neg_scores >= configs.SELECTION_THRESHOLD.value, neg_scores, 0
        )

        # prepare DataFrames: one (word, score, label) row per retained entry
        pos = [
            (X_names[i], pos_scores[c, i], y_names[c])
            for c, i in zip(*pos_positions.nonzero())
        ]
        neg = [
            (X_names[i], neg_scores[c, i], y_names[c])
            for c, i in zip(*neg_positions.nonzero())
        ]

    posdf = pd.DataFrame(pos, columns="word score label".split()).sort_values(
        ["label", "score"], ascending=False
    )
    negdf = pd.DataFrame(neg, columns="word score label".split()).sort_values(
        ["label", "score"], ascending=False
    )

    return posdf, negdf


In [None]:
# Location of the input spreadsheet.
# NOTE(review): this relative path climbs into a personal Downloads folder, so
# the notebook is not portable as-is.
path = "../../../../Downloads/wordify_10000_copy.xlsx"

In [None]:
# Read every column as a string and drop the rows that contain a missing value.
df = pd.read_excel(path, dtype=str).dropna()

In [None]:
# df = pd.read_excel("../data/test_de.xlsx")
# mdf = mpd.read_csv("../data/test_en.csv")
language = "English"
# Load the spaCy model looked up through `Languages[language].value`, leaving
# out the listed pipeline components.
# NOTE(review): `Languages` is not imported in any visible cell - confirm where it comes from.
nlp = spacy.load(Languages[language].value, exclude=["parser", "ner", "pos", "tok2vec"])

In [None]:
# English text preprocessor given every key of `_cleaning_options()` as its
# cleaning steps and `lemmatizer_when=None`.
# NOTE(review): `TextPreprocessor` is not imported in any visible cell - confirm its origin.
prep = TextPreprocessor(
    lemmatizer_when=None,
    cleaning_steps=[*TextPreprocessor._cleaning_options().keys()],
    language="English",
)

In [None]:
# Store the output of `prep` for the "text" column in a new "p_text" column
# (mutates `df` in place).
df["p_text"] = prep.fit_transform(df["text"])

In [None]:
# Unpack the values of the mapping returned by `encode`, in order.
# NOTE(review): presumably feature matrix, label vector, feature names and
# label names - confirm against `src.preprocessing.encode`.
X, y, X_names, y_names = encode(df["p_text"], df["label"]).values()

In [None]:
# Single L1-penalised logistic regression with a fixed C of 0.05, used here
# instead of a penalty sampled from
# ModelConfigs.PENALTIES.value[np.random.randint(len(ModelConfigs.PENALTIES.value))].
clf = LogisticRegression(
    C=0.05,
    penalty="l1",
    solver="liblinear",
    class_weight="balanced",
    max_iter=500,
    multi_class="auto",
)

In [None]:
%%time
# Fit the classifier on the full encoded data; `%%time` reports how long it takes.
clf.fit(X, y)

In [None]:
# Step-by-step version of the bootstrap loop: fit NUM_ITERS randomized L1
# logistic regressions on stratified resamples of (X, y) and count how often
# each coefficient is positive / negative.
n_instances, n_features = X.shape
n_classes = len(y_names)

# NOTE: the * 10 / 10 trick is to have "nice" round-ups
sample_fraction = np.ceil((n_features / n_instances) * 10) / 10

sample_size = min(
    # this is the maximum supported
    ModelConfigs.MAX_SELECTION.value,
    # at minimum you want MIN_SELECTION but in general you want
    # n_instances * sample_fraction
    max(ModelConfigs.MIN_SELECTION.value, int(n_instances * sample_fraction)),
    # however, if the previous one is bigger than the available instances,
    # take the number of available instances
    n_instances,
)

# TODO: might want to try out something to subsample features at each iteration

# counters of how often each (class, feature) coefficient was > 0 / < 0
pos_scores = np.zeros((n_classes, n_features), dtype=int)
neg_scores = np.zeros((n_classes, n_features), dtype=int)

# `trange` is never imported in this notebook (only `tqdm` is), so use the
# equivalent `tqdm(range(...))` form.
for _ in tqdm(range(ModelConfigs.NUM_ITERS.value)):

    # run randomized regression
    clf = LogisticRegression(
        penalty="l1",
        C=ModelConfigs.PENALTIES.value[np.random.randint(len(ModelConfigs.PENALTIES.value))],
        solver="liblinear",
        multi_class="auto",
        max_iter=500,
        class_weight="balanced",
    )

    # sample row indices (with replacement, stratified on the label)
    selection = resample(np.arange(n_instances), replace=True, stratify=y, n_samples=sample_size)

    # fit; a sample the classifier rejects is skipped (best effort)
    try:
        clf.fit(X[selection], y[selection])
    except ValueError:
        continue

    # record coefficient signs
    if n_classes == 2:
        # binary case: one coefficient row; class 0 gets the mirrored signs
        pos_scores[1] = pos_scores[1] + (clf.coef_ > 0.0)
        neg_scores[1] = neg_scores[1] + (clf.coef_ < 0.0)
        pos_scores[0] = pos_scores[0] + (clf.coef_ < 0.0)
        neg_scores[0] = neg_scores[0] + (clf.coef_ > 0.0)
    else:
        pos_scores += clf.coef_ > 0
        neg_scores += clf.coef_ < 0

In [None]:
# Normalize the sign counts to selection frequencies. The counters are created
# with an integer dtype and become float after the division, so the dtype check
# makes this cell safe to re-run: a second run no longer divides the scores again.
if np.issubdtype(pos_scores.dtype, np.integer):
    pos_scores = pos_scores / ModelConfigs.NUM_ITERS.value
if np.issubdtype(neg_scores.dtype, np.integer):
    neg_scores = neg_scores / ModelConfigs.NUM_ITERS.value

# keep only the features whose frequency reaches the threshold (others become 0)
pos_positions = np.where(pos_scores >= ModelConfigs.SELECTION_THRESHOLD.value, pos_scores, 0)
neg_positions = np.where(neg_scores >= ModelConfigs.SELECTION_THRESHOLD.value, neg_scores, 0)

# prepare DataFrames: one (word, score, label) row per retained (class, feature) entry
pos = [(X_names[i], pos_scores[c, i], y_names[c]) for c, i in zip(*pos_positions.nonzero())]
neg = [(X_names[i], neg_scores[c, i], y_names[c]) for c, i in zip(*neg_positions.nonzero())]

posdf = pd.DataFrame(pos, columns="word score label".split()).sort_values(["label", "score"], ascending=False)
negdf = pd.DataFrame(neg, columns="word score label".split()).sort_values(["label", "score"], ascending=False)