import logging
import warnings
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer

# Silence UserWarnings and treat divide-by-zero results (e.g., log(0))
# as NaN, so that undefined PMI values can be dropped rather than raising.
warnings.filterwarnings(action="ignore", category=UserWarning)
np.seterr(divide="ignore")
pd.set_option("use_inf_as_na", True)

logs = logging.getLogger(__name__)
logs.setLevel(logging.INFO)
logs.propagate = False

if not logs.handlers:
    # Log INFO and above to a file.
    Path("./log_files").mkdir(exist_ok=True)
    file = logging.FileHandler("./log_files/npmi.log")
    fileformat = logging.Formatter("%(asctime)s:%(message)s")
    file.setLevel(logging.INFO)
    file.setFormatter(fileformat)

    # Log WARNING and above to the console.
    stream = logging.StreamHandler()
    streamformat = logging.Formatter("[data_measurements_tool] %(message)s")
    stream.setLevel(logging.WARNING)
    stream.setFormatter(streamformat)

    logs.addHandler(file)
    logs.addHandler(stream)

# Default number of batches used when binarizing the tokenized sentences.
_NUM_BATCHES = 500


class nPMI:
    def __init__(
        self,
        vocab_counts_df,
        tokenized_df,
        tokenized_col_name="tokenized_text",
        num_batches=_NUM_BATCHES,
    ):
        logs.info("Initializing nPMI class.")
        logs.info("vocab is")
        logs.info(vocab_counts_df)
        self.vocab_counts_df = vocab_counts_df
        logs.info("tokenized is")
        self.tokenized_df = tokenized_df
        logs.info(self.tokenized_df)
        self.tokenized_col_name = tokenized_col_name
        self.num_batches = num_batches
        # Holds the per-batch binarized sentence matrices.
        self.mlb_list = []

    def binarize_words_in_sentence(self):
        logs.info("Creating co-occurrence matrix for PMI calculations.")
        batches = np.linspace(0, self.tokenized_df.shape[0], self.num_batches).astype(
            int
        )
        i = 0
        # Creates one sparse matrix per batch of sentences.
        while i < len(batches) - 1:
            # Makes a sparse matrix (# sentences x # words),
            # marking which words occur in each sentence.
            mlb = MultiLabelBinarizer(classes=self.vocab_counts_df.index)
            logs.info(
                "%s of %s sentence binarize batches." % (str(i), str(len(batches)))
            )
            mlb_series = mlb.fit_transform(
                self.tokenized_df[self.tokenized_col_name][batches[i] : batches[i + 1]]
            )
            i += 1
            self.mlb_list.append(mlb_series)
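
    # Illustrative sketch of what MultiLabelBinarizer produces (assumed toy
    # data, not from the pipeline): with classes=["cat", "dog", "fish"],
    # fit_transform([["cat", "dog"], ["fish"]]) returns
    #     [[1, 1, 0],
    #      [0, 0, 1]]
    # i.e. one row per sentence and one indicator column per vocabulary word.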

    def calc_cooccurrences(self, subgroup, subgroup_idx):
        initialize = True
        coo_df = None
        # Big computation here! It should only happen once per instance,
        # since the binarized batches are cached in self.mlb_list.
        logs.info(
            "Approaching big computation! Here, we binarize all words in the "
            "sentences, making a sparse matrix of sentences."
        )
        if not self.mlb_list:
            self.binarize_words_in_sentence()
        for batch_id in range(len(self.mlb_list)):
            logs.info(
                "%s of %s co-occurrence count batches"
                % (str(batch_id), str(len(self.mlb_list)))
            )
            # Binarized sentences (rows) x words (columns) for this batch.
            batch_sentence_row = self.mlb_list[batch_id]
            sent_batch_df = pd.DataFrame(batch_sentence_row)
            # Per-sentence counts for the subgroup word. The selection yields
            # a Series, so it is renamed rather than assigned a `columns`
            # attribute (which a Series does not have).
            subgroup_df = sent_batch_df[subgroup_idx].rename(subgroup)
            # Remove the sentences where the count of the subgroup is 0,
            # reducing the computation needed below.
            subgroup_df = subgroup_df[subgroup_df > 0]
            logs.info("Removing 0 counts, subgroup_df is")
            logs.info(subgroup_df)
            mlb_subgroup_only = sent_batch_df[sent_batch_df[subgroup_idx] > 0]
            logs.info("mlb subgroup only is")
            logs.info(mlb_subgroup_only)
            # Co-occurrence counts of the subgroup with every vocabulary word.
            logs.info("Now we do the T.dot approach for co-occurrences")
            batch_coo_df = pd.DataFrame(mlb_subgroup_only.T.dot(subgroup_df))

            # Sum the counts across batches.
            if initialize:
                coo_df = batch_coo_df
            else:
                coo_df = coo_df.add(batch_coo_df, fill_value=0)
            logs.info("coo_df is")
            logs.info(coo_df)
            initialize = False
        logs.info("Returning co-occurrence matrix")
        logs.info(coo_df)
        return pd.DataFrame(coo_df)
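
    # Toy illustration of the T.dot step above (assumed, illustrative numbers):
    # if a batch's sent_batch_df is
    #     [[1, 1, 0],
    #      [1, 0, 1]]        # 2 sentences x 3 words
    # and the subgroup is word 0, then subgroup_df == [1, 1] and
    # sent_batch_df.T.dot(subgroup_df) == [2, 1, 1]: the subgroup co-occurs
    # with itself twice and once with each of the other two words.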

    def calc_paired_metrics(self, subgroup_pair, subgroup_npmi_dict):
        """
        Calculates nPMI metrics between paired subgroups.
        Special handling for a subgroup paired with itself.

        :param subgroup_pair: Pair of subgroup strings.
        :param subgroup_npmi_dict: Dict mapping each subgroup to its
            (vocab_cooc_df, pmi_df, npmi_df) tuple, as returned by calc_metrics.
        :return: Dict with keys "npmi-bias", "npmi", "pmi", and "count".
        """
        paired_results_dict = {"npmi": {}, "pmi": {}, "count": {}}
        # Canonical ordering for the subgroup pair.
        subgroup1, subgroup2 = sorted(subgroup_pair)
        vocab_cooc_df1, pmi_df1, npmi_df1 = subgroup_npmi_dict[subgroup1]
        logs.info("vocab cooc")
        logs.info(vocab_cooc_df1)
        if subgroup1 == subgroup2:
            shared_npmi_df = npmi_df1
            shared_pmi_df = pmi_df1
            shared_vocab_cooc_df = vocab_cooc_df1
        else:
            vocab_cooc_df2, pmi_df2, npmi_df2 = subgroup_npmi_dict[subgroup2]
            logs.info("vocab cooc2")
            logs.info(vocab_cooc_df2)
            # The column names already differ per subgroup, so the join
            # suffixes should not come into play.
            shared_npmi_df = npmi_df1.join(
                npmi_df2, how="inner", lsuffix="1", rsuffix="2"
            )
            shared_pmi_df = pmi_df1.join(pmi_df2, how="inner", lsuffix="1", rsuffix="2")
            shared_vocab_cooc_df = vocab_cooc_df1.join(
                vocab_cooc_df2, how="inner", lsuffix="1", rsuffix="2"
            )
            shared_vocab_cooc_df = shared_vocab_cooc_df.dropna()
            shared_vocab_cooc_df = shared_vocab_cooc_df[
                shared_vocab_cooc_df.index.notnull()
            ]
            logs.info("shared npmi df")
            logs.info(shared_npmi_df)
            logs.info("shared vocab df")
            logs.info(shared_vocab_cooc_df)
        npmi_bias = (
            shared_npmi_df[subgroup1 + "-npmi"] - shared_npmi_df[subgroup2 + "-npmi"]
        )
        paired_results_dict["npmi-bias"] = npmi_bias.dropna()
        paired_results_dict["npmi"] = shared_npmi_df.dropna()
        paired_results_dict["pmi"] = shared_pmi_df.dropna()
        paired_results_dict["count"] = shared_vocab_cooc_df.dropna()
        return paired_results_dict
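
    # Sketch of the expected call shape (assumed, illustrative subgroup names);
    # the dict values are exactly the tuples returned by calc_metrics:
    #
    #   subgroup_npmi_dict = {
    #       "she": npmi_obj.calc_metrics("she"),
    #       "he": npmi_obj.calc_metrics("he"),
    #   }
    #   paired = npmi_obj.calc_paired_metrics(("she", "he"), subgroup_npmi_dict)
    #   paired["npmi-bias"]  # per-word "he"-npmi minus "she"-npmi (sorted order)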

    def calc_metrics(self, subgroup):
        # Index of the subgroup word in the vocabulary.
        subgroup_idx = self.vocab_counts_df.index.get_loc(subgroup)
        logs.info("Calculating co-occurrences...")
        df_coo = self.calc_cooccurrences(subgroup, subgroup_idx)
        vocab_cooc_df = self.set_idx_cols(df_coo, subgroup)
        logs.info(vocab_cooc_df)
        logs.info("Calculating PMI...")
        pmi_df = self.calc_PMI(vocab_cooc_df, subgroup)
        logs.info(pmi_df)
        logs.info("Calculating nPMI...")
        npmi_df = self.calc_nPMI(pmi_df, vocab_cooc_df, subgroup)
        logs.info(npmi_df)
        return vocab_cooc_df, pmi_df, npmi_df

    def set_idx_cols(self, df_coo, subgroup):
        """
        Indexes the co-occurrence counts by the vocabulary and names the
        count column after the subgroup.

        :param df_coo: Co-occurrence counts for subgroup, length is num_words
        :return: DataFrame indexed by vocab word with a "<subgroup>-count" column.
        """
        count_df = df_coo.set_index(self.vocab_counts_df.index)
        count_df.columns = [subgroup + "-count"]
        count_df[subgroup + "-count"] = count_df[subgroup + "-count"].astype(int)
        return count_df

    def calc_PMI(self, vocab_cooc_df, subgroup):
        """
        PMI(x;y) = h(y) - h(y|x)
                 = h(subgroup) - h(subgroup|word)
                 = log (p(subgroup|word) / p(subgroup))
        nPMI additionally divides by -log(p(x,y)) = -log(p(x|y)p(y))
        """
        # p(subgroup): the subgroup word's proportion of the overall vocabulary.
        subgroup_prob = self.vocab_counts_df.loc[subgroup]["proportion"]
        # p(subgroup|word): co-occurrence count of (subgroup, word) divided by
        # the word's total count. Both Series are indexed by the vocabulary,
        # so the division aligns on the index.
        p_subgroup_g_word = (
            vocab_cooc_df[subgroup + "-count"] / self.vocab_counts_df["count"]
        )
        logs.info("p_subgroup_g_word is")
        logs.info(p_subgroup_g_word)
        pmi_df = pd.DataFrame()
        pmi_df[subgroup + "-pmi"] = np.log(p_subgroup_g_word / subgroup_prob)
        # log(0) yields -inf, which is treated as NaN here, so words that
        # never co-occur with the subgroup are dropped.
        return pmi_df.dropna()
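
    # Worked example of the PMI formula above (assumed, illustrative values):
    # if p(subgroup) = 0.01 overall and p(subgroup|word) = 0.05 for some word,
    # then PMI = log(0.05 / 0.01) = log(5), about 1.61 (natural log): the
    # subgroup is five times more likely to occur given that word than overall.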

    def calc_nPMI(self, pmi_df, vocab_cooc_df, subgroup):
        """
        nPMI additionally divides PMI by -log(p(x,y)) = -log(p(x|y)p(y))
                                                      = -log(p(word|subgroup)p(word))
        """
        # p(word|subgroup): co-occurrence counts normalized within the subgroup.
        p_word_g_subgroup = (
            vocab_cooc_df[subgroup + "-count"]
            / vocab_cooc_df[subgroup + "-count"].sum()
        )
        # p(word): each word's overall proportion, looked up by row index.
        p_word = pmi_df.apply(
            lambda x: self.vocab_counts_df.loc[x.name]["proportion"], axis=1
        )
        normalize_pmi = -np.log(p_word_g_subgroup * p_word)
        npmi_df = pd.DataFrame()
        npmi_df[subgroup + "-npmi"] = pmi_df[subgroup + "-pmi"] / normalize_pmi
        return npmi_df.dropna()
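

# Minimal usage sketch (assumed toy data, for illustration only):
# vocab_counts_df must be indexed by the vocabulary with "count" and
# "proportion" columns, and tokenized_df must hold lists of tokens in the
# tokenized column. The subgroup names ("she", "he") are hypothetical
# examples, not part of the class.
if __name__ == "__main__":
    tokenized_df = pd.DataFrame(
        {
            "tokenized_text": [
                ["she", "is", "happy"],
                ["he", "is", "happy"],
                ["she", "sings"],
            ]
        }
    )
    vocab_counts_df = pd.DataFrame(
        {"count": {"she": 2, "he": 1, "is": 2, "happy": 2, "sings": 1}}
    )
    vocab_counts_df["proportion"] = (
        vocab_counts_df["count"] / vocab_counts_df["count"].sum()
    )

    # num_batches=2 yields a single batch for this tiny corpus.
    npmi_obj = nPMI(vocab_counts_df, tokenized_df, num_batches=2)
    subgroup_npmi_dict = {s: npmi_obj.calc_metrics(s) for s in ("she", "he")}
    paired = npmi_obj.calc_paired_metrics(("she", "he"), subgroup_npmi_dict)
    print(paired["npmi-bias"])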
|