chatbot-model-bills / src /process.py
penguinsfly's picture
reorganize into component files, add info, and add pca plot
fd4a87f verified
import pandas as pd
import numpy as np
from cmap import Colormap, Color
from sklearn import preprocessing, decomposition, manifold
def filter_valid_sentences(
    df,
    min_nwords,
):
    """Attach the per-model number of valid-length sentences to ``df``.

    A model sentence is counted as valid when its ``source__est_nwords`` is
    at least ``min_nwords``. Despite the name, no rows are removed: the count
    is left-merged back onto ``df`` as ``source__model_num_valid_length``, so
    models without any valid sentence end up with a missing value there.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``source__model``, ``source__unq_sentence_id`` and
        ``source__est_nwords``.
    min_nwords : number
        Minimum estimated word count for a sentence to be counted.

    Returns
    -------
    pandas.DataFrame
        ``df`` with the extra ``source__model_num_valid_length`` column.
    """
    sentence_cols = [
        'source__model',
        'source__unq_sentence_id',
        'source__est_nwords',
    ]
    # one row per distinct (model, sentence id, word count) combination
    unique_sentences = df.filter(sentence_cols).drop_duplicates()
    long_enough = unique_sentences.query('source__est_nwords >= @min_nwords')
    valid_counts = (
        long_enough
        .groupby('source__model')['source__unq_sentence_id']
        .count()
        .to_frame('source__model_num_valid_length')
        .reset_index()
    )
    # no explicit key: pandas merges on the columns the two frames share
    return df.merge(valid_counts, how='left')
def process_progression(
    df,
    min_nwords,
    thres_ratio,
    min_high_sim_num_sents,
    min_high_sim_pct_sim,
):
    """Summarize how similar each bill version is to each model.

    Only rows with ``source__est_nwords >= min_nwords`` whose target is not
    itself a model are considered. A row counts as highly similar when its
    ``ratio`` is strictly greater than ``thres_ratio``.

    Parameters
    ----------
    df : pandas.DataFrame
        Sentence-level comparison rows. Needs the grouping columns listed in
        ``version_keys`` below plus ``source__est_nwords``,
        ``target__is_model``, ``ratio`` and ``source__section_label``.
    min_nwords : number
        Minimum estimated word count of a model sentence.
    thres_ratio : number
        Similarity ratio a row must exceed to count as highly similar.
    min_high_sim_num_sents : number
        A version is similar when it has at least this many highly similar
        rows ...
    min_high_sim_pct_sim : number
        ... or when ``pct_src_sim`` reaches this percentage.

    Returns
    -------
    pandas.DataFrame
        One row per (model, bill version) with ``num_high_sim_src_seg``,
        ``source_sections`` (list), ``pct_src_sim``, ``ver_has_high_sim``,
        ``bill_has_1ver_sim`` and ``bill_max_sim_ver``.
    """
    version_keys = [
        'source__model',
        'source__model_num_valid_length',
        'target__doc_id',
        'target__doc_rank',
        'target__is_most_recent_doc',
        'target__bill',
        'target__bill_id',
        'target__state',
        'target__type',
        'target__url',
    ]
    bill_keys = ['source__model', 'target__bill_id']

    # valid-length model sentences compared against bills only (not models)
    candidates = df.query(
        'source__est_nwords >= @min_nwords '
        'and (not target__is_model)'
    ).reset_index(drop=True)
    # flag the rows whose similarity ratio exceeds the threshold
    candidates = candidates.assign(
        source__thres_met=candidates['ratio'] > thres_ratio
    )

    # number of highly similar rows per (model, bill version)
    high_sim_counts = candidates.groupby(version_keys).agg(
        num_high_sim_src_seg=('source__thres_met', 'sum')
    )
    # model sections that own at least one highly similar row
    high_sim_sections = (
        candidates
        .query('source__thres_met')
        .groupby(version_keys)
        .agg(source_sections=('source__section_label', 'unique'))
    )
    prog_high_sim_df = (
        high_sim_counts
        .join(high_sim_sections, how='left')
        .reset_index()
    )

    # percentage of the model's valid sentences that are highly similar
    prog_high_sim_df['pct_src_sim'] = (
        100 * prog_high_sim_df['num_high_sim_src_seg']
        / prog_high_sim_df['source__model_num_valid_length']
    )
    # a version is similar when either the count or the percentage is met
    enough_sentences = (
        prog_high_sim_df['num_high_sim_src_seg'] >= min_high_sim_num_sents
    )
    enough_percent = prog_high_sim_df['pct_src_sim'] >= min_high_sim_pct_sim
    prog_high_sim_df['ver_has_high_sim'] = enough_sentences | enough_percent

    def _as_list(sections):
        # versions without any similar row carry a missing value from the
        # left join; turn those into empty lists
        if hasattr(sections, '__len__'):
            return list(sections)
        return []

    prog_high_sim_df['source_sections'] = (
        prog_high_sim_df['source_sections'].apply(_as_list)
    )

    # bills with at least one version considered similar to a model
    similar_bills = (
        prog_high_sim_df
        .query('ver_has_high_sim')
        .filter(bill_keys)
        .drop_duplicates()
        .assign(bill_has_1ver_sim=True)
    )
    prog_high_sim_df = (
        prog_high_sim_df
        .merge(similar_bills, how='left')
        .fillna({'bill_has_1ver_sim': False})
        .astype({'bill_has_1ver_sim': 'bool'})
    )

    # among the similar versions of a bill, the one with the highest
    # percentage; ties go to the highest document rank (last after sorting)
    best_versions = (
        prog_high_sim_df
        .query('ver_has_high_sim')
        .sort_values(['pct_src_sim', 'target__doc_rank'])
        .groupby(bill_keys)
        .tail(1)
        .filter(bill_keys + ['target__doc_id', 'target__doc_rank'])
        .assign(bill_max_sim_ver=True)
    )
    prog_high_sim_df = (
        prog_high_sim_df
        .merge(best_versions, how='left')
        .fillna({'bill_max_sim_ver': False})
        .astype({'bill_max_sim_ver': 'bool'})
    )
    return prog_high_sim_df
def construct_dfs_to_display(sim_df):
    """Build the display tables for the selected similar bill versions.

    The input frame is left untouched: all temporary ``disp_*`` columns are
    created on a private copy. The display strings are built column-wise, so
    an empty ``sim_df`` yields empty tables instead of failing.

    Parameters
    ----------
    sim_df : pandas.DataFrame
        One row per (model, selected bill version). Reads ``source__model``,
        ``source__model_num_valid_length``, ``pct_src_sim``,
        ``num_high_sim_src_seg``, ``target__bill``, ``target__type``,
        ``target__doc_rank``, ``target__nvers`` and ``target__url``; other
        columns are passed through when they match the display filter.

    Returns
    -------
    disp_sim_df : pandas.DataFrame
        Display table with (where present) the columns ``model``, ``bill``,
        ``doc_id``, ``bill_id``, ``version``, ``% similar``, ``# sentences``,
        ``progress_pct_src_sim``, ``source_sections`` and ``url``.
    multi_model_bills : pandas.DataFrame
        Bill versions similar to more than one model, with columns ``bill``,
        ``version``, ``url`` and ``model_pct_sim`` (list of
        ``"model (pct%)"`` strings).
    """
    def _strip_prefixes(columns):
        # drop the internal prefixes so the column names are display-ready
        return (
            columns
            .str.replace('disp_', '')
            .str.replace('source__', '')
            .str.replace('target__', '')
        )

    # never add temporary columns to the caller's frame
    work_df = sim_df.copy()

    # display columns, formatted column-wise (works on empty frames too)
    work_df['disp_model_pct_sim'] = [
        f'{model} ({pct:.0f}%)'
        for model, pct in zip(work_df['source__model'], work_df['pct_src_sim'])
    ]
    work_df['disp_% similar'] = work_df['pct_src_sim'].round(1)
    work_df['disp_# sentences'] = [
        f'{n_sim} / {n_valid}'
        for n_sim, n_valid in zip(
            work_df['num_high_sim_src_seg'],
            work_df['source__model_num_valid_length'],
        )
    ]
    work_df['disp_bill'] = [
        f'{bill} ({bill_type})'
        for bill, bill_type in zip(
            work_df['target__bill'], work_df['target__type']
        )
    ]
    work_df['disp_version'] = [
        f'ver. {rank}/{nvers}'
        for rank, nvers in zip(
            work_df['target__doc_rank'], work_df['target__nvers']
        )
    ]

    # finalize display df per model
    disp_sim_df = (
        work_df
        .filter(
            regex='disp|source__model|source_sections|target__bill_id|target__doc_id|target__url|progress_pct_src_sim'
        )
        # also matched by the 'source__model' pattern, but not for display
        .drop(columns='source__model_num_valid_length')
    )
    disp_sim_df.columns = _strip_prefixes(disp_sim_df.columns)
    disp_sim_df = disp_sim_df.filter([
        'model',
        'bill',
        'doc_id',
        'bill_id',
        'version',
        '% similar',
        '# sentences',
        'progress_pct_src_sim',
        'source_sections',
        'url',
    ])

    # bills similar to multiple models
    models_per_bill = (
        work_df
        .groupby(['disp_bill', 'disp_version', 'target__url'])
        ['disp_model_pct_sim'].unique().apply(list)
        .reset_index()
    )
    has_many_models = models_per_bill['disp_model_pct_sim'].apply(len) > 1
    multi_model_bills = models_per_bill.loc[has_many_models].reset_index(drop=True)
    multi_model_bills.columns = _strip_prefixes(multi_model_bills.columns)

    return disp_sim_df, multi_model_bills
def count_similar_bills(sim_df):
    """Count similar bills per model and per (state, model) pair.

    Parameters
    ----------
    sim_df : pandas.DataFrame
        Needs ``source__model``, ``target__state`` and ``target__bill``.

    Returns
    -------
    sim_bill_counts : pandas.DataFrame
        Columns ``model`` and ``count``: number of rows of ``sim_df`` per
        model, sorted by ascending count.
    sim_state_counts : pandas.DataFrame
        Long table with columns ``state``, ``model`` and ``count``: number of
        distinct ``target__bill`` values per state and model, with 0 for
        combinations that do not occur.
    """
    # count bills
    sim_bill_counts = (
        sim_df
        .value_counts('source__model', ascending=True)
        # name the count column explicitly: pandas < 2.0 leaves the
        # value_counts result unnamed, which would surface as a column `0`
        .reset_index(name='count')
        .rename(columns={'source__model': 'model'})
    )

    # count states
    state_by_model = sim_df.pivot_table(
        index='target__state',
        columns='source__model',
        values='target__bill',
        aggfunc='nunique',
        dropna=False
    ).fillna(0)
    # back to long format, keeping the state index as a regular column
    sim_state_counts = (
        state_by_model
        .melt(value_name='count', ignore_index=False)
        .reset_index()
    )
    sim_state_counts.columns = (
        sim_state_counts.columns
        .str.replace('target__', '')
        .str.replace('source__', '')
    )
    return sim_bill_counts, sim_state_counts
def count_model_sections(sim_df, section_idf):
    """Count, for every model section, the bills similar to that section.

    Parameters
    ----------
    sim_df : pandas.DataFrame
        Needs ``source__model``, ``source_sections`` (list-like per row) and
        ``target__bill_id``.
    section_idf : pandas.DataFrame
        Reference table of sections; it is right-merged on the columns it
        shares with the counts (presumably ``model`` and ``section`` - confirm
        against the caller), so every one of its rows appears in the result.

    Returns
    -------
    pandas.DataFrame
        The rows of ``section_idf`` with a ``count`` column holding the number
        of distinct bills per (model, section); 0 where no bill matched.
    """
    # one row per distinct (model, section, bill), with string-typed keys
    bill_sections = (
        sim_df
        .filter(['source__model', 'source_sections', 'target__bill_id'])
        .explode('source_sections')
        .drop_duplicates(ignore_index=True)
        .astype({'source_sections': 'str', 'source__model': 'str'})
    )
    bills_per_section = (
        bill_sections
        .groupby(['source__model', 'source_sections'])['target__bill_id']
        .nunique()
        .rename('count')
        .rename_axis(['model', 'section'])
        .reset_index()
    )
    # right merge keeps sections no bill matched; their count becomes 0
    sec_counts = (
        bills_per_section
        .merge(section_idf, how='right')
        .fillna({'count': 0})
    )
    return sec_counts
def project_bills(
    df,
    colored_target_bills,
    min_nwords,
    thres_ratio,
    version_to_count,
    projector='PCA'
):
    """Project bills onto two dimensions from their per-model similarity features.

    For every (model, bill version) pair four features are computed over the
    rows with ``source__est_nwords >= min_nwords``: max ratio, mean ratio,
    share of rows with ``ratio > thres_ratio`` and row count. One version per
    bill is kept, the features are pivoted to one row per bill, z-scored and
    projected.

    Parameters
    ----------
    df : pandas.DataFrame
        Sentence-level comparison rows (model targets included).
    colored_target_bills : collection
        ``target__bill`` values that should be labelled with the model they
        are most similar to.
    min_nwords, thres_ratio : number
        Sentence-length and similarity thresholds.
    version_to_count : str
        ``'latest'`` keeps rows flagged ``target__is_most_recent_doc``;
        ``'max'`` keeps, per model and bill, the version with the highest
        share of highly similar rows. Any other value keeps all versions.
    projector : str
        ``'PCA'`` or ``'MDS'`` (case-insensitive).

    Returns
    -------
    pandas.DataFrame
        Columns ``bill``, ``PC 1``, ``PC 2`` (these names are used for MDS as
        well) plus the label columns ``label``, ``type``, ``is-model``,
        ``target__bill`` and ``target__is_model``.
    """
    projector = projector.upper()
    assert projector in ['PCA','MDS']

    model_keys = ['source__model']
    bill_keys = [
        'target__bill',
        'target__bill_id',
        'target__is_model',
    ]
    version_keys = [
        'target__doc_id',
        'target__doc_rank',
        'target__is_most_recent_doc',
    ]

    # construct features per billver-model pair
    valid_df = (
        df.query('source__est_nwords >= @min_nwords')
        .reset_index(drop=True)
    )
    valid_df = valid_df.assign(
        source__thres_met=valid_df['ratio'] > thres_ratio
    )
    agg_df = (
        valid_df
        .groupby(model_keys + bill_keys + version_keys)
        .agg(
            feat__max_ratio=("ratio", "max"),
            feat__mean_ratio=("ratio", "mean"),
            feat__pct_high_sim=("source__thres_met", "mean"),
            feat__n_seg=("ratio", "count"),
        )
        .reset_index()
    )
    feature_cols = agg_df.filter(regex='feat__').columns.to_list()

    # select version to visualize
    if version_to_count == 'latest':
        agg_df = agg_df.query('target__is_most_recent_doc')
    elif version_to_count == 'max':
        agg_df = (
            agg_df
            .sort_values('feat__pct_high_sim')
            .groupby(model_keys + bill_keys)
            .tail(1)
        )
    agg_df = (
        agg_df
        .sort_values(['source__model','feat__pct_high_sim'])
        .reset_index(drop=True)
    )

    # assign label: coloured bills get the model they are most similar to
    bill_flags = (
        agg_df
        .filter(['target__bill','target__is_model'])
        .drop_duplicates(ignore_index=True)
    )
    best_model_per_bill = (
        agg_df
        .query('target__bill in @colored_target_bills')
        .sort_values('feat__pct_high_sim')
        .groupby(['target__bill'])
        .tail(1)
        .filter(['source__model', 'target__bill'])
        .drop_duplicates(ignore_index=True)
        .rename(columns={'source__model': 'label'})
    )
    label_df = (
        best_model_per_bill
        .merge(bill_flags, how='right')
        .sort_values('target__bill', ignore_index=True)
    )

    def _label_of(row):
        # model bills are labelled with their own name
        if row['target__is_model']:
            return row['target__bill']
        if not pd.isna(row['label']):
            return row['label']
        return 'none'

    def _type_of(row):
        if row['target__is_model']:
            return 'model bill'
        if row['target__bill'] in colored_target_bills:
            return 'legis. sim. to model'
        return 'legis. w/o sim.'

    def _is_model_of(row):
        if row['target__is_model']:
            return 'model bill'
        return 'legislation'

    label_df['label'] = label_df.apply(_label_of, axis=1)
    label_df['type'] = label_df.apply(_type_of, axis=1)
    label_df['is-model'] = label_df.apply(_is_model_of, axis=1)

    # construct features per bill: one column per (feature, model) pair,
    # with 0 for pairs that have no rows
    feat_df = (
        agg_df
        .pivot(
            index='target__bill',
            columns='source__model',
            values=feature_cols
        )
        .fillna(0)
        .astype(float)
    )
    feat_df.columns = [f'{feat}_{model}' for feat, model in feat_df.columns]

    # projection on z-scored features
    Xz = preprocessing.StandardScaler().fit_transform(feat_df.values)
    projector_factories = {
        'PCA': lambda: decomposition.PCA(whiten=True),
        'MDS': lambda: manifold.MDS(),
    }
    proj_fn = projector_factories[projector]()
    X_proj = proj_fn.fit_transform(Xz)

    # finalize
    proj_df = pd.DataFrame({
        'bill': feat_df.index.to_list(),
        'PC 1': X_proj[:,0],
        'PC 2': X_proj[:,1],
    })
    return proj_df.merge(
        label_df,
        how='left',
        left_on='bill',
        right_on='target__bill'
    )
def process_comparisons(
    df,
    seg_df,
    min_nwords,
    thres_ratio,
    min_high_sim_num_sents,
    min_high_sim_pct_sim,
    version_to_count
):
    """Run the full comparison pipeline and return every derived table.

    Parameters
    ----------
    df : pandas.DataFrame
        Sentence-level comparison rows between model sentences and targets.
    seg_df : pandas.DataFrame
        Model segments; ``model`` and ``section_label`` are read here to list
        every section of every model.
    min_nwords, thres_ratio : number
        Sentence-length and similarity thresholds.
    min_high_sim_num_sents, min_high_sim_pct_sim : number
        Thresholds deciding whether a bill version is similar to a model.
    version_to_count : str
        ``'Latest version'`` or ``'Max. similar version'``.

    Returns
    -------
    tuple
        ``(sim_df, proj_df, disp_sim_df, multi_model_bills, sim_bill_counts,
        sim_state_counts, sec_counts)``.
    """
    version_aliases = {
        'Latest version': 'latest',
        'Max. similar version': 'max'
    }
    assert version_to_count in version_aliases.keys()
    version_to_count = version_aliases[version_to_count]

    # valid model sentences
    df = filter_valid_sentences(df, min_nwords)

    # similarity progress
    prog_high_sim_df = process_progression(
        df,
        min_nwords,
        thres_ratio,
        min_high_sim_num_sents,
        min_high_sim_pct_sim,
    )

    # select version: one similar version per bill, chosen by the alias
    version_filters = {
        'latest': 'target__is_most_recent_doc and ver_has_high_sim',
        'max': 'bill_max_sim_ver == True and ver_has_high_sim',
    }
    sim_df = prog_high_sim_df.query(version_filters[version_to_count])

    # similarity percentage of every version of a bill, in rank order
    progress_df = (
        prog_high_sim_df
        .sort_values('target__doc_rank')
        .groupby(['source__model','target__bill_id'])['pct_src_sim']
        .agg(list)
        .to_frame('progress_pct_src_sim')
        .reset_index()
    )
    progress_df['target__nvers'] = progress_df['progress_pct_src_sim'].apply(len)

    # data frame to analyze
    sim_df = (
        sim_df
        .merge(progress_df, how='left')
        .sort_values(
            ['source__model','pct_src_sim'],
            ascending=[True,False]
        )
        .reset_index(drop=True)
    )

    # data frames to display
    disp_sim_df, multi_model_bills = construct_dfs_to_display(sim_df)

    # count bills and states
    sim_bill_counts, sim_state_counts = count_similar_bills(sim_df)

    # counts sections, against the full list of sections per model
    section_idf = (
        seg_df
        .filter(['model','section_label'])
        .drop_duplicates()
        .rename(columns={'section_label': 'section'})
        .astype({'section':'str','model':'str'})
    )
    sec_counts = count_model_sections(sim_df, section_idf)

    # project bills, colouring the ones found similar to a model
    proj_df = project_bills(
        df,
        sim_df['target__bill'].unique(),
        min_nwords,
        thres_ratio,
        version_to_count
    )

    return (
        sim_df,
        proj_df,
        disp_sim_df,
        multi_model_bills,
        sim_bill_counts,
        sim_state_counts,
        sec_counts
    )
def process_model_sentences(
    df,
    seg_df,
    min_nwords,
    thres_ratio,
    count_ceiling=30,
    cmap_name='colorbrewer:Blues'
):
    """Annotate every model sentence with the number of bills similar to it.

    Parameters
    ----------
    df : pandas.DataFrame
        Sentence-level comparison rows; reads ``ratio``,
        ``source__est_nwords``, ``target__is_model``, ``source__model``,
        ``source__model_sent_idx`` and ``target__bill_id``.
    seg_df : pandas.DataFrame
        Model sentences; reads ``model``, ``section_idx``, ``section_label``,
        ``sentence`` and ``est_nwords``. It is right-merged on the columns it
        shares with the counts (presumably ``model`` and ``model_sent_idx`` -
        confirm against the caller).
    min_nwords, thres_ratio : number
        Sentence-length and similarity thresholds.
    count_ceiling : number or None
        Upper bound on the per-model maximum used to normalize the counts;
        ``None`` normalizes by the per-model maximum itself.
    cmap_name : str
        Name handed to ``Colormap``.

    Returns
    -------
    pandas.DataFrame
        One row per (``model``, ``section_idx``, ``section_label``) with an
        ``annotations`` list. Each item is the plain sentence (no similar
        bill, or sentence shorter than ``min_nwords``) or a tuple
        ``(sentence, count as str, colour hex, '#ffffff' or '#000000')``.
    """
    cm = Colormap(cmap_name)

    # rows where a valid-length model sentence is highly similar to a bill
    matched = (
        df
        .query(
            'ratio > @thres_ratio '
            'and source__est_nwords >= @min_nwords '
            'and (not target__is_model)'
        )
        .sort_values('ratio')
    )
    # keep the highest-ratio row per (model sentence, bill)
    best_per_bill = (
        matched
        .groupby([
            'source__model',
            'source__model_sent_idx',
            'target__bill_id'
        ])
        .tail(1)
    )
    # number of distinct bills per model sentence
    bill_counts = (
        best_per_bill
        .groupby(['source__model', 'source__model_sent_idx'])['target__bill_id']
        .nunique()
        .to_frame('bill_count')
        .reset_index()
    )
    bill_counts.columns = bill_counts.columns.str.replace('source__','')

    # right merge keeps every sentence of seg_df; unmatched ones count 0
    model_sent_df = (
        bill_counts
        .merge(seg_df, how='right')
        .fillna({'bill_count': 0})
        .astype({'bill_count': 'int'})
    )
    max_counts = (
        model_sent_df
        .groupby('model')['bill_count']
        .max()
        .to_frame('max_count')
        .reset_index()
    )
    model_sent_df = model_sent_df.merge(max_counts)

    def _normalized_count(row):
        # scale by the per-model maximum, optionally capped by the ceiling,
        # and clip the result at 1
        scale = row['max_count']
        if count_ceiling is not None:
            scale = min(row['max_count'], count_ceiling)
        return min(1, row['bill_count'] / scale)

    model_sent_df['norm_value'] = model_sent_df.apply(_normalized_count, axis=1)

    # escape dollar signs in the sentence text
    model_sent_df['sentence'] = model_sent_df['sentence'].map(
        lambda text: text.replace('$', r'\$')
    )
    model_sent_df['color'] = [Color(x) for x in cm(model_sent_df['norm_value'])]

    def _annotation(row):
        if row['bill_count'] == 0 or row['est_nwords'] < min_nwords:
            return row['sentence']
        # white on dark colours, black on light ones
        contrast = '#ffffff' if row['color'].hsl.l < 0.5 else '#000000'
        return (
            row['sentence'],
            str(row['bill_count']),
            row['color'].hex,
            contrast,
        )

    model_sent_df['annotations'] = model_sent_df.apply(_annotation, axis=1)

    # collect the annotations of each section, in row order
    return (
        model_sent_df
        .groupby([
            'model',
            'section_idx',
            'section_label',
        ])['annotations']
        .agg(list)
        .reset_index()
    )