Spaces:
Sleeping
Sleeping
| import pandas as pd | |
| import numpy as np | |
| from cmap import Colormap, Color | |
| from sklearn import preprocessing, decomposition, manifold | |
def filter_valid_sentences(
    df,
    min_nwords,
):
    """Attach each model's number of valid sentences to every row of ``df``.

    A model sentence counts as valid when ``source__est_nwords`` is at least
    ``min_nwords``.  Rows are de-duplicated on (model, sentence id, word
    count) before counting, so a sentence that appears in many rows is
    counted once.

    Returns a new frame: ``df`` plus ``source__model_num_valid_length``.
    The counts are left-joined, so a model without any valid sentence gets a
    missing value in that column.
    """
    sentence_cols = [
        'source__model',
        'source__unq_sentence_id',
        'source__est_nwords',
    ]
    unique_sentences = df.filter(sentence_cols).drop_duplicates()
    valid_sentences = unique_sentences.query(
        'source__est_nwords >= @min_nwords'
    )
    valid_counts = (
        valid_sentences
        .groupby('source__model')['source__unq_sentence_id']
        .count()
        .to_frame('source__model_num_valid_length')
        .reset_index()
    )
    # no explicit key: the merge joins on the columns shared by both frames
    return df.merge(valid_counts, how='left')
def process_progression(
    df,
    min_nwords,
    thres_ratio,
    min_high_sim_num_sents,
    min_high_sim_pct_sim,
):
    """Summarise, per model and bill version, how much of the model matches.

    Only rows whose model sentence has at least ``min_nwords`` words and
    whose target is not itself a model are considered.  A row is a match
    when its ``ratio`` is strictly greater than ``thres_ratio``.

    The result has one row per (model, bill version) with:

    * ``num_high_sim_src_seg`` - number of matching rows,
    * ``source_sections`` - list of the distinct section labels of the
      matching rows (empty list when there is none),
    * ``pct_src_sim`` - ``100 * num_high_sim_src_seg`` divided by
      ``source__model_num_valid_length``,
    * ``ver_has_high_sim`` - True when the count reaches
      ``min_high_sim_num_sents`` or the percentage reaches
      ``min_high_sim_pct_sim``,
    * ``bill_has_1ver_sim`` - True when any version of the same bill is
      similar to the model,
    * ``bill_max_sim_ver`` - True for the similar version with the highest
      percentage (ties go to the highest ``target__doc_rank``).
    """
    version_keys = [
        'source__model',
        'source__model_num_valid_length',
        'target__doc_id',
        'target__doc_rank',
        'target__is_most_recent_doc',
        'target__bill',
        'target__bill_id',
        'target__state',
        'target__type',
        'target__url',
    ]
    bill_keys = ['source__model', 'target__bill_id']
    version_id_keys = bill_keys + ['target__doc_id', 'target__doc_rank']

    def _as_list(value):
        # versions without a match get a missing value from the left join
        # below; anything that has no length becomes an empty list
        if hasattr(value, '__len__'):
            return list(value)
        return []

    # valid model sentences compared against bills only
    pairs = df.query(
        'source__est_nwords >= @min_nwords '
        'and (not target__is_model)'
    ).reset_index(drop=True)
    pairs = pairs.assign(source__thres_met=pairs['ratio'] > thres_ratio)

    # number of matches and matched sections per (model, bill version)
    match_counts = pairs.groupby(version_keys).agg(
        num_high_sim_src_seg=('source__thres_met', 'sum')
    )
    matched_sections = (
        pairs.loc[pairs['source__thres_met']]
        .groupby(version_keys)
        .agg(source_sections=('source__section_label', 'unique'))
    )
    versions = match_counts.join(matched_sections, how='left').reset_index()

    # share of the model's valid sentences that matched
    versions = versions.assign(
        pct_src_sim=(
            100 * versions['num_high_sim_src_seg']
            / versions['source__model_num_valid_length']
        )
    )
    # a version is similar when either threshold is reached
    versions = versions.assign(
        ver_has_high_sim=(
            (versions['num_high_sim_src_seg'] >= min_high_sim_num_sents)
            | (versions['pct_src_sim'] >= min_high_sim_pct_sim)
        )
    )
    versions['source_sections'] = versions['source_sections'].apply(_as_list)

    # bills with at least one similar version
    similar_bills = (
        versions.loc[versions['ver_has_high_sim']]
        .filter(bill_keys)
        .drop_duplicates()
        .assign(bill_has_1ver_sim=True)
    )
    versions = (
        versions
        .merge(similar_bills, how='left', on=bill_keys)
        .fillna({'bill_has_1ver_sim': False})
        .astype({'bill_has_1ver_sim': 'bool'})
    )

    # the most similar version of every bill; sorting ascending and taking
    # the last row per bill keeps the highest percentage, then highest rank
    best_versions = (
        versions.loc[versions['ver_has_high_sim']]
        .sort_values(['pct_src_sim', 'target__doc_rank'])
        .groupby(bill_keys)
        .tail(1)
        .filter(version_id_keys)
        .assign(bill_max_sim_ver=True)
    )
    versions = (
        versions
        .merge(best_versions, how='left', on=version_id_keys)
        .fillna({'bill_max_sim_ver': False})
        .astype({'bill_max_sim_ver': 'bool'})
    )
    return versions
def construct_dfs_to_display(sim_df):
    """Build the two human-readable tables derived from ``sim_df``.

    ``sim_df`` holds one row per (model, bill version) match and must
    provide ``source__model``, ``source__model_num_valid_length``,
    ``pct_src_sim``, ``num_high_sim_src_seg``, ``target__bill``,
    ``target__type``, ``target__doc_rank``, ``target__nvers`` and
    ``target__url``.

    Returns ``(disp_sim_df, multi_model_bills)``:

    * ``disp_sim_df`` - one row per input row with whichever of the columns
      ``model``, ``bill``, ``doc_id``, ``bill_id``, ``version``,
      ``% similar``, ``# sentences``, ``progress_pct_src_sim``,
      ``source_sections`` and ``url`` can be derived from the input, in
      that order.
    * ``multi_model_bills`` - one row per (bill, version, url) that matches
      more than one model, with the list of ``"<model> (<pct>%)"`` labels
      in ``model_pct_sim``.

    The display strings are built column-wise on a copy.  Unlike a row-wise
    ``DataFrame.apply``, this also works when ``sim_df`` has no rows (no
    bill passed the similarity thresholds), and the caller's frame is never
    modified.
    """
    def _strip_prefixes(name):
        # turn internal column names into display names
        return (
            name
            .replace('disp_', '')
            .replace('source__', '')
            .replace('target__', '')
        )

    # add some display columns (on a copy, the input stays untouched)
    work = sim_df.assign(**{
        'disp_model_pct_sim': [
            f'{model} ({pct:.0f}%)'
            for model, pct in zip(
                sim_df['source__model'], sim_df['pct_src_sim']
            )
        ],
        'disp_% similar': sim_df['pct_src_sim'].round(1),
        'disp_# sentences': [
            f'{n_sim} / {n_valid}'
            for n_sim, n_valid in zip(
                sim_df['num_high_sim_src_seg'],
                sim_df['source__model_num_valid_length'],
            )
        ],
        'disp_bill': [
            f'{bill} ({bill_type})'
            for bill, bill_type in zip(
                sim_df['target__bill'], sim_df['target__type']
            )
        ],
        'disp_version': [
            f'ver. {rank}/{nvers}'
            for rank, nvers in zip(
                sim_df['target__doc_rank'], sim_df['target__nvers']
            )
        ],
    })

    # finalize display df per model
    disp_sim_df = (
        work
        .filter(
            regex='disp|source__model|source_sections|target__bill_id|target__doc_id|target__url|progress_pct_src_sim'
        )
        # the regex above also picks up the sentence-count helper column
        .drop(columns='source__model_num_valid_length')
        .rename(columns=_strip_prefixes)
        .filter([
            'model',
            'bill',
            'doc_id',
            'bill_id',
            'version',
            '% similar',
            '# sentences',
            'progress_pct_src_sim',
            'source_sections',
            'url',
        ])
    )

    # bills similar to multiple models
    models_per_version = (
        work
        .groupby(['disp_bill', 'disp_version', 'target__url'])
        ['disp_model_pct_sim'].unique().apply(list)
        .reset_index()
    )
    matches_several = models_per_version['disp_model_pct_sim'].apply(len) > 1
    multi_model_bills = (
        models_per_version.loc[matches_several]
        .reset_index(drop=True)
        .rename(columns=_strip_prefixes)
    )
    return disp_sim_df, multi_model_bills
def count_similar_bills(sim_df):
    """Count matches per model and distinct bills per state and model.

    Returns ``(sim_bill_counts, sim_state_counts)``:

    * ``sim_bill_counts`` - number of rows of ``sim_df`` per model, in
      ascending order of the count, with the model in column ``model``.
    * ``sim_state_counts`` - long table with columns ``state``, ``model``
      and ``count``, holding the number of distinct ``target__bill`` values
      for each state/model combination (0 where there is none).
    """
    def _drop_prefix(name):
        return name.replace('target__', '').replace('source__', '')

    # count bills
    rows_per_model = sim_df.value_counts('source__model', ascending=True)
    sim_bill_counts = (
        rows_per_model
        .reset_index()
        .rename(columns={'source__model': 'model'})
    )

    # count states
    bills_by_state_and_model = sim_df.pivot_table(
        index='target__state',
        columns='source__model',
        values='target__bill',
        aggfunc='nunique',
        dropna=False,
    )
    sim_state_counts = (
        bills_by_state_and_model
        .fillna(0)
        # back to long form, keeping the state index for reset_index below
        .melt(value_name='count', ignore_index=False)
        .reset_index()
        .rename(columns=_drop_prefix)
    )
    return sim_bill_counts, sim_state_counts
def count_model_sections(sim_df, section_idf):
    """Count, per model section, the distinct bills that match it.

    ``sim_df`` carries a list of matched section labels per row in
    ``source_sections``.  ``section_idf`` lists the known sections
    (the join uses whatever columns the two frames share, i.e. ``model``
    and ``section`` after renaming).

    Returns the rows of ``section_idf`` with a ``count`` column: the number
    of distinct ``target__bill_id`` values matching that section, 0 for
    sections that no bill matches.
    """
    # one row per (model, section, bill)
    section_bill_pairs = (
        sim_df
        .filter(['source__model', 'source_sections', 'target__bill_id'])
        .explode('source_sections')
        .drop_duplicates(ignore_index=True)
        .astype({'source_sections': 'str', 'source__model': 'str'})
    )
    bills_per_section = (
        section_bill_pairs
        .groupby(['source__model', 'source_sections'])['target__bill_id']
        .nunique()
        .reset_index(name='count')
        .rename(columns={
            'source__model': 'model',
            'source_sections': 'section',
        })
    )
    # right join: keep every known section, also the never-matched ones
    sec_counts = (
        bills_per_section
        .merge(section_idf, how='right')
        .fillna({'count': 0})
    )
    return sec_counts
def project_bills(
    df,
    colored_target_bills,
    min_nwords,
    thres_ratio,
    version_to_count,
    projector='PCA'
):
    """Project every target onto two dimensions from its similarity features.

    For each (model, target version) pair the rows with at least
    ``min_nwords`` words are aggregated into four features: maximum ratio,
    mean ratio, share of rows with ``ratio > thres_ratio`` and row count.
    One version per (model, target) is kept: the most recent one for
    ``version_to_count == 'latest'``, the one with the highest share of
    matching rows for ``'max'``.  The features of all models are laid side
    by side per ``target__bill``, standardised and projected with PCA
    (whitened) or MDS, chosen by ``projector`` (case-insensitive).

    Returns a frame with ``bill``, ``PC 1``, ``PC 2`` (the first two
    projected coordinates, whichever projector is used) and the label
    columns ``label``, ``target__bill``, ``target__is_model``, ``type``
    and ``is-model``.  ``colored_target_bills`` are the targets labelled
    with their best-matching model.
    """
    projector = projector.upper()
    assert projector in ['PCA', 'MDS']

    pair_keys = [
        'source__model',
        'target__bill',
        'target__bill_id',
        'target__is_model',
    ]
    version_keys = [
        'target__doc_id',
        'target__doc_rank',
        'target__is_most_recent_doc',
    ]

    def _label(row):
        # models are labelled by their own name, similar bills by their
        # best-matching model, everything else by 'none'
        if row['target__is_model']:
            return row['target__bill']
        if not pd.isna(row['label']):
            return row['label']
        return 'none'

    def _kind(row):
        if row['target__is_model']:
            return 'model bill'
        if row['target__bill'] in colored_target_bills:
            return 'legis. sim. to model'
        return 'legis. w/o sim.'

    def _model_or_legislation(row):
        return 'model bill' if row['target__is_model'] else 'legislation'

    # construct features per billver-model pair
    sentences = (
        df.query('source__est_nwords >= @min_nwords')
        .reset_index(drop=True)
    )
    sentences = sentences.assign(
        source__thres_met=sentences['ratio'] > thres_ratio
    )
    agg_df = (
        sentences
        .groupby(pair_keys + version_keys)
        .agg(
            feat__max_ratio=("ratio", "max"),
            feat__mean_ratio=("ratio", "mean"),
            feat__pct_high_sim=("source__thres_met", "mean"),
            feat__n_seg=("ratio", "count"),
        )
        .reset_index()
    )
    permod_feat_cols = [col for col in agg_df.columns if 'feat__' in col]

    # select version to visualize
    if version_to_count == 'latest':
        agg_df = agg_df.query('target__is_most_recent_doc')
    elif version_to_count == 'max':
        agg_df = (
            agg_df
            .sort_values('feat__pct_high_sim')
            .groupby(pair_keys)
            .tail(1)
        )
    agg_df = (
        agg_df
        .sort_values(['source__model', 'feat__pct_high_sim'])
        .reset_index(drop=True)
    )

    # assign label: best-matching model of every highlighted bill ...
    best_model_per_bill = (
        agg_df
        .query('target__bill in @colored_target_bills')
        .sort_values('feat__pct_high_sim')
        .groupby(['target__bill'])
        .tail(1)
        .filter(['source__model', 'target__bill'])
        .drop_duplicates(ignore_index=True)
        .rename(columns={'source__model': 'label'})
    )
    # ... right-joined onto all targets so that none is lost
    all_targets = (
        agg_df
        .filter(['target__bill', 'target__is_model'])
        .drop_duplicates(ignore_index=True)
    )
    label_df = (
        best_model_per_bill
        .merge(all_targets, how='right')
        .sort_values('target__bill', ignore_index=True)
    )
    label_df['label'] = label_df.apply(_label, axis=1)
    label_df['type'] = label_df.apply(_kind, axis=1)
    label_df['is-model'] = label_df.apply(_model_or_legislation, axis=1)

    # construct features per bill: one column per (feature, model)
    feat_df = (
        agg_df
        .pivot(
            index='target__bill',
            columns='source__model',
            values=permod_feat_cols,
        )
        .fillna(0)
        .astype(float)
    )
    feat_df.columns = [f'{feat}_{model}' for feat, model in feat_df.columns]

    # projection
    make_projector = {
        'PCA': lambda: decomposition.PCA(whiten=True),
        'MDS': lambda: manifold.MDS(),
    }
    standardised = preprocessing.StandardScaler().fit_transform(
        feat_df.values
    )
    coords = make_projector[projector]().fit_transform(standardised)

    # finalize
    proj_df = pd.DataFrame({
        'bill': feat_df.index.to_list(),
        'PC 1': coords[:, 0],
        'PC 2': coords[:, 1],
    })
    return proj_df.merge(
        label_df,
        how='left',
        left_on='bill',
        right_on='target__bill',
    )
def process_comparisons(
    df,
    seg_df,
    min_nwords,
    thres_ratio,
    min_high_sim_num_sents,
    min_high_sim_pct_sim,
    version_to_count
):
    """Run the whole model-vs-bill comparison and return all result tables.

    ``version_to_count`` is the user-facing choice of bill version and must
    be ``'Latest version'`` or ``'Max. similar version'``.

    Returns the tuple ``(sim_df, proj_df, disp_sim_df, multi_model_bills,
    sim_bill_counts, sim_state_counts, sec_counts)``: the selected similar
    bill versions, their 2-D projection, the two display tables, the
    per-model and per-state counts and the per-section counts.
    """
    vercnt_alias_map = {
        'Latest version': 'latest',
        'Max. similar version': 'max'
    }
    assert version_to_count in vercnt_alias_map
    version_to_count = vercnt_alias_map[version_to_count]

    # which rows of the progression table make up the analysed versions
    version_filters = {
        'latest': 'target__is_most_recent_doc and ver_has_high_sim',
        'max': 'bill_max_sim_ver == True and ver_has_high_sim',
    }
    bill_keys = ['source__model', 'target__bill_id']

    # valid model sentences
    df = filter_valid_sentences(df, min_nwords)

    # similarity progress
    prog_high_sim_df = process_progression(
        df,
        min_nwords,
        thres_ratio,
        min_high_sim_num_sents,
        min_high_sim_pct_sim,
    )

    # select version
    sim_df = prog_high_sim_df.query(version_filters[version_to_count])

    # append the similarity progress: the percentages of all versions of a
    # bill in rank order, and how many versions there are
    progress = (
        prog_high_sim_df
        .sort_values('target__doc_rank')
        .groupby(bill_keys)['pct_src_sim']
        .agg(list)
        .reset_index(name='progress_pct_src_sim')
    )
    progress['target__nvers'] = progress['progress_pct_src_sim'].apply(len)

    # data frame to analyze
    sim_df = (
        sim_df
        .merge(progress, how='left', on=bill_keys)
        .sort_values(
            ['source__model', 'pct_src_sim'],
            ascending=[True, False]
        )
        .reset_index(drop=True)
    )

    # data frames to display
    disp_sim_df, multi_model_bills = construct_dfs_to_display(sim_df)

    # count bills and states
    sim_bill_counts, sim_state_counts = count_similar_bills(sim_df)

    # counts sections
    section_idf = (
        seg_df
        .filter(['model', 'section_label'])
        .drop_duplicates()
        .rename(columns={'section_label': 'section'})
        .astype({'section': 'str', 'model': 'str'})
    )
    sec_counts = count_model_sections(sim_df, section_idf)

    # project bills, highlighting those found similar to a model
    proj_df = project_bills(
        df,
        sim_df['target__bill'].unique(),
        min_nwords,
        thres_ratio,
        version_to_count
    )

    return (
        sim_df,
        proj_df,
        disp_sim_df,
        multi_model_bills,
        sim_bill_counts,
        sim_state_counts,
        sec_counts
    )
def process_model_sentences(
    df,
    seg_df,
    min_nwords,
    thres_ratio,
    count_ceiling=30,
    cmap_name='colorbrewer:Blues'
):
    """Annotate every model sentence with the number of bills that reuse it.

    A bill reuses a sentence when some row pairs them with
    ``ratio > thres_ratio``; only sentences with at least ``min_nwords``
    words and non-model targets are counted.  The counts are joined onto
    ``seg_df`` (sentences without a match get 0) and scaled to ``[0, 1]``
    by the per-model maximum count, capped at ``count_ceiling`` unless that
    is ``None``.  The scaled value selects a colour from ``cmap_name``.

    Returns one row per (``model``, ``section_idx``, ``section_label``)
    with ``annotations``: a list holding, per sentence, either the plain
    sentence (no reusing bill, or sentence too short) or the tuple
    ``(sentence, count as text, background hex colour, text colour)``.

    The scaling is computed column-wise and yields 0 when the scale is 0
    (a model none of whose sentences is reused, or ``count_ceiling=0``);
    a row-wise ``bill_count / scale`` raises ``ZeroDivisionError`` there.
    """
    def _annotate(row):
        if row['bill_count'] == 0 or row['est_nwords'] < min_nwords:
            return row['sentence']
        color = row['color']
        return (
            row['sentence'],
            str(row['bill_count']),
            color.hex,
            # white text on dark backgrounds, black on light ones
            '#ffffff' if color.hsl.l < 0.5 else '#000000',
        )

    cm = Colormap(cmap_name)

    # number of distinct bills reusing each model sentence; nunique already
    # ignores repeated (sentence, bill) rows, so no de-duplication is needed
    model_sent_df = (
        df
        .query(
            'ratio > @thres_ratio '
            'and source__est_nwords >= @min_nwords '
            'and (not target__is_model)'
        )
        .groupby([
            'source__model',
            'source__model_sent_idx'
        ])
        ['target__bill_id'].nunique()
        .to_frame('bill_count')
        .reset_index()
        .rename(columns=lambda name: name.replace('source__', ''))
    )

    # right join: keep every model sentence, also the never-reused ones
    model_sent_df = (
        model_sent_df
        .merge(seg_df, how='right')
        .fillna({'bill_count': 0})
        .astype({'bill_count': 'int'})
    )
    model_sent_df = model_sent_df.merge(
        model_sent_df.groupby('model')
        ['bill_count'].max()
        .to_frame('max_count')
        .reset_index()
    )

    # scale counts to [0, 1]; always float, 0 where there is nothing to
    # scale by (avoids a division by zero)
    scale = model_sent_df['max_count']
    if count_ceiling is not None:
        scale = scale.clip(upper=count_ceiling)
    model_sent_df['norm_value'] = (
        (model_sent_df['bill_count'] / scale.where(scale > 0))
        .clip(upper=1)
        .fillna(0.0)
    )

    # escape '$' in the sentence text
    # NOTE(review): presumably so the renderer does not treat it as math
    # markup - confirm against the display code
    model_sent_df['sentence'] = model_sent_df['sentence'].str.replace(
        '$', r'\$', regex=False
    )
    model_sent_df['color'] = [
        Color(x) for x in cm(model_sent_df['norm_value'])
    ]
    model_sent_df['annotations'] = model_sent_df.apply(_annotate, axis=1)

    model_sent_df = (
        model_sent_df
        .groupby([
            'model',
            'section_idx',
            'section_label',
        ])
        ['annotations'].agg(list)
        .reset_index()
    )
    return model_sent_df