from typing import Optional, Tuple

import numpy as np
import pandas as pd
from cmap import Color, Colormap
from sklearn import decomposition, manifold, preprocessing


def _strip_column_tokens(columns, tokens=('disp_', 'source__', 'target__')):
    """Return ``columns`` with every occurrence of each token removed.

    The tokens are treated as literal text (not regular expressions) and are
    removed wherever they occur in a column name, in the order given.
    """
    for token in tokens:
        columns = columns.str.replace(token, '', regex=False)
    return columns


def _format_rows(df: pd.DataFrame, template: str) -> pd.Series:
    """Render ``template`` once per row, using the row's columns as fields."""
    return df.apply(lambda row: template.format(**row), axis=1)


def filter_valid_sentences(
    df: pd.DataFrame,
    min_nwords: float,
) -> pd.DataFrame:
    """Attach each model's count of valid-length sentences to ``df``.

    A sentence counts as valid when ``source__est_nwords >= min_nwords``.
    The per-model count is computed over de-duplicated
    (``source__model``, ``source__unq_sentence_id``, ``source__est_nwords``)
    rows and stored in a new ``source__model_num_valid_length`` column.

    Args:
        df: Comparison table holding at least the three columns above.
        min_nwords: Minimum estimated word count for a sentence to be valid.

    Returns:
        ``df`` left-merged with the per-model counts. Models without any
        valid sentence get a missing value in the new column.
    """
    valid_counts = (
        df
        .filter(['source__model', 'source__unq_sentence_id', 'source__est_nwords'])
        .drop_duplicates()
        .query('source__est_nwords >= @min_nwords')
        .groupby('source__model')
        ['source__unq_sentence_id'].count()
        .to_frame('source__model_num_valid_length')
        .reset_index()
    )
    # No explicit ``on``: the merge joins on whatever columns both frames
    # share, exactly as before.
    return df.merge(valid_counts, how='left')


def process_progression(
    df: pd.DataFrame,
    min_nwords: float,
    thres_ratio: float,
    min_high_sim_num_sents: float,
    min_high_sim_pct_sim: float,
) -> pd.DataFrame:
    """Summarise, per (model, bill version), how similar the version is.

    Args:
        df: Comparison table that already carries
            ``source__model_num_valid_length`` (see
            ``filter_valid_sentences``).
        min_nwords: Minimum ``source__est_nwords`` for a row to be used.
        thres_ratio: A row is "highly similar" when ``ratio > thres_ratio``.
        min_high_sim_num_sents: Minimum number of highly similar rows for a
            version to be flagged as similar.
        min_high_sim_pct_sim: Minimum percentage of highly similar rows
            (relative to ``source__model_num_valid_length``) for a version
            to be flagged as similar. Either criterion is sufficient.

    Returns:
        One row per combination of the grouping columns with
        ``num_high_sim_src_seg``, ``source_sections`` (list),
        ``pct_src_sim``, ``ver_has_high_sim``, ``bill_has_1ver_sim`` and
        ``bill_max_sim_ver``.
    """
    main_groupbys = [
        'source__model',
        'source__model_num_valid_length',
        'target__doc_id',
        'target__doc_rank',
        'target__is_most_recent_doc',
        'target__bill',
        'target__bill_id',
        'target__state',
        'target__type',
        'target__url',
    ]

    # select valid model sentences, only consider bills (rows whose target is
    # not itself a model), and flag the highly similar ones
    prog_high_sim_df = (
        df.query(
            'source__est_nwords >= @min_nwords '
            'and (not target__is_model)'
        )
        .reset_index(drop=True)
        .assign(
            source__thres_met=lambda x: x['ratio'] > thres_ratio
        )
    )

    # distinct section labels among the highly similar rows of each group
    similar_sections = (
        prog_high_sim_df
        .query('source__thres_met')
        .groupby(main_groupbys)
        .agg(
            source_sections=('source__section_label', 'unique')
        )
    )

    prog_high_sim_df = (
        prog_high_sim_df
        # count similar model sentences
        .groupby(main_groupbys)
        .agg(
            num_high_sim_src_seg=('source__thres_met', 'sum')
        )
        .join(similar_sections, how='left')
        .reset_index()
        # get % of similar model sentences
        .assign(
            pct_src_sim=lambda x: (
                100 * x['num_high_sim_src_seg'] / x['source__model_num_valid_length']
            ),
        )
        # threshold to determine that a bill is similar to a model
        .assign(
            ver_has_high_sim=lambda x: (
                (x['num_high_sim_src_seg'] >= min_high_sim_num_sents) |
                (x['pct_src_sim'] >= min_high_sim_pct_sim)
            )
        )
    )

    # Groups without any highly similar row come out of the left join as a
    # missing value (no ``__len__``); normalise everything to plain lists.
    prog_high_sim_df['source_sections'] = prog_high_sim_df['source_sections'].apply(
        lambda x: list(x) if hasattr(x, '__len__') else []
    )

    # find bills with at least one version considered as similar to a model
    bills_with_atleast_one_ver_sim = (
        prog_high_sim_df
        .query('ver_has_high_sim')
        .filter(['source__model', 'target__bill_id'])
        .drop_duplicates()
        .assign(bill_has_1ver_sim=True)
    )
    prog_high_sim_df = (
        prog_high_sim_df.merge(
            bills_with_atleast_one_ver_sim,
            how='left'
        )
        .fillna({'bill_has_1ver_sim': False})
        .astype({'bill_has_1ver_sim': 'bool'})
    )

    # version of bills with max similarity: after the ascending sort the last
    # row of each (model, bill) group has the highest ``pct_src_sim``, ties
    # being resolved in favour of the highest ``target__doc_rank``
    max_sim_ver_of_bills = (
        prog_high_sim_df
        .query('ver_has_high_sim')
        .sort_values(['pct_src_sim', 'target__doc_rank'])
        .groupby([
            'source__model',
            'target__bill_id',
        ])
        .tail(1)
        .filter([
            'source__model',
            'target__bill_id',
            'target__doc_id',
            'target__doc_rank',
        ])
        .assign(bill_max_sim_ver=True)
    )
    prog_high_sim_df = (
        prog_high_sim_df.merge(
            max_sim_ver_of_bills,
            how='left'
        )
        .fillna({'bill_max_sim_ver': False})
        .astype({'bill_max_sim_ver': 'bool'})
    )

    return prog_high_sim_df


def construct_dfs_to_display(
    sim_df: pd.DataFrame,
) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Build the two display tables derived from ``sim_df``.

    Args:
        sim_df: Per (model, bill) similarity table. It must provide the
            columns used below, including ``target__nvers`` and
            ``progress_pct_src_sim`` (both added by ``process_comparisons``).
            The frame is not modified.

    Returns:
        ``(disp_sim_df, multi_model_bills)`` where ``disp_sim_df`` is the
        per-model display table with prefix-free column names, and
        ``multi_model_bills`` lists every (bill, version, url) that has more
        than one distinct "model (pct%)" label.
    """
    # Work on a copy so the temporary ``disp_*`` columns never leak into the
    # caller's frame, even when formatting fails half-way through.
    work_df = sim_df.copy()

    # add some display columns
    work_df['disp_model_pct_sim'] = _format_rows(
        work_df, '{source__model} ({pct_src_sim:.0f}%)'
    )
    work_df['disp_% similar'] = work_df['pct_src_sim'].round(1)
    work_df['disp_# sentences'] = _format_rows(
        work_df, '{num_high_sim_src_seg} / {source__model_num_valid_length}'
    )
    work_df['disp_bill'] = _format_rows(
        work_df, '{target__bill} ({target__type})'
    )
    work_df['disp_version'] = _format_rows(
        work_df, 'ver. {target__doc_rank}/{target__nvers}'
    )

    # finalize display df per model
    disp_sim_df = (
        work_df
        .filter(
            regex='disp|source__model|source_sections|target__bill_id|target__doc_id|target__url|progress_pct_src_sim'
        )
        # also matched by the 'source__model' alternative of the regex above
        .drop(columns='source__model_num_valid_length')
    )
    disp_sim_df.columns = _strip_column_tokens(disp_sim_df.columns)
    disp_sim_df = (
        disp_sim_df
        .filter([
            'model',
            'bill',
            'doc_id',
            'bill_id',
            'version',
            '% similar',
            '# sentences',
            'progress_pct_src_sim',
            'source_sections',
            'url',
        ])
    )

    # bills similar to multiple models
    models_per_bill = (
        work_df
        .groupby(['disp_bill', 'disp_version', 'target__url'])
        ['disp_model_pct_sim'].unique().apply(list)
        .reset_index()
    )
    multi_model_bills = (
        models_per_bill.loc[
            models_per_bill['disp_model_pct_sim'].apply(len) > 1
        ]
        .reset_index(drop=True)
    )
    multi_model_bills.columns = _strip_column_tokens(multi_model_bills.columns)

    return disp_sim_df, multi_model_bills


def count_similar_bills(
    sim_df: pd.DataFrame,
) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Count the rows of ``sim_df`` per model and distinct bills per state.

    Args:
        sim_df: Table with ``source__model``, ``target__state`` and
            ``target__bill`` columns.

    Returns:
        ``(sim_bill_counts, sim_state_counts)``. The first holds the number
        of ``sim_df`` rows per model in ascending order, with the model
        column renamed to ``model``. The second is a long table of the number
        of distinct ``target__bill`` values per (state, model), with 0 for
        combinations that do not occur and prefix-free column names.
    """
    # count bills
    sim_bill_counts = (
        sim_df
        .value_counts('source__model', ascending=True)
        .reset_index()
        .rename(columns={'source__model': 'model'})
    )

    # count states
    sim_state_counts = (
        sim_df.pivot_table(
            index='target__state',
            columns='source__model',
            values='target__bill',
            aggfunc='nunique',
            dropna=False
        )
        .fillna(0)
        .melt(value_name='count', ignore_index=False)
        .reset_index()
    )
    sim_state_counts.columns = _strip_column_tokens(
        sim_state_counts.columns, tokens=('target__', 'source__')
    )

    return sim_bill_counts, sim_state_counts


def count_model_sections(
    sim_df: pd.DataFrame,
    section_idf: pd.DataFrame,
) -> pd.DataFrame:
    """Count, per model section, the distinct bills that are similar to it.

    Args:
        sim_df: Table with ``source__model``, ``source_sections`` (a list per
            row) and ``target__bill_id``.
        section_idf: Reference table of sections; the caller in this module
            passes string-typed ``model`` and ``section`` columns.

    Returns:
        ``section_idf`` right-merged with the counts, so every reference
        section is present; sections without any similar bill get 0.
    """
    sec_counts = (
        sim_df
        .filter(['source__model', 'source_sections', 'target__bill_id'])
        .explode('source_sections')
        .drop_duplicates(ignore_index=True)
        .astype({'source_sections': 'str', 'source__model': 'str'})
        .groupby(['source__model', 'source_sections'])
        ['target__bill_id'].nunique()
        .to_frame('count')
        .reset_index()
        .rename(columns={
            'source__model': 'model',
            'source_sections': 'section'
        })
        .merge(
            section_idf,
            how='right'
        )
        .fillna({'count': 0})
    )
    return sec_counts


def project_bills(
    df: pd.DataFrame,
    colored_target_bills,
    min_nwords: float,
    thres_ratio: float,
    version_to_count: str,
    projector: str = 'PCA'
) -> pd.DataFrame:
    """Project every bill into 2-D from its similarity features per model.

    Args:
        df: Comparison table (one row per source/target sentence pair).
        colored_target_bills: Collection of ``target__bill`` values that were
            found similar to a model; these receive a model label.
        min_nwords: Minimum ``source__est_nwords`` for a row to be used.
        thres_ratio: A row is "highly similar" when ``ratio > thres_ratio``.
        version_to_count: ``'latest'`` keeps the rows flagged
            ``target__is_most_recent_doc``; ``'max'`` keeps, per
            (model, bill), the version with the highest share of highly
            similar rows. Any other value keeps all versions.
        projector: ``'PCA'`` or ``'MDS'`` (case-insensitive).

    Returns:
        One row per bill with ``bill``, ``PC 1``, ``PC 2`` and the label
        columns ``label``, ``type`` and ``is-model``. The coordinate columns
        are named ``PC 1``/``PC 2`` whichever projector is used.

    Raises:
        AssertionError: If ``projector`` is neither ``'PCA'`` nor ``'MDS'``.
    """
    projector = projector.upper()
    assert projector in ['PCA', 'MDS'], (
        f"projector must be 'PCA' or 'MDS', got {projector!r}"
    )

    source_cols = ['source__model']
    target_cols = [
        'target__bill',
        'target__bill_id',
        'target__is_model',
    ]
    tgt_ver_cols = [
        'target__doc_id',
        'target__doc_rank',
        'target__is_most_recent_doc',
    ]

    # construct features per billver-model pair
    agg_df = (
        df.query('source__est_nwords >= @min_nwords')
        .reset_index(drop=True)
        .assign(
            source__thres_met=lambda x: x['ratio'] > thres_ratio
        )
        .groupby(source_cols + target_cols + tgt_ver_cols)
        .agg(
            feat__max_ratio=("ratio", "max"),
            feat__mean_ratio=("ratio", "mean"),
            feat__pct_high_sim=("source__thres_met", "mean"),
            feat__n_seg=("ratio", "count"),
        )
        .reset_index()
    )
    permod_feat_cols = agg_df.filter(regex='feat__').columns.to_list()

    # select version to visualize
    if version_to_count == 'latest':
        agg_df = agg_df.query('target__is_most_recent_doc')
    elif version_to_count == 'max':
        agg_df = (
            agg_df
            .sort_values('feat__pct_high_sim')
            .groupby(source_cols + target_cols)
            .tail(1)
        )
    agg_df = (
        agg_df
        .sort_values(['source__model', 'feat__pct_high_sim'])
        .reset_index(drop=True)
    )

    # assign label: each coloured bill is labelled with the model it is most
    # similar to; the right merge brings back every other bill unlabelled
    label_df = (
        agg_df
        .query('target__bill in @colored_target_bills')
        .sort_values('feat__pct_high_sim')
        .groupby(['target__bill'])
        .tail(1)
        .filter(['source__model', 'target__bill'])
        .drop_duplicates(ignore_index=True)
        .rename(columns={'source__model': 'label'})
        .merge(
            agg_df.filter(['target__bill', 'target__is_model']).drop_duplicates(ignore_index=True),
            how='right'
        )
        .sort_values('target__bill', ignore_index=True)
    )
    label_df['label'] = label_df.apply(
        lambda x: (
            x['target__bill'] if x['target__is_model']
            else x['label'] if not pd.isna(x['label'])
            else 'none'
        ),
        axis=1
    )
    label_df['type'] = label_df.apply(
        lambda x: (
            'model bill' if x['target__is_model']
            else 'legis. sim. to model' if x['target__bill'] in colored_target_bills
            else 'legis. w/o sim.'
        ),
        axis=1
    )
    label_df['is-model'] = label_df.apply(
        lambda x: (
            'model bill' if x['target__is_model']
            else 'legislation'
        ),
        axis=1
    )

    # construct features per bill: one column per (feature, model) pair,
    # with 0 where a bill has no row for a model
    feat_df = agg_df.pivot(
        index='target__bill',
        columns='source__model',
        values=permod_feat_cols
    ).fillna(0).astype(float)
    feat_df.columns = feat_df.columns.map(
        lambda x: f'{x[0]}_{x[1]}'
    )

    # projection
    X = feat_df.values
    zscore = preprocessing.StandardScaler()
    Xz = zscore.fit_transform(X)
    if projector == 'PCA':
        proj_fn = decomposition.PCA(whiten=True)
    else:
        proj_fn = manifold.MDS()
    X_proj = proj_fn.fit_transform(Xz)

    # finalize
    proj_df = pd.DataFrame({
        'bill': feat_df.index.to_list(),
        'PC 1': X_proj[:, 0],
        'PC 2': X_proj[:, 1],
    })
    proj_df = proj_df.merge(
        label_df,
        how='left',
        left_on='bill',
        right_on='target__bill'
    )

    return proj_df


def process_comparisons(
    df: pd.DataFrame,
    seg_df: pd.DataFrame,
    min_nwords: float,
    thres_ratio: float,
    min_high_sim_num_sents: float,
    min_high_sim_pct_sim: float,
    version_to_count: str
) -> Tuple[
    pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame,
    pd.DataFrame, pd.DataFrame, pd.DataFrame,
]:
    """Run the whole comparison analysis and return every derived table.

    Args:
        df: Comparison table (one row per source/target sentence pair).
        seg_df: Model sentence table; ``model`` and ``section_label`` are
            used here to enumerate the sections of each model.
        min_nwords: Minimum ``source__est_nwords`` for a sentence to count.
        thres_ratio: A row is "highly similar" when ``ratio > thres_ratio``.
        min_high_sim_num_sents: See ``process_progression``.
        min_high_sim_pct_sim: See ``process_progression``.
        version_to_count: ``'Latest version'`` or ``'Max. similar version'``.

    Returns:
        ``(sim_df, proj_df, disp_sim_df, multi_model_bills,
        sim_bill_counts, sim_state_counts, sec_counts)``.

    Raises:
        AssertionError: If ``version_to_count`` is not one of the two
            accepted labels.
    """
    vercnt_alias_map = {
        'Latest version': 'latest',
        'Max. similar version': 'max'
    }
    assert version_to_count in vercnt_alias_map, (
        f'version_to_count must be one of {list(vercnt_alias_map)}, '
        f'got {version_to_count!r}'
    )
    version_to_count = vercnt_alias_map[version_to_count]

    # valid model sentences
    df = filter_valid_sentences(
        df,
        min_nwords,
    )

    # similarity progress
    prog_high_sim_df = process_progression(
        df,
        min_nwords,
        thres_ratio,
        min_high_sim_num_sents,
        min_high_sim_pct_sim,
    )

    # select version
    sim_df = None
    if version_to_count == 'latest':
        sim_df = prog_high_sim_df.query(
            'target__is_most_recent_doc '
            'and ver_has_high_sim'
        )
    elif version_to_count == 'max':
        sim_df = prog_high_sim_df.query(
            'bill_max_sim_ver == True '
            'and ver_has_high_sim'
        )
    assert sim_df is not None, 'no version selection was applied'

    # append the similarity progress: per (model, bill), the list of
    # ``pct_src_sim`` values ordered by version rank, plus its length
    _prog_sim = (
        prog_high_sim_df
        .sort_values('target__doc_rank')
        .groupby(['source__model', 'target__bill_id'])
        ['pct_src_sim'].agg(list)
        .to_frame('progress_pct_src_sim')
        .reset_index()
    )
    _prog_sim['target__nvers'] = _prog_sim['progress_pct_src_sim'].apply(len)
    sim_df = sim_df.merge(_prog_sim, how='left')

    # data frame to analyze
    sim_df = (
        sim_df
        .sort_values(
            ['source__model', 'pct_src_sim'],
            ascending=[True, False]
        )
        .reset_index(drop=True)
    )

    # data frames to display
    disp_sim_df, multi_model_bills = construct_dfs_to_display(sim_df)

    # count bills and states
    sim_bill_counts, sim_state_counts = count_similar_bills(sim_df)

    # counts sections
    section_idf = (
        seg_df
        .filter(['model', 'section_label'])
        .drop_duplicates()
        .rename(columns={
            'section_label': 'section'
        })
        .astype({'section': 'str', 'model': 'str'})
    )
    sec_counts = count_model_sections(sim_df, section_idf)

    # project bills
    colored_target_bills = sim_df['target__bill'].unique()
    proj_df = project_bills(
        df,
        colored_target_bills,
        min_nwords,
        thres_ratio,
        version_to_count
    )

    return (
        sim_df,
        proj_df,
        disp_sim_df,
        multi_model_bills,
        sim_bill_counts,
        sim_state_counts,
        sec_counts
    )


def process_model_sentences(
    df: pd.DataFrame,
    seg_df: pd.DataFrame,
    min_nwords: float,
    thres_ratio: float,
    count_ceiling: Optional[float] = 30,
    cmap_name: str = 'colorbrewer:Blues'
) -> pd.DataFrame:
    """Build per-section sentence annotations coloured by bill count.

    For every model sentence, count the distinct bills that contain a highly
    similar sentence, normalise the count to ``[0, 1]`` and map it to a
    colour of the ``cmap_name`` colormap.

    Args:
        df: Comparison table (one row per source/target sentence pair).
        seg_df: Model sentence table. It is right-merged onto the counts, so
            every one of its rows is kept; it must provide ``model``,
            ``sentence``, ``est_nwords``, ``section_idx`` and
            ``section_label``.
            NOTE(review): the merge keys are whatever columns it shares with
            the counts (presumably ``model`` and ``model_sent_idx``) -
            confirm against the caller.
        min_nwords: Minimum word count for a sentence to be considered.
        thres_ratio: A row is "highly similar" when ``ratio > thres_ratio``.
        count_ceiling: Bill count mapped to the top of the colour scale. The
            scale tops out at the smaller of this value and the model's own
            maximum count; ``None`` uses the model's maximum alone.
        cmap_name: Name passed to ``cmap.Colormap``.

    Returns:
        One row per (``model``, ``section_idx``, ``section_label``) with an
        ``annotations`` list. Each entry is either the plain sentence (no
        similar bill, or sentence shorter than ``min_nwords``) or a tuple
        ``(sentence, bill count as text, background hex colour, text hex
        colour)``.
    """
    cm = Colormap(cmap_name)

    model_sent_df = (
        df
        .query(
            'ratio > @thres_ratio '
            'and source__est_nwords >= @min_nwords '
            'and (not target__is_model)'
        )
        .sort_values('ratio')
        .groupby([
            'source__model',
            'source__model_sent_idx',
            'target__bill_id'
        ])
        .tail(1)
        .groupby([
            'source__model',
            'source__model_sent_idx'
        ])
        ['target__bill_id'].nunique()
        .to_frame('bill_count')
        .reset_index()
    )
    model_sent_df.columns = model_sent_df.columns.str.replace('source__', '')

    # bring in every model sentence; those without a similar bill count 0
    model_sent_df = (
        model_sent_df
        .merge(seg_df, how='right')
        .fillna({'bill_count': 0})
        .astype({'bill_count': 'int'})
    )
    model_sent_df = model_sent_df.merge(
        model_sent_df.groupby('model')
        ['bill_count'].max()
        .to_frame('max_count')
        .reset_index()
    )

    # Normalise the counts. A model without any similar bill has a maximum
    # of 0 (as does ``count_ceiling=0``); such denominators map to 0 rather
    # than dividing by zero.
    scale_top = model_sent_df['max_count']
    if count_ceiling is not None:
        scale_top = scale_top.clip(upper=count_ceiling)
    model_sent_df['norm_value'] = (
        model_sent_df['bill_count']
        .div(scale_top.where(scale_top > 0))
        .fillna(0.0)
        .clip(upper=1.0)
    )

    # escape '$' in the sentence text (literal replacement, missing values
    # are passed through untouched)
    model_sent_df['sentence'] = model_sent_df['sentence'].str.replace(
        '$', r'\$', regex=False
    )
    model_sent_df['color'] = [Color(x) for x in cm(model_sent_df['norm_value'])]

    def _annotate(sentence, bill_count, est_nwords, color):
        """Return the annotation entry of a single sentence."""
        if bill_count == 0 or est_nwords < min_nwords:
            return sentence
        # white text on dark backgrounds, black text on light ones
        text_color = '#ffffff' if color.hsl.l < 0.5 else '#000000'
        return (sentence, str(bill_count), color.hex, text_color)

    # Built as an explicit object Series so that an empty frame and mixed
    # str/tuple entries are both handled.
    model_sent_df['annotations'] = pd.Series(
        [
            _annotate(sentence, bill_count, est_nwords, color)
            for sentence, bill_count, est_nwords, color in zip(
                model_sent_df['sentence'],
                model_sent_df['bill_count'],
                model_sent_df['est_nwords'],
                model_sent_df['color'],
            )
        ],
        index=model_sent_df.index,
        dtype=object,
    )

    model_sent_df = (
        model_sent_df
        .groupby([
            'model',
            'section_idx',
            'section_label',
        ])
        ['annotations'].agg(list)
        .reset_index()
    )

    return model_sent_df