import numpy as np
import pandas as pd
import streamlit as st
from annotated_text import annotated_text


def _first_selected_row(state_key):
    """Return the first selected row position of a dataframe widget.

    Looks up ``st.session_state[state_key]['selection']['rows']`` and returns
    its first entry. Returns ``None`` when the key is not in the session state
    or when no row is currently selected.
    """
    if state_key not in st.session_state:
        return None
    rows = st.session_state[state_key]['selection']['rows']
    return rows[0] if rows else None


def _escape_dollars(text):
    """Backslash-escape every ``$`` in *text* before it is handed to the UI."""
    return text.replace('$', r'\$')


def _split_flanked_segment(text, start, end, jitter):
    """Split *text* into ``(before, match, after)`` around ``[start, end)``.

    ``before`` and ``after`` hold at most *jitter* characters of context on
    each side of the matched segment, clamped to the bounds of *text*.
    """
    flank_start = max(start - jitter, 0)
    # Slice ends are exclusive, so clamp to `len(text)` (not `len(text) - 1`)
    # to keep the final character when the flank reaches the end of the text.
    flank_end = min(end + jitter, len(text))
    return text[flank_start:start], text[start:end], text[end:flank_end]


def render(
    tab,
    df,
    seg_df,
    text_df,
    disp_sim_df,
    min_nwords,
    thres_ratio,
    ratio_bg_cmap='Reds',
    inspect_disp_max_cols=4,
    thres_fuzzy_ratio_valid=50,
    disp_char_jitter=100,
    disp_text_flank=' [...] ',
    disp_text_color='#fc9272',
):
    """Render the inspection view for the selected bill/model pair.

    Reads the row selected in the ``'bill_model_pair'`` dataframe widget from
    ``st.session_state``, then draws into *tab*: a header, a bar chart of the
    pair's ``progress_pct_src_sim`` values per version, a selectable table of
    model sentences whose ``ratio`` exceeds *thres_ratio*, and (through
    ``__inspect_single_sentence__``) the matching bill text per version.
    Returns ``None``; does nothing if the widget state is missing or no row
    is selected.

    Parameters
    ----------
    tab
        Streamlit container to draw into (``header``, ``columns``, ... are
        called on it).
    df
        Sentence-level similarity table. Columns read here:
        ``source__model``, ``target__bill_id``, ``source__est_nwords``,
        ``ratio``, ``source__model_sent_idx``, plus the columns kept by the
        ``filter`` regex below (e.g. ``target__bill``, ``target__doc_id``,
        ``target__doc_rank``, ``target_postproc_start``/``_end``).
    seg_df
        Model sentence table with columns ``model``, ``section_label``,
        ``model_sent_idx`` and ``sentence``.
    text_df
        Bill text table with columns ``doc_id`` and ``text``.
    disp_sim_df
        Table with columns ``model``, ``bill_id`` and
        ``progress_pct_src_sim``; it is indexed positionally with the
        selected row of the ``'bill_model_pair'`` widget.
    min_nwords
        Minimum ``source__est_nwords`` for a model sentence to be considered.
    thres_ratio
        A model sentence is listed when at least one of its rows has
        ``ratio`` above this value; also the lower bound of the colour scale.
    ratio_bg_cmap
        Colormap name for the table's background gradient.
    inspect_disp_max_cols, thres_fuzzy_ratio_valid, disp_char_jitter,
    disp_text_flank, disp_text_color
        Forwarded unchanged to ``__inspect_single_sentence__``.

    Raises
    ------
    AssertionError
        If the selected rows span more than one ``target__bill`` value, or if
        *disp_sim_df* does not hold exactly one row for the selected pair.
    """
    sel_bm_idx = _first_selected_row('bill_model_pair')
    if sel_bm_idx is None:
        return

    # Get selection
    sel_bm_row = disp_sim_df.iloc[sel_bm_idx]
    sel_model, sel_bill = sel_bm_row['model'], sel_bm_row['bill_id']

    # NOTE: `DataFrame.query` resolves `@name` against this function's local
    # variables, so the local names below must stay in sync with the query
    # strings.

    # Get selected bill with high sim. sentences
    sel_bm_df = (
        df.query(
            'source__model == @sel_model '
            'and target__bill_id == @sel_bill '
            'and source__est_nwords >= @min_nwords'
        )
        .reset_index(drop=True)
    )

    idx_highsentidx = (
        sel_bm_df.query('ratio > @thres_ratio')
        ['source__model_sent_idx'].unique()
    )

    sel_sim_seg_df = (
        sel_bm_df.query(
            'source__model_sent_idx in @idx_highsentidx'
        )
        .filter(
            regex='ratio|postproc|source__model(_sent_idx)?$|target__bill|target__doc_id|target__doc_rank'
        )
        .merge(
            seg_df.filter([
                'model', 'section_label', 'model_sent_idx', 'sentence'
            ]).add_prefix('source__'),
            how='left'
        )
    )

    assert sel_sim_seg_df['target__bill'].nunique() == 1
    sel_bill_name = sel_sim_seg_df['target__bill'].iloc[0]

    # Display table for high sim. model sentences: one row per model sentence,
    # one column per bill version (`target__doc_rank`) holding the ratio.
    # Tuple column names are used so that `reset_index` slots them into the
    # two-level column header produced by the pivot.
    disp_ver_seg_df = (
        sel_sim_seg_df
        .rename(columns={
            'source__model_sent_idx': ('model', 'sentence id'),
            'source__section_label': ('model', 'section'),
            'source__sentence': ('model', 'sentence')
        })
        .pivot(
            index=[
                ('model', 'sentence id'),
                ('model', 'section'),
                ('model', 'sentence'),
            ],
            columns='target__doc_rank',
            values=['ratio']
        )
        .add_prefix('bill ver. ')
        .rename(
            columns={'bill ver. ratio': 'similarity % with model sentence'}
        )
        .reset_index()
    )
    disp_ver_seg_df.columns.names = [None, None]

    # Create sub components
    tab.header(f'Inspect similarity between {sel_model} model and {sel_bill_name} bill')
    selseg_col1, selseg_col2 = tab.columns([1, 2])

    # Visualize whole-bill similarity across available bill progress
    selseg_col1.subheader('Bill-model similarity across versions')

    sel_bill_prog = (
        disp_sim_df.query('bill_id == @sel_bill and model == @sel_model')
    )
    assert len(sel_bill_prog) == 1
    sel_bill_prog = sel_bill_prog.iloc[0]['progress_pct_src_sim']

    # A single-version bill may store a scalar instead of a sequence
    if not hasattr(sel_bill_prog, '__len__'):
        sel_bill_prog = [sel_bill_prog]
    sel_bill_nvers = len(sel_bill_prog)

    selseg_col1.bar_chart(
        pd.DataFrame({
            'version': np.arange(1, sel_bill_nvers + 1),
            '% model sentences in bill': sel_bill_prog
        }),
        x='version',
        y='% model sentences in bill'
    )

    # Table model sentences with single-row selection to inspect further
    selseg_col2.subheader(f'Sentences from model {sel_model} found in bill')
    selseg_col2.text(
        'Note: Select one of the model sentences from table below.'
    )
    selseg_col2.dataframe(
        data=(
            disp_ver_seg_df.style
            .background_gradient(
                cmap=ratio_bg_cmap,
                vmin=thres_ratio,
                vmax=100.0,
                subset=disp_ver_seg_df.filter(regex='bill').columns
            )
            .format(precision=1)
        ),
        selection_mode='single-row-required',
        key='selected_model_sentence',
        on_select="rerun",
    )

    # Display model sentence and the potential corresponding bill's sentences per version
    __inspect_single_sentence__(
        tab,
        text_df,
        sel_sim_seg_df,
        disp_ver_seg_df,
        sel_bill_nvers,
        inspect_disp_max_cols,
        thres_fuzzy_ratio_valid,
        disp_char_jitter,
        disp_text_flank,
        disp_text_color,
    )


def __inspect_single_sentence__(
    tab,
    text_df,
    sel_sim_seg_df,
    disp_ver_seg_df,
    sel_bill_nvers,
    inspect_disp_max_cols=4,
    thres_fuzzy_ratio_valid=50,
    disp_char_jitter=100,
    disp_text_flank=' [...] ',
    disp_text_color='#fc9272',
):
    """Show the selected model sentence next to its match in each bill version.

    Reads the row selected in the ``'selected_model_sentence'`` dataframe
    widget from ``st.session_state``. The first column shows the model
    sentence; columns ``1..sel_bill_nvers`` show, for each bill version
    (``target__doc_rank``), the matched span of the bill text highlighted
    with some surrounding context. Returns ``None``; does nothing if the
    widget state is missing or no row is selected.

    Parameters
    ----------
    tab
        Streamlit container to draw into.
    text_df
        Bill text table with columns ``doc_id`` and ``text``.
    sel_sim_seg_df
        Similarity rows for the selected bill/model pair. Columns read here:
        ``source__model_sent_idx``, ``target__doc_rank``, ``target__doc_id``,
        ``ratio``, ``target_postproc_start`` and ``target_postproc_end``.
    disp_ver_seg_df
        Table backing the selection widget, with columns
        ``('model', 'sentence id')`` and ``('model', 'sentence')``.
    sel_bill_nvers
        Number of bill versions to show.
    inspect_disp_max_cols
        Number of Streamlit columns created per row of columns.
    thres_fuzzy_ratio_valid
        Bill text is shown only when ``ratio`` is strictly above this value.
    disp_char_jitter
        Maximum number of context characters shown on each side of the match.
    disp_text_flank
        Marker placed before and after the displayed excerpt.
    disp_text_color
        Background colour of the highlighted match.

    Raises
    ------
    AssertionError
        If more than one similarity row exists for a (sentence, version)
        pair, or if *text_df* does not hold exactly one row for a document.
    """
    sel_state_ms_idx = _first_selected_row('selected_model_sentence')
    if sel_state_ms_idx is None:
        return

    # Get selection; the stored position may be out of range for the current
    # table, in which case fall back to the first row
    if sel_state_ms_idx >= len(disp_ver_seg_df):
        sel_state_ms_idx = 0
    sel_ms_row = disp_ver_seg_df.iloc[sel_state_ms_idx]
    sel_ms_idx = sel_ms_row[('model', 'sentence id')]
    sel_ms = sel_ms_row[('model', 'sentence')]

    tab.subheader(f'Selected model sentence id {sel_ms_idx}, and found occurences across bill versions')
    tab.text(
        'Note: The annotation package currently messes with spacing so currently there may be disjoint sentences in show bill texts. '
        'Typically at least 70% is where the similarity should be considered. Disregard stuff under that. '
        'Additionally, the highlighted bill sentence may be a bit off sometimes to save processing time.'
    )

    # Create columns to display bill's segments similar to model sentence:
    # one for the model sentence plus one per bill version, laid out in rows
    # of `inspect_disp_max_cols` columns
    num_dispsent_cols = sel_bill_nvers + 1
    num_col_rows = int(np.ceil(num_dispsent_cols / inspect_disp_max_cols))
    dispsent_cols = []
    for _ in range(num_col_rows):
        dispsent_cols.extend(tab.columns([1] * inspect_disp_max_cols))

    # NOTE: `sel_ms_idx`, `i_dsc` and `sel_disp_doc_id` are referenced by name
    # (`@...`) inside the query strings below; keep the names in sync.
    for i_dsc, ds_col in enumerate(dispsent_cols):
        # Display model sentence first
        if i_dsc == 0:
            ds_col.badge("Model sentence", color="blue")
            ds_col.text(sel_ms)
            continue

        # Remaining columns only pad the last row; nothing left to show
        if i_dsc > sel_bill_nvers:
            break

        # `i_dsc` between 1 and `sel_bill_nvers` are the bill versions
        sel_tgt_row = (
            sel_sim_seg_df.query(
                'source__model_sent_idx == @sel_ms_idx '
                'and target__doc_rank == @i_dsc'
            )
        )

        # None found
        if len(sel_tgt_row) == 0:
            ds_col.badge(
                f"Bill ver. {i_dsc} sentence",
                color="red"
            )
            ds_col.markdown('*(no fuzzy equiv. found)*')
            continue

        # Get version and text
        assert len(sel_tgt_row) == 1
        sel_tgt_row = sel_tgt_row.iloc[0]

        sel_disp_doc_id = sel_tgt_row['target__doc_id']
        sel_bill_ver_model_ratio = sel_tgt_row['ratio']

        # A NaN ratio compares False here and is treated as not displayable
        is_displayable = sel_bill_ver_model_ratio > thres_fuzzy_ratio_valid

        badge_label = f"Bill ver. {i_dsc} sentence"
        if is_displayable:
            badge_label += f" ({sel_bill_ver_model_ratio:.1f}%)"
        ds_col.badge(badge_label, color="red")

        # Too low fuzzy ratio to be displayed
        if not is_displayable:
            ds_col.markdown('*(no fuzzy equiv. found)*')
            continue

        # Sufficient fuzzy ratio to display text, even if not reaching the global `thres_ratio`
        sel_bill_ver_text = text_df.query('doc_id == @sel_disp_doc_id')
        assert len(sel_bill_ver_text) == 1
        sel_bill_ver_text = sel_bill_ver_text.iloc[0]['text']

        # Target bill text segments with some flanking to see some context around
        # Note: the indices are found after some processing from `rapidfuzz`, but here just display pre-proc text
        # The raw text is sliced first and `$` is escaped afterwards, so the
        # escaping cannot shift the character offsets.
        text_before, text_match, text_after = _split_flanked_segment(
            sel_bill_ver_text,
            int(sel_tgt_row['target_postproc_start']),
            int(sel_tgt_row['target_postproc_end']),
            disp_char_jitter,
        )

        with ds_col:
            annotated_text(
                disp_text_flank + _escape_dollars(text_before),
                (
                    _escape_dollars(text_match),
                    '',
                    disp_text_color
                ),
                _escape_dollars(text_after) + disp_text_flank
            )