Spaces:

brown-cntr
/

chatbot-model-bills

Running

File size: 8,694 Bytes

fd4a87f

import numpy as np
import pandas as pd
import streamlit as st
from annotated_text import annotated_text

def render(
    tab,
    df,
    seg_df,
    text_df,
    disp_sim_df,
    min_nwords,
    thres_ratio,
    ratio_bg_cmap='Reds',
    inspect_disp_max_cols=4,
    thres_fuzzy_ratio_valid=50,
    disp_char_jitter=100,
    disp_text_flank=' [...] ',
    disp_text_color='#fc9272',
):
    """Render the inspection view for the currently selected bill-model pair.

    The selected row index is read from
    ``st.session_state['bill_model_pair']['selection']['rows']`` and looked up
    positionally in ``disp_sim_df``. The function then draws, inside ``tab``:

    * a header naming the selected model and bill,
    * a bar chart of the pair's ``progress_pct_src_sim`` values, one bar per
      bill version,
    * a selectable table (widget key ``'selected_model_sentence'``) of model
      sentences that have at least one match with ``ratio > thres_ratio``,
    * the per-version detail of the selected sentence, delegated to
      ``__inspect_single_sentence__``.

    Nothing is rendered when the ``'bill_model_pair'`` state is missing, when
    no row is selected, or when the selected index is beyond ``disp_sim_df``.

    Args:
        tab: Streamlit container to draw into.
        df: Sentence-level match table. It is queried on ``source__model``,
            ``target__bill_id``, ``source__est_nwords``, ``ratio`` and
            ``source__model_sent_idx``; ``target__bill`` and
            ``target__doc_rank`` are read afterwards.
        seg_df: Model sentence table; the columns ``model``,
            ``section_label``, ``model_sent_idx`` and ``sentence`` are merged
            in (prefixed with ``source__``).
        text_df: Bill text table, forwarded to ``__inspect_single_sentence__``.
        disp_sim_df: Table backing the bill-model selection; must provide
            ``model``, ``bill_id`` and ``progress_pct_src_sim``.
        min_nwords: Minimum ``source__est_nwords`` for a sentence to be kept.
        thres_ratio: A model sentence is listed when any of its matches has a
            ``ratio`` strictly above this value; also the lower bound of the
            table's colour gradient.
        ratio_bg_cmap: Colormap name for the table's background gradient.
        inspect_disp_max_cols: Forwarded to ``__inspect_single_sentence__``.
        thres_fuzzy_ratio_valid: Forwarded to ``__inspect_single_sentence__``.
        disp_char_jitter: Forwarded to ``__inspect_single_sentence__``.
        disp_text_flank: Forwarded to ``__inspect_single_sentence__``.
        disp_text_color: Forwarded to ``__inspect_single_sentence__``.

    Raises:
        AssertionError: If the filtered matches do not belong to exactly one
            ``target__bill`` or the selected pair is not unique in
            ``disp_sim_df``.
    """
    if 'bill_model_pair' not in st.session_state:
        return

    # Get selection. Bail out instead of raising IndexError when nothing is
    # selected yet or when a stale index points past the displayed table.
    sel_rows = st.session_state['bill_model_pair']['selection']['rows']
    if not sel_rows or sel_rows[0] >= len(disp_sim_df):
        return
    sel_bm_row = disp_sim_df.iloc[sel_rows[0]]
    sel_model, sel_bill = sel_bm_row['model'], sel_bm_row['bill_id']

    # Get selected bill with high sim. sentences
    sel_bm_df = (
        df.query(
            'source__model == @sel_model '
            'and target__bill_id == @sel_bill '
            'and source__est_nwords >= @min_nwords'
        )
        .reset_index(drop=True)
    )
    # Model sentences with at least one match above the threshold; all their
    # rows are kept (also those below the threshold) so every bill version
    # can be shown next to each other.
    idx_highsentidx = (
        sel_bm_df.query('ratio > @thres_ratio')
        ['source__model_sent_idx'].unique()
    )
    sel_sim_seg_df = (
        sel_bm_df.query(
            'source__model_sent_idx in @idx_highsentidx'
        )
        .filter(
            regex='ratio|postproc|source__model(_sent_idx)?$|target__bill|target__doc_id|target__doc_rank'
        )
        # No explicit `on`: pandas joins on the columns shared by both frames
        .merge(
            seg_df.filter([
                'model', 'section_label', 'model_sent_idx', 'sentence'
            ]).add_prefix('source__'),
            how='left'
        )
    )
    assert sel_sim_seg_df['target__bill'].nunique() == 1, (
        'expected exactly one bill name for the selected bill-model pair'
    )
    sel_bill_name = sel_sim_seg_df['target__bill'].iloc[0]

    # Display table for high sim. model sentences: one row per model sentence,
    # one similarity column per bill version
    disp_ver_seg_df = (
        sel_sim_seg_df
        .rename(columns={
            'source__model_sent_idx': ('model', 'sentence id'),
            'source__section_label': ('model', 'section'),
            'source__sentence': ('model', 'sentence')
        })
        .pivot(
            index=[
                ('model', 'sentence id'),
                ('model', 'section'),
                ('model', 'sentence'),
            ],
            columns='target__doc_rank',
            values=['ratio']
        )
        .add_prefix('bill ver. ')
        .rename(
            columns={'bill ver. ratio': 'similarity % with model sentence'}
        )
        .reset_index()
    )
    disp_ver_seg_df.columns.names = [None, None]

    # Create sub components
    tab.header(f'Inspect similarity between {sel_model} model and {sel_bill_name} bill')
    selseg_col1, selseg_col2 = tab.columns([1, 2])

    # Visualize whole-bill similarity across available bill progress
    selseg_col1.subheader('Bill-model similarity across versions')

    sel_bill_prog = (
        disp_sim_df.query('bill_id == @sel_bill and model == @sel_model')
    )
    assert len(sel_bill_prog) == 1, (
        'expected exactly one row for the selected bill-model pair'
    )
    sel_bill_prog = sel_bill_prog.iloc[0]['progress_pct_src_sim']
    # A single-version bill may carry a bare scalar instead of a sequence
    if not hasattr(sel_bill_prog, '__len__'):
        sel_bill_prog = [sel_bill_prog]
    sel_bill_nvers = len(sel_bill_prog)

    selseg_col1.bar_chart(
        pd.DataFrame({
            'version': np.arange(1, sel_bill_nvers + 1),
            '% model sentences in bill': sel_bill_prog
        }),
        x='version',
        y='% model sentences in bill'
    )

    # Table model sentences with single-row selection to inspect further
    selseg_col2.subheader(f'Sentences from model {sel_model} found in bill')
    selseg_col2.text(
        'Note: Select one of the model sentences from table below.'
    )

    selseg_col2.dataframe(
        data=(
            disp_ver_seg_df.style
            .background_gradient(
                cmap=ratio_bg_cmap,
                vmin=thres_ratio,
                vmax=100.0,
                subset=disp_ver_seg_df.filter(regex='bill').columns
            )
            .format(precision=1)
        ),
        selection_mode='single-row-required',
        key='selected_model_sentence',
        on_select="rerun",
    )

    # Display model sentence and the potential corresponding bill's sentences per version
    __inspect_single_sentence__(
        tab,
        text_df,
        sel_sim_seg_df,
        disp_ver_seg_df,
        sel_bill_nvers,
        inspect_disp_max_cols,
        thres_fuzzy_ratio_valid,
        disp_char_jitter,
        disp_text_flank,
        disp_text_color,
    )


def _split_flanked_segment(text, start, end, jitter):
    """Split ``text`` into ``(before, match, after)`` around ``[start, end)``.

    ``before`` and ``after`` hold at most ``jitter`` characters of context on
    each side, clamped to the bounds of ``text``.
    """
    flank_start = max(start - jitter, 0)
    # Slice ends are exclusive: clamp to len(text), not len(text) - 1,
    # otherwise the last character of the text can never be displayed.
    flank_end = min(end + jitter, len(text))
    return text[flank_start:start], text[start:end], text[end:flank_end]


def __inspect_single_sentence__(
    tab,
    text_df,
    sel_sim_seg_df,
    disp_ver_seg_df,
    sel_bill_nvers,
    inspect_disp_max_cols=4,
    thres_fuzzy_ratio_valid=50,
    disp_char_jitter=100,
    disp_text_flank=' [...] ',
    disp_text_color='#fc9272',
):
    """Show the selected model sentence next to its match in each bill version.

    The selected row index is read from
    ``st.session_state['selected_model_sentence']['selection']['rows']`` and
    looked up positionally in ``disp_ver_seg_df`` (falling back to the first
    row when the index is out of range). The first display column shows the
    model sentence; columns ``1..sel_bill_nvers`` show the bill versions, with
    the matched span highlighted and some surrounding context.

    Nothing is rendered when the ``'selected_model_sentence'`` state is
    missing, when no row is selected, or when ``disp_ver_seg_df`` is empty.

    Args:
        tab: Streamlit container to draw into.
        text_df: Bill text table with ``doc_id`` and ``text`` columns.
        sel_sim_seg_df: Match rows providing ``source__model_sent_idx``,
            ``target__doc_rank``, ``target__doc_id``, ``ratio``,
            ``target_postproc_start`` and ``target_postproc_end``.
        disp_ver_seg_df: Table backing the sentence selection; must provide
            the ``('model', 'sentence id')`` and ``('model', 'sentence')``
            columns.
        sel_bill_nvers: Number of bill versions to display.
        inspect_disp_max_cols: Number of display columns per row of columns.
        thres_fuzzy_ratio_valid: Matches with ``ratio`` at or below this value
            are reported as not found instead of being displayed.
        disp_char_jitter: Number of context characters shown on each side of
            the matched span.
        disp_text_flank: Marker put before and after the displayed excerpt.
        disp_text_color: Background colour of the highlighted span.

    Raises:
        AssertionError: If a sentence/version pair matches more than one row
            of ``sel_sim_seg_df`` or a ``doc_id`` is not unique in ``text_df``.
    """
    if 'selected_model_sentence' not in st.session_state:
        return

    # Nothing to inspect without a selected row or without any table row;
    # indexing either would raise IndexError.
    sel_rows = st.session_state['selected_model_sentence']['selection']['rows']
    if not sel_rows or len(disp_ver_seg_df) == 0:
        return
    sel_state_ms_idx = sel_rows[0]

    # Get selection; an out-of-range index falls back to the first row
    if sel_state_ms_idx >= len(disp_ver_seg_df):
        sel_state_ms_idx = 0
    sel_ms_row = disp_ver_seg_df.iloc[sel_state_ms_idx]
    sel_ms_idx = sel_ms_row[('model', 'sentence id')]
    sel_ms = sel_ms_row[('model', 'sentence')]

    tab.subheader(f'Selected model sentence id {sel_ms_idx}, and found occurences across bill versions')
    tab.text(
        'Note: The annotation package currently messes with spacing so currently there may be disjoint sentences in show bill texts. '
        'Typically at least 70% is where the similarity should be considered. Disregard stuff under that. '
        'Additionally, the highlighted bill sentence may be a bit off sometimes to save processing time.'
    )

    # Create columns to display bill's segments similar to model sentence:
    # one for the model sentence plus one per bill version, laid out in rows
    # of `inspect_disp_max_cols` columns
    num_dispsent_cols = sel_bill_nvers + 1
    dispsent_cols = []
    for _ in range(int(np.ceil(num_dispsent_cols / inspect_disp_max_cols))):
        dispsent_cols.extend(tab.columns([1] * inspect_disp_max_cols))

    # Matches of the selected sentence; the same for every bill version
    sel_ms_rows = sel_sim_seg_df.query('source__model_sent_idx == @sel_ms_idx')

    for i_dsc, ds_col in enumerate(dispsent_cols):
        # Display model sentence first
        if i_dsc == 0:
            ds_col.badge("Model sentence", color="blue")
            ds_col.text(sel_ms)
            continue

        # Do nothing when columns exceed number of bill versions
        if i_dsc > sel_bill_nvers:
            continue

        # `i_dsc` between 1 and `sel_bill_nvers` are the bill versions
        sel_tgt_row = sel_ms_rows.query('target__doc_rank == @i_dsc')

        # None found
        if len(sel_tgt_row) == 0:
            ds_col.badge(
                f"Bill ver. {i_dsc} sentence",
                color="red"
            )
            ds_col.markdown('*(no fuzzy equiv. found)*')
            continue

        # Get version and text
        assert len(sel_tgt_row) == 1
        sel_tgt_row = sel_tgt_row.iloc[0]
        sel_disp_doc_id = sel_tgt_row['target__doc_id']
        sel_bill_ver_model_ratio = sel_tgt_row['ratio']
        is_valid_match = sel_bill_ver_model_ratio > thres_fuzzy_ratio_valid

        ds_col.badge(
            f"Bill ver. {i_dsc} sentence" + (
                f" ({sel_bill_ver_model_ratio:.1f}%)"
                if is_valid_match
                else ''
            ),
            color="red"
        )

        # Too low fuzzy ratio to be display
        if not is_valid_match:
            ds_col.markdown('*(no fuzzy equiv. found)*')
            continue

        # Sufficient fuzzy ratio to display text, even if not reaching the global `thres_ratio`
        sel_bill_ver_text = text_df.query('doc_id == @sel_disp_doc_id')
        assert len(sel_bill_ver_text) == 1
        sel_bill_ver_text = sel_bill_ver_text.iloc[0]['text']

        # Target bill text segments with some flanking to see some context around
        # Note: the indices are found after some processing from `rapidfuzz`, but here just display pre-proc text
        text_before, text_match, text_after = _split_flanked_segment(
            sel_bill_ver_text,
            int(sel_tgt_row['target_postproc_start']),
            int(sel_tgt_row['target_postproc_end']),
            disp_char_jitter,
        )

        # Escape `$` only after slicing: escaping the whole text first inserts
        # characters that shift the match offsets and may split a `\$` pair.
        with ds_col:
            annotated_text(
                disp_text_flank + text_before.replace('$', r'\$'),
                (
                    text_match.replace('$', r'\$'),
                    '', disp_text_color
                ),
                text_after.replace('$', r'\$') + disp_text_flank
            )