File size: 8,694 Bytes
fd4a87f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
import numpy as np
import pandas as pd
import streamlit as st
from annotated_text import annotated_text

def render(
    tab,
    df: pd.DataFrame,
    seg_df: pd.DataFrame,
    text_df: pd.DataFrame,
    disp_sim_df: pd.DataFrame,
    min_nwords,
    thres_ratio,
    ratio_bg_cmap: str = 'Reds',
    inspect_disp_max_cols: int = 4,
    thres_fuzzy_ratio_valid: float = 50,
    disp_char_jitter: int = 100,
    disp_text_flank: str = ' [...] ',
    disp_text_color: str = '#fc9272',
):
    """Render the inspection view for the currently selected bill/model pair.

    The selected pair is read from ``st.session_state['bill_model_pair']``
    (the widget that writes this key is not defined in this function). For
    that pair this function draws, inside ``tab``:

    * a bar chart of ``progress_pct_src_sim`` across bill versions,
    * a selectable table (key ``'selected_model_sentence'``) of the model
      sentences whose ``ratio`` exceeds ``thres_ratio`` in at least one
      bill version, and
    * via ``__inspect_single_sentence__``, the selected model sentence next
      to the matching bill text of every version.

    Nothing is rendered when the session key is missing, when no row is
    selected, or when the stored row index no longer fits ``disp_sim_df``.

    Args:
        tab: Streamlit container that receives all output.
        df: Sentence-level similarity table; the columns read here are
            ``source__model``, ``target__bill_id``, ``source__est_nwords``,
            ``ratio``, ``source__model_sent_idx`` and those matched by the
            filter regex below.
        seg_df: Model sentence table; ``model``, ``section_label``,
            ``model_sent_idx`` and ``sentence`` are used when present.
        text_df: Bill text table, passed through to
            ``__inspect_single_sentence__``.
        disp_sim_df: Table backing the bill/model selection; ``model``,
            ``bill_id`` and ``progress_pct_src_sim`` are read.
        min_nwords: Lower bound applied to ``source__est_nwords``.
        thres_ratio: Sentences need a ``ratio`` strictly above this value in
            at least one version to be listed; also the lower end of the
            table's colour gradient.
        ratio_bg_cmap: Colormap name for the table's background gradient.
        inspect_disp_max_cols: Forwarded to ``__inspect_single_sentence__``.
        thres_fuzzy_ratio_valid: Forwarded to ``__inspect_single_sentence__``.
        disp_char_jitter: Forwarded to ``__inspect_single_sentence__``.
        disp_text_flank: Forwarded to ``__inspect_single_sentence__``.
        disp_text_color: Forwarded to ``__inspect_single_sentence__``.

    Raises:
        AssertionError: If the selected pair maps to more than one (or no)
            ``target__bill`` value, or does not match exactly one row of
            ``disp_sim_df``.
    """

    if 'bill_model_pair' not in st.session_state:
        return

    # Get selection. The selection list can be empty (nothing selected yet)
    # and the stored index can be stale after `disp_sim_df` shrank between
    # reruns; in both cases there is nothing valid to inspect.
    sel_bm_rows = st.session_state['bill_model_pair']['selection']['rows']
    if len(sel_bm_rows) == 0:
        return
    sel_bm_idx = sel_bm_rows[0]
    if sel_bm_idx >= len(disp_sim_df):
        return
    sel_bm_row = disp_sim_df.iloc[sel_bm_idx]
    # NOTE: `sel_model`, `sel_bill`, `min_nwords`, `thres_ratio` and
    # `idx_highsentidx` are referenced by name (`@...`) inside the query
    # strings below, so these local names must not be renamed.
    sel_model, sel_bill = sel_bm_row['model'], sel_bm_row['bill_id']

    # Get selected bill with high sim. sentences
    sel_bm_df = (
        df.query(
            'source__model == @sel_model '
            'and target__bill_id == @sel_bill '
            'and source__est_nwords >= @min_nwords'
        )
        .reset_index(drop=True)
    )
    # Model sentences that pass the threshold in at least one bill version
    idx_highsentidx = (
        sel_bm_df.query('ratio > @thres_ratio')
        ['source__model_sent_idx'].unique()
    )
    # Keep every version's row for those sentences (also the ones below the
    # threshold) and attach the model sentence text/section. The merge has no
    # explicit `on`, so it joins on the column names both frames share.
    sel_sim_seg_df = (
        sel_bm_df.query(
            'source__model_sent_idx in @idx_highsentidx'
        )
        .filter(
            regex='ratio|postproc|source__model(_sent_idx)?$|target__bill|target__doc_id|target__doc_rank'
        )
        .merge(
            seg_df.filter([
                'model','section_label','model_sent_idx','sentence'
            ]).add_prefix('source__'),
            how='left'
        )
    )
    assert sel_sim_seg_df['target__bill'].nunique() == 1, (
        'expected exactly one bill name for the selected bill/model pair'
    )
    sel_bill_name = sel_sim_seg_df['target__bill'].iloc[0]

    # Display table for high sim. model sentences: one row per model
    # sentence, one similarity column per bill version (`target__doc_rank`).
    # The tuple column names become a two-level header after `reset_index`.
    disp_ver_seg_df = (
        sel_sim_seg_df
        .rename(columns={
            'source__model_sent_idx': ('model', 'sentence id'),
            'source__section_label': ('model', 'section'),
            'source__sentence': ('model', 'sentence')
        })
        .pivot(
            index=[
                ('model', 'sentence id'),
                ('model', 'section'),
                ('model', 'sentence'),
            ],
            columns='target__doc_rank',
            values=['ratio']
        )
        .add_prefix('bill ver. ')
        .rename(
            columns={'bill ver. ratio': 'similarity % with model sentence'}
        )
        .reset_index()
    )
    disp_ver_seg_df.columns.names = [None,None]

    # Create sub components
    tab.header(f'Inspect similarity between {sel_model} model and {sel_bill_name} bill')
    selseg_col1, selseg_col2 = tab.columns([1, 2])

    # Visualize whole-bill similarity across available bill progress
    selseg_col1.subheader('Bill-model similarity across versions')

    sel_bill_prog = (
        disp_sim_df.query('bill_id == @sel_bill and model == @sel_model')
    )
    assert len(sel_bill_prog) == 1, (
        'expected exactly one row in disp_sim_df for the selected bill/model pair'
    )
    sel_bill_prog = sel_bill_prog.iloc[0]['progress_pct_src_sim']
    # A single-version bill may carry a scalar instead of a sequence
    if not hasattr(sel_bill_prog, '__len__'):
        sel_bill_prog = [sel_bill_prog]
    sel_bill_nvers = len(sel_bill_prog)

    selseg_col1.bar_chart(
        pd.DataFrame({
            'version': np.arange(1, sel_bill_nvers + 1),
            '% model sentences in bill': sel_bill_prog
        }),
        x='version',
        y='% model sentences in bill'
    )

    # Table model sentences with single-row selection to inspect further
    selseg_col2.subheader(f'Sentences from model {sel_model} found in bill')
    selseg_col2.text(
        'Note: Select one of the model sentences from table below.'
    )

    selseg_col2.dataframe(
        data=(
            disp_ver_seg_df.style
            .background_gradient(
                cmap=ratio_bg_cmap,
                vmin=thres_ratio,
                vmax=100.0,
                subset=disp_ver_seg_df.filter(regex='bill').columns
            )
            .format(precision=1)
        ),
        selection_mode='single-row-required',
        key='selected_model_sentence',
        on_select="rerun",
    )

    # Display model sentence and the potential corresponding bill's sentences per version
    __inspect_single_sentence__(
        tab,
        text_df,
        sel_sim_seg_df,
        disp_ver_seg_df,
        sel_bill_nvers,
        inspect_disp_max_cols,
        thres_fuzzy_ratio_valid,
        disp_char_jitter,
        disp_text_flank,
        disp_text_color,
    )


def __inspect_single_sentence__(
    tab,
    text_df,
    sel_sim_seg_df,
    disp_ver_seg_df,
    sel_bill_nvers,
    inspect_disp_max_cols = 4,
    thres_fuzzy_ratio_valid = 50,
    disp_char_jitter = 100,
    disp_text_flank = ' [...] ',
    disp_text_color = '#fc9272',
):
    """Show the selected model sentence next to its match in each bill version.

    The selected table row is read from
    ``st.session_state['selected_model_sentence']``. The first display column
    holds the model sentence; columns ``1..sel_bill_nvers`` hold, per bill
    version, either the highlighted matching bill text with some surrounding
    context or a "no fuzzy equiv. found" note.

    Nothing is rendered when the session key is missing, when no row is
    selected, or when ``disp_ver_seg_df`` is empty.

    Args:
        tab: Streamlit container that receives all output.
        text_df: Bill texts; columns ``doc_id`` and ``text`` are read.
        sel_sim_seg_df: Per-version similarity rows; columns
            ``source__model_sent_idx``, ``target__doc_rank``,
            ``target__doc_id``, ``ratio``, ``target_postproc_start`` and
            ``target_postproc_end`` are read.
        disp_ver_seg_df: Table the selection refers to; its
            ``('model', 'sentence id')`` and ``('model', 'sentence')``
            columns are read.
        sel_bill_nvers: Number of bill versions to display.
        inspect_disp_max_cols: Number of display columns per layout row.
        thres_fuzzy_ratio_valid: A version's text is shown only when its
            ``ratio`` is strictly above this value.
        disp_char_jitter: Number of context characters shown on each side of
            the matched span.
        disp_text_flank: Marker placed before and after the displayed excerpt.
        disp_text_color: Background colour of the highlighted span.

    Raises:
        AssertionError: If a sentence/version pair matches more than one row
            of ``sel_sim_seg_df``, or a ``doc_id`` does not match exactly one
            row of ``text_df``.
    """

    if 'selected_model_sentence' not in st.session_state:
        return

    # Nothing to inspect without a selected row or without any table rows
    sel_state_ms_rows = st.session_state['selected_model_sentence']['selection']['rows']
    if len(sel_state_ms_rows) == 0 or len(disp_ver_seg_df) == 0:
        return
    sel_state_ms_idx = sel_state_ms_rows[0]

    # Get selection; an index that no longer fits the table falls back to
    # the first row
    if sel_state_ms_idx >= len(disp_ver_seg_df):
        sel_state_ms_idx = 0
    sel_ms_row = disp_ver_seg_df.iloc[sel_state_ms_idx]
    # NOTE: `sel_ms_idx`, `i_dsc` and `sel_disp_doc_id` are referenced by
    # name (`@...`) inside query strings below; do not rename them.
    sel_ms_idx = sel_ms_row[('model','sentence id')]
    sel_ms = sel_ms_row[('model','sentence')]

    tab.subheader(f'Selected model sentence id {sel_ms_idx}, and found occurences across bill versions')
    tab.text(
        'Note: The annotation package currently messes with spacing so currently there may be disjoint sentences in show bill texts. '
        'Typically at least 70% is where the similarity should be considered. Disregard stuff under that. '
        'Additionally, the highlighted bill sentence may be a bit off sometimes to save processing time.'
    )

    # Create columns to display bill's segments similar to model sentence:
    # one slot for the model sentence plus one per bill version, laid out in
    # rows of `inspect_disp_max_cols` columns
    num_dispsent_cols = sel_bill_nvers + 1
    dispsent_cols = []
    for _ in range(int(np.ceil(num_dispsent_cols / inspect_disp_max_cols))):
        dispsent_cols.extend(tab.columns([1] * inspect_disp_max_cols))

    for i_dsc, ds_col in enumerate(dispsent_cols):
        # Display model sentence first
        if i_dsc == 0:
            ds_col.badge("Model sentence", color="blue")
            ds_col.text(sel_ms)
            continue

        # Do nothing when columns exceed number of bill versions
        if i_dsc > sel_bill_nvers:
            continue

        # `i_dsc` between 1 and `sel_bill_nvers` are the bill versions
        sel_tgt_row = (
            sel_sim_seg_df.query(
                'source__model_sent_idx == @sel_ms_idx '
                'and target__doc_rank == @i_dsc'
            )
        )

        # None found
        if len(sel_tgt_row) == 0:
            ds_col.badge(
                f"Bill ver. {i_dsc} sentence",
                color="red"
            )
            ds_col.markdown('*(no fuzzy equiv. found)*')
            continue

        # Get version and text
        assert len(sel_tgt_row) == 1, (
            'expected at most one similarity row per model sentence and bill version'
        )
        sel_tgt_row = sel_tgt_row.iloc[0]
        sel_disp_doc_id = sel_tgt_row['target__doc_id']
        sel_bill_ver_model_ratio = sel_tgt_row['ratio']
        has_valid_ratio = sel_bill_ver_model_ratio > thres_fuzzy_ratio_valid

        ds_col.badge(
            f"Bill ver. {i_dsc} sentence" + (
                f" ({sel_bill_ver_model_ratio:.1f}%)"
                if has_valid_ratio
                else ''
            ),
            color="red"
        )

        # Too low fuzzy ratio to be display
        if not has_valid_ratio:
            ds_col.markdown('*(no fuzzy equiv. found)*')
            continue

        # Sufficient fuzzy ratio to display text, even if not reaching the global `thres_ratio`
        sel_bill_ver_text = text_df.query('doc_id == @sel_disp_doc_id')
        assert len(sel_bill_ver_text) == 1, (
            'expected exactly one text row per doc_id'
        )
        sel_bill_ver_text = sel_bill_ver_text.iloc[0]['text']

        # Target bill text segments with some flanking to see some context around
        # Note: the indices are found after some processing from `rapidfuzz`, but here just display pre-proc text
        sel_tgt_start = int(sel_tgt_row['target_postproc_start'])
        sel_tgt_end = int(sel_tgt_row['target_postproc_end'])
        flanked_sel_tgt_start = max(sel_tgt_start - disp_char_jitter, 0)
        # Slice ends are exclusive, so clamp to `len(text)` (not `len - 1`)
        # to keep the document's last character reachable
        flanked_sel_tgt_end = min(sel_tgt_end + disp_char_jitter, len(sel_bill_ver_text))

        # Slice the unescaped text first and escape `$` per segment
        # afterwards: escaping the whole text up front inserts a backslash
        # for every `$`, which shifts every later character relative to the
        # start/end offsets.
        text_before, text_match, text_after = (
            segment.replace('$', r'\$')
            for segment in (
                sel_bill_ver_text[flanked_sel_tgt_start:sel_tgt_start],
                sel_bill_ver_text[sel_tgt_start:sel_tgt_end],
                sel_bill_ver_text[sel_tgt_end:flanked_sel_tgt_end],
            )
        )

        with ds_col:
            annotated_text(
                disp_text_flank + text_before,
                (text_match, '', disp_text_color),
                text_after + disp_text_flank
            )