Spaces:
Running
Running
File size: 8,694 Bytes
fd4a87f | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 | import numpy as np
import pandas as pd
import streamlit as st
from annotated_text import annotated_text
def render(
    tab,
    df,
    seg_df,
    text_df,
    disp_sim_df,
    min_nwords,
    thres_ratio,
    ratio_bg_cmap='Reds',
    inspect_disp_max_cols=4,
    thres_fuzzy_ratio_valid=50,
    disp_char_jitter=100,
    disp_text_flank=' [...] ',
    disp_text_color='#fc9272',
):
    """Render the inspection view for the currently selected bill/model pair.

    The selected pair is read from ``st.session_state['bill_model_pair']``.
    Nothing is rendered when that key is absent or when its selection holds
    no rows.

    Args:
        tab: Container the widgets are drawn into; ``header`` and ``columns``
            are called on it (presumably a Streamlit tab -- verify at caller).
        df: Sentence-level match table. Columns read here: ``source__model``,
            ``target__bill_id``, ``source__est_nwords``, ``ratio``,
            ``source__model_sent_idx``, ``target__bill``, ``target__doc_rank``
            plus any column matching the filter regex below.
        seg_df: Model sentence table with columns ``model``, ``section_label``,
            ``model_sent_idx`` and ``sentence``.
        text_df: Bill text table, forwarded to the per-sentence inspector.
        disp_sim_df: Table the selection index refers to; columns ``model``,
            ``bill_id`` and ``progress_pct_src_sim`` are read.
        min_nwords: Minimum ``source__est_nwords`` for a row to be considered.
        thres_ratio: Rows with ``ratio`` strictly above this mark a model
            sentence as "found"; also the lower bound of the colour gradient.
        ratio_bg_cmap: Colormap name for the similarity table background.
        inspect_disp_max_cols: Forwarded to the per-sentence inspector.
        thres_fuzzy_ratio_valid: Forwarded to the per-sentence inspector.
        disp_char_jitter: Forwarded to the per-sentence inspector.
        disp_text_flank: Forwarded to the per-sentence inspector.
        disp_text_color: Forwarded to the per-sentence inspector.

    Raises:
        AssertionError: If the filtered rows do not map to exactly one bill
            name, or ``disp_sim_df`` does not hold exactly one row for the
            selected bill/model pair.
    """
    if 'bill_model_pair' not in st.session_state:
        return

    # Get selection. The row list is empty when nothing is selected, so bail
    # out instead of failing on `rows[0]`.
    sel_rows = st.session_state['bill_model_pair']['selection']['rows']
    if not sel_rows:
        return
    sel_bm_idx = sel_rows[0]
    sel_bm_row = disp_sim_df.iloc[sel_bm_idx]
    sel_model, sel_bill = sel_bm_row['model'], sel_bm_row['bill_id']

    # Get selected bill with high sim. sentences
    sel_bm_df = (
        df.query(
            'source__model == @sel_model '
            'and target__bill_id == @sel_bill '
            'and source__est_nwords >= @min_nwords'
        )
        .reset_index(drop=True)
    )
    # Model sentences that exceed the threshold in at least one bill version
    idx_highsentidx = (
        sel_bm_df.query('ratio > @thres_ratio')
        ['source__model_sent_idx'].unique()
    )
    # Keep every version's row for those sentences (even the low-ratio ones)
    # and attach the model sentence text/section from `seg_df`.
    sel_sim_seg_df = (
        sel_bm_df.query(
            'source__model_sent_idx in @idx_highsentidx'
        )
        .filter(
            regex='ratio|postproc|source__model(_sent_idx)?$|target__bill|target__doc_id|target__doc_rank'
        )
        .merge(
            seg_df.filter([
                'model', 'section_label', 'model_sent_idx', 'sentence'
            ]).add_prefix('source__'),
            how='left'
        )
    )
    assert sel_sim_seg_df['target__bill'].nunique() == 1, (
        'expected exactly one bill name for the selected bill/model pair'
    )
    sel_bill_name = sel_sim_seg_df['target__bill'].iloc[0]

    # Display table for high sim. model sentences:
    # one row per model sentence, one column per bill version (doc rank).
    disp_ver_seg_df = (
        sel_sim_seg_df
        .rename(columns={
            'source__model_sent_idx': ('model', 'sentence id'),
            'source__section_label': ('model', 'section'),
            'source__sentence': ('model', 'sentence')
        })
        .pivot(
            index=[
                ('model', 'sentence id'),
                ('model', 'section'),
                ('model', 'sentence'),
            ],
            columns='target__doc_rank',
            values=['ratio']
        )
        .add_prefix('bill ver. ')
        .rename(
            columns={'bill ver. ratio': 'similarity % with model sentence'}
        )
        .reset_index()
    )
    disp_ver_seg_df.columns.names = [None, None]

    # Create sub components
    tab.header(f'Inspect similarity between {sel_model} model and {sel_bill_name} bill')
    selseg_col1, selseg_col2 = tab.columns([1, 2])

    # Visualize whole-bill similarity across available bill progress
    selseg_col1.subheader('Bill-model similarity across versions')
    sel_bill_prog = (
        disp_sim_df.query('bill_id == @sel_bill and model == @sel_model')
    )
    assert len(sel_bill_prog) == 1, (
        'expected exactly one summary row for the selected bill/model pair'
    )
    sel_bill_prog = sel_bill_prog.iloc[0]['progress_pct_src_sim']
    # A bill with a single version may carry a scalar instead of a sequence
    if not hasattr(sel_bill_prog, '__len__'):
        sel_bill_prog = [sel_bill_prog]
    sel_bill_nvers = len(sel_bill_prog)
    selseg_col1.bar_chart(
        pd.DataFrame({
            'version': np.arange(1, sel_bill_nvers + 1),
            '% model sentences in bill': sel_bill_prog
        }),
        x='version',
        y='% model sentences in bill'
    )

    # Table model sentences with single-row selection to inspect further
    selseg_col2.subheader(f'Sentences from model {sel_model} found in bill')
    selseg_col2.text(
        'Note: Select one of the model sentences from table below.'
    )
    selseg_col2.dataframe(
        data=(
            disp_ver_seg_df.style
            .background_gradient(
                cmap=ratio_bg_cmap,
                vmin=thres_ratio,
                vmax=100.0,
                subset=disp_ver_seg_df.filter(regex='bill').columns
            )
            .format(precision=1)
        ),
        selection_mode='single-row-required',
        key='selected_model_sentence',
        on_select="rerun",
    )

    # Display model sentence and the potential corresponding bill's sentences per version
    __inspect_single_sentence__(
        tab,
        text_df,
        sel_sim_seg_df,
        disp_ver_seg_df,
        sel_bill_nvers,
        inspect_disp_max_cols,
        thres_fuzzy_ratio_valid,
        disp_char_jitter,
        disp_text_flank,
        disp_text_color,
    )
def _escape_dollar_signs(text):
    """Return ``text`` with every ``$`` replaced by ``\\$``.

    NOTE(review): presumably this keeps the renderer from treating ``$...$``
    as math -- confirm against the annotation package.
    """
    return text.replace('$', r'\$')


def __inspect_single_sentence__(
    tab,
    text_df,
    sel_sim_seg_df,
    disp_ver_seg_df,
    sel_bill_nvers,
    inspect_disp_max_cols=4,
    thres_fuzzy_ratio_valid=50,
    disp_char_jitter=100,
    disp_text_flank=' [...] ',
    disp_text_color='#fc9272',
):
    """Show one model sentence next to its closest match in each bill version.

    The sentence is taken from the row selected in the table stored under
    ``st.session_state['selected_model_sentence']``. Nothing is rendered when
    that key is absent, when no row is selected, or when ``disp_ver_seg_df``
    has no rows.

    Args:
        tab: Container the widgets are drawn into; ``subheader``, ``text`` and
            ``columns`` are called on it.
        text_df: Bill text table with columns ``doc_id`` and ``text``.
        sel_sim_seg_df: Match rows for the selected bill/model pair. Columns
            read: ``source__model_sent_idx``, ``target__doc_rank``,
            ``target__doc_id``, ``ratio``, ``target_postproc_start`` and
            ``target_postproc_end``.
        disp_ver_seg_df: Displayed sentence table; the selected row index
            refers to it and ``('model', 'sentence id')`` /
            ``('model', 'sentence')`` are read from it.
        sel_bill_nvers: Number of bill versions to show (ranks ``1..n``).
        inspect_disp_max_cols: Number of display columns per row of columns.
        thres_fuzzy_ratio_valid: A match is displayed only when its ``ratio``
            is strictly greater than this value.
        disp_char_jitter: Number of context characters shown on each side of
            the matched span.
        disp_text_flank: Marker added before and after the shown excerpt.
        disp_text_color: Background colour of the highlighted span.

    Raises:
        AssertionError: If more than one match row exists for a given
            sentence/version, or ``text_df`` does not hold exactly one row
            for the matched document id.
    """
    if 'selected_model_sentence' not in st.session_state:
        return

    # Get selection. Guard against an empty selection or an empty table, both
    # of which would otherwise fail on the positional lookups below.
    sel_rows = st.session_state['selected_model_sentence']['selection']['rows']
    if not sel_rows or len(disp_ver_seg_df) == 0:
        return
    sel_state_ms_idx = sel_rows[0]
    # Fall back to the first row when the stored index is past the table end
    if sel_state_ms_idx >= len(disp_ver_seg_df):
        sel_state_ms_idx = 0
    sel_ms_row = disp_ver_seg_df.iloc[sel_state_ms_idx]
    sel_ms_idx = sel_ms_row[('model', 'sentence id')]
    sel_ms = sel_ms_row[('model', 'sentence')]

    tab.subheader(f'Selected model sentence id {sel_ms_idx}, and found occurences across bill versions')
    tab.text(
        'Note: The annotation package currently messes with spacing so currently there may be disjoint sentences in show bill texts. '
        'Typically at least 70% is where the similarity should be considered. Disregard stuff under that. '
        'Additionally, the highlighted bill sentence may be a bit off sometimes to save processing time.'
    )

    # Create columns to display bill's segments similar to model sentence
    # (one slot for the model sentence plus one per bill version).
    num_dispsent_cols = sel_bill_nvers + 1
    dispsent_cols = []
    for _ in range(int(np.ceil(num_dispsent_cols / inspect_disp_max_cols))):
        dispsent_cols.extend(tab.columns([1] * inspect_disp_max_cols))

    for i_dsc, ds_col in enumerate(dispsent_cols):
        # Display model sentence first
        if i_dsc == 0:
            ds_col.badge("Model sentence", color="blue")
            ds_col.text(sel_ms)
            continue

        # Remaining columns are padding once all bill versions are shown
        if i_dsc > sel_bill_nvers:
            break

        # `i_dsc` between 1 and `sel_bill_nvers` are the bill versions
        sel_tgt_row = (
            sel_sim_seg_df.query(
                'source__model_sent_idx == @sel_ms_idx '
                'and target__doc_rank == @i_dsc'
            )
        )

        # None found
        if len(sel_tgt_row) == 0:
            ds_col.badge(
                f"Bill ver. {i_dsc} sentence",
                color="red"
            )
            ds_col.markdown('*(no fuzzy equiv. found)*')
            continue

        # Get version and text
        assert len(sel_tgt_row) == 1, (
            'expected at most one match row per model sentence and bill version'
        )
        sel_tgt_row = sel_tgt_row.iloc[0]
        sel_disp_doc_id = sel_tgt_row['target__doc_id']
        sel_bill_ver_model_ratio = sel_tgt_row['ratio']
        ratio_is_valid = sel_bill_ver_model_ratio > thres_fuzzy_ratio_valid

        ds_col.badge(
            f"Bill ver. {i_dsc} sentence" + (
                f" ({sel_bill_ver_model_ratio:.1f}%)"
                if ratio_is_valid
                else ''
            ),
            color="red"
        )

        # Too low fuzzy ratio to be displayed
        if not ratio_is_valid:
            ds_col.markdown('*(no fuzzy equiv. found)*')
            continue

        # Sufficient fuzzy ratio to display text, even if not reaching the global `thres_ratio`
        sel_bill_ver_text = text_df.query('doc_id == @sel_disp_doc_id')
        assert len(sel_bill_ver_text) == 1, (
            'expected exactly one text row for the matched document id'
        )
        # Keep the text unescaped while slicing: escaping `$` first would
        # lengthen the string and shift the character offsets below.
        sel_bill_ver_text = sel_bill_ver_text.iloc[0]['text']

        # Target bill text segments with some flanking to see some context around
        # Note: the indices are found after some processing from `rapidfuzz`, but here just display pre-proc text
        sel_tgt_start = int(sel_tgt_row['target_postproc_start'])
        sel_tgt_end = int(sel_tgt_row['target_postproc_end'])
        flanked_sel_tgt_start = max(sel_tgt_start - disp_char_jitter, 0)
        # Slice ends are exclusive, so the upper bound is the full length
        flanked_sel_tgt_end = min(sel_tgt_end + disp_char_jitter, len(sel_bill_ver_text))

        leading_context = sel_bill_ver_text[flanked_sel_tgt_start:sel_tgt_start]
        matched_span = sel_bill_ver_text[sel_tgt_start:sel_tgt_end]
        trailing_context = sel_bill_ver_text[sel_tgt_end:flanked_sel_tgt_end]

        with ds_col:
            annotated_text(
                disp_text_flank + _escape_dollar_signs(leading_context),
                (
                    _escape_dollar_signs(matched_span),
                    '', disp_text_color
                ),
                _escape_dollar_signs(trailing_context) + disp_text_flank
            )
|