| | import gradio as gr |
| | import json |
| |
|
| |
|
| | import os |
# Point the GRADIO_TEMP_DIR environment variable at a project-local folder
# and create that folder up front (no error if it already exists).
os.environ.update({"GRADIO_TEMP_DIR": "./datasets/temp"})
os.makedirs(os.environ.get("GRADIO_TEMP_DIR"), exist_ok=True)
| |
|
| | import yaml |
| | import argparse |
| | import os |
| | import urllib.request |
| | from tqdm import tqdm |
| |
|
| | from dotenv import load_dotenv |
| | from openai import OpenAI |
| | from utils.file_download import download_file_override |
| |
|
| |
|
def load_config(path="config/config.yaml"):
    """Load the YAML configuration file at *path*.

    Args:
        path: Location of the YAML file to read.

    Returns:
        The parsed YAML content. An empty (or otherwise falsy) document is
        returned as an empty dict so that callers can use ``cfg.get(...)``
        without first checking for ``None``.

    Raises:
        OSError: If the file cannot be opened.
        yaml.YAMLError: If the content is not valid YAML.
    """
    # Decode explicitly as UTF-8 rather than relying on the platform default.
    with open(path, "r", encoding="utf-8") as f:
        return yaml.safe_load(f) or {}
| | |
# Module-level configuration, read once when the module is imported.
cfg = load_config()

# Fetch each data asset named in the config; every asset has a `<name>_url`
# and a `<name>_path` entry that are handed to download_file_override.
for _asset in ("interp_space", "instances_to_explain", "gram2vec_feats"):
    download_file_override(cfg.get(f"{_asset}_url"), cfg.get(f"{_asset}_path"))
| |
|
| | from utils.visualizations import * |
| | from utils.llm_feat_utils import * |
| | from utils.gram2vec_feat_utils import * |
| | from utils.interp_space_utils import * |
| | from utils.ui import * |
| |
|
# Load variables from a .env file before the API key is read below.
load_dotenv()

# OpenAI client configured from the OPENAI_API_KEY environment variable
# (None is passed through when the variable is unset).
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

# Code map returned by load_code_map(); loaded once at import time.
GRAM2VEC_SHORTHAND = load_code_map()
| |
|
def validate_ground_truth(gt1, gt2, gt3):
    """Check that exactly one candidate is flagged as the ground truth author.

    Args:
        gt1: Truthy when candidate 1 is marked as ground truth.
        gt2: Truthy when candidate 2 is marked as ground truth.
        gt3: Truthy when candidate 3 is marked as ground truth.

    Returns:
        A ``(index, message)`` tuple. ``index`` is the zero-based position of
        the selected candidate, or ``None`` when zero or more than one
        candidate is selected. ``message`` is a human-readable status string.
    """
    # Coerce to real booleans so that None (or any other truthy/falsy value)
    # can be counted and searched without raising.
    selected = [bool(flag) for flag in (gt1, gt2, gt3)]
    selected_count = sum(selected)

    if selected_count > 1:
        return None, "Please select only one ground truth author."
    if selected_count == 0:
        return None, "No ground truth author selected."

    index = selected.index(True)
    return index, f"Candidate {index+1} is marked as the ground truth author."
| |
|
| |
|
def app(share=False, use_cluster_feats=False):
    """Build the Author Attribution explainability UI and launch it.

    Args:
        share: Forwarded unchanged to ``demo.launch(share=...)``.
        use_cluster_feats: When true, the feature lists are refreshed from the
            cluster dropdown (``on_cluster_change``); otherwise they are
            refreshed from the axis ranges the plot reports after a zoom
            (``handle_zoom_with_retries``).

    NOTE(review): the indentation of this function was not recoverable, so the
    nesting of the layout containers (``with gr.Row()`` / ``with gr.Column()``)
    was reconstructed from context. In particular, ``validation_msg`` is placed
    inside the custom-upload column and ``demo.launch`` after the ``Blocks``
    context -- confirm against the intended layout.

    NOTE(review): several user-facing strings contained mis-decoded characters.
    Those whose original could be determined were restored (typographic
    apostrophe, em dash, hourglass, star). The icon glyphs and the
    candidate/background marker symbols could not be determined and are left
    exactly as found -- replace them with the intended characters.
    """
    instances, instance_ids = get_instances(cfg['instances_to_explain_path'])

    interp = load_interp_space(cfg)
    # Take an explicit copy of the first 1000 rows so that the column
    # assignment below does not write through a slice of the loaded frame.
    clustered_authors_df = interp['clustered_authors_df'][:1000].copy()
    # Keep only the first three items of each author's 'fullText' entry.
    clustered_authors_df['fullText'] = clustered_authors_df['fullText'].map(lambda texts: texts[:3])

    with gr.Blocks(title="Author Attribution Explainability Tool") as demo:
        # ── Page title ─────────────────────────────────────────────────────
        gr.HTML(styled_block("""
            <h1 style="
            text-align:center;
            font-size:3em; /* About 48px */
            margin-bottom:0.3em;
            font-weight:700;
            ">
            Author Attribution (AA) Explainability Tool
            </h1>
            """))

        # ── Intro blurb with the three "Visualize / Generate / Compare" cards ─
        gr.HTML(styled_block("""
            <div style="
            text-align:center;
            margin: 1em auto 2em auto;
            max-width:900px;
            ">
            <p style="font-size:1.3em; line-height:1.4;">
            This demo helps you <strong>see inside</strong> a deep AA model’s latent style space.
            </p>
            <p style="font-size:0.9em; line-height:1.4;">
            Currently you are inspecting <a href="https://huggingface.co/rrivera1849/LUAR-MUD">LUAR</a> with pre-defined AA tasks from the <a href="https://www.iarpa.gov/images/research-programs/HIATUS/IARPA_HIATUS_Phase_1_HRS_Data.to_DAC_20240610.pdf">HRS dataset </a>
            </p>
            <div style="
            display:flex;
            justify-content:center;
            gap:3em;
            margin-top:1em;
            ">
            <!-- Visualize -->
            <div style="max-width:200px;">
            <div style="font-size:2em;">π</div>
            <h4 style="margin:0.2em 0;">Visualize</h4>
            <p style="margin:0; font-size:1em; line-height:1.3;">
            Place your AA task with respect to other background authors.
            </p>
            </div>
            <!-- GENERATE -->
            <div style="max-width:200px;">
            <div style="font-size:2em;">βοΈ</div>
            <h4 style="margin:0.2em 0;">Generate</h4>
            <p style="margin:0; font-size:1em; line-height:1.3;">
            Describe your investigated authors' writing style via human-readable LLM-generated style features.
            </p>
            </div>
            <!-- COMPARE -->
            <div style="max-width:200px;">
            <div style="font-size:2em;">βοΈ</div>
            <h4 style="margin:0.2em 0;">Compare</h4>
            <p style="margin:0; font-size:1em; line-height:1.3;">
            Contrast with <a href="https://github.com/eric-sclafani/gram2vec">Gram2Vec</a> stylometric features.
            </p>
            </div>
            </div>
            </div>
            """))

        # ── Usage instructions ─────────────────────────────────────────────
        with gr.Accordion("π How to Use", open=True):
            gr.Markdown("""
                1. **Select** a model and a task source (pre-defined or custom)
                2. Click **Load Task & Generate Embeddings** to load the task and generate embeddings
                3. **Run Visualization** to see the mystery author and candidates in the AA model's latent space
                4. **Zoom** into the visualization to select a cluster of background authors
                5. Pick an **LLM feature** to highlight in yellow
                6. Pick a **Gram2Vec feature** to highlight in blue
                7. Click **Show Combined Spans** to compare side-by-side
                """
            )

        # ── Model selection ────────────────────────────────────────────────
        model_radio = gr.Radio(
            choices=[
                'gabrielloiseau/LUAR-MUD-sentence-transformers',
                'gabrielloiseau/LUAR-CRUD-sentence-transformers',
                'miladalsh/light-luar',
                'AnnaWegmann/Style-Embedding',
                'Other'
            ],
            value='gabrielloiseau/LUAR-MUD-sentence-transformers',
            label='Choose a Model to inspect'
        )
        print(f"Model choices: {model_radio.choices}")
        print(f"Model default: {model_radio.value}")
        # Hidden by default; toggle_custom_model decides when to reveal it.
        custom_model = gr.Textbox(
            label='Custom Model ID',
            placeholder='Enter your Hugging Face Model ID here',
            visible=False,
            interactive=True
        )

        model_radio.change(
            fn=toggle_custom_model,
            inputs=[model_radio],
            outputs=[custom_model]
        )

        # ── Task source: predefined task or user upload ────────────────────
        task_mode = gr.Radio(
            choices=["Predefined HRS Task", "Upload Your Own Task"],
            value="Predefined HRS Task",
            label="Select Task Source"
        )

        ground_truth_author = gr.State()

        with gr.Column():
            with gr.Column(visible=True) as predefined_container:
                gr.HTML("""
                    <div style="
                    font-size: 1.3em;
                    font-weight: 600;
                    margin-bottom: 0.5em;
                    ">
                    Pick a pre-defined task to investigate (a mystery text and its three candidate authors)
                    </div>
                    """)
                task_dropdown = gr.Dropdown(
                    choices=[f"Task {i}" for i in instance_ids],
                    value=f"Task {instance_ids[0]}",
                    label="Choose which mystery document to explain",
                )
            with gr.Column(visible=False) as custom_container:
                gr.HTML("""
                    <div style="
                    font-size: 1.3em;
                    font-weight: 600;
                    margin-bottom: 0.5em;
                    ">
                    Upload your own task
                    </div>
                    """)
                mystery_input = gr.File(label="Mystery (.txt)", file_types=['.txt'])
                with gr.Row():
                    candidate1 = gr.File(label="Candidate 1 (.txt)", file_types=['.txt'])
                    gt1_checkbox = gr.Checkbox(label="Ground Truth?", value=False)

                with gr.Row():
                    candidate2 = gr.File(label="Candidate 2 (.txt)", file_types=['.txt'])
                    gt2_checkbox = gr.Checkbox(label="Ground Truth?", value=False)

                with gr.Row():
                    candidate3 = gr.File(label="Candidate 3 (.txt)", file_types=['.txt'])
                    gt3_checkbox = gr.Checkbox(label="Ground Truth?", value=False)

                validation_msg = gr.Textbox(label="Validation Result", interactive=False)

                # Any checkbox change re-validates all three together and
                # stores the resulting index in ground_truth_author.
                for checkbox in [gt1_checkbox, gt2_checkbox, gt3_checkbox]:
                    checkbox.change(
                        fn=validate_ground_truth,
                        inputs=[gt1_checkbox, gt2_checkbox, gt3_checkbox],
                        outputs=[ground_truth_author, validation_msg]
                    )

        # ── Load task & generate embeddings ────────────────────────────────
        gr.HTML(instruction_callout("Click the button below to load the tasks and generate embeddings using selected model."))
        load_button = gr.Button("Load Task & Generate Embeddings")

        # NOTE(review): the result is never read in this function; the call is
        # kept in case load_instance has side effects -- confirm and remove.
        default_outputs = load_instance(0, instances)

        header = gr.HTML()
        mystery = gr.HTML()
        mystery_state = gr.State()
        with gr.Row():
            c0 = gr.HTML()
            c1 = gr.HTML()
            c2 = gr.HTML()
        c0_state = gr.State()
        c1_state = gr.State()
        c2_state = gr.State()

        task_authors_embeddings_df = gr.State()
        background_authors_embeddings_df = gr.State()
        task_mode.change(
            fn=toggle_task,
            inputs=[task_mode],
            outputs=[predefined_container, custom_container]
        )

        predicted_author = gr.State()
        # Three chained steps: disable the button with a "loading" label,
        # run update_task_display, then restore the button.
        load_button.click(
            fn=lambda: gr.update(value="⏳ Loading... Please wait", interactive=False),
            inputs=[],
            outputs=[load_button]
        ).then(
            fn=lambda mode, dropdown, mystery, c1, c2, c3, ground_truth_author, model_radio, custom_model_input:
                update_task_display(
                    mode,
                    dropdown,
                    instances,
                    clustered_authors_df,
                    mystery,
                    c1,
                    c2,
                    c3,
                    ground_truth_author,
                    model_radio,
                    custom_model_input
                ),
            inputs=[task_mode, task_dropdown, mystery_input, candidate1, candidate2, candidate3, ground_truth_author, model_radio, custom_model],
            outputs=[header, mystery, c0, c1, c2, mystery_state, c0_state, c1_state, c2_state, task_authors_embeddings_df, background_authors_embeddings_df, predicted_author, ground_truth_author]
        ).then(
            fn=lambda: gr.update(value="Load Task & Generate Embeddings", interactive=True),
            inputs=[],
            outputs=[load_button]
        )

        # ── Visualization ──────────────────────────────────────────────────
        gr.HTML(instruction_callout("Run visualization to see which author is similar to the mystery document."))
        run_btn = gr.Button("Run visualization")
        bg_proj_state = gr.State()
        bg_lbls_state = gr.State()
        bg_authors_df = gr.State()
        with gr.Row():
            with gr.Column(scale=3):
                # Hidden textbox the browser-side script writes zoom ranges
                # into (as JSON) so that Python can react to them.
                axis_ranges = gr.Textbox(visible=False, elem_id="axis-ranges")
                plot = gr.Plot(
                    label="Visualization",
                    elem_id="feature-plot",
                )
                # Browser-only handler (fn=None): once the Plotly element
                # exists, attach a single 'plotly_relayout' listener that
                # copies the new x/y axis ranges into the hidden textbox.
                plot.change(
                    fn=None,
                    inputs=[plot],
                    outputs=[axis_ranges],
                    js="""
                    function(){
                        console.log("------------>[JS] plot.change() triggered<------------");

                        let attempts = 0;
                        const maxAttempts = 50;

                        const tryAttach = () => {
                            const gd = document.querySelector('#feature-plot .js-plotly-plot');
                            if (!gd) {
                                if (++attempts < maxAttempts) {
                                    requestAnimationFrame(tryAttach);
                                } else {
                                    console.error(" ------------>Could not find .js-plotly-plot after multiple attempts.<------------");
                                }
                                return;
                            }

                            if (gd.__zoomListenerAttached) {
                                console.log("------------>Zoom listener already attached.<------------");
                                return;
                            }

                            gd.__zoomListenerAttached = true;
                            console.log("------------>Zoom listener attached!<------------");

                            gd.on('plotly_relayout', (ev) => {
                                if (
                                    ev['xaxis.range[0]'] === undefined ||
                                    ev['xaxis.range[1]'] === undefined ||
                                    ev['yaxis.range[0]'] === undefined ||
                                    ev['yaxis.range[1]'] === undefined
                                ) return;

                                const payload = {
                                    xaxis: [ev['xaxis.range[0]'], ev['xaxis.range[1]']],
                                    yaxis: [ev['yaxis.range[0]'], ev['yaxis.range[1]']]
                                };

                                const txtbox = document.querySelector('#axis-ranges textarea');
                                if (txtbox) {
                                    txtbox.value = JSON.stringify(payload);
                                    txtbox.dispatchEvent(new Event('input', { bubbles: true }));
                                    console.log("------------> Zoom payload dispatched:<------------", payload);
                                } else {
                                    console.warn("------------> No hidden textbox found to write zoom payload.<------------");
                                }
                            });
                        };

                        requestAnimationFrame(tryAttach);
                        return '';
                    }
                    """
                )

            with gr.Column(scale=1):
                expl_html = """
                <h4>What am I looking at?</h4>
                <p>
                This plot shows the mystery author (★) and three candidate authors (β)
                in the AA model’s latent space.<br>
                The grey β symbols represent the background corpus—real authors with diverse writing styles.
                You can zoom in on any region of the plot. The system will analyze the visible authors
                in that area and list the most representative writing style features for the zoomed-in region.<br>
                Use this to compare your mystery text’s position against nearby writing styles and
                investigate which features distinguish it from others.
                </p>
                """
                gr.HTML(styled_html(expl_html))

        cluster_dropdown = gr.Dropdown(choices=[], label="Select Cluster to Inspect", visible=False)
        style_map_state = gr.State()
        llm_style_feats_analysis = gr.State()
        visible_zoomed_authors = gr.State()

        if use_cluster_feats:
            gr.HTML(instruction_callout("Choose a cluster from the dropdown below to inspect whether its features appear in the mystery author’s text."))
            cluster_dropdown.visible = True
        else:
            gr.HTML(instruction_callout("Zoom in on the plot to select a set of background authors and see the presence of the top features from this set in candidate and mystery authors."))

        # ── Feature pickers: LLM-derived (left) and Gram2Vec (right) ───────
        with gr.Row():
            with gr.Column(scale=1, min_width=0):
                gr.HTML("""
                    <div style="
                    font-size: 1.3em;
                    font-weight: 600;
                    margin-bottom: 0.5em;
                    ">
                    LLM-derived style features prominent in the zoomed-in region
                    </div>
                    """)
                features_rb = gr.Radio(choices=[], label="LLM-derived style features for this zoomed-in region")
                feature_list_state = gr.State()

            with gr.Column(scale=1, min_width=0):
                gr.HTML("""
                    <div style="
                    font-size: 1.3em;
                    font-weight: 600;
                    margin-bottom: 0.5em;
                    ">
                    Gram2Vec Features prominent in the zoomed-in region
                    </div>
                    """)
                gram2vec_rb = gr.Radio(choices=[], label="Gram2Vec features for this zoomed-in region")
                gram2vec_state = gr.State()

        # The dropdown value has the form "Task <id>"; strip the prefix to
        # recover the integer id passed to visualize_clusters_plotly.
        run_btn.click(
            fn=lambda iid, model_radio, custom_model_input, task_authors_embeddings_df, background_authors_embeddings_df, predicted_author, ground_truth_author: visualize_clusters_plotly(
                int(iid.replace('Task ','')), cfg, instances, model_radio,
                custom_model_input, task_authors_embeddings_df, background_authors_embeddings_df, predicted_author, ground_truth_author
            ),
            inputs=[task_dropdown, model_radio, custom_model, task_authors_embeddings_df, background_authors_embeddings_df, predicted_author, ground_truth_author],
            outputs=[plot, style_map_state, bg_proj_state, bg_lbls_state, bg_authors_df]
        )

        if use_cluster_feats:
            cluster_dropdown.change(
                fn=on_cluster_change,
                inputs=[cluster_dropdown, style_map_state],
                outputs=[features_rb, gram2vec_rb, feature_list_state]
            )
        else:
            # Fires whenever the browser-side listener writes new axis ranges
            # into the hidden textbox.
            axis_ranges.change(
                fn=handle_zoom_with_retries,
                inputs=[axis_ranges, bg_proj_state, bg_lbls_state, bg_authors_df, task_authors_embeddings_df],
                outputs=[features_rb, gram2vec_rb, llm_style_feats_analysis, feature_list_state, visible_zoomed_authors]
            )

        # ── Combined span highlighting ─────────────────────────────────────
        gr.HTML(
            instruction_callout(
                "Click \"Show Combined Spans\" to highlight the LLM (yellow) & Gram2Vec (blue) feature spans in the texts"
            )
            + """
            <div style="
            display: flex;
            align-items: center;
            justify-content: center;
            gap: 2em;
            margin-top: 0.5em;
            font-size: 0.9em;
            ">
            <div style="display: flex; align-items: center; gap: 0.5em; font-weight: 600; font-size: 1.5em;">
            <span style="
            display: inline-block;
            width: 1.5em; height: 1.5em;
            background: #FFEB3B; /* bright yellow */
            border: 1px solid #666;
            vertical-align: middle;
            "></span>
            LLM feature
            </div>
            <div style="display: flex; align-items: center; gap: 0.5em; font-weight: 600; font-size: 1.5em;">
            <span style="
            display: inline-block;
            width: 1.5em; height: 1.5em;
            background: #5CB3FF; /* clearer blue */
            border: 1px solid #666;
            vertical-align: middle;
            "></span>
            Gram2Vec feature
            </div>
            </div>
            """
        )

        combined_btn = gr.Button("Show Combined Spans")
        combined_html = gr.HTML()
        show_background_checkbox = gr.Checkbox(label="Show spans in background authors", value=False)
        background_html = gr.HTML(visible=False)

        combined_btn.click(
            fn=show_combined_spans_all,
            inputs=[features_rb,
                    gram2vec_rb,
                    llm_style_feats_analysis,
                    background_authors_embeddings_df,
                    task_authors_embeddings_df,
                    visible_zoomed_authors,
                    predicted_author,
                    ground_truth_author],
            outputs=[combined_html, background_html]
        )

        # The checkbox only toggles visibility of the background-author panel.
        show_background_checkbox.change(
            fn=lambda show: gr.update(visible=show),
            inputs=[show_background_checkbox],
            outputs=[background_html]
        )

    demo.launch(share=share)
| |
|
if __name__ == "__main__":
    # Command-line entry point: a single optional switch selects the
    # cluster-dropdown workflow; the app is always launched with share=True.
    cli = argparse.ArgumentParser()
    cli.add_argument(
        "--use_cluster_feats",
        action="store_true",
        help="Use cluster-based selection for features",
    )
    options = cli.parse_args()
    app(share=True, use_cluster_feats=options.use_cluster_feats)
| |
|