Spaces:
Running
Running
File size: 6,339 Bytes
fc56c31 afc8e3b fc56c31 7d9ee3f |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 |
import gradio as gr
import sqlite3
import pandas as pd
import plotly.express as px
from utils import *
from huggingface_hub import hf_hub_download
db_path = hf_hub_download(
repo_id="hemantn/antibody-paired-sequences",
filename="antibody_data_year.db",
repo_type="dataset"
)
conn = sqlite3.connect(db_path, check_same_thread=False)
#conn = sqlite3.connect('/data/hn533621/OAS/Paired_Data_Analysis/antibody_data_year.db', check_same_thread=False)
all_df = pd.read_sql("SELECT * FROM antibody_data", conn)
# Compute min / max for each column
vh_min, vh_max = int(all_df["vh_length"].min()), int(all_df["vh_length"].max())
vl_min, vl_max = int(all_df["vl_length"].min()), int(all_df["vl_length"].max())
def apply_filters(dropdown_1, dropdown_2, dropdown_3, dropdown_4, slider_vh, slider_vl):
"""
Build a query to filter the data based on the user's input
"""
clauses = []
params = {}
if dropdown_1: clauses.append('v_call_heavy_first = :vh_germline'); params['vh_germline'] = dropdown_1
if dropdown_2: clauses.append('v_call_light_first = :vl_germline'); params['vl_germline'] = dropdown_2
if dropdown_3: clauses.append('BType = :btype'); params['btype'] = dropdown_3
if dropdown_4: clauses.append('Disease = :disease'); params['disease'] = dropdown_4
if slider_vh:
clauses.append("vh_length BETWEEN :vh_min AND :vh_max")
params["vh_min"] = 80
params["vh_max"] = slider_vh
if slider_vl:
clauses.append("vl_length BETWEEN :vl_min AND :vl_max")
params["vl_min"] = 80
params["vl_max"] = slider_vl
sql = "SELECT * FROM antibody_data"
if clauses:
sql += " WHERE " + " AND ".join(clauses)
df = pd.read_sql_query(sql, conn, params=params)
#print(df.columns)
# ---- Reorder and rename columns ----
# 1. Rename to your desired display names
rename_map = {
"BType": "BType",
"Disease": "Disease",
"v_call_heavy_first": "vcall_VH",
"v_call_light_first": "vcall_VL",
"vh_length": "VH_length",
"vl_length": "VL_length",
"sequence_alignment_aa_heavy": "VH",
"sequence_alignment_aa_light": "VL",
"Year": "Year",
}
df = df.rename(columns=rename_map)
# 2. Reorder the columns
desired_order = [
"BType",
"Disease",
"vcall_VH",
"vcall_VL",
"VH_length",
"VL_length",
"Year",
"VH",
"VL",
]
# Only keep columns that actually exist (avoids errors if some missing)
df = df[[c for c in desired_order if c in df.columns]]
total_rows = conn.execute("SELECT COUNT(*) FROM antibody_data").fetchone()[0]
#vcall_vh_bar = bar_vcall_vh(df, total_rows, dropdown_1)
year_bar = bar_year_count(df)
#vcall_vl_bar = bar_vcall_vl(df, total_rows, dropdown_2)
combined_vh_vl_bar = bar_vh_vl_combined(df, total_rows, dropdown_1, dropdown_2)
#disease_bar = bar_disease_count(df, total_rows, dropdown_4)
#btype_bar = bar_btype_count(df, total_rows, dropdown_3)
vh_fig, vl_fig = hist_vh_vl_separate(df)
fasta_file = make_fasta_file(df)
total_sequences = 2*len(df)
return df.head(5), combined_vh_vl_bar, year_bar, vh_fig, vl_fig, fasta_file, total_sequences
with gr.Blocks(theme=gr.themes.Ocean()) as demo:
gr.Markdown(
"""
<h1 style='text-align:center; color:#3A7; margin-bottom:0.5em;'>🔎 Antibody Database Dashboard</h1>
<p style='text-align:center; color:gray;'>
Filter sequences, explore counts, and download custom FASTA files.
</p>
"""
)
with gr.Row():
with gr.Column(scale=1):
dropdown_1 = gr.Dropdown(choices = [""] + sorted(all_df['v_call_heavy_first'].unique()), \
value = None, label = 'VH germline')
dropdown_2 = gr.Dropdown(choices = [""] + sorted(all_df['v_call_light_first'].unique()), \
value = None, label = 'VL germline')
dropdown_3 = gr.Dropdown(choices = [""] + sorted(all_df['BType'].unique()), \
value = None, label = 'B-Type')
dropdown_4 = gr.Dropdown(choices = [""] + sorted(all_df['Disease'].unique()), \
value = None, label = 'Disase')
slider_vh = gr.Slider(value=vh_max, minimum=vh_min, maximum=vh_max, step=1, label = 'VH length')
slider_vl = gr.Slider(value=vl_max, minimum=vl_min, maximum=vl_max, step=1, label = 'VL length')
button = gr.Button('Apply Filters', variant='primary')
with gr.Column(scale=2):
with gr.Row():
with gr.Column(scale=1):
combined_vh_vl_bar = gr.Plot(label = 'VH and VL Germline')
with gr.Column(scale=1):
vh_fig = gr.Plot(label = 'VH Length Distribution')
with gr.Row():
with gr.Column(scale=1):
year_bar = gr.Plot(label = 'Year Wise Distribution')
with gr.Column(scale=1):
vl_fig = gr.Plot(label = 'VL Length Distribution')
with gr.Row():
#with gr.Column(scale=0.3):
#gr.Plot(label= 'Yearwise distribution of antibodies')
with gr.Column(scale=1):
with gr.Row():
with gr.Column(scale=2):
df_out = gr.Dataframe()
with gr.Column(scale=1):
fasta_file = gr.File(label= 'Download Antibody Data Fasta file', interactive=False)
with gr.Row():
total_sequences = gr.Textbox(label= 'No of Sequences in Fasta file', value=0, interactive=False)
slider_vh.input(update_vh, inputs=slider_vh, outputs=slider_vh, queue=False)
slider_vl.input(update_vl, inputs=slider_vl, outputs=slider_vl, queue=False)
inputs = [dropdown_1, dropdown_2, dropdown_3, dropdown_4, slider_vh, slider_vl]
#outputs = df,
#outputs = [plot_vh_germline, plot_vl_germline, plot_disease_count, plot_btype_count, \
# plot_year_data, dataframe, download_fasta]
button.click(
fn=apply_filters,
inputs = inputs,
outputs = [df_out, combined_vh_vl_bar, year_bar, vh_fig, vl_fig, fasta_file, total_sequences]
)
demo.launch()
|