hemantn commited on
Commit
fc56c31
·
1 Parent(s): 516b284

build files added

Browse files
Files changed (4) hide show
  1. README.md +57 -8
  2. app.py +160 -0
  3. requirements.txt +5 -0
  4. utils.py +414 -0
README.md CHANGED
@@ -1,14 +1,63 @@
1
  ---
2
- title: Antibody Database
3
- emoji: 🦀
4
- colorFrom: yellow
5
- colorTo: yellow
6
  sdk: gradio
7
- sdk_version: 5.49.0
8
- app_file: app.py
9
  pinned: false
10
  license: mit
11
- short_description: 'Interactive antibody database - filter, analyze, and export '
12
  ---
13
 
14
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
+ title: Antibody Database Dashboard
3
+ emoji: 🔎
4
+ colorFrom: blue
5
+ colorTo: green
6
  sdk: gradio
7
+ sdk_version: 5.48.0
8
+ app_file: app.py
9
  pinned: false
10
  license: mit
11
+ short_description: Interactive antibody database - filter, analyze, and export
12
  ---
13
 
14
+ # 🧬 Antibody Database Dashboard
15
+
16
+ An interactive web dashboard for exploring antibody sequence data using Gradio and Plotly. This dashboard allows users to filter antibody sequences by various criteria and visualize the data through interactive charts.
17
+
18
+ ## Features
19
+
20
+ - **Interactive Filtering**: Filter sequences by VH/VL germline, B-cell type, disease, and sequence length
21
+ - **Data Visualization**:
22
+ - VH and VL germline distribution charts
23
+ - Length distribution histograms
24
+ - Year-wise sequence distribution
25
+ - **Data Export**: Download filtered sequences as FASTA files
26
+ - **Real-time Statistics**: View sequence counts and statistics
27
+
28
+ ## 🚀 Usage
29
+
30
+ 1. **Select Filters**:
31
+ - Choose VH and VL germlines from dropdowns
32
+ - Select B-cell type and disease
33
+ - Adjust sequence length sliders
34
+
35
+ 2. **Apply Filters**:
36
+ - Click "Apply Filters" to update the dashboard
37
+ - View filtered data in the table
38
+ - Explore visualizations
39
+
40
+ 3. **Export Data**:
41
+ - Download filtered sequences as FASTA files
42
+ - View sequence counts and statistics
43
+
44
+ ## 📋 Requirements
45
+
46
+ - Python 3.11+
47
+ - Required packages (see `requirements.txt`):
48
+ - `gradio>=4.0.0`
49
+ - `pandas>=1.5.0`
50
+ - `plotly>=5.0.0`
51
+ - `huggingface_hub>=0.16.0`
52
+ - `numpy>=1.21.0`
53
+
54
+ ## 🗂️ Project Structure
55
+
56
+ ```
57
+ dashboard_learning/
58
+ ├── app.py # Main dashboard application
59
+ ├── utils.py # Utility functions for data processing
60
+ ├── requirements.txt # Python dependencies
61
+ └── README.md # This file
62
+ ```
63
+
app.py ADDED
@@ -0,0 +1,160 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import sqlite3
3
+ import pandas as pd
4
+ import plotly.express as px
5
+ from utils import *
6
+ from huggingface_hub import hf_hub_download
7
+
8
# Download the SQLite database from the Hugging Face dataset repository.
db_path = hf_hub_download(
    repo_id="hemantn/antibody-paired-sequences",
    filename="antibody_data_year.db",
    repo_type="dataset",
)

# A single module-level connection is reused by every callback.
# check_same_thread=False disables sqlite3's same-thread check
# (presumably because Gradio runs handlers on other threads -- confirm).
conn = sqlite3.connect(db_path, check_same_thread=False)

# Whole table in memory: used for dropdown choices and slider bounds.
all_df = pd.read_sql("SELECT * FROM antibody_data", conn)

# Smallest / largest chain lengths present in the data, as plain ints.
vh_min, vh_max = (
    int(bound) for bound in (all_df["vh_length"].min(), all_df["vh_length"].max())
)
vl_min, vl_max = (
    int(bound) for bound in (all_df["vl_length"].min(), all_df["vl_length"].max())
)
23
+
24
def apply_filters(dropdown_1, dropdown_2, dropdown_3, dropdown_4, slider_vh, slider_vl):
    """
    Query the antibody table with the user's filters and build all dashboard outputs.

    Parameters
    ----------
    dropdown_1, dropdown_2 : str or None
        Selected VH / VL germline, matched against ``v_call_heavy_first`` /
        ``v_call_light_first``. A falsy value ("" or None) disables the filter.
    dropdown_3, dropdown_4 : str or None
        Selected B-cell type (``BType``) and disease (``Disease``); falsy disables.
    slider_vh, slider_vl : int or float
        Upper bound for ``vh_length`` / ``vl_length``. The lower bound is the
        smallest length found in the table (module-level ``vh_min`` / ``vl_min``),
        matching the slider's own minimum instead of a hard-coded 80.

    Returns
    -------
    tuple
        ``(first 5 rows, combined VH/VL bar, year bar, VH histogram,
        VL histogram, FASTA file path or None, 2 * number of matching rows)``.
    """
    clauses = []
    params = {}

    # Equality filters: (selected value, column, bind-parameter name).
    # Column names are fixed literals; user values are always bound parameters.
    equality_filters = (
        (dropdown_1, "v_call_heavy_first", "vh_germline"),
        (dropdown_2, "v_call_light_first", "vl_germline"),
        (dropdown_3, "BType", "btype"),
        (dropdown_4, "Disease", "disease"),
    )
    for selected, column, bind_name in equality_filters:
        if selected:
            clauses.append(f"{column} = :{bind_name}")
            params[bind_name] = selected

    # Length filters: [data minimum, slider value].
    if slider_vh:
        clauses.append("vh_length BETWEEN :vh_min AND :vh_max")
        params["vh_min"] = vh_min
        params["vh_max"] = slider_vh
    if slider_vl:
        clauses.append("vl_length BETWEEN :vl_min AND :vl_max")
        params["vl_min"] = vl_min
        params["vl_max"] = slider_vl

    sql = "SELECT * FROM antibody_data"
    if clauses:
        sql += " WHERE " + " AND ".join(clauses)

    df = pd.read_sql_query(sql, conn, params=params)

    # Rename database columns to the display names the plotting/FASTA helpers use.
    rename_map = {
        "v_call_heavy_first": "vcall_VH",
        "v_call_light_first": "vcall_VL",
        "vh_length": "VH_length",
        "vl_length": "VL_length",
        "sequence_alignment_aa_heavy": "VH",
        "sequence_alignment_aa_light": "VL",
    }
    df = df.rename(columns=rename_map)

    # Keep only the display columns, in display order; skip any that are absent.
    desired_order = [
        "BType",
        "Disease",
        "vcall_VH",
        "vcall_VL",
        "VH_length",
        "VL_length",
        "Year",
        "VH",
        "VL",
    ]
    df = df[[c for c in desired_order if c in df.columns]]

    total_rows = conn.execute("SELECT COUNT(*) FROM antibody_data").fetchone()[0]

    year_bar = bar_year_count(df)
    combined_vh_vl_bar = bar_vh_vl_combined(df, total_rows, dropdown_1, dropdown_2)
    vh_fig, vl_fig = hist_vh_vl_separate(df)
    fasta_file = make_fasta_file(df)

    # make_fasta_file writes two records (VH and VL) per row.
    total_sequences = 2 * len(df)

    return df.head(5), combined_vh_vl_bar, year_bar, vh_fig, vl_fig, fasta_file, total_sequences
90
+
91
def _dropdown_choices(column):
    """Return ``[""]`` plus the sorted distinct non-null values of ``all_df[column]``.

    NULLs are dropped before sorting: comparing a missing value with a string
    would raise ``TypeError`` inside ``sorted``.
    """
    return [""] + sorted(all_df[column].dropna().unique())


# NOTE(review): the nesting of the layout below was reconstructed from a
# rendering that had lost its indentation -- confirm against the running app.
with gr.Blocks(theme=gr.themes.Ocean()) as demo:
    gr.Markdown(
        """
        <h1 style='text-align:center; color:#3A7; margin-bottom:0.5em;'>🔎 Antibody Database Dashboard</h1>
        <p style='text-align:center; color:gray;'>
        Filter sequences, explore counts, and download custom FASTA files.
        </p>
        """
    )

    with gr.Row():
        # Left column: filter controls.
        with gr.Column(scale=1):
            dropdown_1 = gr.Dropdown(choices=_dropdown_choices('v_call_heavy_first'),
                                     value=None, label='VH germline')
            dropdown_2 = gr.Dropdown(choices=_dropdown_choices('v_call_light_first'),
                                     value=None, label='VL germline')
            dropdown_3 = gr.Dropdown(choices=_dropdown_choices('BType'),
                                     value=None, label='B-Type')
            dropdown_4 = gr.Dropdown(choices=_dropdown_choices('Disease'),
                                     value=None, label='Disease')

            slider_vh = gr.Slider(value=vh_max, minimum=vh_min, maximum=vh_max, step=1, label='VH length')
            slider_vl = gr.Slider(value=vl_max, minimum=vl_min, maximum=vl_max, step=1, label='VL length')

            button = gr.Button('Apply Filters', variant='primary')

        # Right column: 2 x 2 grid of plots.
        with gr.Column(scale=2):
            with gr.Row():
                with gr.Column(scale=1):
                    combined_vh_vl_bar = gr.Plot(label='VH and VL Germline')
                with gr.Column(scale=1):
                    vh_fig = gr.Plot(label='VH Length Distribution')
            with gr.Row():
                with gr.Column(scale=1):
                    year_bar = gr.Plot(label='Year Wise Distribution')
                with gr.Column(scale=1):
                    vl_fig = gr.Plot(label='VL Length Distribution')

    with gr.Row():
        with gr.Column(scale=1):
            with gr.Row():
                with gr.Column(scale=2):
                    df_out = gr.Dataframe()
                with gr.Column(scale=1):
                    fasta_file = gr.File(label='Download Antibody Data Fasta file', interactive=False)

    with gr.Row():
        total_sequences = gr.Textbox(label='No of Sequences in Fasta file', value=0, interactive=False)

    # Echo each slider's value back to itself while the user drags it.
    slider_vh.input(update_vh, inputs=slider_vh, outputs=slider_vh, queue=False)
    slider_vl.input(update_vl, inputs=slider_vl, outputs=slider_vl, queue=False)

    inputs = [dropdown_1, dropdown_2, dropdown_3, dropdown_4, slider_vh, slider_vl]

    # Output order must match the tuple returned by apply_filters.
    button.click(
        fn=apply_filters,
        inputs=inputs,
        outputs=[df_out, combined_vh_vl_bar, year_bar, vh_fig, vl_fig, fasta_file, total_sequences],
    )

if __name__ == '__main__':
    demo.launch()
159
+
160
+
requirements.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ gradio==5.48.0
2
+ pandas>=1.5.0
3
+ plotly==5.25.0
4
+ huggingface_hub==0.29.3
5
+ numpy==1.26.4
utils.py ADDED
@@ -0,0 +1,414 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import plotly.express as px
3
+ import tempfile
4
+
5
def update_vh(vh_len):
    """Identity callback for the VH-length slider: return the value unchanged."""
    current_value = vh_len
    return current_value
7
def update_vl(vl_len):
    """Identity callback for the VL-length slider: return the value unchanged."""
    current_value = vl_len
    return current_value
9
+
10
+ #def make_fasta_file(df: pd.DataFrame):
11
+ # if df.empty:
12
+ # return None
13
+ # lines = []
14
+ # i = 1
15
+ # for _, row in df.iterrows():
16
+ # header = f">{i}_{row['vcall_VH']}|{row['Disease']}"
17
+ # lines.append(header)
18
+ # lines.append(row['VH'])
19
+ # header = f">{i}_{row['vcall_VL']}|{row['Disease']}"
20
+ # lines.append(header)
21
+ # lines.append(row['VL'])
22
+ # tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".fasta")
23
+ # tmp.write("\n".join(lines).encode())
24
+ # tmp.close()
25
+ # return tmp.name
26
+
27
def make_fasta_file(df: pd.DataFrame):
    """
    Write the paired sequences in ``df`` to a temporary FASTA file.

    Each row yields two records, in this order::

        >{i}_{vcall_VH}|{Disease}|VH
        {VH}
        >{i}_{vcall_VL}|{Disease}|VL
        {VL}

    where ``i`` is the 1-based row position. Headers are built with vectorized
    pandas string concatenation rather than a Python loop.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain the columns ``vcall_VH``, ``vcall_VL``, ``Disease``,
        ``VH`` and ``VL``.

    Returns
    -------
    str or None
        Path of the written ``.fasta`` file (not deleted automatically), or
        ``None`` when ``df`` is empty.
    """
    if df.empty:
        return None

    import numpy as np

    n_seqs = len(df)

    # IDs live in a Series that shares df's index, so every "+" below is pandas
    # string concatenation. Adding a Python str to a NumPy string array is not
    # supported by NumPy 1.x's add ufunc, so the ndarray form is avoided.
    seq_ids = pd.Series(np.arange(1, n_seqs + 1), index=df.index).astype(str)
    disease = df["Disease"].astype(str)

    vh_headers = ">" + seq_ids + "_" + df["vcall_VH"].astype(str) + "|" + disease + "|VH"
    vl_headers = ">" + seq_ids + "_" + df["vcall_VL"].astype(str) + "|" + disease + "|VL"

    # Interleave: header/sequence for VH, then header/sequence for VL, per row.
    fasta_content = np.empty(n_seqs * 4, dtype=object)
    fasta_content[0::4] = vh_headers.to_numpy()
    fasta_content[1::4] = df["VH"].astype(str).to_numpy()
    fasta_content[2::4] = vl_headers.to_numpy()
    fasta_content[3::4] = df["VL"].astype(str).to_numpy()

    # delete=False: the caller hands the path to the UI for download.
    # The context manager guarantees the handle is closed even if write fails.
    with tempfile.NamedTemporaryFile(mode="w", delete=False, suffix=".fasta", newline="") as tmp:
        tmp.write("\n".join(fasta_content))
    return tmp.name
57
+
58
+
59
def pie_vcall_vh(df: pd.DataFrame, total_raws: int, width: int = 500, height: int = 400) -> px.pie:
    """
    Pie chart with two unlabeled slices: rows in ``df`` vs. the rest.

    Parameters
    ----------
    df : pd.DataFrame
        Filtered dataframe; only its length is used.
    total_raws : int
        Total row count that ``len(df)`` is subtracted from.
    width, height : int
        Size of the resulting figure in pixels.
    """
    selected = len(df)
    fig = px.pie(values=[selected, total_raws - selected])
    fig.update_layout(width=width, height=height)
    return fig
68
+
69
def bar_vcall_vh(df: pd.DataFrame, total_rows: int, vh_germline: str,
                 width: int = 500, height: int = 250) -> px.bar:
    """
    Horizontal bar chart showing Selected vs Remaining counts for the VH germline.

    Parameters
    ----------
    df : pd.DataFrame
        Filtered dataframe from your query; only its length is used.
    total_rows : int
        Total number of rows in the full database.
    vh_germline : str
        Selected VH germline, used as the bar label. A falsy value is shown
        as "All Germlines".
    width, height : int
        Size of the resulting figure in pixels.
    """
    current_count = len(df)
    remaining = total_rows - current_count

    label_selected = vh_germline if vh_germline else "All Germlines"

    plot_df = pd.DataFrame({
        "Category": [label_selected, "Remaining"],
        "Count": [current_count, remaining],
    })

    fig = px.bar(
        plot_df,
        x="Count",
        y="Category",
        orientation="h",  # horizontal bars
        text="Count",     # show numbers on bars
        color="Category",
        # Key the map by the label actually plotted; a fixed "Selected Germline"
        # key never matches, so the selected bar would get a default colour.
        color_discrete_map={
            label_selected: "#3A7",   # greenish
            "Remaining": "#0000FF",   # blue
        },
    )

    fig.update_layout(
        width=width,
        height=height,
        showlegend=False,
        plot_bgcolor="white",
        xaxis_title="Number of Sequences",
    )

    return fig
115
+
116
def bar_vcall_vl(df: pd.DataFrame, total_rows: int, vl_germline: str,
                 width: int = 500, height: int = 250) -> px.bar:
    """
    Horizontal bar chart showing Selected vs Remaining counts for the VL germline.

    Parameters
    ----------
    df : pd.DataFrame
        Filtered dataframe from your query; only its length is used.
    total_rows : int
        Total number of rows in the full database.
    vl_germline : str
        Selected VL germline, used as the bar label. A falsy value is shown
        as "All Germlines".
    width, height : int
        Size of the resulting figure in pixels.
    """
    current_count = len(df)
    remaining = total_rows - current_count

    label_selected = vl_germline if vl_germline else "All Germlines"

    plot_df = pd.DataFrame({
        "Category": [label_selected, "Remaining"],
        "Count": [current_count, remaining],
    })

    fig = px.bar(
        plot_df,
        x="Count",
        y="Category",
        orientation="h",  # horizontal bars
        text="Count",     # show numbers on bars
        color="Category",
        # Key the map by the label actually plotted; a fixed "Selected Germline"
        # key never matches, so the selected bar would get a default colour.
        color_discrete_map={
            label_selected: "#3A7",   # greenish
            "Remaining": "#0000FF",   # blue
        },
    )

    fig.update_layout(
        width=width,
        height=height,
        showlegend=False,
        plot_bgcolor="white",
        xaxis_title="Number of Sequences",
    )

    return fig
163
+
164
def bar_disease_count(df: pd.DataFrame,
                      total_rows: int,
                      disease: str,
                      width: int = 500,
                      height: int = 250) -> px.bar:
    """
    Two-bar horizontal chart: rows matching the current filters vs. the rest.

    Parameters
    ----------
    df : pd.DataFrame
        Filtered dataframe; only its length is used.
    total_rows : int
        Total number of rows in the full database.
    disease : str
        Disease chosen in the UI, used as the first bar's label. A falsy
        value is shown as "All Diseases".
    width, height : int
        Size of the resulting figure.
    """
    label_selected = disease or "All Diseases"
    selected = len(df)

    counts = pd.DataFrame({
        "Category": [label_selected, "Remaining"],
        "Count": [selected, total_rows - selected],
    })

    # Red for the selection, gray for everything else.
    palette = {label_selected: "#d62728", "Remaining": "#999"}

    fig = px.bar(
        counts,
        x="Count",
        y="Category",
        orientation="h",
        color="Category",
        color_discrete_map=palette,
    )

    # The legend is hidden; the bar labels already carry the category names.
    layout = dict(width=width, height=height, showlegend=False, plot_bgcolor="white")
    fig.update_layout(**layout)

    return fig
212
+
213
def bar_btype_count(df: pd.DataFrame,
                    total_rows: int,
                    btype: str,
                    width: int = 500,
                    height: int = 250) -> px.bar:
    """
    Two-bar horizontal chart: rows matching the current filters vs. the rest.

    Parameters
    ----------
    df : pd.DataFrame
        Filtered dataframe; only its length is used.
    total_rows : int
        Total number of rows in the full database.
    btype : str
        B-cell type selected in the UI, used as the first bar's label. A falsy
        value is shown as "All B-Types".
    width, height : int
        Size of the figure in pixels.
    """
    label_selected = btype or "All B-Types"
    selected = len(df)

    counts = pd.DataFrame({
        "Category": [label_selected, "Remaining"],
        "Count": [selected, total_rows - selected],
    })

    # Blue for the selection, gray for everything else.
    palette = {label_selected: "#1f77b4", "Remaining": "#999"}

    fig = px.bar(
        counts,
        x="Count",
        y="Category",
        orientation="h",
        color="Category",
        color_discrete_map=palette,
    )

    layout = dict(width=width, height=height, showlegend=False, plot_bgcolor="white")
    fig.update_layout(**layout)

    return fig
261
+
262
def hist_vh_vl_separate(df: pd.DataFrame,
                        width: int = 500,
                        height: int = 250) -> tuple[px.histogram, px.histogram]:
    """
    Build two 40-bin histograms, one of ``VH_length`` and one of ``VL_length``.

    Parameters
    ----------
    df : pd.DataFrame
        Dataframe with ``VH_length`` and ``VL_length`` columns.
    width, height : int
        Size applied to both figures.

    Returns
    -------
    tuple
        ``(vh_fig, vl_fig)``.
    """

    def _length_histogram(column, bar_color):
        # Both figures share every setting except the column and the colour.
        fig = px.histogram(
            df,
            x=column,
            nbins=40,
            color_discrete_sequence=[bar_color],
            labels={"count": "Count"},
        )
        fig.update_layout(
            width=width,
            height=height,
            plot_bgcolor="white",
            yaxis_title="Count",
        )
        return fig

    vh_fig = _length_histogram("VH_length", "#ff5c77")  # VH colour
    vl_fig = _length_histogram("VL_length", "#00ffff")  # VL colour

    return vh_fig, vl_fig
294
+
295
def bar_vh_vl_combined(
    df: pd.DataFrame,
    total_rows: int,
    vh_germline: str | None,
    vl_germline: str | None,
    width: int = 500,
    height: int = 250
) -> px.bar:
    """
    Horizontal bar chart with three bars:
    1. Selected VH germline count
    2. Selected VL germline count
    3. Remaining = (2 * total_rows) - VH_count - VL_count

    Parameters
    ----------
    df : pd.DataFrame
        Filtered dataframe with ``vcall_VH`` and ``vcall_VL`` columns.
    total_rows : int
        Total number of rows in the full database.
    vh_germline, vl_germline : str or None
        Selected germlines. A falsy value counts every row of ``df`` and is
        labelled "All Germlines".
    width, height : int
        Size of the resulting figure in pixels.

    Notes
    -----
    If the VH and VL labels would be identical (e.g. neither germline is
    selected), " (VH)" / " (VL)" is appended. Identical category names would
    otherwise merge the two bars into one and collapse the colour map to a
    single entry.
    """
    # Count VH matches
    if vh_germline:
        vh_count = (df["vcall_VH"] == vh_germline).sum()
    else:
        vh_count = len(df)

    # Count VL matches
    if vl_germline:
        vl_count = (df["vcall_VL"] == vl_germline).sum()
    else:
        vl_count = len(df)

    remaining = (2 * total_rows) - (vh_count + vl_count)

    vh_label = vh_germline if vh_germline else "All Germlines"
    vl_label = vl_germline if vl_germline else "All Germlines"
    if vh_label == vl_label:
        # Keep the categories distinct so all three bars are drawn.
        vh_label, vl_label = f"{vh_label} (VH)", f"{vl_label} (VL)"

    plot_df = pd.DataFrame({
        "Category": [vh_label, vl_label, "Remaining"],
        "Count": [vh_count, vl_count, remaining],
    })

    fig = px.bar(
        plot_df,
        x="Count",
        y="Category",
        orientation="h",
        text="Count",
        color="Category",
        color_discrete_map={
            vh_label: "#3A7",
            vl_label: "#FF7F0E",
            "Remaining": "#0000FF",
        },
    )

    fig.update_layout(
        width=width,
        height=height,
        showlegend=False,
        plot_bgcolor="white",
        xaxis_title="Number of Sequences",
    )

    return fig
357
+
358
def bar_year_count(
    df: pd.DataFrame,
    width: int = 500,
    height: int = 250
) -> px.bar:
    """
    Horizontal bar chart of sequence counts per Year.

    Each bar shows twice the number of rows for that year (presumably because
    every row holds one VH and one VL sequence -- confirm).

    Parameters
    ----------
    df : pd.DataFrame
        DataFrame that includes a 'Year' column.
    width, height : int
        Size of the figure.

    Returns
    -------
    plotly.graph_objects.Figure

    Raises
    ------
    ValueError
        If ``df`` has no 'Year' column.
    """
    if "Year" not in df.columns:
        raise ValueError("DataFrame must contain a 'Year' column.")

    # Rows per year, ordered by year (ascending index), doubled.
    per_year = 2 * df["Year"].value_counts().sort_index()

    # Years become strings so they are plotted as discrete categories.
    plot_df = pd.DataFrame({
        'Year': per_year.index.astype(str),
        'Count': per_year.values,
    })

    fig = px.bar(
        plot_df,
        x='Count',
        y='Year',
        orientation="h",
        text='Count',
        color="Year",  # one colour per year
        color_discrete_sequence=px.colors.qualitative.Light24,
    )

    layout = dict(
        width=width,
        height=height,
        plot_bgcolor="white",
        paper_bgcolor="white",
        xaxis_title="Number of Sequences",
        yaxis_title="Year",
        showlegend=False,
    )
    fig.update_layout(**layout)

    # No grid lines on either axis.
    fig.update_xaxes(showgrid=False)
    fig.update_yaxes(showgrid=False)

    return fig
413
+
414
+