# antibody-database / utils.py
# hemantn's picture
# build files added
# fc56c31
import pandas as pd
import plotly.express as px
import tempfile
def update_vh(vh_len):
    """Return ``vh_len`` unchanged (pass-through helper)."""
    return vh_len
def update_vl(vl_len):
    """Return ``vl_len`` unchanged (pass-through helper)."""
    return vl_len
#def make_fasta_file(df: pd.DataFrame):
# if df.empty:
# return None
# lines = []
# i = 1
# for _, row in df.iterrows():
# header = f">{i}_{row['vcall_VH']}|{row['Disease']}"
# lines.append(header)
# lines.append(row['VH'])
# header = f">{i}_{row['vcall_VL']}|{row['Disease']}"
# lines.append(header)
# lines.append(row['VL'])
# tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".fasta")
# tmp.write("\n".join(lines).encode())
# tmp.close()
# return tmp.name
def make_fasta_file(df: pd.DataFrame):
    """
    Write the paired VH/VL sequences of ``df`` to a temporary FASTA file.

    Every row produces four lines, in row order::

        >{i}_{vcall_VH}|{Disease}|VH
        {VH}
        >{i}_{vcall_VL}|{Disease}|VL
        {VL}

    where ``i`` is the 1-based position of the row (not its index label).
    Headers are built with vectorized string operations rather than a
    per-row Python loop.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain the columns ``vcall_VH``, ``vcall_VL``, ``Disease``,
        ``VH`` and ``VL``. All values are converted with ``astype(str)``.

    Returns
    -------
    str or None
        Path of the written ``.fasta`` file, or ``None`` when ``df`` is
        empty. The file is created with ``delete=False``, so removing it
        is left to the caller.

    Raises
    ------
    KeyError
        If one of the required columns is missing.
    """
    if df.empty:
        return None

    import numpy as np

    n_seqs = len(df)

    # 1-based record numbers as a string Series sharing df's index, so every
    # concatenation below is an element-wise pandas string operation.
    # (Adding a Python str to a NumPy unicode array, e.g.
    # ``">" + arr.astype(str)``, raises a ufunc TypeError on NumPy < 2.0.)
    seq_ids = pd.Series(np.arange(1, n_seqs + 1), index=df.index).astype(str)
    disease = df["Disease"].astype(str)
    prefix = ">" + seq_ids + "_"
    vh_headers = prefix + df["vcall_VH"].astype(str) + "|" + disease + "|VH"
    vl_headers = prefix + df["vcall_VL"].astype(str) + "|" + disease + "|VL"

    # Interleave the four lines of each record with strided assignment.
    fasta_content = np.empty((n_seqs * 4,), dtype=object)
    fasta_content[0::4] = vh_headers.to_numpy()            # VH headers: 0, 4, 8, ...
    fasta_content[1::4] = df["VH"].astype(str).to_numpy()  # VH sequences: 1, 5, 9, ...
    fasta_content[2::4] = vl_headers.to_numpy()            # VL headers: 2, 6, 10, ...
    fasta_content[3::4] = df["VL"].astype(str).to_numpy()  # VL sequences: 3, 7, 11, ...

    # The context manager closes the handle even if the write fails; the
    # encoding is pinned so output does not depend on the platform default.
    # newline='' keeps the "\n" separators untranslated.
    with tempfile.NamedTemporaryFile(
        mode="w", delete=False, suffix=".fasta", newline="", encoding="utf-8"
    ) as tmp:
        tmp.write("\n".join(fasta_content))
    return tmp.name
def pie_vcall_vh(df: pd.DataFrame, total_raws: int, width: int = 500, height: int = 400) -> px.pie:
    """
    Pie chart of the filtered row count versus the rest of the database.

    Parameters
    ----------
    df : pd.DataFrame
        Filtered dataframe; only its length is used.
    total_raws : int
        Total row count the selection is compared against.
    width, height : int
        Size of the resulting figure in pixels.

    Notes
    -----
    Two slices are drawn, ``len(df)`` and ``total_raws - len(df)``.
    No ``names`` are passed to ``px.pie``, so the slices carry no custom
    labels.
    """
    selected = len(df)
    slices = [selected, total_raws - selected]
    fig = px.pie(values=slices)
    fig.update_layout(width=width, height=height)
    return fig
def bar_vcall_vh(df: pd.DataFrame, total_rows: int, vh_germline: str,
                 width: int = 500, height: int = 250) -> px.bar:
    """
    Horizontal bar chart showing Selected vs Remaining counts.

    Parameters
    ----------
    df : pd.DataFrame
        Filtered dataframe from your query; only its length is used.
    total_rows : int
        Total number of rows in the full database.
    vh_germline : str
        Label for the selected bar. When empty/None the bar is labelled
        "All Germlines".
    width, height : int
        Size of the resulting figure in pixels.

    Returns
    -------
    The figure produced by ``px.bar`` with two bars: the selected label
    (``len(df)``) and "Remaining" (``total_rows - len(df)``).
    """
    current_count = len(df)
    remaining = total_rows - current_count
    label_selected = vh_germline if vh_germline else "All Germlines"

    plot_df = pd.DataFrame({
        "Category": [label_selected, "Remaining"],
        "Count": [current_count, remaining]
    })

    fig = px.bar(
        plot_df,
        x="Count",
        y="Category",
        orientation="h",   # horizontal bars
        text="Count",      # show numbers on bars
        color="Category",
        # The map must be keyed by the label that actually appears in the
        # "Category" column; a fixed "Selected Germline" key never matches
        # and the intended colour would silently not be applied.
        color_discrete_map={
            label_selected: "#3A7",   # greenish
            "Remaining": "#0000FF"    # blue
        }
    )
    fig.update_layout(
        width=width,
        height=height,
        showlegend=False,
        plot_bgcolor="white",
        xaxis_title="Number of Sequences",
    )
    return fig
def bar_vcall_vl(df: pd.DataFrame, total_rows: int, vl_germline: str,
                 width: int = 500, height: int = 250) -> px.bar:
    """
    Horizontal bar chart showing Selected vs Remaining counts.

    Parameters
    ----------
    df : pd.DataFrame
        Filtered dataframe from your query; only its length is used.
    total_rows : int
        Total number of rows in the full database.
    vl_germline : str
        Label for the selected bar. When empty/None the bar is labelled
        "All Germlines".
    width, height : int
        Size of the resulting figure in pixels.

    Returns
    -------
    The figure produced by ``px.bar`` with two bars: the selected label
    (``len(df)``) and "Remaining" (``total_rows - len(df)``).
    """
    current_count = len(df)
    remaining = total_rows - current_count
    label_selected = vl_germline if vl_germline else "All Germlines"

    plot_df = pd.DataFrame({
        "Category": [label_selected, "Remaining"],
        "Count": [current_count, remaining]
    })

    fig = px.bar(
        plot_df,
        x="Count",
        y="Category",
        orientation="h",   # horizontal bars
        text="Count",      # show numbers on bars
        color="Category",
        # Key the map by the label actually present in "Category"; the
        # previous fixed "Selected Germline" key never matched any row, so
        # the intended colour was never applied.
        color_discrete_map={
            label_selected: "#3A7",   # greenish
            "Remaining": "#0000FF"    # blue
        }
    )
    fig.update_layout(
        width=width,
        height=height,
        showlegend=False,
        plot_bgcolor="white",
        xaxis_title="Number of Sequences",
    )
    return fig
def bar_disease_count(df: pd.DataFrame,
                      total_rows: int,
                      disease: str,
                      width: int = 500,
                      height: int = 250) -> px.bar:
    """
    Two-bar horizontal chart: rows matching the chosen Disease against
    every other row of the database.

    Parameters
    ----------
    df : pd.DataFrame
        Rows that passed the current filters; only ``len(df)`` is used.
    total_rows : int
        Row count of the full database.
    disease : str
        Disease name used as the label of the selected bar. An empty/None
        value is shown as "All Diseases".
    width, height : int
        Figure size.
    """
    n_selected = len(df)
    selected_label = disease or "All Diseases"

    categories = [selected_label, "Remaining"]
    counts = [n_selected, total_rows - n_selected]
    chart_data = pd.DataFrame({"Category": categories, "Count": counts})

    # Selected bar in red, the remainder in gray.
    palette = {selected_label: "#d62728", "Remaining": "#999"}

    fig = px.bar(
        chart_data,
        x="Count",
        y="Category",
        orientation="h",
        color="Category",
        color_discrete_map=palette,
    )

    # Legend hidden and white plot background for a clean look.
    layout = {
        "width": width,
        "height": height,
        "showlegend": False,
        "plot_bgcolor": "white",
    }
    fig.update_layout(**layout)
    return fig
def bar_btype_count(df: pd.DataFrame,
                    total_rows: int,
                    btype: str,
                    width: int = 500,
                    height: int = 250) -> px.bar:
    """
    Two-bar horizontal chart: rows matching the chosen B-cell type against
    every other row of the database.

    Parameters
    ----------
    df : pd.DataFrame
        Rows that passed the current filters; only ``len(df)`` is used.
    total_rows : int
        Row count of the full database.
    btype : str
        B-cell type used as the label of the selected bar. An empty/None
        value is shown as "All B-Types".
    width, height : int
        Figure size in pixels.
    """
    n_selected = len(df)
    selected_label = btype or "All B-Types"

    categories = [selected_label, "Remaining"]
    counts = [n_selected, total_rows - n_selected]
    chart_data = pd.DataFrame({"Category": categories, "Count": counts})

    # Selected bar in blue, the remainder in gray.
    palette = {selected_label: "#1f77b4", "Remaining": "#999"}

    fig = px.bar(
        chart_data,
        x="Count",
        y="Category",
        orientation="h",
        color="Category",
        color_discrete_map=palette,
    )

    layout = {
        "width": width,
        "height": height,
        "showlegend": False,
        "plot_bgcolor": "white",
    }
    fig.update_layout(**layout)
    return fig
def hist_vh_vl_separate(df: pd.DataFrame,
                        width: int = 500,
                        height: int = 250) -> tuple[px.histogram, px.histogram]:
    """
    Build one 40-bin histogram per chain length column.

    Parameters
    ----------
    df : pd.DataFrame
        Must provide the ``VH_length`` and ``VL_length`` columns.
    width, height : int
        Size applied to both figures.

    Returns
    -------
    tuple
        ``(vh_fig, vl_fig)`` - the ``VH_length`` histogram first, then the
        ``VL_length`` histogram.
    """
    def _length_histogram(column: str, bar_color: str):
        # Same binning, labels and layout for both chains; only the column
        # and the bar colour differ.
        fig = px.histogram(
            df,
            x=column,
            nbins=40,
            color_discrete_sequence=[bar_color],
            labels={"count": "Count"},
        )
        fig.update_layout(
            width=width,
            height=height,
            plot_bgcolor="white",
            yaxis_title="Count",
        )
        return fig

    vh_fig = _length_histogram("VH_length", "#ff5c77")  # pinkish red
    vl_fig = _length_histogram("VL_length", "#00ffff")  # cyan
    return vh_fig, vl_fig
def bar_vh_vl_combined(
    df: pd.DataFrame,
    total_rows: int,
    vh_germline: str | None,
    vl_germline: str | None,
    width: int = 500,
    height: int = 250
) -> px.bar:
    """
    Horizontal bar chart with three bars:
    1. Selected VH germline count
    2. Selected VL germline count
    3. Remaining = (2 * total_rows) - VH_count - VL_count

    Parameters
    ----------
    df : pd.DataFrame
        Filtered dataframe; must contain ``vcall_VH`` / ``vcall_VL`` when
        the corresponding germline argument is given.
    total_rows : int
        Total number of rows in the full database.
    vh_germline, vl_germline : str or None
        Germline names to count by exact match. When empty/None, the
        count falls back to ``len(df)`` and the bar is labelled
        "All Germlines".
    width, height : int
        Size of the resulting figure in pixels.

    Notes
    -----
    If both bars would receive the same label (e.g. both germlines
    unset), the labels are suffixed with " (VH)" / " (VL)". Otherwise the
    two rows would share one category - collapsing into a single bar
    position - and the duplicated colour-map key would drop the VH colour.
    """
    # Count VH matches
    if vh_germline:
        vh_count = (df["vcall_VH"] == vh_germline).sum()
    else:
        vh_count = len(df)

    # Count VL matches
    if vl_germline:
        vl_count = (df["vcall_VL"] == vl_germline).sum()
    else:
        vl_count = len(df)

    # Remaining sequences = 2 * total_rows - VH_count - VL_count
    remaining = (2 * total_rows) - (vh_count + vl_count)

    vh_label = vh_germline if vh_germline else "All Germlines"
    vl_label = vl_germline if vl_germline else "All Germlines"
    if vh_label == vl_label:
        # Keep the two bars (and their colours) distinguishable.
        vh_label = f"{vh_label} (VH)"
        vl_label = f"{vl_label} (VL)"

    plot_df = pd.DataFrame({
        "Category": [vh_label, vl_label, "Remaining"],
        "Count": [vh_count, vl_count, remaining]
    })

    fig = px.bar(
        plot_df,
        x="Count",
        y="Category",
        orientation="h",
        text="Count",
        color="Category",
        color_discrete_map={
            vh_label: "#3A7",
            vl_label: "#FF7F0E",
            "Remaining": "#0000FF"
        }
    )
    fig.update_layout(
        width=width,
        height=height,
        showlegend=False,
        plot_bgcolor="white",
        xaxis_title="Number of Sequences",
    )
    return fig
def bar_year_count(
    df: pd.DataFrame,
    width: int = 500,
    height: int = 250
) -> px.bar:
    """
    Horizontal bar chart of per-Year counts.

    Parameters
    ----------
    df : pd.DataFrame
        DataFrame that includes a 'Year' column.
    width, height : int
        Size of the figure.

    Returns
    -------
    plotly.graph_objects.Figure

    Raises
    ------
    ValueError
        If ``df`` has no 'Year' column.

    Notes
    -----
    The plotted value for each year is twice the number of rows with that
    year (presumably one VH and one VL sequence per row - confirm with the
    data model). Years are ordered by ``sort_index()``, i.e. by year value
    rather than by count.
    """
    if "Year" not in df.columns:
        raise ValueError("DataFrame must contain a 'Year' column.")

    # Rows per year, ordered by year, then doubled.
    rows_per_year = df["Year"].value_counts().sort_index()
    doubled = 2 * rows_per_year

    # Years become strings so they can serve as the categorical colour key.
    chart_data = pd.DataFrame({
        'Year': doubled.index.astype(str),
        'Count': doubled.values
    })

    fig = px.bar(
        chart_data,
        x='Count',
        y='Year',
        orientation="h",
        text='Count',
        color="Year",
        color_discrete_sequence=px.colors.qualitative.Light24,
    )

    layout = {
        "width": width,
        "height": height,
        "plot_bgcolor": "white",
        "paper_bgcolor": "white",
        "xaxis_title": "Number of Sequences",
        "yaxis_title": "Year",
        "showlegend": False,
    }
    fig.update_layout(**layout)

    # No grid lines on either axis.
    fig.update_xaxes(showgrid=False)
    fig.update_yaxes(showgrid=False)
    return fig