"""Plot and FASTA-export helpers for a table of paired VH/VL sequences.

The functions here read the columns ``VH``, ``VL``, ``vcall_VH``,
``vcall_VL``, ``Disease``, ``VH_length``, ``VL_length`` and ``Year`` from the
DataFrames they are given; each function documents the columns it needs.
"""

import tempfile

import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go

# Label of the "everything that was not selected" bar in the count charts.
_REMAINING_LABEL = "Remaining"


def update_vh(vh_len):
    """Return ``vh_len`` unchanged.

    NOTE(review): presumably a pass-through UI callback -- confirm with caller.
    """
    return vh_len


def update_vl(vl_len):
    """Return ``vl_len`` unchanged.

    NOTE(review): presumably a pass-through UI callback -- confirm with caller.
    """
    return vl_len


def make_fasta_file(df: pd.DataFrame) -> str | None:
    """
    Write the VH and VL sequences of ``df`` to a temporary FASTA file.

    Each row yields two records, in this order::

        >{i}_{vcall_VH}|{Disease}|VH
        {VH}
        >{i}_{vcall_VL}|{Disease}|VL
        {VL}

    where ``i`` is the 1-based position of the row in ``df``. Headers are
    built with vectorized string operations rather than a Python loop.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain the columns 'vcall_VH', 'vcall_VL', 'Disease', 'VH'
        and 'VL'. Values are converted with ``astype(str)``.

    Returns
    -------
    str or None
        Path of the written ``.fasta`` file, or ``None`` when ``df`` is
        empty. The file is created with ``delete=False``; the caller is
        responsible for removing it.
    """
    if df.empty:
        return None

    n_seqs = len(df)

    # Build the IDs as a pandas Series that shares df's index. Adding a plain
    # str to a NumPy unicode array (the previous approach) raises
    # UFuncTypeError on NumPy 1.x; pandas string concatenation works on every
    # supported version, and an identical index avoids any re-alignment.
    seq_ids = pd.Series(np.arange(1, n_seqs + 1), index=df.index).astype(str)
    disease = df["Disease"].astype(str)

    vh_headers = ">" + seq_ids + "_" + df["vcall_VH"].astype(str) + "|" + disease + "|VH"
    vl_headers = ">" + seq_ids + "_" + df["vcall_VL"].astype(str) + "|" + disease + "|VL"

    # Interleave header/sequence lines: every row occupies four slots.
    fasta_content = np.empty(n_seqs * 4, dtype=object)
    fasta_content[0::4] = vh_headers.to_numpy()              # VH headers
    fasta_content[1::4] = df["VH"].astype(str).to_numpy()    # VH sequences
    fasta_content[2::4] = vl_headers.to_numpy()              # VL headers
    fasta_content[3::4] = df["VL"].astype(str).to_numpy()    # VL sequences

    # The context manager closes the handle even if the write fails. UTF-8 is
    # pinned so the output does not depend on the machine's locale, and
    # newline="" keeps "\n" from being translated on Windows.
    with tempfile.NamedTemporaryFile(
        mode="w",
        delete=False,
        suffix=".fasta",
        newline="",
        encoding="utf-8",
    ) as tmp:
        tmp.write("\n".join(fasta_content))  # single write for the whole file
    return tmp.name


def pie_vcall_vh(df: pd.DataFrame, total_raws: int,
                 width: int = 500, height: int = 400) -> go.Figure:
    """
    Pie chart of selected versus remaining row counts.

    Parameters
    ----------
    df : pd.DataFrame
        Filtered dataframe; only its length is used.
    total_raws : int
        Total number of rows in the full database.
    width, height : int
        Size of the resulting figure in pixels.

    Returns
    -------
    plotly.graph_objects.Figure
        Two unlabeled slices: ``len(df)`` first, then
        ``total_raws - len(df)``.
    """
    current_count = len(df)
    remaining = total_raws - current_count

    fig = px.pie(values=[current_count, remaining])
    fig.update_layout(width=width, height=height)
    return fig


def _selected_vs_remaining_bar(
    df: pd.DataFrame,
    total_rows: int,
    label_selected: str,
    *,
    selected_color: str,
    remaining_color: str,
    width: int,
    height: int,
    show_counts: bool,
) -> go.Figure:
    """Build the two-bar horizontal "selected vs. remaining" chart.

    ``show_counts`` turns on both the numeric labels on the bars and the
    "Number of Sequences" x-axis title; when it is false the axis keeps the
    title that plotly express assigns by default.
    """
    current_count = len(df)
    plot_df = pd.DataFrame({
        "Category": [label_selected, _REMAINING_LABEL],
        "Count": [current_count, total_rows - current_count],
    })

    fig = px.bar(
        plot_df,
        x="Count",
        y="Category",
        orientation="h",
        text="Count" if show_counts else None,
        color="Category",
        # Keyed on the label actually present in plot_df so the colour applies.
        color_discrete_map={
            label_selected: selected_color,
            _REMAINING_LABEL: remaining_color,
        },
    )

    layout = {
        "width": width,
        "height": height,
        "showlegend": False,
        "plot_bgcolor": "white",
    }
    if show_counts:
        # Only set when requested: passing None would clear the default title.
        layout["xaxis_title"] = "Number of Sequences"
    fig.update_layout(**layout)
    return fig


def bar_vcall_vh(df: pd.DataFrame, total_rows: int, vh_germline: str,
                 width: int = 500, height: int = 250) -> go.Figure:
    """
    Horizontal bar chart showing Selected vs Remaining counts for VH.

    Parameters
    ----------
    df : pd.DataFrame
        Filtered dataframe from your query; only its length is used.
    total_rows : int
        Total number of rows in the full database.
    vh_germline : str
        Label of the selected bar; "All Germlines" is used when it is empty.
    width, height : int
        Size of the resulting figure in pixels.

    Returns
    -------
    plotly.graph_objects.Figure
    """
    return _selected_vs_remaining_bar(
        df,
        total_rows,
        vh_germline if vh_germline else "All Germlines",
        selected_color="#3A7",       # greenish
        remaining_color="#0000FF",   # blue
        width=width,
        height=height,
        show_counts=True,
    )


def bar_vcall_vl(df: pd.DataFrame, total_rows: int, vl_germline: str,
                 width: int = 500, height: int = 250) -> go.Figure:
    """
    Horizontal bar chart showing Selected vs Remaining counts for VL.

    Parameters
    ----------
    df : pd.DataFrame
        Filtered dataframe from your query; only its length is used.
    total_rows : int
        Total number of rows in the full database.
    vl_germline : str
        Label of the selected bar; "All Germlines" is used when it is empty.
    width, height : int
        Size of the resulting figure in pixels.

    Returns
    -------
    plotly.graph_objects.Figure
    """
    return _selected_vs_remaining_bar(
        df,
        total_rows,
        vl_germline if vl_germline else "All Germlines",
        selected_color="#3A7",       # greenish
        remaining_color="#0000FF",   # blue
        width=width,
        height=height,
        show_counts=True,
    )


def bar_disease_count(df: pd.DataFrame, total_rows: int, disease: str,
                      width: int = 500, height: int = 250) -> go.Figure:
    """
    Horizontal bar chart showing the count for the selected Disease
    versus all remaining rows in the database.

    Parameters
    ----------
    df : pd.DataFrame
        Filtered dataframe from your query (the rows matching filters);
        only its length is used.
    total_rows : int
        Total number of rows in the full database.
    disease : str
        Disease name chosen in the UI (e.g., "SARS-COV-2"); "All Diseases"
        is used when it is empty.
    width, height : int
        Size of the resulting figure.

    Returns
    -------
    plotly.graph_objects.Figure
    """
    return _selected_vs_remaining_bar(
        df,
        total_rows,
        disease if disease else "All Diseases",
        selected_color="#d62728",    # red
        remaining_color="#999",      # gray
        width=width,
        height=height,
        show_counts=False,           # no bar labels, default axis title
    )


def bar_btype_count(df: pd.DataFrame, total_rows: int, btype: str,
                    width: int = 500, height: int = 250) -> go.Figure:
    """
    Horizontal bar chart showing the count for the selected B-cell type
    versus the remaining rows in the database.

    Parameters
    ----------
    df : pd.DataFrame
        Filtered dataframe from your query (rows matching filters);
        only its length is used.
    total_rows : int
        Total number of rows in the full database.
    btype : str
        B-cell type selected in the UI (e.g., "Memory-B-Cells");
        "All B-Types" is used when it is empty.
    width, height : int
        Size of the figure in pixels.

    Returns
    -------
    plotly.graph_objects.Figure
    """
    return _selected_vs_remaining_bar(
        df,
        total_rows,
        btype if btype else "All B-Types",
        selected_color="#1f77b4",    # blue
        remaining_color="#999",      # gray
        width=width,
        height=height,
        show_counts=False,           # no bar labels, default axis title
    )


def hist_vh_vl_separate(df: pd.DataFrame,
                        width: int = 500,
                        height: int = 250) -> tuple[go.Figure, go.Figure]:
    """
    Return two separate histograms: one for VH_length, one for VL_length.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain the columns 'VH_length' and 'VL_length'.
    width, height : int
        Size of each figure in pixels.

    Returns
    -------
    tuple of plotly.graph_objects.Figure
        ``(vh_fig, vl_fig)``.
    """
    vh_fig = px.histogram(
        df,
        x="VH_length",
        nbins=40,
        color_discrete_sequence=["#ff5c77"],  # VH colour (pink-red)
        labels={"count": "Count"},
    )
    vh_fig.update_layout(
        width=width,
        height=height,
        plot_bgcolor="white",
        yaxis_title="Count",
    )

    vl_fig = px.histogram(
        df,
        x="VL_length",
        nbins=40,
        color_discrete_sequence=["#00ffff"],  # VL colour (cyan)
        labels={"count": "Count"},
    )
    vl_fig.update_layout(
        width=width,
        height=height,
        plot_bgcolor="white",
        yaxis_title="Count",
    )

    return vh_fig, vl_fig


def bar_vh_vl_combined(
    df: pd.DataFrame,
    total_rows: int,
    vh_germline: str | None,
    vl_germline: str | None,
    width: int = 500,
    height: int = 250
) -> go.Figure:
    """
    Horizontal bar chart with three bars:
      1. Selected VH germline count
      2. Selected VL germline count
      3. Remaining = (2 * total_rows) - VH_count - VL_count

    Parameters
    ----------
    df : pd.DataFrame
        Must contain 'vcall_VH' / 'vcall_VL' when the matching germline
        argument is given.
    total_rows : int
        Total number of rows in the full database.
    vh_germline, vl_germline : str or None
        Germline to count by exact match. When empty or None, ``len(df)``
        is used as the count and the bar is labelled "All Germlines".
    width, height : int
        Size of the figure in pixels.

    Returns
    -------
    plotly.graph_objects.Figure
    """
    # Count VH matches
    if vh_germline:
        vh_count = (df["vcall_VH"] == vh_germline).sum()
    else:
        vh_count = len(df)

    # Count VL matches
    if vl_germline:
        vl_count = (df["vcall_VL"] == vl_germline).sum()
    else:
        vl_count = len(df)

    # Each row contributes one VH and one VL, hence the factor of two.
    remaining = (2 * total_rows) - (vh_count + vl_count)

    vh_label = vh_germline if vh_germline else "All Germlines"
    vl_label = vl_germline if vl_germline else "All Germlines"
    if vh_label == vl_label:
        # Identical labels would merge both bars into a single category and
        # collapse the colour map to one entry, so tell them apart.
        vh_label = f"{vh_label} (VH)"
        vl_label = f"{vl_label} (VL)"

    plot_df = pd.DataFrame({
        "Category": [vh_label, vl_label, _REMAINING_LABEL],
        "Count": [vh_count, vl_count, remaining],
    })

    fig = px.bar(
        plot_df,
        x="Count",
        y="Category",
        orientation="h",
        text="Count",
        color="Category",
        color_discrete_map={
            vh_label: "#3A7",            # greenish
            vl_label: "#FF7F0E",         # orange
            _REMAINING_LABEL: "#0000FF", # blue
        },
    )

    fig.update_layout(
        width=width,
        height=height,
        showlegend=False,
        plot_bgcolor="white",
        xaxis_title="Number of Sequences",
    )
    return fig


def bar_year_count(
    df: pd.DataFrame,
    width: int = 500,
    height: int = 250
) -> go.Figure:
    """
    Horizontal bar chart of sequence counts per Year.

    Parameters
    ----------
    df : pd.DataFrame
        DataFrame that includes a 'Year' column.
    width, height : int
        Size of the figure.

    Returns
    -------
    plotly.graph_objects.Figure

    Raises
    ------
    ValueError
        If ``df`` has no 'Year' column.
    """
    if "Year" not in df.columns:
        raise ValueError("DataFrame must contain a 'Year' column.")

    # Rows per year, ordered by year (ascending index). The factor of two
    # presumably counts the VH and the VL sequence of each row -- TODO confirm.
    year_counts = 2 * df["Year"].value_counts().sort_index()

    # Years become strings so plotly treats them as categories.
    plot_df = pd.DataFrame({
        "Year": year_counts.index.astype(str),
        "Count": year_counts.values,
    })

    fig = px.bar(
        plot_df,
        x="Count",
        y="Year",
        orientation="h",
        text="Count",
        color="Year",  # one colour per year
        color_discrete_sequence=px.colors.qualitative.Light24,
    )

    fig.update_layout(
        width=width,
        height=height,
        plot_bgcolor="white",
        paper_bgcolor="white",
        xaxis_title="Number of Sequences",
        yaxis_title="Year",
        showlegend=False,
    )

    # Remove grid lines for a cleaner look
    fig.update_xaxes(showgrid=False)
    fig.update_yaxes(showgrid=False)

    return fig