Spaces:
Sleeping
Sleeping
| import pandas as pd | |
| import plotly.express as px | |
| import tempfile | |
def update_vh(vh_len):
    """Return the given VH length value unchanged."""
    selected_vh_len = vh_len
    return selected_vh_len
def update_vl(vl_len):
    """Return the given VL length value unchanged."""
    selected_vl_len = vl_len
    return selected_vl_len
| #def make_fasta_file(df: pd.DataFrame): | |
| # if df.empty: | |
| # return None | |
| # lines = [] | |
| # i = 1 | |
| # for _, row in df.iterrows(): | |
| # header = f">{i}_{row['vcall_VH']}|{row['Disease']}" | |
| # lines.append(header) | |
| # lines.append(row['VH']) | |
| # header = f">{i}_{row['vcall_VL']}|{row['Disease']}" | |
| # lines.append(header) | |
| # lines.append(row['VL']) | |
| # tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".fasta") | |
| # tmp.write("\n".join(lines).encode()) | |
| # tmp.close() | |
| # return tmp.name | |
| def make_fasta_file(df: pd.DataFrame): | |
| """ | |
| Vectorized FASTA file creation - ~100x faster than loop-based approach. | |
| Optimized for large datasets (1M+ sequences). | |
| """ | |
| if df.empty: | |
| return None | |
| import numpy as np | |
| # Create sequence IDs as a vector | |
| n_seqs = len(df) | |
| seq_ids = np.arange(1, n_seqs + 1) | |
| # Vectorized header creation using string concatenation | |
| vh_headers = ">" + seq_ids.astype(str) + "_" + df['vcall_VH'].astype(str) + "|" + df['Disease'].astype(str) + "|VH" | |
| vl_headers = ">" + seq_ids.astype(str) + "_" + df['vcall_VL'].astype(str) + "|" + df['Disease'].astype(str) + "|VL" | |
| # Interleave headers and sequences using numpy array indexing | |
| fasta_content = np.empty((n_seqs * 4,), dtype=object) | |
| fasta_content[0::4] = vh_headers # VH headers at positions 0, 4, 8, ... | |
| fasta_content[1::4] = df['VH'].astype(str) # VH sequences at positions 1, 5, 9, ... | |
| fasta_content[2::4] = vl_headers # VL headers at positions 2, 6, 10, ... | |
| fasta_content[3::4] = df['VL'].astype(str) # VL sequences at positions 3, 7, 11, ... | |
| # Write to file in one operation (much faster than multiple writes) | |
| tmp = tempfile.NamedTemporaryFile(mode='w', delete=False, suffix=".fasta", newline='') | |
| tmp.write('\n'.join(fasta_content)) | |
| tmp.close() | |
| return tmp.name | |
def pie_vcall_vh(df: pd.DataFrame, total_raws: int, width: int = 500, height: int = 400) -> px.pie:
    """
    Pie chart with two unnamed slices: rows in ``df`` versus the rest.

    Parameters
    ----------
    df : pd.DataFrame
        Only its row count is used (first slice).
    total_raws : int
        Overall total; the second slice is ``total_raws - len(df)``.
    width, height : int
        Size of the resulting figure in pixels.
    """
    selected = len(df)
    slices = [selected, total_raws - selected]
    chart = px.pie(values=slices)
    chart.update_layout(width=width, height=height)
    return chart
def bar_vcall_vh(df: pd.DataFrame, total_rows: int, vh_germline: str,
                 width: int = 500, height: int = 250) -> px.bar:
    """
    Horizontal bar chart showing Selected vs Remaining counts.

    Parameters
    ----------
    df : pd.DataFrame
        Filtered dataframe from your query; only its row count is used.
    total_rows : int
        Total number of rows in the full database.
    vh_germline : str
        Label for the selected bar; "All Germlines" is used when empty.
    width, height : int
        Size of the resulting figure in pixels.

    Returns
    -------
    The figure produced by ``px.bar`` with two bars: the selected label
    (``len(df)``) and "Remaining" (``total_rows - len(df)``).
    """
    current_count = len(df)
    remaining = total_rows - current_count
    label_selected = vh_germline if vh_germline else "All Germlines"

    plot_df = pd.DataFrame({
        "Category": [label_selected, "Remaining"],
        "Count": [current_count, remaining],
    })

    fig = px.bar(
        plot_df,
        x="Count",
        y="Category",
        orientation="h",  # horizontal bars
        text="Count",     # show numbers on bars
        color="Category",
        # Key the map on the label that actually appears in the data; a
        # fixed "Selected Germline" key never matches, so the selected bar
        # would fall back to the default palette.
        color_discrete_map={
            label_selected: "#3A7",   # greenish
            "Remaining": "#0000FF",   # blue
        },
    )
    fig.update_layout(
        width=width,
        height=height,
        showlegend=False,
        plot_bgcolor="white",
        xaxis_title="Number of Sequences",
    )
    return fig
def bar_vcall_vl(df: pd.DataFrame, total_rows: int, vl_germline: str,
                 width: int = 500, height: int = 250) -> px.bar:
    """
    Horizontal bar chart showing Selected vs Remaining counts.

    Parameters
    ----------
    df : pd.DataFrame
        Filtered dataframe from your query; only its row count is used.
    total_rows : int
        Total number of rows in the full database.
    vl_germline : str
        Label for the selected bar; "All Germlines" is used when empty.
    width, height : int
        Size of the resulting figure in pixels.

    Returns
    -------
    The figure produced by ``px.bar`` with two bars: the selected label
    (``len(df)``) and "Remaining" (``total_rows - len(df)``).
    """
    current_count = len(df)
    remaining = total_rows - current_count
    label_selected = vl_germline if vl_germline else "All Germlines"

    plot_df = pd.DataFrame({
        "Category": [label_selected, "Remaining"],
        "Count": [current_count, remaining],
    })

    fig = px.bar(
        plot_df,
        x="Count",
        y="Category",
        orientation="h",  # horizontal bars
        text="Count",     # show numbers on bars
        color="Category",
        # Key the map on the label that actually appears in the data; a
        # fixed "Selected Germline" key never matches, so the selected bar
        # would fall back to the default palette.
        color_discrete_map={
            label_selected: "#3A7",   # greenish
            "Remaining": "#0000FF",   # blue
        },
    )
    fig.update_layout(
        width=width,
        height=height,
        showlegend=False,
        plot_bgcolor="white",
        xaxis_title="Number of Sequences",
    )
    return fig
def bar_disease_count(df: pd.DataFrame,
                      total_rows: int,
                      disease: str,
                      width: int = 500,
                      height: int = 250) -> px.bar:
    """
    Two-bar horizontal chart: rows matching the selected disease vs the rest.

    Parameters
    ----------
    df : pd.DataFrame
        Filtered dataframe; only its row count is used.
    total_rows : int
        Total number of rows in the full database.
    disease : str
        Label for the selected bar; "All Diseases" is used when empty.
    width, height : int
        Size of the resulting figure.
    """
    n_selected = len(df)
    selected_label = disease or "All Diseases"

    categories = [selected_label, "Remaining"]
    counts = [n_selected, total_rows - n_selected]
    chart_data = pd.DataFrame({"Category": categories, "Count": counts})

    # Selected bar in red, the rest in gray.
    palette = {selected_label: "#d62728", "Remaining": "#999"}
    chart = px.bar(
        chart_data,
        x="Count",
        y="Category",
        orientation="h",
        color="Category",
        color_discrete_map=palette,
    )

    # Hide the legend; the category axis already names each bar.
    layout = dict(width=width, height=height, showlegend=False, plot_bgcolor="white")
    chart.update_layout(**layout)
    return chart
def bar_btype_count(df: pd.DataFrame,
                    total_rows: int,
                    btype: str,
                    width: int = 500,
                    height: int = 250) -> px.bar:
    """
    Two-bar horizontal chart: rows of the selected B-cell type vs the rest.

    Parameters
    ----------
    df : pd.DataFrame
        Filtered dataframe; only its row count is used.
    total_rows : int
        Total number of rows in the full database.
    btype : str
        Label for the selected bar; "All B-Types" is used when empty.
    width, height : int
        Size of the figure in pixels.
    """
    n_selected = len(df)
    selected_label = btype or "All B-Types"

    categories = [selected_label, "Remaining"]
    counts = [n_selected, total_rows - n_selected]
    chart_data = pd.DataFrame({"Category": categories, "Count": counts})

    # Selected bar in blue, the rest in gray.
    palette = {selected_label: "#1f77b4", "Remaining": "#999"}
    chart = px.bar(
        chart_data,
        x="Count",
        y="Category",
        orientation="h",
        color="Category",
        color_discrete_map=palette,
    )

    layout = dict(width=width, height=height, showlegend=False, plot_bgcolor="white")
    chart.update_layout(**layout)
    return chart
def hist_vh_vl_separate(df: pd.DataFrame,
                        width: int = 500,
                        height: int = 250) -> tuple[px.histogram, px.histogram]:
    """
    Build one histogram per chain length column.

    Returns a ``(vh_fig, vl_fig)`` pair: the first plots ``VH_length``, the
    second ``VL_length``. Both use 40 bins, a white plot background and a
    "Count" y-axis title.
    """

    def _length_histogram(column: str, bar_color: str):
        # Same recipe for both chains; only the column and colour differ.
        figure = px.histogram(
            df,
            x=column,
            nbins=40,
            color_discrete_sequence=[bar_color],
            labels={"count": "Count"},
        )
        figure.update_layout(
            width=width,
            height=height,
            plot_bgcolor="white",
            yaxis_title="Count",
        )
        return figure

    vh_hist = _length_histogram("VH_length", "#ff5c77")  # pinkish red
    vl_hist = _length_histogram("VL_length", "#00ffff")  # cyan
    return vh_hist, vl_hist
def bar_vh_vl_combined(
    df: pd.DataFrame,
    total_rows: int,
    vh_germline: str | None,
    vl_germline: str | None,
    width: int = 500,
    height: int = 250
) -> px.bar:
    """
    Horizontal bar chart with three bars:
    1. Selected VH germline count
    2. Selected VL germline count
    3. Remaining = (2 * total_rows) - VH_count - VL_count

    Parameters
    ----------
    df : pd.DataFrame
        Filtered dataframe; must contain 'vcall_VH' / 'vcall_VL' when the
        corresponding germline is given.
    total_rows : int
        Total number of rows in the full database.
    vh_germline, vl_germline : str or None
        Germline to count in 'vcall_VH' / 'vcall_VL'. When empty, every row
        of ``df`` is counted for that chain.
    width, height : int
        Size of the resulting figure in pixels.
    """
    # Count matches per chain; with no germline selected every row counts.
    if vh_germline:
        vh_count = (df["vcall_VH"] == vh_germline).sum()
    else:
        vh_count = len(df)
    if vl_germline:
        vl_count = (df["vcall_VL"] == vl_germline).sum()
    else:
        vl_count = len(df)

    remaining = (2 * total_rows) - (vh_count + vl_count)

    # The two bars need distinct category labels. A shared label (both
    # falling back to the same "All Germlines" text) would collapse the
    # colour-map keys into one entry and merge both bars into one category.
    vh_label = vh_germline if vh_germline else "All VH Germlines"
    vl_label = vl_germline if vl_germline else "All VL Germlines"
    if vh_label == vl_label:
        vh_label, vl_label = f"{vh_label} (VH)", f"{vl_label} (VL)"

    plot_df = pd.DataFrame({
        "Category": [vh_label, vl_label, "Remaining"],
        "Count": [vh_count, vl_count, remaining],
    })

    fig = px.bar(
        plot_df,
        x="Count",
        y="Category",
        orientation="h",
        text="Count",
        color="Category",
        color_discrete_map={
            vh_label: "#3A7",       # green
            vl_label: "#FF7F0E",    # orange
            "Remaining": "#0000FF",  # blue
        },
    )
    fig.update_layout(
        width=width,
        height=height,
        showlegend=False,
        plot_bgcolor="white",
        xaxis_title="Number of Sequences",
    )
    return fig
def bar_year_count(
    df: pd.DataFrame,
    width: int = 500,
    height: int = 250
) -> px.bar:
    """
    Horizontal bar chart of sequence counts per Year.

    Parameters
    ----------
    df : pd.DataFrame
        DataFrame that includes a 'Year' column.
    width, height : int
        Size of the figure.

    Returns
    -------
    The figure produced by ``px.bar``, one bar per distinct year.

    Raises
    ------
    ValueError
        If ``df`` has no 'Year' column.
    """
    if "Year" not in df.columns:
        raise ValueError("DataFrame must contain a 'Year' column.")

    # Rows per year, ordered by year (ascending index order).
    rows_per_year = df["Year"].value_counts().sort_index()
    # Every row is counted twice -- presumably one VH plus one VL sequence
    # per row; NOTE(review): confirm against the data model.
    seqs_per_year = 2 * rows_per_year

    chart_data = pd.DataFrame({
        'Year': seqs_per_year.index.astype(str),
        'Count': seqs_per_year.values,
    })

    chart = px.bar(
        chart_data,
        x='Count',
        y='Year',
        orientation="h",
        text='Count',
        color="Year",  # one colour per year
        color_discrete_sequence=px.colors.qualitative.Light24,
    )

    layout = dict(
        width=width,
        height=height,
        plot_bgcolor="white",
        paper_bgcolor="white",
        xaxis_title="Number of Sequences",
        yaxis_title="Year",
        showlegend=False,
    )
    chart.update_layout(**layout)

    # Drop grid lines on both axes for a cleaner look.
    chart.update_xaxes(showgrid=False)
    chart.update_yaxes(showgrid=False)
    return chart