Spaces:

hemantn
/

antibody-database

Sleeping

App Files Files Community

antibody-database / utils.py

hemantn

build files added

fc56c31 3 months ago

raw

history blame contribute delete

11.7 kB

	import pandas as pd
	import plotly.express as px
	import tempfile

	def update_vh(vh_len):
	    """Return ``vh_len`` unchanged (identity pass-through)."""
	    return vh_len
	def update_vl(vl_len):
	    """Return ``vl_len`` unchanged (identity pass-through)."""
	    return vl_len

	#def make_fasta_file(df: pd.DataFrame):
	# if df.empty:
	# return None
	# lines = []
	# i = 1
	# for _, row in df.iterrows():
	# header = f">{i}_{row['vcall_VH']}|{row['Disease']}"
	# lines.append(header)
	# lines.append(row['VH'])
	# header = f">{i}_{row['vcall_VL']}|{row['Disease']}"
	# lines.append(header)
	# lines.append(row['VL'])
	# tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".fasta")
	# tmp.write("\n".join(lines).encode())
	# tmp.close()
	# return tmp.name

	def make_fasta_file(df: pd.DataFrame):
	"""
	Vectorized FASTA file creation - ~100x faster than loop-based approach.
	Optimized for large datasets (1M+ sequences).
	"""
	if df.empty:
	return None

	import numpy as np

	# Create sequence IDs as a vector
	n_seqs = len(df)
	seq_ids = np.arange(1, n_seqs + 1)

	# Vectorized header creation using string concatenation
	vh_headers = ">" + seq_ids.astype(str) + "_" + df['vcall_VH'].astype(str) + "\|" + df['Disease'].astype(str) + "\|VH"
	vl_headers = ">" + seq_ids.astype(str) + "_" + df['vcall_VL'].astype(str) + "\|" + df['Disease'].astype(str) + "\|VL"

	# Interleave headers and sequences using numpy array indexing
	fasta_content = np.empty((n_seqs * 4,), dtype=object)
	fasta_content[0::4] = vh_headers # VH headers at positions 0, 4, 8, ...
	fasta_content[1::4] = df['VH'].astype(str) # VH sequences at positions 1, 5, 9, ...
	fasta_content[2::4] = vl_headers # VL headers at positions 2, 6, 10, ...
	fasta_content[3::4] = df['VL'].astype(str) # VL sequences at positions 3, 7, 11, ...

	# Write to file in one operation (much faster than multiple writes)
	tmp = tempfile.NamedTemporaryFile(mode='w', delete=False, suffix=".fasta", newline='')
	tmp.write('\n'.join(fasta_content))
	tmp.close()
	return tmp.name


	def pie_vcall_vh(df: pd.DataFrame, total_raws: int, width: int = 500, height: int = 400) -> px.pie:
	    """
	    Two-slice pie chart: rows in ``df`` versus the rest of ``total_raws``.

	    Parameters
	    ----------
	    df : pd.DataFrame
	        Only its row count is used (first slice).
	    total_raws : int
	        Overall row count; the second slice is ``total_raws - len(df)``.
	    width, height : int
	        Size applied to the figure layout.

	    Returns
	    -------
	    The figure produced by ``px.pie`` (slices are not given names).
	    """
	    selected = len(df)
	    chart = px.pie(values=[selected, total_raws - selected])
	    chart.update_layout(width=width, height=height)
	    return chart

	def bar_vcall_vh(df: pd.DataFrame, total_rows: int, vh_germline: str,
	                 width: int = 500, height: int = 250) -> px.bar:
	    """
	    Horizontal bar chart showing Selected vs Remaining counts.

	    Parameters
	    ----------
	    df : pd.DataFrame
	        Filtered dataframe from your query; only its row count is used.
	    total_rows : int
	        Total number of rows in the full database.
	    vh_germline : str
	        Label for the selected bar. A falsy value (``None`` or ``""``)
	        is shown as "All Germlines".
	    width, height : int
	        Size of the resulting figure in pixels.

	    Returns
	    -------
	    The figure produced by ``px.bar``.
	    """
	    current_count = len(df)
	    remaining = total_rows - current_count

	    label_selected = vh_germline if vh_germline else "All Germlines"

	    plot_df = pd.DataFrame({
	        "Category": [label_selected, "Remaining"],
	        "Count": [current_count, remaining],
	    })

	    fig = px.bar(
	        plot_df,
	        x="Count",
	        y="Category",
	        orientation="h",  # horizontal bars
	        text="Count",  # show numbers on bars
	        color="Category",
	        # Keys must equal the values in the "Category" column; a fixed
	        # placeholder key would never match, and the colour would be ignored.
	        color_discrete_map={
	            label_selected: "#3A7",  # greenish
	            "Remaining": "#0000FF",  # blue
	        },
	    )

	    fig.update_layout(
	        width=width,
	        height=height,
	        showlegend=False,
	        plot_bgcolor="white",
	        xaxis_title="Number of Sequences",
	    )

	    return fig

	def bar_vcall_vl(df: pd.DataFrame, total_rows: int, vl_germline: str,
	                 width: int = 500, height: int = 250) -> px.bar:
	    """
	    Horizontal bar chart showing Selected vs Remaining counts.

	    Parameters
	    ----------
	    df : pd.DataFrame
	        Filtered dataframe from your query; only its row count is used.
	    total_rows : int
	        Total number of rows in the full database.
	    vl_germline : str
	        Label for the selected bar. A falsy value (``None`` or ``""``)
	        is shown as "All Germlines".
	    width, height : int
	        Size of the resulting figure in pixels.

	    Returns
	    -------
	    The figure produced by ``px.bar``.
	    """
	    current_count = len(df)
	    remaining = total_rows - current_count

	    label_selected = vl_germline if vl_germline else "All Germlines"

	    plot_df = pd.DataFrame({
	        "Category": [label_selected, "Remaining"],
	        "Count": [current_count, remaining],
	    })

	    fig = px.bar(
	        plot_df,
	        x="Count",
	        y="Category",
	        orientation="h",  # horizontal bars
	        text="Count",  # show numbers on bars
	        color="Category",
	        # Keys must equal the values in the "Category" column; a fixed
	        # placeholder key would never match, and the colour would be ignored.
	        color_discrete_map={
	            label_selected: "#3A7",  # greenish
	            "Remaining": "#0000FF",  # blue
	        },
	    )

	    fig.update_layout(
	        width=width,
	        height=height,
	        showlegend=False,
	        plot_bgcolor="white",
	        xaxis_title="Number of Sequences",
	    )

	    return fig

	def bar_disease_count(df: pd.DataFrame,
	                      total_rows: int,
	                      disease: str,
	                      width: int = 500,
	                      height: int = 250) -> px.bar:
	    """
	    Two-bar horizontal chart: rows in ``df`` versus all other rows.

	    Parameters
	    ----------
	    df : pd.DataFrame
	        Filtered dataframe; only its row count is used.
	    total_rows : int
	        Total number of rows in the full database.
	    disease : str
	        Label for the first bar; a falsy value is shown as "All Diseases".
	    width, height : int
	        Size of the resulting figure.

	    Returns
	    -------
	    The figure produced by ``px.bar``.
	    """
	    n_selected = len(df)
	    selected_label = disease or "All Diseases"

	    categories = [selected_label, "Remaining"]
	    counts = [n_selected, total_rows - n_selected]
	    chart_data = pd.DataFrame({"Category": categories, "Count": counts})

	    # Selected bar in red, remainder in gray.
	    palette = {selected_label: "#d62728", "Remaining": "#999"}

	    chart = px.bar(
	        chart_data,
	        x="Count",
	        y="Category",
	        orientation="h",
	        color="Category",
	        color_discrete_map=palette,
	    )

	    # Fixed size, white plot area, legend hidden.
	    layout = dict(width=width, height=height, showlegend=False, plot_bgcolor="white")
	    chart.update_layout(**layout)
	    return chart

	def bar_btype_count(df: pd.DataFrame,
	                    total_rows: int,
	                    btype: str,
	                    width: int = 500,
	                    height: int = 250) -> px.bar:
	    """
	    Two-bar horizontal chart: rows in ``df`` versus all other rows.

	    Parameters
	    ----------
	    df : pd.DataFrame
	        Filtered dataframe; only its row count is used.
	    total_rows : int
	        Total number of rows in the full database.
	    btype : str
	        Label for the first bar; a falsy value is shown as "All B-Types".
	    width, height : int
	        Size of the figure in pixels.

	    Returns
	    -------
	    The figure produced by ``px.bar``.
	    """
	    n_selected = len(df)
	    selected_label = btype or "All B-Types"

	    categories = [selected_label, "Remaining"]
	    counts = [n_selected, total_rows - n_selected]
	    chart_data = pd.DataFrame({"Category": categories, "Count": counts})

	    # Selected bar in blue, remainder in gray.
	    palette = {selected_label: "#1f77b4", "Remaining": "#999"}

	    chart = px.bar(
	        chart_data,
	        x="Count",
	        y="Category",
	        orientation="h",
	        color="Category",
	        color_discrete_map=palette,
	    )

	    layout = dict(width=width, height=height, showlegend=False, plot_bgcolor="white")
	    chart.update_layout(**layout)
	    return chart

	def hist_vh_vl_separate(df: pd.DataFrame,
	                        width: int = 500,
	                        height: int = 250) -> tuple[px.histogram, px.histogram]:
	    """
	    Build one 40-bin histogram per length column of ``df``.

	    Parameters
	    ----------
	    df : pd.DataFrame
	        Must contain the columns 'VH_length' and 'VL_length'.
	    width, height : int
	        Size applied to both figures.

	    Returns
	    -------
	    tuple
	        ``(vh_fig, vl_fig)``: the 'VH_length' histogram first, then the
	        'VL_length' histogram.
	    """

	    def _length_histogram(column, bar_color):
	        # Both figures share every setting except the column and the colour.
	        chart = px.histogram(
	            df,
	            x=column,
	            nbins=40,
	            color_discrete_sequence=[bar_color],
	            labels={"count": "Count"},
	        )
	        chart.update_layout(
	            width=width,
	            height=height,
	            plot_bgcolor="white",
	            yaxis_title="Count",
	        )
	        return chart

	    vh_fig = _length_histogram("VH_length", "#ff5c77")  # pinkish red
	    vl_fig = _length_histogram("VL_length", "#00ffff")  # cyan
	    return vh_fig, vl_fig

	def bar_vh_vl_combined(
	    df: pd.DataFrame,
	    total_rows: int,
	    vh_germline: str | None,
	    vl_germline: str | None,
	    width: int = 500,
	    height: int = 250
	) -> px.bar:
	    """
	    Horizontal bar chart with three bars:
	    1. Selected VH germline count
	    2. Selected VL germline count
	    3. Remaining = (2 * total_rows) - VH_count - VL_count

	    Parameters
	    ----------
	    df : pd.DataFrame
	        Filtered dataframe. The 'vcall_VH' / 'vcall_VL' columns are read
	        only when the corresponding germline argument is set.
	    total_rows : int
	        Total number of rows in the full database.
	    vh_germline, vl_germline : str or None
	        Germline to count in 'vcall_VH' / 'vcall_VL'. When falsy, the bar
	        counts every row of ``df`` and is labelled "All Germlines".
	    width, height : int
	        Size of the resulting figure in pixels.

	    Returns
	    -------
	    The figure produced by ``px.bar``.
	    """
	    # Count VH matches (all rows when no VH germline is selected)
	    if vh_germline:
	        vh_count = (df["vcall_VH"] == vh_germline).sum()
	    else:
	        vh_count = len(df)

	    # Count VL matches (all rows when no VL germline is selected)
	    if vl_germline:
	        vl_count = (df["vcall_VL"] == vl_germline).sum()
	    else:
	        vl_count = len(df)

	    # Remaining sequences = 2 * total_rows - VH_count - VL_count
	    remaining = (2 * total_rows) - (vh_count + vl_count)

	    vh_label = vh_germline if vh_germline else "All Germlines"
	    vl_label = vl_germline if vl_germline else "All Germlines"
	    if vh_label == vl_label:
	        # Identical labels would collapse into a single category and a
	        # single colour-map key, so tag each bar with its chain.
	        vh_label = f"{vh_label} (VH)"
	        vl_label = f"{vl_label} (VL)"

	    plot_df = pd.DataFrame({
	        "Category": [vh_label, vl_label, "Remaining"],
	        "Count": [vh_count, vl_count, remaining],
	    })

	    fig = px.bar(
	        plot_df,
	        x="Count",
	        y="Category",
	        orientation="h",
	        text="Count",
	        color="Category",
	        color_discrete_map={
	            vh_label: "#3A7",
	            vl_label: "#FF7F0E",
	            "Remaining": "#0000FF",
	        },
	    )

	    fig.update_layout(
	        width=width,
	        height=height,
	        showlegend=False,
	        plot_bgcolor="white",
	        xaxis_title="Number of Sequences",
	    )

	    return fig

	def bar_year_count(
	    df: pd.DataFrame,
	    width: int = 500,
	    height: int = 250
	) -> px.bar:
	    """
	    Horizontal bar chart with one bar per distinct value of ``df['Year']``.

	    Parameters
	    ----------
	    df : pd.DataFrame
	        DataFrame that includes a 'Year' column.
	    width, height : int
	        Size of the figure.

	    Returns
	    -------
	    The figure produced by ``px.bar``.

	    Raises
	    ------
	    ValueError
	        If ``df`` has no 'Year' column.
	    """
	    if "Year" not in df.columns:
	        raise ValueError("DataFrame must contain a 'Year' column.")

	    # Rows per year, ordered by year (ascending index order).
	    rows_per_year = df["Year"].value_counts().sort_index()
	    # Each count is doubled -- presumably because every row carries both a
	    # VH and a VL sequence; TODO confirm with the data owner.
	    year_counts = 2 * rows_per_year

	    year_labels = year_counts.index.astype(str)
	    plot_df = pd.DataFrame({"Year": year_labels, "Count": year_counts.values})

	    fig = px.bar(
	        plot_df,
	        x="Count",
	        y="Year",
	        orientation="h",
	        text="Count",
	        color="Year",  # one colour per year label
	        color_discrete_sequence=px.colors.qualitative.Light24,
	    )

	    layout = dict(
	        width=width,
	        height=height,
	        plot_bgcolor="white",
	        paper_bgcolor="white",
	        xaxis_title="Number of Sequences",
	        yaxis_title="Year",
	        showlegend=False,
	    )
	    fig.update_layout(**layout)

	    # Hide grid lines on both axes.
	    fig.update_xaxes(showgrid=False)
	    fig.update_yaxes(showgrid=False)

	    return fig