Spaces:

IBIBoW
/

Sp_cpsB_serotyping

Running

App Files Files Community

Sp_cpsB_serotyping / app.py

Muhamed-Kheir

Upload 5 files

21ce98d verified 5 months ago

raw

history blame contribute delete

3.84 kB

	import streamlit as st
	import pandas as pd
	import math
	from user_script import (
	read_uploaded_fasta_or_zip,
	derive_serotype_names_from_sources,
	compute_unique_kmers_per_serotype,
	classify_unknown_sequences,
	parse_k_input,
	plot_counts_by_serotype,
	)

	# ---- Page header ---------------------------------------------------------
	st.set_page_config(page_title="Serotype k-mer classifier", layout="wide")
	st.title("🧬 Serotype k-mer classifier — STRICT uniques (Streamlit)")

	st.markdown("""Upload known serotype sequences (FASTA or a ZIP with one FASTA per serotype) and unknown sequences,
	choose parameters, then click Run. The app computes strictly unique k-mers per serotype across k and classifies unknowns.
	""")

	# ---- Sidebar inputs ------------------------------------------------------
	# Both uploaders accept the same set of extensions, so define it once.
	_UPLOAD_EXTENSIONS = ["fasta", "fa", "fas", "fna", "zip"]

	st.sidebar.header("Inputs")
	known_file = st.sidebar.file_uploader(
	    "Known serotypes (FASTA or ZIP of FASTA files)",
	    type=_UPLOAD_EXTENSIONS,
	)
	unknown_file = st.sidebar.file_uploader(
	    "Unknown sequences (FASTA or ZIP of FASTA files)",
	    type=_UPLOAD_EXTENSIONS,
	)

	# The sequence type drives the default k shown in the text box below
	# (9 for protein, 21 for DNA).
	seqtype = st.sidebar.selectbox("Sequence type", ["DNA", "Protein"])
	is_protein = seqtype == "Protein"
	default_k = 9 if is_protein else 21
	k_input = st.sidebar.text_input(
	    "k values (e.g. 21 or 15-21 or 7,9,11)",
	    value=str(default_k),
	)

	# An FDR threshold is a proportion, so bound it on both sides; without
	# max_value the widget accepted any positive number.
	fdr_alpha = st.sidebar.number_input(
	    "FDR α",
	    min_value=0.0,
	    max_value=1.0,
	    value=0.05,
	    step=0.01,
	)

	# True only on the script rerun triggered by clicking the button.
	run = st.sidebar.button("▶ Run analysis")

	def _serotype_label(header, name_map):
	    """Return the serotype label to use for one known-sequence header.

	    Uses the entry in ``name_map`` when the header has one; otherwise falls
	    back to the first whitespace-separated token of the header. The fallback
	    is computed lazily: the previous ``name_map.get(header, header.split()[0])``
	    evaluated ``header.split()[0]`` on every call and raised ``IndexError``
	    for an empty or whitespace-only header even when the header was mapped.
	    A header with no tokens and no mapping is returned unchanged.
	    """
	    if header in name_map:
	        return name_map[header]
	    tokens = header.split()
	    return tokens[0] if tokens else header


	def _best_score(row):
	    """Return -log10 of the FDR column belonging to the row's predicted serotype.

	    Rows predicted as "NoMatch", and rows whose FDR value is None or NaN,
	    score 0.0. A missing ``FDR_<serotype>`` column is treated as q = 1.0.
	    """
	    predicted = row["Predicted_serotype"]
	    if predicted == "NoMatch":
	        return 0.0
	    q = row.get(f"FDR_{predicted}", 1.0)
	    if q is None or pd.isna(q):
	        return 0.0
	    # Clamp so that a q-value that underflowed to 0 gets the maximum score
	    # (300) instead of being ranked last, keeping the score monotonic in q.
	    return -math.log10(max(q, 1e-300))


	if run:
	    # Both uploads are mandatory; st.stop() ends this script run.
	    if not known_file or not unknown_file:
	        st.error("Please upload both known and unknown sequences.")
	        st.stop()

	    with st.spinner("Reading uploads..."):
	        known_records = read_uploaded_fasta_or_zip(known_file)  # list of (src, header, seq)
	        unknown_records = read_uploaded_fasta_or_zip(unknown_file)

	    if not known_records:
	        st.error("No records found in the known upload.")
	        st.stop()
	    if not unknown_records:
	        st.error("No records found in the unknown upload.")
	        st.stop()

	    # Map headers to serotype names. Only the first sequence seen for each
	    # serotype is kept; later records with the same label are ignored.
	    name_map = derive_serotype_names_from_sources(known_records)
	    serotype_to_seq = {}
	    for _src, header, seq in known_records:
	        serotype_to_seq.setdefault(_serotype_label(header, name_map), seq)

	    # De-duplicate, drop k < 3, and process k in ascending order.
	    k_values = parse_k_input(k_input, default_single=default_k)
	    k_values = sorted({k for k in k_values if k >= 3})
	    if not k_values:
	        st.error("No valid k values (>=3).")
	        st.stop()

	    st.info(f"Detected serotypes: {list(serotype_to_seq)}")
	    st.info(f"k values: {k_values}")

	    with st.spinner("Computing strict-unique k-mers per serotype..."):
	        uniques = compute_unique_kmers_per_serotype(
	            serotype_to_seq,
	            is_protein=is_protein,
	            k_values=k_values,
	        )

	    with st.spinner("Classifying unknown sequences..."):
	        df_full = classify_unknown_sequences(
	            unknown_records,
	            uniques,
	            is_protein=is_protein,
	            fdr_alpha=fdr_alpha,
	        )

	    # Extra ranking column, also included in the CSV download below.
	    df_full["Score_-log10FDR"] = df_full.apply(_best_score, axis=1)

	    show_cols = [
	        "Source",
	        "Sequence",
	        "Predicted_serotype",
	        "Matches_total",
	        "Confidence_by_present",
	        "Confidence_by_serotype_vocab",
	        "Score_-log10FDR",
	    ]
	    st.subheader("Predictions")
	    st.dataframe(
	        df_full[show_cols].sort_values("Score_-log10FDR", ascending=False),
	        use_container_width=True,
	    )

	    fig = plot_counts_by_serotype(df_full)
	    st.subheader("Predicted serotype counts")
	    st.pyplot(fig)

	    # The download contains every column of df_full, not just show_cols.
	    st.subheader("Downloads")
	    csv = df_full.to_csv(index=False).encode("utf-8")
	    st.download_button(
	        "Download predictions_by_serotype.csv",
	        data=csv,
	        file_name="predictions_by_serotype.csv",
	        mime="text/csv",
	    )
	else:
	    st.info("Upload the two files on the left, set parameters, then click Run analysis.")