Spaces:

prihodad
/

promb-humanness

Runtime error

David Prihoda

Rename PMB to average mutations

9712727 10 months ago

7.55 kB

	import altair as alt
	import numpy as np
	import pandas as pd
	import streamlit as st
	from promb import init_db, print_nearest
	from io import StringIO
	import logomaker

	# Render the whole app with Streamlit's "wide" page layout.
	st.set_page_config(layout="wide")

	@st.cache_resource(show_spinner=False)
	def init_db_cached(db_name, *args, **kwargs):
	    """Show the humanness caveat and return the promb database ``db_name``.

	    The function is wrapped in ``st.cache_resource`` so the database object
	    returned by ``promb.init_db`` can be reused for identical arguments
	    instead of being rebuilt each time the script runs.

	    Args:
	        db_name: Database name, passed as the first argument to ``init_db``.
	        *args: Extra positional arguments forwarded unchanged to ``init_db``
	            (the app passes the peptide length, or ``None``).
	        **kwargs: Extra keyword arguments forwarded unchanged to ``init_db``.

	    Returns:
	        Whatever ``promb.init_db`` returns for the given arguments.
	    """
	    st.info("""
	Note: More human doesn't always mean "good". Peptides like "GGGGGGGGG" or "EKEKEKEKE" are human but you don\'t necessarily want your protein to contain those.
	Make sure to check for sequence entropy, AlphaFold2 confidence, or other quality metrics.
	""")
	    # Forward everything verbatim so this wrapper accepts exactly what init_db accepts.
	    return init_db(db_name, *args, **kwargs)

	# Page heading (markdown H1).
	st.write('# promb - protein humanness evaluation')

	# Install command followed by the promb ASCII-art banner, shown as a code block.
	# The literal below is whitespace-sensitive; do not re-indent it.
	st.code('''pip install promb
	* █
	▄▄▄▄ ▄▄▄ ▄█▄ ▄▄▄▄ █▄▄▄
	█ █ █ █▓███ █ █ █ █ █
	█▄▄▄▀ █ ▀███▀ █ █ █▄▄▄▀
	█
	▀ protein mutation burden
	''')

	# The text area below is bound to session key "seq", so writing the example
	# into session state before the widget is created pre-fills the box.
	example_requested = st.button("Load example")
	if example_requested:
	    st.session_state["seq"] = "SPLQKASDSLINIAIKMLRNGINPELAKKLWDIAYKISMSHIDPSSFYEALKELKKLIEEQEEELIEA"

	with st.form(border=False, key="input"):
	    raw_seq = st.text_area("Amino acid sequence", key="seq")
	    # Remove every whitespace character (spaces, tabs, line breaks) from the input.
	    seq = "".join(raw_seq.split())

	    # Three side-by-side settings: database, peptide length, number of neighbours.
	    db_col, length_col, nearest_col = st.columns(3)
	    database_name = db_col.selectbox(
	        "Database",
	        options=["human-reference", "human-swissprot", "human-oas"],
	    )
	    # The length input is greyed out whenever "human-oas" is selected.
	    peptide_length = length_col.number_input(
	        "Peptide length",
	        value=9,
	        min_value=2,
	        max_value=20,
	        disabled=database_name == "human-oas",
	    )
	    num_nearest = nearest_col.number_input(
	        "Nearest peptides",
	        value=1,
	        min_value=1,
	        max_value=5,
	        help="Number of nearest human peptides used for visualization and to compute PSSM and suggest humanizing mutations",
	    )

	    st.form_submit_button("Run", type="primary")

	# Nothing to analyse yet: end this script run here.
	if not seq:
	    st.stop()

	with st.spinner(f"Finding nearest {peptide_length}mer peptides in {database_name}..."):
	    # "human-oas" is opened with None instead of a peptide length.
	    db = init_db_cached(database_name, peptide_length if database_name != "human-oas" else None)
	    peptides = db.chop_seq_peptides(seq)
	    # The mutation burden below divides by len(peptides); stop with a readable
	    # message instead of a ZeroDivisionError when no peptide could be extracted
	    # (presumably when the sequence is shorter than one peptide -- confirm in promb).
	    if len(peptides) == 0:
	        st.error("The input sequence is too short: no peptides could be extracted from it.")
	        st.stop()
	    nearest = db.find_nearest_peptides(peptides, n=num_nearest)

	st.write("## Result")

	st.metric("Human Peptide Content", f"{db.compute_peptide_content(seq):.1%} human")

	# Count residue mismatches between each peptide and its first (index 0) hit.
	num_mutations = sum(
	    aa != bb
	    for peptide, hits in zip(peptides, nearest)
	    for aa, bb in zip(peptide, hits[0])
	)

	# NOTE(review): for "human-oas" the database is opened without a length, so the
	# "{peptide_length}mer" wording may not match the peptides actually used -- confirm.
	st.metric("Mutation Burden", f"{num_mutations / len(peptides):.1f} mutations per {peptide_length}mer")

	# One row per residue: likelihood value, constant row label, residue letter, 1-based position.
	likelihood = pd.DataFrame({
	    "likelihood": db.compute_positional_likelihood(seq, nearest_peptides=nearest),
	    "metric": "likelihood",
	    "aa": list(seq),
	    "position": range(1, len(seq)+1)
	})

	# Number of sequence positions drawn per chart.
	wrap = 50
	st.write("### Positional likelihood")
	st.write(f"Fraction of nearest overlapping {peptide_length}mers that contain the input amino acid at that position. Positions with values close to 0 can be considered to be non-human, and values close to 1 to be human.")
	for start in range(0, len(seq), wrap):
	    chunk = likelihood.iloc[start:start+wrap]
	    # The last chunk can be shorter than `wrap`; clamp so the title never
	    # advertises positions beyond the end of the sequence.
	    end = min(start + wrap, len(seq))
	    # Axes and tooltip are shared by the coloured cells and the letter overlay.
	    base = alt.Chart(chunk).encode(
	        x=alt.X("position:O", title="Sequence Position"),
	        y=alt.Y("metric:N", title="Metric"),
	        tooltip=["position", "likelihood", "aa"]
	    )
	    heatmap = base.mark_rect().encode(
	        color=alt.Color("likelihood:Q", scale=alt.Scale(scheme="reds", reverse=True, domain=(0, 1)))
	    )
	    text = base.mark_text(baseline="middle", fontSize=12).encode(
	        text=alt.Text("aa:N")
	    )
	    chart = (heatmap + text).properties(
	        # Fixed margin plus a constant width per drawn position.
	        width=250 + (15 * len(chunk)),
	        height=180,
	        title=f"Positions {start+1}-{end}"
	    )
	    st.altair_chart(chart, use_container_width=False)

	st.write("### Position-specific scoring matrix")
	st.write("A PSSM (PWM) computed by counting occurences of amino acids in nearest overlapping human peptides at each position.")

	pssm = db.compute_pssm(seq, nearest_peptides=nearest)

	# Reshape to long format: one row per (position, amino acid) pair with its count.
	pssm_table = pssm.reset_index()
	pssm_long = pssm_table.melt(id_vars="position", var_name="aa", value_name="count")

	pssm_cells = alt.Chart(pssm_long).mark_rect()
	pssm_heatmap = pssm_cells.encode(
	    x=alt.X("position:O", title="Sequence Position"),
	    # sort=None keeps the amino acids in the order they appear in the data.
	    y=alt.Y("aa:N", title="Amino Acid", sort=None),
	    color=alt.Color("count:Q", scale=alt.Scale(scheme="viridis")),
	    tooltip=["position", "aa", "count"],
	).properties(height=600, title="Amino Acid Frequencies")

	st.altair_chart(pssm_heatmap, use_container_width=True)

	# Both logos share one figure size: width grows with the sequence length
	# and is capped at 50, height is fixed at 2.
	logo_figsize = (min(50, 2 + 0.2 * len(seq)), 2)

	st.write("#### Sequence logo")
	st.write("PSSM computed from nearest human peptides visualized using logomaker library")

	pssm_logo = logomaker.Logo(pssm, figsize=logo_figsize)
	st.pyplot(pssm_logo.fig)

	st.write("#### Suggested mutations")
	st.write("PSSM but without counting amino acids found in input sequence at each position")

	# Same matrix as above, recomputed with ignore_wildtype=True.
	pssm_mutations = db.compute_pssm(seq, nearest_peptides=nearest, ignore_wildtype=True)
	mutation_logo = logomaker.Logo(pssm_mutations, figsize=logo_figsize)
	st.pyplot(mutation_logo.fig)

	st.write("### Nearest human peptides")

	# print_nearest writes to a file-like object; capture its output in memory
	# so it can be shown inside a fixed-height (400) container.
	report = StringIO()
	print_nearest(peptides, nearest, file=report)
	st.container(height=400).code(report.getvalue())

	st.write("## Humanization (naive approach)")

	st.write("Generate slightly humanized variants by applying 1-3 mutations based on nearest overlapping peptides")

	if st.button("Generate humanized mutants", type="primary"):
	    # One row per mutant order: (section heading, spinner text, promb method
	    # that proposes the candidates). Rendering is identical for all three.
	    mutant_sections = [
	        ("### Point mutant candidates", "Generating point mutants...", db.suggest_point_mutant_candidates),
	        ("### Double mutant candidates", "Generating double mutants...", db.suggest_double_mutant_candidates),
	        ("### Triple mutant candidates", "Generating triple mutants...", db.suggest_triple_mutant_candidates),
	    ]

	    for heading, spinner_text, suggest_candidates in mutant_sections:
	        st.write(heading)

	        with st.spinner(spinner_text):
	            # Only the first five returned candidates are shown.
	            for candidate in suggest_candidates(seq, nearest_peptides=nearest)[:5]:
	                # Header label such as "A12G": input residue, 1-based position, new residue.
	                mutations = " ".join(
	                    f"{aa}{pos}{bb}"
	                    for pos, (aa, bb) in enumerate(zip(seq, candidate), start=1)
	                    if aa != bb
	                )
	                avg = db.compute_average_mutations(candidate)
	                # FASTA-like record: ">" header line, then the candidate sequence.
	                st.code(f">{mutations} AvgMut={avg:.2f}\n{candidate}")

	# Footer: static instructions for running promb and this app outside the hosted Space.
	st.divider()

	st.write("## Run locally")

	st.write("Install and run `promb` locally:")

	# Shell snippet shown verbatim; the "#" lines inside the literal are part of the displayed text.
	st.code("""
	# Install promb
	pip install promb
	# See cli commands
	promb --help
	""", language="text")

	st.write("More instructions in the GitHub repo: https://github.com/MSDLLCpapers/promb")

	st.write("You can also clone this space as a git repository and run it locally:")

	# Shell snippet shown verbatim: clone the Space, install requirements, start Streamlit.
	st.code("""
	# Clone huggingface spaces repository
	git clone https://huggingface.co/spaces/prihodad/promb-humanness
	# Open the directory
	cd promb-humanness
	# Install dependencies (you should do this in a separate conda/venv environment)
	pip install -r requirements.txt
	# Run Streamlit app
	streamlit run src/streamlit_app.py
	""", language="text")