# Streamlit front end for promb: takes an amino acid sequence, looks up the
# nearest human peptides in the selected database and reports humanness metrics.
from io import StringIO

import altair as alt
import logomaker
import numpy as np
import pandas as pd
import streamlit as st
from promb import init_db, print_nearest

st.set_page_config(layout="wide")


@st.cache_resource(show_spinner=False)
def init_db_cached(db_name, *args, **kwargs):
    """Return the promb database for ``db_name`` via Streamlit's resource cache.

    Shows an informational note about interpreting humanness, then forwards
    every argument unchanged to ``promb.init_db`` and returns its result.
    """
    st.info(""" Note: More human doesn't always mean "good". Peptides like "GGGGGGGGG" or "EKEKEKEKE" are human but you don\'t necessarily want your protein to contain those. Make sure to check for sequence entropy, AlphaFold2 confidence, or other quality metrics. """)
    return init_db(db_name, *args, **kwargs)


st.write('# promb - protein humanness evaluation')

# NOTE(review): this banner literal is reproduced exactly as it appears in the
# file. Its ASCII-art layout looks collapsed onto one line and cannot be
# reconstructed from here — restore the line breaks from upstream if needed.
st.code('''pip install promb * █ ▄▄▄▄ ▄▄▄ ▄█▄ ▄▄▄▄ █▄▄▄ █ █ █ █▓███ █ █ █ █ █ █▄▄▄▀ █ ▀███▀ █ █ █▄▄▄▀ █ ▀ protein mutation burden ''')

if st.button("Load example"):
    # Writing to the widget's session-state key pre-fills the text area below.
    st.session_state["seq"] = "SPLQKASDSLINIAIKMLRNGINPELAKKLWDIAYKISMSHIDPSSFYEALKELKKLIEEQEEELIEA"

with st.form(border=False, key="input"):
    seq = st.text_area("Amino acid sequence", key="seq")
    # Remove every whitespace character so multi-line pastes become one string.
    seq = "".join(seq.split())

    left, mid, right = st.columns(3)
    with left:
        database_name = st.selectbox(
            "Database",
            options=["human-reference", "human-swissprot", "human-oas"],
        )
    with mid:
        # The length input is disabled for human-oas; no length is passed to
        # the database for that choice (see the init_db_cached call below).
        peptide_length = st.number_input(
            "Peptide length",
            value=9,
            min_value=2,
            max_value=20,
            disabled=database_name == "human-oas",
        )
    with right:
        num_nearest = st.number_input(
            "Nearest peptides",
            value=1,
            min_value=1,
            max_value=5,
            help="Number of nearest human peptides used for visualization and to compute PSSM and suggest humanizing mutations",
        )
    st.form_submit_button("Run", type="primary")

if not seq:
    st.stop()

with st.spinner(f"Finding nearest {peptide_length}mer peptides in {database_name}..."):
    db = init_db_cached(database_name, peptide_length if database_name != "human-oas" else None)
    peptides = db.chop_seq_peptides(seq)
    # Skip the search when there is nothing to search for.
    nearest = db.find_nearest_peptides(peptides, n=num_nearest) if len(peptides) else []

# Without at least one peptide the per-peptide averages computed further down
# would divide by zero, so stop with an explanation instead of a traceback.
if len(peptides) == 0:
    st.warning("The sequence is too short to contain any peptide of the selected length.")
    st.stop()

st.write("## Result")
st.metric("Human Peptide Content", f"{db.compute_peptide_content(seq):.1%} human")

# Running total of mismatches; accumulated by the loop that follows.
num_mutations = 0
for peptide, hits in zip(peptides, nearest):
    # Only the first returned hit is compared (presumably the closest match —
    # confirm against promb's result ordering).
    num_mutations += sum(aa != bb for aa, bb in zip(peptide, hits[0]))
st.metric(
    "Mutation Burden",
    "{:.1f} mutations per {}mer".format(num_mutations / len(peptides), peptide_length),
)

# One row per residue; the constant "metric" column gives the heatmap a single row.
likelihood = pd.DataFrame({
    "likelihood": db.compute_positional_likelihood(seq, nearest_peptides=nearest),
    "metric": "likelihood",
    "aa": list(seq),
    "position": range(1, len(seq) + 1),
})

# Number of sequence positions drawn per chart row.
wrap = 50

st.write("### Positional likelihood")
st.write(f"Fraction of nearest overlapping {peptide_length}mers that contain the input amino acid at that position. Positions with values close to 0 can be considered to be non-human, and values close to 1 to be human.")

for start in range(0, len(seq), wrap):
    chunk = likelihood.iloc[start:start + wrap]
    # The final chunk is usually shorter than `wrap`; label it with the last
    # position it actually contains rather than start + wrap.
    last_position = start + len(chunk)

    # Encodings shared by the coloured cells and the residue letters on top.
    base = alt.Chart(chunk).encode(
        x=alt.X("position:O", title="Sequence Position"),
        y=alt.Y("metric:N", title="Metric"),
        tooltip=["position", "likelihood", "aa"],
    )
    heatmap = base.mark_rect().encode(
        color=alt.Color("likelihood:Q", scale=alt.Scale(scheme="reds", reverse=True, domain=(0, 1))),
    )
    text = base.mark_text(baseline="middle", fontSize=12).encode(
        text=alt.Text("aa:N"),
    )
    chart = (heatmap + text).properties(
        width=250 + (15 * len(chunk)),
        height=180,
        title=f"Positions {start + 1}-{last_position}",
    )
    st.altair_chart(chart, use_container_width=False)

st.write("### Position-specific scoring matrix")
st.write("A PSSM (PWM) computed by counting occurrences of amino acids in nearest overlapping human peptides at each position.")

pssm = db.compute_pssm(seq, nearest_peptides=nearest)
# Long format (position, aa, count) is what the Altair heatmap consumes.
freqs_long = pssm.reset_index().melt(id_vars="position", var_name="aa", value_name="count")
heatmap = alt.Chart(freqs_long).mark_rect().encode(
    x=alt.X("position:O", title="Sequence Position"),
    y=alt.Y("aa:N", title="Amino Acid", sort=None),
    color=alt.Color("count:Q", scale=alt.Scale(scheme="viridis")),
    tooltip=["position", "aa", "count"],
).properties(
    height=600,
    title="Amino Acid Frequencies",
)
st.altair_chart(heatmap, use_container_width=True)

# Logo width grows with sequence length but is capped at 50.
logo_figsize = (min(50, 2 + 0.2 * len(seq)), 2)

st.write("#### Sequence logo")
st.write("PSSM computed from nearest human peptides visualized using logomaker library")
logo = logomaker.Logo(pssm, figsize=logo_figsize)
st.pyplot(logo.fig)

st.write("#### Suggested mutations")
st.write("PSSM but without counting amino acids found in input sequence at each position")
pssm_mutations = db.compute_pssm(seq, nearest_peptides=nearest, ignore_wildtype=True)
logo = logomaker.Logo(pssm_mutations, figsize=logo_figsize)
st.pyplot(logo.fig)

st.write("### Nearest human peptides")
# print_nearest writes to a file-like object; capture its output for display.
stream = StringIO()
print_nearest(peptides, nearest, file=stream)
with st.container(height=400):
    st.code(stream.getvalue())

st.write("## Humanization (naive approach)")
st.write("Generate **slightly** humanized variants by applying 1-3 mutations based on nearest overlapping peptides")

if st.button("Generate humanized mutants", type="primary"):
    # The three mutant levels differ only in heading text and promb method.
    mutant_levels = (
        ("Point", db.suggest_point_mutant_candidates),
        ("Double", db.suggest_double_mutant_candidates),
        ("Triple", db.suggest_triple_mutant_candidates),
    )
    for level, suggest_candidates in mutant_levels:
        st.write(f"### {level} mutant candidates")
        with st.spinner(f"Generating {level.lower()} mutants..."):
            # Show at most the first five candidates per level.
            for candidate in suggest_candidates(seq, nearest_peptides=nearest)[:5]:
                # Label each substitution as <original><1-based position><new>.
                mutations = " ".join(
                    f"{aa}{pos}{bb}"
                    for pos, (aa, bb) in enumerate(zip(seq, candidate), start=1)
                    if aa != bb
                )
                avg = db.compute_average_mutations(candidate)
                st.code(f">{mutations} AvgMut={avg:.2f}\n{candidate}")

st.divider()

st.write("## Run locally")
st.write("Install and run `promb` locally:")
# Each comment and command sits on its own line so the snippet can be copied
# into a shell; on a single line the leading "#" would comment out the commands.
st.code(
    "\n".join([
        "# Install promb",
        "pip install promb",
        "",
        "# See cli commands",
        "promb --help",
    ]),
    language="text",
)
st.write("More instructions in the GitHub repo: https://github.com/MSDLLCpapers/promb")
st.write("You can also clone this space as a git repository and run it locally:")
st.code(
    "\n".join([
        "# Clone huggingface spaces repository",
        "git clone https://huggingface.co/spaces/prihodad/promb-humanness",
        "",
        "# Open the directory",
        "cd promb-humanness",
        "",
        "# Install dependencies (you should do this in a separate conda/venv environment)",
        "pip install -r requirements.txt",
        "",
        "# Run Streamlit app",
        "streamlit run src/streamlit_app.py",
    ]),
    language="text",
)