Spaces:
Runtime error
Runtime error
| import altair as alt | |
| import numpy as np | |
| import pandas as pd | |
| import streamlit as st | |
| from promb import init_db, print_nearest | |
| from io import StringIO | |
| import logomaker | |
# Use the full browser width: the per-position charts rendered further down
# are sized by sequence length and get cramped in the default centered layout.
st.set_page_config(
    layout="wide",
)
@st.cache_resource(show_spinner=False)
def init_db_cached(db_name, *args, **kwargs):
    """Return the promb database for ``db_name``, initialising it at most once.

    Streamlit re-executes the whole script on every widget interaction, so
    without caching the database would be re-initialised on each rerun.
    ``st.cache_resource`` keys the cached object on the call arguments and
    replays the ``st.info`` note below on cache hits, so the note keeps being
    shown on every run.

    The default cache spinner is disabled because the caller already wraps
    this call in its own ``st.spinner``.

    Args:
        db_name: Name of the promb database, forwarded to ``init_db``.
        *args: Extra positional arguments forwarded to ``init_db``.
        **kwargs: Extra keyword arguments forwarded to ``init_db``.

    Returns:
        Whatever ``init_db`` returns for the given arguments.

    NOTE(review): the cached object is shared between user sessions; this
    assumes the promb database object is safe to share - confirm.
    """
    st.info("""
Note: More human doesn't always mean "good". Peptides like "GGGGGGGGG" or "EKEKEKEKE" are human but you don\'t necessarily want your protein to contain those.
Make sure to check for sequence entropy, AlphaFold2 confidence, or other quality metrics.
""")
    return init_db(db_name, *args, **kwargs)
# Page title followed by the install command and the project banner.
# NOTE(review): the banner rows below no longer line up into letters; the
# original spacing appears to have been collapsed - restore from upstream.
page_title = '# promb - protein humanness evaluation'
install_banner = '''pip install promb
* █
▄▄▄▄ ▄▄▄ ▄█▄ ▄▄▄▄ █▄▄▄
█ █ █ █▓███ █ █ █ █ █
█▄▄▄▀ █ ▀███▀ █ █ █▄▄▄▀
█
▀ protein mutation burden
'''
st.write(page_title)
st.code(install_banner)
EXAMPLE_SEQUENCE = "SPLQKASDSLINIAIKMLRNGINPELAKKLWDIAYKISMSHIDPSSFYEALKELKKLIEEQEEELIEA"
DATABASE_CHOICES = ["human-reference", "human-swissprot", "human-oas"]

# Writing to the session state under the text area's key pre-fills the widget
# when it is created below.
if st.button("Load example"):
    st.session_state["seq"] = EXAMPLE_SEQUENCE

# All inputs live in one form, so the analysis only reruns when "Run" is pressed.
with st.form(border=False, key="input"):
    raw_sequence = st.text_area("Amino acid sequence", key="seq")
    # Drop every whitespace character (spaces, tabs, line breaks) from the input.
    seq = "".join(raw_sequence.split())

    database_column, length_column, nearest_column = st.columns(3)
    database_name = database_column.selectbox("Database", options=DATABASE_CHOICES)
    # The peptide length is not passed on for "human-oas", so its input is
    # greyed out for that database.
    length_is_fixed = database_name == "human-oas"
    peptide_length = length_column.number_input(
        "Peptide length",
        value=9,
        min_value=2,
        max_value=20,
        disabled=length_is_fixed,
    )
    num_nearest = nearest_column.number_input(
        "Nearest peptides",
        value=1,
        min_value=1,
        max_value=5,
        help="Number of nearest human peptides used for visualization and to compute PSSM and suggest humanizing mutations",
    )
    st.form_submit_button("Run", type="primary")

# Nothing to analyse yet: end this script run before any results are computed.
if not seq:
    st.stop()
# Load the selected database, cut the input into peptides and look up the
# nearest human peptides for each of them.
with st.spinner(f"Finding nearest {peptide_length}mer peptides in {database_name}..."):
    # "human-oas" is initialised without an explicit peptide length
    # (presumably that database has a fixed length - confirm against promb).
    db = init_db_cached(database_name, peptide_length if database_name != "human-oas" else None)
    peptides = db.chop_seq_peptides(seq)
    # An input shorter than one peptide yields nothing to score; the metrics
    # below divide by len(peptides), so stop with a clear message instead of
    # failing with a ZeroDivisionError.
    if len(peptides) == 0:
        st.error("The input sequence is too short to extract any peptides for the selected database and peptide length.")
        st.stop()
    nearest = db.find_nearest_peptides(peptides, n=num_nearest)
# Headline numbers: overall human peptide content and the mean number of
# mismatches between each input peptide and its first nearest hit.
st.write("## Result")
human_content = db.compute_peptide_content(seq)
st.metric("Human Peptide Content", f"{human_content:.1%} human")

# Count position-wise differences against hits[0] only, regardless of how
# many nearest peptides were requested.
num_mutations = sum(
    residue != hit_residue
    for peptide, hits in zip(peptides, nearest)
    for residue, hit_residue in zip(peptide, hits[0])
)
mutations_per_peptide = num_mutations / len(peptides)
st.metric("Mutation Burden", f"{mutations_per_peptide:.1f} mutations per {peptide_length}mer")
# One row per input residue: its likelihood score, the residue letter and its
# 1-based position. "metric" is a constant column used as the single y-axis row.
likelihood = pd.DataFrame({
    "likelihood": db.compute_positional_likelihood(seq, nearest_peptides=nearest),
    "metric": "likelihood",
    "aa": list(seq),
    "position": range(1, len(seq) + 1),
})
# Number of residues shown per chart; longer sequences wrap onto several charts.
wrap = 50
st.write("### Positional likelihood")
st.write(f"Fraction of nearest overlapping {peptide_length}mers that contain the input amino acid at that position. Positions with values close to 0 can be considered to be non-human, and values close to 1 to be human.")
for start in range(0, len(seq), wrap):
    chunk = likelihood.iloc[start:start + wrap]
    # The last chunk may be shorter than `wrap`; label it with the position it
    # really ends at instead of overshooting the sequence length.
    end = start + len(chunk)
    # Axes and tooltip are shared by the coloured cells and the letter overlay.
    base = alt.Chart(chunk).encode(
        x=alt.X("position:O", title="Sequence Position"),
        y=alt.Y("metric:N", title="Metric"),
        tooltip=["position", "likelihood", "aa"],
    )
    heatmap = base.mark_rect().encode(
        color=alt.Color("likelihood:Q", scale=alt.Scale(scheme="reds", reverse=True, domain=(0, 1))),
    )
    text = base.mark_text(baseline="middle", fontSize=12).encode(
        text=alt.Text("aa:N"),
    )
    chart = (heatmap + text).properties(
        # Fixed margin plus a constant width per residue keeps cell size equal
        # across full and partial chunks.
        width=250 + (15 * len(chunk)),
        height=180,
        title=f"Positions {start + 1}-{end}",
    )
    st.altair_chart(chart, use_container_width=False)
# Heatmap of the position-specific scoring matrix: one cell per
# (position, amino acid) pair, coloured by count.
st.write("### Position-specific scoring matrix")
st.write("A PSSM (PWM) computed by counting occurences of amino acids in nearest overlapping human peptides at each position.")
pssm = db.compute_pssm(seq, nearest_peptides=nearest)

# Reshape from wide (one column per amino acid) to long form for Altair.
freqs_long = pd.melt(
    pssm.reset_index(),
    id_vars="position",
    var_name="aa",
    value_name="count",
)

position_axis = alt.X("position:O", title="Sequence Position")
# sort=None keeps the amino acids in the order they appear in the data.
amino_acid_axis = alt.Y("aa:N", title="Amino Acid", sort=None)
count_color = alt.Color("count:Q", scale=alt.Scale(scheme="viridis"))

pssm_chart = alt.Chart(freqs_long).mark_rect().encode(
    x=position_axis,
    y=amino_acid_axis,
    color=count_color,
    tooltip=["position", "aa", "count"],
)
pssm_chart = pssm_chart.properties(height=600, title="Amino Acid Frequencies")
st.altair_chart(pssm_chart, use_container_width=True)
# Both logos share one figure size: width grows with the sequence length
# and is capped at 50, height is fixed at 2.
logo_figsize = (min(50, 2 + 0.2 * len(seq)), 2)

st.write("#### Sequence logo")
st.write("PSSM computed from nearest human peptides visualized using logomaker library")
full_logo = logomaker.Logo(pssm, figsize=logo_figsize)
st.pyplot(full_logo.fig)

st.write("#### Suggested mutations")
st.write("PSSM but without counting amino acids found in input sequence at each position")
# Same matrix, recomputed with ignore_wildtype=True.
pssm_mutations = db.compute_pssm(seq, nearest_peptides=nearest, ignore_wildtype=True)
mutation_logo = logomaker.Logo(pssm_mutations, figsize=logo_figsize)
st.pyplot(mutation_logo.fig)
# print_nearest writes to a file-like object, so capture its report in memory
# and show it in a fixed-height container.
st.write("### Nearest human peptides")
report_buffer = StringIO()
print_nearest(peptides, nearest, file=report_buffer)
nearest_report = report_buffer.getvalue()
with st.container(height=400):
    st.code(nearest_report)
def _show_mutant_candidates(db, seq, nearest, heading, spinner_text, suggest, limit=5):
    """Render the first ``limit`` candidates from ``suggest`` as code blocks.

    Each candidate is shown as a two-line record: a ``>`` header listing its
    differences from ``seq`` (``<input residue><1-based position><new residue>``)
    together with the value of ``db.compute_average_mutations``, followed by
    the candidate sequence itself.

    Args:
        db: Database object providing ``compute_average_mutations``.
        seq: Input sequence the candidates are compared against.
        nearest: Nearest peptides, passed to ``suggest`` as ``nearest_peptides``.
        heading: Markdown heading written above the candidates.
        spinner_text: Message shown while the candidates are generated.
        suggest: Callable invoked as ``suggest(seq, nearest_peptides=nearest)``;
            its result must support slicing.
        limit: Maximum number of candidates to display.
    """
    st.write(heading)
    with st.spinner(spinner_text):
        for candidate in suggest(seq, nearest_peptides=nearest)[:limit]:
            mutations = " ".join(
                f"{aa}{pos}{bb}"
                for pos, (aa, bb) in enumerate(zip(seq, candidate), start=1)
                if aa != bb
            )
            avg = db.compute_average_mutations(candidate)
            st.code(f">{mutations} AvgMut={avg:.2f}\n{candidate}")


st.write("## Humanization (naive approach)")
st.write("Generate **slightly** humanized variants by applying 1-3 mutations based on nearest overlapping peptides")
# Candidates are only generated on demand, when the button is pressed.
if st.button("Generate humanized mutants", type="primary"):
    # The three sections differ only in heading, spinner text and the promb
    # method that proposes the candidates.
    mutant_sections = (
        ("### Point mutant candidates", "Generating point mutants...", db.suggest_point_mutant_candidates),
        ("### Double mutant candidates", "Generating double mutants...", db.suggest_double_mutant_candidates),
        ("### Triple mutant candidates", "Generating triple mutants...", db.suggest_triple_mutant_candidates),
    )
    for heading, spinner_text, suggest in mutant_sections:
        _show_mutant_candidates(db, seq, nearest, heading, spinner_text, suggest)
# Static footer: how to install the promb package or run this app locally.
pip_install_snippet = """
# Install promb
pip install promb
# See cli commands
promb --help
"""
clone_space_snippet = """
# Clone huggingface spaces repository
git clone https://huggingface.co/spaces/prihodad/promb-humanness
# Open the directory
cd promb-humanness
# Install dependencies (you should do this in a separate conda/venv environment)
pip install -r requirements.txt
# Run Streamlit app
streamlit run src/streamlit_app.py
"""

st.divider()
st.write("## Run locally")

st.write("Install and run `promb` locally:")
st.code(pip_install_snippet, language="text")
st.write("More instructions in the GitHub repo: https://github.com/MSDLLCpapers/promb")

st.write("You can also clone this space as a git repository and run it locally:")
st.code(clone_space_snippet, language="text")