File size: 5,444 Bytes
e2ef0ff 74ee563 e2ef0ff 74ee563 e2ef0ff 74ee563 e2ef0ff 3f6b5d2 e2ef0ff 118eef0 e2ef0ff 4ccc715 3f6b5d2 e2ef0ff 4ccc715 3f6b5d2 e2ef0ff c01cca6 e2ef0ff c08f7b6 e2ef0ff c01cca6 3f6b5d2 e2ef0ff c08f7b6 e2ef0ff 04f7d90 e2ef0ff 74ee563 e2ef0ff | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 | import datasets
import streamlit as st
import numpy as np
import pandas as pd
import altair as alt
# Configure the page with the 'wide' layout before anything is rendered.
st.set_page_config(layout='wide')

# Seed both gene inputs from the URL query arguments so that it is possible
# to link directly to this page for a specific pair of genes; when an
# argument is absent, fall back to a default locus tag.
query_params = st.query_params
feature_name_1 = query_params.get("feature_name_1", "B9J08_000003")
feature_name_2 = query_params.get("feature_name_2", "B9J08_000004")
# Page header: what CaurisCEN is, how to cite it, where the code and data
# live, and a short instruction for the two locus-tag inputs below.
intro_markdown = """
# CaurisCEN Expression Scatter
**CaurisCEN** is a co-expression network for *Candida auris* built on 577 RNA-seq runs across 2 96-well plates formats in 3 biological replicas.
A pair of genes are said to be co-expressed when their expression is correlated across different conditions and
is often a marker for genes to be involved in similar processes.
To Cite:
Rapala JR, MJ O'Meara, TR O'Meara
CaurisCEN: A Co-Expression Network for Candida auris
* Code available at https://github.com/maomlab/CaurisCEN/tree/master/vignettes/CaurisCEN
* Full network and dataset: https://huggingface.co/datasets/maomlab/CaurisCEN
## Plot scatter plot expression for a pair of genes across studies.
Put in the B9J08 locus tag (e.g. `B9J08_004112`) for two genes.
"""
st.markdown(intro_markdown)
@st.cache_data
def _load_cauriscen_table(name: str, data_dir: str) -> pd.DataFrame:
    """Load the 'train' split of one maomlab/CaurisCEN config as a DataFrame.

    Streamlit re-executes this script on every widget interaction, so the
    load is wrapped in ``st.cache_data``: the table is fetched and converted
    once per (name, data_dir) pair and served from the cache afterwards.

    Args:
        name: Dataset configuration name passed to ``datasets.load_dataset``.
        data_dir: Data directory passed to ``datasets.load_dataset``.

    Returns:
        The 'train' split converted with ``to_pandas()``.
    """
    return datasets.load_dataset(
        path="maomlab/CaurisCEN",
        name=name,
        data_dir=data_dir)['train'].to_pandas()


# Per-gene annotations; read below via the columns "locus_tag_old",
# "locus_tag_new", "gene_name" and "description".
gene_metadata = _load_cauriscen_table("gene_metadata", "gene_metadata/data")

# Per-run annotations; merged into the plotted data on "run_accession".
expression_runs = _load_cauriscen_table("run_metadata", "run_metadata/data")

# Expression values; filtered by its "gene" column further down.
expression_matrix = _load_cauriscen_table("expression_matrix", "expression_matrix")

#DEBUG
print(f"expression_matrix shape: {expression_matrix.shape}")
# One row of four columns: the two gene inputs, a third column used later for
# the download button, and a wider trailing column that is left empty.
col1, col2, col3, padding = st.columns(spec=[0.2, 0.2, 0.2, 0.4])

# Each input is pre-filled with the value taken from the URL (or its default).
# Both inputs are matched against the "locus_tag_old" metadata column, so both
# help texts describe the same kind of tag. Surrounding whitespace (e.g. from
# a copy/paste) is stripped so it cannot make the lookup fail.
with col1:
    feature_name_1 = st.text_input(
        label="Feature Name 1",
        value=f"{feature_name_1}",
        max_chars=12,
        help="B9J08 Locus Tag e.g. B9J08_000003").strip()
with col2:
    feature_name_2 = st.text_input(
        label="Feature Name 2",
        value=f"{feature_name_2}",
        max_chars=12,
        help="B9J08 Locus Tag e.g. B9J08_000004").strip()
def _lookup_gene(feature_name, gene_number):
    """Resolve a user-supplied locus tag against ``gene_metadata``.

    Args:
        feature_name: Tag typed by the user; compared to "locus_tag_old".
        gene_number: 1 or 2, used only to label the error message.

    Returns:
        ``(locus_tag_new, gene_name, description)`` taken from the first
        matching metadata row, or ``None`` (after showing an error in the
        page) when no row matches.
    """
    matches = gene_metadata.loc[gene_metadata["locus_tag_old"] == feature_name]
    if matches.empty:
        st.error(f"Unable to locate locus tag for gene {gene_number}: {feature_name}, it should be of the form 'B9J08_######'")
        return None
    return (
        matches["locus_tag_new"].values[0],
        matches["gene_name"].values[0],
        matches["description"].values[0])


# check the user input
# Both tags are checked before stopping so that the user sees every problem
# at once rather than one per rerun.
gene_1 = _lookup_gene(feature_name_1, 1)
gene_2 = _lookup_gene(feature_name_2, 2)
if gene_1 is None or gene_2 is None:
    # Halt this script run here: everything below needs the resolved tags and
    # would otherwise fail with a NameError instead of the message above.
    st.stop()

locus_tag_new_1, gene_name_1, description_1 = gene_1
locus_tag_new_2, gene_name_2, description_2 = gene_2
def _expression_values(locus_tag):
    """Return the expression values of one gene as a float64 array.

    The first row of ``expression_matrix`` whose "gene" equals ``locus_tag``
    is selected and every column after the first is returned.
    NOTE(review): this relies on "gene" being the first column and all
    remaining columns being per-run values - confirm against the dataset.

    If no row matches, an error is shown in the page and the script run is
    halted, instead of surfacing a raw IndexError from the positional lookup.
    """
    rows = expression_matrix.loc[expression_matrix["gene"] == locus_tag]
    if rows.empty:
        st.error(f"No expression data found for gene: {locus_tag}")
        st.stop()
    return rows.transpose().iloc[1:, 0].to_numpy().astype('float64')


expression_1 = _expression_values(locus_tag_new_1)
expression_2 = _expression_values(locus_tag_new_2)
# Assemble one row per run: the two gene tags (constant down the column), the
# raw and log10(x + 1) expression of each gene, and the run accession taken
# from the expression matrix's column labels after the first column.
run_accessions = expression_matrix.columns[1:]
log_expression_1 = np.log10(expression_1 + 1)
log_expression_2 = np.log10(expression_2 + 1)

plot_columns = {
    "feature_name_1": feature_name_1,
    "feature_name_2": feature_name_2,
    "expression_1": expression_1,
    "expression_2": expression_2,
    "log_expression_1": log_expression_1,
    "log_expression_2": log_expression_2,
    "run_accession": run_accessions,
}
chart_data = pd.DataFrame(plot_columns)

# Attach the run metadata columns to every run by joining on its accession.
chart_data = pd.merge(chart_data, expression_runs, on="run_accession")
# Offer the plotted table as a download in the third input column.
with col3:
    st.text('')  # help alignment with input box
    st.download_button(
        label="Download data as TSV",
        data=chart_data.to_csv(sep='\t').encode('utf-8'),
        file_name=f"CaurisCEN_expression_{feature_name_1}_vs_{feature_name_2}.tsv",
        # The payload is tab-separated, so advertise the TSV media type
        # rather than text/csv.
        mime="text/tab-separated-values")
def _gene_section(number, feature_name, gene_name, description):
    """Build the markdown summary for one of the two selected genes.

    Args:
        number: 1 or 2, shown in the section heading.
        feature_name: Locus tag, used as link text and in both link URLs.
        gene_name: Gene name, or ``None`` to leave the name line empty.
        description: Text shown on the description bullet.

    Returns:
        The section as a string in which every line ends with a newline.
    """
    # The name bullet is replaced by an empty line when there is no name.
    name_line = f"* *Gene Name*: {gene_name}" if gene_name is not None else ""
    return (
        f"#### Gene {number}:\n"
        f"* *Gene ID*: [{feature_name}](http://www.candidagenome.org/cgi-bin/locus.pl?locus={feature_name}&organism=C_auris_B8441)\n"
        f"{name_line}\n"
        f"* *Description*: {description}\n"
        f"* *Top [Co-Expressed Partners](https://huggingface.co/spaces/maomlab/CaurisCEN-TopHits?feature_name={feature_name})*\n"
    )


# Show the annotation of both genes, one section after the other.
st.markdown(
    "\n"
    + _gene_section(1, feature_name_1, gene_name_1, description_1)
    + _gene_section(2, feature_name_2, gene_name_2, description_2))
# Scatter plot of the two genes' log10(x + 1) expression, one circle per row
# of chart_data, colored by study and with run/study shown on hover.
x_axis = alt.X("log_expression_1", title=f"Log10[{feature_name_1}+1] Expression")
y_axis = alt.Y("log_expression_2", title=f"Log10[{feature_name_2}+1] Expression")
study_color = alt.Color("study_accession", title="Study Accession")
hover_fields = ["run_accession", "study_accession"]

chart = (
    alt.Chart(chart_data, width=750, height=750)
    .mark_circle()
    .encode(x=x_axis, y=y_axis, color=study_color, tooltip=hover_fields))

st.altair_chart(chart)