Spaces:

maomlab
/

CaurisCEN-ExpressionScatter

Sleeping

File size: 5,444 Bytes

e2ef0ff
 
 
 
 
 
 
 
 
 
 
 
 
74ee563
e2ef0ff
 
 
 
74ee563
e2ef0ff
 
 
 
 
 
 
 
 
 
 
 
74ee563
e2ef0ff
 
 
3f6b5d2
e2ef0ff
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
118eef0
e2ef0ff
 
 
 
 
 
4ccc715
3f6b5d2
e2ef0ff
 
 
 
 
4ccc715
3f6b5d2
e2ef0ff
 
 
c01cca6
e2ef0ff
 
 
c08f7b6
e2ef0ff
 
c01cca6
3f6b5d2
 
e2ef0ff
c08f7b6
e2ef0ff
04f7d90
 
e2ef0ff
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
74ee563
e2ef0ff

import datasets
import streamlit as st
import numpy as np
import pandas as pd
import altair as alt

st.set_page_config(layout='wide')

# The two gene identifiers may be passed as URL query arguments so that other
# pages can link straight to a specific pair; otherwise fall back to defaults.
query_params = st.query_params
feature_name_1 = (
    query_params["feature_name_1"]
    if "feature_name_1" in query_params
    else "B9J08_000003")
feature_name_2 = (
    query_params["feature_name_2"]
    if "feature_name_2" in query_params
    else "B9J08_000004")

    


# Page header: what the network is, how to cite it, and how to use the page.
intro_markdown = """
# CaurisCEN Expression Scatter
**CaurisCEN** is a co-expression network for *Candida auris* built on 577 RNA-seq runs across 2 96-well plates formats in 3 biological replicas.
A pair of genes are said to be co-expressed when their expression is correlated across different conditions and
is often a marker for genes to be involved in similar processes. 
To Cite:
    Rapala JR, MJ O'Meara, TR O'Meara
    CaurisCEN: A Co-Expression Network for Candida auris
* Code available at https://github.com/maomlab/CaurisCEN/tree/master/vignettes/CaurisCEN
* Full network and dataset: https://huggingface.co/datasets/maomlab/CaurisCEN

## Plot scatter plot expression for a pair of genes across studies.
Put in the B9J08 locus tag  (e.g. `B9J08_004112`) for two genes.
"""
st.markdown(intro_markdown)

@st.cache_data
def _load_cauriscen_table(name, data_dir):
    """Load the 'train' split of one maomlab/CaurisCEN config as a DataFrame.

    Streamlit re-executes this script on every widget interaction, so the
    result is cached with ``st.cache_data``; the dataset is fetched and
    converted to pandas only once per distinct ``(name, data_dir)`` pair.

    Args:
        name: Dataset configuration name passed to ``datasets.load_dataset``.
        data_dir: Data directory passed to ``datasets.load_dataset``.

    Returns:
        The 'train' split converted with ``to_pandas()``.
    """
    return datasets.load_dataset(
        path = "maomlab/CaurisCEN",
        name = name,
        data_dir = data_dir)['train'].to_pandas()


# Per-gene annotations (locus tags, gene name, description).
gene_metadata = _load_cauriscen_table(
    name = "gene_metadata",
    data_dir = "gene_metadata/data")

# Per-run annotations; merged onto the plotted data by run accession.
expression_runs = _load_cauriscen_table(
    name = "run_metadata",
    data_dir = "run_metadata/data")

# Expression values, looked up below by the "gene" column.
expression_matrix = _load_cauriscen_table(
    name = "expression_matrix",
    data_dir = "expression_matrix")


#DEBUG
print(f"expression_matrix shape: {expression_matrix.shape}")

# Three narrow columns (two inputs and the download button) plus right padding.
col1, col2, col3, padding = st.columns(spec = [0.2, 0.2, 0.2, 0.4])

# Each text box is pre-filled with the value from the URL query arguments (or
# the default) and overwrites it with whatever the user types.
with col1:
    feature_name_1 = st.text_input(
        label = "Feature Name 1",
        value = feature_name_1,
        max_chars = 12,
        help = "B9J08 Locus Tag e.g. B9J08_000003")

with col2:
    feature_name_2 = st.text_input(
        label = "Feature Name 2",
        value = feature_name_2,
        max_chars = 12,
        # Same wording as input 1: both inputs are matched against the same
        # locus tag column, so labelling only this one "New" was misleading.
        help = "B9J08 Locus Tag e.g. B9J08_000004")

# check the user input
def _lookup_gene(feature_name, gene_number):
    """Look up the metadata for a user-supplied locus tag.

    The tag is matched against the ``locus_tag_old`` column of
    ``gene_metadata``.

    Args:
        feature_name: Locus tag entered by the user.
        gene_number: 1 or 2; only used in the error message.

    Returns:
        ``(locus_tag_new, gene_name, description)`` taken from the first
        matching row, or ``None`` (after showing an error in the app) when no
        row matches.
    """
    hits = gene_metadata.loc[gene_metadata["locus_tag_old"] == feature_name]
    if hits.empty:
        # Tags are 'B9J08_' followed by six digits (12 characters, matching
        # the max_chars of the input boxes).
        st.error(
            f"Unable to locate locus tag for gene {gene_number}: {feature_name}, "
            "it should be of the form 'B9J08_######'")
        return None
    return (
        hits["locus_tag_new"].values[0],
        hits["gene_name"].values[0],
        hits["description"].values[0])


def _expression_profile(locus_tag_new, feature_name):
    """Return the expression values of one gene as a float64 array.

    Selects the rows of ``expression_matrix`` whose ``gene`` column equals
    ``locus_tag_new`` and returns the first match with its leading column
    dropped (assumed to be the ``gene`` identifier column). Shows an error and
    stops the script run when the gene has no row in the matrix.
    """
    rows = expression_matrix.loc[expression_matrix["gene"] == locus_tag_new]
    if rows.empty:
        st.error(
            f"No expression data found for {feature_name} ({locus_tag_new})")
        st.stop()
    return rows.transpose().iloc[1:,0].to_numpy().astype('float64')


# Look up both genes before bailing out so that both error messages are shown
# when both inputs are invalid.
gene_1 = _lookup_gene(feature_name_1, 1)
gene_2 = _lookup_gene(feature_name_2, 2)
if gene_1 is None or gene_2 is None:
    # Everything below needs both genes; stop this run cleanly instead of
    # failing later with a NameError on the undefined lookup results.
    st.stop()

locus_tag_new_1, gene_name_1, description_1 = gene_1
locus_tag_new_2, gene_name_2, description_2 = gene_2

expression_1 = _expression_profile(locus_tag_new_1, feature_name_1)
expression_2 = _expression_profile(locus_tag_new_2, feature_name_2)

# One row per run: the two scalar gene identifiers are broadcast down the
# frame, next to the raw and log10(x + 1) expression of each gene.
run_accessions = expression_matrix.columns[1:]
chart_columns = {
    "feature_name_1": feature_name_1,
    "feature_name_2": feature_name_2,
    "expression_1": expression_1,
    "expression_2": expression_2,
    "log_expression_1": np.log10(expression_1 + 1),
    "log_expression_2": np.log10(expression_2 + 1),
    "run_accession": run_accessions,
}

# Attach the run metadata to every run by its accession.
chart_data = pd.DataFrame(chart_columns).merge(
    right = expression_runs,
    on = "run_accession")

# Offer the plotted data (including the merged run metadata) as a download.
with col3:
    st.text('') # help alignment with input box
    st.download_button(
        label = "Download data as TSV",
        data = chart_data.to_csv(sep ='\t').encode('utf-8'),
        file_name = f"CaurisCEN_expression_{feature_name_1}_vs_{feature_name_2}.tsv",
        # The payload is tab-separated, so advertise the registered TSV media
        # type rather than text/csv.
        mime = "text/tab-separated-values")


def _gene_name_bullet(gene_name):
    """Return the markdown bullet for a gene name, or '' when it is missing.

    Missing covers both ``None`` and NaN, so a NaN gene name no longer raises
    on string concatenation. The bullet has a space after the colon, matching
    the neighbouring *Gene ID* and *Description* bullets.
    """
    if pd.isna(gene_name):
        return ''
    return f"* *Gene Name*: {gene_name}"


# Summary of both genes with links to the Candida Genome Database entry and
# the companion top-hits page.
st.markdown(f"""
#### Gene 1:
* *Gene ID*: [{feature_name_1}](http://www.candidagenome.org/cgi-bin/locus.pl?locus={feature_name_1}&organism=C_auris_B8441)
{_gene_name_bullet(gene_name_1)}
* *Description*: {description_1}
* *Top [Co-Expressed Partners](https://huggingface.co/spaces/maomlab/CaurisCEN-TopHits?feature_name={feature_name_1})*

#### Gene 2:
* *Gene ID*: [{feature_name_2}](http://www.candidagenome.org/cgi-bin/locus.pl?locus={feature_name_2}&organism=C_auris_B8441)
{_gene_name_bullet(gene_name_2)}
* *Description*: {description_2}
* *Top [Co-Expressed Partners](https://huggingface.co/spaces/maomlab/CaurisCEN-TopHits?feature_name={feature_name_2})*
""")

# Scatter of the log-transformed expression of gene 1 (x) against gene 2 (y),
# one point per run, coloured by the study the run belongs to.
x_axis = alt.X("log_expression_1", title=f"Log10[{feature_name_1}+1] Expression")
y_axis = alt.Y("log_expression_2", title=f"Log10[{feature_name_2}+1] Expression")
study_color = alt.Color("study_accession", title="Study Accession")

scatter = alt.Chart(chart_data, width = 750, height = 750).mark_circle()
chart = scatter.encode(
    x=x_axis,
    y=y_axis,
    color=study_color,
    tooltip=["run_accession", "study_accession"])

st.altair_chart(chart)