Spaces:
Sleeping
Sleeping
| import numpy as np | |
| import pandas as pd | |
| import datasets | |
| import streamlit as st | |
| from streamlit_cytoscapejs import st_cytoscapejs | |
| import networkx as nx | |
# Use the full browser width for the page.
st.set_page_config(layout='wide')

# Parse the settings out of the URL query args so it's possible to link
# directly to this page with a network already configured. Every value is
# kept as a string here; the numeric ones are converted after the user has
# had the chance to edit them in the input boxes.
query_params = st.query_params

# Gene IDs arrive comma separated in the URL; use "\n" as the separator so
# each one shows on its own row in the text area.
input_gene_ids = (
    query_params["gene_ids"]
    if "gene_ids" in query_params.keys()
    else "B9J08_000884,B9J08_004112"
).replace(",", "\n")

coexp_score_threshold = (
    query_params["coexp_score_threshold"]
    if "coexp_score_threshold" in query_params.keys()
    else "0.85")

max_per_gene = (
    query_params["max_per_gene"]
    if "max_per_gene" in query_params.keys()
    else "25")
# Introductory text rendered as Markdown at the top of the page: what the
# network is, how to cite it, where the code and data live, and how to use
# the gene ID box below.
intro_markdown = """
# CaurisCEN Network
**CaurisCEN** is a co-expression network for *Candida auris* built on 577 RNA-seq runs across 2 96-well plates formats in 3 biological replicas.
A pair of genes are said to be co-expressed when their expression is correlated across different conditions and
is often a marker for genes to be involved in similar processes.
To Cite:
Rapala JR, MJ O'Meara, TR O'Meara
CaurisCEN: A Co-Expression Network for Candida auris
* Code available at https://github.com/maomlab/CalCEN/tree/master/vignettes/CaurisCEN
* Full network and dataset: https://huggingface.co/datasets/maomlab/CaurisCEN
## Plot a network for a set of genes
Put a ``B9J08_######`` gene_id, one each row to seed the network
"""
st.markdown(intro_markdown)
@st.cache_data
def _load_cauriscen_table(name):
    """Load the 'train' split of one CaurisCEN dataset config as a DataFrame.

    Streamlit re-executes this whole script on every widget interaction, so
    the result is cached with ``st.cache_data``; the dataset is then loaded
    and converted to pandas once per config name instead of on every rerun.

    Args:
        name: dataset config name in the ``maomlab/CaurisCEN`` repository;
            its data files are read from ``<name>/data``.

    Returns:
        pandas.DataFrame holding the 'train' split of that config.
    """
    return datasets.load_dataset(
        path = "maomlab/CaurisCEN",
        name = name,
        data_dir = f"{name}/data")['train'].to_pandas()


# Per-gene annotations; read below through the locus_tag_old, gene_name,
# sacch_ortholog and description columns.
gene_metadata = _load_cauriscen_table("gene_metadata")

# Co-expression hits; read below through the feature_name_1, feature_name_2
# and score columns.
top_coexp_hits = _load_cauriscen_table("top_coexp_hits_general")
# Three equally sized control columns followed by unused padding on the right.
col1, col2, col3, padding = st.columns(spec = [0.2, 0.2, 0.2, 0.4])

# First column: multi-line box with the seed gene IDs, pre-filled from the
# URL query args (or the defaults). The name input_gene_ids is rebound to
# whatever text the user currently has in the box.
gene_id_box_options = {
    "label": "Gene IDs",
    "value": f"{input_gene_ids}",
    "height": 130,
    "help": "B9J08 Gene IDs e.g. B9J08_000884",
}
with col1:
    input_gene_ids = st.text_area(**gene_id_box_options)
# Second column: the two numeric settings. Both boxes are drawn before any
# validation so that an invalid value in one never hides the other.
# NOTE(review): assumes both inputs belong in col2 (col3 is used further
# down for the download button) -- confirm against the intended layout.
with col2:
    coexp_score_threshold_text = st.text_input(
        label = "Co-expression threshold [0-1]",
        value = f"{coexp_score_threshold}",
        help = "Default: 0.85")
    max_per_gene_text = st.text_input(
        label = "Max per gene",
        value = f"{max_per_gene}",
        help = "Default: 25")

    # Validate the threshold. On a bad value, report it and halt this script
    # run with st.stop(): carrying on would compare a string with a number
    # (TypeError) or filter the network with a meaningless threshold.
    try:
        coexp_score_threshold = float(coexp_score_threshold_text)
    except ValueError:
        st.error(f"Co-expression threshold should be a number between 0 and 1, instead it is '{coexp_score_threshold_text}'")
        st.stop()
    # Written as a chained comparison so that NaN is rejected as well.
    if not 0 <= coexp_score_threshold <= 1:
        st.error(f"Co-expression threshold should be a number between 0 and 1, instead it is '{coexp_score_threshold}'")
        st.stop()

    # Validate the per-gene cap in the same way.
    try:
        max_per_gene = int(max_per_gene_text)
    except ValueError:
        st.error(f"Max per gene should be a number greater than 0, instead it is '{max_per_gene_text}'")
        st.stop()
    if max_per_gene <= 0:
        st.error(f"Max per gene should be a number greater than 0, instead it is '{max_per_gene}'")
        st.stop()
##################################
# Parse and check the user input #
##################################

# One gene ID per row. Commas are accepted as separators too, matching the
# comma separated form used for gene_ids in the URL. Surrounding whitespace
# and empty entries are ignored.
# dict.fromkeys() drops repeated IDs while keeping the order they were
# entered in: a repeated seed would otherwise become two nodes with the same
# id and duplicate every one of its edges.
seed_gene_ids = list(dict.fromkeys(
    gene_id
    for gene_id in (
        entry.strip()
        for entry in input_gene_ids.replace(",", "\n").split("\n"))
    if gene_id != ""))
# For every seed gene collect its co-expression hits that score above the
# threshold, keeping at most max_per_gene rows per seed.
# NOTE(review): the slice keeps the first rows in table order, which is only
# the best-scoring set if top_coexp_hits is already sorted by decreasing
# score within each feature_name_1 -- confirm against the dataset.
neighbors = []
for seed_gene_id in seed_gene_ids:
    hits = top_coexp_hits[
        (top_coexp_hits.feature_name_1 == seed_gene_id) & (top_coexp_hits.score > coexp_score_threshold)]
    neighbors.append(hits[0:max_per_gene])

if neighbors:
    neighbors = pd.concat(neighbors)
else:
    # pd.concat() raises ValueError on an empty list, so with no seed genes
    # fall back to an empty table that has the same columns; the rest of the
    # page then renders an empty network instead of a traceback.
    st.error("No Gene IDs were given, enter at least one B9J08 Gene ID to seed the network")
    neighbors = top_coexp_hits.iloc[0:0]

# Neighbor genes in order of first appearance. Genes that are already seeds
# are left out so that each gene is listed, and later drawn, exactly once.
seed_gene_id_set = set(seed_gene_ids)
neighbor_gene_ids = [
    gene_id
    for gene_id in dict.fromkeys(neighbors.feature_name_2)
    if gene_id not in seed_gene_id_set]

# gene_ids and gene_types are parallel lists: seeds first, then neighbors.
gene_ids = seed_gene_ids + neighbor_gene_ids
gene_types = ['seed'] * len(seed_gene_ids) + ['neighbor'] * len(neighbor_gene_ids)
# Look up the annotation of every gene (seeds and neighbors). The four lists
# are filled in the same order as gene_ids and must end up the same length,
# so a gene without metadata still gets an entry in each of them.
old_locus_tags = []
gene_names = []
sacch_orthologs = []
descriptions = []
for gene_id in gene_ids:
    # Filter the metadata once per gene and read all fields from that row.
    matches = gene_metadata.loc[gene_metadata["locus_tag_old"] == gene_id]
    if len(matches.index) > 0:
        record = matches.iloc[0]
        locus_tag_old = record["locus_tag_old"]
        gene_name = record["gene_name"]
        sacch_ortholog = record["sacch_ortholog"]
        description = record["description"]
    else:
        st.error(f"Unable to locate gene metadata for Gene ID: {gene_id}, it should be of the form 'B9J08_######'")
        # Keep the ID the user gave as the locus tag, so the node still has
        # an id and a label; the remaining annotations are unknown.
        locus_tag_old = gene_id
        gene_name = None
        sacch_ortholog = None
        description = None
    old_locus_tags.append(locus_tag_old)
    gene_names.append(gene_name)
    sacch_orthologs.append(sacch_ortholog)
    descriptions.append(description)
# Log the length of every input list: they become the columns of node_info
# below, so a mismatch here explains a failure in the DataFrame constructor.
print(f"""
Constructing node_info
seed_gene_ids: {len(seed_gene_ids)},
neighbor_gene_ids: {len(neighbor_gene_ids)},
gene_index: {len(gene_ids)},
locus_tag_old: {len(old_locus_tags)},
gene_types: {len(gene_types)},
gene_name: {len(gene_names)},
sacc_ortholog: {len(sacch_orthologs)},
descriptions: {len(descriptions)}
""")

# One row per gene; gene_index numbers the genes 0..n-1 in gene_ids order and
# is used as the node key when the graph is laid out.
node_columns = {}
node_columns["gene_index"] = range(len(gene_ids))
node_columns["locus_tag_old"] = old_locus_tags
node_columns["gene_type"] = gene_types
node_columns["gene_name"] = gene_names
node_columns["sacch_ortholog"] = sacch_orthologs
node_columns["description"] = descriptions
node_info = pd.DataFrame(node_columns)
# Attach the node annotations to both ends of every edge: first for the
# feature_name_1 gene, then for the feature_name_2 gene. The second merge
# meets the columns added by the first one, so they are told apart with the
# "_a" (feature_name_1) and "_b" (feature_name_2) suffixes,
# e.g. gene_index_a / gene_index_b.
neighbors = (
    neighbors
    .merge(
        right = node_info,
        left_on = "feature_name_1",
        right_on = "locus_tag_old")
    .merge(
        right = node_info,
        left_on = "feature_name_2",
        right_on = "locus_tag_old",
        suffixes = ("_a", "_b"))
)
################################
# Use NetworkX to layout graph #
################################
# NOTE: CytoscapeJS may be able to lay out graphs by itself, but it is
# unclear how to do that through the streamlit-cytoscapejs interface :(

# Show the annotated edge table on the page.
st.write(neighbors)

# Undirected graph keyed by gene_index, with the co-expression score as the
# edge weight. Iterating the three columns together avoids building a row
# Series for every edge.
G = nx.Graph()
for gene_index_a, gene_index_b, score in zip(
        neighbors["gene_index_a"],
        neighbors["gene_index_b"],
        neighbors["score"]):
    G.add_edge(gene_index_a, gene_index_b, weight = score)

# The spring layout starts from random positions. Fixing the seed gives the
# same picture for the same network on every rerun of the script, instead of
# reshuffling the nodes each time any widget is touched.
layout = nx.spring_layout(G, seed = 0)
# Node fill color by gene type.
node_color_lut = {
    "seed" : "#4866F0", # blue
    "neighbor" : "#F0C547" # gold
}

# Cytoscape elements; this loop adds one node element per row of node_info.
elements = []
# Counts the nodes that have no edge, to place them on a grid of 8 per row.
singleton_index = 0
for gene_index, locus_tag, gene_type, gene_name in zip(
        node_info["gene_index"],
        node_info["locus_tag_old"],
        node_info["gene_type"],
        node_info["gene_name"]):
    if gene_index in layout:
        # Scale the spring layout coordinates and shift them by 1500/2.
        # float() turns the NumPy scalars into plain Python floats.
        layout_x = float(layout[gene_index][0]) * 600 + 1500/2
        layout_y = float(layout[gene_index][1]) * 600 + 1500/2
    else:
        # Genes without any edge are not in the layout; line them up in
        # rows of 8 at the top of the canvas.
        layout_x = (singleton_index % 8) * 150 + 100
        layout_y = (singleton_index // 8) * 50 + 30
        singleton_index += 1

    # Label with the gene name when there is a usable one, otherwise with the
    # locus tag. A missing name can be None, NaN or an empty string depending
    # on how the metadata encodes it, so test for a non-blank string rather
    # than only for None.
    has_gene_name = isinstance(gene_name, str) and gene_name.strip() != ""
    elements.append({
        "data": {
            "id": locus_tag,
            "label": gene_name if has_gene_name else locus_tag,
            "color": node_color_lut[gene_type]},
        "position": {
            "x" : layout_x,
            "y" : layout_y}})
# Add one edge element per row of neighbors. The line width grows with the
# co-expression score: the first bound the score exceeds decides the width,
# and anything at or below the last bound gets the thinnest line.
edge_width_steps = (
    (0.98, 20),
    (0.93, 15),
    (0.90, 10),
    (0.88, 8),
)
thinnest_edge_width = 5
for source_id, target_id, score in zip(
        neighbors["feature_name_1"],
        neighbors["feature_name_2"],
        neighbors["score"]):
    edge_width = thinnest_edge_width
    for lower_bound, width in edge_width_steps:
        if score > lower_bound:
            edge_width = width
            break
    elements.append({
        "data" : {
            "source" : source_id,
            "target" : target_id,
            "width" : edge_width}})
# Third column: download the annotated edge table shown on the page as a
# tab-separated file.
with col3:
    st.text('') # help alignment with input box
    st.download_button(
        label = "Download as TSV",
        data = neighbors.to_csv(sep ='\t').encode('utf-8'),
        file_name = "CaurisCEN_network.tsv",
        # the content is tab separated, so advertise it as TSV and not CSV
        mime = "text/tab-separated-values")
##########################################################
# Draw the network with Cytoscape.js

# Nodes are fixed-size rectangles with the label centered inside; the label
# text and the fill color are read from each element's "data".
# NOTE(review): "labelFontSize" does not look like a Cytoscape.js style
# property ("font-size" is the documented one), so it may have no effect --
# confirm before changing, a font size of 100 would dwarf the 140x30 nodes.
node_style = {
    "width": 140,
    "height": 30,
    "shape": "rectangle",
    "label" : "data(label)",
    "labelFontSize": 100,
    'background-color': 'data(color)',
    "text-halign": "center",
    "text-valign": "center",
}

# Edge thickness is read from the "width" stored on each edge element.
edge_style = {
    "width": "data(width)"
}

stylesheet = [
    {"selector": "node", "style": node_style},
    {"selector": "edge", "style": edge_style},
]

st.title("CaurisCEN Network")

# Render the nodes and edges; the component's return value is kept in
# clicked_elements.
clicked_elements = st_cytoscapejs(
    elements = elements,
    stylesheet = stylesheet,
    width = 1000,
    height= 1000,
    key = "1")