Spaces:
Sleeping
Sleeping
| import os | |
| import pandas as pd | |
| import numpy as np | |
# Short, human-readable summary of what this module is for.
description = (
    '\n'
    'This is a simple module for preprocessing data for the shiny app.\n'
)
def format_degs():
    '''
    Read every DEG table in 'data/differentially_expressed_genes/' and
    combine them into a single pd.DataFrame of significant genes.

    Each file is read as a tab-separated table. The dataset name is the
    part of the file name before the first '_D' and is stored in a new
    'Dataset' column. Only rows with 'pvals_adj' < 0.05 are kept.

    Returns
    -------
    pd.DataFrame
        The concatenated tables (original per-file row indices are
        kept) with the added 'Dataset' column, filtered to significant
        rows.

    Raises
    ------
    ValueError
        Raised by pd.concat when the directory contains no files.
    '''
    deg_dir = 'data/differentially_expressed_genes/'

    degs = {}
    # os.listdir returns names in arbitrary order; sorting makes the row
    # order of the combined table reproducible between runs/machines.
    for file in sorted(os.listdir(deg_dir)):
        ds = file.split('_D')[0]
        df = pd.read_csv(os.path.join(deg_dir, file), sep='\t')

        # Labeling data with dataset name (scalar is broadcast to all rows)
        df['Dataset'] = ds
        degs[ds] = df

    # Combining all dataframes
    degs = pd.concat(list(degs.values()))

    # Filtering out insignificant genes
    return degs[degs['pvals_adj'] < 0.05]
def get_gse():
    '''
    Read 'data/geneset_enrichment.tsv' and reshape the 'Hallmark RNA'
    rows into the long format shared by the other tables here.

    Returns
    -------
    pd.DataFrame
        Columns ['Dataset', 'Group', 'Value', 'Variable'] with a fresh
        0..n-1 index. 'Group' comes from the 'Group Name' column,
        'Value' from 'GSE (-log10(adj. p-val))', and 'Variable' is the
        constant label 'Gene Set Enrichment (-log10(adjusted p-value))'.
    '''
    # Reading in gse data
    gse = pd.read_csv('data/geneset_enrichment.tsv', sep='\t')

    # Subsetting to only hallmark
    hallmark = gse[gse['Gene Set'] == 'Hallmark RNA']

    var_name = 'Gene Set Enrichment (-log10(adjusted p-value))'

    # Whole columns are pulled out at once instead of walking the frame
    # row by row with iterrows(); the resulting table is the same.
    return pd.DataFrame({
        'Dataset': hallmark['Dataset'].tolist(),
        'Group': hallmark['Group Name'].tolist(),
        'Value': hallmark['GSE (-log10(adj. p-val))'].tolist(),
        'Variable': [var_name] * len(hallmark),
    })
def get_selection():
    '''
    Read the scMKL selection tables in 'data/group_selections/' and
    return the summed selection per dataset and group in long format.

    Every file is read as a tab-separated table and the tables are
    stacked. Only rows with Modality == 'RNA - hallmark' are kept.
    Group names have '_' replaced by ' ' and their first nine
    characters dropped (presumably the 'HALLMARK ' prefix -- confirm
    against the input files). 'Selected' is then summed over
    ['Dataset', 'Group', 'Modality'].

    Returns
    -------
    pd.DataFrame
        Columns ['Dataset', 'Group', 'Value', 'Variable'], where 'Value'
        is the summed 'Selected' and 'Variable' is the constant label
        'scMKL Selection Frequency'.

    Raises
    ------
    ValueError
        Raised by pd.concat when the directory contains no files.
    '''
    sel_dir = 'data/group_selections/'

    # Reading in scMKL selection
    frames = [pd.read_csv(os.path.join(sel_dir, file), sep='\t')
              for file in os.listdir(sel_dir)]
    selection = pd.concat(frames)

    # Filtering to only hallmark rna runs. The explicit copy makes the
    # column assignment below act on an independent frame rather than on
    # a slice of the concatenated table (avoids the chained-assignment
    # warning pandas emits for that pattern).
    selection = selection[selection['Modality'] == 'RNA - hallmark'].copy()

    # Formatting group names
    selection['Group'] = selection['Group'].apply(
        lambda name: name.replace('_', ' ')[9:]
    )

    # Grouping by group and summing selection
    groupby = ['Dataset', 'Group', 'Modality']
    totals = selection.groupby(groupby)['Selected'].sum().reset_index()

    var_name = 'scMKL Selection Frequency'

    # Columns are extracted directly instead of iterating with iterrows()
    return pd.DataFrame({
        'Dataset': totals['Dataset'].tolist(),
        'Group': totals['Group'].tolist(),
        'Value': totals['Selected'].tolist(),
        'Variable': [var_name] * len(totals),
    })
def get_overlap(degs, group_dict):
    '''
    Compute, per dataset and gene group, the number of DE gene entries
    found in the group divided by the size of the group.

    Parameters
    ----------
    degs : pd.DataFrame
        Must contain the columns 'Dataset' and 'names'.
    group_dict : dict
        Maps a group name to a sized collection of gene names.

    Returns
    -------
    pd.DataFrame
        Columns ['Dataset', 'Group', 'Value', 'Variable'] with one row
        per (dataset, group) pair. Datasets appear in the order they
        first occur in `degs`, groups in `group_dict` order. 'Value' is
        NaN for a group with no genes. 'Variable' is the constant label
        'Proportion of DE Features'.

    Notes
    -----
    Group names have '_' replaced by ' ' and their first nine characters
    dropped (presumably the 'HALLMARK ' prefix -- confirm).
    NOTE(review): the count is taken over the rows of `degs`, so a gene
    name repeated within a dataset is counted once per row -- confirm
    this is intended.
    '''
    df = {'Dataset' : [],
          'Group' : [],
          'Value' : [],
          'Variable' : []}

    var_name = 'Proportion of DE Features'

    # Group formatting and list conversion do not depend on the dataset,
    # so they are done once here rather than inside the dataset loop.
    groups = [(group.replace('_', ' ')[9:], list(genes))
              for group, genes in group_dict.items()]

    # unique() keeps first-appearance order; iterating a set() of strings
    # would give a row order that changes from run to run.
    for ds in degs['Dataset'].unique():

        # Getting array of deg genes for current dataset
        in_dataset = (degs['Dataset'] == ds).to_numpy()
        ds_degs = np.array(degs['names'][in_dataset], dtype=np.str_)

        for group_name, genes_list in groups:

            if genes_list:
                # Finding num of overlap between DE genes and groupings
                num_overlap = np.sum(np.isin(ds_degs, genes_list))
                # Taking proportion
                prop_overlap = num_overlap / len(genes_list)
            else:
                # Empty group: the proportion is undefined
                prop_overlap = float('nan')

            df['Dataset'].append(ds)
            df['Group'].append(group_name)
            df['Value'].append(prop_overlap)
            df['Variable'].append(var_name)

    return pd.DataFrame(df)
def hallmarkify_tsv():
    '''
    Build the combined hallmark table and write it to
    'data/hallmark_enrichment_selection_overlap.tsv' (per the original
    note, this file is for the GSEApy results tab).

    The table stacks, in this order, the DEG overlap proportions, the
    geneset enrichment values and the scMKL selection sums. It is saved
    tab-separated without the index. Returns None.
    '''
    sig_degs = format_degs()

    # NOTE: allow_pickle = True means this file is unpickled on load;
    # only point this at a trusted groupings file.
    groupings = np.load('data/hallmark_groupings.pkl', allow_pickle = True)

    tables = [
        get_overlap(sig_degs, groupings),
        get_gse(),
        get_selection(),
    ]

    out_path = 'data/hallmark_enrichment_selection_overlap.tsv'
    pd.concat(tables).to_csv(out_path, sep = '\t', index = False)