#!/usr/bin/env python3
"""Helpers that query the GDC REST API (https://api.gdc.cancer.gov).

The functions build filter/field parameters for individual GDC endpoints
(ssms, ssm_occurrences, cnv_occurrences, case_ssms, files, genes, projects,
analysis/*), issue the HTTP request, and reduce the JSON response to counts,
case-id lists, or short "GDC Result" sentences.

Importing this module reads ``csvs/gdc_projects.tsv`` (two directories above
this file's directory) into ``project_mappings``.
"""

import ast
import glob
import json
import os
from functools import reduce
from pathlib import Path

import pandas as pd
import requests

proj_root = Path(__file__).resolve().parent.parent

# match "lymphoid leukemia" in query to "lymphoid leukemias" in GDC disease_type
# load project_mappings: {project_id: [disease types..., project name]}
# the tsv file is created by a one-time run of get_gdc_project_ids() below
project_mappings = pd.read_csv(
    os.path.join(proj_root, "csvs", "gdc_projects.tsv"),
    sep="\t",
    index_col=0,
    names=["project", "desc"],
)
# the "desc" column stores the repr() of a Python list; parse it back safely
project_mappings["desc"] = project_mappings["desc"].apply(ast.literal_eval)
project_mappings = project_mappings["desc"].to_dict()


def get_gene_mutation_data(start, stop, step):
    """Download ``gene_aa_change`` for SSM records ``[start, stop)`` in chunks.

    The full range cannot be queried at once, so it is fetched ``step``
    records at a time. Each chunk's raw response text is written to
    ``<chunk_start>_<chunk_end>_gene.mutation.txt`` in the current directory.

    Args:
        start: offset of the first record to fetch.
        stop: offset one past the last record to fetch.
        step: number of records per request (must be non-zero).
    """
    for chunk_start in range(start, stop, step):
        chunk_end = min(chunk_start + step, stop)
        # GDC's `size` is a record count, not an end offset, so request only
        # the records belonging to this chunk (avoids overlapping downloads).
        curl_cmd = (
            "https://api.gdc.cancer.gov/ssms?fields=gene_aa_change&from={}&size={}".format(
                chunk_start, chunk_end - chunk_start
            )
        )
        response = requests.get(curl_cmd)
        out_file = "_".join([str(chunk_start), str(chunk_end), "gene.mutation.txt"])
        with open(out_file, "w") as response_out:
            response_out.write(response.text)


def process_gene_mutation_data():
    """Merge all ``*gene.mutation.txt`` downloads into one gene -> mutations map.

    Each ``gene_aa_change`` entry is split on a single space into gene and
    mutation. The result, ``{gene: [unique mutations in first-seen order]}``,
    is written to ``gdc_genes_mutations.json`` in the current directory.
    """
    # inner dicts act as insertion-ordered sets for O(1) duplicate checks
    mutations_by_gene = {}
    for path in glob.glob("*gene.mutation.txt"):
        with open(path, "r") as f_in:
            data = json.load(f_in)
        for item in data["data"]["hits"]:
            for gene_aa_change in item["gene_aa_change"]:
                gene, mutation = gene_aa_change.split(" ")
                mutations_by_gene.setdefault(gene, {})[mutation] = None
    gdc_genes = {gene: list(muts) for gene, muts in mutations_by_gene.items()}
    with open("gdc_genes_mutations.json", "w") as f_out:
        json.dump(gdc_genes, f_out, indent=4)


# this function creates the project mappings tsv file
# only to be run once
def get_gdc_project_ids(start, stop):
    """Fetch GDC projects and write ``gdc_projects.tsv`` to the current directory.

    Each line of the file is ``<project_id>\\t<list repr>`` where the list is
    the project's disease types followed by its name.

    Args:
        start: value sent as the ``from`` query parameter.
        stop: value sent as the ``size`` query parameter.

    Returns:
        dict mapping project_id to ``disease_type + [name]``; empty if the
        request or the file write failed.
    """
    mappings = {}
    curl_cmd = "https://api.gdc.cancer.gov/projects?fields=project_id,disease_type,primary_site,name&from={}&size={}".format(
        start, stop
    )
    out_file = "gdc_projects.tsv"
    try:
        response = requests.get(curl_cmd)
        # parse before opening the file so a bad response cannot truncate an
        # existing tsv
        hits = response.json()["data"]["hits"]
        with open(out_file, "w") as response_out:
            for item in hits:
                disease_type_and_name = item["disease_type"] + [item["name"]]
                line = f"{item['project_id']}\t{disease_type_and_name}\n"
                response_out.write(line)
                mappings[item["project_id"]] = disease_type_and_name
    except Exception as e:
        print("unable to execute GDC API request {}".format(str(e)))
    return mappings


def get_ssm_id(gene, mutation):
    """Look up the id of the first SSM hit whose ``gene_aa_change`` is "<gene> <mutation>".

    Returns:
        The id of the first hit, or None if the request failed or returned
        no hits.
    """
    ssm_id_endpt = "https://api.gdc.cancer.gov/ssms"
    filters = {
        "op": "=",
        "content": {"field": "ssms.gene_aa_change", "value": gene + " " + mutation},
    }
    params = {
        "filters": json.dumps(filters),
        "fields": "mutation_type",
        "expand": ["consequence.transcript"],
        "size": 10,
    }
    try:
        print('build API call, endpt: {}'.format(ssm_id_endpt))
        print('params: {}'.format(params))
        response = requests.get(ssm_id_endpt, params=params)
        response_json = json.loads(response.content)
        ssm_id = response_json["data"]["hits"][0]["id"]
        print('obtained ssm id {}'.format(ssm_id))
    except Exception as e:
        print("unable to execute GDC API request {}".format(str(e)))
        ssm_id = None
    return ssm_id


def get_ssm_counts(ssm_id, cancer_entities):
    """Count occurrences of one SSM per project.

    Args:
        ssm_id: SSM id to filter on (``ssm.ssm_id``).
        cancer_entities: iterable of project ids to query, one request each.

    Returns:
        ``{project_id: {"case_id_list": [...], "ssm_counts": int}}`` built
        from the returned hits. Projects whose request failed or had no hits
        are absent. At most 1000 hits are requested per project.
    """
    ssm_occurrences_endpt = "https://api.gdc.cancer.gov/ssm_occurrences"
    fields = ",".join(["case.project.project_id", "case.case_id"])
    # get project level counts of ssm
    ssm_counts_by_project = {}
    for ce in cancer_entities:
        filters = {
            "op": "and",
            "content": [
                {"op": "=", "content": {"field": "ssm.ssm_id", "value": ssm_id}},
                {
                    "op": "=",
                    "content": {"field": "case.project.project_id", "value": ce},
                },
            ],
        }
        params = {"filters": json.dumps(filters), "fields": fields, "size": 1000}
        try:
            print('build API call, endpt: {}'.format(ssm_occurrences_endpt))
            print('params: {}'.format(params))
            response = requests.get(ssm_occurrences_endpt, params=params)
            ssm_counts = json.loads(response.content)
            for item in ssm_counts["data"]["hits"]:
                project_name = item["case"]["project"]["project_id"]
                project_entry = ssm_counts_by_project.setdefault(
                    project_name, {"case_id_list": []}
                )
                project_entry["case_id_list"].append(item["case"]["case_id"])
                project_entry["ssm_counts"] = project_entry.get("ssm_counts", 0) + 1
        except Exception as e:
            print("unable to execute GDC API request {}".format(str(e)))
    return ssm_counts_by_project


def _count_cases_with_variation_data(project, variation_data):
    """Return the case_ssms pagination total for a project, or 0 on failure.

    Args:
        project: project id matched against ``project.project_id``.
        variation_data: value for the ``available_variation_data`` "in"
            filter, passed through unchanged (a string or a list of strings).
    """
    case_ssm_endpt = "https://api.gdc.cancer.gov/case_ssms"
    fields = ",".join(["project.project_id", "available_variation_data"])
    filters = {
        "op": "and",
        "content": [
            {
                "op": "in",
                "content": {
                    "field": "available_variation_data",
                    "value": variation_data,
                },
            },
            {"op": "=", "content": {"field": "project.project_id", "value": project}},
        ],
    }
    params = {"filters": json.dumps(filters), "fields": fields, "size": 1000}
    try:
        print('build API call, endpt: {}'.format(case_ssm_endpt))
        print('params: {}'.format(params))
        response = requests.get(case_ssm_endpt, params=params)
        response_json = json.loads(response.content)
        total_case_count = response_json["data"]["pagination"]["total"]
    except Exception as e:
        print("unable to execute GDC API request {}".format(str(e)))
        total_case_count = 0
    return total_case_count


def get_available_cnv_data_for_project(project):
    """Return the number of cases in ``project`` with CNV data (0 on failure)."""
    return _count_cases_with_variation_data(project, "cnv")


def get_available_ssm_data_for_project(project):
    """Return the number of cases in ``project`` with SSM data (0 on failure)."""
    # previously the failure path left the count unbound and raised
    # UnboundLocalError; it now returns 0 like the sibling functions
    return _count_cases_with_variation_data(project, "ssm")


def get_top_mutated_genes_by_project(cancer_entities, top_k):
    """Return the first ``top_k`` hits of top_mutated_genes_by_project per project.

    Args:
        cancer_entities: project ids; if empty, every key of the module-level
            ``project_mappings`` is queried.
        top_k: number of hits to keep per project.

    Returns:
        ``{project_id: hits[:top_k]}``; projects whose request failed are absent.
    """
    # TODO: top_k should be recognized from the query by the caller
    top_mutated_genes_by_project = {}
    # if cancer_entities is empty, initialize some entities
    if not cancer_entities:
        cancer_entities = list(project_mappings.keys())
    endpt = "https://api.gdc.cancer.gov/analysis/top_mutated_genes_by_project"
    fields = ",".join(["gene_id", "symbol"])
    for ce in cancer_entities:
        filters = {
            "op": "and",
            "content": [
                {
                    "op": "in",
                    "content": {"field": "case.project.project_id", "value": [ce]},
                }
            ],
        }
        params = {"filters": json.dumps(filters), "fields": fields, "size": 1000}
        try:
            print('build API call, endpt: {}'.format(endpt))
            print('params: {}'.format(params))
            response = requests.get(endpt, params=params)
            response_json = json.loads(response.content)
            top_mutated_genes_by_project[ce] = response_json["data"]["hits"][:top_k]
        except Exception as e:
            print("unable to execute GDC API request {}".format(str(e)))
    return top_mutated_genes_by_project


def return_joint_single_cnv_frequency(cnv, cnv_change, cnv_change_5_category):
    """Turn per-project CNV case lists into frequency sentences.

    Args:
        cnv: ``{project_id: {gene: {"case_id_list": [...]}}}``.
        cnv_change: "Loss" or "Gain"; only consulted when
            ``cnv_change_5_category`` is empty.
        cnv_change_5_category: category label used in single-gene sentences.

    Returns:
        List of sentences, one per project. With more than one gene the
        sentence reports the share of cases carrying the CNV in every gene;
        with one gene it reports that gene's frequency. The denominator is
        ``get_available_cnv_data_for_project``; a zero denominator yields 0.0.
    """
    result_text = []
    # set category for heterozygous del
    if not cnv_change_5_category and cnv_change == "Loss":
        cnv_change_5_category = "Heterozygous Deletion"

    for ce, genes_to_cases in cnv.items():
        genes = list(genes_to_cases)
        if not genes:
            # nothing to report for a project without any gene entry
            continue
        total_number_of_cases_with_cnv_data = get_available_cnv_data_for_project(ce)
        if not total_number_of_cases_with_cnv_data:
            print('could not retrieve total number of cases with CNV data for {}'.format(ce))
            total_number_of_cases_with_cnv_data = 0
        print('\nStep 5: Query GDC and process results\n')
        print('total number of cases with CNV data {}'.format(
            total_number_of_cases_with_cnv_data))

        if len(genes) > 1:
            cases_with_cnvs = [set(genes_to_cases[g]["case_id_list"]) for g in genes]
            print('genes: {}'.format(genes))
            num_cases_with_cnvs = [len(cases) for cases in cases_with_cnvs]
            print('number of cases with CNVs: {}'.format(num_cases_with_cnvs))
            print('getting shared cases...')
            shared_cases = list(reduce(lambda x, y: x & y, cases_with_cnvs))
            print('number of shared cases {}'.format(len(shared_cases)))
            print('preparing a GDC Result for query augmentation...')
            try:
                joint_frequency = round(
                    (len(shared_cases) / total_number_of_cases_with_cnv_data) * 100, 2
                )
            except ZeroDivisionError:
                joint_frequency = 0.0
            gdc_result = "joint frequency in {} is {}%".format(ce, joint_frequency)
        else:
            gene = genes[0]
            num_cases_with_cnvs = len(set(genes_to_cases[gene]["case_id_list"]))
            print('number of cases with cnvs {}'.format(num_cases_with_cnvs))
            try:
                frequency = round(
                    (num_cases_with_cnvs / total_number_of_cases_with_cnv_data) * 100, 2
                )
            except ZeroDivisionError:
                frequency = 0.0
            print('preparing a GDC Result for query augmentation...')
            gdc_result = "The frequency of {} {} in {} is {}%".format(
                gene, cnv_change_5_category, ce, frequency
            )
        print('prepared GDC Result: {}'.format(gdc_result))
        result_text.append(gdc_result)
    return result_text


def get_cnv_filter_with_cnv_change_category(cnv_change, ce, ge, cnv_change_5_category):
    """Build the cnv_occurrences filter for one project/gene/CNV-change combination.

    Args:
        cnv_change: value for ``cnv.cnv_change``.
        ce: project id for ``case.project.project_id``.
        ge: gene symbol for ``cnv.consequence.gene.symbol``.
        cnv_change_5_category: value for ``cnv.cnv_change_5_category``.

    Returns:
        A filter dict AND-ing the four conditions.
    """
    cnv_filter = {
        "op": "and",
        "content": [
            {"op": "in", "content": {"field": "cnv.cnv_change", "value": [cnv_change]}},
            {
                "op": "in",
                "content": {
                    "field": "cnv.cnv_change_5_category",
                    "value": [cnv_change_5_category],
                },
            },
            {
                "op": "=",
                "content": {"field": "cnv.consequence.gene.symbol", "value": ge},
            },
            {"op": "=", "content": {"field": "case.project.project_id", "value": ce}},
        ],
    }
    return cnv_filter


def get_freq_cnv_loss_or_gain(gene_entities, cancer_entities, query, cnv_and_ssm_flag):
    """Collect cases with a CNV loss/gain per project and gene.

    The CNV direction is inferred from the lowercased query: any loss term
    selects "Loss" ("Homozygous Deletion" if "homozygous" is present),
    otherwise "Gain" ("Amplification" if "amplification" is present).

    Args:
        gene_entities: gene symbols to query.
        cancer_entities: project ids; if empty, all ``project_mappings`` keys.
        query: free-text user query.
        cnv_and_ssm_flag: if truthy, return the raw case-id mapping instead of
            sentences.

    Returns:
        If ``cnv_and_ssm_flag``: ``{project: {gene: {"case_id_list": [...]}}}``.
        Otherwise: ``(result_text, projects)`` where ``result_text`` comes from
        ``return_joint_single_cnv_frequency`` and ``projects`` are the projects
        with at least one successful request.
    """
    cnv = {}
    lc_query = query.lower()
    # V1 is only co-deletion, or co-gain
    # terms are matched against the lowercased query, so they must be lowercase
    loss_terms = ["loss", "loh", "deletion", "co-deletion", "lost"]
    if any(term in lc_query for term in loss_terms):
        cnv_change = "Loss"
        cnv_change_5_category = (
            "Homozygous Deletion" if "homozygous" in lc_query else "Loss"
        )
    else:
        cnv_change = "Gain"
        cnv_change_5_category = (
            "Amplification" if "amplification" in lc_query else "Gain"
        )

    if not cancer_entities:
        cancer_entities = list(project_mappings.keys())

    endpt = "https://api.gdc.cancer.gov/cnv_occurrences"
    fields = ",".join(
        [
            "cnv.chromosome",
            "cnv.cnv_change",
            "cnv.cnv_change_5_category",
            "cnv.consequence.gene.symbol",
            "case.case_id",
            "case.project.project_id",
        ]
    )
    for ce in cancer_entities:
        for ge in gene_entities:
            filters = get_cnv_filter_with_cnv_change_category(
                cnv_change, ce, ge, cnv_change_5_category
            )
            params = {"filters": json.dumps(filters), "fields": fields, "size": 1000}
            try:
                print('build API call, endpt: {}'.format(endpt))
                print('params: {}'.format(params))
                response = requests.get(endpt, params=params)
                response_json = json.loads(response.content)
                # read the hits inside the try so a malformed/error response
                # is skipped like any other failed request
                hits = response_json["data"]["hits"]
            except Exception as e:
                # skip if response not successful
                print("exception: {}".format(str(e)))
                continue
            case_id_list = [
                item["case"]["case_id"] for item in hits if item["case"]["case_id"]
            ]
            cnv.setdefault(ce, {}).setdefault(ge, {})["case_id_list"] = case_id_list

    if cnv_and_ssm_flag:
        return cnv
    result_text = return_joint_single_cnv_frequency(
        cnv, cnv_change, cnv_change_5_category
    )
    return result_text, list(cnv.keys())


def get_msi_frequency(cancer_entities):
    """Compute the share of WXS/WGS BAM files flagged "MSI" per project.

    Only files with a non-empty ``msi_status`` are counted. Projects whose
    request failed, or that have no file with MSI information, are skipped.

    Args:
        cancer_entities: project ids; if empty, all ``project_mappings`` keys.

    Returns:
        ``(result_text, projects)``: one sentence per successful project and
        the list of those projects.
    """
    msi_h_frequency = {}
    result_text = []
    # init some starting cancer entities if none
    if not cancer_entities:
        cancer_entities = list(project_mappings.keys())

    endpt = "https://api.gdc.cancer.gov/files"
    fields = ",".join(
        [
            "cases.project.project_id",
            "msi_score",
            "msi_status",
            "experimental_strategy",
        ]
    )
    for ce in cancer_entities:
        filters = {
            "op": "and",
            "content": [
                {"op": "=", "content": {"field": "data_format", "value": "BAM"}},
                {
                    "op": "in",
                    "content": {
                        "field": "experimental_strategy",
                        "value": ["WXS", "WGS"],
                    },
                },
                {
                    "op": "in",
                    "content": {"field": "cases.project.project_id", "value": [ce]},
                },
            ],
        }
        params = {"filters": json.dumps(filters), "fields": fields, "size": 10000}
        try:
            print('build API call, endpt: {}'.format(endpt))
            print('params: {}'.format(params))
            response = requests.get(endpt, params=params)
            response_json = json.loads(response.content)
            # only score files where MSI status is computed (present, not None)
            msi_results = [
                item["msi_status"]
                for item in response_json["data"]["hits"]
                if item.get("msi_status")
            ]
            msi_pos = msi_results.count('MSI')
            msi_total = len(msi_results)
            if msi_total == 0:
                # no denominator: skip the project instead of reporting a
                # division error as an API failure
                print('no BAM files with MSI information for {}'.format(ce))
                continue
            freq = msi_pos / msi_total
            print('\nStep 5: Query GDC and process results\n')
            print('obtained {} BAM files with MSI tag, out of a total of {} BAM files with MSI information'.format(
                msi_pos, msi_total
            ))
            msi_h_frequency[ce] = {"frequency": round(freq * 100, 2)}
            print('preparing a GDC Result for query augmentation...')
            gdc_result = "The frequency of MSI in {} is {}%".format(
                ce, msi_h_frequency[ce]["frequency"]
            )
            print('prepared GDC Result: {}'.format(gdc_result))
            result_text.append(gdc_result)
        except Exception as e:
            print("unable to execute GDC API request {}".format(str(e)))
    ce_api_success = list(msi_h_frequency.keys())
    return result_text, ce_api_success


def get_ensembl_gene_ids(gene_entities):
    """Resolve gene symbols to the ``gene_id`` of their first GDC genes hit.

    Returns:
        List of gene ids in input order; symbols whose lookup failed or had
        no hits are omitted.
    """
    ensembl_gene_ids = []
    endpt = "https://api.gdc.cancer.gov/genes"
    for ge in gene_entities:
        filters = {
            "op": "and",
            "content": [{"op": "=", "content": {"field": "symbol", "value": ge}}],
        }
        params = {"filters": json.dumps(filters), "fields": "gene_id", "size": 100}
        try:
            print('build API call, endpt: {}'.format(endpt))
            print('params: {}'.format(params))
            response = requests.get(endpt, params=params)
            response_json = json.loads(response.content)
            ensembl_gene_ids.append(response_json["data"]["hits"][0]["gene_id"])
        except Exception as e:
            print("unable to execute GDC API request {}".format(str(e)))
    return ensembl_gene_ids


def get_total_variation_data_for_project(project):
    """Return the number of cases in ``project`` with SSM or CNV data (0 on failure)."""
    return _count_cases_with_variation_data(project, ["ssm", "cnv"])


def get_cases_with_ssms_in_a_gene(project, gene_name):
    """Fetch the unique case ids in ``project`` with an SSM in ``gene_name``.

    Returns:
        ``{"case_id_list": [unique case ids]}``, or an empty dict if the
        request failed. At most 1000 occurrences are requested.
    """
    result = {}
    endpt = "https://api.gdc.cancer.gov/ssm_occurrences"
    filters = {
        "op": "and",
        "content": [
            {
                "op": "=",
                "content": {"field": "case.project.project_id", "value": project},
            },
            {
                "op": "in",
                "content": {
                    "field": "ssm.consequence.transcript.gene.symbol",
                    "value": gene_name,
                },
            },
        ],
    }
    params = {"filters": json.dumps(filters), "fields": "case.case_id", "size": 1000}
    try:
        print('build API call, endpt: {}'.format(endpt))
        print('params: {}'.format(params))
        response = requests.get(endpt, params=params)
        response_json = json.loads(response.content)
        case_ids = {
            item["case"]["case_id"]
            for item in response_json["data"]["hits"]
            if item["case"]["case_id"]
        }
        result["case_id_list"] = list(case_ids)
    except Exception as e:
        print("unable to execute GDC API request {}".format(str(e)))
    return result


def run_cnv_ssm_api(decompose_result, cancer_entities, query):
    """Report, per project, the share of cases with both a CNV and an SSM.

    Args:
        decompose_result: dict read for ``"cnv_gene"`` (gene with the CNV)
            and ``"mut_gene"`` (gene with the SSM).
        cancer_entities: project ids to report on.
        query: free-text query, forwarded to ``get_freq_cnv_loss_or_gain`` to
            infer the CNV direction.

    Returns:
        ``(result, cancer_entities)`` where ``result`` holds one sentence per
        project; projects whose data could not be computed get a
        "not available" sentence.
    """
    result = []
    cnv_gene = decompose_result["cnv_gene"]
    cnv_result = get_freq_cnv_loss_or_gain(
        [cnv_gene], cancer_entities, query, cnv_and_ssm_flag=True
    )
    for ce in cancer_entities:
        try:
            # returns {"case_id_list": [...]} of cases with ssms in the gene
            ssm_result = get_cases_with_ssms_in_a_gene(
                project=ce, gene_name=decompose_result["mut_gene"]
            )
            total_case_count = get_total_variation_data_for_project(project=ce)
            print('\nStep 5: Query GDC and process results\n')
            # calculate overlap of cases and return freq
            print('getting shared cases with CNV and SSMs...')
            cases_with_ssm_and_cnvs = [
                set(cnv_result[ce][cnv_gene]["case_id_list"]),
                set(ssm_result["case_id_list"]),
            ]
            shared_cases = list(reduce(lambda x, y: x & y, cases_with_ssm_and_cnvs))
            print('number of shared_cases {}'.format(len(shared_cases)))
            print('total case count {}'.format(total_case_count))
            freq = round((len(shared_cases) / total_case_count) * 100, 2)
            print('preparing a GDC Result for query augmentation...')
            gdc_result = "The joint frequency in {} is {}%".format(ce, freq)
        except Exception as e:
            # missing CNV/SSM data or a zero case count for this project
            gdc_result = "joint freq in {} is not available".format(ce)
        print('prepared GDC Result {}'.format(gdc_result))
        result.append(gdc_result)
    return result, cancer_entities


def get_top_cases_counts_by_gene(gene_entities, cancer_entities):
    """Report, per project, the share of cases with mutations in the given genes.

    The numerator is the project's ``doc_count`` bucket from
    ``analysis/top_cases_counts_by_genes`` (note this covers ssm + cnv); the
    denominator is ``get_total_variation_data_for_project``.

    Args:
        gene_entities: gene symbols, resolved via ``get_ensembl_gene_ids``.
        cancer_entities: project ids; if empty, all ``project_mappings`` keys.

    Returns:
        ``(result, cancer_entities)``: sentences for projects found in the
        response (or "frequency unavailable" on error) and the list of
        projects considered.

    Raises:
        Exceptions from the HTTP request or JSON decoding propagate.
    """
    top_cases_counts_by_gene = {}
    result = []
    ensembl_gene_ids = get_ensembl_gene_ids(gene_entities)
    if not cancer_entities:
        cancer_entities = list(project_mappings.keys())
    if not cancer_entities:
        return result, []

    # the request does not depend on the project, so issue it once
    endpt = "https://api.gdc.cancer.gov/analysis/top_cases_counts_by_genes?gene_ids={}".format(
        ",".join(ensembl_gene_ids)
    )
    print('build API call, endpt: {}'.format(endpt))
    response = requests.get(endpt)
    response_json = json.loads(response.content)

    for ce in cancer_entities:
        project_counts = top_cases_counts_by_gene[ce] = {}
        gdc_result = None
        try:
            for item in response_json["aggregations"]["projects"]["buckets"]:
                if item["key"] != ce:
                    continue
                cases_with_mutations = item["doc_count"]
                total_case_count = get_total_variation_data_for_project(project=ce)
                project_counts["cases_with_mutations"] = cases_with_mutations
                project_counts["cases_without_mutations"] = (
                    total_case_count - cases_with_mutations
                )
                project_counts["total_case_count"] = total_case_count
                print('\nStep 5: Query GDC and process results\n')
                print('obtained {} cases with mutations and a total case count of {}'.format(
                    cases_with_mutations, total_case_count
                ))
                freq = cases_with_mutations / total_case_count
                project_counts["frequency"] = round(freq * 100, 2)
                print('preparing a GDC Result for query augmentation...')
                gdc_result = "The frequency of cases with mutations in {} is {}%".format(
                    ce, project_counts["frequency"]
                )
                result.append(gdc_result)
        except Exception as e:
            gdc_result = "frequency unavailable from API for {}".format(ce)
            result.append(gdc_result)
        # only report when this project actually produced a result, so a
        # stale sentence from a previous project is never printed
        if gdc_result is not None:
            print('prepared GDC Result {}'.format(gdc_result))
    cancer_entities = list(top_cases_counts_by_gene.keys())
    return result, cancer_entities


def get_project_summary(cancer_entities):
    """Fetch the expanded summary of each project.

    Returns:
        ``{project_id: {"project_summary": <response "data">}}``.

    Raises:
        Exceptions from the HTTP request or JSON decoding propagate, as does
        KeyError when a response has no ``"data"`` member.
    """
    project_summary = {}
    for ce in cancer_entities:
        endpt = "https://api.gdc.cancer.gov/projects/{}?expand=summary,summary.experimental_strategies,summary.data_categories".format(
            ce
        )
        response = requests.get(endpt)
        response_json = json.loads(response.content)
        # create the per-project entry; indexing a missing key raised KeyError
        project_summary[ce] = {"project_summary": response_json["data"]}
    return project_summary


def map_cancer_entities_to_project(initial_cancer_entities, project_mappings):
    """Map cancer entity names to GDC project ids via the projects endpoint.

    Args:
        initial_cancer_entities: names matched against the project ``name``.
        project_mappings: currently unused by this function; kept for
            interface compatibility.

    Returns:
        ``{entity: project_id}`` for entities with at least one hit (the last
        hit wins). Entities whose lookup failed are omitted.
    """
    project_match = {}
    endpoint = "https://api.gdc.cancer.gov/projects"
    fields = ",".join(["project_id", "disease_type", "name"])
    for ce in initial_cancer_entities:
        filters = {"op": "=", "content": {"field": "name", "value": [ce]}}
        params = {"filters": json.dumps(filters), "fields": fields, "size": 10000}
        try:
            response = requests.get(endpoint, params=params)
            response_json = json.loads(response.content)
            for item in response_json["data"]["hits"]:
                project_match[ce] = item["project_id"]
        except Exception as e:
            # best effort: report and move on to the next entity
            print(
                "unable to return a match from projects endpt for {}: {}".format(
                    ce, str(e)
                )
            )
    return project_match