{ "cells": [ { "cell_type": "markdown", "id": "744b9f11-6ef8-4409-a388-fe860480c9de", "metadata": {}, "source": [ "# Processing the Reasoning Trace Data and Adding in Nucleotides" ] }, { "cell_type": "code", "execution_count": null, "id": "8950d38a-dfa9-4dbd-b388-941dec69b3ee", "metadata": {}, "outputs": [], "source": [ "cd kegg_data" ] }, { "cell_type": "code", "execution_count": 2, "id": "a1c3d972-c52e-4d73-9816-e970fca3e1bb", "metadata": {}, "outputs": [], "source": [ "import json\n", "from Bio import SeqIO" ] }, { "cell_type": "code", "execution_count": 3, "id": "c80d7741-7aaa-4c28-a93a-ad955f3da6bb", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "mkdir: processed_variants 1450 with seqs: File exists\n" ] } ], "source": [ "!mkdir 'processed_variants 1450 with seqs'" ] }, { "cell_type": "code", "execution_count": 4, "id": "e4021560-9130-4fdf-a640-15b5da6935a0", "metadata": {}, "outputs": [], "source": [
"# Attach the reference and variant nucleotide sequences to every processed variant JSON.\n",
"# Reads   processed_variants first 700/KEGG_{i}_processed.json  and  nt_seq/KEGG_{i}.txt\n",
"# Writes  processed_variants 1450 with seqs/KEGG_{i}_with_seqs.json\n",
"for i in range(1,1450):\n",
"    # opened the json file\n",
"    with open(f'processed_variants first 700/KEGG_{i}_processed.json', 'r') as file:\n",
"        data = json.load(file)\n",
"\n",
"    # open the nt file; record 0 is used as the reference and record 1 as the variant\n",
"    fasta_file = f\"nt_seq/KEGG_{i}.txt\"\n",
"    sequence_list = list(SeqIO.parse(fasta_file, \"fasta\"))\n",
"    if len(sequence_list) < 2:\n",
"        # Fail with the offending file name instead of a bare IndexError below\n",
"        raise ValueError(\n",
"            f\"{fasta_file}: expected at least 2 FASTA records, found {len(sequence_list)}\"\n",
"        )\n",
"    ref_seq = sequence_list[0].seq\n",
"    var_seq = sequence_list[1].seq\n",
"\n",
"    # Add sequences to the JSON data\n",
"    data[\"reference_sequence\"] = str(ref_seq)\n",
"    data[\"variant_sequence\"] = str(var_seq)\n",
"\n",
"    # Save the updated JSON to a new file\n",
"    with open(f'processed_variants 1450 with seqs/KEGG_{i}_with_seqs.json', 'w') as out_file:\n",
"        json.dump(data, out_file, indent=2)"
] }, { "cell_type": "markdown", "id": "4db8af16-a11f-4987-b1a6-db552c6714fb", "metadata": {}, "source": [ "# Creating the Final KEGG SFT and RL Dataset\n", "\n", "# Final KEGG Dataset Creation\n", "\n", "This section creates the final machine learning dataset by combining variant data with sequences and generating structured
question-answer pairs for biological reasoning tasks." ] }, { "cell_type": "code", "execution_count": null, "id": "f9517d40-74e3-4ddb-bd16-95f9ab7927aa", "metadata": {}, "outputs": [], "source": [ "cd kegg_data" ] }, { "cell_type": "code", "execution_count": 2, "id": "53c5948f-4bde-432d-b35c-34c733eb9ad1", "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import json\n", "import ast" ] }, { "cell_type": "code", "execution_count": 3, "id": "60c66a0d-359b-4d2a-8427-53f4d18d1047", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " 
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Var_IDNetworkEntrySourceIDTranscriptIDNucChangeChrStartEnd...Network ExpandedPathwayClassDiseaseGeneVariant_NameVariant_GeneVariant_Gene InfoVariant_TypeDisease_Names
0KEGG_1N000731019v2ClinVar16929NC_000012.12NaN125775164657751646...((595,894,896)+1019v2) -> 5925 // (1869,1870,1...{'hsa05218': 'Melanoma'}{'nt06268': 'Melanoma', 'nt06230': 'Cell cycle...{'H00038': 'Melanoma is a form of skin cancer ...{'595': 'CCND1; cyclin D1', '894': 'CCND2; cyc...CDK4 mutationCDK4cyclin dependent kinase 4 [KO:K02089]NaN{'H00038': 'Melanoma'}
1KEGG_2N000731019v2dbSNPrs104894340NC_000012.12NaN125775164657751646...((595,894,896)+1019v2) -> 5925 // (1869,1870,1...{'hsa05218': 'Melanoma'}{'nt06268': 'Melanoma', 'nt06230': 'Cell cycle...{'H00038': 'Melanoma is a form of skin cancer ...{'595': 'CCND1; cyclin D1', '894': 'CCND2; cyc...CDK4 mutationCDK4cyclin dependent kinase 4 [KO:K02089]NaN{'H00038': 'Melanoma'}
2KEGG_3N000731019v2dbSNPrs104894340NC_000012.12NaN125775164657751646...((595,894,896)+1019v2) -> 5925 // (1869,1870,1...{'hsa05218': 'Melanoma'}{'nt06268': 'Melanoma', 'nt06230': 'Cell cycle...{'H00038': 'Melanoma is a form of skin cancer ...{'595': 'CCND1; cyclin D1', '894': 'CCND2; cyc...CDK4 mutationCDK4cyclin dependent kinase 4 [KO:K02089]NaN{'H00038': 'Melanoma'}
3KEGG_4N000731019v2ClinVar16928NC_000012.12NaN125775164757751647...((595,894,896)+1019v2) -> 5925 // (1869,1870,1...{'hsa05218': 'Melanoma'}{'nt06268': 'Melanoma', 'nt06230': 'Cell cycle...{'H00038': 'Melanoma is a form of skin cancer ...{'595': 'CCND1; cyclin D1', '894': 'CCND2; cyc...CDK4 mutationCDK4cyclin dependent kinase 4 [KO:K02089]NaN{'H00038': 'Melanoma'}
4KEGG_5N000731019v2dbSNPrs11547328NC_000012.12NaN125775164757751647...((595,894,896)+1019v2) -> 5925 // (1869,1870,1...{'hsa05218': 'Melanoma'}{'nt06268': 'Melanoma', 'nt06230': 'Cell cycle...{'H00038': 'Melanoma is a form of skin cancer ...{'595': 'CCND1; cyclin D1', '894': 'CCND2; cyc...CDK4 mutationCDK4cyclin dependent kinase 4 [KO:K02089]NaN{'H00038': 'Melanoma'}
..................................................................
1444KEGG_1445N002449817v1COSM6196635ENST00000393623.6c.706G>T191049219610492196...9817v1 // 4780 => (3162,1728,119391,221357,293...{'hsa05225': 'Hepatocellular carcinoma'}{'nt06263': 'Hepatocellular carcinoma', 'nt062...{'H00048': 'Hepatocellular carcinoma (HCC) is ...{'9817': 'KEAP1; kelch like ECH associated pro...KEAP1 mutationKEAP1kelch like ECH associated protein 1 [KO:K10456]NaN{'H00048': 'Hepatocellular carcinoma;'}
1445KEGG_1446N002449817v1COSM6196637ENST00000393623.6c.548A>G191049948610499486...9817v1 // 4780 => (3162,1728,119391,221357,293...{'hsa05225': 'Hepatocellular carcinoma'}{'nt06263': 'Hepatocellular carcinoma', 'nt062...{'H00048': 'Hepatocellular carcinoma (HCC) is ...{'9817': 'KEAP1; kelch like ECH associated pro...KEAP1 mutationKEAP1kelch like ECH associated protein 1 [KO:K10456]NaN{'H00048': 'Hepatocellular carcinoma;'}
1446KEGG_1447N00258999v2COSM4766271ENST00000621016.4c.662A>G166880882368808823...999v2 // 1499 -> (6932,83439,6934,51176) => (4...{'hsa05226': 'Gastric cancer'}{'nt06261': 'Gastric cancer', 'nt06215': 'WNT ...{'H00018': \"Gastric cancer (GC) is one of the ...{'999': 'CDH1; cadherin 1', '1499': 'CTNNB1; c...CDH1 mutationCDH1cadherin 1 [KO:K05689]NaN{'H00018': 'Gastric cancer'}
1447KEGG_1448N00258999v2COSM4766211ENST00000621016.4c.755T>G166881026468810264...999v2 // 1499 -> (6932,83439,6934,51176) => (4...{'hsa05226': 'Gastric cancer'}{'nt06261': 'Gastric cancer', 'nt06215': 'WNT ...{'H00018': \"Gastric cancer (GC) is one of the ...{'999': 'CDH1; cadherin 1', '1499': 'CTNNB1; c...CDH1 mutationCDH1cadherin 1 [KO:K05689]NaN{'H00018': 'Gastric cancer'}
1448KEGG_1449N00258999v2COSM1379150ENST00000621016.4c.769G>A166881027868810278...999v2 // 1499 -> (6932,83439,6934,51176) => (4...{'hsa05226': 'Gastric cancer'}{'nt06261': 'Gastric cancer', 'nt06215': 'WNT ...{'H00018': \"Gastric cancer (GC) is one of the ...{'999': 'CDH1; cadherin 1', '1499': 'CTNNB1; c...CDH1 mutationCDH1cadherin 1 [KO:K05689]NaN{'H00018': 'Gastric cancer'}
\n", "

1449 rows × 24 columns

\n", "
" ], "text/plain": [ " Var_ID Network Entry Source ID TranscriptID \\\n", "0 KEGG_1 N00073 1019v2 ClinVar 16929 NC_000012.12 \n", "1 KEGG_2 N00073 1019v2 dbSNP rs104894340 NC_000012.12 \n", "2 KEGG_3 N00073 1019v2 dbSNP rs104894340 NC_000012.12 \n", "3 KEGG_4 N00073 1019v2 ClinVar 16928 NC_000012.12 \n", "4 KEGG_5 N00073 1019v2 dbSNP rs11547328 NC_000012.12 \n", "... ... ... ... ... ... ... \n", "1444 KEGG_1445 N00244 9817v1 COSM 6196635 ENST00000393623.6 \n", "1445 KEGG_1446 N00244 9817v1 COSM 6196637 ENST00000393623.6 \n", "1446 KEGG_1447 N00258 999v2 COSM 4766271 ENST00000621016.4 \n", "1447 KEGG_1448 N00258 999v2 COSM 4766211 ENST00000621016.4 \n", "1448 KEGG_1449 N00258 999v2 COSM 1379150 ENST00000621016.4 \n", "\n", " NucChange Chr Start End ... \\\n", "0 NaN 12 57751646 57751646 ... \n", "1 NaN 12 57751646 57751646 ... \n", "2 NaN 12 57751646 57751646 ... \n", "3 NaN 12 57751647 57751647 ... \n", "4 NaN 12 57751647 57751647 ... \n", "... ... ... ... ... ... \n", "1444 c.706G>T 19 10492196 10492196 ... \n", "1445 c.548A>G 19 10499486 10499486 ... \n", "1446 c.662A>G 16 68808823 68808823 ... \n", "1447 c.755T>G 16 68810264 68810264 ... \n", "1448 c.769G>A 16 68810278 68810278 ... \n", "\n", " Network Expanded \\\n", "0 ((595,894,896)+1019v2) -> 5925 // (1869,1870,1... \n", "1 ((595,894,896)+1019v2) -> 5925 // (1869,1870,1... \n", "2 ((595,894,896)+1019v2) -> 5925 // (1869,1870,1... \n", "3 ((595,894,896)+1019v2) -> 5925 // (1869,1870,1... \n", "4 ((595,894,896)+1019v2) -> 5925 // (1869,1870,1... \n", "... ... \n", "1444 9817v1 // 4780 => (3162,1728,119391,221357,293... \n", "1445 9817v1 // 4780 => (3162,1728,119391,221357,293... \n", "1446 999v2 // 1499 -> (6932,83439,6934,51176) => (4... \n", "1447 999v2 // 1499 -> (6932,83439,6934,51176) => (4... \n", "1448 999v2 // 1499 -> (6932,83439,6934,51176) => (4... 
\n", "\n", " Pathway \\\n", "0 {'hsa05218': 'Melanoma'} \n", "1 {'hsa05218': 'Melanoma'} \n", "2 {'hsa05218': 'Melanoma'} \n", "3 {'hsa05218': 'Melanoma'} \n", "4 {'hsa05218': 'Melanoma'} \n", "... ... \n", "1444 {'hsa05225': 'Hepatocellular carcinoma'} \n", "1445 {'hsa05225': 'Hepatocellular carcinoma'} \n", "1446 {'hsa05226': 'Gastric cancer'} \n", "1447 {'hsa05226': 'Gastric cancer'} \n", "1448 {'hsa05226': 'Gastric cancer'} \n", "\n", " Class \\\n", "0 {'nt06268': 'Melanoma', 'nt06230': 'Cell cycle... \n", "1 {'nt06268': 'Melanoma', 'nt06230': 'Cell cycle... \n", "2 {'nt06268': 'Melanoma', 'nt06230': 'Cell cycle... \n", "3 {'nt06268': 'Melanoma', 'nt06230': 'Cell cycle... \n", "4 {'nt06268': 'Melanoma', 'nt06230': 'Cell cycle... \n", "... ... \n", "1444 {'nt06263': 'Hepatocellular carcinoma', 'nt062... \n", "1445 {'nt06263': 'Hepatocellular carcinoma', 'nt062... \n", "1446 {'nt06261': 'Gastric cancer', 'nt06215': 'WNT ... \n", "1447 {'nt06261': 'Gastric cancer', 'nt06215': 'WNT ... \n", "1448 {'nt06261': 'Gastric cancer', 'nt06215': 'WNT ... \n", "\n", " Disease \\\n", "0 {'H00038': 'Melanoma is a form of skin cancer ... \n", "1 {'H00038': 'Melanoma is a form of skin cancer ... \n", "2 {'H00038': 'Melanoma is a form of skin cancer ... \n", "3 {'H00038': 'Melanoma is a form of skin cancer ... \n", "4 {'H00038': 'Melanoma is a form of skin cancer ... \n", "... ... \n", "1444 {'H00048': 'Hepatocellular carcinoma (HCC) is ... \n", "1445 {'H00048': 'Hepatocellular carcinoma (HCC) is ... \n", "1446 {'H00018': \"Gastric cancer (GC) is one of the ... \n", "1447 {'H00018': \"Gastric cancer (GC) is one of the ... \n", "1448 {'H00018': \"Gastric cancer (GC) is one of the ... \n", "\n", " Gene Variant_Name \\\n", "0 {'595': 'CCND1; cyclin D1', '894': 'CCND2; cyc... CDK4 mutation \n", "1 {'595': 'CCND1; cyclin D1', '894': 'CCND2; cyc... CDK4 mutation \n", "2 {'595': 'CCND1; cyclin D1', '894': 'CCND2; cyc... 
CDK4 mutation \n", "3 {'595': 'CCND1; cyclin D1', '894': 'CCND2; cyc... CDK4 mutation \n", "4 {'595': 'CCND1; cyclin D1', '894': 'CCND2; cyc... CDK4 mutation \n", "... ... ... \n", "1444 {'9817': 'KEAP1; kelch like ECH associated pro... KEAP1 mutation \n", "1445 {'9817': 'KEAP1; kelch like ECH associated pro... KEAP1 mutation \n", "1446 {'999': 'CDH1; cadherin 1', '1499': 'CTNNB1; c... CDH1 mutation \n", "1447 {'999': 'CDH1; cadherin 1', '1499': 'CTNNB1; c... CDH1 mutation \n", "1448 {'999': 'CDH1; cadherin 1', '1499': 'CTNNB1; c... CDH1 mutation \n", "\n", " Variant_Gene Variant_Gene Info \\\n", "0 CDK4 cyclin dependent kinase 4 [KO:K02089] \n", "1 CDK4 cyclin dependent kinase 4 [KO:K02089] \n", "2 CDK4 cyclin dependent kinase 4 [KO:K02089] \n", "3 CDK4 cyclin dependent kinase 4 [KO:K02089] \n", "4 CDK4 cyclin dependent kinase 4 [KO:K02089] \n", "... ... ... \n", "1444 KEAP1 kelch like ECH associated protein 1 [KO:K10456] \n", "1445 KEAP1 kelch like ECH associated protein 1 [KO:K10456] \n", "1446 CDH1 cadherin 1 [KO:K05689] \n", "1447 CDH1 cadherin 1 [KO:K05689] \n", "1448 CDH1 cadherin 1 [KO:K05689] \n", "\n", " Variant_Type Disease_Names \n", "0 NaN {'H00038': 'Melanoma'} \n", "1 NaN {'H00038': 'Melanoma'} \n", "2 NaN {'H00038': 'Melanoma'} \n", "3 NaN {'H00038': 'Melanoma'} \n", "4 NaN {'H00038': 'Melanoma'} \n", "... ... ... 
\n", "1444 NaN {'H00048': 'Hepatocellular carcinoma;'} \n", "1445 NaN {'H00048': 'Hepatocellular carcinoma;'} \n", "1446 NaN {'H00018': 'Gastric cancer'} \n", "1447 NaN {'H00018': 'Gastric cancer'} \n", "1448 NaN {'H00018': 'Gastric cancer'} \n", "\n", "[1449 rows x 24 columns]" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "variant_data = pd.read_csv(\"final_network_with_variant.tsv\", sep='\\t')\n", "variant_data" ] }, { "cell_type": "code", "execution_count": 9, "id": "51609538-9f96-4097-ac60-2a4a08a6e01c", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'KEGG_2'" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "variant_data.iloc[1]['Var_ID']" ] }, { "cell_type": "code", "execution_count": 5, "id": "846b6ee3-1e4d-44bc-ad59-4074b4ff39bb", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "mkdir: final_data: File exists\n" ] } ], "source": [ "!mkdir final_data" ] }, { "cell_type": "code", "execution_count": null, "id": "56449f64-85ae-4804-8a01-3ce2afe1e6da", "metadata": {}, "outputs": [], "source": [
"import os\n",
"import json\n",
"import ast\n",
"\n",
"# Prefer the project's CONFIG module when it is importable; otherwise fall back to\n",
"# the directories and range used by the earlier cells of this notebook.\n",
"try:\n",
"    from CONFIG import CONFIG\n",
"except ImportError:\n",
"    CONFIG = {\n",
"        'variants_with_seqs_dir': 'processed_variants 1450 with seqs',\n",
"        'final_data_dir': 'final_data',\n",
"        'variant_range': (1, 1450)\n",
"    }\n",
"\n",
"# Create final dataset with question-answer pairs\n",
"variants_with_seqs_dir = CONFIG['variants_with_seqs_dir']\n",
"final_data_dir = CONFIG['final_data_dir']\n",
"start_idx, end_idx = CONFIG['variant_range']\n",
"\n",
"# Do not depend on the `!mkdir` cell, which errors once the directory exists\n",
"os.makedirs(final_data_dir, exist_ok=True)\n",
"\n",
"print(\"Creating final dataset with Q&A pairs...\")\n",
"print(f\"Input: {variants_with_seqs_dir}\")\n",
"print(f\"Output: {final_data_dir}\")\n",
"print(f\"Processing range: {start_idx} to {end_idx}\")\n",
"\n",
"processed_count = 0\n",
"error_count = 0\n",
"\n",
"for i in range(start_idx, end_idx):\n",
"    try:\n",
"        # Load the JSON file with sequences (a missing file counts as an error)\n",
"        input_file = f'{variants_with_seqs_dir}/KEGG_{i}_with_seqs.json'\n",
"        if not os.path.exists(input_file):\n",
"            error_count += 1\n",
"            continue\n",
"\n",
"        with open(input_file, 'r') as file:\n",
"            data = json.load(file)\n",
"\n",
"        # Build the question with fallback for inconsistent key casing\n",
"        try:\n",
"            chromosome = data['raw_data']['chromosome']\n",
"            network = data['raw_data']['network']\n",
"        except KeyError:\n",
"            try:\n",
"                chromosome = data['raw_data']['Chromosome']\n",
"                network = data['raw_data']['Network']\n",
"            except KeyError:\n",
"                print(f\"[Warning] Missing chromosome/network data in {input_file}\")\n",
"                error_count += 1\n",
"                continue\n",
"\n",
"        # Extract gene information from the variant table (row i-1 describes KEGG_{i})\n",
"        try:\n",
"            variant_row = variant_data.iloc[i - 1]\n",
"            # Guard the positional lookup: a reordered or filtered table would otherwise\n",
"            # silently pair this file with another variant's genes\n",
"            if variant_row['Var_ID'] != f'KEGG_{i}':\n",
"                raise ValueError(f\"row {i - 1} holds {variant_row['Var_ID']}, expected KEGG_{i}\")\n",
"            gene_list = list(ast.literal_eval(variant_row['Gene']).values())\n",
"            gene_list_joined = ' | '.join(gene_list)\n",
"            variant_gene = variant_row['Variant_Gene']\n",
"        except (KeyError, IndexError, ValueError, SyntaxError, TypeError, AttributeError) as e:\n",
"            # literal_eval raises SyntaxError/ValueError on malformed cells; a non-dict\n",
"            # value or non-string gene names surface as AttributeError/TypeError\n",
"            print(f\"[Warning] Gene information error for {input_file}: {e}\")\n",
"            error_count += 1\n",
"            continue\n",
"\n",
"        question = (\n",
"            f\"Chromosome Number: {chromosome}\\n\"\n",
"            f\"Network Definition of the pathway: {network}\\n\"\n",
"            f\"Genes in the pathway: {gene_list_joined}\\n\\n\"\n",
"            f\"Given this context, what is the biological effect of this \"\n",
"            f\"{variant_gene} allele, specifically what disease does this contribute to?\"\n",
"        )\n",
"\n",
"        # Extract answer\n",
"        try:\n",
"            answer = data['reasoning']['labels']['disease'][0]\n",
"        except (KeyError, IndexError, TypeError):\n",
"            print(f\"[Warning] Missing disease label in {input_file}\")\n",
"            error_count += 1\n",
"            continue\n",
"\n",
"        # Add the answer to the reasoning steps. This runs after the label is resolved so\n",
"        # that an empty string is never appended: an 'answer' already stored in the input\n",
"        # is kept, otherwise the extracted disease label is used.\n",
"        if 'reasoning_steps' in data['reasoning']:\n",
"            data['reasoning']['reasoning_steps'].append(data.get('answer') or answer)\n",
"\n",
"        data['question'] = question\n",
"        data['answer'] = answer\n",
"\n",
"        # Clean up unnecessary fields\n",
"        for key in ['variant_id', 'hgvs', 'labels']:\n",
"            data['reasoning'].pop(key, None)\n",
"        data.pop('raw_data', None)\n",
"\n",
"        # Save to final data directory\n",
"        output_file = f'{final_data_dir}/KEGG_{i}_with_seqs.json'\n",
"        with open(output_file, 'w') as out_file:\n",
"            json.dump(data, out_file, indent=2)\n",
"\n",
"        processed_count += 1\n",
"\n",
"        if processed_count % 100 == 0:\n",
"            print(f\"Created {processed_count} Q&A pairs...\")\n",
"\n",
"    except Exception as e:\n",
"        # Keep going on unexpected failures, but report and count them\n",
"        print(f\"[Error] Failed to process variant {i}: {str(e)}\")\n",
"        error_count += 1\n",
"\n",
"print(\"✅ Final dataset creation complete:\")\n",
"print(f\" Successfully processed: {processed_count}\")\n",
"print(f\" Errors encountered: {error_count}\")\n",
"print(f\" Output directory: {final_data_dir}\")"
] }, { "cell_type": "markdown", "id": "11b3769e-33e5-4ab8-bc9d-f736913a2034", "metadata": {}, "source": [ "# Fixing Disease Labels" ] }, { "cell_type": "code", "execution_count": null, "id": "0cfa4eca-c11e-4e52-ad6b-2fa7b43be2a4", "metadata": {}, "outputs": [], "source": [ "cd kegg_data" ] }, { "cell_type": "code", "execution_count": 2, "id": "9e36bc3f-07af-4b3d-bc84-d449ced55e24", "metadata": {}, "outputs": [], "source": [ "import json" ] }, { "cell_type": "code", "execution_count": null, "id": "cd316862-e6c7-4dd9-a06c-33f3454355b0", "metadata": {}, "outputs": [], "source": [ "import os\n", "import json\n", "\n", "# CONFIG parameters\n", "CONFIG = {\n", " 'final_data_dir': 'final_data',\n", " 'variant_range': (1, 1450)\n", "}\n", "\n", "# Extract disease labels from final dataset for standardization\n", "final_data_dir = CONFIG['final_data_dir']\n", "start_idx, end_idx = CONFIG['variant_range']\n", "\n", "print(\"Extracting disease labels for standardization...\")\n", "\n", "disease = []\n", "processed_count = 0\n", "\n", "for i in range(start_idx, end_idx):\n", " try:\n", " input_file = f'{final_data_dir}/KEGG_{i}_with_seqs.json'\n", " if os.path.exists(input_file):\n", " with open(input_file, 'r') as file:\n", " data = json.load(file)\n", " \n", " if 'answer' in data:\n", " disease.append(data['answer'])\n", " processed_count += 1\n", " \n", " except
Exception as e:\n", " print(f\"[Warning] Could not process {input_file}: {str(e)}\")\n", "\n", "print(f\"✅ Extracted {len(disease)} disease labels from {processed_count} files\")\n", "print(f\"Unique diseases: {len(set(disease))}\")" ] }, { "cell_type": "code", "execution_count": 13, "id": "cca4846c-aec9-49f3-b919-760cb9fa4bc7", "metadata": {}, "outputs": [], "source": [ "new_disease = {'Acute Myeloid Leukemia (AML)' : \"Acute Myeloid Leukemia\",\n", " 'Acute myeloid leukemia (AML)' : \"Acute Myeloid Leukemia\",\n", " 'Adenine Phosphoribosyltransferase Deficiency (APRTD)' : \"Adenine Phosphoribosyltransferase Deficiency\",\n", " 'Adenine phosphoribosyltransferase deficiency (APRTD)' : \"Adenine Phosphoribosyltransferase Deficiency\",\n", " \"Alzheimer's disease\" : \"Alzheimer's disease\",\n", " \"Alzheimer's disease (AD)\" : \"Alzheimer's disease\",\n", " 'Amyotrophic Lateral Sclerosis (ALS)' : \"Amyotrophic Lateral Sclerosis\",\n", " 'Amyotrophic lateral sclerosis (ALS)' : \"Amyotrophic Lateral Sclerosis\",\n", " 'Basal Cell Carcinoma (BCC)' : \"Basal Cell Carcinoma\",\n", " 'Basal cell carcinoma' : \"Basal Cell Carcinoma\",\n", " 'Basal cell carcinoma (BCC)' : \"Basal Cell Carcinoma\",\n", " 'Chronic Myeloid Leukemia (CML)' : \"Chronic Myeloid Leukemia\",\n", " 'Chronic myeloid leukemia (CML)' : \"Chronic Myeloid Leukemia\",\n", " 'Clear cell Renal Cell Carcinoma (ccRCC)' : \"Clear cell Renal Cell Carcinoma\",\n", " 'Clear cell renal cell carcinoma' : \"Clear cell Renal Cell Carcinoma\",\n", " 'Clear cell renal cell carcinoma (ccRCC)' : \"Clear cell Renal Cell Carcinoma\",\n", " 'Colorectal cancer' : \"Colorectal cancer\",\n", " 'Colorectal cancer (CRC)' : \"Colorectal cancer\",\n", " 'Cushing syndrome' : \"Cushing syndrome\",\n", " \"Early-onset Alzheimer's disease\" : \"Alzheimer's disease\",\n", " \"Early-onset familial Alzheimer's disease\" : \"Alzheimer's disease\",\n", " \"Early-onset familial Alzheimer's disease (FAD)\" : \"Alzheimer's disease\",\n", " 
'Familial Creutzfeldt-Jakob Disease' : \"Creutzfeldt-Jakob Disease\",\n", " 'Familial Creutzfeldt-Jakob Disease (fCJD)' : \"Creutzfeldt-Jakob Disease\",\n", " 'Familial Creutzfeldt-Jakob disease' : \"Creutzfeldt-Jakob Disease\",\n", " 'Familial Creutzfeldt-Jakob disease (fCJD)' : \"Creutzfeldt-Jakob Disease\",\n", " \"Familial Early-Onset Alzheimer's Disease\" : \"Alzheimer's disease\",\n", " 'Familial Isolated Pituitary Adenoma (FIPA)' : \"Pituitary Adenoma\",\n", " \"Familial early-onset Alzheimer's disease\" : \"Alzheimer's disease\",\n", " \"Familial early-onset Alzheimer's disease (FAD)\" : \"Alzheimer's disease\",\n", " 'Familial isolated pituitary adenoma (FIPA)' : \"Pituitary Adenoma\",\n", " 'Gastric cancer' : \"Gastric cancer\",\n", " 'Gaucher disease' : \"Gaucher disease\",\n", " 'Glioblastoma multiforme' : \"Glioblastoma multiforme\",\n", " 'Glioblastoma multiforme (GBM)' : \"Glioblastoma multiforme\",\n", " 'Hepatocellular carcinoma' : \"Hepatocellular carcinoma\",\n", " 'Hepatocellular carcinoma (HCC)' : \"Hepatocellular carcinoma\",\n", " 'Huntington disease' : \"Huntington's disease\",\n", " 'Huntington disease (HD)' : \"Huntington's disease\",\n", " \"Huntington's disease\" : \"Huntington's disease\",\n", " \"Huntington's disease (HD)\" : \"Huntington's disease\",\n", " 'Lesch-Nyhan syndrome' : \"Lesch-Nyhan syndrome\",\n", " 'Melanoma' : \"Melanoma\",\n", " 'Melanoma (H00038)' : \"Melanoma\",\n", " 'Methylmalonic aciduria and homocystinuria (MAHC)' : \"Methylmalonic aciduria and homocystinuria\",\n", " 'Multiple Endocrine Neoplasia type 1 (MEN1)' : \"Multiple Endocrine Neoplasia type 1\",\n", " 'N-acetylglutamate synthase (NAGS) deficiency' : \"N-acetylglutamate synthase deficiency\",\n", " 'Non-small cell lung cancer' : \"Non-small cell lung cancer\",\n", " 'Non-small cell lung cancer (NSCLC)' : \"Non-small cell lung cancer\",\n", " 'Non-small-cell lung cancer' : \"Non-small cell lung cancer\",\n", " 'Non-small-cell lung cancer (NSCLC)' : 
\"Non-small cell lung cancer\",\n", " 'Pancreatic ductal adenocarcinoma' : \"Pancreatic ductal adenocarcinoma\",\n", " 'Papillary Renal Cell Carcinoma' : \"Papillary Renal Cell Carcinoma\",\n", " 'Papillary renal cell carcinoma' : \"Papillary Renal Cell Carcinoma\",\n", " 'Papillary thyroid carcinoma' : \"Papillary thyroid carcinoma\",\n", " 'Papillary thyroid carcinoma (PTC)' : \"Papillary thyroid carcinoma\",\n", " \"Parkinson's Disease\" : \"Parkinson's Disease\",\n", " \"Parkinson's disease\" : \"Parkinson's Disease\",\n", " \"Parkinson's disease (PD)\" : \"Parkinson's Disease\",\n", " 'Pituitary adenoma' : \"Pituitary Adenoma\",\n", " 'Primary Aldosteronism' : \"Primary Aldosteronism\",\n", " 'Primary aldosteronism' : \"Primary Aldosteronism\",\n", " 'Prion disease' : \"Prion disease\",\n", " 'Prion diseases' : \"Prion disease\",\n", " 'Prostate cancer' : \"Prostate cancer\",\n", " 'Renal cell cancer (RCC)' : \"Renal cell carcinoma\",\n", " 'Renal cell carcinoma' : \"Renal cell carcinoma\",\n", " 'Renal cell carcinoma (RCC)' : \"Renal cell carcinoma\",\n", " 'Robinow syndrome' : \"Robinow syndrome\",\n", " 'Sphingolipidoses' : \"Sphingolipidoses\",\n", " 'Sphingolipidosis' : \"Sphingolipidoses\",\n", " 'Spinocerebellar Ataxia (SCA)' : \"Spinocerebellar Ataxia\",\n", " 'Spinocerebellar Ataxia (SCA19/22)' : \"Spinocerebellar Ataxia\",\n", " 'Spinocerebellar Ataxia Type 1 (SCA1)' : \"Spinocerebellar Ataxia\",\n", " 'Spinocerebellar Ataxia Type 13 (SCA13)' : \"Spinocerebellar Ataxia\",\n", " 'Spinocerebellar Ataxia Type 14 (SCA14)' : \"Spinocerebellar Ataxia\",\n", " 'Spinocerebellar Ataxia Type 15 (SCA15)' : \"Spinocerebellar Ataxia\",\n", " 'Spinocerebellar Ataxia Type 2 (SCA2)' : \"Spinocerebellar Ataxia\",\n", " 'Spinocerebellar Ataxia Type 3' : \"Spinocerebellar Ataxia\",\n", " 'Spinocerebellar Ataxia Type 3 (SCA3)' : \"Spinocerebellar Ataxia\",\n", " 'Spinocerebellar Ataxia Type 5 (SCA5)' : \"Spinocerebellar Ataxia\",\n", " 'Spinocerebellar Ataxia type 13 
(SCA13)' : \"Spinocerebellar Ataxia\",\n", " 'Spinocerebellar Ataxia type 6 (SCA6)' : \"Spinocerebellar Ataxia\",\n", " 'Spinocerebellar ataxia' : \"Spinocerebellar Ataxia\",\n", " 'Spinocerebellar ataxia (SCA)' : \"Spinocerebellar Ataxia\",\n", " 'Spinocerebellar ataxia (SCA19/22)' : \"Spinocerebellar Ataxia\",\n", " 'Spinocerebellar ataxia type 1 (SCA1)' : \"Spinocerebellar Ataxia\",\n", " 'Spinocerebellar ataxia type 19 (SCA19)' : \"Spinocerebellar Ataxia\",\n", " 'Spinocerebellar ataxia type 19/22 (SCA19/22)' : \"Spinocerebellar Ataxia\",\n", " 'Spinocerebellar ataxia type 2 (SCA2)' : \"Spinocerebellar Ataxia\",\n", " 'Spinocerebellar ataxia type 3' : \"Spinocerebellar Ataxia\",\n", " 'Spinocerebellar ataxia type 3 (SCA3)' : \"Spinocerebellar Ataxia\",\n", " 'Spinocerebellar ataxia type 5 (SCA5)' : \"Spinocerebellar Ataxia\",\n", " 'Spinocerebellar ataxia type 6 (SCA6)' : \"Spinocerebellar Ataxia\",\n", " 'Thyroid cancer' : \"Thyroid cancer\",\n", " 'Thyroid dyshormonogenesis' : \"Thyroid dyshormonogenesis\",\n", " 'Urothelial carcinoma' : \"Urothelial carcinoma\",\n", " 'von Hippel-Lindau syndrome' : \"von Hippel-Lindau syndrome\"}" ] }, { "cell_type": "code", "execution_count": 19, "id": "2451ebb1-a9d8-494c-9f7e-4f800cd158e8", "metadata": {}, "outputs": [], "source": [ "!mkdir final_data_fix" ] }, { "cell_type": "code", "execution_count": null, "id": "c71719e5-5215-4559-a47d-dfc160779260", "metadata": {}, "outputs": [], "source": [
"import json\n",
"import os\n",
"\n",
"# CONFIG parameters\n",
"CONFIG = {\n",
"    'final_data_dir': 'final_data',\n",
"    'final_data_fix_dir': 'final_data_fix',\n",
"    'variant_range': (1, 1450)\n",
"}\n",
"\n",
"# `new_disease` is the label mapping built in an earlier cell of this section.\n",
"# It is deliberately NOT redefined here: replacing it with placeholder entries\n",
"# leaves every real label unmapped, so nothing would be standardized.\n",
"\n",
"# Standardize disease labels using the mapping dictionary\n",
"final_data_dir = CONFIG['final_data_dir']\n",
"final_data_fix_dir = CONFIG['final_data_fix_dir']\n",
"start_idx, end_idx = CONFIG['variant_range']\n",
"\n",
"# Do not depend on the `!mkdir` cell having been run\n",
"os.makedirs(final_data_fix_dir, exist_ok=True)\n",
"\n",
"print(\"Applying disease label standardization...\")\n",
"print(f\"Input: {final_data_dir}\")\n",
"print(f\"Output: {final_data_fix_dir}\")\n",
"\n",
"processed_count = 0\n",
"error_count = 0\n",
"unmapped_labels = set()\n",
"\n",
"for i in range(start_idx, end_idx):\n",
"    input_file = f'{final_data_dir}/KEGG_{i}_with_seqs.json'\n",
"    try:\n",
"        if not os.path.exists(input_file):\n",
"            continue\n",
"\n",
"        with open(input_file, 'r') as file:\n",
"            data = json.load(file)\n",
"\n",
"        # Get original answer\n",
"        original_label = data.get('answer', '')\n",
"\n",
"        # Apply standardization if mapping exists; unmapped labels are kept unchanged\n",
"        if original_label in new_disease:\n",
"            data['answer'] = new_disease[original_label]\n",
"        elif original_label not in unmapped_labels:\n",
"            # Warn once per distinct label rather than once per file\n",
"            unmapped_labels.add(original_label)\n",
"            print(f\"[Warning] No mapping found for disease: {original_label}\")\n",
"\n",
"        # Save to standardized directory\n",
"        output_file = f'{final_data_fix_dir}/KEGG_{i}_with_seqs.json'\n",
"        with open(output_file, 'w') as out_file:\n",
"            json.dump(data, out_file, indent=2)\n",
"\n",
"        processed_count += 1\n",
"\n",
"        if processed_count % 100 == 0:\n",
"            print(f\"Standardized {processed_count} disease labels...\")\n",
"\n",
"    except Exception as e:\n",
"        print(f\"[Error] Failed to process {input_file}: {str(e)}\")\n",
"        error_count += 1\n",
"\n",
"print(\"✅ Disease label standardization complete:\")\n",
"print(f\" Successfully processed: {processed_count}\")\n",
"print(f\" Errors encountered: {error_count}\")\n",
"print(f\" Distinct labels without a mapping: {len(unmapped_labels)}\")"
] }, { "cell_type": "code", "execution_count": null, "id": "9a12df3e-9ceb-4a51-acaf-e2931792a844", "metadata": {}, "outputs": [], "source": [
"import shutil\n",
"import os\n",
"\n",
"# Remove original final_data directory and replace with standardized version\n",
"final_data_dir = CONFIG['final_data_dir']\n",
"final_data_fix_dir = CONFIG['final_data_fix_dir']\n",
"\n",
"# Only delete the originals while the standardized copies are present. Without this\n",
"# guard, re-running the cell after the rename below (or after a failed\n",
"# standardization run) would delete the only remaining copy of the dataset.\n",
"if not os.path.isdir(final_data_fix_dir) or not os.listdir(final_data_fix_dir):\n",
"    raise RuntimeError(\n",
"        f\"Refusing to remove {final_data_dir}: {final_data_fix_dir} is missing or empty\"\n",
"    )\n",
"\n",
"if os.path.exists(final_data_dir):\n",
"    shutil.rmtree(final_data_dir)\n",
"    print(f\"Removed original directory: {final_data_dir}\")\n",
"else:\n",
"    print(f\"Directory not found: {final_data_dir}\")"
] }, { "cell_type": "code", "execution_count": null, "id": "dbba2c19-08f6-4769-b38d-a64d8643e142", "metadata": {}, "outputs": [], "source": [
"import os\n",
"\n",
"# CONFIG is the dict defined in the standardization cell above; no import is needed\n",
"\n",
"# Rename standardized directory to final_data\n",
"final_data_dir = CONFIG['final_data_dir']\n",
"final_data_fix_dir = CONFIG['final_data_fix_dir']\n",
"\n",
"if os.path.exists(final_data_fix_dir):\n",
"    os.rename(final_data_fix_dir, final_data_dir)\n",
"    print(f\"Renamed {final_data_fix_dir} to {final_data_dir}\")\n",
"    print(\"✅ Final dataset with standardized disease labels is ready\")\n",
"else:\n",
"    print(f\"Directory not found: {final_data_fix_dir}\")"
] }, { "cell_type": "code", "execution_count": null, "id": "8c87a0df-09c8-4fb6-baca-21a9cdd65b85", "metadata": {}, "outputs": [], "source": [
"import os\n",
"import json\n",
"from collections import Counter\n",
"\n",
"# CONFIG is the dict defined in the standardization cell above\n",
"\n",
"# Verify standardized disease labels\n",
"final_data_dir = CONFIG['final_data_dir']\n",
"start_idx, end_idx = CONFIG['variant_range']\n",
"\n",
"print(\"Verifying standardized disease labels...\")\n",
"\n",
"disease = []\n",
"for i in range(start_idx, end_idx):\n",
"    input_file = f'{final_data_dir}/KEGG_{i}_with_seqs.json'\n",
"    try:\n",
"        if os.path.exists(input_file):\n",
"            with open(input_file, 'r') as file:\n",
"                data = json.load(file)\n",
"\n",
"            if 'answer' in data:\n",
"                disease.append(data['answer'])\n",
"\n",
"    except Exception as e:\n",
"        print(f\"[Warning] Could not verify {input_file}: {str(e)}\")\n",
"\n",
"# Count labels so that \"top 10\" means the ten most frequent labels,\n",
"# not an arbitrary slice of a set\n",
"label_counts = Counter(disease)\n",
"\n",
"print(\"✅ Verification complete:\")\n",
"print(f\" Total disease labels: {len(disease)}\")\n",
"print(f\" Unique diseases: {len(label_counts)}\")\n",
"print(f\" Top 10 diseases: {label_counts.most_common(10)}\")"
] }, {
"cell_type": "markdown", "id": "60f75d92-e2f2-495f-ba8f-cb423410f1f4", "metadata": {}, "source": [ "# Saving the KEGG Task to the WangLab Hugging Face" ] }, { "cell_type": "code", "execution_count": null, "id": "1a069a67-b410-4adf-ab75-62eca67ab259", "metadata": {}, "outputs": [], "source": [ "cd ../../bioR_tasks" ] }, { "cell_type": "code", "execution_count": 2, "id": "10e9f0fb-4943-41bf-bef3-9fcd64796ddf", "metadata": {}, "outputs": [], "source": [ "mkdir kegg_variant" ] }, { "cell_type": "code", "execution_count": null, "id": "cced244e-9d03-47be-8fa1-864f2736fe01", "metadata": {}, "outputs": [], "source": [ "cp ../BioReason/data/kegg_data/final_data/* kegg_variant/" ] }, { "cell_type": "code", "execution_count": null, "id": "bff9ce06-2cd8-4675-a23f-080027770bdb", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "markdown", "id": "4c56a919", "metadata": {}, "source": [ "# Creating the Nt Variant Database" ] }, { "cell_type": "code", "execution_count": null, "id": "7c28bc9f", "metadata": {}, "outputs": [], "source": [ "cd kegg_data" ] }, { "cell_type": "code", "execution_count": null, "id": "7618faf2", "metadata": {}, "outputs": [], "source": [ "from Bio import SeqIO\n", "import pandas as pd\n", "import json\n", "import os\n", "from pathlib import Path\n", "\n", "# Optional: Uncomment if you want to use HuggingFace datasets\n", "# from datasets import load_dataset, Dataset, DatasetDict\n", "\n", "print(\"Imports loaded for nucleotide database creation\")" ] }, { "cell_type": "code", "execution_count": null, "id": "1b8cac05", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " 
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Var_IDNetworkEntrySourceIDTranscriptIDNucChangeChrStartEnd...Network ExpandedPathwayClassDiseaseGeneVariant_NameVariant_GeneVariant_Gene InfoVariant_TypeDisease_Names
0KEGG_1N000731019v2ClinVar16929NC_000012.12NaN125775164657751646...((595,894,896)+1019v2) -> 5925 // (1869,1870,1...{'hsa05218': 'Melanoma'}{'nt06268': 'Melanoma', 'nt06230': 'Cell cycle...{'H00038': 'Melanoma is a form of skin cancer ...{'595': 'CCND1; cyclin D1', '894': 'CCND2; cyc...CDK4 mutationCDK4cyclin dependent kinase 4 [KO:K02089]NaN{'H00038': 'Melanoma'}
1KEGG_2N000731019v2dbSNPrs104894340NC_000012.12NaN125775164657751646...((595,894,896)+1019v2) -> 5925 // (1869,1870,1...{'hsa05218': 'Melanoma'}{'nt06268': 'Melanoma', 'nt06230': 'Cell cycle...{'H00038': 'Melanoma is a form of skin cancer ...{'595': 'CCND1; cyclin D1', '894': 'CCND2; cyc...CDK4 mutationCDK4cyclin dependent kinase 4 [KO:K02089]NaN{'H00038': 'Melanoma'}
2KEGG_3N000731019v2dbSNPrs104894340NC_000012.12NaN125775164657751646...((595,894,896)+1019v2) -> 5925 // (1869,1870,1...{'hsa05218': 'Melanoma'}{'nt06268': 'Melanoma', 'nt06230': 'Cell cycle...{'H00038': 'Melanoma is a form of skin cancer ...{'595': 'CCND1; cyclin D1', '894': 'CCND2; cyc...CDK4 mutationCDK4cyclin dependent kinase 4 [KO:K02089]NaN{'H00038': 'Melanoma'}
3KEGG_4N000731019v2ClinVar16928NC_000012.12NaN125775164757751647...((595,894,896)+1019v2) -> 5925 // (1869,1870,1...{'hsa05218': 'Melanoma'}{'nt06268': 'Melanoma', 'nt06230': 'Cell cycle...{'H00038': 'Melanoma is a form of skin cancer ...{'595': 'CCND1; cyclin D1', '894': 'CCND2; cyc...CDK4 mutationCDK4cyclin dependent kinase 4 [KO:K02089]NaN{'H00038': 'Melanoma'}
4KEGG_5N000731019v2dbSNPrs11547328NC_000012.12NaN125775164757751647...((595,894,896)+1019v2) -> 5925 // (1869,1870,1...{'hsa05218': 'Melanoma'}{'nt06268': 'Melanoma', 'nt06230': 'Cell cycle...{'H00038': 'Melanoma is a form of skin cancer ...{'595': 'CCND1; cyclin D1', '894': 'CCND2; cyc...CDK4 mutationCDK4cyclin dependent kinase 4 [KO:K02089]NaN{'H00038': 'Melanoma'}
..................................................................
1444KEGG_1445N002449817v1COSM6196635ENST00000393623.6c.706G>T191049219610492196...9817v1 // 4780 => (3162,1728,119391,221357,293...{'hsa05225': 'Hepatocellular carcinoma'}{'nt06263': 'Hepatocellular carcinoma', 'nt062...{'H00048': 'Hepatocellular carcinoma (HCC) is ...{'9817': 'KEAP1; kelch like ECH associated pro...KEAP1 mutationKEAP1kelch like ECH associated protein 1 [KO:K10456]NaN{'H00048': 'Hepatocellular carcinoma;'}
1445KEGG_1446N002449817v1COSM6196637ENST00000393623.6c.548A>G191049948610499486...9817v1 // 4780 => (3162,1728,119391,221357,293...{'hsa05225': 'Hepatocellular carcinoma'}{'nt06263': 'Hepatocellular carcinoma', 'nt062...{'H00048': 'Hepatocellular carcinoma (HCC) is ...{'9817': 'KEAP1; kelch like ECH associated pro...KEAP1 mutationKEAP1kelch like ECH associated protein 1 [KO:K10456]NaN{'H00048': 'Hepatocellular carcinoma;'}
1446KEGG_1447N00258999v2COSM4766271ENST00000621016.4c.662A>G166880882368808823...999v2 // 1499 -> (6932,83439,6934,51176) => (4...{'hsa05226': 'Gastric cancer'}{'nt06261': 'Gastric cancer', 'nt06215': 'WNT ...{'H00018': \"Gastric cancer (GC) is one of the ...{'999': 'CDH1; cadherin 1', '1499': 'CTNNB1; c...CDH1 mutationCDH1cadherin 1 [KO:K05689]NaN{'H00018': 'Gastric cancer'}
1447KEGG_1448N00258999v2COSM4766211ENST00000621016.4c.755T>G166881026468810264...999v2 // 1499 -> (6932,83439,6934,51176) => (4...{'hsa05226': 'Gastric cancer'}{'nt06261': 'Gastric cancer', 'nt06215': 'WNT ...{'H00018': \"Gastric cancer (GC) is one of the ...{'999': 'CDH1; cadherin 1', '1499': 'CTNNB1; c...CDH1 mutationCDH1cadherin 1 [KO:K05689]NaN{'H00018': 'Gastric cancer'}
1448KEGG_1449N00258999v2COSM1379150ENST00000621016.4c.769G>A166881027868810278...999v2 // 1499 -> (6932,83439,6934,51176) => (4...{'hsa05226': 'Gastric cancer'}{'nt06261': 'Gastric cancer', 'nt06215': 'WNT ...{'H00018': \"Gastric cancer (GC) is one of the ...{'999': 'CDH1; cadherin 1', '1499': 'CTNNB1; c...CDH1 mutationCDH1cadherin 1 [KO:K05689]NaN{'H00018': 'Gastric cancer'}
\n", "

1449 rows × 24 columns

\n", "
" ], "text/plain": [ " Var_ID Network Entry Source ID TranscriptID \\\n", "0 KEGG_1 N00073 1019v2 ClinVar 16929 NC_000012.12 \n", "1 KEGG_2 N00073 1019v2 dbSNP rs104894340 NC_000012.12 \n", "2 KEGG_3 N00073 1019v2 dbSNP rs104894340 NC_000012.12 \n", "3 KEGG_4 N00073 1019v2 ClinVar 16928 NC_000012.12 \n", "4 KEGG_5 N00073 1019v2 dbSNP rs11547328 NC_000012.12 \n", "... ... ... ... ... ... ... \n", "1444 KEGG_1445 N00244 9817v1 COSM 6196635 ENST00000393623.6 \n", "1445 KEGG_1446 N00244 9817v1 COSM 6196637 ENST00000393623.6 \n", "1446 KEGG_1447 N00258 999v2 COSM 4766271 ENST00000621016.4 \n", "1447 KEGG_1448 N00258 999v2 COSM 4766211 ENST00000621016.4 \n", "1448 KEGG_1449 N00258 999v2 COSM 1379150 ENST00000621016.4 \n", "\n", " NucChange Chr Start End ... \\\n", "0 NaN 12 57751646 57751646 ... \n", "1 NaN 12 57751646 57751646 ... \n", "2 NaN 12 57751646 57751646 ... \n", "3 NaN 12 57751647 57751647 ... \n", "4 NaN 12 57751647 57751647 ... \n", "... ... ... ... ... ... \n", "1444 c.706G>T 19 10492196 10492196 ... \n", "1445 c.548A>G 19 10499486 10499486 ... \n", "1446 c.662A>G 16 68808823 68808823 ... \n", "1447 c.755T>G 16 68810264 68810264 ... \n", "1448 c.769G>A 16 68810278 68810278 ... \n", "\n", " Network Expanded \\\n", "0 ((595,894,896)+1019v2) -> 5925 // (1869,1870,1... \n", "1 ((595,894,896)+1019v2) -> 5925 // (1869,1870,1... \n", "2 ((595,894,896)+1019v2) -> 5925 // (1869,1870,1... \n", "3 ((595,894,896)+1019v2) -> 5925 // (1869,1870,1... \n", "4 ((595,894,896)+1019v2) -> 5925 // (1869,1870,1... \n", "... ... \n", "1444 9817v1 // 4780 => (3162,1728,119391,221357,293... \n", "1445 9817v1 // 4780 => (3162,1728,119391,221357,293... \n", "1446 999v2 // 1499 -> (6932,83439,6934,51176) => (4... \n", "1447 999v2 // 1499 -> (6932,83439,6934,51176) => (4... \n", "1448 999v2 // 1499 -> (6932,83439,6934,51176) => (4... 
\n", "\n", " Pathway \\\n", "0 {'hsa05218': 'Melanoma'} \n", "1 {'hsa05218': 'Melanoma'} \n", "2 {'hsa05218': 'Melanoma'} \n", "3 {'hsa05218': 'Melanoma'} \n", "4 {'hsa05218': 'Melanoma'} \n", "... ... \n", "1444 {'hsa05225': 'Hepatocellular carcinoma'} \n", "1445 {'hsa05225': 'Hepatocellular carcinoma'} \n", "1446 {'hsa05226': 'Gastric cancer'} \n", "1447 {'hsa05226': 'Gastric cancer'} \n", "1448 {'hsa05226': 'Gastric cancer'} \n", "\n", " Class \\\n", "0 {'nt06268': 'Melanoma', 'nt06230': 'Cell cycle... \n", "1 {'nt06268': 'Melanoma', 'nt06230': 'Cell cycle... \n", "2 {'nt06268': 'Melanoma', 'nt06230': 'Cell cycle... \n", "3 {'nt06268': 'Melanoma', 'nt06230': 'Cell cycle... \n", "4 {'nt06268': 'Melanoma', 'nt06230': 'Cell cycle... \n", "... ... \n", "1444 {'nt06263': 'Hepatocellular carcinoma', 'nt062... \n", "1445 {'nt06263': 'Hepatocellular carcinoma', 'nt062... \n", "1446 {'nt06261': 'Gastric cancer', 'nt06215': 'WNT ... \n", "1447 {'nt06261': 'Gastric cancer', 'nt06215': 'WNT ... \n", "1448 {'nt06261': 'Gastric cancer', 'nt06215': 'WNT ... \n", "\n", " Disease \\\n", "0 {'H00038': 'Melanoma is a form of skin cancer ... \n", "1 {'H00038': 'Melanoma is a form of skin cancer ... \n", "2 {'H00038': 'Melanoma is a form of skin cancer ... \n", "3 {'H00038': 'Melanoma is a form of skin cancer ... \n", "4 {'H00038': 'Melanoma is a form of skin cancer ... \n", "... ... \n", "1444 {'H00048': 'Hepatocellular carcinoma (HCC) is ... \n", "1445 {'H00048': 'Hepatocellular carcinoma (HCC) is ... \n", "1446 {'H00018': \"Gastric cancer (GC) is one of the ... \n", "1447 {'H00018': \"Gastric cancer (GC) is one of the ... \n", "1448 {'H00018': \"Gastric cancer (GC) is one of the ... \n", "\n", " Gene Variant_Name \\\n", "0 {'595': 'CCND1; cyclin D1', '894': 'CCND2; cyc... CDK4 mutation \n", "1 {'595': 'CCND1; cyclin D1', '894': 'CCND2; cyc... CDK4 mutation \n", "2 {'595': 'CCND1; cyclin D1', '894': 'CCND2; cyc... 
CDK4 mutation \n", "3 {'595': 'CCND1; cyclin D1', '894': 'CCND2; cyc... CDK4 mutation \n", "4 {'595': 'CCND1; cyclin D1', '894': 'CCND2; cyc... CDK4 mutation \n", "... ... ... \n", "1444 {'9817': 'KEAP1; kelch like ECH associated pro... KEAP1 mutation \n", "1445 {'9817': 'KEAP1; kelch like ECH associated pro... KEAP1 mutation \n", "1446 {'999': 'CDH1; cadherin 1', '1499': 'CTNNB1; c... CDH1 mutation \n", "1447 {'999': 'CDH1; cadherin 1', '1499': 'CTNNB1; c... CDH1 mutation \n", "1448 {'999': 'CDH1; cadherin 1', '1499': 'CTNNB1; c... CDH1 mutation \n", "\n", " Variant_Gene Variant_Gene Info \\\n", "0 CDK4 cyclin dependent kinase 4 [KO:K02089] \n", "1 CDK4 cyclin dependent kinase 4 [KO:K02089] \n", "2 CDK4 cyclin dependent kinase 4 [KO:K02089] \n", "3 CDK4 cyclin dependent kinase 4 [KO:K02089] \n", "4 CDK4 cyclin dependent kinase 4 [KO:K02089] \n", "... ... ... \n", "1444 KEAP1 kelch like ECH associated protein 1 [KO:K10456] \n", "1445 KEAP1 kelch like ECH associated protein 1 [KO:K10456] \n", "1446 CDH1 cadherin 1 [KO:K05689] \n", "1447 CDH1 cadherin 1 [KO:K05689] \n", "1448 CDH1 cadherin 1 [KO:K05689] \n", "\n", " Variant_Type Disease_Names \n", "0 NaN {'H00038': 'Melanoma'} \n", "1 NaN {'H00038': 'Melanoma'} \n", "2 NaN {'H00038': 'Melanoma'} \n", "3 NaN {'H00038': 'Melanoma'} \n", "4 NaN {'H00038': 'Melanoma'} \n", "... ... ... 
# %% Load variant data for nucleotide database creation
# Tab-separated variant table; its path comes from the notebook configuration.
network_file = CONFIG['network_data_file']
variant_data = pd.read_csv(network_file, sep='\t')
print(f"✅ Loaded variant data: {len(variant_data)} entries")
variant_data.head()

# %% Number of variants loaded
len(variant_data)

# %% Spot-check the Network value of the second row
variant_data.iloc[1]["Network"]

# %% Load reference genome sequences
from Bio import SeqIO
import os

fasta_file = CONFIG['reference_fasta']
if os.path.exists(fasta_file):
    # Every FASTA record is parsed into a dict keyed by record id.
    record_dict = SeqIO.to_dict(SeqIO.parse(fasta_file, "fasta"))
    print(f"✅ Loaded reference genome: {len(record_dict)} sequences")
else:
    print(f"❌ Reference genome file not found: {fasta_file}")
    print("Please update CONFIG['reference_fasta'] with correct path")
    raise FileNotFoundError(f"Reference genome not found: {fasta_file}")

# %% Use chromosome dictionary from configuration
# Maps the table's 'Chr' value (as a string) to a record id in record_dict.
chromosome_dictionary = CONFIG['chromosome_dictionary']
print(f"✅ Chromosome mapping loaded: {len(chromosome_dictionary)} chromosomes")
print("Available chromosomes:", list(chromosome_dictionary.keys()))

# %% [markdown]
# ### Verification that the reference is present at the exact position I have in my data

# %% Verify reference sequences (alternative implementation)
# For every variant, slice the genome at the recorded position and compare it
# (case-insensitively) with the table's RefAllele. One result line per variant
# is written to the verification file.
chromosome_dictionary = CONFIG['chromosome_dictionary']
verification_file = "verification_alt.txt"

print(f"Starting alternative sequence verification...")
print(f"Results will be saved to: {verification_file}")

with open(verification_file, "w") as f:
    for i in range(len(variant_data)):
        try:
            row = variant_data.iloc[i]
            chromosome_id = chromosome_dictionary[str(row['Chr'])]

            # Start is shifted down by one only for ENST transcript IDs;
            # all other rows use Start unchanged.
            start = row['Start'] - 1 if row['TranscriptID'][:4] == "ENST" else row['Start']
            reference_allele = row['RefAllele']
            end = start + len(reference_allele)

            chrom_seq = record_dict[chromosome_id].seq
            genomic_ref = chrom_seq[start:end]

            if genomic_ref.upper() == reference_allele.upper():
                result_line = f"✅ Verified: {chromosome_id}:{start}-{end} → '{reference_allele}' matches genome\n"
            else:
                result_line = f"⚠️ Warning: Entry number {i} with variant {row['ID']} expected '{reference_allele}', but found '{genomic_ref}'\n"
            f.write(result_line)

        except Exception as e:
            # Any failure (unknown chromosome, missing allele, ...) is logged
            # to the same file so the run always covers every row.
            f.write(f"❌ Error verifying variant {i}: {str(e)}\n")

        if (i + 1) % 200 == 0:
            print(f"Verified {i + 1}/{len(variant_data)} variants...")

print(f"✅ Alternative verification complete. Results: {verification_file}")

# %% [markdown]
# ## Read in Final_data JSON files
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
questionanswerreference_sequencevariant_sequencereasoning.reasoning_stepsIDtemp_ID
0Chromosome Number: 20\\nNetwork Definition of t...Creutzfeldt-Jakob DiseaseAATAAGCACAGAGGAAATAACATAATATCTCAAGTAGCTGTAACTG...AATAAGCACAGAGGAAATAACATAATATCTCAAGTAGCTGTAACTG...[Step 1: The variant is an insertion in the PR...KEGG_854854
1Chromosome Number: 20\\nNetwork Definition of t...Creutzfeldt-Jakob DiseaseAATAAGCACAGAGGAAATAACATAATATCTCAAGTAGCTGTAACTG...AATAAGCACAGAGGAAATAACATAATATCTCAAGTAGCTGTAACTG...[Step 1: The variant is a deletion of 47 nucle...KEGG_841841
2Chromosome Number: 21\\nNetwork Definition of t...Alzheimer's diseaseGCTGAAACTAACATACTAGCAATCTGGAAAGGCAATATAAAATATA...GCTGAAACTAACATACTAGCAATCTGGAAAGGCAATATAAAATATA...[Step 1: The TC>GA mutation in the APP gene on...KEGG_468468
3Chromosome Number: 1\\nNetwork Definition of th...Primary AldosteronismAATGTTATAAAAGTAAATAGTCTAGGGATGTCTTATTTCCAGATAA...AATGTTATAAAAGTAAATAGTCTAGGGATGTCTTATTTCCAGATAA...[Step 1: The variant KEGG_635 is a 15-nucleoti...KEGG_635635
4Chromosome Number: 14\\nNetwork Definition of t...Spinocerebellar AtaxiaTCATTAACTAAATGAAGGTCTACATTTAGGCAGTTTGTAATTTTGG...TCATTAACTAAATGAAGGTCTACATTTAGGCAGTTTGTAATTTTGG...[Step 1: The variant is a trinucleotide repeat...KEGG_620620
........................
1444Chromosome Number: 6\\nNetwork Definition of th...Spinocerebellar AtaxiagaCGTATACATTAAATGTGTACAGTTTTTGTATATCAATTAGAAGT...gaCGTATACATTAAATGTGTACAGTTTTTGTATATCAATTAGAAGT...[Step 1: The variant KEGG_286 is an A>G substi...KEGG_286286
1445Chromosome Number: 6\\nNetwork Definition of th...Spinocerebellar AtaxiaTTTTCaagattataaaatatgaaatgtcaAAATATTACCTTCATCA...TTTTCaagattataaaatatgaaatgtcaAAATATTACCTTCATCA...[Step 1: The variant is a single cytosine (C) ...KEGG_293293
1446Chromosome Number: 12\\nNetwork Definition of t...Pituitary AdenomaGTGGACAAGATGTGGCTAAGAAAACAAGCTACACATCAAGCTCATC...GTGGACAAGATGTGGCTAAGAAAACAAGCTACACATCAAGCTCATC...[Step 1: The variant is a 20-nucleotide duplic...KEGG_77
1447Chromosome Number: 11\\nNetwork Definition of t...Spinocerebellar AtaxiaATCTGTGGGTTCTTTTAAATAGGCCAGATTTTATCCTAAAGGTAAG...ATCTGTGGGTTCTTTTAAATAGGCCAGATTTTATCCTAAAGGTAAG...[Step 1: The variant KEGG_1285 is an A>G subst...KEGG_12851285
1448Chromosome Number: 7\\nNetwork Definition of th...MelanomatataattttaggttttgcaATTTCAGCACTTAAAATCTGTTTTCCC...tataattttaggttttgcaATTTCAGCACTTAAAATCTGTTTTCCC...[Step 1: The variant involves a nucleotide cha...KEGG_12901290
\n", "

1449 rows × 7 columns

\n", "
" ], "text/plain": [ " question \\\n", "0 Chromosome Number: 20\\nNetwork Definition of t... \n", "1 Chromosome Number: 20\\nNetwork Definition of t... \n", "2 Chromosome Number: 21\\nNetwork Definition of t... \n", "3 Chromosome Number: 1\\nNetwork Definition of th... \n", "4 Chromosome Number: 14\\nNetwork Definition of t... \n", "... ... \n", "1444 Chromosome Number: 6\\nNetwork Definition of th... \n", "1445 Chromosome Number: 6\\nNetwork Definition of th... \n", "1446 Chromosome Number: 12\\nNetwork Definition of t... \n", "1447 Chromosome Number: 11\\nNetwork Definition of t... \n", "1448 Chromosome Number: 7\\nNetwork Definition of th... \n", "\n", " answer \\\n", "0 Creutzfeldt-Jakob Disease \n", "1 Creutzfeldt-Jakob Disease \n", "2 Alzheimer's disease \n", "3 Primary Aldosteronism \n", "4 Spinocerebellar Ataxia \n", "... ... \n", "1444 Spinocerebellar Ataxia \n", "1445 Spinocerebellar Ataxia \n", "1446 Pituitary Adenoma \n", "1447 Spinocerebellar Ataxia \n", "1448 Melanoma \n", "\n", " reference_sequence \\\n", "0 AATAAGCACAGAGGAAATAACATAATATCTCAAGTAGCTGTAACTG... \n", "1 AATAAGCACAGAGGAAATAACATAATATCTCAAGTAGCTGTAACTG... \n", "2 GCTGAAACTAACATACTAGCAATCTGGAAAGGCAATATAAAATATA... \n", "3 AATGTTATAAAAGTAAATAGTCTAGGGATGTCTTATTTCCAGATAA... \n", "4 TCATTAACTAAATGAAGGTCTACATTTAGGCAGTTTGTAATTTTGG... \n", "... ... \n", "1444 gaCGTATACATTAAATGTGTACAGTTTTTGTATATCAATTAGAAGT... \n", "1445 TTTTCaagattataaaatatgaaatgtcaAAATATTACCTTCATCA... \n", "1446 GTGGACAAGATGTGGCTAAGAAAACAAGCTACACATCAAGCTCATC... \n", "1447 ATCTGTGGGTTCTTTTAAATAGGCCAGATTTTATCCTAAAGGTAAG... \n", "1448 tataattttaggttttgcaATTTCAGCACTTAAAATCTGTTTTCCC... \n", "\n", " variant_sequence \\\n", "0 AATAAGCACAGAGGAAATAACATAATATCTCAAGTAGCTGTAACTG... \n", "1 AATAAGCACAGAGGAAATAACATAATATCTCAAGTAGCTGTAACTG... \n", "2 GCTGAAACTAACATACTAGCAATCTGGAAAGGCAATATAAAATATA... \n", "3 AATGTTATAAAAGTAAATAGTCTAGGGATGTCTTATTTCCAGATAA... \n", "4 TCATTAACTAAATGAAGGTCTACATTTAGGCAGTTTGTAATTTTGG... \n", "... ... 
\n", "1444 gaCGTATACATTAAATGTGTACAGTTTTTGTATATCAATTAGAAGT... \n", "1445 TTTTCaagattataaaatatgaaatgtcaAAATATTACCTTCATCA... \n", "1446 GTGGACAAGATGTGGCTAAGAAAACAAGCTACACATCAAGCTCATC... \n", "1447 ATCTGTGGGTTCTTTTAAATAGGCCAGATTTTATCCTAAAGGTAAG... \n", "1448 tataattttaggttttgcaATTTCAGCACTTAAAATCTGTTTTCCC... \n", "\n", " reasoning.reasoning_steps ID temp_ID \n", "0 [Step 1: The variant is an insertion in the PR... KEGG_854 854 \n", "1 [Step 1: The variant is a deletion of 47 nucle... KEGG_841 841 \n", "2 [Step 1: The TC>GA mutation in the APP gene on... KEGG_468 468 \n", "3 [Step 1: The variant KEGG_635 is a 15-nucleoti... KEGG_635 635 \n", "4 [Step 1: The variant is a trinucleotide repeat... KEGG_620 620 \n", "... ... ... ... \n", "1444 [Step 1: The variant KEGG_286 is an A>G substi... KEGG_286 286 \n", "1445 [Step 1: The variant is a single cytosine (C) ... KEGG_293 293 \n", "1446 [Step 1: The variant is a 20-nucleotide duplic... KEGG_7 7 \n", "1447 [Step 1: The variant KEGG_1285 is an A>G subst... KEGG_1285 1285 \n", "1448 [Step 1: The variant involves a nucleotide cha... 
# %% Read final dataset JSON files and create combined DataFrame
import re
import os
import json
import pandas as pd
from pathlib import Path

# Directory holding one KEGG_<n>_with_seqs.json file per variant.
json_dir = CONFIG['final_data_dir']
if not os.path.exists(json_dir):
    print(f"❌ JSON directory not found: {json_dir}")
    print("Please ensure previous processing steps completed successfully")
    raise FileNotFoundError(f"Directory not found: {json_dir}")

print(f"Processing JSON files from: {json_dir}")

# Captures the 'KEGG_<n>' part of a file name.
kegg_name_pattern = re.compile(r"(KEGG_\d+)_with_seqs")

df_list = []          # one single-row DataFrame per successfully read file
processed_count = 0

for filename in os.listdir(json_dir):
    if not filename.endswith(".json"):
        continue
    match = kegg_name_pattern.search(filename)
    if not match:
        continue

    kegg_id = match.group(1)
    file_path = os.path.join(json_dir, filename)

    try:
        with open(file_path, 'r') as f:
            data = json.load(f)

        # Nested keys are flattened into dotted column names.
        df = pd.json_normalize(data)
        df['ID'] = kegg_id
        # Numeric suffix, used later to sort the rows.
        df['temp_ID'] = int(kegg_id[5:])
        df_list.append(df)
        processed_count += 1

        if processed_count % 100 == 0:
            print(f"Processed {processed_count} JSON files...")

    except Exception as e:
        # Best effort: report the file and keep going.
        print(f"[Warning] Could not process {filename}: {str(e)}")

# Concatenate all per-file frames, or fall back to an empty frame.
if not df_list:
    print("❌ No JSON files found or processed successfully")
    combined_df = pd.DataFrame()
else:
    combined_df = pd.concat(df_list, ignore_index=True)
    print(f"✅ Combined {len(df_list)} JSON files into DataFrame")
    print(f"Total samples: {len(combined_df)}")

# Display the result
combined_df.head() if not combined_df.empty else print("No data to display")

# %% Order rows by numeric KEGG id, tidy the reasoning column name, drop the sort key
combined_df = (
    combined_df
    .sort_values(by=['temp_ID'])
    .rename(columns={"reasoning.reasoning_steps": "reasoning"})
    .drop(columns=['temp_ID'])
)

# %% Fix the column order and renumber the index from zero
combined_df = (
    combined_df[['ID', 'question', 'answer', 'reference_sequence', 'variant_sequence', 'reasoning']]
    .reset_index(drop=True)
)
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
IDquestionanswerreference_sequencevariant_sequencereasoning
0KEGG_1Chromosome Number: 12\\nNetwork Definition of t...Melanomagcttgagcccaggagttctagatcagcctgggcaagcaagaccttg...gcttgagcccaggagttctagatcagcctgggcaagcaagaccttg...[Step 1: The C>T mutation at position 57751646...
1KEGG_2Chromosome Number: 12\\nNetwork Definition of t...Melanomagcttgagcccaggagttctagatcagcctgggcaagcaagaccttg...gcttgagcccaggagttctagatcagcctgggcaagcaagaccttg...[Step 1: The C>A mutation at position 57751646...
2KEGG_3Chromosome Number: 12\\nNetwork Definition of t...Melanomagcttgagcccaggagttctagatcagcctgggcaagcaagaccttg...gcttgagcccaggagttctagatcagcctgggcaagcaagaccttg...[Step 1: The C>G mutation at position 57751646...
3KEGG_4Chromosome Number: 12\\nNetwork Definition of t...Melanomacttgagcccaggagttctagatcagcctgggcaagcaagaccttgt...cttgagcccaggagttctagatcagcctgggcaagcaagaccttgt...[Step 1: The G>A mutation at position 57751647...
4KEGG_5Chromosome Number: 12\\nNetwork Definition of t...Melanomacttgagcccaggagttctagatcagcctgggcaagcaagaccttgt...cttgagcccaggagttctagatcagcctgggcaagcaagaccttgt...[Step 1: The G>C mutation at position 57751647...
.....................
1444KEGG_1445Chromosome Number: 19\\nNetwork Definition of t...Hepatocellular carcinomagagctgagatcatgccactgcactccaacctgggcaacagagcgag...gagctgagatcatgccactgcactccaacctgggcaacagagcgag...[Step 1: The variant is a C>A substitution at ...
1445KEGG_1446Chromosome Number: 19\\nNetwork Definition of t...Hepatocellular carcinomaTGAAGGGTAGTACTGCCTCATAGGACAGTTGGGAACAGTCAATGAT...TGAAGGGTAGTACTGCCTCATAGGACAGTTGGGAACAGTCAATGAT...[Step 1: The variant is a T>C substitution at ...
1446KEGG_1447Chromosome Number: 16\\nNetwork Definition of t...Gastric cancerCAAACACAGGATGTAGAGGGCAGAGAGCATaggtgtgtgcgcatgt...CAAACACAGGATGTAGAGGGCAGAGAGCATaggtgtgtgcgcatgt...[Step 1: The variant KEGG_1447 represents an A...
1447KEGG_1448Chromosome Number: 16\\nNetwork Definition of t...Gastric cancerGATATATATAATTTGTCATTGATAAGAGAATGTGTCATTAAATTCA...GATATATATAATTTGTCATTGATAAGAGAATGTGTCATTAAATTCA...[Step 1: The variant KEGG_1448 is a T>G substi...
1448KEGG_1449Chromosome Number: 16\\nNetwork Definition of t...Gastric cancerGTCATTGATAAGAGAATGTGTCATTAAATTCAAACTGTACACTGCC...GTCATTGATAAGAGAATGTGTCATTAAATTCAAACTGTACACTGCC...[Step 1: The variant KEGG_1449 is a G>A substi...
\n", "

1449 rows × 6 columns

\n", "
" ], "text/plain": [ " ID question \\\n", "0 KEGG_1 Chromosome Number: 12\\nNetwork Definition of t... \n", "1 KEGG_2 Chromosome Number: 12\\nNetwork Definition of t... \n", "2 KEGG_3 Chromosome Number: 12\\nNetwork Definition of t... \n", "3 KEGG_4 Chromosome Number: 12\\nNetwork Definition of t... \n", "4 KEGG_5 Chromosome Number: 12\\nNetwork Definition of t... \n", "... ... ... \n", "1444 KEGG_1445 Chromosome Number: 19\\nNetwork Definition of t... \n", "1445 KEGG_1446 Chromosome Number: 19\\nNetwork Definition of t... \n", "1446 KEGG_1447 Chromosome Number: 16\\nNetwork Definition of t... \n", "1447 KEGG_1448 Chromosome Number: 16\\nNetwork Definition of t... \n", "1448 KEGG_1449 Chromosome Number: 16\\nNetwork Definition of t... \n", "\n", " answer \\\n", "0 Melanoma \n", "1 Melanoma \n", "2 Melanoma \n", "3 Melanoma \n", "4 Melanoma \n", "... ... \n", "1444 Hepatocellular carcinoma \n", "1445 Hepatocellular carcinoma \n", "1446 Gastric cancer \n", "1447 Gastric cancer \n", "1448 Gastric cancer \n", "\n", " reference_sequence \\\n", "0 gcttgagcccaggagttctagatcagcctgggcaagcaagaccttg... \n", "1 gcttgagcccaggagttctagatcagcctgggcaagcaagaccttg... \n", "2 gcttgagcccaggagttctagatcagcctgggcaagcaagaccttg... \n", "3 cttgagcccaggagttctagatcagcctgggcaagcaagaccttgt... \n", "4 cttgagcccaggagttctagatcagcctgggcaagcaagaccttgt... \n", "... ... \n", "1444 gagctgagatcatgccactgcactccaacctgggcaacagagcgag... \n", "1445 TGAAGGGTAGTACTGCCTCATAGGACAGTTGGGAACAGTCAATGAT... \n", "1446 CAAACACAGGATGTAGAGGGCAGAGAGCATaggtgtgtgcgcatgt... \n", "1447 GATATATATAATTTGTCATTGATAAGAGAATGTGTCATTAAATTCA... \n", "1448 GTCATTGATAAGAGAATGTGTCATTAAATTCAAACTGTACACTGCC... \n", "\n", " variant_sequence \\\n", "0 gcttgagcccaggagttctagatcagcctgggcaagcaagaccttg... \n", "1 gcttgagcccaggagttctagatcagcctgggcaagcaagaccttg... \n", "2 gcttgagcccaggagttctagatcagcctgggcaagcaagaccttg... \n", "3 cttgagcccaggagttctagatcagcctgggcaagcaagaccttgt... \n", "4 cttgagcccaggagttctagatcagcctgggcaagcaagaccttgt... \n", "... ... 
\n", "1444 gagctgagatcatgccactgcactccaacctgggcaacagagcgag... \n", "1445 TGAAGGGTAGTACTGCCTCATAGGACAGTTGGGAACAGTCAATGAT... \n", "1446 CAAACACAGGATGTAGAGGGCAGAGAGCATaggtgtgtgcgcatgt... \n", "1447 GATATATATAATTTGTCATTGATAAGAGAATGTGTCATTAAATTCA... \n", "1448 GTCATTGATAAGAGAATGTGTCATTAAATTCAAACTGTACACTGCC... \n", "\n", " reasoning \n", "0 [Step 1: The C>T mutation at position 57751646... \n", "1 [Step 1: The C>A mutation at position 57751646... \n", "2 [Step 1: The C>G mutation at position 57751646... \n", "3 [Step 1: The G>A mutation at position 57751647... \n", "4 [Step 1: The G>C mutation at position 57751647... \n", "... ... \n", "1444 [Step 1: The variant is a C>A substitution at ... \n", "1445 [Step 1: The variant is a T>C substitution at ... \n", "1446 [Step 1: The variant KEGG_1447 represents an A... \n", "1447 [Step 1: The variant KEGG_1448 is a T>G substi... \n", "1448 [Step 1: The variant KEGG_1449 is a G>A substi... \n", "\n", "[1449 rows x 6 columns]" ] }, "execution_count": 34, "metadata": {}, "output_type": "execute_result" } ], "source": [ "combined_df" ] }, { "cell_type": "markdown", "id": "f5cd7e22", "metadata": {}, "source": [ "### Performing the mutation and saving the reference and variant allele with a 1000 nt window" ] }, { "cell_type": "code", "execution_count": 42, "id": "8c89d455-598d-45e3-821b-6e37075b3a77", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "4001" ] }, "execution_count": 42, "metadata": {}, "output_type": "execute_result" } ], "source": [ "len(combined_df.iloc[0]['reference_sequence'])" ] }, { "cell_type": "code", "execution_count": 44, "id": "a1dd3ed8-18ca-4468-9ab9-98ebf4713260", "metadata": {}, "outputs": [], "source": [ "KEGG_2000 = combined_df.copy()" ] }, { "cell_type": "code", "execution_count": 49, "id": "688a7d0b-4a31-484d-9835-eb66d674b5de", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'KEGG_2'" ] }, "execution_count": 49, "metadata": {}, "output_type": "execute_result" } ], "source": [ 
# Generate sequences with updated window size
def _window_sequences(chrom_seq, start, reference_allele, variant_allele, window):
    """Build the reference and mutated sequences around a single variant.

    Args:
        chrom_seq: Sliceable chromosome sequence (``str()`` is applied to the slice).
        start: 0-based offset of the first reference-allele base in ``chrom_seq``.
        reference_allele: Bases replaced by the variant; only its length is used.
        variant_allele: Replacement bases, or the literal ``"deletion"`` to remove
            the reference allele without inserting anything.
        window: Number of flanking bases requested on each side of the variant.

    Returns:
        Tuple ``(reference_sequence, variant_sequence)`` of plain strings.
    """
    region_start = max(0, start - window)
    region_end = start + len(reference_allele) + window
    ref_seq = str(chrom_seq[region_start:region_end])

    # Position of the variant inside the extracted region. It equals `window`
    # unless the left flank was clamped at 0, in which case slicing at `window`
    # would replace the wrong bases.
    offset = start - region_start
    replacement = "" if variant_allele == "deletion" else variant_allele
    mutated_seq = ref_seq[:offset] + replacement + ref_seq[offset + len(reference_allele):]
    return ref_seq, mutated_seq


chromosome_dictionary = CONFIG['chromosome_dictionary']
window = CONFIG['sequence_window']

print(f"Generating sequences with {window}bp windows...")
KEGG_2000 = combined_df.copy()

# Positions of variants whose sequences could not be regenerated; those rows keep
# the sequences copied over from combined_df.
failed_variants = []

for i in range(len(KEGG_2000)):
    try:
        # variant_data is read by position, so row i here is assumed to describe
        # row i of KEGG_2000 — TODO confirm both frames share the same order.
        variant = variant_data.iloc[i]
        chromosome_id = chromosome_dictionary[str(variant['Chr'])]

        start = variant['Start']
        # NOTE(review): ENST rows are shifted by one, presumably because their
        # coordinates are 1-based — confirm against the variant table.
        if variant['TranscriptID'][:4] == "ENST":
            start = start - 1

        ref_seq, mutated_seq = _window_sequences(
            record_dict[chromosome_id].seq,
            start,
            variant['RefAllele'],
            variant['AltAllele'],
            window,
        )

        # Translate the position into the frame's own label so the write hits
        # the i-th row even when the index is not 0..n-1.
        row_label = KEGG_2000.index[i]
        KEGG_2000.at[row_label, 'reference_sequence'] = ref_seq
        KEGG_2000.at[row_label, 'variant_sequence'] = mutated_seq

        if (i + 1) % 100 == 0:
            print(f"Generated sequences for {i + 1}/{len(KEGG_2000)} variants...")

    except Exception as e:
        # Best effort: keep going, but remember the failure so it is reported below.
        failed_variants.append(i)
        print(f"[Error] Failed to generate sequence for variant {i}: {str(e)}")

if failed_variants:
    print(f"⚠️ {len(failed_variants)} variant(s) kept their previous sequences: {failed_variants}")

print(f"✅ Sequence generation complete for {window}bp windows")
"id": "e2a50c08-ccae-45ca-98e1-0c3d3e7d4647", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
IDquestionanswerreference_sequencevariant_sequencereasoning
0KEGG_1Chromosome Number: 12\\nNetwork Definition of t...MelanomaTTCAAGGTAGTCCAGGGTATGTGGGTCCCATACTTTCGACGAAACA...TTCAAGGTAGTCCAGGGTATGTGGGTCCCATACTTTCGACGAAACA...[Step 1: The C>T mutation at position 57751646...
1KEGG_2Chromosome Number: 12\\nNetwork Definition of t...MelanomaTTCAAGGTAGTCCAGGGTATGTGGGTCCCATACTTTCGACGAAACA...TTCAAGGTAGTCCAGGGTATGTGGGTCCCATACTTTCGACGAAACA...[Step 1: The C>A mutation at position 57751646...
2KEGG_3Chromosome Number: 12\\nNetwork Definition of t...MelanomaTTCAAGGTAGTCCAGGGTATGTGGGTCCCATACTTTCGACGAAACA...TTCAAGGTAGTCCAGGGTATGTGGGTCCCATACTTTCGACGAAACA...[Step 1: The C>G mutation at position 57751646...
3KEGG_4Chromosome Number: 12\\nNetwork Definition of t...MelanomaTCAAGGTAGTCCAGGGTATGTGGGTCCCATACTTTCGACGAAACAT...TCAAGGTAGTCCAGGGTATGTGGGTCCCATACTTTCGACGAAACAT...[Step 1: The G>A mutation at position 57751647...
4KEGG_5Chromosome Number: 12\\nNetwork Definition of t...MelanomaTCAAGGTAGTCCAGGGTATGTGGGTCCCATACTTTCGACGAAACAT...TCAAGGTAGTCCAGGGTATGTGGGTCCCATACTTTCGACGAAACAT...[Step 1: The G>C mutation at position 57751647...
.....................
1444KEGG_1445Chromosome Number: 19\\nNetwork Definition of t...Hepatocellular carcinomagcactccagcctgggcaacagagcaagagagacagggtcttactct...gcactccagcctgggcaacagagcaagagagacagggtcttactct...[Step 1: The variant is a C>A substitution at ...
1445KEGG_1446Chromosome Number: 19\\nNetwork Definition of t...Hepatocellular carcinomactcccaaagtgctgggattacaggcgtgagccactgggccctgcCC...ctcccaaagtgctgggattacaggcgtgagccactgggccctgcCC...[Step 1: The variant is a T>C substitution at ...
1446KEGG_1447Chromosome Number: 16\\nNetwork Definition of t...Gastric cancerggctgggtgtggtggctcatgcctgtaatcccagcattttgggagg...ggctgggtgtggtggctcatgcctgtaatcccagcattttgggagg...[Step 1: The variant KEGG_1447 represents an A...
1447KEGG_1448Chromosome Number: 16\\nNetwork Definition of t...Gastric cancertttgagatagggtttcactctgtcacccaggctggaaccacaacct...tttgagatagggtttcactctgtcacccaggctggaaccacaacct...[Step 1: The variant KEGG_1448 is a T>G substi...
1448KEGG_1449Chromosome Number: 16\\nNetwork Definition of t...Gastric cancertcactctgtcacccaggctggaaccacaacctccacttcccgggtt...tcactctgtcacccaggctggaaccacaacctccacttcccgggtt...[Step 1: The variant KEGG_1449 is a G>A substi...
\n", "

1449 rows × 6 columns

\n", "
" ], "text/plain": [ " ID question \\\n", "0 KEGG_1 Chromosome Number: 12\\nNetwork Definition of t... \n", "1 KEGG_2 Chromosome Number: 12\\nNetwork Definition of t... \n", "2 KEGG_3 Chromosome Number: 12\\nNetwork Definition of t... \n", "3 KEGG_4 Chromosome Number: 12\\nNetwork Definition of t... \n", "4 KEGG_5 Chromosome Number: 12\\nNetwork Definition of t... \n", "... ... ... \n", "1444 KEGG_1445 Chromosome Number: 19\\nNetwork Definition of t... \n", "1445 KEGG_1446 Chromosome Number: 19\\nNetwork Definition of t... \n", "1446 KEGG_1447 Chromosome Number: 16\\nNetwork Definition of t... \n", "1447 KEGG_1448 Chromosome Number: 16\\nNetwork Definition of t... \n", "1448 KEGG_1449 Chromosome Number: 16\\nNetwork Definition of t... \n", "\n", " answer \\\n", "0 Melanoma \n", "1 Melanoma \n", "2 Melanoma \n", "3 Melanoma \n", "4 Melanoma \n", "... ... \n", "1444 Hepatocellular carcinoma \n", "1445 Hepatocellular carcinoma \n", "1446 Gastric cancer \n", "1447 Gastric cancer \n", "1448 Gastric cancer \n", "\n", " reference_sequence \\\n", "0 TTCAAGGTAGTCCAGGGTATGTGGGTCCCATACTTTCGACGAAACA... \n", "1 TTCAAGGTAGTCCAGGGTATGTGGGTCCCATACTTTCGACGAAACA... \n", "2 TTCAAGGTAGTCCAGGGTATGTGGGTCCCATACTTTCGACGAAACA... \n", "3 TCAAGGTAGTCCAGGGTATGTGGGTCCCATACTTTCGACGAAACAT... \n", "4 TCAAGGTAGTCCAGGGTATGTGGGTCCCATACTTTCGACGAAACAT... \n", "... ... \n", "1444 gcactccagcctgggcaacagagcaagagagacagggtcttactct... \n", "1445 ctcccaaagtgctgggattacaggcgtgagccactgggccctgcCC... \n", "1446 ggctgggtgtggtggctcatgcctgtaatcccagcattttgggagg... \n", "1447 tttgagatagggtttcactctgtcacccaggctggaaccacaacct... \n", "1448 tcactctgtcacccaggctggaaccacaacctccacttcccgggtt... \n", "\n", " variant_sequence \\\n", "0 TTCAAGGTAGTCCAGGGTATGTGGGTCCCATACTTTCGACGAAACA... \n", "1 TTCAAGGTAGTCCAGGGTATGTGGGTCCCATACTTTCGACGAAACA... \n", "2 TTCAAGGTAGTCCAGGGTATGTGGGTCCCATACTTTCGACGAAACA... \n", "3 TCAAGGTAGTCCAGGGTATGTGGGTCCCATACTTTCGACGAAACAT... \n", "4 TCAAGGTAGTCCAGGGTATGTGGGTCCCATACTTTCGACGAAACAT... \n", "... ... 
# Build the dataset container; the HuggingFace `datasets` package is optional.
try:
    from datasets import Dataset, DatasetDict

    # Convert the DataFrame, then expose it as the single "train" split
    train_dataset = Dataset.from_pandas(KEGG_2000)
    splits = {"train": train_dataset}
    dataset = DatasetDict(splits)

    use_hf_datasets = True
    print("✅ HuggingFace dataset created")

except ImportError:
    # No `datasets` package: both names point at the plain DataFrame
    print("⚠️ HuggingFace datasets not available, using pandas only")
    dataset = train_dataset = KEGG_2000
    use_hf_datasets = False

print(f"Final dataset contains {len(train_dataset)} samples")
# Save final dataset locally instead of uploading to HuggingFace
# Users can upload to their own repositories if needed

output_file = "kegg_variant_dataset_final.parquet"
dataset_info_file = "dataset_info.json"

# train_dataset is either a HuggingFace Dataset or, when `datasets` could not be
# imported, the pandas DataFrame itself. Only the former has `column_names`, so
# resolve the feature list once in a way that works for both.
if hasattr(train_dataset, "column_names"):
    feature_names = list(train_dataset.column_names)
else:
    feature_names = list(train_dataset.columns)

# Save dataset as Parquet for efficient storage
train_dataset.to_parquet(output_file)
print(f"✅ Dataset saved to: {output_file}")

# Save dataset information
dataset_info = {
    "name": "KEGG Variant Dataset",
    "description": "Genetic variants with biological reasoning for disease association",
    "total_samples": len(train_dataset),
    "sequence_length": f"~{CONFIG['sequence_window']*2}bp",
    "features": feature_names,
    # `disease` is only present if an earlier cell defined it; otherwise record "Unknown".
    "diseases": len(set(disease)) if 'disease' in locals() else "Unknown",
    "created_by": "KEGG Data Processing Pipeline",
    "version": "1.0"
}

with open(dataset_info_file, 'w', encoding='utf-8') as f:
    json.dump(dataset_info, f, indent=2)

print(f"✅ Dataset information saved to: {dataset_info_file}")
print(f"\nDataset ready for use:")
print(f" - Main dataset: {output_file}")
print(f" - Information: {dataset_info_file}")
print(f" - Samples: {len(train_dataset)}")
print(f" - Features: {feature_names}")

# The upload hint only makes sense when a HuggingFace dataset was actually built.
if hasattr(train_dataset, "push_to_hub"):
    print("\n📝 To upload to HuggingFace Hub:")
    print("dataset.push_to_hub('your-username/your-dataset-name')")
"5b448bd7-e256-4fad-ae95-dbe299d380f0", "metadata": {}, "source": [ "# KEGG Dataset with Alternative Window Size\n", "\n", "This section demonstrates creating the dataset with different sequence window parameters." ] }, { "cell_type": "code", "execution_count": null, "id": "9fd609ca-6276-4425-997f-0589fe03f1ea", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.12.9" } }, "nbformat": 4, "nbformat_minor": 5 }