{ "cells": [ { "cell_type": "markdown", "id": "744b9f11-6ef8-4409-a388-fe860480c9de", "metadata": {}, "source": [ "# Processing the Reasoning Trace Data and Adding in Nucleotides" ] }, { "cell_type": "code", "execution_count": null, "id": "8950d38a-dfa9-4dbd-b388-941dec69b3ee", "metadata": {}, "outputs": [], "source": [ "%cd kegg_data" ] }, { "cell_type": "code", "execution_count": 2, "id": "a1c3d972-c52e-4d73-9816-e970fca3e1bb", "metadata": {}, "outputs": [], "source": [ "import json\n", "from Bio import SeqIO" ] }, { "cell_type": "code", "execution_count": 3, "id": "c80d7741-7aaa-4c28-a93a-ad955f3da6bb", "metadata": {}, "outputs": [], "source": [ "# -p keeps this cell re-runnable: no error when the folder already exists.\n", "!mkdir -p 'processed_variants 1450 with seqs'" ] }, { "cell_type": "code", "execution_count": 4, "id": "e4021560-9130-4fdf-a640-15b5da6935a0", "metadata": {}, "outputs": [], "source": [ "# Variant IDs run from KEGG_1 to KEGG_1449 inclusive.\n", "N_VARIANTS = 1449\n", "# NOTE(review): the input folder is named 'first 700' although every variant ID is read from it -- confirm it holds all of them.\n", "INPUT_DIR = 'processed_variants first 700'\n", "FASTA_DIR = 'nt_seq'\n", "OUTPUT_DIR = 'processed_variants 1450 with seqs'\n", "\n", "for i in range(1, N_VARIANTS + 1):\n", "    # Load the processed JSON record for this variant.\n", "    with open(f'{INPUT_DIR}/KEGG_{i}_processed.json', 'r', encoding='utf-8') as file:\n", "        data = json.load(file)\n", "\n", "    # The nucleotide file is read as FASTA: first record = reference, second record = variant.\n", "    fasta_file = f'{FASTA_DIR}/KEGG_{i}.txt'\n", "    sequence_list = list(SeqIO.parse(fasta_file, 'fasta'))\n", "    if len(sequence_list) < 2:\n", "        # Fail with the offending file name instead of a bare IndexError.\n", "        raise ValueError(f'{fasta_file}: expected 2 FASTA records (reference, variant), found {len(sequence_list)}')\n", "    ref_seq = sequence_list[0].seq\n", "    var_seq = sequence_list[1].seq\n", "\n", "    # Add sequences to the JSON data\n", "    data['reference_sequence'] = str(ref_seq)\n", "    data['variant_sequence'] = str(var_seq)\n", "\n", "    # Save the updated JSON to a new file\n", "    with open(f'{OUTPUT_DIR}/KEGG_{i}_with_seqs.json', 'w', encoding='utf-8') as out_file:\n", "        json.dump(data, out_file, indent=2)" ] }, { "cell_type": "markdown", "id": "4db8af16-a11f-4987-b1a6-db552c6714fb", "metadata": {}, "source": [ "# Creating the Final KEGG SFT and RL Dataset\n", "\n", "# Final KEGG Dataset Creation\n", "\n", "This section creates the final machine learning dataset by combining variant data with sequences and generating structured 
question-answer pairs for biological reasoning tasks." ] }, { "cell_type": "code", "execution_count": null, "id": "f9517d40-74e3-4ddb-bd16-95f9ab7927aa", "metadata": {}, "outputs": [], "source": [ "cd kegg_data" ] }, { "cell_type": "code", "execution_count": 2, "id": "53c5948f-4bde-432d-b35c-34c733eb9ad1", "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import json\n", "import ast" ] }, { "cell_type": "code", "execution_count": 3, "id": "60c66a0d-359b-4d2a-8427-53f4d18d1047", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
| \n", " | Var_ID | \n", "Network | \n", "Entry | \n", "Source | \n", "ID | \n", "TranscriptID | \n", "NucChange | \n", "Chr | \n", "Start | \n", "End | \n", "... | \n", "Network Expanded | \n", "Pathway | \n", "Class | \n", "Disease | \n", "Gene | \n", "Variant_Name | \n", "Variant_Gene | \n", "Variant_Gene Info | \n", "Variant_Type | \n", "Disease_Names | \n", "
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | \n", "KEGG_1 | \n", "N00073 | \n", "1019v2 | \n", "ClinVar | \n", "16929 | \n", "NC_000012.12 | \n", "NaN | \n", "12 | \n", "57751646 | \n", "57751646 | \n", "... | \n", "((595,894,896)+1019v2) -> 5925 // (1869,1870,1... | \n", "{'hsa05218': 'Melanoma'} | \n", "{'nt06268': 'Melanoma', 'nt06230': 'Cell cycle... | \n", "{'H00038': 'Melanoma is a form of skin cancer ... | \n", "{'595': 'CCND1; cyclin D1', '894': 'CCND2; cyc... | \n", "CDK4 mutation | \n", "CDK4 | \n", "cyclin dependent kinase 4 [KO:K02089] | \n", "NaN | \n", "{'H00038': 'Melanoma'} | \n", "
| 1 | \n", "KEGG_2 | \n", "N00073 | \n", "1019v2 | \n", "dbSNP | \n", "rs104894340 | \n", "NC_000012.12 | \n", "NaN | \n", "12 | \n", "57751646 | \n", "57751646 | \n", "... | \n", "((595,894,896)+1019v2) -> 5925 // (1869,1870,1... | \n", "{'hsa05218': 'Melanoma'} | \n", "{'nt06268': 'Melanoma', 'nt06230': 'Cell cycle... | \n", "{'H00038': 'Melanoma is a form of skin cancer ... | \n", "{'595': 'CCND1; cyclin D1', '894': 'CCND2; cyc... | \n", "CDK4 mutation | \n", "CDK4 | \n", "cyclin dependent kinase 4 [KO:K02089] | \n", "NaN | \n", "{'H00038': 'Melanoma'} | \n", "
| 2 | \n", "KEGG_3 | \n", "N00073 | \n", "1019v2 | \n", "dbSNP | \n", "rs104894340 | \n", "NC_000012.12 | \n", "NaN | \n", "12 | \n", "57751646 | \n", "57751646 | \n", "... | \n", "((595,894,896)+1019v2) -> 5925 // (1869,1870,1... | \n", "{'hsa05218': 'Melanoma'} | \n", "{'nt06268': 'Melanoma', 'nt06230': 'Cell cycle... | \n", "{'H00038': 'Melanoma is a form of skin cancer ... | \n", "{'595': 'CCND1; cyclin D1', '894': 'CCND2; cyc... | \n", "CDK4 mutation | \n", "CDK4 | \n", "cyclin dependent kinase 4 [KO:K02089] | \n", "NaN | \n", "{'H00038': 'Melanoma'} | \n", "
| 3 | \n", "KEGG_4 | \n", "N00073 | \n", "1019v2 | \n", "ClinVar | \n", "16928 | \n", "NC_000012.12 | \n", "NaN | \n", "12 | \n", "57751647 | \n", "57751647 | \n", "... | \n", "((595,894,896)+1019v2) -> 5925 // (1869,1870,1... | \n", "{'hsa05218': 'Melanoma'} | \n", "{'nt06268': 'Melanoma', 'nt06230': 'Cell cycle... | \n", "{'H00038': 'Melanoma is a form of skin cancer ... | \n", "{'595': 'CCND1; cyclin D1', '894': 'CCND2; cyc... | \n", "CDK4 mutation | \n", "CDK4 | \n", "cyclin dependent kinase 4 [KO:K02089] | \n", "NaN | \n", "{'H00038': 'Melanoma'} | \n", "
| 4 | \n", "KEGG_5 | \n", "N00073 | \n", "1019v2 | \n", "dbSNP | \n", "rs11547328 | \n", "NC_000012.12 | \n", "NaN | \n", "12 | \n", "57751647 | \n", "57751647 | \n", "... | \n", "((595,894,896)+1019v2) -> 5925 // (1869,1870,1... | \n", "{'hsa05218': 'Melanoma'} | \n", "{'nt06268': 'Melanoma', 'nt06230': 'Cell cycle... | \n", "{'H00038': 'Melanoma is a form of skin cancer ... | \n", "{'595': 'CCND1; cyclin D1', '894': 'CCND2; cyc... | \n", "CDK4 mutation | \n", "CDK4 | \n", "cyclin dependent kinase 4 [KO:K02089] | \n", "NaN | \n", "{'H00038': 'Melanoma'} | \n", "
| ... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "
| 1444 | \n", "KEGG_1445 | \n", "N00244 | \n", "9817v1 | \n", "COSM | \n", "6196635 | \n", "ENST00000393623.6 | \n", "c.706G>T | \n", "19 | \n", "10492196 | \n", "10492196 | \n", "... | \n", "9817v1 // 4780 => (3162,1728,119391,221357,293... | \n", "{'hsa05225': 'Hepatocellular carcinoma'} | \n", "{'nt06263': 'Hepatocellular carcinoma', 'nt062... | \n", "{'H00048': 'Hepatocellular carcinoma (HCC) is ... | \n", "{'9817': 'KEAP1; kelch like ECH associated pro... | \n", "KEAP1 mutation | \n", "KEAP1 | \n", "kelch like ECH associated protein 1 [KO:K10456] | \n", "NaN | \n", "{'H00048': 'Hepatocellular carcinoma;'} | \n", "
| 1445 | \n", "KEGG_1446 | \n", "N00244 | \n", "9817v1 | \n", "COSM | \n", "6196637 | \n", "ENST00000393623.6 | \n", "c.548A>G | \n", "19 | \n", "10499486 | \n", "10499486 | \n", "... | \n", "9817v1 // 4780 => (3162,1728,119391,221357,293... | \n", "{'hsa05225': 'Hepatocellular carcinoma'} | \n", "{'nt06263': 'Hepatocellular carcinoma', 'nt062... | \n", "{'H00048': 'Hepatocellular carcinoma (HCC) is ... | \n", "{'9817': 'KEAP1; kelch like ECH associated pro... | \n", "KEAP1 mutation | \n", "KEAP1 | \n", "kelch like ECH associated protein 1 [KO:K10456] | \n", "NaN | \n", "{'H00048': 'Hepatocellular carcinoma;'} | \n", "
| 1446 | \n", "KEGG_1447 | \n", "N00258 | \n", "999v2 | \n", "COSM | \n", "4766271 | \n", "ENST00000621016.4 | \n", "c.662A>G | \n", "16 | \n", "68808823 | \n", "68808823 | \n", "... | \n", "999v2 // 1499 -> (6932,83439,6934,51176) => (4... | \n", "{'hsa05226': 'Gastric cancer'} | \n", "{'nt06261': 'Gastric cancer', 'nt06215': 'WNT ... | \n", "{'H00018': \"Gastric cancer (GC) is one of the ... | \n", "{'999': 'CDH1; cadherin 1', '1499': 'CTNNB1; c... | \n", "CDH1 mutation | \n", "CDH1 | \n", "cadherin 1 [KO:K05689] | \n", "NaN | \n", "{'H00018': 'Gastric cancer'} | \n", "
| 1447 | \n", "KEGG_1448 | \n", "N00258 | \n", "999v2 | \n", "COSM | \n", "4766211 | \n", "ENST00000621016.4 | \n", "c.755T>G | \n", "16 | \n", "68810264 | \n", "68810264 | \n", "... | \n", "999v2 // 1499 -> (6932,83439,6934,51176) => (4... | \n", "{'hsa05226': 'Gastric cancer'} | \n", "{'nt06261': 'Gastric cancer', 'nt06215': 'WNT ... | \n", "{'H00018': \"Gastric cancer (GC) is one of the ... | \n", "{'999': 'CDH1; cadherin 1', '1499': 'CTNNB1; c... | \n", "CDH1 mutation | \n", "CDH1 | \n", "cadherin 1 [KO:K05689] | \n", "NaN | \n", "{'H00018': 'Gastric cancer'} | \n", "
| 1448 | \n", "KEGG_1449 | \n", "N00258 | \n", "999v2 | \n", "COSM | \n", "1379150 | \n", "ENST00000621016.4 | \n", "c.769G>A | \n", "16 | \n", "68810278 | \n", "68810278 | \n", "... | \n", "999v2 // 1499 -> (6932,83439,6934,51176) => (4... | \n", "{'hsa05226': 'Gastric cancer'} | \n", "{'nt06261': 'Gastric cancer', 'nt06215': 'WNT ... | \n", "{'H00018': \"Gastric cancer (GC) is one of the ... | \n", "{'999': 'CDH1; cadherin 1', '1499': 'CTNNB1; c... | \n", "CDH1 mutation | \n", "CDH1 | \n", "cadherin 1 [KO:K05689] | \n", "NaN | \n", "{'H00018': 'Gastric cancer'} | \n", "
1449 rows × 24 columns
\n", "| \n", " | Var_ID | \n", "Network | \n", "Entry | \n", "Source | \n", "ID | \n", "TranscriptID | \n", "NucChange | \n", "Chr | \n", "Start | \n", "End | \n", "... | \n", "Network Expanded | \n", "Pathway | \n", "Class | \n", "Disease | \n", "Gene | \n", "Variant_Name | \n", "Variant_Gene | \n", "Variant_Gene Info | \n", "Variant_Type | \n", "Disease_Names | \n", "
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | \n", "KEGG_1 | \n", "N00073 | \n", "1019v2 | \n", "ClinVar | \n", "16929 | \n", "NC_000012.12 | \n", "NaN | \n", "12 | \n", "57751646 | \n", "57751646 | \n", "... | \n", "((595,894,896)+1019v2) -> 5925 // (1869,1870,1... | \n", "{'hsa05218': 'Melanoma'} | \n", "{'nt06268': 'Melanoma', 'nt06230': 'Cell cycle... | \n", "{'H00038': 'Melanoma is a form of skin cancer ... | \n", "{'595': 'CCND1; cyclin D1', '894': 'CCND2; cyc... | \n", "CDK4 mutation | \n", "CDK4 | \n", "cyclin dependent kinase 4 [KO:K02089] | \n", "NaN | \n", "{'H00038': 'Melanoma'} | \n", "
| 1 | \n", "KEGG_2 | \n", "N00073 | \n", "1019v2 | \n", "dbSNP | \n", "rs104894340 | \n", "NC_000012.12 | \n", "NaN | \n", "12 | \n", "57751646 | \n", "57751646 | \n", "... | \n", "((595,894,896)+1019v2) -> 5925 // (1869,1870,1... | \n", "{'hsa05218': 'Melanoma'} | \n", "{'nt06268': 'Melanoma', 'nt06230': 'Cell cycle... | \n", "{'H00038': 'Melanoma is a form of skin cancer ... | \n", "{'595': 'CCND1; cyclin D1', '894': 'CCND2; cyc... | \n", "CDK4 mutation | \n", "CDK4 | \n", "cyclin dependent kinase 4 [KO:K02089] | \n", "NaN | \n", "{'H00038': 'Melanoma'} | \n", "
| 2 | \n", "KEGG_3 | \n", "N00073 | \n", "1019v2 | \n", "dbSNP | \n", "rs104894340 | \n", "NC_000012.12 | \n", "NaN | \n", "12 | \n", "57751646 | \n", "57751646 | \n", "... | \n", "((595,894,896)+1019v2) -> 5925 // (1869,1870,1... | \n", "{'hsa05218': 'Melanoma'} | \n", "{'nt06268': 'Melanoma', 'nt06230': 'Cell cycle... | \n", "{'H00038': 'Melanoma is a form of skin cancer ... | \n", "{'595': 'CCND1; cyclin D1', '894': 'CCND2; cyc... | \n", "CDK4 mutation | \n", "CDK4 | \n", "cyclin dependent kinase 4 [KO:K02089] | \n", "NaN | \n", "{'H00038': 'Melanoma'} | \n", "
| 3 | \n", "KEGG_4 | \n", "N00073 | \n", "1019v2 | \n", "ClinVar | \n", "16928 | \n", "NC_000012.12 | \n", "NaN | \n", "12 | \n", "57751647 | \n", "57751647 | \n", "... | \n", "((595,894,896)+1019v2) -> 5925 // (1869,1870,1... | \n", "{'hsa05218': 'Melanoma'} | \n", "{'nt06268': 'Melanoma', 'nt06230': 'Cell cycle... | \n", "{'H00038': 'Melanoma is a form of skin cancer ... | \n", "{'595': 'CCND1; cyclin D1', '894': 'CCND2; cyc... | \n", "CDK4 mutation | \n", "CDK4 | \n", "cyclin dependent kinase 4 [KO:K02089] | \n", "NaN | \n", "{'H00038': 'Melanoma'} | \n", "
| 4 | \n", "KEGG_5 | \n", "N00073 | \n", "1019v2 | \n", "dbSNP | \n", "rs11547328 | \n", "NC_000012.12 | \n", "NaN | \n", "12 | \n", "57751647 | \n", "57751647 | \n", "... | \n", "((595,894,896)+1019v2) -> 5925 // (1869,1870,1... | \n", "{'hsa05218': 'Melanoma'} | \n", "{'nt06268': 'Melanoma', 'nt06230': 'Cell cycle... | \n", "{'H00038': 'Melanoma is a form of skin cancer ... | \n", "{'595': 'CCND1; cyclin D1', '894': 'CCND2; cyc... | \n", "CDK4 mutation | \n", "CDK4 | \n", "cyclin dependent kinase 4 [KO:K02089] | \n", "NaN | \n", "{'H00038': 'Melanoma'} | \n", "
| ... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "
| 1444 | \n", "KEGG_1445 | \n", "N00244 | \n", "9817v1 | \n", "COSM | \n", "6196635 | \n", "ENST00000393623.6 | \n", "c.706G>T | \n", "19 | \n", "10492196 | \n", "10492196 | \n", "... | \n", "9817v1 // 4780 => (3162,1728,119391,221357,293... | \n", "{'hsa05225': 'Hepatocellular carcinoma'} | \n", "{'nt06263': 'Hepatocellular carcinoma', 'nt062... | \n", "{'H00048': 'Hepatocellular carcinoma (HCC) is ... | \n", "{'9817': 'KEAP1; kelch like ECH associated pro... | \n", "KEAP1 mutation | \n", "KEAP1 | \n", "kelch like ECH associated protein 1 [KO:K10456] | \n", "NaN | \n", "{'H00048': 'Hepatocellular carcinoma;'} | \n", "
| 1445 | \n", "KEGG_1446 | \n", "N00244 | \n", "9817v1 | \n", "COSM | \n", "6196637 | \n", "ENST00000393623.6 | \n", "c.548A>G | \n", "19 | \n", "10499486 | \n", "10499486 | \n", "... | \n", "9817v1 // 4780 => (3162,1728,119391,221357,293... | \n", "{'hsa05225': 'Hepatocellular carcinoma'} | \n", "{'nt06263': 'Hepatocellular carcinoma', 'nt062... | \n", "{'H00048': 'Hepatocellular carcinoma (HCC) is ... | \n", "{'9817': 'KEAP1; kelch like ECH associated pro... | \n", "KEAP1 mutation | \n", "KEAP1 | \n", "kelch like ECH associated protein 1 [KO:K10456] | \n", "NaN | \n", "{'H00048': 'Hepatocellular carcinoma;'} | \n", "
| 1446 | \n", "KEGG_1447 | \n", "N00258 | \n", "999v2 | \n", "COSM | \n", "4766271 | \n", "ENST00000621016.4 | \n", "c.662A>G | \n", "16 | \n", "68808823 | \n", "68808823 | \n", "... | \n", "999v2 // 1499 -> (6932,83439,6934,51176) => (4... | \n", "{'hsa05226': 'Gastric cancer'} | \n", "{'nt06261': 'Gastric cancer', 'nt06215': 'WNT ... | \n", "{'H00018': \"Gastric cancer (GC) is one of the ... | \n", "{'999': 'CDH1; cadherin 1', '1499': 'CTNNB1; c... | \n", "CDH1 mutation | \n", "CDH1 | \n", "cadherin 1 [KO:K05689] | \n", "NaN | \n", "{'H00018': 'Gastric cancer'} | \n", "
| 1447 | \n", "KEGG_1448 | \n", "N00258 | \n", "999v2 | \n", "COSM | \n", "4766211 | \n", "ENST00000621016.4 | \n", "c.755T>G | \n", "16 | \n", "68810264 | \n", "68810264 | \n", "... | \n", "999v2 // 1499 -> (6932,83439,6934,51176) => (4... | \n", "{'hsa05226': 'Gastric cancer'} | \n", "{'nt06261': 'Gastric cancer', 'nt06215': 'WNT ... | \n", "{'H00018': \"Gastric cancer (GC) is one of the ... | \n", "{'999': 'CDH1; cadherin 1', '1499': 'CTNNB1; c... | \n", "CDH1 mutation | \n", "CDH1 | \n", "cadherin 1 [KO:K05689] | \n", "NaN | \n", "{'H00018': 'Gastric cancer'} | \n", "
| 1448 | \n", "KEGG_1449 | \n", "N00258 | \n", "999v2 | \n", "COSM | \n", "1379150 | \n", "ENST00000621016.4 | \n", "c.769G>A | \n", "16 | \n", "68810278 | \n", "68810278 | \n", "... | \n", "999v2 // 1499 -> (6932,83439,6934,51176) => (4... | \n", "{'hsa05226': 'Gastric cancer'} | \n", "{'nt06261': 'Gastric cancer', 'nt06215': 'WNT ... | \n", "{'H00018': \"Gastric cancer (GC) is one of the ... | \n", "{'999': 'CDH1; cadherin 1', '1499': 'CTNNB1; c... | \n", "CDH1 mutation | \n", "CDH1 | \n", "cadherin 1 [KO:K05689] | \n", "NaN | \n", "{'H00018': 'Gastric cancer'} | \n", "
1449 rows × 24 columns
\n", "| \n", " | question | \n", "answer | \n", "reference_sequence | \n", "variant_sequence | \n", "reasoning.reasoning_steps | \n", "ID | \n", "temp_ID | \n", "
|---|---|---|---|---|---|---|---|
| 0 | \n", "Chromosome Number: 20\\nNetwork Definition of t... | \n", "Creutzfeldt-Jakob Disease | \n", "AATAAGCACAGAGGAAATAACATAATATCTCAAGTAGCTGTAACTG... | \n", "AATAAGCACAGAGGAAATAACATAATATCTCAAGTAGCTGTAACTG... | \n", "[Step 1: The variant is an insertion in the PR... | \n", "KEGG_854 | \n", "854 | \n", "
| 1 | \n", "Chromosome Number: 20\\nNetwork Definition of t... | \n", "Creutzfeldt-Jakob Disease | \n", "AATAAGCACAGAGGAAATAACATAATATCTCAAGTAGCTGTAACTG... | \n", "AATAAGCACAGAGGAAATAACATAATATCTCAAGTAGCTGTAACTG... | \n", "[Step 1: The variant is a deletion of 47 nucle... | \n", "KEGG_841 | \n", "841 | \n", "
| 2 | \n", "Chromosome Number: 21\\nNetwork Definition of t... | \n", "Alzheimer's disease | \n", "GCTGAAACTAACATACTAGCAATCTGGAAAGGCAATATAAAATATA... | \n", "GCTGAAACTAACATACTAGCAATCTGGAAAGGCAATATAAAATATA... | \n", "[Step 1: The TC>GA mutation in the APP gene on... | \n", "KEGG_468 | \n", "468 | \n", "
| 3 | \n", "Chromosome Number: 1\\nNetwork Definition of th... | \n", "Primary Aldosteronism | \n", "AATGTTATAAAAGTAAATAGTCTAGGGATGTCTTATTTCCAGATAA... | \n", "AATGTTATAAAAGTAAATAGTCTAGGGATGTCTTATTTCCAGATAA... | \n", "[Step 1: The variant KEGG_635 is a 15-nucleoti... | \n", "KEGG_635 | \n", "635 | \n", "
| 4 | \n", "Chromosome Number: 14\\nNetwork Definition of t... | \n", "Spinocerebellar Ataxia | \n", "TCATTAACTAAATGAAGGTCTACATTTAGGCAGTTTGTAATTTTGG... | \n", "TCATTAACTAAATGAAGGTCTACATTTAGGCAGTTTGTAATTTTGG... | \n", "[Step 1: The variant is a trinucleotide repeat... | \n", "KEGG_620 | \n", "620 | \n", "
| ... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "
| 1444 | \n", "Chromosome Number: 6\\nNetwork Definition of th... | \n", "Spinocerebellar Ataxia | \n", "gaCGTATACATTAAATGTGTACAGTTTTTGTATATCAATTAGAAGT... | \n", "gaCGTATACATTAAATGTGTACAGTTTTTGTATATCAATTAGAAGT... | \n", "[Step 1: The variant KEGG_286 is an A>G substi... | \n", "KEGG_286 | \n", "286 | \n", "
| 1445 | \n", "Chromosome Number: 6\\nNetwork Definition of th... | \n", "Spinocerebellar Ataxia | \n", "TTTTCaagattataaaatatgaaatgtcaAAATATTACCTTCATCA... | \n", "TTTTCaagattataaaatatgaaatgtcaAAATATTACCTTCATCA... | \n", "[Step 1: The variant is a single cytosine (C) ... | \n", "KEGG_293 | \n", "293 | \n", "
| 1446 | \n", "Chromosome Number: 12\\nNetwork Definition of t... | \n", "Pituitary Adenoma | \n", "GTGGACAAGATGTGGCTAAGAAAACAAGCTACACATCAAGCTCATC... | \n", "GTGGACAAGATGTGGCTAAGAAAACAAGCTACACATCAAGCTCATC... | \n", "[Step 1: The variant is a 20-nucleotide duplic... | \n", "KEGG_7 | \n", "7 | \n", "
| 1447 | \n", "Chromosome Number: 11\\nNetwork Definition of t... | \n", "Spinocerebellar Ataxia | \n", "ATCTGTGGGTTCTTTTAAATAGGCCAGATTTTATCCTAAAGGTAAG... | \n", "ATCTGTGGGTTCTTTTAAATAGGCCAGATTTTATCCTAAAGGTAAG... | \n", "[Step 1: The variant KEGG_1285 is an A>G subst... | \n", "KEGG_1285 | \n", "1285 | \n", "
| 1448 | \n", "Chromosome Number: 7\\nNetwork Definition of th... | \n", "Melanoma | \n", "tataattttaggttttgcaATTTCAGCACTTAAAATCTGTTTTCCC... | \n", "tataattttaggttttgcaATTTCAGCACTTAAAATCTGTTTTCCC... | \n", "[Step 1: The variant involves a nucleotide cha... | \n", "KEGG_1290 | \n", "1290 | \n", "
1449 rows × 7 columns
\n", "| \n", " | ID | \n", "question | \n", "answer | \n", "reference_sequence | \n", "variant_sequence | \n", "reasoning | \n", "
|---|---|---|---|---|---|---|
| 0 | \n", "KEGG_1 | \n", "Chromosome Number: 12\\nNetwork Definition of t... | \n", "Melanoma | \n", "gcttgagcccaggagttctagatcagcctgggcaagcaagaccttg... | \n", "gcttgagcccaggagttctagatcagcctgggcaagcaagaccttg... | \n", "[Step 1: The C>T mutation at position 57751646... | \n", "
| 1 | \n", "KEGG_2 | \n", "Chromosome Number: 12\\nNetwork Definition of t... | \n", "Melanoma | \n", "gcttgagcccaggagttctagatcagcctgggcaagcaagaccttg... | \n", "gcttgagcccaggagttctagatcagcctgggcaagcaagaccttg... | \n", "[Step 1: The C>A mutation at position 57751646... | \n", "
| 2 | \n", "KEGG_3 | \n", "Chromosome Number: 12\\nNetwork Definition of t... | \n", "Melanoma | \n", "gcttgagcccaggagttctagatcagcctgggcaagcaagaccttg... | \n", "gcttgagcccaggagttctagatcagcctgggcaagcaagaccttg... | \n", "[Step 1: The C>G mutation at position 57751646... | \n", "
| 3 | \n", "KEGG_4 | \n", "Chromosome Number: 12\\nNetwork Definition of t... | \n", "Melanoma | \n", "cttgagcccaggagttctagatcagcctgggcaagcaagaccttgt... | \n", "cttgagcccaggagttctagatcagcctgggcaagcaagaccttgt... | \n", "[Step 1: The G>A mutation at position 57751647... | \n", "
| 4 | \n", "KEGG_5 | \n", "Chromosome Number: 12\\nNetwork Definition of t... | \n", "Melanoma | \n", "cttgagcccaggagttctagatcagcctgggcaagcaagaccttgt... | \n", "cttgagcccaggagttctagatcagcctgggcaagcaagaccttgt... | \n", "[Step 1: The G>C mutation at position 57751647... | \n", "
| ... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "
| 1444 | \n", "KEGG_1445 | \n", "Chromosome Number: 19\\nNetwork Definition of t... | \n", "Hepatocellular carcinoma | \n", "gagctgagatcatgccactgcactccaacctgggcaacagagcgag... | \n", "gagctgagatcatgccactgcactccaacctgggcaacagagcgag... | \n", "[Step 1: The variant is a C>A substitution at ... | \n", "
| 1445 | \n", "KEGG_1446 | \n", "Chromosome Number: 19\\nNetwork Definition of t... | \n", "Hepatocellular carcinoma | \n", "TGAAGGGTAGTACTGCCTCATAGGACAGTTGGGAACAGTCAATGAT... | \n", "TGAAGGGTAGTACTGCCTCATAGGACAGTTGGGAACAGTCAATGAT... | \n", "[Step 1: The variant is a T>C substitution at ... | \n", "
| 1446 | \n", "KEGG_1447 | \n", "Chromosome Number: 16\\nNetwork Definition of t... | \n", "Gastric cancer | \n", "CAAACACAGGATGTAGAGGGCAGAGAGCATaggtgtgtgcgcatgt... | \n", "CAAACACAGGATGTAGAGGGCAGAGAGCATaggtgtgtgcgcatgt... | \n", "[Step 1: The variant KEGG_1447 represents an A... | \n", "
| 1447 | \n", "KEGG_1448 | \n", "Chromosome Number: 16\\nNetwork Definition of t... | \n", "Gastric cancer | \n", "GATATATATAATTTGTCATTGATAAGAGAATGTGTCATTAAATTCA... | \n", "GATATATATAATTTGTCATTGATAAGAGAATGTGTCATTAAATTCA... | \n", "[Step 1: The variant KEGG_1448 is a T>G substi... | \n", "
| 1448 | \n", "KEGG_1449 | \n", "Chromosome Number: 16\\nNetwork Definition of t... | \n", "Gastric cancer | \n", "GTCATTGATAAGAGAATGTGTCATTAAATTCAAACTGTACACTGCC... | \n", "GTCATTGATAAGAGAATGTGTCATTAAATTCAAACTGTACACTGCC... | \n", "[Step 1: The variant KEGG_1449 is a G>A substi... | \n", "
1449 rows × 6 columns
\n", "| \n", " | ID | \n", "question | \n", "answer | \n", "reference_sequence | \n", "variant_sequence | \n", "reasoning | \n", "
|---|---|---|---|---|---|---|
| 0 | \n", "KEGG_1 | \n", "Chromosome Number: 12\\nNetwork Definition of t... | \n", "Melanoma | \n", "TTCAAGGTAGTCCAGGGTATGTGGGTCCCATACTTTCGACGAAACA... | \n", "TTCAAGGTAGTCCAGGGTATGTGGGTCCCATACTTTCGACGAAACA... | \n", "[Step 1: The C>T mutation at position 57751646... | \n", "
| 1 | \n", "KEGG_2 | \n", "Chromosome Number: 12\\nNetwork Definition of t... | \n", "Melanoma | \n", "TTCAAGGTAGTCCAGGGTATGTGGGTCCCATACTTTCGACGAAACA... | \n", "TTCAAGGTAGTCCAGGGTATGTGGGTCCCATACTTTCGACGAAACA... | \n", "[Step 1: The C>A mutation at position 57751646... | \n", "
| 2 | \n", "KEGG_3 | \n", "Chromosome Number: 12\\nNetwork Definition of t... | \n", "Melanoma | \n", "TTCAAGGTAGTCCAGGGTATGTGGGTCCCATACTTTCGACGAAACA... | \n", "TTCAAGGTAGTCCAGGGTATGTGGGTCCCATACTTTCGACGAAACA... | \n", "[Step 1: The C>G mutation at position 57751646... | \n", "
| 3 | \n", "KEGG_4 | \n", "Chromosome Number: 12\\nNetwork Definition of t... | \n", "Melanoma | \n", "TCAAGGTAGTCCAGGGTATGTGGGTCCCATACTTTCGACGAAACAT... | \n", "TCAAGGTAGTCCAGGGTATGTGGGTCCCATACTTTCGACGAAACAT... | \n", "[Step 1: The G>A mutation at position 57751647... | \n", "
| 4 | \n", "KEGG_5 | \n", "Chromosome Number: 12\\nNetwork Definition of t... | \n", "Melanoma | \n", "TCAAGGTAGTCCAGGGTATGTGGGTCCCATACTTTCGACGAAACAT... | \n", "TCAAGGTAGTCCAGGGTATGTGGGTCCCATACTTTCGACGAAACAT... | \n", "[Step 1: The G>C mutation at position 57751647... | \n", "
| ... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "
| 1444 | \n", "KEGG_1445 | \n", "Chromosome Number: 19\\nNetwork Definition of t... | \n", "Hepatocellular carcinoma | \n", "gcactccagcctgggcaacagagcaagagagacagggtcttactct... | \n", "gcactccagcctgggcaacagagcaagagagacagggtcttactct... | \n", "[Step 1: The variant is a C>A substitution at ... | \n", "
| 1445 | \n", "KEGG_1446 | \n", "Chromosome Number: 19\\nNetwork Definition of t... | \n", "Hepatocellular carcinoma | \n", "ctcccaaagtgctgggattacaggcgtgagccactgggccctgcCC... | \n", "ctcccaaagtgctgggattacaggcgtgagccactgggccctgcCC... | \n", "[Step 1: The variant is a T>C substitution at ... | \n", "
| 1446 | \n", "KEGG_1447 | \n", "Chromosome Number: 16\\nNetwork Definition of t... | \n", "Gastric cancer | \n", "ggctgggtgtggtggctcatgcctgtaatcccagcattttgggagg... | \n", "ggctgggtgtggtggctcatgcctgtaatcccagcattttgggagg... | \n", "[Step 1: The variant KEGG_1447 represents an A... | \n", "
| 1447 | \n", "KEGG_1448 | \n", "Chromosome Number: 16\\nNetwork Definition of t... | \n", "Gastric cancer | \n", "tttgagatagggtttcactctgtcacccaggctggaaccacaacct... | \n", "tttgagatagggtttcactctgtcacccaggctggaaccacaacct... | \n", "[Step 1: The variant KEGG_1448 is a T>G substi... | \n", "
| 1448 | \n", "KEGG_1449 | \n", "Chromosome Number: 16\\nNetwork Definition of t... | \n", "Gastric cancer | \n", "tcactctgtcacccaggctggaaccacaacctccacttcccgggtt... | \n", "tcactctgtcacccaggctggaaccacaacctccacttcccgggtt... | \n", "[Step 1: The variant KEGG_1449 is a G>A substi... | \n", "
1449 rows × 6 columns
\n", "