{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "744b9f11-6ef8-4409-a388-fe860480c9de",
   "metadata": {},
   "source": [
    "# Processing the Reasoning Trace Data and Adding in Nucleotides"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8950d38a-dfa9-4dbd-b388-941dec69b3ee",
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "\n",
    "# Move into the data directory unless the kernel is already inside it, so that\n",
    "# re-running this cell does not try to enter kegg_data/kegg_data.\n",
    "if os.path.basename(os.getcwd()) != 'kegg_data':\n",
    "    os.chdir('kegg_data')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "a1c3d972-c52e-4d73-9816-e970fca3e1bb",
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "from Bio import SeqIO"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "c80d7741-7aaa-4c28-a93a-ad955f3da6bb",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "mkdir: processed_variants 1450 with seqs: File exists\n"
     ]
    }
   ],
   "source": [
    "!mkdir -p 'processed_variants 1450 with seqs'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "e4021560-9130-4fdf-a640-15b5da6935a0",
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "\n",
    "# Attach the reference and variant nucleotide sequences to each processed variant record.\n",
    "input_dir = 'processed_variants first 700'\n",
    "fasta_dir = 'nt_seq'\n",
    "output_dir = 'processed_variants 1450 with seqs'\n",
    "\n",
    "# Create the destination here as well, so this cell does not depend on a separate mkdir step.\n",
    "os.makedirs(output_dir, exist_ok=True)\n",
    "\n",
    "skipped_ids = []\n",
    "\n",
    "# Variant IDs 1..1449 (the range end is exclusive).\n",
    "for i in range(1, 1450):\n",
    "    json_path = f'{input_dir}/KEGG_{i}_processed.json'\n",
    "    fasta_file = f\"{fasta_dir}/KEGG_{i}.txt\"\n",
    "\n",
    "    # Report and skip a variant with a missing input instead of aborting the whole run.\n",
    "    if not (os.path.exists(json_path) and os.path.exists(fasta_file)):\n",
    "        print(f\"[Warning] Missing input for KEGG_{i}; skipping\")\n",
    "        skipped_ids.append(i)\n",
    "        continue\n",
    "\n",
    "    # Open the processed JSON record\n",
    "    with open(json_path, 'r') as file:\n",
    "        data = json.load(file)\n",
    "\n",
    "    # Open the nucleotide file; record 0 is used as the reference, record 1 as the variant.\n",
    "    sequence_list = list(SeqIO.parse(fasta_file, \"fasta\"))\n",
    "    if len(sequence_list) < 2:\n",
    "        print(f\"[Warning] Expected 2 FASTA records in {fasta_file}, found {len(sequence_list)}; skipping\")\n",
    "        skipped_ids.append(i)\n",
    "        continue\n",
    "    ref_seq = sequence_list[0].seq\n",
    "    var_seq = sequence_list[1].seq\n",
    "\n",
    "    # Add sequences to the JSON data\n",
    "    data[\"reference_sequence\"] = str(ref_seq)\n",
    "    data[\"variant_sequence\"] = str(var_seq)\n",
    "\n",
    "    # Save the updated JSON to a new file\n",
    "    with open(f'{output_dir}/KEGG_{i}_with_seqs.json', 'w') as out_file:\n",
    "        json.dump(data, out_file, indent=2)\n",
    "\n",
    "# Stay silent when everything succeeded; otherwise list what was left out.\n",
    "if skipped_ids:\n",
    "    print(f\"Skipped {len(skipped_ids)} variants: {skipped_ids}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "4db8af16-a11f-4987-b1a6-db552c6714fb",
   "metadata": {},
   "source": [
    "# Creating the Final KEGG SFT and RL Dataset\n",
    "\n",
    "## Final KEGG Dataset Creation\n",
    "\n",
    "This section creates the final machine learning dataset by combining variant data with sequences and generating structured question-answer pairs for biological reasoning tasks."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f9517d40-74e3-4ddb-bd16-95f9ab7927aa",
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "\n",
    "# Move into the data directory unless the kernel is already inside it, so that\n",
    "# running the notebook top to bottom does not try to enter kegg_data/kegg_data.\n",
    "if os.path.basename(os.getcwd()) != 'kegg_data':\n",
    "    os.chdir('kegg_data')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "53c5948f-4bde-432d-b35c-34c733eb9ad1",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import json\n",
    "import ast"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "60c66a0d-359b-4d2a-8427-53f4d18d1047",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Var_ID</th>\n",
       "      <th>Network</th>\n",
       "      <th>Entry</th>\n",
       "      <th>Source</th>\n",
       "      <th>ID</th>\n",
       "      <th>TranscriptID</th>\n",
       "      <th>NucChange</th>\n",
       "      <th>Chr</th>\n",
       "      <th>Start</th>\n",
       "      <th>End</th>\n",
       "      <th>...</th>\n",
       "      <th>Network Expanded</th>\n",
       "      <th>Pathway</th>\n",
       "      <th>Class</th>\n",
       "      <th>Disease</th>\n",
       "      <th>Gene</th>\n",
       "      <th>Variant_Name</th>\n",
       "      <th>Variant_Gene</th>\n",
       "      <th>Variant_Gene Info</th>\n",
       "      <th>Variant_Type</th>\n",
       "      <th>Disease_Names</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>KEGG_1</td>\n",
       "      <td>N00073</td>\n",
       "      <td>1019v2</td>\n",
       "      <td>ClinVar</td>\n",
       "      <td>16929</td>\n",
       "      <td>NC_000012.12</td>\n",
       "      <td>NaN</td>\n",
       "      <td>12</td>\n",
       "      <td>57751646</td>\n",
       "      <td>57751646</td>\n",
       "      <td>...</td>\n",
       "      <td>((595,894,896)+1019v2) -&gt; 5925 // (1869,1870,1...</td>\n",
       "      <td>{'hsa05218': 'Melanoma'}</td>\n",
       "      <td>{'nt06268': 'Melanoma', 'nt06230': 'Cell cycle...</td>\n",
       "      <td>{'H00038': 'Melanoma is a form of skin cancer ...</td>\n",
       "      <td>{'595': 'CCND1; cyclin D1', '894': 'CCND2; cyc...</td>\n",
       "      <td>CDK4 mutation</td>\n",
       "      <td>CDK4</td>\n",
       "      <td>cyclin dependent kinase 4 [KO:K02089]</td>\n",
       "      <td>NaN</td>\n",
       "      <td>{'H00038': 'Melanoma'}</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>KEGG_2</td>\n",
       "      <td>N00073</td>\n",
       "      <td>1019v2</td>\n",
       "      <td>dbSNP</td>\n",
       "      <td>rs104894340</td>\n",
       "      <td>NC_000012.12</td>\n",
       "      <td>NaN</td>\n",
       "      <td>12</td>\n",
       "      <td>57751646</td>\n",
       "      <td>57751646</td>\n",
       "      <td>...</td>\n",
       "      <td>((595,894,896)+1019v2) -&gt; 5925 // (1869,1870,1...</td>\n",
       "      <td>{'hsa05218': 'Melanoma'}</td>\n",
       "      <td>{'nt06268': 'Melanoma', 'nt06230': 'Cell cycle...</td>\n",
       "      <td>{'H00038': 'Melanoma is a form of skin cancer ...</td>\n",
       "      <td>{'595': 'CCND1; cyclin D1', '894': 'CCND2; cyc...</td>\n",
       "      <td>CDK4 mutation</td>\n",
       "      <td>CDK4</td>\n",
       "      <td>cyclin dependent kinase 4 [KO:K02089]</td>\n",
       "      <td>NaN</td>\n",
       "      <td>{'H00038': 'Melanoma'}</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>KEGG_3</td>\n",
       "      <td>N00073</td>\n",
       "      <td>1019v2</td>\n",
       "      <td>dbSNP</td>\n",
       "      <td>rs104894340</td>\n",
       "      <td>NC_000012.12</td>\n",
       "      <td>NaN</td>\n",
       "      <td>12</td>\n",
       "      <td>57751646</td>\n",
       "      <td>57751646</td>\n",
       "      <td>...</td>\n",
       "      <td>((595,894,896)+1019v2) -&gt; 5925 // (1869,1870,1...</td>\n",
       "      <td>{'hsa05218': 'Melanoma'}</td>\n",
       "      <td>{'nt06268': 'Melanoma', 'nt06230': 'Cell cycle...</td>\n",
       "      <td>{'H00038': 'Melanoma is a form of skin cancer ...</td>\n",
       "      <td>{'595': 'CCND1; cyclin D1', '894': 'CCND2; cyc...</td>\n",
       "      <td>CDK4 mutation</td>\n",
       "      <td>CDK4</td>\n",
       "      <td>cyclin dependent kinase 4 [KO:K02089]</td>\n",
       "      <td>NaN</td>\n",
       "      <td>{'H00038': 'Melanoma'}</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>KEGG_4</td>\n",
       "      <td>N00073</td>\n",
       "      <td>1019v2</td>\n",
       "      <td>ClinVar</td>\n",
       "      <td>16928</td>\n",
       "      <td>NC_000012.12</td>\n",
       "      <td>NaN</td>\n",
       "      <td>12</td>\n",
       "      <td>57751647</td>\n",
       "      <td>57751647</td>\n",
       "      <td>...</td>\n",
       "      <td>((595,894,896)+1019v2) -&gt; 5925 // (1869,1870,1...</td>\n",
       "      <td>{'hsa05218': 'Melanoma'}</td>\n",
       "      <td>{'nt06268': 'Melanoma', 'nt06230': 'Cell cycle...</td>\n",
       "      <td>{'H00038': 'Melanoma is a form of skin cancer ...</td>\n",
       "      <td>{'595': 'CCND1; cyclin D1', '894': 'CCND2; cyc...</td>\n",
       "      <td>CDK4 mutation</td>\n",
       "      <td>CDK4</td>\n",
       "      <td>cyclin dependent kinase 4 [KO:K02089]</td>\n",
       "      <td>NaN</td>\n",
       "      <td>{'H00038': 'Melanoma'}</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>KEGG_5</td>\n",
       "      <td>N00073</td>\n",
       "      <td>1019v2</td>\n",
       "      <td>dbSNP</td>\n",
       "      <td>rs11547328</td>\n",
       "      <td>NC_000012.12</td>\n",
       "      <td>NaN</td>\n",
       "      <td>12</td>\n",
       "      <td>57751647</td>\n",
       "      <td>57751647</td>\n",
       "      <td>...</td>\n",
       "      <td>((595,894,896)+1019v2) -&gt; 5925 // (1869,1870,1...</td>\n",
       "      <td>{'hsa05218': 'Melanoma'}</td>\n",
       "      <td>{'nt06268': 'Melanoma', 'nt06230': 'Cell cycle...</td>\n",
       "      <td>{'H00038': 'Melanoma is a form of skin cancer ...</td>\n",
       "      <td>{'595': 'CCND1; cyclin D1', '894': 'CCND2; cyc...</td>\n",
       "      <td>CDK4 mutation</td>\n",
       "      <td>CDK4</td>\n",
       "      <td>cyclin dependent kinase 4 [KO:K02089]</td>\n",
       "      <td>NaN</td>\n",
       "      <td>{'H00038': 'Melanoma'}</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1444</th>\n",
       "      <td>KEGG_1445</td>\n",
       "      <td>N00244</td>\n",
       "      <td>9817v1</td>\n",
       "      <td>COSM</td>\n",
       "      <td>6196635</td>\n",
       "      <td>ENST00000393623.6</td>\n",
       "      <td>c.706G&gt;T</td>\n",
       "      <td>19</td>\n",
       "      <td>10492196</td>\n",
       "      <td>10492196</td>\n",
       "      <td>...</td>\n",
       "      <td>9817v1 // 4780 =&gt; (3162,1728,119391,221357,293...</td>\n",
       "      <td>{'hsa05225': 'Hepatocellular carcinoma'}</td>\n",
       "      <td>{'nt06263': 'Hepatocellular carcinoma', 'nt062...</td>\n",
       "      <td>{'H00048': 'Hepatocellular carcinoma (HCC) is ...</td>\n",
       "      <td>{'9817': 'KEAP1; kelch like ECH associated pro...</td>\n",
       "      <td>KEAP1 mutation</td>\n",
       "      <td>KEAP1</td>\n",
       "      <td>kelch like ECH associated protein 1 [KO:K10456]</td>\n",
       "      <td>NaN</td>\n",
       "      <td>{'H00048': 'Hepatocellular carcinoma;'}</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1445</th>\n",
       "      <td>KEGG_1446</td>\n",
       "      <td>N00244</td>\n",
       "      <td>9817v1</td>\n",
       "      <td>COSM</td>\n",
       "      <td>6196637</td>\n",
       "      <td>ENST00000393623.6</td>\n",
       "      <td>c.548A&gt;G</td>\n",
       "      <td>19</td>\n",
       "      <td>10499486</td>\n",
       "      <td>10499486</td>\n",
       "      <td>...</td>\n",
       "      <td>9817v1 // 4780 =&gt; (3162,1728,119391,221357,293...</td>\n",
       "      <td>{'hsa05225': 'Hepatocellular carcinoma'}</td>\n",
       "      <td>{'nt06263': 'Hepatocellular carcinoma', 'nt062...</td>\n",
       "      <td>{'H00048': 'Hepatocellular carcinoma (HCC) is ...</td>\n",
       "      <td>{'9817': 'KEAP1; kelch like ECH associated pro...</td>\n",
       "      <td>KEAP1 mutation</td>\n",
       "      <td>KEAP1</td>\n",
       "      <td>kelch like ECH associated protein 1 [KO:K10456]</td>\n",
       "      <td>NaN</td>\n",
       "      <td>{'H00048': 'Hepatocellular carcinoma;'}</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1446</th>\n",
       "      <td>KEGG_1447</td>\n",
       "      <td>N00258</td>\n",
       "      <td>999v2</td>\n",
       "      <td>COSM</td>\n",
       "      <td>4766271</td>\n",
       "      <td>ENST00000621016.4</td>\n",
       "      <td>c.662A&gt;G</td>\n",
       "      <td>16</td>\n",
       "      <td>68808823</td>\n",
       "      <td>68808823</td>\n",
       "      <td>...</td>\n",
       "      <td>999v2 // 1499 -&gt; (6932,83439,6934,51176) =&gt; (4...</td>\n",
       "      <td>{'hsa05226': 'Gastric cancer'}</td>\n",
       "      <td>{'nt06261': 'Gastric cancer', 'nt06215': 'WNT ...</td>\n",
       "      <td>{'H00018': \"Gastric cancer (GC) is one of the ...</td>\n",
       "      <td>{'999': 'CDH1; cadherin 1', '1499': 'CTNNB1; c...</td>\n",
       "      <td>CDH1 mutation</td>\n",
       "      <td>CDH1</td>\n",
       "      <td>cadherin 1 [KO:K05689]</td>\n",
       "      <td>NaN</td>\n",
       "      <td>{'H00018': 'Gastric cancer'}</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1447</th>\n",
       "      <td>KEGG_1448</td>\n",
       "      <td>N00258</td>\n",
       "      <td>999v2</td>\n",
       "      <td>COSM</td>\n",
       "      <td>4766211</td>\n",
       "      <td>ENST00000621016.4</td>\n",
       "      <td>c.755T&gt;G</td>\n",
       "      <td>16</td>\n",
       "      <td>68810264</td>\n",
       "      <td>68810264</td>\n",
       "      <td>...</td>\n",
       "      <td>999v2 // 1499 -&gt; (6932,83439,6934,51176) =&gt; (4...</td>\n",
       "      <td>{'hsa05226': 'Gastric cancer'}</td>\n",
       "      <td>{'nt06261': 'Gastric cancer', 'nt06215': 'WNT ...</td>\n",
       "      <td>{'H00018': \"Gastric cancer (GC) is one of the ...</td>\n",
       "      <td>{'999': 'CDH1; cadherin 1', '1499': 'CTNNB1; c...</td>\n",
       "      <td>CDH1 mutation</td>\n",
       "      <td>CDH1</td>\n",
       "      <td>cadherin 1 [KO:K05689]</td>\n",
       "      <td>NaN</td>\n",
       "      <td>{'H00018': 'Gastric cancer'}</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1448</th>\n",
       "      <td>KEGG_1449</td>\n",
       "      <td>N00258</td>\n",
       "      <td>999v2</td>\n",
       "      <td>COSM</td>\n",
       "      <td>1379150</td>\n",
       "      <td>ENST00000621016.4</td>\n",
       "      <td>c.769G&gt;A</td>\n",
       "      <td>16</td>\n",
       "      <td>68810278</td>\n",
       "      <td>68810278</td>\n",
       "      <td>...</td>\n",
       "      <td>999v2 // 1499 -&gt; (6932,83439,6934,51176) =&gt; (4...</td>\n",
       "      <td>{'hsa05226': 'Gastric cancer'}</td>\n",
       "      <td>{'nt06261': 'Gastric cancer', 'nt06215': 'WNT ...</td>\n",
       "      <td>{'H00018': \"Gastric cancer (GC) is one of the ...</td>\n",
       "      <td>{'999': 'CDH1; cadherin 1', '1499': 'CTNNB1; c...</td>\n",
       "      <td>CDH1 mutation</td>\n",
       "      <td>CDH1</td>\n",
       "      <td>cadherin 1 [KO:K05689]</td>\n",
       "      <td>NaN</td>\n",
       "      <td>{'H00018': 'Gastric cancer'}</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>1449 rows × 24 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "         Var_ID Network   Entry   Source           ID       TranscriptID  \\\n",
       "0        KEGG_1  N00073  1019v2  ClinVar        16929       NC_000012.12   \n",
       "1        KEGG_2  N00073  1019v2    dbSNP  rs104894340       NC_000012.12   \n",
       "2        KEGG_3  N00073  1019v2    dbSNP  rs104894340       NC_000012.12   \n",
       "3        KEGG_4  N00073  1019v2  ClinVar        16928       NC_000012.12   \n",
       "4        KEGG_5  N00073  1019v2    dbSNP   rs11547328       NC_000012.12   \n",
       "...         ...     ...     ...      ...          ...                ...   \n",
       "1444  KEGG_1445  N00244  9817v1     COSM      6196635  ENST00000393623.6   \n",
       "1445  KEGG_1446  N00244  9817v1     COSM      6196637  ENST00000393623.6   \n",
       "1446  KEGG_1447  N00258   999v2     COSM      4766271  ENST00000621016.4   \n",
       "1447  KEGG_1448  N00258   999v2     COSM      4766211  ENST00000621016.4   \n",
       "1448  KEGG_1449  N00258   999v2     COSM      1379150  ENST00000621016.4   \n",
       "\n",
       "     NucChange  Chr     Start       End  ...  \\\n",
       "0          NaN   12  57751646  57751646  ...   \n",
       "1          NaN   12  57751646  57751646  ...   \n",
       "2          NaN   12  57751646  57751646  ...   \n",
       "3          NaN   12  57751647  57751647  ...   \n",
       "4          NaN   12  57751647  57751647  ...   \n",
       "...        ...  ...       ...       ...  ...   \n",
       "1444  c.706G>T   19  10492196  10492196  ...   \n",
       "1445  c.548A>G   19  10499486  10499486  ...   \n",
       "1446  c.662A>G   16  68808823  68808823  ...   \n",
       "1447  c.755T>G   16  68810264  68810264  ...   \n",
       "1448  c.769G>A   16  68810278  68810278  ...   \n",
       "\n",
       "                                       Network Expanded  \\\n",
       "0     ((595,894,896)+1019v2) -> 5925 // (1869,1870,1...   \n",
       "1     ((595,894,896)+1019v2) -> 5925 // (1869,1870,1...   \n",
       "2     ((595,894,896)+1019v2) -> 5925 // (1869,1870,1...   \n",
       "3     ((595,894,896)+1019v2) -> 5925 // (1869,1870,1...   \n",
       "4     ((595,894,896)+1019v2) -> 5925 // (1869,1870,1...   \n",
       "...                                                 ...   \n",
       "1444  9817v1 // 4780 => (3162,1728,119391,221357,293...   \n",
       "1445  9817v1 // 4780 => (3162,1728,119391,221357,293...   \n",
       "1446  999v2 // 1499 -> (6932,83439,6934,51176) => (4...   \n",
       "1447  999v2 // 1499 -> (6932,83439,6934,51176) => (4...   \n",
       "1448  999v2 // 1499 -> (6932,83439,6934,51176) => (4...   \n",
       "\n",
       "                                       Pathway  \\\n",
       "0                     {'hsa05218': 'Melanoma'}   \n",
       "1                     {'hsa05218': 'Melanoma'}   \n",
       "2                     {'hsa05218': 'Melanoma'}   \n",
       "3                     {'hsa05218': 'Melanoma'}   \n",
       "4                     {'hsa05218': 'Melanoma'}   \n",
       "...                                        ...   \n",
       "1444  {'hsa05225': 'Hepatocellular carcinoma'}   \n",
       "1445  {'hsa05225': 'Hepatocellular carcinoma'}   \n",
       "1446            {'hsa05226': 'Gastric cancer'}   \n",
       "1447            {'hsa05226': 'Gastric cancer'}   \n",
       "1448            {'hsa05226': 'Gastric cancer'}   \n",
       "\n",
       "                                                  Class  \\\n",
       "0     {'nt06268': 'Melanoma', 'nt06230': 'Cell cycle...   \n",
       "1     {'nt06268': 'Melanoma', 'nt06230': 'Cell cycle...   \n",
       "2     {'nt06268': 'Melanoma', 'nt06230': 'Cell cycle...   \n",
       "3     {'nt06268': 'Melanoma', 'nt06230': 'Cell cycle...   \n",
       "4     {'nt06268': 'Melanoma', 'nt06230': 'Cell cycle...   \n",
       "...                                                 ...   \n",
       "1444  {'nt06263': 'Hepatocellular carcinoma', 'nt062...   \n",
       "1445  {'nt06263': 'Hepatocellular carcinoma', 'nt062...   \n",
       "1446  {'nt06261': 'Gastric cancer', 'nt06215': 'WNT ...   \n",
       "1447  {'nt06261': 'Gastric cancer', 'nt06215': 'WNT ...   \n",
       "1448  {'nt06261': 'Gastric cancer', 'nt06215': 'WNT ...   \n",
       "\n",
       "                                                Disease  \\\n",
       "0     {'H00038': 'Melanoma is a form of skin cancer ...   \n",
       "1     {'H00038': 'Melanoma is a form of skin cancer ...   \n",
       "2     {'H00038': 'Melanoma is a form of skin cancer ...   \n",
       "3     {'H00038': 'Melanoma is a form of skin cancer ...   \n",
       "4     {'H00038': 'Melanoma is a form of skin cancer ...   \n",
       "...                                                 ...   \n",
       "1444  {'H00048': 'Hepatocellular carcinoma (HCC) is ...   \n",
       "1445  {'H00048': 'Hepatocellular carcinoma (HCC) is ...   \n",
       "1446  {'H00018': \"Gastric cancer (GC) is one of the ...   \n",
       "1447  {'H00018': \"Gastric cancer (GC) is one of the ...   \n",
       "1448  {'H00018': \"Gastric cancer (GC) is one of the ...   \n",
       "\n",
       "                                                   Gene    Variant_Name  \\\n",
       "0     {'595': 'CCND1; cyclin D1', '894': 'CCND2; cyc...   CDK4 mutation   \n",
       "1     {'595': 'CCND1; cyclin D1', '894': 'CCND2; cyc...   CDK4 mutation   \n",
       "2     {'595': 'CCND1; cyclin D1', '894': 'CCND2; cyc...   CDK4 mutation   \n",
       "3     {'595': 'CCND1; cyclin D1', '894': 'CCND2; cyc...   CDK4 mutation   \n",
       "4     {'595': 'CCND1; cyclin D1', '894': 'CCND2; cyc...   CDK4 mutation   \n",
       "...                                                 ...             ...   \n",
       "1444  {'9817': 'KEAP1; kelch like ECH associated pro...  KEAP1 mutation   \n",
       "1445  {'9817': 'KEAP1; kelch like ECH associated pro...  KEAP1 mutation   \n",
       "1446  {'999': 'CDH1; cadherin 1', '1499': 'CTNNB1; c...   CDH1 mutation   \n",
       "1447  {'999': 'CDH1; cadherin 1', '1499': 'CTNNB1; c...   CDH1 mutation   \n",
       "1448  {'999': 'CDH1; cadherin 1', '1499': 'CTNNB1; c...   CDH1 mutation   \n",
       "\n",
       "     Variant_Gene                                Variant_Gene Info  \\\n",
       "0            CDK4            cyclin dependent kinase 4 [KO:K02089]   \n",
       "1            CDK4            cyclin dependent kinase 4 [KO:K02089]   \n",
       "2            CDK4            cyclin dependent kinase 4 [KO:K02089]   \n",
       "3            CDK4            cyclin dependent kinase 4 [KO:K02089]   \n",
       "4            CDK4            cyclin dependent kinase 4 [KO:K02089]   \n",
       "...           ...                                              ...   \n",
       "1444        KEAP1  kelch like ECH associated protein 1 [KO:K10456]   \n",
       "1445        KEAP1  kelch like ECH associated protein 1 [KO:K10456]   \n",
       "1446         CDH1                           cadherin 1 [KO:K05689]   \n",
       "1447         CDH1                           cadherin 1 [KO:K05689]   \n",
       "1448         CDH1                           cadherin 1 [KO:K05689]   \n",
       "\n",
       "     Variant_Type                            Disease_Names  \n",
       "0             NaN                   {'H00038': 'Melanoma'}  \n",
       "1             NaN                   {'H00038': 'Melanoma'}  \n",
       "2             NaN                   {'H00038': 'Melanoma'}  \n",
       "3             NaN                   {'H00038': 'Melanoma'}  \n",
       "4             NaN                   {'H00038': 'Melanoma'}  \n",
       "...           ...                                      ...  \n",
       "1444          NaN  {'H00048': 'Hepatocellular carcinoma;'}  \n",
       "1445          NaN  {'H00048': 'Hepatocellular carcinoma;'}  \n",
       "1446          NaN             {'H00018': 'Gastric cancer'}  \n",
       "1447          NaN             {'H00018': 'Gastric cancer'}  \n",
       "1448          NaN             {'H00018': 'Gastric cancer'}  \n",
       "\n",
       "[1449 rows x 24 columns]"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Read the tab-separated variant/network table, then show it as the cell output.\n",
    "variant_data = pd.read_csv(\n",
    "    \"final_network_with_variant.tsv\",\n",
    "    sep='\\t',\n",
    ")\n",
    "variant_data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "51609538-9f96-4097-ac60-2a4a08a6e01c",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'KEGG_2'"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Spot-check the Var_ID stored at positional row 1 of the table.\n",
    "variant_data['Var_ID'].iloc[1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "846b6ee3-1e4d-44bc-ad59-4074b4ff39bb",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "mkdir: final_data: File exists\n"
     ]
    }
   ],
   "source": [
    "!mkdir -p final_data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "56449f64-85ae-4804-8a01-3ce2afe1e6da",
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import json\n",
    "import ast\n",
    "from CONFIG import CONFIG\n",
    "\n",
    "\n",
    "def _first_present(mapping, *keys):\n",
    "    \"\"\"Return the value of the first key in `keys` that exists in `mapping`.\n",
    "\n",
    "    Raises KeyError if none of the keys is present.\n",
    "    \"\"\"\n",
    "    for key in keys:\n",
    "        if key in mapping:\n",
    "            return mapping[key]\n",
    "    raise KeyError(keys[0])\n",
    "\n",
    "\n",
    "# Create final dataset with question-answer pairs.\n",
    "# Directories and the variant ID range are read from the CONFIG module.\n",
    "variants_with_seqs_dir = CONFIG['variants_with_seqs_dir']\n",
    "final_data_dir = CONFIG['final_data_dir']\n",
    "start_idx, end_idx = CONFIG['variant_range']\n",
    "\n",
    "# Create the configured output directory if it does not exist yet.\n",
    "os.makedirs(final_data_dir, exist_ok=True)\n",
    "\n",
    "print(\"Creating final dataset with Q&A pairs...\")\n",
    "print(f\"Input: {variants_with_seqs_dir}\")\n",
    "print(f\"Output: {final_data_dir}\")\n",
    "print(f\"Processing range: {start_idx} to {end_idx}\")\n",
    "\n",
    "processed_count = 0\n",
    "error_count = 0  # counts missing input files as well as records that fail below\n",
    "\n",
    "# end_idx is exclusive.\n",
    "for i in range(start_idx, end_idx):\n",
    "    try:\n",
    "        # Load the JSON file with sequences\n",
    "        input_file = f'{variants_with_seqs_dir}/KEGG_{i}_with_seqs.json'\n",
    "        if not os.path.exists(input_file):\n",
    "            error_count += 1\n",
    "            continue\n",
    "\n",
    "        with open(input_file, 'r') as file:\n",
    "            data = json.load(file)\n",
    "\n",
    "        # Chromosome and network keys appear with inconsistent casing; resolve each\n",
    "        # one independently so a record mixing both casings is still usable.\n",
    "        try:\n",
    "            raw_data = data['raw_data']\n",
    "            chromosome = _first_present(raw_data, 'chromosome', 'Chromosome')\n",
    "            network = _first_present(raw_data, 'network', 'Network')\n",
    "        except KeyError:\n",
    "            print(f\"[Warning] Missing chromosome/network data in {input_file}\")\n",
    "            error_count += 1\n",
    "            continue\n",
    "\n",
    "        # Extract gene information from variant_data (the table loaded from\n",
    "        # final_network_with_variant.tsv). Row i-1 is expected to describe KEGG_{i};\n",
    "        # verify that rather than trusting the position alone.\n",
    "        try:\n",
    "            if i < 1:\n",
    "                # A negative position would silently select a row from the end of the table.\n",
    "                raise IndexError(f\"variant index {i} has no row in variant_data\")\n",
    "            variant_row = variant_data.iloc[i - 1]\n",
    "            if variant_row['Var_ID'] != f'KEGG_{i}':\n",
    "                raise ValueError(f\"row {i - 1} holds {variant_row['Var_ID']!r}, expected 'KEGG_{i}'\")\n",
    "            # The Gene column stores a dict literal as text; only its values are used.\n",
    "            gene_list = list(ast.literal_eval(variant_row['Gene']).values())\n",
    "            gene_list_joined = ' | '.join(gene_list)\n",
    "            variant_gene = variant_row['Variant_Gene']\n",
    "        except (KeyError, IndexError, ValueError, SyntaxError, TypeError, AttributeError) as e:\n",
    "            print(f\"[Warning] Gene information error for {input_file}: {e}\")\n",
    "            error_count += 1\n",
    "            continue\n",
    "\n",
    "        question = (\n",
    "            f\"Chromosome Number: {chromosome}\\n\"\n",
    "            f\"Network Definition of the pathway: {network}\\n\"\n",
    "            f\"Genes in the pathway: {gene_list_joined}\\n\\n\"\n",
    "            f\"Given this context, what is the biological effect of this \"\n",
    "            f\"{variant_gene} allele, specifically what disease does this contribute to?\"\n",
    "        )\n",
    "\n",
    "        # Keep the record's existing 'answer' value as the last reasoning step.\n",
    "        # This has to happen before 'answer' is overwritten with the disease label\n",
    "        # below. Nothing is appended when there is no existing answer, so no empty\n",
    "        # step is added.\n",
    "        previous_answer = data.get('answer', '')\n",
    "        if previous_answer and 'reasoning' in data and 'reasoning_steps' in data['reasoning']:\n",
    "            data['reasoning']['reasoning_steps'].append(previous_answer)\n",
    "\n",
    "        # Extract answer: the first disease label of the record\n",
    "        try:\n",
    "            answer = data['reasoning']['labels']['disease'][0]\n",
    "        except (KeyError, IndexError):\n",
    "            print(f\"[Warning] Missing disease label in {input_file}\")\n",
    "            error_count += 1\n",
    "            continue\n",
    "\n",
    "        data['question'] = question\n",
    "        data['answer'] = answer\n",
    "\n",
    "        # Clean up unnecessary fields\n",
    "        if 'reasoning' in data:\n",
    "            for key in ['variant_id', 'hgvs', 'labels']:\n",
    "                data['reasoning'].pop(key, None)\n",
    "        data.pop('raw_data', None)\n",
    "\n",
    "        # Save to final data directory\n",
    "        output_file = f'{final_data_dir}/KEGG_{i}_with_seqs.json'\n",
    "        with open(output_file, 'w') as out_file:\n",
    "            json.dump(data, out_file, indent=2)\n",
    "\n",
    "        processed_count += 1\n",
    "\n",
    "        if processed_count % 100 == 0:\n",
    "            print(f\"Created {processed_count} Q&A pairs...\")\n",
    "\n",
    "    except Exception as e:\n",
    "        # Anything not handled above is reported and counted, and the loop moves on.\n",
    "        print(f\"[Error] Failed to process variant {i}: {str(e)}\")\n",
    "        error_count += 1\n",
    "\n",
    "print(\"✅ Final dataset creation complete:\")\n",
    "print(f\"  Successfully processed: {processed_count}\")\n",
    "print(f\"  Errors encountered: {error_count}\")\n",
    "print(f\"  Output directory: {final_data_dir}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "11b3769e-33e5-4ab8-bc9d-f736913a2034",
   "metadata": {},
   "source": [
    "# Fixing Disease Labels"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0cfa4eca-c11e-4e52-ad6b-2fa7b43be2a4",
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "\n",
    "# Move into the data directory unless the kernel is already inside it, so that\n",
    "# running the notebook top to bottom does not try to enter kegg_data/kegg_data.\n",
    "if os.path.basename(os.getcwd()) != 'kegg_data':\n",
    "    os.chdir('kegg_data')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "9e36bc3f-07af-4b3d-bc84-d449ced55e24",
   "metadata": {},
   "outputs": [],
   "source": [
    "import json"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "cd316862-e6c7-4dd9-a06c-33f3454355b0",
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import json\n",
    "\n",
    "# CONFIG parameters\n",
    "CONFIG = {\n",
    "    'final_data_dir': 'final_data',\n",
    "    'variant_range': (1, 1450)\n",
    "}\n",
    "\n",
    "# Extract disease labels from final dataset for standardization\n",
    "final_data_dir = CONFIG['final_data_dir']\n",
    "start_idx, end_idx = CONFIG['variant_range']\n",
    "\n",
    "print(\"Extracting disease labels for standardization...\")\n",
    "\n",
    "disease = []          # one 'answer' value per record that has one, in variant order\n",
    "files_read = 0        # records that were found on disk and parsed\n",
    "processed_count = 0   # records that contributed a label (always equals len(disease))\n",
    "\n",
    "# end_idx is exclusive.\n",
    "for i in range(start_idx, end_idx):\n",
    "    input_file = f'{final_data_dir}/KEGG_{i}_with_seqs.json'\n",
    "    try:\n",
    "        # Variants without an output file are skipped without a message.\n",
    "        if not os.path.exists(input_file):\n",
    "            continue\n",
    "\n",
    "        with open(input_file, 'r') as file:\n",
    "            data = json.load(file)\n",
    "        files_read += 1\n",
    "\n",
    "        if 'answer' in data:\n",
    "            disease.append(data['answer'])\n",
    "            processed_count += 1\n",
    "\n",
    "    except Exception as e:\n",
    "        print(f\"[Warning] Could not process {input_file}: {str(e)}\")\n",
    "\n",
    "# Report labels against files actually read, so records lacking an 'answer' are visible.\n",
    "print(f\"✅ Extracted {len(disease)} disease labels from {files_read} files\")\n",
    "print(f\"Unique diseases: {len(set(disease))}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "cca4846c-aec9-49f3-b919-760cb9fa4bc7",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Lookup table: raw disease label (as collected from each file's `answer`\n",
    "# field) -> canonical label.\n",
    "# - Case/spelling variants and trailing abbreviations such as \"(AML)\" collapse\n",
    "#   onto a single name.\n",
    "# - Some subtypes are merged into their parent label: every Spinocerebellar\n",
    "#   Ataxia type -> \"Spinocerebellar Ataxia\", familial/early-onset Alzheimer's\n",
    "#   -> \"Alzheimer's disease\", familial CJD -> \"Creutzfeldt-Jakob Disease\".\n",
    "# - Labels that are already canonical map to themselves.\n",
    "# Lookups are exact-match on the key, so a new raw variant must be added here.\n",
    "new_disease = {'Acute Myeloid Leukemia (AML)' : \"Acute Myeloid Leukemia\",\n",
    " 'Acute myeloid leukemia (AML)' : \"Acute Myeloid Leukemia\",\n",
    " 'Adenine Phosphoribosyltransferase Deficiency (APRTD)' : \"Adenine Phosphoribosyltransferase Deficiency\",\n",
    " 'Adenine phosphoribosyltransferase deficiency (APRTD)' : \"Adenine Phosphoribosyltransferase Deficiency\",\n",
    " \"Alzheimer's disease\" : \"Alzheimer's disease\",\n",
    " \"Alzheimer's disease (AD)\" : \"Alzheimer's disease\",\n",
    " 'Amyotrophic Lateral Sclerosis (ALS)' : \"Amyotrophic Lateral Sclerosis\",\n",
    " 'Amyotrophic lateral sclerosis (ALS)' : \"Amyotrophic Lateral Sclerosis\",\n",
    " 'Basal Cell Carcinoma (BCC)' : \"Basal Cell Carcinoma\",\n",
    " 'Basal cell carcinoma' : \"Basal Cell Carcinoma\",\n",
    " 'Basal cell carcinoma (BCC)' : \"Basal Cell Carcinoma\",\n",
    " 'Chronic Myeloid Leukemia (CML)' : \"Chronic Myeloid Leukemia\",\n",
    " 'Chronic myeloid leukemia (CML)' : \"Chronic Myeloid Leukemia\",\n",
    " 'Clear cell Renal Cell Carcinoma (ccRCC)' : \"Clear cell Renal Cell Carcinoma\",\n",
    " 'Clear cell renal cell carcinoma' : \"Clear cell Renal Cell Carcinoma\",\n",
    " 'Clear cell renal cell carcinoma (ccRCC)' : \"Clear cell Renal Cell Carcinoma\",\n",
    " 'Colorectal cancer' : \"Colorectal cancer\",\n",
    " 'Colorectal cancer (CRC)' : \"Colorectal cancer\",\n",
    " 'Cushing syndrome' : \"Cushing syndrome\",\n",
    " \"Early-onset Alzheimer's disease\" : \"Alzheimer's disease\",\n",
    " \"Early-onset familial Alzheimer's disease\" : \"Alzheimer's disease\",\n",
    " \"Early-onset familial Alzheimer's disease (FAD)\" : \"Alzheimer's disease\",\n",
    " 'Familial Creutzfeldt-Jakob Disease' : \"Creutzfeldt-Jakob Disease\",\n",
    " 'Familial Creutzfeldt-Jakob Disease (fCJD)' : \"Creutzfeldt-Jakob Disease\",\n",
    " 'Familial Creutzfeldt-Jakob disease' : \"Creutzfeldt-Jakob Disease\",\n",
    " 'Familial Creutzfeldt-Jakob disease (fCJD)' : \"Creutzfeldt-Jakob Disease\",\n",
    " \"Familial Early-Onset Alzheimer's Disease\" : \"Alzheimer's disease\",\n",
    " 'Familial Isolated Pituitary Adenoma (FIPA)' : \"Pituitary Adenoma\",\n",
    " \"Familial early-onset Alzheimer's disease\" : \"Alzheimer's disease\",\n",
    " \"Familial early-onset Alzheimer's disease (FAD)\" : \"Alzheimer's disease\",\n",
    " 'Familial isolated pituitary adenoma (FIPA)' : \"Pituitary Adenoma\",\n",
    " 'Gastric cancer' : \"Gastric cancer\",\n",
    " 'Gaucher disease' : \"Gaucher disease\",\n",
    " 'Glioblastoma multiforme' : \"Glioblastoma multiforme\",\n",
    " 'Glioblastoma multiforme (GBM)' : \"Glioblastoma multiforme\",\n",
    " 'Hepatocellular carcinoma' : \"Hepatocellular carcinoma\",\n",
    " 'Hepatocellular carcinoma (HCC)' : \"Hepatocellular carcinoma\",\n",
    " 'Huntington disease' : \"Huntington's disease\",\n",
    " 'Huntington disease (HD)' : \"Huntington's disease\",\n",
    " \"Huntington's disease\" : \"Huntington's disease\",\n",
    " \"Huntington's disease (HD)\" : \"Huntington's disease\",\n",
    " 'Lesch-Nyhan syndrome' : \"Lesch-Nyhan syndrome\",\n",
    " 'Melanoma' : \"Melanoma\",\n",
    " 'Melanoma (H00038)' : \"Melanoma\",\n",
    " 'Methylmalonic aciduria and homocystinuria (MAHC)' : \"Methylmalonic aciduria and homocystinuria\",\n",
    " 'Multiple Endocrine Neoplasia type 1 (MEN1)' : \"Multiple Endocrine Neoplasia type 1\",\n",
    " 'N-acetylglutamate synthase (NAGS) deficiency' : \"N-acetylglutamate synthase deficiency\",\n",
    " 'Non-small cell lung cancer' : \"Non-small cell lung cancer\",\n",
    " 'Non-small cell lung cancer (NSCLC)' : \"Non-small cell lung cancer\",\n",
    " 'Non-small-cell lung cancer' : \"Non-small cell lung cancer\",\n",
    " 'Non-small-cell lung cancer (NSCLC)' : \"Non-small cell lung cancer\",\n",
    " 'Pancreatic ductal adenocarcinoma' : \"Pancreatic ductal adenocarcinoma\",\n",
    " 'Papillary Renal Cell Carcinoma' : \"Papillary Renal Cell Carcinoma\",\n",
    " 'Papillary renal cell carcinoma' : \"Papillary Renal Cell Carcinoma\",\n",
    " 'Papillary thyroid carcinoma' : \"Papillary thyroid carcinoma\",\n",
    " 'Papillary thyroid carcinoma (PTC)' : \"Papillary thyroid carcinoma\",\n",
    " \"Parkinson's Disease\" : \"Parkinson's Disease\",\n",
    " \"Parkinson's disease\" : \"Parkinson's Disease\",\n",
    " \"Parkinson's disease (PD)\" : \"Parkinson's Disease\",\n",
    " 'Pituitary adenoma' : \"Pituitary Adenoma\",\n",
    " 'Primary Aldosteronism' : \"Primary Aldosteronism\",\n",
    " 'Primary aldosteronism' : \"Primary Aldosteronism\",\n",
    " 'Prion disease' : \"Prion disease\",\n",
    " 'Prion diseases' : \"Prion disease\",\n",
    " 'Prostate cancer' : \"Prostate cancer\",\n",
    " 'Renal cell cancer (RCC)' : \"Renal cell carcinoma\",\n",
    " 'Renal cell carcinoma' : \"Renal cell carcinoma\",\n",
    " 'Renal cell carcinoma (RCC)' : \"Renal cell carcinoma\",\n",
    " 'Robinow syndrome' : \"Robinow syndrome\",\n",
    " 'Sphingolipidoses' : \"Sphingolipidoses\",\n",
    " 'Sphingolipidosis' : \"Sphingolipidoses\",\n",
    " 'Spinocerebellar Ataxia (SCA)' : \"Spinocerebellar Ataxia\",\n",
    " 'Spinocerebellar Ataxia (SCA19/22)' : \"Spinocerebellar Ataxia\",\n",
    " 'Spinocerebellar Ataxia Type 1 (SCA1)' : \"Spinocerebellar Ataxia\",\n",
    " 'Spinocerebellar Ataxia Type 13 (SCA13)' : \"Spinocerebellar Ataxia\",\n",
    " 'Spinocerebellar Ataxia Type 14 (SCA14)' : \"Spinocerebellar Ataxia\",\n",
    " 'Spinocerebellar Ataxia Type 15 (SCA15)' : \"Spinocerebellar Ataxia\",\n",
    " 'Spinocerebellar Ataxia Type 2 (SCA2)' : \"Spinocerebellar Ataxia\",\n",
    " 'Spinocerebellar Ataxia Type 3' : \"Spinocerebellar Ataxia\",\n",
    " 'Spinocerebellar Ataxia Type 3 (SCA3)' : \"Spinocerebellar Ataxia\",\n",
    " 'Spinocerebellar Ataxia Type 5 (SCA5)' : \"Spinocerebellar Ataxia\",\n",
    " 'Spinocerebellar Ataxia type 13 (SCA13)' : \"Spinocerebellar Ataxia\",\n",
    " 'Spinocerebellar Ataxia type 6 (SCA6)' : \"Spinocerebellar Ataxia\",\n",
    " 'Spinocerebellar ataxia' : \"Spinocerebellar Ataxia\",\n",
    " 'Spinocerebellar ataxia (SCA)' : \"Spinocerebellar Ataxia\",\n",
    " 'Spinocerebellar ataxia (SCA19/22)' : \"Spinocerebellar Ataxia\",\n",
    " 'Spinocerebellar ataxia type 1 (SCA1)' : \"Spinocerebellar Ataxia\",\n",
    " 'Spinocerebellar ataxia type 19 (SCA19)' : \"Spinocerebellar Ataxia\",\n",
    " 'Spinocerebellar ataxia type 19/22 (SCA19/22)' : \"Spinocerebellar Ataxia\",\n",
    " 'Spinocerebellar ataxia type 2 (SCA2)' : \"Spinocerebellar Ataxia\",\n",
    " 'Spinocerebellar ataxia type 3' : \"Spinocerebellar Ataxia\",\n",
    " 'Spinocerebellar ataxia type 3 (SCA3)' : \"Spinocerebellar Ataxia\",\n",
    " 'Spinocerebellar ataxia type 5 (SCA5)' : \"Spinocerebellar Ataxia\",\n",
    " 'Spinocerebellar ataxia type 6 (SCA6)' : \"Spinocerebellar Ataxia\",\n",
    " 'Thyroid cancer' : \"Thyroid cancer\",\n",
    " 'Thyroid dyshormonogenesis' : \"Thyroid dyshormonogenesis\",\n",
    " 'Urothelial carcinoma' : \"Urothelial carcinoma\",\n",
    " 'von Hippel-Lindau syndrome' : \"von Hippel-Lindau syndrome\"}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "id": "2451ebb1-a9d8-494c-9f7e-4f800cd158e8",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Create the output directory for standardized files.\n",
    "# -p keeps this cell re-runnable: plain mkdir errors out if the directory exists.\n",
    "!mkdir -p final_data_fix"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c71719e5-5215-4559-a47d-dfc160779260",
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "import os\n",
    "\n",
    "# CONFIG parameters\n",
    "CONFIG = {\n",
    "    'final_data_dir': 'final_data',\n",
    "    'final_data_fix_dir': 'final_data_fix',\n",
    "    'variant_range': (1, 1450)\n",
    "}\n",
    "\n",
    "# `new_disease` (raw label -> canonical label) is the notebook-level mapping\n",
    "# built in an earlier cell. It must NOT be redefined here: a placeholder dict\n",
    "# would shadow the real mapping, no label would match, and every file would be\n",
    "# copied through unstandardized.\n",
    "if not new_disease:\n",
    "    raise ValueError(\"new_disease mapping is empty; run the mapping cell first\")\n",
    "\n",
    "# Standardize disease labels using the mapping dictionary\n",
    "final_data_dir = CONFIG['final_data_dir']\n",
    "final_data_fix_dir = CONFIG['final_data_fix_dir']\n",
    "start_idx, end_idx = CONFIG['variant_range']\n",
    "\n",
    "# Ensure the output directory exists even if the mkdir cell was skipped.\n",
    "os.makedirs(final_data_fix_dir, exist_ok=True)\n",
    "\n",
    "print(\"Applying disease label standardization...\")\n",
    "print(f\"Input: {final_data_dir}\")\n",
    "print(f\"Output: {final_data_fix_dir}\")\n",
    "\n",
    "processed_count = 0\n",
    "error_count = 0\n",
    "unmapped_count = 0  # files written with their original (unstandardized) label\n",
    "\n",
    "for i in range(start_idx, end_idx):\n",
    "    try:\n",
    "        input_file = f'{final_data_dir}/KEGG_{i}_with_seqs.json'\n",
    "        # Missing indices are skipped silently.\n",
    "        if not os.path.exists(input_file):\n",
    "            continue\n",
    "            \n",
    "        with open(input_file, 'r') as file:\n",
    "            data = json.load(file)\n",
    "\n",
    "        # Get original answer ('' when the file has no `answer` key)\n",
    "        temp = data.get('answer', '')\n",
    "        \n",
    "        # Apply standardization if mapping exists; otherwise keep the label\n",
    "        # unchanged and report it so the mapping can be extended.\n",
    "        if temp in new_disease:\n",
    "            data['answer'] = new_disease[temp]\n",
    "        else:\n",
    "            print(f\"[Warning] No mapping found for disease: {temp}\")\n",
    "            unmapped_count += 1\n",
    "        \n",
    "        # Save to standardized directory (same file name as the input)\n",
    "        output_file = f'{final_data_fix_dir}/KEGG_{i}_with_seqs.json'\n",
    "        with open(output_file, 'w') as out_file:\n",
    "            json.dump(data, out_file, indent=2)\n",
    "            \n",
    "        processed_count += 1\n",
    "        \n",
    "        if processed_count % 100 == 0:\n",
    "            print(f\"Standardized {processed_count} disease labels...\")\n",
    "            \n",
    "    except Exception as e:\n",
    "        # Best-effort: report the failing file and continue with the rest.\n",
    "        print(f\"[Error] Failed to process {input_file}: {str(e)}\")\n",
    "        error_count += 1\n",
    "\n",
    "print(f\"✅ Disease label standardization complete:\")\n",
    "print(f\"  Successfully processed: {processed_count}\")\n",
    "print(f\"  Errors encountered: {error_count}\")\n",
    "print(f\"  Labels without a mapping: {unmapped_count}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9a12df3e-9ceb-4a51-acaf-e2931792a844",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Remove original final_data directory so the standardized version can replace it.\n",
    "import os\n",
    "import shutil\n",
    "\n",
    "final_data_dir = CONFIG['final_data_dir']\n",
    "final_data_fix_dir = CONFIG['final_data_fix_dir']\n",
    "\n",
    "# Deleting the original is irreversible, so only do it once the standardized\n",
    "# copy is known to exist and to contain files.\n",
    "replacement_ready = os.path.isdir(final_data_fix_dir) and bool(os.listdir(final_data_fix_dir))\n",
    "\n",
    "if not os.path.exists(final_data_dir):\n",
    "    print(f\"Directory not found: {final_data_dir}\")\n",
    "elif not replacement_ready:\n",
    "    print(f\"[Error] Not removing {final_data_dir}: {final_data_fix_dir} is missing or empty\")\n",
    "else:\n",
    "    shutil.rmtree(final_data_dir)\n",
    "    print(f\"Removed original directory: {final_data_dir}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "dbba2c19-08f6-4769-b38d-a64d8643e142",
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "\n",
    "# Rename standardized directory to final_data.\n",
    "# CONFIG is the notebook-level dict defined in an earlier cell; there is no\n",
    "# module to import it from, so it is used directly like in the sibling cells.\n",
    "final_data_dir = CONFIG['final_data_dir']\n",
    "final_data_fix_dir = CONFIG['final_data_fix_dir']\n",
    "\n",
    "if os.path.exists(final_data_fix_dir):\n",
    "    os.rename(final_data_fix_dir, final_data_dir)\n",
    "    print(f\"Renamed {final_data_fix_dir} to {final_data_dir}\")\n",
    "    print(\"✅ Final dataset with standardized disease labels is ready\")\n",
    "else:\n",
    "    print(f\"Directory not found: {final_data_fix_dir}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8c87a0df-09c8-4fb6-baca-21a9cdd65b85",
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import json\n",
    "from collections import Counter\n",
    "\n",
    "# Verify standardized disease labels by re-reading every variant file.\n",
    "# CONFIG is the notebook-level dict defined in an earlier cell.\n",
    "final_data_dir = CONFIG['final_data_dir']\n",
    "start_idx, end_idx = CONFIG['variant_range']\n",
    "\n",
    "print(\"Verifying standardized disease labels...\")\n",
    "\n",
    "disease = []\n",
    "for i in range(start_idx, end_idx):\n",
    "    try:\n",
    "        input_file = f'{final_data_dir}/KEGG_{i}_with_seqs.json'\n",
    "        if os.path.exists(input_file):\n",
    "            with open(input_file, 'r') as file:\n",
    "                data = json.load(file)\n",
    "            \n",
    "            if 'answer' in data:\n",
    "                disease.append(data['answer'])\n",
    "                \n",
    "    except Exception as e:\n",
    "        # Best-effort: report the unreadable file and keep checking the rest.\n",
    "        print(f\"[Warning] Could not verify {input_file}: {str(e)}\")\n",
    "\n",
    "print(f\"✅ Verification complete:\")\n",
    "print(f\"  Total disease labels: {len(disease)}\")\n",
    "print(f\"  Unique diseases: {len(set(disease))}\")\n",
    "# Rank labels by frequency as (label, count) pairs. Slicing a set would print\n",
    "# an arbitrary ten labels, which is not a \"top 10\".\n",
    "print(f\"  Top 10 diseases: {Counter(disease).most_common(10)}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "60f75d92-e2f2-495f-ba8f-cb423410f1f4",
   "metadata": {},
   "source": [
    "# Saving the KEGG Task to the WangLab Hugging Face"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1a069a67-b410-4adf-ab75-62eca67ab259",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Switch the kernel's working directory to the tasks folder.\n",
    "%cd ../../bioR_tasks"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "10e9f0fb-4943-41bf-bef3-9fcd64796ddf",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Create the task folder; -p keeps the cell re-runnable if it already exists.\n",
    "!mkdir -p kegg_variant"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "cced244e-9d03-47be-8fa1-864f2736fe01",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Copy every file of the final dataset into the task folder.\n",
    "!cp ../BioReason/data/kegg_data/final_data/* kegg_variant/"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "bff9ce06-2cd8-4675-a23f-080027770bdb",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "id": "4c56a919",
   "metadata": {},
   "source": [
    "# Creating the Nt Variant Database"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7c28bc9f",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Work from the kegg_data folder for the cells in this section.\n",
    "%cd kegg_data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7618faf2",
   "metadata": {},
   "outputs": [],
   "source": [
    "from Bio import SeqIO\n",
    "import pandas as pd\n",
    "import json\n",
    "import os\n",
    "from pathlib import Path\n",
    "\n",
    "# Optional: Uncomment if you want to use HuggingFace datasets\n",
    "# from datasets import load_dataset, Dataset, DatasetDict\n",
    "\n",
    "print(\"Imports loaded for nucleotide database creation\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1b8cac05",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Var_ID</th>\n",
       "      <th>Network</th>\n",
       "      <th>Entry</th>\n",
       "      <th>Source</th>\n",
       "      <th>ID</th>\n",
       "      <th>TranscriptID</th>\n",
       "      <th>NucChange</th>\n",
       "      <th>Chr</th>\n",
       "      <th>Start</th>\n",
       "      <th>End</th>\n",
       "      <th>...</th>\n",
       "      <th>Network Expanded</th>\n",
       "      <th>Pathway</th>\n",
       "      <th>Class</th>\n",
       "      <th>Disease</th>\n",
       "      <th>Gene</th>\n",
       "      <th>Variant_Name</th>\n",
       "      <th>Variant_Gene</th>\n",
       "      <th>Variant_Gene Info</th>\n",
       "      <th>Variant_Type</th>\n",
       "      <th>Disease_Names</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>KEGG_1</td>\n",
       "      <td>N00073</td>\n",
       "      <td>1019v2</td>\n",
       "      <td>ClinVar</td>\n",
       "      <td>16929</td>\n",
       "      <td>NC_000012.12</td>\n",
       "      <td>NaN</td>\n",
       "      <td>12</td>\n",
       "      <td>57751646</td>\n",
       "      <td>57751646</td>\n",
       "      <td>...</td>\n",
       "      <td>((595,894,896)+1019v2) -&gt; 5925 // (1869,1870,1...</td>\n",
       "      <td>{'hsa05218': 'Melanoma'}</td>\n",
       "      <td>{'nt06268': 'Melanoma', 'nt06230': 'Cell cycle...</td>\n",
       "      <td>{'H00038': 'Melanoma is a form of skin cancer ...</td>\n",
       "      <td>{'595': 'CCND1; cyclin D1', '894': 'CCND2; cyc...</td>\n",
       "      <td>CDK4 mutation</td>\n",
       "      <td>CDK4</td>\n",
       "      <td>cyclin dependent kinase 4 [KO:K02089]</td>\n",
       "      <td>NaN</td>\n",
       "      <td>{'H00038': 'Melanoma'}</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>KEGG_2</td>\n",
       "      <td>N00073</td>\n",
       "      <td>1019v2</td>\n",
       "      <td>dbSNP</td>\n",
       "      <td>rs104894340</td>\n",
       "      <td>NC_000012.12</td>\n",
       "      <td>NaN</td>\n",
       "      <td>12</td>\n",
       "      <td>57751646</td>\n",
       "      <td>57751646</td>\n",
       "      <td>...</td>\n",
       "      <td>((595,894,896)+1019v2) -&gt; 5925 // (1869,1870,1...</td>\n",
       "      <td>{'hsa05218': 'Melanoma'}</td>\n",
       "      <td>{'nt06268': 'Melanoma', 'nt06230': 'Cell cycle...</td>\n",
       "      <td>{'H00038': 'Melanoma is a form of skin cancer ...</td>\n",
       "      <td>{'595': 'CCND1; cyclin D1', '894': 'CCND2; cyc...</td>\n",
       "      <td>CDK4 mutation</td>\n",
       "      <td>CDK4</td>\n",
       "      <td>cyclin dependent kinase 4 [KO:K02089]</td>\n",
       "      <td>NaN</td>\n",
       "      <td>{'H00038': 'Melanoma'}</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>KEGG_3</td>\n",
       "      <td>N00073</td>\n",
       "      <td>1019v2</td>\n",
       "      <td>dbSNP</td>\n",
       "      <td>rs104894340</td>\n",
       "      <td>NC_000012.12</td>\n",
       "      <td>NaN</td>\n",
       "      <td>12</td>\n",
       "      <td>57751646</td>\n",
       "      <td>57751646</td>\n",
       "      <td>...</td>\n",
       "      <td>((595,894,896)+1019v2) -&gt; 5925 // (1869,1870,1...</td>\n",
       "      <td>{'hsa05218': 'Melanoma'}</td>\n",
       "      <td>{'nt06268': 'Melanoma', 'nt06230': 'Cell cycle...</td>\n",
       "      <td>{'H00038': 'Melanoma is a form of skin cancer ...</td>\n",
       "      <td>{'595': 'CCND1; cyclin D1', '894': 'CCND2; cyc...</td>\n",
       "      <td>CDK4 mutation</td>\n",
       "      <td>CDK4</td>\n",
       "      <td>cyclin dependent kinase 4 [KO:K02089]</td>\n",
       "      <td>NaN</td>\n",
       "      <td>{'H00038': 'Melanoma'}</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>KEGG_4</td>\n",
       "      <td>N00073</td>\n",
       "      <td>1019v2</td>\n",
       "      <td>ClinVar</td>\n",
       "      <td>16928</td>\n",
       "      <td>NC_000012.12</td>\n",
       "      <td>NaN</td>\n",
       "      <td>12</td>\n",
       "      <td>57751647</td>\n",
       "      <td>57751647</td>\n",
       "      <td>...</td>\n",
       "      <td>((595,894,896)+1019v2) -&gt; 5925 // (1869,1870,1...</td>\n",
       "      <td>{'hsa05218': 'Melanoma'}</td>\n",
       "      <td>{'nt06268': 'Melanoma', 'nt06230': 'Cell cycle...</td>\n",
       "      <td>{'H00038': 'Melanoma is a form of skin cancer ...</td>\n",
       "      <td>{'595': 'CCND1; cyclin D1', '894': 'CCND2; cyc...</td>\n",
       "      <td>CDK4 mutation</td>\n",
       "      <td>CDK4</td>\n",
       "      <td>cyclin dependent kinase 4 [KO:K02089]</td>\n",
       "      <td>NaN</td>\n",
       "      <td>{'H00038': 'Melanoma'}</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>KEGG_5</td>\n",
       "      <td>N00073</td>\n",
       "      <td>1019v2</td>\n",
       "      <td>dbSNP</td>\n",
       "      <td>rs11547328</td>\n",
       "      <td>NC_000012.12</td>\n",
       "      <td>NaN</td>\n",
       "      <td>12</td>\n",
       "      <td>57751647</td>\n",
       "      <td>57751647</td>\n",
       "      <td>...</td>\n",
       "      <td>((595,894,896)+1019v2) -&gt; 5925 // (1869,1870,1...</td>\n",
       "      <td>{'hsa05218': 'Melanoma'}</td>\n",
       "      <td>{'nt06268': 'Melanoma', 'nt06230': 'Cell cycle...</td>\n",
       "      <td>{'H00038': 'Melanoma is a form of skin cancer ...</td>\n",
       "      <td>{'595': 'CCND1; cyclin D1', '894': 'CCND2; cyc...</td>\n",
       "      <td>CDK4 mutation</td>\n",
       "      <td>CDK4</td>\n",
       "      <td>cyclin dependent kinase 4 [KO:K02089]</td>\n",
       "      <td>NaN</td>\n",
       "      <td>{'H00038': 'Melanoma'}</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1444</th>\n",
       "      <td>KEGG_1445</td>\n",
       "      <td>N00244</td>\n",
       "      <td>9817v1</td>\n",
       "      <td>COSM</td>\n",
       "      <td>6196635</td>\n",
       "      <td>ENST00000393623.6</td>\n",
       "      <td>c.706G&gt;T</td>\n",
       "      <td>19</td>\n",
       "      <td>10492196</td>\n",
       "      <td>10492196</td>\n",
       "      <td>...</td>\n",
       "      <td>9817v1 // 4780 =&gt; (3162,1728,119391,221357,293...</td>\n",
       "      <td>{'hsa05225': 'Hepatocellular carcinoma'}</td>\n",
       "      <td>{'nt06263': 'Hepatocellular carcinoma', 'nt062...</td>\n",
       "      <td>{'H00048': 'Hepatocellular carcinoma (HCC) is ...</td>\n",
       "      <td>{'9817': 'KEAP1; kelch like ECH associated pro...</td>\n",
       "      <td>KEAP1 mutation</td>\n",
       "      <td>KEAP1</td>\n",
       "      <td>kelch like ECH associated protein 1 [KO:K10456]</td>\n",
       "      <td>NaN</td>\n",
       "      <td>{'H00048': 'Hepatocellular carcinoma;'}</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1445</th>\n",
       "      <td>KEGG_1446</td>\n",
       "      <td>N00244</td>\n",
       "      <td>9817v1</td>\n",
       "      <td>COSM</td>\n",
       "      <td>6196637</td>\n",
       "      <td>ENST00000393623.6</td>\n",
       "      <td>c.548A&gt;G</td>\n",
       "      <td>19</td>\n",
       "      <td>10499486</td>\n",
       "      <td>10499486</td>\n",
       "      <td>...</td>\n",
       "      <td>9817v1 // 4780 =&gt; (3162,1728,119391,221357,293...</td>\n",
       "      <td>{'hsa05225': 'Hepatocellular carcinoma'}</td>\n",
       "      <td>{'nt06263': 'Hepatocellular carcinoma', 'nt062...</td>\n",
       "      <td>{'H00048': 'Hepatocellular carcinoma (HCC) is ...</td>\n",
       "      <td>{'9817': 'KEAP1; kelch like ECH associated pro...</td>\n",
       "      <td>KEAP1 mutation</td>\n",
       "      <td>KEAP1</td>\n",
       "      <td>kelch like ECH associated protein 1 [KO:K10456]</td>\n",
       "      <td>NaN</td>\n",
       "      <td>{'H00048': 'Hepatocellular carcinoma;'}</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1446</th>\n",
       "      <td>KEGG_1447</td>\n",
       "      <td>N00258</td>\n",
       "      <td>999v2</td>\n",
       "      <td>COSM</td>\n",
       "      <td>4766271</td>\n",
       "      <td>ENST00000621016.4</td>\n",
       "      <td>c.662A&gt;G</td>\n",
       "      <td>16</td>\n",
       "      <td>68808823</td>\n",
       "      <td>68808823</td>\n",
       "      <td>...</td>\n",
       "      <td>999v2 // 1499 -&gt; (6932,83439,6934,51176) =&gt; (4...</td>\n",
       "      <td>{'hsa05226': 'Gastric cancer'}</td>\n",
       "      <td>{'nt06261': 'Gastric cancer', 'nt06215': 'WNT ...</td>\n",
       "      <td>{'H00018': \"Gastric cancer (GC) is one of the ...</td>\n",
       "      <td>{'999': 'CDH1; cadherin 1', '1499': 'CTNNB1; c...</td>\n",
       "      <td>CDH1 mutation</td>\n",
       "      <td>CDH1</td>\n",
       "      <td>cadherin 1 [KO:K05689]</td>\n",
       "      <td>NaN</td>\n",
       "      <td>{'H00018': 'Gastric cancer'}</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1447</th>\n",
       "      <td>KEGG_1448</td>\n",
       "      <td>N00258</td>\n",
       "      <td>999v2</td>\n",
       "      <td>COSM</td>\n",
       "      <td>4766211</td>\n",
       "      <td>ENST00000621016.4</td>\n",
       "      <td>c.755T&gt;G</td>\n",
       "      <td>16</td>\n",
       "      <td>68810264</td>\n",
       "      <td>68810264</td>\n",
       "      <td>...</td>\n",
       "      <td>999v2 // 1499 -&gt; (6932,83439,6934,51176) =&gt; (4...</td>\n",
       "      <td>{'hsa05226': 'Gastric cancer'}</td>\n",
       "      <td>{'nt06261': 'Gastric cancer', 'nt06215': 'WNT ...</td>\n",
       "      <td>{'H00018': \"Gastric cancer (GC) is one of the ...</td>\n",
       "      <td>{'999': 'CDH1; cadherin 1', '1499': 'CTNNB1; c...</td>\n",
       "      <td>CDH1 mutation</td>\n",
       "      <td>CDH1</td>\n",
       "      <td>cadherin 1 [KO:K05689]</td>\n",
       "      <td>NaN</td>\n",
       "      <td>{'H00018': 'Gastric cancer'}</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1448</th>\n",
       "      <td>KEGG_1449</td>\n",
       "      <td>N00258</td>\n",
       "      <td>999v2</td>\n",
       "      <td>COSM</td>\n",
       "      <td>1379150</td>\n",
       "      <td>ENST00000621016.4</td>\n",
       "      <td>c.769G&gt;A</td>\n",
       "      <td>16</td>\n",
       "      <td>68810278</td>\n",
       "      <td>68810278</td>\n",
       "      <td>...</td>\n",
       "      <td>999v2 // 1499 -&gt; (6932,83439,6934,51176) =&gt; (4...</td>\n",
       "      <td>{'hsa05226': 'Gastric cancer'}</td>\n",
       "      <td>{'nt06261': 'Gastric cancer', 'nt06215': 'WNT ...</td>\n",
       "      <td>{'H00018': \"Gastric cancer (GC) is one of the ...</td>\n",
       "      <td>{'999': 'CDH1; cadherin 1', '1499': 'CTNNB1; c...</td>\n",
       "      <td>CDH1 mutation</td>\n",
       "      <td>CDH1</td>\n",
       "      <td>cadherin 1 [KO:K05689]</td>\n",
       "      <td>NaN</td>\n",
       "      <td>{'H00018': 'Gastric cancer'}</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>1449 rows × 24 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "         Var_ID Network   Entry   Source           ID       TranscriptID  \\\n",
       "0        KEGG_1  N00073  1019v2  ClinVar        16929       NC_000012.12   \n",
       "1        KEGG_2  N00073  1019v2    dbSNP  rs104894340       NC_000012.12   \n",
       "2        KEGG_3  N00073  1019v2    dbSNP  rs104894340       NC_000012.12   \n",
       "3        KEGG_4  N00073  1019v2  ClinVar        16928       NC_000012.12   \n",
       "4        KEGG_5  N00073  1019v2    dbSNP   rs11547328       NC_000012.12   \n",
       "...         ...     ...     ...      ...          ...                ...   \n",
       "1444  KEGG_1445  N00244  9817v1     COSM      6196635  ENST00000393623.6   \n",
       "1445  KEGG_1446  N00244  9817v1     COSM      6196637  ENST00000393623.6   \n",
       "1446  KEGG_1447  N00258   999v2     COSM      4766271  ENST00000621016.4   \n",
       "1447  KEGG_1448  N00258   999v2     COSM      4766211  ENST00000621016.4   \n",
       "1448  KEGG_1449  N00258   999v2     COSM      1379150  ENST00000621016.4   \n",
       "\n",
       "     NucChange  Chr     Start       End  ...  \\\n",
       "0          NaN   12  57751646  57751646  ...   \n",
       "1          NaN   12  57751646  57751646  ...   \n",
       "2          NaN   12  57751646  57751646  ...   \n",
       "3          NaN   12  57751647  57751647  ...   \n",
       "4          NaN   12  57751647  57751647  ...   \n",
       "...        ...  ...       ...       ...  ...   \n",
       "1444  c.706G>T   19  10492196  10492196  ...   \n",
       "1445  c.548A>G   19  10499486  10499486  ...   \n",
       "1446  c.662A>G   16  68808823  68808823  ...   \n",
       "1447  c.755T>G   16  68810264  68810264  ...   \n",
       "1448  c.769G>A   16  68810278  68810278  ...   \n",
       "\n",
       "                                       Network Expanded  \\\n",
       "0     ((595,894,896)+1019v2) -> 5925 // (1869,1870,1...   \n",
       "1     ((595,894,896)+1019v2) -> 5925 // (1869,1870,1...   \n",
       "2     ((595,894,896)+1019v2) -> 5925 // (1869,1870,1...   \n",
       "3     ((595,894,896)+1019v2) -> 5925 // (1869,1870,1...   \n",
       "4     ((595,894,896)+1019v2) -> 5925 // (1869,1870,1...   \n",
       "...                                                 ...   \n",
       "1444  9817v1 // 4780 => (3162,1728,119391,221357,293...   \n",
       "1445  9817v1 // 4780 => (3162,1728,119391,221357,293...   \n",
       "1446  999v2 // 1499 -> (6932,83439,6934,51176) => (4...   \n",
       "1447  999v2 // 1499 -> (6932,83439,6934,51176) => (4...   \n",
       "1448  999v2 // 1499 -> (6932,83439,6934,51176) => (4...   \n",
       "\n",
       "                                       Pathway  \\\n",
       "0                     {'hsa05218': 'Melanoma'}   \n",
       "1                     {'hsa05218': 'Melanoma'}   \n",
       "2                     {'hsa05218': 'Melanoma'}   \n",
       "3                     {'hsa05218': 'Melanoma'}   \n",
       "4                     {'hsa05218': 'Melanoma'}   \n",
       "...                                        ...   \n",
       "1444  {'hsa05225': 'Hepatocellular carcinoma'}   \n",
       "1445  {'hsa05225': 'Hepatocellular carcinoma'}   \n",
       "1446            {'hsa05226': 'Gastric cancer'}   \n",
       "1447            {'hsa05226': 'Gastric cancer'}   \n",
       "1448            {'hsa05226': 'Gastric cancer'}   \n",
       "\n",
       "                                                  Class  \\\n",
       "0     {'nt06268': 'Melanoma', 'nt06230': 'Cell cycle...   \n",
       "1     {'nt06268': 'Melanoma', 'nt06230': 'Cell cycle...   \n",
       "2     {'nt06268': 'Melanoma', 'nt06230': 'Cell cycle...   \n",
       "3     {'nt06268': 'Melanoma', 'nt06230': 'Cell cycle...   \n",
       "4     {'nt06268': 'Melanoma', 'nt06230': 'Cell cycle...   \n",
       "...                                                 ...   \n",
       "1444  {'nt06263': 'Hepatocellular carcinoma', 'nt062...   \n",
       "1445  {'nt06263': 'Hepatocellular carcinoma', 'nt062...   \n",
       "1446  {'nt06261': 'Gastric cancer', 'nt06215': 'WNT ...   \n",
       "1447  {'nt06261': 'Gastric cancer', 'nt06215': 'WNT ...   \n",
       "1448  {'nt06261': 'Gastric cancer', 'nt06215': 'WNT ...   \n",
       "\n",
       "                                                Disease  \\\n",
       "0     {'H00038': 'Melanoma is a form of skin cancer ...   \n",
       "1     {'H00038': 'Melanoma is a form of skin cancer ...   \n",
       "2     {'H00038': 'Melanoma is a form of skin cancer ...   \n",
       "3     {'H00038': 'Melanoma is a form of skin cancer ...   \n",
       "4     {'H00038': 'Melanoma is a form of skin cancer ...   \n",
       "...                                                 ...   \n",
       "1444  {'H00048': 'Hepatocellular carcinoma (HCC) is ...   \n",
       "1445  {'H00048': 'Hepatocellular carcinoma (HCC) is ...   \n",
       "1446  {'H00018': \"Gastric cancer (GC) is one of the ...   \n",
       "1447  {'H00018': \"Gastric cancer (GC) is one of the ...   \n",
       "1448  {'H00018': \"Gastric cancer (GC) is one of the ...   \n",
       "\n",
       "                                                   Gene    Variant_Name  \\\n",
       "0     {'595': 'CCND1; cyclin D1', '894': 'CCND2; cyc...   CDK4 mutation   \n",
       "1     {'595': 'CCND1; cyclin D1', '894': 'CCND2; cyc...   CDK4 mutation   \n",
       "2     {'595': 'CCND1; cyclin D1', '894': 'CCND2; cyc...   CDK4 mutation   \n",
       "3     {'595': 'CCND1; cyclin D1', '894': 'CCND2; cyc...   CDK4 mutation   \n",
       "4     {'595': 'CCND1; cyclin D1', '894': 'CCND2; cyc...   CDK4 mutation   \n",
       "...                                                 ...             ...   \n",
       "1444  {'9817': 'KEAP1; kelch like ECH associated pro...  KEAP1 mutation   \n",
       "1445  {'9817': 'KEAP1; kelch like ECH associated pro...  KEAP1 mutation   \n",
       "1446  {'999': 'CDH1; cadherin 1', '1499': 'CTNNB1; c...   CDH1 mutation   \n",
       "1447  {'999': 'CDH1; cadherin 1', '1499': 'CTNNB1; c...   CDH1 mutation   \n",
       "1448  {'999': 'CDH1; cadherin 1', '1499': 'CTNNB1; c...   CDH1 mutation   \n",
       "\n",
       "     Variant_Gene                                Variant_Gene Info  \\\n",
       "0            CDK4            cyclin dependent kinase 4 [KO:K02089]   \n",
       "1            CDK4            cyclin dependent kinase 4 [KO:K02089]   \n",
       "2            CDK4            cyclin dependent kinase 4 [KO:K02089]   \n",
       "3            CDK4            cyclin dependent kinase 4 [KO:K02089]   \n",
       "4            CDK4            cyclin dependent kinase 4 [KO:K02089]   \n",
       "...           ...                                              ...   \n",
       "1444        KEAP1  kelch like ECH associated protein 1 [KO:K10456]   \n",
       "1445        KEAP1  kelch like ECH associated protein 1 [KO:K10456]   \n",
       "1446         CDH1                           cadherin 1 [KO:K05689]   \n",
       "1447         CDH1                           cadherin 1 [KO:K05689]   \n",
       "1448         CDH1                           cadherin 1 [KO:K05689]   \n",
       "\n",
       "     Variant_Type                            Disease_Names  \n",
       "0             NaN                   {'H00038': 'Melanoma'}  \n",
       "1             NaN                   {'H00038': 'Melanoma'}  \n",
       "2             NaN                   {'H00038': 'Melanoma'}  \n",
       "3             NaN                   {'H00038': 'Melanoma'}  \n",
       "4             NaN                   {'H00038': 'Melanoma'}  \n",
       "...           ...                                      ...  \n",
       "1444          NaN  {'H00048': 'Hepatocellular carcinoma;'}  \n",
       "1445          NaN  {'H00048': 'Hepatocellular carcinoma;'}  \n",
       "1446          NaN             {'H00018': 'Gastric cancer'}  \n",
       "1447          NaN             {'H00018': 'Gastric cancer'}  \n",
       "1448          NaN             {'H00018': 'Gastric cancer'}  \n",
       "\n",
       "[1449 rows x 24 columns]"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Load variant data for nucleotide database creation\n",
    "network_file = CONFIG['network_data_file']\n",
    "variant_data = pd.read_csv(network_file, sep='\\t')\n",
    "print(f\"✅ Loaded variant data: {len(variant_data)} entries\")\n",
    "variant_data.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "a7d31451",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1449"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(variant_data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "fc9baca9",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'N00073'"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "variant_data.iloc[1][\"Network\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "928146a6",
   "metadata": {},
   "outputs": [],
   "source": [
    "from Bio import SeqIO\n",
    "import os\n",
    "\n",
    "# Load reference genome sequences\n",
    "fasta_file = CONFIG['reference_fasta']\n",
    "if not os.path.exists(fasta_file):\n",
    "    print(f\"❌ Reference genome file not found: {fasta_file}\")\n",
    "    print(\"Please update CONFIG['reference_fasta'] with correct path\")\n",
    "    raise FileNotFoundError(f\"Reference genome not found: {fasta_file}\")\n",
    "\n",
    "record_dict = SeqIO.to_dict(SeqIO.parse(fasta_file, \"fasta\"))\n",
    "print(f\"✅ Loaded reference genome: {len(record_dict)} sequences\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e3184e72",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Use chromosome dictionary from configuration\n",
    "chromosome_dictionary = CONFIG['chromosome_dictionary']\n",
    "print(f\"✅ Chromosome mapping loaded: {len(chromosome_dictionary)} chromosomes\")\n",
    "print(\"Available chromosomes:\", list(chromosome_dictionary.keys()))"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "1cd34cc2",
   "metadata": {},
   "source": [
    "### Verification that the reference is present at the exact position I have in my data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "70cc6625",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Verify reference sequences (alternative implementation)\n",
    "chromosome_dictionary = CONFIG['chromosome_dictionary']\n",
    "verification_file = \"verification_alt.txt\"\n",
    "\n",
    "print(f\"Starting alternative sequence verification...\")\n",
    "print(f\"Results will be saved to: {verification_file}\")\n",
    "\n",
    "with open(verification_file, \"w\") as f:\n",
    "    for i in range(len(variant_data)):\n",
    "        try:\n",
    "            # ---- Input ----\n",
    "            chromosome_id = chromosome_dictionary[str(variant_data.iloc[i]['Chr'])]\n",
    "            if (variant_data.iloc[i]['TranscriptID'][:4] == \"ENST\"):\n",
    "                start = variant_data.iloc[i]['Start'] - 1\n",
    "            else:\n",
    "                start = variant_data.iloc[i]['Start']\n",
    "            reference_allele = variant_data.iloc[i]['RefAllele']\n",
    "            end = len(reference_allele) + start\n",
    "\n",
    "            chrom_seq = record_dict[chromosome_id].seq\n",
    "\n",
    "            # Adjust for 0-based indexing in Python\n",
    "            genomic_ref = chrom_seq[start: start + len(reference_allele)]\n",
    "\n",
    "            if genomic_ref.upper() != reference_allele.upper():\n",
    "                f.write(f\"⚠️ Warning: Entry number {i} with variant {variant_data.iloc[i]['ID']} expected '{reference_allele}', but found '{genomic_ref}'\\n\")\n",
    "            else:\n",
    "                f.write(f\"✅ Verified: {chromosome_id}:{start}-{end} → '{reference_allele}' matches genome\\n\")\n",
    "        \n",
    "        except Exception as e:\n",
    "            f.write(f\"❌ Error verifying variant {i}: {str(e)}\\n\")\n",
    "        \n",
    "        if (i + 1) % 200 == 0:\n",
    "            print(f\"Verified {i + 1}/{len(variant_data)} variants...\")\n",
    "\n",
    "print(f\"✅ Alternative verification complete. Results: {verification_file}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "83c0dcce-81b3-4162-a683-3ba86d065eb7",
   "metadata": {},
   "source": [
    "## Read in Final_data JSON files"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9745a67d-3b2a-4679-92c3-92fc199a8763",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>question</th>\n",
       "      <th>answer</th>\n",
       "      <th>reference_sequence</th>\n",
       "      <th>variant_sequence</th>\n",
       "      <th>reasoning.reasoning_steps</th>\n",
       "      <th>ID</th>\n",
       "      <th>temp_ID</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Chromosome Number: 20\\nNetwork Definition of t...</td>\n",
       "      <td>Creutzfeldt-Jakob Disease</td>\n",
       "      <td>AATAAGCACAGAGGAAATAACATAATATCTCAAGTAGCTGTAACTG...</td>\n",
       "      <td>AATAAGCACAGAGGAAATAACATAATATCTCAAGTAGCTGTAACTG...</td>\n",
       "      <td>[Step 1: The variant is an insertion in the PR...</td>\n",
       "      <td>KEGG_854</td>\n",
       "      <td>854</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Chromosome Number: 20\\nNetwork Definition of t...</td>\n",
       "      <td>Creutzfeldt-Jakob Disease</td>\n",
       "      <td>AATAAGCACAGAGGAAATAACATAATATCTCAAGTAGCTGTAACTG...</td>\n",
       "      <td>AATAAGCACAGAGGAAATAACATAATATCTCAAGTAGCTGTAACTG...</td>\n",
       "      <td>[Step 1: The variant is a deletion of 47 nucle...</td>\n",
       "      <td>KEGG_841</td>\n",
       "      <td>841</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>Chromosome Number: 21\\nNetwork Definition of t...</td>\n",
       "      <td>Alzheimer's disease</td>\n",
       "      <td>GCTGAAACTAACATACTAGCAATCTGGAAAGGCAATATAAAATATA...</td>\n",
       "      <td>GCTGAAACTAACATACTAGCAATCTGGAAAGGCAATATAAAATATA...</td>\n",
       "      <td>[Step 1: The TC&gt;GA mutation in the APP gene on...</td>\n",
       "      <td>KEGG_468</td>\n",
       "      <td>468</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>Chromosome Number: 1\\nNetwork Definition of th...</td>\n",
       "      <td>Primary Aldosteronism</td>\n",
       "      <td>AATGTTATAAAAGTAAATAGTCTAGGGATGTCTTATTTCCAGATAA...</td>\n",
       "      <td>AATGTTATAAAAGTAAATAGTCTAGGGATGTCTTATTTCCAGATAA...</td>\n",
       "      <td>[Step 1: The variant KEGG_635 is a 15-nucleoti...</td>\n",
       "      <td>KEGG_635</td>\n",
       "      <td>635</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>Chromosome Number: 14\\nNetwork Definition of t...</td>\n",
       "      <td>Spinocerebellar Ataxia</td>\n",
       "      <td>TCATTAACTAAATGAAGGTCTACATTTAGGCAGTTTGTAATTTTGG...</td>\n",
       "      <td>TCATTAACTAAATGAAGGTCTACATTTAGGCAGTTTGTAATTTTGG...</td>\n",
       "      <td>[Step 1: The variant is a trinucleotide repeat...</td>\n",
       "      <td>KEGG_620</td>\n",
       "      <td>620</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1444</th>\n",
       "      <td>Chromosome Number: 6\\nNetwork Definition of th...</td>\n",
       "      <td>Spinocerebellar Ataxia</td>\n",
       "      <td>gaCGTATACATTAAATGTGTACAGTTTTTGTATATCAATTAGAAGT...</td>\n",
       "      <td>gaCGTATACATTAAATGTGTACAGTTTTTGTATATCAATTAGAAGT...</td>\n",
       "      <td>[Step 1: The variant KEGG_286 is an A&gt;G substi...</td>\n",
       "      <td>KEGG_286</td>\n",
       "      <td>286</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1445</th>\n",
       "      <td>Chromosome Number: 6\\nNetwork Definition of th...</td>\n",
       "      <td>Spinocerebellar Ataxia</td>\n",
       "      <td>TTTTCaagattataaaatatgaaatgtcaAAATATTACCTTCATCA...</td>\n",
       "      <td>TTTTCaagattataaaatatgaaatgtcaAAATATTACCTTCATCA...</td>\n",
       "      <td>[Step 1: The variant is a single cytosine (C) ...</td>\n",
       "      <td>KEGG_293</td>\n",
       "      <td>293</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1446</th>\n",
       "      <td>Chromosome Number: 12\\nNetwork Definition of t...</td>\n",
       "      <td>Pituitary Adenoma</td>\n",
       "      <td>GTGGACAAGATGTGGCTAAGAAAACAAGCTACACATCAAGCTCATC...</td>\n",
       "      <td>GTGGACAAGATGTGGCTAAGAAAACAAGCTACACATCAAGCTCATC...</td>\n",
       "      <td>[Step 1: The variant is a 20-nucleotide duplic...</td>\n",
       "      <td>KEGG_7</td>\n",
       "      <td>7</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1447</th>\n",
       "      <td>Chromosome Number: 11\\nNetwork Definition of t...</td>\n",
       "      <td>Spinocerebellar Ataxia</td>\n",
       "      <td>ATCTGTGGGTTCTTTTAAATAGGCCAGATTTTATCCTAAAGGTAAG...</td>\n",
       "      <td>ATCTGTGGGTTCTTTTAAATAGGCCAGATTTTATCCTAAAGGTAAG...</td>\n",
       "      <td>[Step 1: The variant KEGG_1285 is an A&gt;G subst...</td>\n",
       "      <td>KEGG_1285</td>\n",
       "      <td>1285</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1448</th>\n",
       "      <td>Chromosome Number: 7\\nNetwork Definition of th...</td>\n",
       "      <td>Melanoma</td>\n",
       "      <td>tataattttaggttttgcaATTTCAGCACTTAAAATCTGTTTTCCC...</td>\n",
       "      <td>tataattttaggttttgcaATTTCAGCACTTAAAATCTGTTTTCCC...</td>\n",
       "      <td>[Step 1: The variant involves a nucleotide cha...</td>\n",
       "      <td>KEGG_1290</td>\n",
       "      <td>1290</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>1449 rows × 7 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                                               question  \\\n",
       "0     Chromosome Number: 20\\nNetwork Definition of t...   \n",
       "1     Chromosome Number: 20\\nNetwork Definition of t...   \n",
       "2     Chromosome Number: 21\\nNetwork Definition of t...   \n",
       "3     Chromosome Number: 1\\nNetwork Definition of th...   \n",
       "4     Chromosome Number: 14\\nNetwork Definition of t...   \n",
       "...                                                 ...   \n",
       "1444  Chromosome Number: 6\\nNetwork Definition of th...   \n",
       "1445  Chromosome Number: 6\\nNetwork Definition of th...   \n",
       "1446  Chromosome Number: 12\\nNetwork Definition of t...   \n",
       "1447  Chromosome Number: 11\\nNetwork Definition of t...   \n",
       "1448  Chromosome Number: 7\\nNetwork Definition of th...   \n",
       "\n",
       "                         answer  \\\n",
       "0     Creutzfeldt-Jakob Disease   \n",
       "1     Creutzfeldt-Jakob Disease   \n",
       "2           Alzheimer's disease   \n",
       "3         Primary Aldosteronism   \n",
       "4        Spinocerebellar Ataxia   \n",
       "...                         ...   \n",
       "1444     Spinocerebellar Ataxia   \n",
       "1445     Spinocerebellar Ataxia   \n",
       "1446          Pituitary Adenoma   \n",
       "1447     Spinocerebellar Ataxia   \n",
       "1448                   Melanoma   \n",
       "\n",
       "                                     reference_sequence  \\\n",
       "0     AATAAGCACAGAGGAAATAACATAATATCTCAAGTAGCTGTAACTG...   \n",
       "1     AATAAGCACAGAGGAAATAACATAATATCTCAAGTAGCTGTAACTG...   \n",
       "2     GCTGAAACTAACATACTAGCAATCTGGAAAGGCAATATAAAATATA...   \n",
       "3     AATGTTATAAAAGTAAATAGTCTAGGGATGTCTTATTTCCAGATAA...   \n",
       "4     TCATTAACTAAATGAAGGTCTACATTTAGGCAGTTTGTAATTTTGG...   \n",
       "...                                                 ...   \n",
       "1444  gaCGTATACATTAAATGTGTACAGTTTTTGTATATCAATTAGAAGT...   \n",
       "1445  TTTTCaagattataaaatatgaaatgtcaAAATATTACCTTCATCA...   \n",
       "1446  GTGGACAAGATGTGGCTAAGAAAACAAGCTACACATCAAGCTCATC...   \n",
       "1447  ATCTGTGGGTTCTTTTAAATAGGCCAGATTTTATCCTAAAGGTAAG...   \n",
       "1448  tataattttaggttttgcaATTTCAGCACTTAAAATCTGTTTTCCC...   \n",
       "\n",
       "                                       variant_sequence  \\\n",
       "0     AATAAGCACAGAGGAAATAACATAATATCTCAAGTAGCTGTAACTG...   \n",
       "1     AATAAGCACAGAGGAAATAACATAATATCTCAAGTAGCTGTAACTG...   \n",
       "2     GCTGAAACTAACATACTAGCAATCTGGAAAGGCAATATAAAATATA...   \n",
       "3     AATGTTATAAAAGTAAATAGTCTAGGGATGTCTTATTTCCAGATAA...   \n",
       "4     TCATTAACTAAATGAAGGTCTACATTTAGGCAGTTTGTAATTTTGG...   \n",
       "...                                                 ...   \n",
       "1444  gaCGTATACATTAAATGTGTACAGTTTTTGTATATCAATTAGAAGT...   \n",
       "1445  TTTTCaagattataaaatatgaaatgtcaAAATATTACCTTCATCA...   \n",
       "1446  GTGGACAAGATGTGGCTAAGAAAACAAGCTACACATCAAGCTCATC...   \n",
       "1447  ATCTGTGGGTTCTTTTAAATAGGCCAGATTTTATCCTAAAGGTAAG...   \n",
       "1448  tataattttaggttttgcaATTTCAGCACTTAAAATCTGTTTTCCC...   \n",
       "\n",
       "                              reasoning.reasoning_steps         ID  temp_ID  \n",
       "0     [Step 1: The variant is an insertion in the PR...   KEGG_854      854  \n",
       "1     [Step 1: The variant is a deletion of 47 nucle...   KEGG_841      841  \n",
       "2     [Step 1: The TC>GA mutation in the APP gene on...   KEGG_468      468  \n",
       "3     [Step 1: The variant KEGG_635 is a 15-nucleoti...   KEGG_635      635  \n",
       "4     [Step 1: The variant is a trinucleotide repeat...   KEGG_620      620  \n",
       "...                                                 ...        ...      ...  \n",
       "1444  [Step 1: The variant KEGG_286 is an A>G substi...   KEGG_286      286  \n",
       "1445  [Step 1: The variant is a single cytosine (C) ...   KEGG_293      293  \n",
       "1446  [Step 1: The variant is a 20-nucleotide duplic...     KEGG_7        7  \n",
       "1447  [Step 1: The variant KEGG_1285 is an A>G subst...  KEGG_1285     1285  \n",
       "1448  [Step 1: The variant involves a nucleotide cha...  KEGG_1290     1290  \n",
       "\n",
       "[1449 rows x 7 columns]"
      ]
     },
     "execution_count": 29,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import re\n",
    "import os\n",
    "import json\n",
    "import pandas as pd\n",
    "from pathlib import Path\n",
    "\n",
    "# Read final dataset JSON files and create combined DataFrame\n",
    "\n",
    "# Path to the directory containing JSON files\n",
    "json_dir = CONFIG['final_data_dir']\n",
    "if not os.path.exists(json_dir):\n",
    "    print(f\"❌ JSON directory not found: {json_dir}\")\n",
    "    print(\"Please ensure previous processing steps completed successfully\")\n",
    "    raise FileNotFoundError(f\"Directory not found: {json_dir}\")\n",
    "\n",
    "print(f\"Processing JSON files from: {json_dir}\")\n",
    "\n",
    "# Initialize a list to hold DataFrames\n",
    "df_list = []\n",
    "processed_count = 0\n",
    "\n",
    "# Loop through all files in the directory\n",
    "for filename in os.listdir(json_dir):\n",
    "    if filename.endswith(\".json\"):\n",
    "        match = re.search(r\"(KEGG_\\d+)_with_seqs\", filename)\n",
    "        if match:\n",
    "            kegg_id = match.group(1)  # Extract 'KEGG_<number>'\n",
    "            file_path = os.path.join(json_dir, filename)\n",
    "            \n",
    "            try:\n",
    "                with open(file_path, 'r') as f:\n",
    "                    data = json.load(f)\n",
    "                    \n",
    "                df = pd.json_normalize(data)\n",
    "                df['ID'] = kegg_id  # Add the full KEGG ID string\n",
    "                df['temp_ID'] = int(kegg_id[5:])  # Extract numeric ID for sorting\n",
    "                df_list.append(df)\n",
    "                processed_count += 1\n",
    "                \n",
    "                if processed_count % 100 == 0:\n",
    "                    print(f\"Processed {processed_count} JSON files...\")\n",
    "                    \n",
    "            except Exception as e:\n",
    "                print(f\"[Warning] Could not process {filename}: {str(e)}\")\n",
    "\n",
    "# Concatenate all DataFrames into one\n",
    "if df_list:\n",
    "    combined_df = pd.concat(df_list, ignore_index=True)\n",
    "    print(f\"✅ Combined {len(df_list)} JSON files into DataFrame\")\n",
    "    print(f\"Total samples: {len(combined_df)}\")\n",
    "else:\n",
    "    print(\"❌ No JSON files found or processed successfully\")\n",
    "    combined_df = pd.DataFrame()\n",
    "\n",
    "# Display the result\n",
    "combined_df.head() if not combined_df.empty else print(\"No data to display\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a81e8836-9618-4e62-b192-ee397a063ce7",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "id": "46c1083a-d499-428e-9180-2b62e83f1751",
   "metadata": {},
   "outputs": [],
   "source": [
    "combined_df = combined_df.sort_values(by=['temp_ID'])\n",
    "combined_df = combined_df.rename(columns={\"reasoning.reasoning_steps\" : \"reasoning\"})\n",
    "combined_df = combined_df.drop(columns=['temp_ID'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "id": "1c3e8a2e-444e-4d48-b4c1-c8b5dea5753e",
   "metadata": {},
   "outputs": [],
   "source": [
    "combined_df = combined_df[['ID','question','answer','reference_sequence','variant_sequence','reasoning']]\n",
    "combined_df = combined_df.reset_index(drop=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "id": "4200c786-4365-407e-96d4-f5cabfc7b3b1",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>ID</th>\n",
       "      <th>question</th>\n",
       "      <th>answer</th>\n",
       "      <th>reference_sequence</th>\n",
       "      <th>variant_sequence</th>\n",
       "      <th>reasoning</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>KEGG_1</td>\n",
       "      <td>Chromosome Number: 12\\nNetwork Definition of t...</td>\n",
       "      <td>Melanoma</td>\n",
       "      <td>gcttgagcccaggagttctagatcagcctgggcaagcaagaccttg...</td>\n",
       "      <td>gcttgagcccaggagttctagatcagcctgggcaagcaagaccttg...</td>\n",
       "      <td>[Step 1: The C&gt;T mutation at position 57751646...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>KEGG_2</td>\n",
       "      <td>Chromosome Number: 12\\nNetwork Definition of t...</td>\n",
       "      <td>Melanoma</td>\n",
       "      <td>gcttgagcccaggagttctagatcagcctgggcaagcaagaccttg...</td>\n",
       "      <td>gcttgagcccaggagttctagatcagcctgggcaagcaagaccttg...</td>\n",
       "      <td>[Step 1: The C&gt;A mutation at position 57751646...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>KEGG_3</td>\n",
       "      <td>Chromosome Number: 12\\nNetwork Definition of t...</td>\n",
       "      <td>Melanoma</td>\n",
       "      <td>gcttgagcccaggagttctagatcagcctgggcaagcaagaccttg...</td>\n",
       "      <td>gcttgagcccaggagttctagatcagcctgggcaagcaagaccttg...</td>\n",
       "      <td>[Step 1: The C&gt;G mutation at position 57751646...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>KEGG_4</td>\n",
       "      <td>Chromosome Number: 12\\nNetwork Definition of t...</td>\n",
       "      <td>Melanoma</td>\n",
       "      <td>cttgagcccaggagttctagatcagcctgggcaagcaagaccttgt...</td>\n",
       "      <td>cttgagcccaggagttctagatcagcctgggcaagcaagaccttgt...</td>\n",
       "      <td>[Step 1: The G&gt;A mutation at position 57751647...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>KEGG_5</td>\n",
       "      <td>Chromosome Number: 12\\nNetwork Definition of t...</td>\n",
       "      <td>Melanoma</td>\n",
       "      <td>cttgagcccaggagttctagatcagcctgggcaagcaagaccttgt...</td>\n",
       "      <td>cttgagcccaggagttctagatcagcctgggcaagcaagaccttgt...</td>\n",
       "      <td>[Step 1: The G&gt;C mutation at position 57751647...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1444</th>\n",
       "      <td>KEGG_1445</td>\n",
       "      <td>Chromosome Number: 19\\nNetwork Definition of t...</td>\n",
       "      <td>Hepatocellular carcinoma</td>\n",
       "      <td>gagctgagatcatgccactgcactccaacctgggcaacagagcgag...</td>\n",
       "      <td>gagctgagatcatgccactgcactccaacctgggcaacagagcgag...</td>\n",
       "      <td>[Step 1: The variant is a C&gt;A substitution at ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1445</th>\n",
       "      <td>KEGG_1446</td>\n",
       "      <td>Chromosome Number: 19\\nNetwork Definition of t...</td>\n",
       "      <td>Hepatocellular carcinoma</td>\n",
       "      <td>TGAAGGGTAGTACTGCCTCATAGGACAGTTGGGAACAGTCAATGAT...</td>\n",
       "      <td>TGAAGGGTAGTACTGCCTCATAGGACAGTTGGGAACAGTCAATGAT...</td>\n",
       "      <td>[Step 1: The variant is a T&gt;C substitution at ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1446</th>\n",
       "      <td>KEGG_1447</td>\n",
       "      <td>Chromosome Number: 16\\nNetwork Definition of t...</td>\n",
       "      <td>Gastric cancer</td>\n",
       "      <td>CAAACACAGGATGTAGAGGGCAGAGAGCATaggtgtgtgcgcatgt...</td>\n",
       "      <td>CAAACACAGGATGTAGAGGGCAGAGAGCATaggtgtgtgcgcatgt...</td>\n",
       "      <td>[Step 1: The variant KEGG_1447 represents an A...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1447</th>\n",
       "      <td>KEGG_1448</td>\n",
       "      <td>Chromosome Number: 16\\nNetwork Definition of t...</td>\n",
       "      <td>Gastric cancer</td>\n",
       "      <td>GATATATATAATTTGTCATTGATAAGAGAATGTGTCATTAAATTCA...</td>\n",
       "      <td>GATATATATAATTTGTCATTGATAAGAGAATGTGTCATTAAATTCA...</td>\n",
       "      <td>[Step 1: The variant KEGG_1448 is a T&gt;G substi...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1448</th>\n",
       "      <td>KEGG_1449</td>\n",
       "      <td>Chromosome Number: 16\\nNetwork Definition of t...</td>\n",
       "      <td>Gastric cancer</td>\n",
       "      <td>GTCATTGATAAGAGAATGTGTCATTAAATTCAAACTGTACACTGCC...</td>\n",
       "      <td>GTCATTGATAAGAGAATGTGTCATTAAATTCAAACTGTACACTGCC...</td>\n",
       "      <td>[Step 1: The variant KEGG_1449 is a G&gt;A substi...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>1449 rows × 6 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "             ID                                           question  \\\n",
       "0        KEGG_1  Chromosome Number: 12\\nNetwork Definition of t...   \n",
       "1        KEGG_2  Chromosome Number: 12\\nNetwork Definition of t...   \n",
       "2        KEGG_3  Chromosome Number: 12\\nNetwork Definition of t...   \n",
       "3        KEGG_4  Chromosome Number: 12\\nNetwork Definition of t...   \n",
       "4        KEGG_5  Chromosome Number: 12\\nNetwork Definition of t...   \n",
       "...         ...                                                ...   \n",
       "1444  KEGG_1445  Chromosome Number: 19\\nNetwork Definition of t...   \n",
       "1445  KEGG_1446  Chromosome Number: 19\\nNetwork Definition of t...   \n",
       "1446  KEGG_1447  Chromosome Number: 16\\nNetwork Definition of t...   \n",
       "1447  KEGG_1448  Chromosome Number: 16\\nNetwork Definition of t...   \n",
       "1448  KEGG_1449  Chromosome Number: 16\\nNetwork Definition of t...   \n",
       "\n",
       "                        answer  \\\n",
       "0                     Melanoma   \n",
       "1                     Melanoma   \n",
       "2                     Melanoma   \n",
       "3                     Melanoma   \n",
       "4                     Melanoma   \n",
       "...                        ...   \n",
       "1444  Hepatocellular carcinoma   \n",
       "1445  Hepatocellular carcinoma   \n",
       "1446            Gastric cancer   \n",
       "1447            Gastric cancer   \n",
       "1448            Gastric cancer   \n",
       "\n",
       "                                     reference_sequence  \\\n",
       "0     gcttgagcccaggagttctagatcagcctgggcaagcaagaccttg...   \n",
       "1     gcttgagcccaggagttctagatcagcctgggcaagcaagaccttg...   \n",
       "2     gcttgagcccaggagttctagatcagcctgggcaagcaagaccttg...   \n",
       "3     cttgagcccaggagttctagatcagcctgggcaagcaagaccttgt...   \n",
       "4     cttgagcccaggagttctagatcagcctgggcaagcaagaccttgt...   \n",
       "...                                                 ...   \n",
       "1444  gagctgagatcatgccactgcactccaacctgggcaacagagcgag...   \n",
       "1445  TGAAGGGTAGTACTGCCTCATAGGACAGTTGGGAACAGTCAATGAT...   \n",
       "1446  CAAACACAGGATGTAGAGGGCAGAGAGCATaggtgtgtgcgcatgt...   \n",
       "1447  GATATATATAATTTGTCATTGATAAGAGAATGTGTCATTAAATTCA...   \n",
       "1448  GTCATTGATAAGAGAATGTGTCATTAAATTCAAACTGTACACTGCC...   \n",
       "\n",
       "                                       variant_sequence  \\\n",
       "0     gcttgagcccaggagttctagatcagcctgggcaagcaagaccttg...   \n",
       "1     gcttgagcccaggagttctagatcagcctgggcaagcaagaccttg...   \n",
       "2     gcttgagcccaggagttctagatcagcctgggcaagcaagaccttg...   \n",
       "3     cttgagcccaggagttctagatcagcctgggcaagcaagaccttgt...   \n",
       "4     cttgagcccaggagttctagatcagcctgggcaagcaagaccttgt...   \n",
       "...                                                 ...   \n",
       "1444  gagctgagatcatgccactgcactccaacctgggcaacagagcgag...   \n",
       "1445  TGAAGGGTAGTACTGCCTCATAGGACAGTTGGGAACAGTCAATGAT...   \n",
       "1446  CAAACACAGGATGTAGAGGGCAGAGAGCATaggtgtgtgcgcatgt...   \n",
       "1447  GATATATATAATTTGTCATTGATAAGAGAATGTGTCATTAAATTCA...   \n",
       "1448  GTCATTGATAAGAGAATGTGTCATTAAATTCAAACTGTACACTGCC...   \n",
       "\n",
       "                                              reasoning  \n",
       "0     [Step 1: The C>T mutation at position 57751646...  \n",
       "1     [Step 1: The C>A mutation at position 57751646...  \n",
       "2     [Step 1: The C>G mutation at position 57751646...  \n",
       "3     [Step 1: The G>A mutation at position 57751647...  \n",
       "4     [Step 1: The G>C mutation at position 57751647...  \n",
       "...                                                 ...  \n",
       "1444  [Step 1: The variant is a C>A substitution at ...  \n",
       "1445  [Step 1: The variant is a T>C substitution at ...  \n",
       "1446  [Step 1: The variant KEGG_1447 represents an A...  \n",
       "1447  [Step 1: The variant KEGG_1448 is a T>G substi...  \n",
       "1448  [Step 1: The variant KEGG_1449 is a G>A substi...  \n",
       "\n",
       "[1449 rows x 6 columns]"
      ]
     },
     "execution_count": 34,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "combined_df"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "f5cd7e22",
   "metadata": {},
   "source": [
    "### Performing the mutation and saving the reference and variant allele with a 1000 nt window"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "id": "8c89d455-598d-45e3-821b-6e37075b3a77",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "4001"
      ]
     },
     "execution_count": 42,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(combined_df.iloc[0]['reference_sequence'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "id": "a1dd3ed8-18ca-4468-9ab9-98ebf4713260",
   "metadata": {},
   "outputs": [],
   "source": [
    "KEGG_2000 = combined_df.copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 49,
   "id": "688a7d0b-4a31-484d-9835-eb66d674b5de",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'KEGG_2'"
      ]
     },
     "execution_count": 49,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "KEGG_2000.at[1,'ID']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c6fc35c2",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Generate sequences with updated window size\n",
    "chromosome_dictionary = CONFIG['chromosome_dictionary']\n",
    "window = CONFIG['sequence_window']\n",
    "\n",
    "print(f\"Generating sequences with {window}bp windows...\")\n",
    "KEGG_2000 = combined_df.copy()\n",
    "\n",
    "for i in range(len(KEGG_2000)):\n",
    "    try:\n",
    "        chromosome_id = chromosome_dictionary[str(variant_data.iloc[i]['Chr'])]\n",
    "        if (variant_data.iloc[i]['TranscriptID'][:4] == \"ENST\"):\n",
    "            start = variant_data.iloc[i]['Start'] - 1\n",
    "        else:\n",
    "            start = variant_data.iloc[i]['Start']\n",
    "        reference_allele = variant_data.iloc[i]['RefAllele']\n",
    "        variant_allele = variant_data.iloc[i]['AltAllele']\n",
    "\n",
    "        end = len(reference_allele) + start\n",
    "        \n",
    "        chrom_seq = record_dict[chromosome_id].seq\n",
    "\n",
    "        # Extract region\n",
    "        region_start = max(0, start - window)\n",
    "        region_end = end + window\n",
    "\n",
    "        ref_seq = chrom_seq[region_start:region_end]\n",
    "\n",
    "        if (variant_allele == \"deletion\"):\n",
    "            # Apply mutation\n",
    "            mutated_seq = ref_seq[:window] + ref_seq[window + len(reference_allele):]\n",
    "\n",
    "            KEGG_2000.at[i,'reference_sequence'] = str(ref_seq)\n",
    "            KEGG_2000.at[i,'variant_sequence'] = str(mutated_seq)\n",
    "            \n",
    "        else:\n",
    "            del_len = len(reference_allele)\n",
    "            # Apply mutation\n",
    "            mutated_seq = ref_seq[:window] + variant_allele + ref_seq[window + del_len:]\n",
    "\n",
    "            KEGG_2000.at[i,'reference_sequence'] = str(ref_seq)\n",
    "            KEGG_2000.at[i,'variant_sequence'] = str(mutated_seq)\n",
    "            \n",
    "        if (i + 1) % 100 == 0:\n",
    "            print(f\"Generated sequences for {i + 1}/{len(KEGG_2000)} variants...\")\n",
    "            \n",
    "    except Exception as e:\n",
    "        print(f\"[Error] Failed to generate sequence for variant {i}: {str(e)}\")\n",
    "\n",
    "print(f\"✅ Sequence generation complete for {window}bp windows\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 64,
   "id": "e2a50c08-ccae-45ca-98e1-0c3d3e7d4647",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>ID</th>\n",
       "      <th>question</th>\n",
       "      <th>answer</th>\n",
       "      <th>reference_sequence</th>\n",
       "      <th>variant_sequence</th>\n",
       "      <th>reasoning</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>KEGG_1</td>\n",
       "      <td>Chromosome Number: 12\\nNetwork Definition of t...</td>\n",
       "      <td>Melanoma</td>\n",
       "      <td>TTCAAGGTAGTCCAGGGTATGTGGGTCCCATACTTTCGACGAAACA...</td>\n",
       "      <td>TTCAAGGTAGTCCAGGGTATGTGGGTCCCATACTTTCGACGAAACA...</td>\n",
       "      <td>[Step 1: The C&gt;T mutation at position 57751646...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>KEGG_2</td>\n",
       "      <td>Chromosome Number: 12\\nNetwork Definition of t...</td>\n",
       "      <td>Melanoma</td>\n",
       "      <td>TTCAAGGTAGTCCAGGGTATGTGGGTCCCATACTTTCGACGAAACA...</td>\n",
       "      <td>TTCAAGGTAGTCCAGGGTATGTGGGTCCCATACTTTCGACGAAACA...</td>\n",
       "      <td>[Step 1: The C&gt;A mutation at position 57751646...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>KEGG_3</td>\n",
       "      <td>Chromosome Number: 12\\nNetwork Definition of t...</td>\n",
       "      <td>Melanoma</td>\n",
       "      <td>TTCAAGGTAGTCCAGGGTATGTGGGTCCCATACTTTCGACGAAACA...</td>\n",
       "      <td>TTCAAGGTAGTCCAGGGTATGTGGGTCCCATACTTTCGACGAAACA...</td>\n",
       "      <td>[Step 1: The C&gt;G mutation at position 57751646...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>KEGG_4</td>\n",
       "      <td>Chromosome Number: 12\\nNetwork Definition of t...</td>\n",
       "      <td>Melanoma</td>\n",
       "      <td>TCAAGGTAGTCCAGGGTATGTGGGTCCCATACTTTCGACGAAACAT...</td>\n",
       "      <td>TCAAGGTAGTCCAGGGTATGTGGGTCCCATACTTTCGACGAAACAT...</td>\n",
       "      <td>[Step 1: The G&gt;A mutation at position 57751647...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>KEGG_5</td>\n",
       "      <td>Chromosome Number: 12\\nNetwork Definition of t...</td>\n",
       "      <td>Melanoma</td>\n",
       "      <td>TCAAGGTAGTCCAGGGTATGTGGGTCCCATACTTTCGACGAAACAT...</td>\n",
       "      <td>TCAAGGTAGTCCAGGGTATGTGGGTCCCATACTTTCGACGAAACAT...</td>\n",
       "      <td>[Step 1: The G&gt;C mutation at position 57751647...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1444</th>\n",
       "      <td>KEGG_1445</td>\n",
       "      <td>Chromosome Number: 19\\nNetwork Definition of t...</td>\n",
       "      <td>Hepatocellular carcinoma</td>\n",
       "      <td>gcactccagcctgggcaacagagcaagagagacagggtcttactct...</td>\n",
       "      <td>gcactccagcctgggcaacagagcaagagagacagggtcttactct...</td>\n",
       "      <td>[Step 1: The variant is a C&gt;A substitution at ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1445</th>\n",
       "      <td>KEGG_1446</td>\n",
       "      <td>Chromosome Number: 19\\nNetwork Definition of t...</td>\n",
       "      <td>Hepatocellular carcinoma</td>\n",
       "      <td>ctcccaaagtgctgggattacaggcgtgagccactgggccctgcCC...</td>\n",
       "      <td>ctcccaaagtgctgggattacaggcgtgagccactgggccctgcCC...</td>\n",
       "      <td>[Step 1: The variant is a T&gt;C substitution at ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1446</th>\n",
       "      <td>KEGG_1447</td>\n",
       "      <td>Chromosome Number: 16\\nNetwork Definition of t...</td>\n",
       "      <td>Gastric cancer</td>\n",
       "      <td>ggctgggtgtggtggctcatgcctgtaatcccagcattttgggagg...</td>\n",
       "      <td>ggctgggtgtggtggctcatgcctgtaatcccagcattttgggagg...</td>\n",
       "      <td>[Step 1: The variant KEGG_1447 represents an A...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1447</th>\n",
       "      <td>KEGG_1448</td>\n",
       "      <td>Chromosome Number: 16\\nNetwork Definition of t...</td>\n",
       "      <td>Gastric cancer</td>\n",
       "      <td>tttgagatagggtttcactctgtcacccaggctggaaccacaacct...</td>\n",
       "      <td>tttgagatagggtttcactctgtcacccaggctggaaccacaacct...</td>\n",
       "      <td>[Step 1: The variant KEGG_1448 is a T&gt;G substi...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1448</th>\n",
       "      <td>KEGG_1449</td>\n",
       "      <td>Chromosome Number: 16\\nNetwork Definition of t...</td>\n",
       "      <td>Gastric cancer</td>\n",
       "      <td>tcactctgtcacccaggctggaaccacaacctccacttcccgggtt...</td>\n",
       "      <td>tcactctgtcacccaggctggaaccacaacctccacttcccgggtt...</td>\n",
       "      <td>[Step 1: The variant KEGG_1449 is a G&gt;A substi...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>1449 rows × 6 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "             ID                                           question  \\\n",
       "0        KEGG_1  Chromosome Number: 12\\nNetwork Definition of t...   \n",
       "1        KEGG_2  Chromosome Number: 12\\nNetwork Definition of t...   \n",
       "2        KEGG_3  Chromosome Number: 12\\nNetwork Definition of t...   \n",
       "3        KEGG_4  Chromosome Number: 12\\nNetwork Definition of t...   \n",
       "4        KEGG_5  Chromosome Number: 12\\nNetwork Definition of t...   \n",
       "...         ...                                                ...   \n",
       "1444  KEGG_1445  Chromosome Number: 19\\nNetwork Definition of t...   \n",
       "1445  KEGG_1446  Chromosome Number: 19\\nNetwork Definition of t...   \n",
       "1446  KEGG_1447  Chromosome Number: 16\\nNetwork Definition of t...   \n",
       "1447  KEGG_1448  Chromosome Number: 16\\nNetwork Definition of t...   \n",
       "1448  KEGG_1449  Chromosome Number: 16\\nNetwork Definition of t...   \n",
       "\n",
       "                        answer  \\\n",
       "0                     Melanoma   \n",
       "1                     Melanoma   \n",
       "2                     Melanoma   \n",
       "3                     Melanoma   \n",
       "4                     Melanoma   \n",
       "...                        ...   \n",
       "1444  Hepatocellular carcinoma   \n",
       "1445  Hepatocellular carcinoma   \n",
       "1446            Gastric cancer   \n",
       "1447            Gastric cancer   \n",
       "1448            Gastric cancer   \n",
       "\n",
       "                                     reference_sequence  \\\n",
       "0     TTCAAGGTAGTCCAGGGTATGTGGGTCCCATACTTTCGACGAAACA...   \n",
       "1     TTCAAGGTAGTCCAGGGTATGTGGGTCCCATACTTTCGACGAAACA...   \n",
       "2     TTCAAGGTAGTCCAGGGTATGTGGGTCCCATACTTTCGACGAAACA...   \n",
       "3     TCAAGGTAGTCCAGGGTATGTGGGTCCCATACTTTCGACGAAACAT...   \n",
       "4     TCAAGGTAGTCCAGGGTATGTGGGTCCCATACTTTCGACGAAACAT...   \n",
       "...                                                 ...   \n",
       "1444  gcactccagcctgggcaacagagcaagagagacagggtcttactct...   \n",
       "1445  ctcccaaagtgctgggattacaggcgtgagccactgggccctgcCC...   \n",
       "1446  ggctgggtgtggtggctcatgcctgtaatcccagcattttgggagg...   \n",
       "1447  tttgagatagggtttcactctgtcacccaggctggaaccacaacct...   \n",
       "1448  tcactctgtcacccaggctggaaccacaacctccacttcccgggtt...   \n",
       "\n",
       "                                       variant_sequence  \\\n",
       "0     TTCAAGGTAGTCCAGGGTATGTGGGTCCCATACTTTCGACGAAACA...   \n",
       "1     TTCAAGGTAGTCCAGGGTATGTGGGTCCCATACTTTCGACGAAACA...   \n",
       "2     TTCAAGGTAGTCCAGGGTATGTGGGTCCCATACTTTCGACGAAACA...   \n",
       "3     TCAAGGTAGTCCAGGGTATGTGGGTCCCATACTTTCGACGAAACAT...   \n",
       "4     TCAAGGTAGTCCAGGGTATGTGGGTCCCATACTTTCGACGAAACAT...   \n",
       "...                                                 ...   \n",
       "1444  gcactccagcctgggcaacagagcaagagagacagggtcttactct...   \n",
       "1445  ctcccaaagtgctgggattacaggcgtgagccactgggccctgcCC...   \n",
       "1446  ggctgggtgtggtggctcatgcctgtaatcccagcattttgggagg...   \n",
       "1447  tttgagatagggtttcactctgtcacccaggctggaaccacaacct...   \n",
       "1448  tcactctgtcacccaggctggaaccacaacctccacttcccgggtt...   \n",
       "\n",
       "                                              reasoning  \n",
       "0     [Step 1: The C>T mutation at position 57751646...  \n",
       "1     [Step 1: The C>A mutation at position 57751646...  \n",
       "2     [Step 1: The C>G mutation at position 57751646...  \n",
       "3     [Step 1: The G>A mutation at position 57751647...  \n",
       "4     [Step 1: The G>C mutation at position 57751647...  \n",
       "...                                                 ...  \n",
       "1444  [Step 1: The variant is a C>A substitution at ...  \n",
       "1445  [Step 1: The variant is a T>C substitution at ...  \n",
       "1446  [Step 1: The variant KEGG_1447 represents an A...  \n",
       "1447  [Step 1: The variant KEGG_1448 is a T>G substi...  \n",
       "1448  [Step 1: The variant KEGG_1449 is a G>A substi...  \n",
       "\n",
       "[1449 rows x 6 columns]"
      ]
     },
     "execution_count": 64,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "KEGG_2000"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "26c939b5-0768-4565-873a-10cba7396d99",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Create dataset structure (HuggingFace datasets optional)\n",
    "try:\n",
    "    from datasets import Dataset, DatasetDict\n",
    "    \n",
    "    # Create Hugging Face Datasets\n",
    "    train_dataset = Dataset.from_pandas(KEGG_2000)\n",
    "    \n",
    "    # Combine into a DatasetDict\n",
    "    dataset = DatasetDict({\n",
    "        \"train\": train_dataset,\n",
    "    })\n",
    "    \n",
    "    print(\"✅ HuggingFace dataset created\")\n",
    "    use_hf_datasets = True\n",
    "    \n",
    "except ImportError:\n",
    "    print(\"⚠️ HuggingFace datasets not available, using pandas only\")\n",
    "    dataset = KEGG_2000\n",
    "    train_dataset = KEGG_2000\n",
    "    use_hf_datasets = False\n",
    "\n",
    "print(f\"Final dataset contains {len(train_dataset)} samples\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 69,
   "id": "afa07e17-e86a-41d8-9db3-5df6d77443f8",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "DatasetDict({\n",
       "    train: Dataset({\n",
       "        features: ['ID', 'question', 'answer', 'reference_sequence', 'variant_sequence', 'reasoning'],\n",
       "        num_rows: 1449\n",
       "    })\n",
       "})"
      ]
     },
     "execution_count": 69,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "affe2720-e748-45d2-97d0-0baf1d6530ce",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Save final dataset locally instead of uploading to HuggingFace\n",
    "# Users can upload to their own repositories if needed\n",
    "\n",
    "output_file = \"kegg_variant_dataset_final.parquet\"\n",
    "dataset_info_file = \"dataset_info.json\"\n",
    "\n",
    "# Save dataset as Parquet for efficient storage\n",
    "train_dataset.to_parquet(output_file)\n",
    "print(f\"✅ Dataset saved to: {output_file}\")\n",
    "\n",
    "# Save dataset information\n",
    "dataset_info = {\n",
    "    \"name\": \"KEGG Variant Dataset\",\n",
    "    \"description\": \"Genetic variants with biological reasoning for disease association\",\n",
    "    \"total_samples\": len(train_dataset),\n",
    "    \"sequence_length\": f\"~{CONFIG['sequence_window']*2}bp\",\n",
    "    \"features\": list(train_dataset.column_names),\n",
    "    \"diseases\": len(set(disease)) if 'disease' in locals() else \"Unknown\",\n",
    "    \"created_by\": \"KEGG Data Processing Pipeline\",\n",
    "    \"version\": \"1.0\"\n",
    "}\n",
    "\n",
    "with open(dataset_info_file, 'w') as f:\n",
    "    json.dump(dataset_info, f, indent=2)\n",
    "    \n",
    "print(f\"✅ Dataset information saved to: {dataset_info_file}\")\n",
    "print(f\"\\nDataset ready for use:\")\n",
    "print(f\"  - Main dataset: {output_file}\")\n",
    "print(f\"  - Information: {dataset_info_file}\")\n",
    "print(f\"  - Samples: {len(train_dataset)}\")\n",
    "print(f\"  - Features: {train_dataset.column_names}\")\n",
    "\n",
    "print(\"\\n📝 To upload to HuggingFace Hub:\")\n",
    "print(\"dataset.push_to_hub('your-username/your-dataset-name')\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "5b448bd7-e256-4fad-ae95-dbe299d380f0",
   "metadata": {},
   "source": [
    "# KEGG Dataset with Alternative Window Size\n",
    "\n",
    "This section demonstrates creating the dataset with different sequence window parameters."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9fd609ca-6276-4425-997f-0589fe03f1ea",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}