{ "cells": [
 {
  "cell_type": "markdown",
  "id": "2e29f3a3-381c-4c16-853c-d73d38abb383",
  "metadata": { "libroFormatter": "formatter-string", "trusted": true },
  "source": [
   "# 1. 加载数据,查看数据格式\n",
   "# 2. 使用 data_transform.py 将 Uniprot_id 格式转为 Saprot 可以接受的 Foldseek Seq 格式\n",
   "# 3. 记录所有 Target_Uniprot_id 和 Compound_Smiles 及其 对应信息"
  ]
 },
 {
  "cell_type": "code",
  "execution_count": 1,
  "id": "bd7b0b18-c1f1-4f1c-be46-a3bd4686ca57",
  "metadata": {
   "execution": {
    "shell.execute_reply.end": "2025-12-29T11:57:38.962407Z",
    "shell.execute_reply.started": "2025-12-29T11:57:37.782798Z",
    "to_execute": "2025-12-29T11:57:37.691Z"
   },
   "isLargeOutputDisplay": true,
   "libroFormatter": "formatter-string",
   "trusted": true
  },
  "outputs": [
   {
    "data": {
     "text/plain": [
      "assay_id P00316\n",
      "target_id ROS1\n",
      "compound_id EB000590\n",
      "mode Binding\n",
      "mechanism Competition Binding\n",
      "outcome_is_active True\n",
      "outcome_potency_pxc50 11.8\n",
      "outcome_max_activity 99.3\n",
      "observed_max 100.0\n",
      "is_quantified True\n",
      "frequency_flag False\n",
      "viability_flag False\n",
      "pxc50_modifier >\n",
      "slope 0.6\n",
      "asymp_min 58.0\n",
      "asymp_max 99.3\n",
      "assay__technology TR-FRET\n",
      "target__class Kinase\n",
      "target__gene ROS1\n",
      "target__uniprot_id P08922\n",
      "target__is_mutant False\n",
      "target__wildtype_id ROS1\n",
      "target__name Proto-oncogene tyrosine-protein kinase ROS\n",
      "compound__name Lorlatinib\n",
      "compound__smiles C[C@H]1OC2=C(N)N=CC(=C2)C2=C(C#N)N(C)N=C2CN(C)...\n",
      "compound__drugbank_id DB12130\n",
      "compound__cas 1454846-35-5\n",
      "compound__unii OSP71S83EU\n",
      "compound__inchikey IIXWYSCJSQVBQM-LLVKDONJSA-N\n",
      "progressed True\n",
      "release 8\n",
      "Name: 0, dtype: object"
     ]
    },
    "execution_count": 1,
    "metadata": {},
    "output_type": "execute_result"
   }
  ],
  "source": [
   "import pandas as pd\n",
   "\n",
   "# Load the training split from its parquet file.\n",
   "data_path = 'drug_target_activity/train.parquet'\n",
   "df = pd.read_parquet(data_path)\n",
   "\n",
   "# Inspect a single record to understand the schema.\n",
   "df.iloc[0]"
  ]
 },
 { "cell_type": "code", "execution_count": 2,
"id": "21412976-18ab-44d4-b73d-18bb0a883f0f",
  "metadata": {
   "execution": {
    "shell.execute_reply.end": "2025-12-26T07:18:30.046621Z",
    "shell.execute_reply.started": "2025-12-26T07:18:30.043306Z",
    "to_execute": "2025-12-26T07:18:30.099Z"
   },
   "isLargeOutputDisplay": true,
   "libroFormatter": "formatter-string",
   "trusted": true
  },
  "outputs": [
   {
    "name": "stdout",
    "output_type": "stream",
    "text": [
     "正在加载数据集配置...\n",
     "------------------------------\n",
     "总数据量: 421894\n",
     "野生型 (False): 406527 条 (96.36%)\n",
     "突变体 (True) : 15367 条 (3.64%)\n",
     "------------------------------\n"
    ]
   }
  ],
  "source": [
   "def check_mutant_ratio(df):\n",
   "    \"\"\"Print wild-type vs. mutant row counts and percentages.\n",
   "\n",
   "    Expects `df` to carry a boolean 'target__is_mutant' column.\n",
   "    \"\"\"\n",
   "    print(\"正在加载数据集配置...\")\n",
   "    try:\n",
   "        counts = df['target__is_mutant'].value_counts()\n",
   "        total = len(df)\n",
   "\n",
   "        # counts.get(..., 0) guards against a split containing only one class.\n",
   "        false_ratio = (counts.get(False, 0) / total) * 100\n",
   "        true_ratio = (counts.get(True, 0) / total) * 100\n",
   "\n",
   "        print(\"-\" * 30)\n",
   "        print(f\"总数据量: {total}\")\n",
   "        print(f\"野生型 (False): {counts.get(False, 0)} 条 ({false_ratio:.2f}%)\")\n",
   "        print(f\"突变体 (True) : {counts.get(True, 0)} 条 ({true_ratio:.2f}%)\")\n",
   "        print(\"-\" * 30)\n",
   "    except Exception as e:\n",
   "        # Best-effort report: surface the problem without killing the notebook run.\n",
   "        print(f\"发生错误: {e}\")\n",
   "check_mutant_ratio(df)"
  ]
 },
 {
  "cell_type": "code",
  "execution_count": 6,
  "id": "1b7faa51-db68-4a52-823f-a4a27d44c142",
  "metadata": {
   "execution": {
    "shell.execute_reply.end": "2025-12-26T07:18:35.112954Z",
    "shell.execute_reply.started": "2025-12-26T07:18:35.104700Z",
    "to_execute": "2025-12-26T07:18:35.222Z"
   },
   "isLargeOutputDisplay": true,
   "libroFormatter": "formatter-string",
   "trusted": true
  },
  "outputs": [],
  "source": [
"# Build and save the uniprot_id -> foldseek seq map.\n",
   "# from dataset_transform import generate_and_save_foldseek_dict\n",
   "# uniprot_ids = get_unique_uniprot_ids(data_path)\n",
   "# NOTE: the scraping step ran on a different machine, so the two paths below are placeholders.\n",
   "map_save_path = 'drug_target_activity/protein_foldseek_seqs.json'\n",
   "foldseek_path = 'path/to/foldseek'\n",
   "# generate_and_save_foldseek_dict(uniprot_ids, map_save_path, foldseek_path)"
  ]
 },
 {
  "cell_type": "code",
  "execution_count": 8,
  "id": "51ddcbbb-0e0a-4fec-8c67-698b14ad8e34",
  "metadata": {
   "isLargeOutputDisplay": true,
   "libroFormatter": "formatter-string",
   "trusted": true
  },
  "outputs": [
   {
    "name": "stdout",
    "output_type": "stream",
    "text": [
     "Step 1: 原始数据加载完成,当前数据量: 421894\n",
     "Step 2: 筛选非突变体 (is_mutant=False) 后,当前数据量: 406527\n",
     "Step 3: Map文件加载完成,包含 138 个 ID 映射\n",
     "Step 4: 筛选 Uniprot ID 存在于 Map 中的数据后,当前数据量: 318516\n",
     "Step 5: 处理完成,最终文件已保存至: drug_target_activity/processed_train.parquet\n"
    ]
   }
  ],
  "source": [
   "import pandas as pd\n",
   "import json\n",
   "\n",
   "def build_dataset(data_path, foldseek_map_path, new_dataset_path):\n",
   "    '''\n",
   "    1. 打开data_path的parquet文件\n",
   "    2. 打开foldseek_map_path的json文件, 读取 dict, 其中 key:value 为 uniprot_id:foldseek seq\n",
   "    3. 筛选'target__is_mutant'为false的 row\n",
   "    4. 筛选dataset中'target__uniprot_id' 在 dict 的 key 中的 row, 并增加一列'target__foldseek_seq', 值为 dict 中对应的 value\n",
   "    5. 
保存newdataset到new_dataset_path\n",
   "    '''\n",
   "    df = pd.read_parquet(data_path)\n",
   "    print(f\"Step 1: 原始数据加载完成,当前数据量: {len(df)}\")\n",
   "\n",
   "    # Keep wild-type rows only.\n",
   "    df = df[df['target__is_mutant'] == False]\n",
   "    print(f\"Step 2: 筛选非突变体 (is_mutant=False) 后,当前数据量: {len(df)}\")\n",
   "\n",
   "    with open(foldseek_map_path, 'r') as f:\n",
   "        foldseek_map = json.load(f)\n",
   "    print(f\"Step 3: Map文件加载完成,包含 {len(foldseek_map)} 个 ID 映射\")\n",
   "\n",
   "    # .copy() avoids SettingWithCopyWarning on the column assignment below.\n",
   "    df = df[df['target__uniprot_id'].isin(foldseek_map.keys())].copy()\n",
   "    print(f\"Step 4: 筛选 Uniprot ID 存在于 Map 中的数据后,当前数据量: {len(df)}\")\n",
   "\n",
   "    df['target__foldseek_seq'] = df['target__uniprot_id'].map(foldseek_map)\n",
   "\n",
   "    try:\n",
   "        df.to_parquet(new_dataset_path)\n",
   "        print(f\"Step 5: 处理完成,最终文件已保存至: {new_dataset_path}\")\n",
   "    except Exception as e:\n",
   "        print(f\"保存文件失败: {e}\")\n",
   "\n",
   "# Example invocation.\n",
   "new_dataset_path = 'drug_target_activity/processed_train.parquet'\n",
   "build_dataset(data_path, map_save_path, new_dataset_path)"
  ]
 },
 {
  "cell_type": "code",
  "execution_count": null,
  "id": "e6a28872-f377-47ed-b3e8-903c8e25567e",
  "metadata": {
   "isLargeOutputDisplay": true,
   "libroFormatter": "formatter-string",
   "trusted": true
  },
  "outputs": [
   {
    "name": "stdout",
    "output_type": "stream",
    "text": [
     "创建目录: drug_target_activity/candidates\n",
     "正在读取文件: drug_target_activity/processed_train.parquet ...\n",
     "正在提取 Unique Target 信息...\n",
     "Target 信息已保存至: drug_target_activity/candidates/unique_targets.json (数量: 138)\n",
     "正在提取 Unique Compound 信息...\n",
     "Compound 信息已保存至: drug_target_activity/candidates/unique_compounds.json (数量: 1382)\n"
    ]
   }
  ],
  "source": [
   "import pandas as pd\n",
   "import json\n",
   "import os\n",
   "\n",
   "def extract_unique_entities(parquet_path, output_dir):\n",
   "    \"\"\"Read a parquet file, extract unique proteins (with foldseek seq)\n",
   "    and unique molecules, and save each as a JSON file under output_dir.\n",
   "    \"\"\"\n",
   "    \n",
   "    # 1. 
确保输出目录存在\n",
   "    if not os.path.exists(output_dir):\n",
   "        os.makedirs(output_dir)\n",
   "        print(f\"创建目录: {output_dir}\")\n",
   "\n",
   "    print(f\"正在读取文件: {parquet_path} ...\")\n",
   "    try:\n",
   "        df = pd.read_parquet(parquet_path)\n",
   "    except Exception as e:\n",
   "        print(f\"读取 Parquet 失败: {e}\")\n",
   "        return\n",
   "\n",
   "    # ==========================================\n",
   "    # 2. 处理 Proteins (Targets)\n",
   "    # ==========================================\n",
   "    print(\"正在提取 Unique Target 信息...\")\n",
   "\n",
   "    # The foldseek sequence travels with the uniprot id.\n",
   "    target_cols = ['target__uniprot_id', 'target__foldseek_seq', 'target__class', 'target__gene']\n",
   "    existing_target_cols = [c for c in target_cols if c in df.columns]\n",
   "\n",
   "    if 'target__uniprot_id' in existing_target_cols and 'target__foldseek_seq' in existing_target_cols:\n",
   "        # One row per uniprot id; assumes the seq is identical across duplicates,\n",
   "        # so only the first occurrence is kept.\n",
   "        target_df = df[existing_target_cols].dropna(subset=['target__uniprot_id'])\n",
   "        target_df = target_df.drop_duplicates(subset=['target__uniprot_id'])\n",
   "\n",
   "        # NaN -> None so json.dump emits null.\n",
   "        target_df = target_df.where(pd.notnull(target_df), None)\n",
   "\n",
   "        # Shape: { uniprot_id: { column: value, ... } }\n",
   "        target_data = target_df.set_index('target__uniprot_id').to_dict(orient='index')\n",
   "\n",
   "        target_out_path = os.path.join(output_dir, 'unique_targets.json')\n",
   "        with open(target_out_path, 'w', encoding='utf-8') as f:\n",
   "            json.dump(target_data, f, indent=4, ensure_ascii=False)\n",
   "        print(f\"Target 信息已保存至: {target_out_path} (数量: {len(target_data)})\")\n",
   "    else:\n",
   "        print(\"警告: 数据中缺少 'target__uniprot_id' 或 'target__foldseek_seq' 列,跳过 Target 提取。\")\n",
   "\n",
   "    # ==========================================\n",
   "    # 3. 
处理 Molecules (Compounds)\n",
   "    # ==========================================\n",
   "    print(\"正在提取 Unique Compound 信息...\")\n",
   "\n",
   "    compound_cols = [\n",
   "        'compound__smiles',\n",
   "        'compound__name',\n",
   "        'compound__drugbank_id',\n",
   "        'compound__cas',\n",
   "        'compound__unii',\n",
   "        'compound__inchikey'\n",
   "    ]\n",
   "\n",
   "    existing_compound_cols = [c for c in compound_cols if c in df.columns]\n",
   "\n",
   "    if 'compound__smiles' in existing_compound_cols:\n",
   "        # Drop rows with no SMILES, then deduplicate on the SMILES string itself.\n",
   "        mol_df = df[existing_compound_cols].dropna(subset=['compound__smiles'])\n",
   "        mol_df = mol_df.drop_duplicates(subset=['compound__smiles'])\n",
   "\n",
   "        # NaN -> None so json.dump emits null.\n",
   "        mol_df = mol_df.where(pd.notnull(mol_df), None)\n",
   "\n",
   "        # Shape: { SMILES: { column: value, ... } }\n",
   "        mol_data = mol_df.set_index('compound__smiles').to_dict(orient='index')\n",
   "\n",
   "        mol_out_path = os.path.join(output_dir, 'unique_compounds.json')\n",
   "        with open(mol_out_path, 'w', encoding='utf-8') as f:\n",
   "            json.dump(mol_data, f, indent=4, ensure_ascii=False)\n",
   "        print(f\"Compound 信息已保存至: {mol_out_path} (数量: {len(mol_data)})\")\n",
   "    else:\n",
   "        print(\"警告: 数据中缺少 'compound__smiles' 列,跳过 Compound 提取。\")\n",
   "\n",
   "# --- 使用示例 ---\n",
   "dataset_path = 'drug_target_activity/processed_train.parquet'\n",
   "output_directory = 'drug_target_activity/candidates'\n",
   "extract_unique_entities(dataset_path, output_directory)"
  ]
 }
],
"metadata": {
 "kernelspec": { "display_name": "base", "language": "python", "name": "python3" },
 "language_info": {
  "codemirror_mode": { "name": "ipython", "version": 3 },
  "file_extension": ".py",
  "mimetype": "text/x-python",
  "name": "python",
  "nbconvert_exporter": "python",
  "pygments_lexer": "ipython3",
  "version": "3.10.16"
 }
},
"nbformat": 4, "nbformat_minor": 5 }