{ "cells": [
 {
  "cell_type": "markdown",
  "id": "2e29f3a3-381c-4c16-853c-d73d38abb383",
  "metadata": { "libroFormatter": "formatter-string", "trusted": true },
  "source": [
   "# 1. 加载数据,查看数据格式\n",
   "# 2. 使用 data_transform.py 将 Uniprot_id 格式转为 Saprot 可以接受的 Foldseek Seq 格式\n",
   "# 3. 记录所有 Target_Uniprot_id 和 Compound_Smiles 及其 对应信息"
  ]
 },
 {
  "cell_type": "code",
  "execution_count": 1,
  "id": "bd7b0b18-c1f1-4f1c-be46-a3bd4686ca57",
  "metadata": {
   "execution": {
    "shell.execute_reply.end": "2025-12-29T11:57:38.962407Z",
    "shell.execute_reply.started": "2025-12-29T11:57:37.782798Z",
    "to_execute": "2025-12-29T11:57:37.691Z"
   },
   "isLargeOutputDisplay": true,
   "libroFormatter": "formatter-string",
   "trusted": true
  },
  "outputs": [
   {
    "data": {
     "text/plain": [
      "assay_id P00316\n",
      "target_id ROS1\n",
      "compound_id EB000590\n",
      "mode Binding\n",
      "mechanism Competition Binding\n",
      "outcome_is_active True\n",
      "outcome_potency_pxc50 11.8\n",
      "outcome_max_activity 99.3\n",
      "observed_max 100.0\n",
      "is_quantified True\n",
      "frequency_flag False\n",
      "viability_flag False\n",
      "pxc50_modifier >\n",
      "slope 0.6\n",
      "asymp_min 58.0\n",
      "asymp_max 99.3\n",
      "assay__technology TR-FRET\n",
      "target__class Kinase\n",
      "target__gene ROS1\n",
      "target__uniprot_id P08922\n",
      "target__is_mutant False\n",
      "target__wildtype_id ROS1\n",
      "target__name Proto-oncogene tyrosine-protein kinase ROS\n",
      "compound__name Lorlatinib\n",
      "compound__smiles C[C@H]1OC2=C(N)N=CC(=C2)C2=C(C#N)N(C)N=C2CN(C)...\n",
      "compound__drugbank_id DB12130\n",
      "compound__cas 1454846-35-5\n",
      "compound__unii OSP71S83EU\n",
      "compound__inchikey IIXWYSCJSQVBQM-LLVKDONJSA-N\n",
      "progressed True\n",
      "release 8\n",
      "Name: 0, dtype: object"
     ]
    },
    "execution_count": 1,
    "metadata": {},
    "output_type": "execute_result"
   }
  ],
  "source": [
   "import pandas as pd\n",
   "\n",
   "# Load the training split from its parquet file.\n",
   "data_path = 'drug_target_activity/train.parquet'\n",
   "df = pd.read_parquet(data_path)\n",
   "\n",
   "# Inspect a single record to understand the schema.\n",
   "df.iloc[0]"
  ]
 },
 { "cell_type": "code", "execution_count": 2,
"id": "21412976-18ab-44d4-b73d-18bb0a883f0f",
  "metadata": {
   "execution": {
    "shell.execute_reply.end": "2025-12-26T07:18:30.046621Z",
    "shell.execute_reply.started": "2025-12-26T07:18:30.043306Z",
    "to_execute": "2025-12-26T07:18:30.099Z"
   },
   "isLargeOutputDisplay": true,
   "libroFormatter": "formatter-string",
   "trusted": true
  },
  "outputs": [
   {
    "name": "stdout",
    "output_type": "stream",
    "text": [
     "正在加载数据集配置...\n",
     "------------------------------\n",
     "总数据量: 421894\n",
     "野生型 (False): 406527 条 (96.36%)\n",
     "突变体 (True) : 15367 条 (3.64%)\n",
     "------------------------------\n"
    ]
   }
  ],
  "source": [
   "def check_mutant_ratio(df):\n",
   "    \"\"\"Print wild-type vs. mutant row counts and percentages.\n",
   "\n",
   "    Expects `df` to carry a boolean 'target__is_mutant' column.\n",
   "    \"\"\"\n",
   "    print(\"正在加载数据集配置...\")\n",
   "    try:\n",
   "        counts = df['target__is_mutant'].value_counts()\n",
   "        total = len(df)\n",
   "\n",
   "        # counts.get(..., 0) guards against a split containing only one class.\n",
   "        false_ratio = (counts.get(False, 0) / total) * 100\n",
   "        true_ratio = (counts.get(True, 0) / total) * 100\n",
   "\n",
   "        print(\"-\" * 30)\n",
   "        print(f\"总数据量: {total}\")\n",
   "        print(f\"野生型 (False): {counts.get(False, 0)} 条 ({false_ratio:.2f}%)\")\n",
   "        print(f\"突变体 (True) : {counts.get(True, 0)} 条 ({true_ratio:.2f}%)\")\n",
   "        print(\"-\" * 30)\n",
   "    except Exception as e:\n",
   "        # Best-effort report: surface the problem without killing the notebook run.\n",
   "        print(f\"发生错误: {e}\")\n",
   "check_mutant_ratio(df)"
  ]
 },
 {
  "cell_type": "code",
  "execution_count": 6,
  "id": "1b7faa51-db68-4a52-823f-a4a27d44c142",
  "metadata": {
   "execution": {
    "shell.execute_reply.end": "2025-12-26T07:18:35.112954Z",
    "shell.execute_reply.started": "2025-12-26T07:18:35.104700Z",
    "to_execute": "2025-12-26T07:18:35.222Z"
   },
   "isLargeOutputDisplay": true,
   "libroFormatter": "formatter-string",
   "trusted": true
  },
  "outputs": [],
  "source": [
"# Build and save the uniprot_id -> foldseek seq map.\n",
   "# from dataset_transform import generate_and_save_foldseek_dict\n",
   "# uniprot_ids = get_unique_uniprot_ids(data_path)\n",
   "# NOTE: the scraping step ran on a different machine, so the two paths below are placeholders.\n",
   "map_save_path = 'drug_target_activity/protein_foldseek_seqs.json'\n",
   "foldseek_path = 'path/to/foldseek'\n",
   "# generate_and_save_foldseek_dict(uniprot_ids, map_save_path, foldseek_path)"
  ]
 },
 {
  "cell_type": "code",
  "execution_count": 8,
  "id": "51ddcbbb-0e0a-4fec-8c67-698b14ad8e34",
  "metadata": {
   "isLargeOutputDisplay": true,
   "libroFormatter": "formatter-string",
   "trusted": true
  },
  "outputs": [
   {
    "name": "stdout",
    "output_type": "stream",
    "text": [
     "Step 1: 原始数据加载完成,当前数据量: 421894\n",
     "Step 2: 筛选非突变体 (is_mutant=False) 后,当前数据量: 406527\n",
     "Step 3: Map文件加载完成,包含 138 个 ID 映射\n",
     "Step 4: 筛选 Uniprot ID 存在于 Map 中的数据后,当前数据量: 318516\n",
     "Step 5: 处理完成,最终文件已保存至: drug_target_activity/processed_train.parquet\n"
    ]
   }
  ],
  "source": [
   "import pandas as pd\n",
   "import json\n",
   "\n",
   "def build_dataset(data_path, foldseek_map_path, new_dataset_path):\n",
   "    '''\n",
   "    1. 打开data_path的parquet文件\n",
   "    2. 打开foldseek_map_path的json文件, 读取 dict, 其中 key:value 为 uniprot_id:foldseek seq\n",
   "    3. 筛选'target__is_mutant'为false的 row\n",
   "    4. 筛选dataset中'target__uniprot_id' 在 dict 的 key 中的 row, 并增加一列'target__foldseek_seq', 值为 dict 中对应的 value\n",
   "    5. 
保存newdataset到new_dataset_path\n",
   "    '''\n",
   "    df = pd.read_parquet(data_path)\n",
   "    print(f\"Step 1: 原始数据加载完成,当前数据量: {len(df)}\")\n",
   "\n",
   "    # Keep wild-type rows only.\n",
   "    df = df[df['target__is_mutant'] == False]\n",
   "    print(f\"Step 2: 筛选非突变体 (is_mutant=False) 后,当前数据量: {len(df)}\")\n",
   "\n",
   "    with open(foldseek_map_path, 'r') as f:\n",
   "        foldseek_map = json.load(f)\n",
   "    print(f\"Step 3: Map文件加载完成,包含 {len(foldseek_map)} 个 ID 映射\")\n",
   "\n",
   "    # .copy() avoids SettingWithCopyWarning on the column assignment below.\n",
   "    df = df[df['target__uniprot_id'].isin(foldseek_map.keys())].copy()\n",
   "    print(f\"Step 4: 筛选 Uniprot ID 存在于 Map 中的数据后,当前数据量: {len(df)}\")\n",
   "\n",
   "    df['target__foldseek_seq'] = df['target__uniprot_id'].map(foldseek_map)\n",
   "\n",
   "    try:\n",
   "        df.to_parquet(new_dataset_path)\n",
   "        print(f\"Step 5: 处理完成,最终文件已保存至: {new_dataset_path}\")\n",
   "    except Exception as e:\n",
   "        print(f\"保存文件失败: {e}\")\n",
   "\n",
   "# Example invocation.\n",
   "new_dataset_path = 'drug_target_activity/processed_train.parquet'\n",
   "build_dataset(data_path, map_save_path, new_dataset_path)"
  ]
 },
 {
  "cell_type": "code",
  "execution_count": null,
  "id": "e6a28872-f377-47ed-b3e8-903c8e25567e",
  "metadata": {
   "isLargeOutputDisplay": true,
   "libroFormatter": "formatter-string",
   "trusted": true
  },
  "outputs": [
   {
    "name": "stdout",
    "output_type": "stream",
    "text": [
     "创建目录: drug_target_activity/candidates\n",
     "正在读取文件: drug_target_activity/processed_train.parquet ...\n",
     "正在提取 Unique Target 信息...\n",
     "Target 信息已保存至: drug_target_activity/candidates/unique_targets.json (数量: 138)\n",
     "正在提取 Unique Compound 信息...\n",
     "Compound 信息已保存至: drug_target_activity/candidates/unique_compounds.json (数量: 1382)\n"
    ]
   }
  ],
  "source": [
   "import pandas as pd\n",
   "import json\n",
   "import os\n",
   "\n",
   "def extract_unique_entities(parquet_path, output_dir):\n",
   "    \"\"\"Read a parquet file, extract unique proteins (with foldseek seq)\n",
   "    and unique molecules, and save each as a JSON file under output_dir.\n",
   "    \"\"\"\n",
   "    \n",
   "    # 1. 
确保输出目录存在\n",
   "    if not os.path.exists(output_dir):\n",
   "        os.makedirs(output_dir)\n",
   "        print(f\"创建目录: {output_dir}\")\n",
   "\n",
   "    print(f\"正在读取文件: {parquet_path} ...\")\n",
   "    try:\n",
   "        df = pd.read_parquet(parquet_path)\n",
   "    except Exception as e:\n",
   "        print(f\"读取 Parquet 失败: {e}\")\n",
   "        return\n",
   "\n",
   "    # ==========================================\n",
   "    # 2. 处理 Proteins (Targets)\n",
   "    # ==========================================\n",
   "    print(\"正在提取 Unique Target 信息...\")\n",
   "\n",
   "    # The foldseek sequence travels with the uniprot id.\n",
   "    target_cols = ['target__uniprot_id', 'target__foldseek_seq', 'target__class', 'target__gene']\n",
   "    existing_target_cols = [c for c in target_cols if c in df.columns]\n",
   "\n",
   "    if 'target__uniprot_id' in existing_target_cols and 'target__foldseek_seq' in existing_target_cols:\n",
   "        # One row per uniprot id; assumes the seq is identical across duplicates,\n",
   "        # so only the first occurrence is kept.\n",
   "        target_df = df[existing_target_cols].dropna(subset=['target__uniprot_id'])\n",
   "        target_df = target_df.drop_duplicates(subset=['target__uniprot_id'])\n",
   "\n",
   "        # NaN -> None so json.dump emits null.\n",
   "        target_df = target_df.where(pd.notnull(target_df), None)\n",
   "\n",
   "        # Shape: { uniprot_id: { column: value, ... } }\n",
   "        target_data = target_df.set_index('target__uniprot_id').to_dict(orient='index')\n",
   "\n",
   "        target_out_path = os.path.join(output_dir, 'unique_targets.json')\n",
   "        with open(target_out_path, 'w', encoding='utf-8') as f:\n",
   "            json.dump(target_data, f, indent=4, ensure_ascii=False)\n",
   "        print(f\"Target 信息已保存至: {target_out_path} (数量: {len(target_data)})\")\n",
   "    else:\n",
   "        print(\"警告: 数据中缺少 'target__uniprot_id' 或 'target__foldseek_seq' 列,跳过 Target 提取。\")\n",
   "\n",
   "    # ==========================================\n",
   "    # 3. 
处理 Molecules (Compounds)\n",
   "    # ==========================================\n",
   "    print(\"正在提取 Unique Compound 信息...\")\n",
   "\n",
   "    compound_cols = [\n",
   "        'compound__smiles',\n",
   "        'compound__name',\n",
   "        'compound__drugbank_id',\n",
   "        'compound__cas',\n",
   "        'compound__unii',\n",
   "        'compound__inchikey'\n",
   "    ]\n",
   "\n",
   "    existing_compound_cols = [c for c in compound_cols if c in df.columns]\n",
   "\n",
   "    if 'compound__smiles' in existing_compound_cols:\n",
   "        # Drop rows with no SMILES, then deduplicate on the SMILES string itself.\n",
   "        mol_df = df[existing_compound_cols].dropna(subset=['compound__smiles'])\n",
   "        mol_df = mol_df.drop_duplicates(subset=['compound__smiles'])\n",
   "\n",
   "        # NaN -> None so json.dump emits null.\n",
   "        mol_df = mol_df.where(pd.notnull(mol_df), None)\n",
   "\n",
   "        # Shape: { SMILES: { column: value, ... } }\n",
   "        mol_data = mol_df.set_index('compound__smiles').to_dict(orient='index')\n",
   "\n",
   "        mol_out_path = os.path.join(output_dir, 'unique_compounds.json')\n",
   "        with open(mol_out_path, 'w', encoding='utf-8') as f:\n",
   "            json.dump(mol_data, f, indent=4, ensure_ascii=False)\n",
   "        print(f\"Compound 信息已保存至: {mol_out_path} (数量: {len(mol_data)})\")\n",
   "    else:\n",
   "        print(\"警告: 数据中缺少 'compound__smiles' 列,跳过 Compound 提取。\")\n",
   "\n",
   "# --- 使用示例 ---\n",
   "dataset_path = 'drug_target_activity/processed_train.parquet'\n",
   "output_directory = 'drug_target_activity/candidates'\n",
   "extract_unique_entities(dataset_path, output_directory)"
  ]
 }
],
"metadata": {
 "kernelspec": { "display_name": "base", "language": "python", "name": "python3" },
 "language_info": {
  "codemirror_mode": { "name": "ipython", "version": 3 },
  "file_extension": ".py",
  "mimetype": "text/x-python",
  "name": "python",
  "nbconvert_exporter": "python",
  "pygments_lexer": "ipython3",
  "version": "3.10.16"
 }
},
"nbformat": 4, "nbformat_minor": 5 }