File size: 16,236 Bytes
2180e31
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "2e29f3a3-381c-4c16-853c-d73d38abb383",
   "metadata": {
    "libroFormatter": "formatter-string",
    "trusted": true
   },
   "source": [
    "# 1. 加载数据,查看数据格式\n",
    "# 2. 使用 data_transform.py 将 Uniprot_id 格式转为 Saprot 可以接受的 Foldseek Seq 格式\n",
    "# 3. 记录所有 Target_Uniprot_id 和 Compound_Smiles 及其 对应信息"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "bd7b0b18-c1f1-4f1c-be46-a3bd4686ca57",
   "metadata": {
    "execution": {
     "shell.execute_reply.end": "2025-12-29T11:57:38.962407Z",
     "shell.execute_reply.started": "2025-12-29T11:57:37.782798Z",
     "to_execute": "2025-12-29T11:57:37.691Z"
    },
    "isLargeOutputDisplay": true,
    "libroFormatter": "formatter-string",
    "trusted": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "assay_id                                                            P00316\n",
       "target_id                                                             ROS1\n",
       "compound_id                                                       EB000590\n",
       "mode                                                               Binding\n",
       "mechanism                                              Competition Binding\n",
       "outcome_is_active                                                     True\n",
       "outcome_potency_pxc50                                                 11.8\n",
       "outcome_max_activity                                                  99.3\n",
       "observed_max                                                         100.0\n",
       "is_quantified                                                         True\n",
       "frequency_flag                                                       False\n",
       "viability_flag                                                       False\n",
       "pxc50_modifier                                                           >\n",
       "slope                                                                  0.6\n",
       "asymp_min                                                             58.0\n",
       "asymp_max                                                             99.3\n",
       "assay__technology                                                  TR-FRET\n",
       "target__class                                                       Kinase\n",
       "target__gene                                                          ROS1\n",
       "target__uniprot_id                                                  P08922\n",
       "target__is_mutant                                                    False\n",
       "target__wildtype_id                                                   ROS1\n",
       "target__name                    Proto-oncogene tyrosine-protein kinase ROS\n",
       "compound__name                                                  Lorlatinib\n",
       "compound__smiles         C[C@H]1OC2=C(N)N=CC(=C2)C2=C(C#N)N(C)N=C2CN(C)...\n",
       "compound__drugbank_id                                              DB12130\n",
       "compound__cas                                                 1454846-35-5\n",
       "compound__unii                                                  OSP71S83EU\n",
       "compound__inchikey                             IIXWYSCJSQVBQM-LLVKDONJSA-N\n",
       "progressed                                                            True\n",
       "release                                                                  8\n",
       "Name: 0, dtype: object"
      ]
     },
     "execution_count": 1,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import pandas as pd\n",
    "\n",
    "# Read the parquet file\n",
    "data_path = 'drug_target_activity/train.parquet'\n",
    "df = pd.read_parquet(data_path)\n",
    "\n",
    "# Inspect one example row\n",
    "df.iloc[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "21412976-18ab-44d4-b73d-18bb0a883f0f",
   "metadata": {
    "execution": {
     "shell.execute_reply.end": "2025-12-26T07:18:30.046621Z",
     "shell.execute_reply.started": "2025-12-26T07:18:30.043306Z",
     "to_execute": "2025-12-26T07:18:30.099Z"
    },
    "isLargeOutputDisplay": true,
    "libroFormatter": "formatter-string",
    "trusted": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "正在加载数据集配置...\n",
      "------------------------------\n",
      "总数据量: 421894\n",
      "野生型 (False): 406527 条 (96.36%)\n",
      "突变体 (True) : 15367 条 (3.64%)\n",
      "------------------------------\n"
     ]
    }
   ],
   "source": [
    "def check_mutant_ratio(df):\n",
    "    \"\"\"Print total row count and the wild-type (False) vs. mutant (True) split of df['target__is_mutant'].\"\"\"\n",
    "    print(\"正在加载数据集配置...\")\n",
    "    # Load the dataset (assumes you are logged in or the dataset is public)\n",
    "    # For counting only, streaming=True would avoid downloading the whole dataset, but a full tally still requires iterating\n",
    "    # Here we assume memory is sufficient and load the train split directly\n",
    "    try:\n",
    "        # ds = load_dataset(\"eve-bio/drug-target-activity\", split=\"train\")\n",
    "        \n",
    "        # Convert it to a pandas DataFrame for processing\n",
    "        # To save memory, keep only the target__is_mutant column\n",
    "        # print(\"正在转换数据...\")\n",
    "        # df = ds.select_columns([\"target__is_mutant\"]).to_pandas()\n",
    "        \n",
    "        # Count occurrences of each flag value\n",
    "        counts = df['target__is_mutant'].value_counts()\n",
    "        total = len(df)\n",
    "        \n",
    "        # Compute percentages\n",
    "        false_ratio = (counts.get(False, 0) / total) * 100\n",
    "        true_ratio = (counts.get(True, 0) / total) * 100\n",
    "        \n",
    "        print(\"-\" * 30)\n",
    "        print(f\"总数据量: {total}\")\n",
    "        print(f\"野生型 (False): {counts.get(False, 0)} 条 ({false_ratio:.2f}%)\")\n",
    "        print(f\"突变体 (True) : {counts.get(True, 0)} 条 ({true_ratio:.2f}%)\")\n",
    "        print(\"-\" * 30)\n",
    "    except Exception as e:\n",
    "        print(f\"发生错误: {e}\")\n",
    "check_mutant_ratio(df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "1b7faa51-db68-4a52-823f-a4a27d44c142",
   "metadata": {
    "execution": {
     "shell.execute_reply.end": "2025-12-26T07:18:35.112954Z",
     "shell.execute_reply.started": "2025-12-26T07:18:35.104700Z",
     "to_execute": "2025-12-26T07:18:35.222Z"
    },
    "isLargeOutputDisplay": true,
    "libroFormatter": "formatter-string",
    "trusted": true
   },
   "outputs": [],
   "source": [
    "# Build the uniprot_id -> foldseek seq map and save it\n",
    "# from dataset_transform import generate_and_save_foldseek_dict\n",
    "# uniprot_ids = get_unique_uniprot_ids(data_path)\n",
    "# The crawling step was done on another machine, so these two paths are not pinned down here\n",
    "map_save_path = 'drug_target_activity/protein_foldseek_seqs.json'\n",
    "foldseek_path = 'path/to/foldseek'\n",
    "# generate_and_save_foldseek_dict(uniprot_ids, map_save_path, foldseek_path)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "51ddcbbb-0e0a-4fec-8c67-698b14ad8e34",
   "metadata": {
    "isLargeOutputDisplay": true,
    "libroFormatter": "formatter-string",
    "trusted": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Step 1: 原始数据加载完成,当前数据量: 421894\n",
      "Step 2: 筛选非突变体 (is_mutant=False) 后,当前数据量: 406527\n",
      "Step 3: Map文件加载完成,包含 138 个 ID 映射\n",
      "Step 4: 筛选 Uniprot ID 存在于 Map 中的数据后,当前数据量: 318516\n",
      "Step 5: 处理完成,最终文件已保存至: drug_target_activity/processed_train.parquet\n"
     ]
    }
   ],
   "source": [
    "import pandas as pd\n",
    "import json\n",
    "\n",
    "def build_dataset(data_path, foldseek_map_path, new_dataset_path):\n",
    "    '''\n",
    "        1. Open the parquet file at data_path\n",
    "        2. Open the JSON file at foldseek_map_path and read a dict whose key:value pairs are uniprot_id:foldseek seq\n",
    "        3. Keep only rows where 'target__is_mutant' is False\n",
    "        4. Keep only rows whose 'target__uniprot_id' is among the dict keys, and add a 'target__foldseek_seq' column holding the mapped value\n",
    "        5. Save the new dataset to new_dataset_path\n",
    "    '''\n",
    "    df = pd.read_parquet(data_path)\n",
    "    print(f\"Step 1: 原始数据加载完成,当前数据量: {len(df)}\")\n",
    "\n",
    "    df = df[df['target__is_mutant'] == False]\n",
    "    print(f\"Step 2: 筛选非突变体 (is_mutant=False) 后,当前数据量: {len(df)}\")\n",
    "\n",
    "    with open(foldseek_map_path, 'r') as f:\n",
    "        foldseek_map = json.load(f)\n",
    "    print(f\"Step 3: Map文件加载完成,包含 {len(foldseek_map)} 个 ID 映射\")\n",
    "\n",
    "    df = df[df['target__uniprot_id'].isin(foldseek_map.keys())].copy()\n",
    "    print(f\"Step 4: 筛选 Uniprot ID 存在于 Map 中的数据后,当前数据量: {len(df)}\")\n",
    "\n",
    "    df['target__foldseek_seq'] = df['target__uniprot_id'].map(foldseek_map)\n",
    "\n",
    "    try:\n",
    "        df.to_parquet(new_dataset_path)\n",
    "        print(f\"Step 5: 处理完成,最终文件已保存至: {new_dataset_path}\")\n",
    "    except Exception as e:\n",
    "        print(f\"保存文件失败: {e}\")\n",
    "\n",
    "# Example invocation (for testing)\n",
    "new_dataset_path = 'drug_target_activity/processed_train.parquet'\n",
    "build_dataset(data_path, map_save_path, new_dataset_path)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e6a28872-f377-47ed-b3e8-903c8e25567e",
   "metadata": {
    "isLargeOutputDisplay": true,
    "libroFormatter": "formatter-string",
    "trusted": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "创建目录: drug_target_activity/candidates\n",
      "正在读取文件: drug_target_activity/processed_train.parquet ...\n",
      "正在提取 Unique Target 信息...\n",
      "Target 信息已保存至: drug_target_activity/candidates/unique_targets.json (数量: 138)\n",
      "正在提取 Unique Compound 信息...\n",
      "Compound 信息已保存至: drug_target_activity/candidates/unique_compounds.json (数量: 1382)\n"
     ]
    }
   ],
   "source": [
    "import pandas as pd\n",
    "import json\n",
    "import os\n",
    "\n",
    "def extract_unique_entities(parquet_path, output_dir):\n",
    "    \"\"\"\n",
    "    Read a Parquet file, extract the unique Protein (incl. foldseek seq) and Molecule records, and save them as JSON.\n",
    "    \"\"\"\n",
    "    \n",
    "    # 1. Make sure the output directory exists\n",
    "    if not os.path.exists(output_dir):\n",
    "        os.makedirs(output_dir)\n",
    "        print(f\"创建目录: {output_dir}\")\n",
    "\n",
    "    print(f\"正在读取文件: {parquet_path} ...\")\n",
    "    try:\n",
    "        df = pd.read_parquet(parquet_path)\n",
    "    except Exception as e:\n",
    "        print(f\"读取 Parquet 失败: {e}\")\n",
    "        return\n",
    "\n",
    "    # ==========================================\n",
    "    # 2. Process Proteins (Targets)\n",
    "    # ==========================================\n",
    "    print(\"正在提取 Unique Target 信息...\")\n",
    "    \n",
    "    # [Change] 'target__foldseek_seq' added to the extracted columns\n",
    "    target_cols = ['target__uniprot_id', 'target__foldseek_seq', 'target__class', 'target__gene']\n",
    "    \n",
    "    # Check which columns actually exist\n",
    "    existing_target_cols = [c for c in target_cols if c in df.columns]\n",
    "    \n",
    "    if 'target__uniprot_id' in existing_target_cols and 'target__foldseek_seq' in existing_target_cols:\n",
    "        # Select columns -> drop rows with an empty ID -> deduplicate by ID\n",
    "        # Note: assumes every row with the same ID carries the same seq; only the first is kept\n",
    "        target_df = df[existing_target_cols].dropna(subset=['target__uniprot_id'])\n",
    "        target_df = target_df.drop_duplicates(subset=['target__uniprot_id'])\n",
    "        \n",
    "        # Replace NaN with None\n",
    "        target_df = target_df.where(pd.notnull(target_df), None)\n",
    "        \n",
    "        # Convert to a dict: \n",
    "        # { \n",
    "        #   \"UniprotID\": { \n",
    "        #       \"target__foldseek_seq\": \"...\", \n",
    "        #       \"target__class\": \"...\", \n",
    "        #       ... \n",
    "        #    } \n",
    "        # }\n",
    "        target_data = target_df.set_index('target__uniprot_id').to_dict(orient='index')\n",
    "        \n",
    "        target_out_path = os.path.join(output_dir, 'unique_targets.json')\n",
    "        with open(target_out_path, 'w', encoding='utf-8') as f:\n",
    "            json.dump(target_data, f, indent=4, ensure_ascii=False)\n",
    "        print(f\"Target 信息已保存至: {target_out_path} (数量: {len(target_data)})\")\n",
    "    else:\n",
    "        print(\"警告: 数据中缺少 'target__uniprot_id' 或 'target__foldseek_seq' 列,跳过 Target 提取。\")\n",
    "\n",
    "    # ==========================================\n",
    "    # 3. Process Molecules (Compounds)\n",
    "    # ==========================================\n",
    "    print(\"正在提取 Unique Compound 信息...\")\n",
    "    \n",
    "    compound_cols = [\n",
    "        'compound__smiles', \n",
    "        'compound__name', \n",
    "        'compound__drugbank_id', \n",
    "        'compound__cas', \n",
    "        'compound__unii', \n",
    "        'compound__inchikey'\n",
    "    ]\n",
    "    \n",
    "    existing_compound_cols = [c for c in compound_cols if c in df.columns]\n",
    "    \n",
    "    if 'compound__smiles' in existing_compound_cols:\n",
    "        # Select columns -> drop rows with an empty SMILES -> deduplicate by SMILES\n",
    "        mol_df = df[existing_compound_cols].dropna(subset=['compound__smiles'])\n",
    "        mol_df = mol_df.drop_duplicates(subset=['compound__smiles'])\n",
    "        \n",
    "        # Replace NaN with None\n",
    "        mol_df = mol_df.where(pd.notnull(mol_df), None)\n",
    "        \n",
    "        # Convert to a dict: { \"SMILES\": { \"compound__name\": \"...\", ... } }\n",
    "        mol_data = mol_df.set_index('compound__smiles').to_dict(orient='index')\n",
    "        \n",
    "        mol_out_path = os.path.join(output_dir, 'unique_compounds.json')\n",
    "        with open(mol_out_path, 'w', encoding='utf-8') as f:\n",
    "            json.dump(mol_data, f, indent=4, ensure_ascii=False)\n",
    "        print(f\"Compound 信息已保存至: {mol_out_path} (数量: {len(mol_data)})\")\n",
    "    else:\n",
    "        print(\"警告: 数据中缺少 'compound__smiles' 列,跳过 Compound 提取。\")\n",
    "\n",
    "# --- Usage example ---\n",
    "dataset_path = 'drug_target_activity/processed_train.parquet'\n",
    "output_directory = 'drug_target_activity/candidates'\n",
    "extract_unique_entities(dataset_path, output_directory)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ee0d150f",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "base",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.16"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}