AlexSychovUN committed on
Commit
da5d904
·
1 Parent(s): da7c0f0

Added files

Browse files
.gitignore CHANGED
@@ -1,2 +1 @@
1
- .idea
2
- .ipynb_checkpoints
 
1
+ .idea
 
GNN_classification/Dataset_Preparation.py CHANGED
@@ -1,10 +1,49 @@
 
1
  import torch
2
  import pandas as pd
3
-
4
- from rdkit import Chem
5
  from torch_geometric.data import Data
6
  from torch.utils.data import Dataset
7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8
 
9
  class SmilesDataset(Dataset):
10
  def __init__(self, dataframe):
@@ -22,8 +61,9 @@ class SmilesDataset(Dataset):
22
  if mol is None: return None
23
 
24
  # Nodes
25
- atom_features = [[atom.GetAtomicNum()] for atom in mol.GetAtoms()]
26
- x = torch.tensor(atom_features, dtype=torch.float)
 
27
 
28
  # Edges
29
  edge_indexes = []
@@ -35,10 +75,8 @@ class SmilesDataset(Dataset):
35
 
36
  # t - transpose, [num_of_edges, 2] -> [2, num_of_edges]
37
  # contiguous - take the virtually transposed tensor and make its physical copy and lay bytes sequentially
38
- if not edge_indexes:
39
- edge_index = torch.empty((2, 0), dtype=torch.long)
40
- else:
41
- edge_index = torch.tensor(edge_indexes, dtype=torch.long).t().contiguous()
42
 
43
 
44
  # Label
@@ -54,10 +92,13 @@ if __name__ == "__main__":
54
  test_dataset = pd.read_csv(
55
  "dataset/classification/data_test.txt", sep=" ", header=None, names=columns
56
  )
 
 
57
 
58
  train_dataset = SmilesDataset(train_dataset)
59
  test_dataset = SmilesDataset(test_dataset)
60
 
 
61
  print(len(train_dataset))
62
  print(len(test_dataset))
63
 
 
1
+ import numpy as np
2
  import torch
3
  import pandas as pd
4
+ from rdkit import Chem, rdBase
 
5
  from torch_geometric.data import Data
6
  from torch.utils.data import Dataset
7
 
8
+ rdBase.DisableLog('rdApp.*')
9
+
10
+
11
+ def one_of_k_encoding(x, allowable_set):
12
+ # last position - unknown
13
+ if x not in allowable_set:
14
+ x = allowable_set[-1]
15
+ return list(map(lambda s: x == s, allowable_set))
16
+
17
+
18
+ def get_atom_features(atom):
19
+ symbols_list = ['C', 'N', 'O', 'S', 'F', 'Si', 'P', 'Cl', 'Br', 'Mg', 'Na', 'Ca', 'Fe', 'As', 'Al', 'I', 'B', 'V', 'K', 'Tl', 'Yb', 'Sb', 'Sn', 'Ag', 'Pd', 'Co', 'Se', 'Ti', 'Zn', 'H', 'Li', 'Ge', 'Cu', 'Au', 'Ni', 'Cd', 'In', 'Mn', 'Zr', 'Cr', 'Pt', 'Hg', 'Pb', 'Unknown']
20
+ degrees_list = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
21
+ numhs_list = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
22
+ implicit_valences_list = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
23
+ return np.array(
24
+ # Type of atom (Symbol)
25
+ one_of_k_encoding(atom.GetSymbol(), symbols_list) +
26
+ # Number of neighbours (Degree)
27
+ one_of_k_encoding(atom.GetDegree(), degrees_list) +
28
+ # Number of hydrogen atoms (Implicit Hs) - bond donors
29
+ one_of_k_encoding(atom.GetTotalNumHs(), numhs_list) +
30
+ # Valence - chemical potential
31
+ one_of_k_encoding(atom.GetImplicitValence(), implicit_valences_list) +
32
+ # Hybridization - so important for 3d structure, sp2 - Trigonal planar, sp3 - Tetrahedral
33
+ one_of_k_encoding(atom.GetHybridization(), [
34
+ Chem.rdchem.HybridizationType.SP,
35
+ Chem.rdchem.HybridizationType.SP2,
36
+ Chem.rdchem.HybridizationType.SP3,
37
+ Chem.rdchem.HybridizationType.SP3D,
38
+ Chem.rdchem.HybridizationType.SP3D2,
39
+ 'other']) +
40
+ # Aromaticity (Boolean)
41
+ [atom.GetIsAromatic()]
42
+
43
+
44
+ )
45
+
46
+
47
 
48
  class SmilesDataset(Dataset):
49
  def __init__(self, dataframe):
 
61
  if mol is None: return None
62
 
63
  # Nodes
64
+ atom_features = [get_atom_features(atom) for atom in mol.GetAtoms()]
65
+ x = torch.tensor(np.array(atom_features), dtype=torch.float)
66
+
67
 
68
  # Edges
69
  edge_indexes = []
 
75
 
76
  # t - transpose, [num_of_edges, 2] -> [2, num_of_edges]
77
  # contiguous - take the virtually transposed tensor and make its physical copy and lay bytes sequentially
78
+
79
+ edge_index = torch.tensor(edge_indexes, dtype=torch.long).t().contiguous()
 
 
80
 
81
 
82
  # Label
 
92
  test_dataset = pd.read_csv(
93
  "dataset/classification/data_test.txt", sep=" ", header=None, names=columns
94
  )
95
+ train_dataset.to_csv("dataset/classification/data_train.csv", index=False)
96
+ test_dataset.to_csv("dataset/classification/data_test.csv", index=False)
97
 
98
  train_dataset = SmilesDataset(train_dataset)
99
  test_dataset = SmilesDataset(test_dataset)
100
 
101
+
102
  print(len(train_dataset))
103
  print(len(test_dataset))
104
 
GNN_classification/dataset/classification/EDA.ipynb ADDED
@@ -0,0 +1,178 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "id": "initial_id",
6
+ "metadata": {
7
+ "collapsed": true,
8
+ "ExecuteTime": {
9
+ "end_time": "2025-12-04T17:09:59.971023Z",
10
+ "start_time": "2025-12-04T17:09:59.487573Z"
11
+ }
12
+ },
13
+ "source": "import pandas as pd",
14
+ "outputs": [],
15
+ "execution_count": 1
16
+ },
17
+ {
18
+ "metadata": {
19
+ "ExecuteTime": {
20
+ "end_time": "2025-12-04T17:10:17.852208Z",
21
+ "start_time": "2025-12-04T17:10:17.820499Z"
22
+ }
23
+ },
24
+ "cell_type": "code",
25
+ "source": "train_dataset = pd.read_csv(\"data_train.csv\")",
26
+ "id": "5602ccb4aefc74b1",
27
+ "outputs": [],
28
+ "execution_count": 2
29
+ },
30
+ {
31
+ "metadata": {
32
+ "ExecuteTime": {
33
+ "end_time": "2025-12-04T17:10:28.992611Z",
34
+ "start_time": "2025-12-04T17:10:28.969929Z"
35
+ }
36
+ },
37
+ "cell_type": "code",
38
+ "source": "train_dataset.describe()",
39
+ "id": "426ec6e722b80a8a",
40
+ "outputs": [
41
+ {
42
+ "data": {
43
+ "text/plain": [
44
+ " label\n",
45
+ "count 37720.000000\n",
46
+ "mean 0.035260\n",
47
+ "std 0.184438\n",
48
+ "min 0.000000\n",
49
+ "25% 0.000000\n",
50
+ "50% 0.000000\n",
51
+ "75% 0.000000\n",
52
+ "max 1.000000"
53
+ ],
54
+ "text/html": [
55
+ "<div>\n",
56
+ "<style scoped>\n",
57
+ " .dataframe tbody tr th:only-of-type {\n",
58
+ " vertical-align: middle;\n",
59
+ " }\n",
60
+ "\n",
61
+ " .dataframe tbody tr th {\n",
62
+ " vertical-align: top;\n",
63
+ " }\n",
64
+ "\n",
65
+ " .dataframe thead th {\n",
66
+ " text-align: right;\n",
67
+ " }\n",
68
+ "</style>\n",
69
+ "<table border=\"1\" class=\"dataframe\">\n",
70
+ " <thead>\n",
71
+ " <tr style=\"text-align: right;\">\n",
72
+ " <th></th>\n",
73
+ " <th>label</th>\n",
74
+ " </tr>\n",
75
+ " </thead>\n",
76
+ " <tbody>\n",
77
+ " <tr>\n",
78
+ " <th>count</th>\n",
79
+ " <td>37720.000000</td>\n",
80
+ " </tr>\n",
81
+ " <tr>\n",
82
+ " <th>mean</th>\n",
83
+ " <td>0.035260</td>\n",
84
+ " </tr>\n",
85
+ " <tr>\n",
86
+ " <th>std</th>\n",
87
+ " <td>0.184438</td>\n",
88
+ " </tr>\n",
89
+ " <tr>\n",
90
+ " <th>min</th>\n",
91
+ " <td>0.000000</td>\n",
92
+ " </tr>\n",
93
+ " <tr>\n",
94
+ " <th>25%</th>\n",
95
+ " <td>0.000000</td>\n",
96
+ " </tr>\n",
97
+ " <tr>\n",
98
+ " <th>50%</th>\n",
99
+ " <td>0.000000</td>\n",
100
+ " </tr>\n",
101
+ " <tr>\n",
102
+ " <th>75%</th>\n",
103
+ " <td>0.000000</td>\n",
104
+ " </tr>\n",
105
+ " <tr>\n",
106
+ " <th>max</th>\n",
107
+ " <td>1.000000</td>\n",
108
+ " </tr>\n",
109
+ " </tbody>\n",
110
+ "</table>\n",
111
+ "</div>"
112
+ ]
113
+ },
114
+ "execution_count": 3,
115
+ "metadata": {},
116
+ "output_type": "execute_result"
117
+ }
118
+ ],
119
+ "execution_count": 3
120
+ },
121
+ {
122
+ "metadata": {
123
+ "ExecuteTime": {
124
+ "end_time": "2025-12-04T17:11:30.023817Z",
125
+ "start_time": "2025-12-04T17:11:30.012904Z"
126
+ }
127
+ },
128
+ "cell_type": "code",
129
+ "source": "train_dataset['label'].value_counts()",
130
+ "id": "355c3ed8e5f76bbf",
131
+ "outputs": [
132
+ {
133
+ "data": {
134
+ "text/plain": [
135
+ "label\n",
136
+ "0 36390\n",
137
+ "1 1330\n",
138
+ "Name: count, dtype: int64"
139
+ ]
140
+ },
141
+ "execution_count": 4,
142
+ "metadata": {},
143
+ "output_type": "execute_result"
144
+ }
145
+ ],
146
+ "execution_count": 4
147
+ },
148
+ {
149
+ "metadata": {},
150
+ "cell_type": "code",
151
+ "outputs": [],
152
+ "execution_count": null,
153
+ "source": "",
154
+ "id": "a88bb26653a0eb02"
155
+ }
156
+ ],
157
+ "metadata": {
158
+ "kernelspec": {
159
+ "display_name": "Python 3",
160
+ "language": "python",
161
+ "name": "python3"
162
+ },
163
+ "language_info": {
164
+ "codemirror_mode": {
165
+ "name": "ipython",
166
+ "version": 2
167
+ },
168
+ "file_extension": ".py",
169
+ "mimetype": "text/x-python",
170
+ "name": "python",
171
+ "nbconvert_exporter": "python",
172
+ "pygments_lexer": "ipython2",
173
+ "version": "2.7.6"
174
+ }
175
+ },
176
+ "nbformat": 4,
177
+ "nbformat_minor": 5
178
+ }
GNN_classification/training.py CHANGED
@@ -1,8 +1,7 @@
1
  import torch
2
- import torch.nn as nn
3
- import torch.nn.functional as F
4
  import pandas as pd
5
- from rdkit import Chem
6
 
7
  from torch_geometric.loader import DataLoader
8
 
@@ -61,6 +60,9 @@ if __name__ == "__main__":
61
  train_dataset = SmilesDataset(train_dataset)
62
  test_dataset = SmilesDataset(test_dataset)
63
 
 
 
 
64
  num_node_features = train_dataset[0].x.shape[1]
65
  num_classes = 2
66
 
@@ -71,7 +73,9 @@ if __name__ == "__main__":
71
  train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
72
  test_loader = DataLoader(test_dataset, batch_size=32, shuffle=True)
73
 
74
- model = GNNClassifier(input_dim=1, output_dim=2, hidden_channels=16).to(DEVICE)
 
 
75
 
76
  optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
77
  criterion = torch.nn.CrossEntropyLoss()
@@ -80,7 +84,10 @@ if __name__ == "__main__":
80
  print("Start Training")
81
 
82
  for epoch in range(1, EPOCHS + 1):
83
- train_loss = train_epoch(model, train_loader, optimizer, criterion)
84
 
85
- train_acc = evaluate(model, train_loader)
86
  print(f"Epoch: {epoch}, Loss: {train_loss}, Train Accuracy: {train_acc}")
 
 
 
 
1
  import torch
2
+
 
3
  import pandas as pd
4
+
5
 
6
  from torch_geometric.loader import DataLoader
7
 
 
60
  train_dataset = SmilesDataset(train_dataset)
61
  test_dataset = SmilesDataset(test_dataset)
62
 
63
+ train_dataset = [data for data in train_dataset if data is not None]
64
+ test_dataset = [data for data in test_dataset if data is not None]
65
+
66
  num_node_features = train_dataset[0].x.shape[1]
67
  num_classes = 2
68
 
 
73
  train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
74
  test_loader = DataLoader(test_dataset, batch_size=32, shuffle=True)
75
 
76
+
77
+
78
+ model = GNNClassifier(input_dim=num_node_features, output_dim=num_classes, hidden_channels=16).to(DEVICE)
79
 
80
  optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
81
  criterion = torch.nn.CrossEntropyLoss()
 
84
  print("Start Training")
85
 
86
  for epoch in range(1, EPOCHS + 1):
87
+ train_loss = train(model, train_loader, optimizer, criterion)
88
 
89
+ train_acc = test(model, train_loader)
90
  print(f"Epoch: {epoch}, Loss: {train_loss}, Train Accuracy: {train_acc}")
91
+
92
+ test_acc = test(model, test_loader)
93
+ print(f"Test Accuracy: {test_acc}")