AOZ2025 committed on
Commit
dd98fd2
·
verified ·
1 Parent(s): e9a01a7

Upload 4 files

Browse files
Utils/__pycache__/model_utils.cpython-312.pyc CHANGED
Binary files a/Utils/__pycache__/model_utils.cpython-312.pyc and b/Utils/__pycache__/model_utils.cpython-312.pyc differ
 
Utils/__pycache__/utils.cpython-312.pyc CHANGED
Binary files a/Utils/__pycache__/utils.cpython-312.pyc and b/Utils/__pycache__/utils.cpython-312.pyc differ
 
Utils/model_utils.py CHANGED
@@ -1,427 +1,427 @@
1
- import torch
2
- import torch.nn as nn
3
- from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
4
- from sklearn.impute import SimpleImputer
5
- import pandas as pd
6
- import numpy as np
7
- import matplotlib.pyplot as plt
8
- from sklearn.model_selection import train_test_split
9
- from sklearn.metrics import accuracy_score, classification_report
10
- from sklearn.utils.class_weight import compute_class_weight
11
- from torch.amp import autocast, GradScaler
12
- from torch.utils.data import TensorDataset, DataLoader
13
- from torch.nn.utils import clip_grad_norm_
14
- from collections import Counter
15
- import torch
16
- import torch.nn as nn
17
- import os
18
- import torch.optim as optim
19
-
20
- # Define the ImprovedTagClassifier class for tag prediction
21
class ImprovedTagClassifier(nn.Module):
    """MLP tag classifier: three hidden blocks (512/256/128) with batch norm,
    LeakyReLU, dropout, and a 512->128 skip path from block 1 into block 3."""

    def __init__(self, input_size, output_size, dropout_rate=0.4):
        super(ImprovedTagClassifier, self).__init__()

        # Hidden layers paired with their batch-norm stages.
        self.fc1 = nn.Linear(input_size, 512)
        self.bn1 = nn.BatchNorm1d(512)
        self.fc2 = nn.Linear(512, 256)
        self.bn2 = nn.BatchNorm1d(256)
        self.fc3 = nn.Linear(256, 128)
        self.bn3 = nn.BatchNorm1d(128)

        # Final projection to class logits.
        self.fc4 = nn.Linear(128, output_size)

        # Regularization and activation shared by all blocks.
        self.dropout = nn.Dropout(dropout_rate)
        self.leaky_relu = nn.LeakyReLU(0.1)

        # Projects block-1 activations so they can be added into block 3.
        self.skip1_3 = nn.Linear(512, 128)

        self._initialize_weights()

    def _initialize_weights(self):
        """Kaiming-init every linear layer; identity-init every batch norm."""
        for module in self.modules():
            if isinstance(module, nn.Linear):
                # Kaiming init matches the LeakyReLU nonlinearity used above.
                nn.init.kaiming_normal_(module.weight, mode='fan_out', nonlinearity='leaky_relu')
                if module.bias is not None:
                    nn.init.constant_(module.bias, 0)
            elif isinstance(module, nn.BatchNorm1d):
                nn.init.constant_(module.weight, 1)
                nn.init.constant_(module.bias, 0)

    def forward(self, x):
        """Return class logits for a batch of feature vectors."""
        # Block 1: linear -> bn -> activation -> dropout.
        h1 = self.dropout(self.leaky_relu(self.bn1(self.fc1(x))))
        # Block 2: same pattern, 512 -> 256.
        h2 = self.dropout(self.leaky_relu(self.bn2(self.fc2(h1))))
        # Block 3: 256 -> 128 plus the residual-style skip from block 1,
        # normalized/activated/dropped after the addition (as in the original).
        h3 = self.fc3(h2) + self.skip1_3(h1)
        h3 = self.dropout(self.leaky_relu(self.bn3(h3)))
        # Logits — no softmax here; the loss applies it.
        return self.fc4(h3)
88
-
89
class FocalLoss(nn.Module):
    """Focal loss: cross-entropy modulated by (1 - p_t)^gamma so that
    easy, well-classified samples contribute less — helps class imbalance."""

    def __init__(self, weight=None, gamma=2.0, reduction='mean'):
        super(FocalLoss, self).__init__()
        self.weight = weight        # optional per-class weights, forwarded to CE
        self.gamma = gamma          # focusing exponent; gamma=0 reduces to plain CE
        self.reduction = reduction  # 'mean' | 'sum' | anything else -> per-sample
        # Per-sample CE so the focal modulation can be applied element-wise.
        self.ce_loss = nn.CrossEntropyLoss(weight=weight, reduction='none')

    def forward(self, inputs, targets):
        """Return the focal loss between logits `inputs` and class indices `targets`."""
        per_sample_ce = self.ce_loss(inputs, targets)
        # p_t is the model's (weighted) probability for the true class.
        p_t = torch.exp(-per_sample_ce)
        modulated = per_sample_ce * (1 - p_t) ** self.gamma

        if self.reduction == 'sum':
            return modulated.sum()
        if self.reduction == 'mean':
            return modulated.mean()
        return modulated
111
-
112
class MultiLevelTagClassifier:
    """Hierarchical tag classifier: one ImprovedTagClassifier per parent tag.

    Each parent tag in `tag_hierarchy` gets its own sub-classifier trained on
    only the rows whose `tag` column falls in that parent's subtag list.  At
    prediction time, a coarse base prediction routes the sample to the right
    sub-classifier (see `predict_hierarchical`).
    """

    def __init__(self, device='cuda'):
        # Fall back to CPU when CUDA is unavailable.
        self.device = torch.device(device if torch.cuda.is_available() else 'cpu')
        self.models = {}          # parent tag -> trained sub-classifier
        self.preprocessors = {}   # parent tag -> fitted preprocessing objects
        self.label_encoders = {}  # parent tag -> fitted LabelEncoder

        # Parent tag -> the fine-grained tags its sub-classifier distinguishes.
        self.tag_hierarchy = {
            'DIV': ['DIV', 'LIST', 'CARD', 'FORM'],
            'P': ['P', 'LABEL', 'LI', 'A'],
            'INPUT': ['INPUT', 'DROPDOWN'],
            'ICON': ['ICON', 'CHECKBOX', 'RADIO'],
        }
        print(f"Using device: {self.device}")

    def prepare_data_for_subtask(self, df, parent_tag, subtags):
        """Filter `df` to this parent's subtags and fit all preprocessing.

        Returns (X_processed, y_encoded, preprocessors, categorical_cols,
        continuous_cols, label_encoder), or six Nones when no rows match.
        """
        # Keep only the rows belonging to this parent tag's subtags.
        filtered_df = df[df['tag'].isin(subtags)].copy()
        print(f"\n=== Preparing data for {parent_tag} sub-classification ===")
        print(f"Subtags: {subtags}")
        print(f"Total samples: {len(filtered_df)}")
        print(f"Distribution: \n{filtered_df['tag'].value_counts()}")

        if len(filtered_df) == 0:
            print(f"No data found for {parent_tag} subtags!")
            return None, None, None, None, None, None

        y = filtered_df["tag"]                 # target labels
        X = filtered_df.drop(columns=["tag"])  # feature columns

        # Split columns into categorical vs. everything-else (treated numeric).
        categorical_cols = ['type', 'prev_sibling_html_tag', 'child_1_html_tag', 'child_2_html_tag', 'parent_tag_html']
        continuous_cols = [col for col in X.columns if col not in categorical_cols]

        # Backfill any expected column that is absent from this slice.
        missing_cols = [col for col in categorical_cols + continuous_cols if col not in X.columns]
        if missing_cols:
            print(f"Warning: Missing columns {missing_cols} in data for {parent_tag}")
            for col in missing_cols:
                X[col] = 'unknown' if col in categorical_cols else 0

        # One-hot encode categoricals; unseen categories at predict time are
        # ignored (all-zero row) thanks to handle_unknown='ignore'.
        X[categorical_cols] = X[categorical_cols].astype(str).fillna('unknown')
        ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
        X_cat_encoded = ohe.fit_transform(X[categorical_cols])

        # Median-impute then standardize the continuous features.
        imputer = SimpleImputer(strategy='median')
        X_continuous_imputed = imputer.fit_transform(X[continuous_cols])
        scaler = StandardScaler()
        X_continuous_scaled = scaler.fit_transform(X_continuous_imputed)
        # Final feature matrix: [one-hot block | scaled continuous block].
        X_processed = np.concatenate([X_cat_encoded, X_continuous_scaled], axis=1)

        # Integer-encode the target tags.
        label_encoder = LabelEncoder()
        y_encoded = label_encoder.fit_transform(y)

        # Oversample rare classes with jittered copies so stratified splits
        # and training have enough samples per class.
        # NOTE(review): this duplication happens BEFORE the train/val/test
        # split in train_subtask_model, so near-identical copies can land in
        # both train and test — test accuracy may be optimistic. Confirm
        # whether this is acceptable or move oversampling after the split.
        class_counts = Counter(y_encoded)
        min_samples_threshold = max(10, len(subtags) * 3)
        rare_classes = [cls for cls, count in class_counts.items() if count < min_samples_threshold]

        for cls in rare_classes:
            idx = np.where(y_encoded == cls)[0]
            original_class_name = label_encoder.inverse_transform([cls])[0]
            samples_needed = min_samples_threshold - len(idx)
            print(f"Adding {samples_needed} copies to class '{original_class_name}'")
            for _ in range(samples_needed):
                sample_idx = np.random.choice(idx)
                new_sample = X_processed[sample_idx].copy()
                # Only the continuous part (after the one-hot block) gets noise.
                continuous_start = X_cat_encoded.shape[1]
                noise = np.random.normal(0, 0.05, size=X_continuous_scaled.shape[1])
                new_sample[continuous_start:] += noise
                X_processed = np.vstack([X_processed, new_sample])
                y_encoded = np.append(y_encoded, cls)

        # Bundle every fitted transformer so predict-time processing matches.
        preprocessors = {
            'ohe': ohe,
            'imputer': imputer,
            'scaler': scaler,
            'label_encoder': label_encoder,
            'categorical_cols': categorical_cols,
            'continuous_cols': continuous_cols
        }
        return X_processed, y_encoded, preprocessors, categorical_cols, continuous_cols, label_encoder

    def train_subtask_model(self, X, y, preprocessors, parent_tag, epochs=100):
        """Train one sub-classifier with AMP, early stopping, and LR scheduling.

        Returns (model, (train_losses, val_losses, val_accuracies), test_accuracy).
        """
        # Stratified ~70/15/15 train/val/test split.
        print(f"\n=== Training {parent_tag} sub-classifier ===")
        X_temp, X_test, y_temp, y_test = train_test_split(X, y, test_size=0.15, random_state=42, stratify=y)
        X_train, X_val, y_train, y_val = train_test_split(X_temp, y_temp, test_size=0.15, random_state=42, stratify=y_temp)
        print(f"Training set size: {X_train.shape[0]}")
        print(f"Validation set size: {X_val.shape[0]}")
        print(f"Test set size: {X_test.shape[0]}")

        # Inverse-frequency class weights for the loss.
        class_weights = compute_class_weight('balanced', classes=np.unique(y_train), y=y_train)

        # Wrap splits as tensors.
        X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
        y_train_tensor = torch.tensor(y_train, dtype=torch.long)
        X_val_tensor = torch.tensor(X_val, dtype=torch.float32)
        y_val_tensor = torch.tensor(y_val, dtype=torch.long)
        X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
        y_test_tensor = torch.tensor(y_test, dtype=torch.long)
        class_weights_tensor = torch.tensor(class_weights, dtype=torch.float32).to(self.device)

        # Datasets and loaders.
        train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
        val_dataset = TensorDataset(X_val_tensor, y_val_tensor)
        test_dataset = TensorDataset(X_test_tensor, y_test_tensor)
        train_loader = DataLoader(train_dataset, batch_size=256, shuffle=True, num_workers=2)
        val_loader = DataLoader(val_dataset, batch_size=256, shuffle=False, num_workers=2)
        test_loader = DataLoader(test_dataset, batch_size=256, shuffle=False, num_workers=2)

        # Model, loss, optimizer, LR scheduler, AMP gradient scaler.
        input_size = X_train.shape[1]
        output_size = len(np.unique(y))
        model = ImprovedTagClassifier(input_size, output_size).to(self.device)
        criterion = FocalLoss(weight=class_weights_tensor, gamma=2.0)
        optimizer = optim.AdamW(model.parameters(), lr=0.001, weight_decay=1e-4)
        # NOTE(review): `verbose` is deprecated in newer torch versions of
        # ReduceLROnPlateau — confirm the pinned torch version still accepts it.
        scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=5, verbose=True)
        scaler = GradScaler()

        # Early-stopping bookkeeping and per-epoch history.
        best_val_loss = float('inf')
        patience = 15
        counter = 0
        train_losses = []
        val_losses = []
        val_accuracies = []

        for epoch in range(epochs):
            model.train()
            running_loss = 0.0
            for batch_X, batch_y in train_loader:
                batch_X, batch_y = batch_X.to(self.device), batch_y.to(self.device)
                optimizer.zero_grad()
                # Mixed-precision forward pass.
                with autocast(device_type=self.device.type):
                    outputs = model(batch_X)
                    loss = criterion(outputs, batch_y)
                scaler.scale(loss).backward()
                # NOTE(review): gradients are still scaled here; the usual AMP
                # recipe calls scaler.unscale_(optimizer) before clipping so
                # max_norm applies to true gradient magnitudes — verify intent.
                clip_grad_norm_(model.parameters(), max_norm=1.0)
                scaler.step(optimizer)
                scaler.update()
                running_loss += loss.item()

            train_loss = running_loss / len(train_loader)
            model.eval()
            val_running_loss = 0.0
            all_preds = []
            all_labels = []

            # Validation pass (no gradients).
            with torch.no_grad():
                for batch_X, batch_y in val_loader:
                    batch_X, batch_y = batch_X.to(self.device), batch_y.to(self.device)
                    with autocast(device_type=self.device.type):
                        outputs = model(batch_X)
                        loss = criterion(outputs, batch_y)
                    val_running_loss += loss.item()
                    _, preds = torch.max(outputs, 1)
                    all_preds.extend(preds.cpu().numpy())
                    all_labels.extend(batch_y.cpu().numpy())

            val_loss = val_running_loss / len(val_loader)
            val_accuracy = accuracy_score(all_labels, all_preds)
            # Plateau scheduler keys off validation loss.
            scheduler.step(val_loss)

            # Record history for plotting.
            train_losses.append(train_loss)
            val_losses.append(val_loss)
            val_accuracies.append(val_accuracy)
            print(f"Epoch [{epoch+1}/{epochs}] - Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}, Val Accuracy: {val_accuracy:.4f}")

            # Early stopping on best validation loss.
            if val_loss < best_val_loss:
                best_val_loss = val_loss
                counter = 0
                # NOTE(review): dict.copy() is shallow — the tensors inside are
                # still the live parameters, so later epochs mutate this
                # "snapshot" in place. A true snapshot needs
                # {k: v.detach().clone() for k, v in ...} or copy.deepcopy.
                best_model_state = model.state_dict().copy()
            else:
                counter += 1
                if counter >= patience:
                    print(f"Early stopping triggered after {epoch+1} epochs")
                    break

        # Restore the best-val checkpoint and evaluate on the held-out test set.
        model.load_state_dict(best_model_state)
        model.eval()
        test_preds = []
        test_labels = []

        with torch.no_grad():
            for batch_X, batch_y in test_loader:
                batch_X, batch_y = batch_X.to(self.device), batch_y.to(self.device)
                outputs = model(batch_X)
                _, preds = torch.max(outputs, 1)
                test_preds.extend(preds.cpu().numpy())
                test_labels.extend(batch_y.cpu().numpy())

        test_accuracy = accuracy_score(test_labels, test_preds)
        print(f"\n{parent_tag} Model Test Accuracy: {test_accuracy:.4f}")
        print(f"\n{parent_tag} Classification Report:")
        print(classification_report(test_labels, test_preds, target_names=preprocessors['label_encoder'].classes_, zero_division=0))

        return model, (train_losses, val_losses, val_accuracies), test_accuracy

    def train_all_models(self, df_path, epochs=100):
        """Load the CSV at `df_path`, clean it, and train/save one sub-classifier
        per parent tag into ../models/sub_classifiers/."""
        print("Loading and cleaning data...")
        df = pd.read_csv(df_path)
        # Domain cleanup: SPAN nodes drawn as rectangles/groups are really DIVs.
        df.loc[(df["tag"] == "SPAN") & ((df["type"] == "RECTANGLE") | (df["type"] == "GROUP")), "tag"] = "DIV"
        children_cols = ['child_1_html_tag', 'child_2_html_tag']
        for col in children_cols:
            # Hyphenated child tags (custom elements, presumably) collapse to DIV.
            df[col] = df[col].apply(lambda x: "DIV" if isinstance(x, str) and '-' in x else x)
        for col in ['tag', 'prev_sibling_html_tag', 'child_1_html_tag', 'child_2_html_tag']:
            df[col] = df[col].str.upper()

        # Output directory for checkpoints and plots.
        os.makedirs('../models/sub_classifiers', exist_ok=True)

        # Train, register, checkpoint, and plot each parent-tag model.
        for parent_tag, subtags in self.tag_hierarchy.items():
            print(f"\n{'='*60}")
            print(f"Training {parent_tag} sub-classifier")
            print(f"{'='*60}")
            result = self.prepare_data_for_subtask(df, parent_tag, subtags)
            if result[0] is None:
                print(f"Skipping {parent_tag} due to insufficient data")
                continue
            X, y, preprocessors, cat_cols, cont_cols, label_encoder = result
            model, training_history, test_accuracy = self.train_subtask_model(X, y, preprocessors, parent_tag, epochs)
            self.models[parent_tag] = model
            self.preprocessors[parent_tag] = preprocessors
            self.label_encoders[parent_tag] = label_encoder
            model_path = f'../models/sub_classifiers/{parent_tag.lower()}_classifier.pth'
            # Checkpoint bundles weights, sizes, fitted preprocessors, and accuracy.
            torch.save({
                'model_state_dict': model.state_dict(),
                'input_size': X.shape[1],
                'output_size': len(np.unique(y)),
                'preprocessors': preprocessors,
                'test_accuracy': test_accuracy
            }, model_path)
            print(f"Saved {parent_tag} model to {model_path}")
            self.plot_training_history(training_history, parent_tag)

    def plot_training_history(self, history, parent_tag):
        """Save a two-panel loss/accuracy training-history plot to disk."""
        train_losses, val_losses, val_accuracies = history
        plt.figure(figsize=(12, 5))
        # Left panel: train vs. validation loss.
        plt.subplot(1, 2, 1)
        plt.plot(train_losses, label='Training Loss')
        plt.plot(val_losses, label='Validation Loss')
        plt.title(f'{parent_tag} Model: Loss over epochs')
        plt.xlabel('Epoch')
        plt.ylabel('Loss')
        plt.legend()
        # Right panel: validation accuracy.
        plt.subplot(1, 2, 2)
        plt.plot(val_accuracies, label='Validation Accuracy')
        plt.title(f'{parent_tag} Model: Accuracy over epochs')
        plt.xlabel('Epoch')
        plt.ylabel('Accuracy')
        plt.legend()
        plt.tight_layout()
        plt.savefig(f'../models/sub_classifiers/{parent_tag.lower()}_training_history.png')
        plt.close()

    def load_models(self, model_dir='../models/sub_classifiers'):
        """Load every saved sub-classifier checkpoint found under `model_dir`."""
        for parent_tag in self.tag_hierarchy.keys():
            model_path = f'{model_dir}/{parent_tag.lower()}_classifier.pth'
            if os.path.exists(model_path):
                print(f"Loading {parent_tag} model from {model_path}")
                # NOTE(review): weights_only=False unpickles arbitrary objects
                # (needed here for the sklearn preprocessors) — only ever load
                # trusted checkpoint files.
                checkpoint = torch.load(model_path, map_location=self.device, weights_only=False)
                model = ImprovedTagClassifier(checkpoint['input_size'], checkpoint['output_size']).to(self.device)
                model.load_state_dict(checkpoint['model_state_dict'])
                model.eval()
                self.models[parent_tag] = model
                self.preprocessors[parent_tag] = checkpoint['preprocessors']
                self.label_encoders[parent_tag] = checkpoint['preprocessors']['label_encoder']
                print(f"Loaded {parent_tag} model (Test Accuracy: {checkpoint['test_accuracy']:.4f})")
            else:
                print(f"Model file {model_path} not found!")

    def predict_hierarchical(self, sample_data, base_prediction):
        """Refine `base_prediction` for one sample via its sub-classifier.

        Returns (predicted_label, confidence); falls back to the base
        prediction with confidence 1.0 when no sub-classifier applies.
        """
        # Parent tags without sub-categories pass straight through.
        if base_prediction not in self.tag_hierarchy:
            return base_prediction, 1.0
        if base_prediction not in self.models:
            print(f"No sub-classifier found for {base_prediction}")
            return base_prediction, 1.0
        preprocessors = self.preprocessors[base_prediction]
        sample_df = pd.DataFrame([sample_data])
        cat_cols = preprocessors['categorical_cols']
        cont_cols = preprocessors['continuous_cols']

        # Backfill any feature the sample is missing.
        for col in cat_cols + cont_cols:
            if col not in sample_df.columns:
                sample_df[col] = 'unknown' if col in cat_cols else 0

        # Apply the exact transformers fitted at training time.
        sample_df[cat_cols] = sample_df[cat_cols].astype(str).fillna('unknown')
        X_cat = preprocessors['ohe'].transform(sample_df[cat_cols])
        X_cont = preprocessors['imputer'].transform(sample_df[cont_cols])
        X_cont = preprocessors['scaler'].transform(X_cont)
        X_processed = np.concatenate([X_cat, X_cont], axis=1)
        X_tensor = torch.tensor(X_processed, dtype=torch.float32).to(self.device)

        model = self.models[base_prediction]
        with torch.no_grad():
            outputs = model(X_tensor)
            probabilities = torch.softmax(outputs, dim=1)
            _, predicted = torch.max(outputs, 1)
        predicted_label = preprocessors['label_encoder'].inverse_transform([predicted.cpu().numpy()[0]])[0]
        confidence = probabilities.max().item()
        return predicted_label, confidence
 
1
+ import torch
2
+ import torch.nn as nn
3
+ from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
4
+ from sklearn.impute import SimpleImputer
5
+ import pandas as pd
6
+ import numpy as np
7
+ import matplotlib.pyplot as plt
8
+ from sklearn.model_selection import train_test_split
9
+ from sklearn.metrics import accuracy_score, classification_report
10
+ from sklearn.utils.class_weight import compute_class_weight
11
+ from torch.amp import autocast, GradScaler
12
+ from torch.utils.data import TensorDataset, DataLoader
13
+ from torch.nn.utils import clip_grad_norm_
14
+ from collections import Counter
15
+ import torch
16
+ import torch.nn as nn
17
+ import os
18
+ import torch.optim as optim
19
+
20
+ # Define the ImprovedTagClassifier class for tag prediction
21
class ImprovedTagClassifier(nn.Module):
    """MLP tag classifier: three hidden blocks (512/256/128) with batch norm,
    LeakyReLU, dropout, and a 512->128 skip path from block 1 into block 3."""

    def __init__(self, input_size, output_size, dropout_rate=0.4):
        super(ImprovedTagClassifier, self).__init__()

        # Hidden layers paired with their batch-norm stages.
        self.fc1 = nn.Linear(input_size, 512)
        self.bn1 = nn.BatchNorm1d(512)
        self.fc2 = nn.Linear(512, 256)
        self.bn2 = nn.BatchNorm1d(256)
        self.fc3 = nn.Linear(256, 128)
        self.bn3 = nn.BatchNorm1d(128)

        # Final projection to class logits.
        self.fc4 = nn.Linear(128, output_size)

        # Regularization and activation shared by all blocks.
        self.dropout = nn.Dropout(dropout_rate)
        self.leaky_relu = nn.LeakyReLU(0.1)

        # Projects block-1 activations so they can be added into block 3.
        self.skip1_3 = nn.Linear(512, 128)

        self._initialize_weights()

    def _initialize_weights(self):
        """Kaiming-init every linear layer; identity-init every batch norm."""
        for module in self.modules():
            if isinstance(module, nn.Linear):
                # Kaiming init matches the LeakyReLU nonlinearity used above.
                nn.init.kaiming_normal_(module.weight, mode='fan_out', nonlinearity='leaky_relu')
                if module.bias is not None:
                    nn.init.constant_(module.bias, 0)
            elif isinstance(module, nn.BatchNorm1d):
                nn.init.constant_(module.weight, 1)
                nn.init.constant_(module.bias, 0)

    def forward(self, x):
        """Return class logits for a batch of feature vectors."""
        # Block 1: linear -> bn -> activation -> dropout.
        h1 = self.dropout(self.leaky_relu(self.bn1(self.fc1(x))))
        # Block 2: same pattern, 512 -> 256.
        h2 = self.dropout(self.leaky_relu(self.bn2(self.fc2(h1))))
        # Block 3: 256 -> 128 plus the residual-style skip from block 1,
        # normalized/activated/dropped after the addition (as in the original).
        h3 = self.fc3(h2) + self.skip1_3(h1)
        h3 = self.dropout(self.leaky_relu(self.bn3(h3)))
        # Logits — no softmax here; the loss applies it.
        return self.fc4(h3)
88
+
89
class FocalLoss(nn.Module):
    """Focal loss: cross-entropy modulated by (1 - p_t)^gamma so that
    easy, well-classified samples contribute less — helps class imbalance."""

    def __init__(self, weight=None, gamma=2.0, reduction='mean'):
        super(FocalLoss, self).__init__()
        self.weight = weight        # optional per-class weights, forwarded to CE
        self.gamma = gamma          # focusing exponent; gamma=0 reduces to plain CE
        self.reduction = reduction  # 'mean' | 'sum' | anything else -> per-sample
        # Per-sample CE so the focal modulation can be applied element-wise.
        self.ce_loss = nn.CrossEntropyLoss(weight=weight, reduction='none')

    def forward(self, inputs, targets):
        """Return the focal loss between logits `inputs` and class indices `targets`."""
        per_sample_ce = self.ce_loss(inputs, targets)
        # p_t is the model's (weighted) probability for the true class.
        p_t = torch.exp(-per_sample_ce)
        modulated = per_sample_ce * (1 - p_t) ** self.gamma

        if self.reduction == 'sum':
            return modulated.sum()
        if self.reduction == 'mean':
            return modulated.mean()
        return modulated
111
+
112
+ class MultiLevelTagClassifier:
113
+ def __init__(self, device='cuda'):
114
+ # Use GPU
115
+ self.device = torch.device(device if torch.cuda.is_available() else 'cpu')
116
+ self.models = {} # Store models for each parent tag
117
+ self.preprocessors = {} # Store preprocessing tools
118
+ self.label_encoders = {} # Store label encoders
119
+
120
+ # Define tag groups
121
+ self.tag_hierarchy = {
122
+ 'DIV': ['DIV', 'LIST', 'CARD'],
123
+ 'P': ['P', 'LI'],
124
+ 'INPUT': ['INPUT', 'DROPDOWN'],
125
+ 'ICON': ['ICON', 'CHECKBOX', 'RADIO'],
126
+ }
127
+ print(f"Using device: {self.device}")
128
+
129
+ def prepare_data_for_subtask(self, df, parent_tag, subtags):
130
+ # Get only the data for this parent tag’s subtags
131
+ filtered_df = df[df['tag'].isin(subtags)].copy()
132
+ print(f"\n=== Preparing data for {parent_tag} sub-classification ===")
133
+ print(f"Subtags: {subtags}")
134
+ print(f"Total samples: {len(filtered_df)}")
135
+ print(f"Distribution: \n{filtered_df['tag'].value_counts()}")
136
+
137
+ if len(filtered_df) == 0:
138
+ print(f"No data found for {parent_tag} subtags!")
139
+ return None, None, None, None, None, None
140
+
141
+ y = filtered_df["tag"] # Target tags
142
+ X = filtered_df.drop(columns=["tag"]) # Features
143
+
144
+ # Define which columns are categories and numerical features
145
+ categorical_cols = ['type', 'prev_sibling_html_tag', 'child_1_html_tag', 'child_2_html_tag', 'parent_tag_html']
146
+ continuous_cols = [col for col in X.columns if col not in categorical_cols]
147
+
148
+ # Add missing columns with default values
149
+ missing_cols = [col for col in categorical_cols + continuous_cols if col not in X.columns]
150
+ if missing_cols:
151
+ print(f"Warning: Missing columns {missing_cols} in data for {parent_tag}")
152
+ for col in missing_cols:
153
+ X[col] = 'unknown' if col in categorical_cols else 0
154
+
155
+ # Process categories
156
+ X[categorical_cols] = X[categorical_cols].astype(str).fillna('unknown')
157
+ ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
158
+ X_cat_encoded = ohe.fit_transform(X[categorical_cols])
159
+
160
+ # Process continous features
161
+ imputer = SimpleImputer(strategy='median')
162
+ X_continuous_imputed = imputer.fit_transform(X[continuous_cols])
163
+ scaler = StandardScaler()
164
+ X_continuous_scaled = scaler.fit_transform(X_continuous_imputed)
165
+ X_processed = np.concatenate([X_cat_encoded, X_continuous_scaled], axis=1)
166
+
167
+ # Encode target tags
168
+ label_encoder = LabelEncoder()
169
+ y_encoded = label_encoder.fit_transform(y)
170
+
171
+ # Boost rare classes by copying them
172
+ class_counts = Counter(y_encoded)
173
+ min_samples_threshold = max(10, len(subtags) * 3)
174
+ rare_classes = [cls for cls, count in class_counts.items() if count < min_samples_threshold]
175
+
176
+ for cls in rare_classes:
177
+ idx = np.where(y_encoded == cls)[0]
178
+ original_class_name = label_encoder.inverse_transform([cls])[0]
179
+ samples_needed = min_samples_threshold - len(idx)
180
+ print(f"Adding {samples_needed} copies to class '{original_class_name}'")
181
+ for _ in range(samples_needed):
182
+ sample_idx = np.random.choice(idx)
183
+ new_sample = X_processed[sample_idx].copy()
184
+ continuous_start = X_cat_encoded.shape[1]
185
+ noise = np.random.normal(0, 0.05, size=X_continuous_scaled.shape[1])
186
+ new_sample[continuous_start:] += noise
187
+ X_processed = np.vstack([X_processed, new_sample])
188
+ y_encoded = np.append(y_encoded, cls)
189
+
190
+ # Bundle up preprocessing models
191
+ preprocessors = {
192
+ 'ohe': ohe,
193
+ 'imputer': imputer,
194
+ 'scaler': scaler,
195
+ 'label_encoder': label_encoder,
196
+ 'categorical_cols': categorical_cols,
197
+ 'continuous_cols': continuous_cols
198
+ }
199
+ return X_processed, y_encoded, preprocessors, categorical_cols, continuous_cols, label_encoder
200
+
201
+ def train_subtask_model(self, X, y, preprocessors, parent_tag, epochs=100):
202
+ # Split data into train, validation, and test sets
203
+ print(f"\n=== Training {parent_tag} sub-classifier ===")
204
+ X_temp, X_test, y_temp, y_test = train_test_split(X, y, test_size=0.15, random_state=42, stratify=y)
205
+ X_train, X_val, y_train, y_val = train_test_split(X_temp, y_temp, test_size=0.15, random_state=42, stratify=y_temp)
206
+ print(f"Training set size: {X_train.shape[0]}")
207
+ print(f"Validation set size: {X_val.shape[0]}")
208
+ print(f"Test set size: {X_test.shape[0]}")
209
+
210
+ # Balance classes
211
+ class_weights = compute_class_weight('balanced', classes=np.unique(y_train), y=y_train)
212
+
213
+ # Turn data into tensors
214
+ X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
215
+ y_train_tensor = torch.tensor(y_train, dtype=torch.long)
216
+ X_val_tensor = torch.tensor(X_val, dtype=torch.float32)
217
+ y_val_tensor = torch.tensor(y_val, dtype=torch.long)
218
+ X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
219
+ y_test_tensor = torch.tensor(y_test, dtype=torch.long)
220
+ class_weights_tensor = torch.tensor(class_weights, dtype=torch.float32).to(self.device)
221
+
222
+ # Set up datasets and loaders
223
+ train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
224
+ val_dataset = TensorDataset(X_val_tensor, y_val_tensor)
225
+ test_dataset = TensorDataset(X_test_tensor, y_test_tensor)
226
+ train_loader = DataLoader(train_dataset, batch_size=256, shuffle=True, num_workers=2)
227
+ val_loader = DataLoader(val_dataset, batch_size=256, shuffle=False, num_workers=2)
228
+ test_loader = DataLoader(test_dataset, batch_size=256, shuffle=False, num_workers=2)
229
+
230
+ # Create and set up the model
231
+ input_size = X_train.shape[1]
232
+ output_size = len(np.unique(y))
233
+ model = ImprovedTagClassifier(input_size, output_size).to(self.device)
234
+ criterion = FocalLoss(weight=class_weights_tensor, gamma=2.0)
235
+ optimizer = optim.AdamW(model.parameters(), lr=0.001, weight_decay=1e-4)
236
+ scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=5, verbose=True)
237
+ scaler = GradScaler()
238
+
239
+ # Training loop
240
+ best_val_loss = float('inf')
241
+ patience = 15
242
+ counter = 0
243
+ train_losses = []
244
+ val_losses = []
245
+ val_accuracies = []
246
+
247
+ for epoch in range(epochs):
248
+ model.train()
249
+ running_loss = 0.0
250
+ for batch_X, batch_y in train_loader:
251
+ batch_X, batch_y = batch_X.to(self.device), batch_y.to(self.device)
252
+ optimizer.zero_grad()
253
+ with autocast(device_type=self.device.type):
254
+ outputs = model(batch_X)
255
+ loss = criterion(outputs, batch_y)
256
+ scaler.scale(loss).backward()
257
+ clip_grad_norm_(model.parameters(), max_norm=1.0)
258
+ scaler.step(optimizer)
259
+ scaler.update()
260
+ running_loss += loss.item()
261
+
262
+ train_loss = running_loss / len(train_loader)
263
+ model.eval()
264
+ val_running_loss = 0.0
265
+ all_preds = []
266
+ all_labels = []
267
+
268
+ with torch.no_grad():
269
+ for batch_X, batch_y in val_loader:
270
+ batch_X, batch_y = batch_X.to(self.device), batch_y.to(self.device)
271
+ with autocast(device_type=self.device.type):
272
+ outputs = model(batch_X)
273
+ loss = criterion(outputs, batch_y)
274
+ val_running_loss += loss.item()
275
+ _, preds = torch.max(outputs, 1)
276
+ all_preds.extend(preds.cpu().numpy())
277
+ all_labels.extend(batch_y.cpu().numpy())
278
+
279
+ val_loss = val_running_loss / len(val_loader)
280
+ val_accuracy = accuracy_score(all_labels, all_preds)
281
+ scheduler.step(val_loss)
282
+
283
+ # Track progress
284
+ train_losses.append(train_loss)
285
+ val_losses.append(val_loss)
286
+ val_accuracies.append(val_accuracy)
287
+ print(f"Epoch [{epoch+1}/{epochs}] - Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}, Val Accuracy: {val_accuracy:.4f}")
288
+
289
+ if val_loss < best_val_loss:
290
+ best_val_loss = val_loss
291
+ counter = 0
292
+ best_model_state = model.state_dict().copy()
293
+ else:
294
+ counter += 1
295
+ if counter >= patience:
296
+ print(f"Early stopping triggered after {epoch+1} epochs")
297
+ break
298
+
299
+ model.load_state_dict(best_model_state)
300
+ model.eval()
301
+ test_preds = []
302
+ test_labels = []
303
+
304
+ with torch.no_grad():
305
+ for batch_X, batch_y in test_loader:
306
+ batch_X, batch_y = batch_X.to(self.device), batch_y.to(self.device)
307
+ outputs = model(batch_X)
308
+ _, preds = torch.max(outputs, 1)
309
+ test_preds.extend(preds.cpu().numpy())
310
+ test_labels.extend(batch_y.cpu().numpy())
311
+
312
+ test_accuracy = accuracy_score(test_labels, test_preds)
313
+ print(f"\n{parent_tag} Model Test Accuracy: {test_accuracy:.4f}")
314
+ print(f"\n{parent_tag} Classification Report:")
315
+ print(classification_report(test_labels, test_preds, target_names=preprocessors['label_encoder'].classes_, zero_division=0))
316
+
317
+ return model, (train_losses, val_losses, val_accuracies), test_accuracy
318
+
319
def train_all_models(self, df_path, epochs=100, model_dir='../models/sub_classifiers'):
    """Train one sub-classifier per parent tag in ``self.tag_hierarchy``.

    Loads the labelled dataset from ``df_path``, applies tag clean-up rules,
    then for each ``(parent_tag, subtags)`` pair prepares the data, trains a
    sub-model, stores the artefacts on the instance, saves a checkpoint and
    plots the training history.

    Args:
        df_path: Path to the CSV file with the labelled elements.
        epochs: Maximum number of training epochs per sub-model.
        model_dir: Directory where checkpoints are written (created if
            missing). Default matches ``load_models``.
    """
    print("Loading and cleaning data...")
    df = pd.read_csv(df_path)

    # SPAN elements drawn as RECTANGLE/GROUP are treated as generic containers.
    rect_or_group = df["type"].isin(["RECTANGLE", "GROUP"])
    df.loc[(df["tag"] == "SPAN") & rect_or_group, "tag"] = "DIV"

    # Hyphenated child tags (custom-element style names) collapse to DIV.
    for col in ('child_1_html_tag', 'child_2_html_tag'):
        df[col] = df[col].apply(lambda x: "DIV" if isinstance(x, str) and '-' in x else x)

    # Normalise every tag column to upper case so category values line up.
    for col in ('tag', 'prev_sibling_html_tag', 'child_1_html_tag', 'child_2_html_tag'):
        df[col] = df[col].str.upper()

    # Make a folder for models
    os.makedirs(model_dir, exist_ok=True)

    # Train a model for each parent tag
    for parent_tag, subtags in self.tag_hierarchy.items():
        print(f"\n{'='*60}")
        print(f"Training {parent_tag} sub-classifier")
        print(f"{'='*60}")

        result = self.prepare_data_for_subtask(df, parent_tag, subtags)
        if result[0] is None:
            print(f"Skipping {parent_tag} due to insufficient data")
            continue
        X, y, preprocessors, cat_cols, cont_cols, label_encoder = result

        model, training_history, test_accuracy = self.train_subtask_model(
            X, y, preprocessors, parent_tag, epochs)

        # Keep trained artefacts on the instance for later prediction.
        self.models[parent_tag] = model
        self.preprocessors[parent_tag] = preprocessors
        self.label_encoders[parent_tag] = label_encoder

        model_path = f'{model_dir}/{parent_tag.lower()}_classifier.pth'
        torch.save({
            'model_state_dict': model.state_dict(),
            'input_size': X.shape[1],
            'output_size': len(np.unique(y)),
            'preprocessors': preprocessors,
            'test_accuracy': test_accuracy,
        }, model_path)
        print(f"Saved {parent_tag} model to {model_path}")

        self.plot_training_history(training_history, parent_tag)
def plot_training_history(self, history, parent_tag, save_dir='../models/sub_classifiers'):
    """Plot and save loss/accuracy curves for one sub-classifier.

    Args:
        history: Tuple ``(train_losses, val_losses, val_accuracies)`` as
            returned by ``train_subtask_model``.
        parent_tag: Parent tag name, used in plot titles and the filename.
        save_dir: Directory the PNG is written to (created if missing).
    """
    train_losses, val_losses, val_accuracies = history

    # Ensure the output directory exists so this also works when called
    # standalone, not only after train_all_models() created it.
    os.makedirs(save_dir, exist_ok=True)

    plt.figure(figsize=(12, 5))

    # Left panel: training vs validation loss.
    plt.subplot(1, 2, 1)
    plt.plot(train_losses, label='Training Loss')
    plt.plot(val_losses, label='Validation Loss')
    plt.title(f'{parent_tag} Model: Loss over epochs')
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.legend()

    # Right panel: validation accuracy.
    plt.subplot(1, 2, 2)
    plt.plot(val_accuracies, label='Validation Accuracy')
    plt.title(f'{parent_tag} Model: Accuracy over epochs')
    plt.xlabel('Epoch')
    plt.ylabel('Accuracy')
    plt.legend()

    plt.tight_layout()
    plt.savefig(f'{save_dir}/{parent_tag.lower()}_training_history.png')
    plt.close()
def load_models(self, model_dir='../models/sub_classifiers'):
    """Restore every saved sub-classifier checkpoint found in *model_dir*.

    For each parent tag in the hierarchy, looks for
    ``<model_dir>/<tag>_classifier.pth``; when present, rebuilds the network,
    loads its weights and registers the model, preprocessors and label
    encoder on the instance. Missing checkpoints are reported, not fatal.
    """
    for parent_tag in self.tag_hierarchy:
        model_path = f'{model_dir}/{parent_tag.lower()}_classifier.pth'
        if not os.path.exists(model_path):
            print(f"Model file {model_path} not found!")
            continue

        print(f"Loading {parent_tag} model from {model_path}")
        # NOTE(review): weights_only=False unpickles arbitrary objects (the
        # sklearn preprocessors live in the checkpoint) — only load trusted files.
        checkpoint = torch.load(model_path, map_location=self.device, weights_only=False)

        net = ImprovedTagClassifier(checkpoint['input_size'], checkpoint['output_size']).to(self.device)
        net.load_state_dict(checkpoint['model_state_dict'])
        net.eval()

        self.models[parent_tag] = net
        self.preprocessors[parent_tag] = checkpoint['preprocessors']
        self.label_encoders[parent_tag] = checkpoint['preprocessors']['label_encoder']
        print(f"Loaded {parent_tag} model (Test Accuracy: {checkpoint['test_accuracy']:.4f})")
def predict_hierarchical(self, sample_data, base_prediction):
    """Refine a base tag prediction with the matching sub-classifier.

    Args:
        sample_data: Mapping of feature name -> value for a single element.
        base_prediction: Tag predicted by the base classifier.

    Returns:
        Tuple ``(tag, confidence)``. When *base_prediction* has no
        sub-classifier (not a parent tag in the hierarchy, or its model was
        never trained/loaded), the base prediction is returned with
        confidence 1.0.
    """
    # Tags outside the hierarchy need no refinement.
    if base_prediction not in self.tag_hierarchy:
        return base_prediction, 1.0
    if base_prediction not in self.models:
        print(f"No sub-classifier found for {base_prediction}")
        return base_prediction, 1.0

    preprocessors = self.preprocessors[base_prediction]
    sample_df = pd.DataFrame([sample_data])
    cat_cols = preprocessors['categorical_cols']
    cont_cols = preprocessors['continuous_cols']

    # Add missing columns so the fitted transformers see the full schema.
    for col in cat_cols + cont_cols:
        if col not in sample_df.columns:
            sample_df[col] = 'unknown' if col in cat_cols else 0

    # BUGFIX: fill missing categoricals BEFORE casting to str. The original
    # astype(str).fillna(...) order converts NaN to the literal string 'nan',
    # so the fillna never fired and 'nan' leaked into the one-hot encoder.
    sample_df[cat_cols] = sample_df[cat_cols].fillna('unknown').astype(str)

    X_cat = preprocessors['ohe'].transform(sample_df[cat_cols])
    X_cont = preprocessors['imputer'].transform(sample_df[cont_cols])
    X_cont = preprocessors['scaler'].transform(X_cont)
    X_processed = np.concatenate([X_cat, X_cont], axis=1)
    X_tensor = torch.tensor(X_processed, dtype=torch.float32).to(self.device)

    model = self.models[base_prediction]
    with torch.no_grad():
        outputs = model(X_tensor)
        probabilities = torch.softmax(outputs, dim=1)
        _, predicted = torch.max(outputs, 1)

    predicted_label = preprocessors['label_encoder'].inverse_transform(
        [predicted.cpu().numpy()[0]])[0]
    confidence = probabilities.max().item()
    return predicted_label, confidence