AOZ2025 committed on
Commit
dd98fd2
·
verified ·
1 Parent(s): e9a01a7

Upload 4 files

Browse files
Utils/__pycache__/model_utils.cpython-312.pyc CHANGED
Binary files a/Utils/__pycache__/model_utils.cpython-312.pyc and b/Utils/__pycache__/model_utils.cpython-312.pyc differ
 
Utils/__pycache__/utils.cpython-312.pyc CHANGED
Binary files a/Utils/__pycache__/utils.cpython-312.pyc and b/Utils/__pycache__/utils.cpython-312.pyc differ
 
Utils/model_utils.py CHANGED
@@ -1,427 +1,427 @@
1
- import torch
2
- import torch.nn as nn
3
- from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
4
- from sklearn.impute import SimpleImputer
5
- import pandas as pd
6
- import numpy as np
7
- import matplotlib.pyplot as plt
8
- from sklearn.model_selection import train_test_split
9
- from sklearn.metrics import accuracy_score, classification_report
10
- from sklearn.utils.class_weight import compute_class_weight
11
- from torch.amp import autocast, GradScaler
12
- from torch.utils.data import TensorDataset, DataLoader
13
- from torch.nn.utils import clip_grad_norm_
14
- from collections import Counter
15
- import torch
16
- import torch.nn as nn
17
- import os
18
- import torch.optim as optim
19
-
20
- # Define the ImprovedTagClassifier class for tag prediction
21
class ImprovedTagClassifier(nn.Module):
    """MLP tag classifier: three hidden blocks (512/256/128) with batch norm,
    LeakyReLU, dropout, and a 512->128 skip path from block 1 into block 3."""

    def __init__(self, input_size, output_size, dropout_rate=0.4):
        super(ImprovedTagClassifier, self).__init__()

        # Hidden layers paired with their batch-norm stages.
        self.fc1 = nn.Linear(input_size, 512)
        self.bn1 = nn.BatchNorm1d(512)
        self.fc2 = nn.Linear(512, 256)
        self.bn2 = nn.BatchNorm1d(256)
        self.fc3 = nn.Linear(256, 128)
        self.bn3 = nn.BatchNorm1d(128)

        # Final projection to class logits.
        self.fc4 = nn.Linear(128, output_size)

        # Regularization and activation shared by all blocks.
        self.dropout = nn.Dropout(dropout_rate)
        self.leaky_relu = nn.LeakyReLU(0.1)

        # Projects block-1 activations so they can be added into block 3.
        self.skip1_3 = nn.Linear(512, 128)

        self._initialize_weights()

    def _initialize_weights(self):
        """Kaiming-init every linear layer; identity-init every batch norm."""
        for module in self.modules():
            if isinstance(module, nn.Linear):
                # Kaiming init matches the LeakyReLU nonlinearity used above.
                nn.init.kaiming_normal_(module.weight, mode='fan_out', nonlinearity='leaky_relu')
                if module.bias is not None:
                    nn.init.constant_(module.bias, 0)
            elif isinstance(module, nn.BatchNorm1d):
                nn.init.constant_(module.weight, 1)
                nn.init.constant_(module.bias, 0)

    def forward(self, x):
        """Return class logits for a batch of feature vectors."""
        # Block 1: linear -> bn -> activation -> dropout.
        h1 = self.dropout(self.leaky_relu(self.bn1(self.fc1(x))))
        # Block 2: same pattern, 512 -> 256.
        h2 = self.dropout(self.leaky_relu(self.bn2(self.fc2(h1))))
        # Block 3: 256 -> 128 plus the residual-style skip from block 1,
        # normalized/activated/dropped after the addition (as in the original).
        h3 = self.fc3(h2) + self.skip1_3(h1)
        h3 = self.dropout(self.leaky_relu(self.bn3(h3)))
        # Logits — no softmax here; the loss applies it.
        return self.fc4(h3)
88
-
89
class FocalLoss(nn.Module):
    """Focal loss: cross-entropy modulated by (1 - p_t)^gamma so that
    easy, well-classified samples contribute less — helps class imbalance."""

    def __init__(self, weight=None, gamma=2.0, reduction='mean'):
        super(FocalLoss, self).__init__()
        self.weight = weight        # optional per-class weights, forwarded to CE
        self.gamma = gamma          # focusing exponent; gamma=0 reduces to plain CE
        self.reduction = reduction  # 'mean' | 'sum' | anything else -> per-sample
        # Per-sample CE so the focal modulation can be applied element-wise.
        self.ce_loss = nn.CrossEntropyLoss(weight=weight, reduction='none')

    def forward(self, inputs, targets):
        """Return the focal loss between logits `inputs` and class indices `targets`."""
        per_sample_ce = self.ce_loss(inputs, targets)
        # p_t is the model's (weighted) probability for the true class.
        p_t = torch.exp(-per_sample_ce)
        modulated = per_sample_ce * (1 - p_t) ** self.gamma

        if self.reduction == 'sum':
            return modulated.sum()
        if self.reduction == 'mean':
            return modulated.mean()
        return modulated
111
-
112
class MultiLevelTagClassifier:
    """Hierarchical tag classifier: one ImprovedTagClassifier per parent tag.

    Each parent tag in `tag_hierarchy` gets its own sub-classifier trained on
    only the rows whose `tag` column falls in that parent's subtag list.  At
    prediction time, a coarse base prediction routes the sample to the right
    sub-classifier (see `predict_hierarchical`).
    """

    def __init__(self, device='cuda'):
        # Fall back to CPU when CUDA is unavailable.
        self.device = torch.device(device if torch.cuda.is_available() else 'cpu')
        self.models = {}          # parent tag -> trained sub-classifier
        self.preprocessors = {}   # parent tag -> fitted preprocessing objects
        self.label_encoders = {}  # parent tag -> fitted LabelEncoder

        # Parent tag -> the fine-grained tags its sub-classifier distinguishes.
        self.tag_hierarchy = {
            'DIV': ['DIV', 'LIST', 'CARD', 'FORM'],
            'P': ['P', 'LABEL', 'LI', 'A'],
            'INPUT': ['INPUT', 'DROPDOWN'],
            'ICON': ['ICON', 'CHECKBOX', 'RADIO'],
        }
        print(f"Using device: {self.device}")

    def prepare_data_for_subtask(self, df, parent_tag, subtags):
        """Filter `df` to this parent's subtags and fit all preprocessing.

        Returns (X_processed, y_encoded, preprocessors, categorical_cols,
        continuous_cols, label_encoder), or six Nones when no rows match.
        """
        # Keep only the rows belonging to this parent tag's subtags.
        filtered_df = df[df['tag'].isin(subtags)].copy()
        print(f"\n=== Preparing data for {parent_tag} sub-classification ===")
        print(f"Subtags: {subtags}")
        print(f"Total samples: {len(filtered_df)}")
        print(f"Distribution: \n{filtered_df['tag'].value_counts()}")

        if len(filtered_df) == 0:
            print(f"No data found for {parent_tag} subtags!")
            return None, None, None, None, None, None

        y = filtered_df["tag"]                 # target labels
        X = filtered_df.drop(columns=["tag"])  # feature columns

        # Split columns into categorical vs. everything-else (treated numeric).
        categorical_cols = ['type', 'prev_sibling_html_tag', 'child_1_html_tag', 'child_2_html_tag', 'parent_tag_html']
        continuous_cols = [col for col in X.columns if col not in categorical_cols]

        # Backfill any expected column that is absent from this slice.
        missing_cols = [col for col in categorical_cols + continuous_cols if col not in X.columns]
        if missing_cols:
            print(f"Warning: Missing columns {missing_cols} in data for {parent_tag}")
            for col in missing_cols:
                X[col] = 'unknown' if col in categorical_cols else 0

        # One-hot encode categoricals; unseen categories at predict time are
        # ignored (all-zero row) thanks to handle_unknown='ignore'.
        X[categorical_cols] = X[categorical_cols].astype(str).fillna('unknown')
        ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
        X_cat_encoded = ohe.fit_transform(X[categorical_cols])

        # Median-impute then standardize the continuous features.
        imputer = SimpleImputer(strategy='median')
        X_continuous_imputed = imputer.fit_transform(X[continuous_cols])
        scaler = StandardScaler()
        X_continuous_scaled = scaler.fit_transform(X_continuous_imputed)
        # Final feature matrix: [one-hot block | scaled continuous block].
        X_processed = np.concatenate([X_cat_encoded, X_continuous_scaled], axis=1)

        # Integer-encode the target tags.
        label_encoder = LabelEncoder()
        y_encoded = label_encoder.fit_transform(y)

        # Oversample rare classes with jittered copies so stratified splits
        # and training have enough samples per class.
        # NOTE(review): this duplication happens BEFORE the train/val/test
        # split in train_subtask_model, so near-identical copies can land in
        # both train and test — test accuracy may be optimistic. Confirm
        # whether this is acceptable or move oversampling after the split.
        class_counts = Counter(y_encoded)
        min_samples_threshold = max(10, len(subtags) * 3)
        rare_classes = [cls for cls, count in class_counts.items() if count < min_samples_threshold]

        for cls in rare_classes:
            idx = np.where(y_encoded == cls)[0]
            original_class_name = label_encoder.inverse_transform([cls])[0]
            samples_needed = min_samples_threshold - len(idx)
            print(f"Adding {samples_needed} copies to class '{original_class_name}'")
            for _ in range(samples_needed):
                sample_idx = np.random.choice(idx)
                new_sample = X_processed[sample_idx].copy()
                # Only the continuous part (after the one-hot block) gets noise.
                continuous_start = X_cat_encoded.shape[1]
                noise = np.random.normal(0, 0.05, size=X_continuous_scaled.shape[1])
                new_sample[continuous_start:] += noise
                X_processed = np.vstack([X_processed, new_sample])
                y_encoded = np.append(y_encoded, cls)

        # Bundle every fitted transformer so predict-time processing matches.
        preprocessors = {
            'ohe': ohe,
            'imputer': imputer,
            'scaler': scaler,
            'label_encoder': label_encoder,
            'categorical_cols': categorical_cols,
            'continuous_cols': continuous_cols
        }
        return X_processed, y_encoded, preprocessors, categorical_cols, continuous_cols, label_encoder

    def train_subtask_model(self, X, y, preprocessors, parent_tag, epochs=100):
        """Train one sub-classifier with AMP, early stopping, and LR scheduling.

        Returns (model, (train_losses, val_losses, val_accuracies), test_accuracy).
        """
        # Stratified ~70/15/15 train/val/test split.
        print(f"\n=== Training {parent_tag} sub-classifier ===")
        X_temp, X_test, y_temp, y_test = train_test_split(X, y, test_size=0.15, random_state=42, stratify=y)
        X_train, X_val, y_train, y_val = train_test_split(X_temp, y_temp, test_size=0.15, random_state=42, stratify=y_temp)
        print(f"Training set size: {X_train.shape[0]}")
        print(f"Validation set size: {X_val.shape[0]}")
        print(f"Test set size: {X_test.shape[0]}")

        # Inverse-frequency class weights for the loss.
        class_weights = compute_class_weight('balanced', classes=np.unique(y_train), y=y_train)

        # Wrap splits as tensors.
        X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
        y_train_tensor = torch.tensor(y_train, dtype=torch.long)
        X_val_tensor = torch.tensor(X_val, dtype=torch.float32)
        y_val_tensor = torch.tensor(y_val, dtype=torch.long)
        X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
        y_test_tensor = torch.tensor(y_test, dtype=torch.long)
        class_weights_tensor = torch.tensor(class_weights, dtype=torch.float32).to(self.device)

        # Datasets and loaders.
        train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
        val_dataset = TensorDataset(X_val_tensor, y_val_tensor)
        test_dataset = TensorDataset(X_test_tensor, y_test_tensor)
        train_loader = DataLoader(train_dataset, batch_size=256, shuffle=True, num_workers=2)
        val_loader = DataLoader(val_dataset, batch_size=256, shuffle=False, num_workers=2)
        test_loader = DataLoader(test_dataset, batch_size=256, shuffle=False, num_workers=2)

        # Model, loss, optimizer, LR scheduler, AMP gradient scaler.
        input_size = X_train.shape[1]
        output_size = len(np.unique(y))
        model = ImprovedTagClassifier(input_size, output_size).to(self.device)
        criterion = FocalLoss(weight=class_weights_tensor, gamma=2.0)
        optimizer = optim.AdamW(model.parameters(), lr=0.001, weight_decay=1e-4)
        # NOTE(review): `verbose` is deprecated in newer torch versions of
        # ReduceLROnPlateau — confirm the pinned torch version still accepts it.
        scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=5, verbose=True)
        scaler = GradScaler()

        # Early-stopping bookkeeping and per-epoch history.
        best_val_loss = float('inf')
        patience = 15
        counter = 0
        train_losses = []
        val_losses = []
        val_accuracies = []

        for epoch in range(epochs):
            model.train()
            running_loss = 0.0
            for batch_X, batch_y in train_loader:
                batch_X, batch_y = batch_X.to(self.device), batch_y.to(self.device)
                optimizer.zero_grad()
                # Mixed-precision forward pass.
                with autocast(device_type=self.device.type):
                    outputs = model(batch_X)
                    loss = criterion(outputs, batch_y)
                scaler.scale(loss).backward()
                # NOTE(review): gradients are still scaled here; the usual AMP
                # recipe calls scaler.unscale_(optimizer) before clipping so
                # max_norm applies to true gradient magnitudes — verify intent.
                clip_grad_norm_(model.parameters(), max_norm=1.0)
                scaler.step(optimizer)
                scaler.update()
                running_loss += loss.item()

            train_loss = running_loss / len(train_loader)
            model.eval()
            val_running_loss = 0.0
            all_preds = []
            all_labels = []

            # Validation pass (no gradients).
            with torch.no_grad():
                for batch_X, batch_y in val_loader:
                    batch_X, batch_y = batch_X.to(self.device), batch_y.to(self.device)
                    with autocast(device_type=self.device.type):
                        outputs = model(batch_X)
                        loss = criterion(outputs, batch_y)
                    val_running_loss += loss.item()
                    _, preds = torch.max(outputs, 1)
                    all_preds.extend(preds.cpu().numpy())
                    all_labels.extend(batch_y.cpu().numpy())

            val_loss = val_running_loss / len(val_loader)
            val_accuracy = accuracy_score(all_labels, all_preds)
            # Plateau scheduler keys off validation loss.
            scheduler.step(val_loss)

            # Record history for plotting.
            train_losses.append(train_loss)
            val_losses.append(val_loss)
            val_accuracies.append(val_accuracy)
            print(f"Epoch [{epoch+1}/{epochs}] - Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}, Val Accuracy: {val_accuracy:.4f}")

            # Early stopping on best validation loss.
            if val_loss < best_val_loss:
                best_val_loss = val_loss
                counter = 0
                # NOTE(review): dict.copy() is shallow — the tensors inside are
                # still the live parameters, so later epochs mutate this
                # "snapshot" in place. A true snapshot needs
                # {k: v.detach().clone() for k, v in ...} or copy.deepcopy.
                best_model_state = model.state_dict().copy()
            else:
                counter += 1
                if counter >= patience:
                    print(f"Early stopping triggered after {epoch+1} epochs")
                    break

        # Restore the best-val checkpoint and evaluate on the held-out test set.
        model.load_state_dict(best_model_state)
        model.eval()
        test_preds = []
        test_labels = []

        with torch.no_grad():
            for batch_X, batch_y in test_loader:
                batch_X, batch_y = batch_X.to(self.device), batch_y.to(self.device)
                outputs = model(batch_X)
                _, preds = torch.max(outputs, 1)
                test_preds.extend(preds.cpu().numpy())
                test_labels.extend(batch_y.cpu().numpy())

        test_accuracy = accuracy_score(test_labels, test_preds)
        print(f"\n{parent_tag} Model Test Accuracy: {test_accuracy:.4f}")
        print(f"\n{parent_tag} Classification Report:")
        print(classification_report(test_labels, test_preds, target_names=preprocessors['label_encoder'].classes_, zero_division=0))

        return model, (train_losses, val_losses, val_accuracies), test_accuracy

    def train_all_models(self, df_path, epochs=100):
        """Load the CSV at `df_path`, clean it, and train/save one sub-classifier
        per parent tag into ../models/sub_classifiers/."""
        print("Loading and cleaning data...")
        df = pd.read_csv(df_path)
        # Domain cleanup: SPAN nodes drawn as rectangles/groups are really DIVs.
        df.loc[(df["tag"] == "SPAN") & ((df["type"] == "RECTANGLE") | (df["type"] == "GROUP")), "tag"] = "DIV"
        children_cols = ['child_1_html_tag', 'child_2_html_tag']
        for col in children_cols:
            # Hyphenated child tags (custom elements, presumably) collapse to DIV.
            df[col] = df[col].apply(lambda x: "DIV" if isinstance(x, str) and '-' in x else x)
        for col in ['tag', 'prev_sibling_html_tag', 'child_1_html_tag', 'child_2_html_tag']:
            df[col] = df[col].str.upper()

        # Output directory for checkpoints and plots.
        os.makedirs('../models/sub_classifiers', exist_ok=True)

        # Train, register, checkpoint, and plot each parent-tag model.
        for parent_tag, subtags in self.tag_hierarchy.items():
            print(f"\n{'='*60}")
            print(f"Training {parent_tag} sub-classifier")
            print(f"{'='*60}")
            result = self.prepare_data_for_subtask(df, parent_tag, subtags)
            if result[0] is None:
                print(f"Skipping {parent_tag} due to insufficient data")
                continue
            X, y, preprocessors, cat_cols, cont_cols, label_encoder = result
            model, training_history, test_accuracy = self.train_subtask_model(X, y, preprocessors, parent_tag, epochs)
            self.models[parent_tag] = model
            self.preprocessors[parent_tag] = preprocessors
            self.label_encoders[parent_tag] = label_encoder
            model_path = f'../models/sub_classifiers/{parent_tag.lower()}_classifier.pth'
            # Checkpoint bundles weights, sizes, fitted preprocessors, and accuracy.
            torch.save({
                'model_state_dict': model.state_dict(),
                'input_size': X.shape[1],
                'output_size': len(np.unique(y)),
                'preprocessors': preprocessors,
                'test_accuracy': test_accuracy
            }, model_path)
            print(f"Saved {parent_tag} model to {model_path}")
            self.plot_training_history(training_history, parent_tag)

    def plot_training_history(self, history, parent_tag):
        """Save a two-panel loss/accuracy training-history plot to disk."""
        train_losses, val_losses, val_accuracies = history
        plt.figure(figsize=(12, 5))
        # Left panel: train vs. validation loss.
        plt.subplot(1, 2, 1)
        plt.plot(train_losses, label='Training Loss')
        plt.plot(val_losses, label='Validation Loss')
        plt.title(f'{parent_tag} Model: Loss over epochs')
        plt.xlabel('Epoch')
        plt.ylabel('Loss')
        plt.legend()
        # Right panel: validation accuracy.
        plt.subplot(1, 2, 2)
        plt.plot(val_accuracies, label='Validation Accuracy')
        plt.title(f'{parent_tag} Model: Accuracy over epochs')
        plt.xlabel('Epoch')
        plt.ylabel('Accuracy')
        plt.legend()
        plt.tight_layout()
        plt.savefig(f'../models/sub_classifiers/{parent_tag.lower()}_training_history.png')
        plt.close()

    def load_models(self, model_dir='../models/sub_classifiers'):
        """Load every saved sub-classifier checkpoint found under `model_dir`."""
        for parent_tag in self.tag_hierarchy.keys():
            model_path = f'{model_dir}/{parent_tag.lower()}_classifier.pth'
            if os.path.exists(model_path):
                print(f"Loading {parent_tag} model from {model_path}")
                # NOTE(review): weights_only=False unpickles arbitrary objects
                # (needed here for the sklearn preprocessors) — only ever load
                # trusted checkpoint files.
                checkpoint = torch.load(model_path, map_location=self.device, weights_only=False)
                model = ImprovedTagClassifier(checkpoint['input_size'], checkpoint['output_size']).to(self.device)
                model.load_state_dict(checkpoint['model_state_dict'])
                model.eval()
                self.models[parent_tag] = model
                self.preprocessors[parent_tag] = checkpoint['preprocessors']
                self.label_encoders[parent_tag] = checkpoint['preprocessors']['label_encoder']
                print(f"Loaded {parent_tag} model (Test Accuracy: {checkpoint['test_accuracy']:.4f})")
            else:
                print(f"Model file {model_path} not found!")

    def predict_hierarchical(self, sample_data, base_prediction):
        """Refine `base_prediction` for one sample via its sub-classifier.

        Returns (predicted_label, confidence); falls back to the base
        prediction with confidence 1.0 when no sub-classifier applies.
        """
        # Parent tags without sub-categories pass straight through.
        if base_prediction not in self.tag_hierarchy:
            return base_prediction, 1.0
        if base_prediction not in self.models:
            print(f"No sub-classifier found for {base_prediction}")
            return base_prediction, 1.0
        preprocessors = self.preprocessors[base_prediction]
        sample_df = pd.DataFrame([sample_data])
        cat_cols = preprocessors['categorical_cols']
        cont_cols = preprocessors['continuous_cols']

        # Backfill any feature the sample is missing.
        for col in cat_cols + cont_cols:
            if col not in sample_df.columns:
                sample_df[col] = 'unknown' if col in cat_cols else 0

        # Apply the exact transformers fitted at training time.
        sample_df[cat_cols] = sample_df[cat_cols].astype(str).fillna('unknown')
        X_cat = preprocessors['ohe'].transform(sample_df[cat_cols])
        X_cont = preprocessors['imputer'].transform(sample_df[cont_cols])
        X_cont = preprocessors['scaler'].transform(X_cont)
        X_processed = np.concatenate([X_cat, X_cont], axis=1)
        X_tensor = torch.tensor(X_processed, dtype=torch.float32).to(self.device)

        model = self.models[base_prediction]
        with torch.no_grad():
            outputs = model(X_tensor)
            probabilities = torch.softmax(outputs, dim=1)
            _, predicted = torch.max(outputs, 1)
        predicted_label = preprocessors['label_encoder'].inverse_transform([predicted.cpu().numpy()[0]])[0]
        confidence = probabilities.max().item()
        return predicted_label, confidence
 
1
+ import torch
2
+ import torch.nn as nn
3
+ from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
4
+ from sklearn.impute import SimpleImputer
5
+ import pandas as pd
6
+ import numpy as np
7
+ import matplotlib.pyplot as plt
8
+ from sklearn.model_selection import train_test_split
9
+ from sklearn.metrics import accuracy_score, classification_report
10
+ from sklearn.utils.class_weight import compute_class_weight
11
+ from torch.amp import autocast, GradScaler
12
+ from torch.utils.data import TensorDataset, DataLoader
13
+ from torch.nn.utils import clip_grad_norm_
14
+ from collections import Counter
15
+ import torch
16
+ import torch.nn as nn
17
+ import os
18
+ import torch.optim as optim
19
+
20
+ # Define the ImprovedTagClassifier class for tag prediction
21
class ImprovedTagClassifier(nn.Module):
    """MLP tag classifier: three hidden blocks (512/256/128) with batch norm,
    LeakyReLU, dropout, and a 512->128 skip path from block 1 into block 3."""

    def __init__(self, input_size, output_size, dropout_rate=0.4):
        super(ImprovedTagClassifier, self).__init__()

        # Hidden layers paired with their batch-norm stages.
        self.fc1 = nn.Linear(input_size, 512)
        self.bn1 = nn.BatchNorm1d(512)
        self.fc2 = nn.Linear(512, 256)
        self.bn2 = nn.BatchNorm1d(256)
        self.fc3 = nn.Linear(256, 128)
        self.bn3 = nn.BatchNorm1d(128)

        # Final projection to class logits.
        self.fc4 = nn.Linear(128, output_size)

        # Regularization and activation shared by all blocks.
        self.dropout = nn.Dropout(dropout_rate)
        self.leaky_relu = nn.LeakyReLU(0.1)

        # Projects block-1 activations so they can be added into block 3.
        self.skip1_3 = nn.Linear(512, 128)

        self._initialize_weights()

    def _initialize_weights(self):
        """Kaiming-init every linear layer; identity-init every batch norm."""
        for module in self.modules():
            if isinstance(module, nn.Linear):
                # Kaiming init matches the LeakyReLU nonlinearity used above.
                nn.init.kaiming_normal_(module.weight, mode='fan_out', nonlinearity='leaky_relu')
                if module.bias is not None:
                    nn.init.constant_(module.bias, 0)
            elif isinstance(module, nn.BatchNorm1d):
                nn.init.constant_(module.weight, 1)
                nn.init.constant_(module.bias, 0)

    def forward(self, x):
        """Return class logits for a batch of feature vectors."""
        # Block 1: linear -> bn -> activation -> dropout.
        h1 = self.dropout(self.leaky_relu(self.bn1(self.fc1(x))))
        # Block 2: same pattern, 512 -> 256.
        h2 = self.dropout(self.leaky_relu(self.bn2(self.fc2(h1))))
        # Block 3: 256 -> 128 plus the residual-style skip from block 1,
        # normalized/activated/dropped after the addition (as in the original).
        h3 = self.fc3(h2) + self.skip1_3(h1)
        h3 = self.dropout(self.leaky_relu(self.bn3(h3)))
        # Logits — no softmax here; the loss applies it.
        return self.fc4(h3)
88
+
89
class FocalLoss(nn.Module):
    """Focal loss: cross-entropy modulated by (1 - p_t)^gamma so that
    easy, well-classified samples contribute less — helps class imbalance."""

    def __init__(self, weight=None, gamma=2.0, reduction='mean'):
        super(FocalLoss, self).__init__()
        self.weight = weight        # optional per-class weights, forwarded to CE
        self.gamma = gamma          # focusing exponent; gamma=0 reduces to plain CE
        self.reduction = reduction  # 'mean' | 'sum' | anything else -> per-sample
        # Per-sample CE so the focal modulation can be applied element-wise.
        self.ce_loss = nn.CrossEntropyLoss(weight=weight, reduction='none')

    def forward(self, inputs, targets):
        """Return the focal loss between logits `inputs` and class indices `targets`."""
        per_sample_ce = self.ce_loss(inputs, targets)
        # p_t is the model's (weighted) probability for the true class.
        p_t = torch.exp(-per_sample_ce)
        modulated = per_sample_ce * (1 - p_t) ** self.gamma

        if self.reduction == 'sum':
            return modulated.sum()
        if self.reduction == 'mean':
            return modulated.mean()
        return modulated
111
+
112
+ class MultiLevelTagClassifier:
113
+ def __init__(self, device='cuda'):
114
+ # Use GPU
115
+ self.device = torch.device(device if torch.cuda.is_available() else 'cpu')
116
+ self.models = {} # Store models for each parent tag
117
+ self.preprocessors = {} # Store preprocessing tools
118
+ self.label_encoders = {} # Store label encoders
119
+
120
+ # Define tag groups
121
+ self.tag_hierarchy = {
122
+ 'DIV': ['DIV', 'LIST', 'CARD'],
123
+ 'P': ['P', 'LI'],
124
+ 'INPUT': ['INPUT', 'DROPDOWN'],
125
+ 'ICON': ['ICON', 'CHECKBOX', 'RADIO'],
126
+ }
127
+ print(f"Using device: {self.device}")
128
+
129
+ def prepare_data_for_subtask(self, df, parent_tag, subtags):
130
+ # Get only the data for this parent tag’s subtags
131
+ filtered_df = df[df['tag'].isin(subtags)].copy()
132
+ print(f"\n=== Preparing data for {parent_tag} sub-classification ===")
133
+ print(f"Subtags: {subtags}")
134
+ print(f"Total samples: {len(filtered_df)}")
135
+ print(f"Distribution: \n{filtered_df['tag'].value_counts()}")
136
+
137
+ if len(filtered_df) == 0:
138
+ print(f"No data found for {parent_tag} subtags!")
139
+ return None, None, None, None, None, None
140
+
141
+ y = filtered_df["tag"] # Target tags
142
+ X = filtered_df.drop(columns=["tag"]) # Features
143
+
144
+ # Define which columns are categories and numerical features
145
+ categorical_cols = ['type', 'prev_sibling_html_tag', 'child_1_html_tag', 'child_2_html_tag', 'parent_tag_html']
146
+ continuous_cols = [col for col in X.columns if col not in categorical_cols]
147
+
148
+ # Add missing columns with default values
149
+ missing_cols = [col for col in categorical_cols + continuous_cols if col not in X.columns]
150
+ if missing_cols:
151
+ print(f"Warning: Missing columns {missing_cols} in data for {parent_tag}")
152
+ for col in missing_cols:
153
+ X[col] = 'unknown' if col in categorical_cols else 0
154
+
155
+ # Process categories
156
+ X[categorical_cols] = X[categorical_cols].astype(str).fillna('unknown')
157
+ ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
158
+ X_cat_encoded = ohe.fit_transform(X[categorical_cols])
159
+
160
+ # Process continous features
161
+ imputer = SimpleImputer(strategy='median')
162
+ X_continuous_imputed = imputer.fit_transform(X[continuous_cols])
163
+ scaler = StandardScaler()
164
+ X_continuous_scaled = scaler.fit_transform(X_continuous_imputed)
165
+ X_processed = np.concatenate([X_cat_encoded, X_continuous_scaled], axis=1)
166
+
167
+ # Encode target tags
168
+ label_encoder = LabelEncoder()
169
+ y_encoded = label_encoder.fit_transform(y)
170
+
171
+ # Boost rare classes by copying them
172
+ class_counts = Counter(y_encoded)
173
+ min_samples_threshold = max(10, len(subtags) * 3)
174
+ rare_classes = [cls for cls, count in class_counts.items() if count < min_samples_threshold]
175
+
176
+ for cls in rare_classes:
177
+ idx = np.where(y_encoded == cls)[0]
178
+ original_class_name = label_encoder.inverse_transform([cls])[0]
179
+ samples_needed = min_samples_threshold - len(idx)
180
+ print(f"Adding {samples_needed} copies to class '{original_class_name}'")
181
+ for _ in range(samples_needed):
182
+ sample_idx = np.random.choice(idx)
183
+ new_sample = X_processed[sample_idx].copy()
184
+ continuous_start = X_cat_encoded.shape[1]
185
+ noise = np.random.normal(0, 0.05, size=X_continuous_scaled.shape[1])
186
+ new_sample[continuous_start:] += noise
187
+ X_processed = np.vstack([X_processed, new_sample])
188
+ y_encoded = np.append(y_encoded, cls)
189
+
190
+ # Bundle up preprocessing models
191
+ preprocessors = {
192
+ 'ohe': ohe,
193
+ 'imputer': imputer,
194
+ 'scaler': scaler,
195
+ 'label_encoder': label_encoder,
196
+ 'categorical_cols': categorical_cols,
197
+ 'continuous_cols': continuous_cols
198
+ }
199
+ return X_processed, y_encoded, preprocessors, categorical_cols, continuous_cols, label_encoder
200
+
201
+ def train_subtask_model(self, X, y, preprocessors, parent_tag, epochs=100):
202
+ # Split data into train, validation, and test sets
203
+ print(f"\n=== Training {parent_tag} sub-classifier ===")
204
+ X_temp, X_test, y_temp, y_test = train_test_split(X, y, test_size=0.15, random_state=42, stratify=y)
205
+ X_train, X_val, y_train, y_val = train_test_split(X_temp, y_temp, test_size=0.15, random_state=42, stratify=y_temp)
206
+ print(f"Training set size: {X_train.shape[0]}")
207
+ print(f"Validation set size: {X_val.shape[0]}")
208
+ print(f"Test set size: {X_test.shape[0]}")
209
+
210
+ # Balance classes
211
+ class_weights = compute_class_weight('balanced', classes=np.unique(y_train), y=y_train)
212
+
213
+ # Turn data into tensors
214
+ X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
215
+ y_train_tensor = torch.tensor(y_train, dtype=torch.long)
216
+ X_val_tensor = torch.tensor(X_val, dtype=torch.float32)
217
+ y_val_tensor = torch.tensor(y_val, dtype=torch.long)
218
+ X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
219
+ y_test_tensor = torch.tensor(y_test, dtype=torch.long)
220
+ class_weights_tensor = torch.tensor(class_weights, dtype=torch.float32).to(self.device)
221
+
222
+ # Set up datasets and loaders
223
+ train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
224
+ val_dataset = TensorDataset(X_val_tensor, y_val_tensor)
225
+ test_dataset = TensorDataset(X_test_tensor, y_test_tensor)
226
+ train_loader = DataLoader(train_dataset, batch_size=256, shuffle=True, num_workers=2)
227
+ val_loader = DataLoader(val_dataset, batch_size=256, shuffle=False, num_workers=2)
228
+ test_loader = DataLoader(test_dataset, batch_size=256, shuffle=False, num_workers=2)
229
+
230
+ # Create and set up the model
231
+ input_size = X_train.shape[1]
232
+ output_size = len(np.unique(y))
233
+ model = ImprovedTagClassifier(input_size, output_size).to(self.device)
234
+ criterion = FocalLoss(weight=class_weights_tensor, gamma=2.0)
235
+ optimizer = optim.AdamW(model.parameters(), lr=0.001, weight_decay=1e-4)
236
+ scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=5, verbose=True)
237
+ scaler = GradScaler()
238
+
239
+ # Training loop
240
+ best_val_loss = float('inf')
241
+ patience = 15
242
+ counter = 0
243
+ train_losses = []
244
+ val_losses = []
245
+ val_accuracies = []
246
+
247
+ for epoch in range(epochs):
248
+ model.train()
249
+ running_loss = 0.0
250
+ for batch_X, batch_y in train_loader:
251
+ batch_X, batch_y = batch_X.to(self.device), batch_y.to(self.device)
252
+ optimizer.zero_grad()
253
+ with autocast(device_type=self.device.type):
254
+ outputs = model(batch_X)
255
+ loss = criterion(outputs, batch_y)
256
+ scaler.scale(loss).backward()
257
+ clip_grad_norm_(model.parameters(), max_norm=1.0)
258
+ scaler.step(optimizer)
259
+ scaler.update()
260
+ running_loss += loss.item()
261
+
262
+ train_loss = running_loss / len(train_loader)
263
+ model.eval()
264
+ val_running_loss = 0.0
265
+ all_preds = []
266
+ all_labels = []
267
+
268
+ with torch.no_grad():
269
+ for batch_X, batch_y in val_loader:
270
+ batch_X, batch_y = batch_X.to(self.device), batch_y.to(self.device)
271
+ with autocast(device_type=self.device.type):
272
+ outputs = model(batch_X)
273
+ loss = criterion(outputs, batch_y)
274
+ val_running_loss += loss.item()
275
+ _, preds = torch.max(outputs, 1)
276
+ all_preds.extend(preds.cpu().numpy())
277
+ all_labels.extend(batch_y.cpu().numpy())
278
+
279
+ val_loss = val_running_loss / len(val_loader)
280
+ val_accuracy = accuracy_score(all_labels, all_preds)
281
+ scheduler.step(val_loss)
282
+
283
+ # Track progress
284
+ train_losses.append(train_loss)
285
+ val_losses.append(val_loss)
286
+ val_accuracies.append(val_accuracy)
287
+ print(f"Epoch [{epoch+1}/{epochs}] - Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}, Val Accuracy: {val_accuracy:.4f}")
288
+
289
+ if val_loss < best_val_loss:
290
+ best_val_loss = val_loss
291
+ counter = 0
292
+ best_model_state = model.state_dict().copy()
293
+ else:
294
+ counter += 1
295
+ if counter >= patience:
296
+ print(f"Early stopping triggered after {epoch+1} epochs")
297
+ break
298
+
299
+ model.load_state_dict(best_model_state)
300
+ model.eval()
301
+ test_preds = []
302
+ test_labels = []
303
+
304
+ with torch.no_grad():
305
+ for batch_X, batch_y in test_loader:
306
+ batch_X, batch_y = batch_X.to(self.device), batch_y.to(self.device)
307
+ outputs = model(batch_X)
308
+ _, preds = torch.max(outputs, 1)
309
+ test_preds.extend(preds.cpu().numpy())
310
+ test_labels.extend(batch_y.cpu().numpy())
311
+
312
+ test_accuracy = accuracy_score(test_labels, test_preds)
313
+ print(f"\n{parent_tag} Model Test Accuracy: {test_accuracy:.4f}")
314
+ print(f"\n{parent_tag} Classification Report:")
315
+ print(classification_report(test_labels, test_preds, target_names=preprocessors['label_encoder'].classes_, zero_division=0))
316
+
317
+ return model, (train_losses, val_losses, val_accuracies), test_accuracy
318
+
319
def train_all_models(self, df_path, epochs=100, model_dir='../models/sub_classifiers'):
    """Train one sub-classifier per parent tag in ``self.tag_hierarchy``.

    Loads the labelled dataset from ``df_path``, applies tag clean-up rules,
    then for each ``(parent_tag, subtags)`` pair prepares the data, trains a
    sub-model, stores the artefacts on the instance, saves a checkpoint and
    plots the training history.

    Args:
        df_path: Path to the CSV file with the labelled elements.
        epochs: Maximum number of training epochs per sub-model.
        model_dir: Directory where checkpoints are written (created if
            missing). Default matches ``load_models``.
    """
    print("Loading and cleaning data...")
    df = pd.read_csv(df_path)

    # SPAN elements drawn as RECTANGLE/GROUP are treated as generic containers.
    rect_or_group = df["type"].isin(["RECTANGLE", "GROUP"])
    df.loc[(df["tag"] == "SPAN") & rect_or_group, "tag"] = "DIV"

    # Hyphenated child tags (custom-element style names) collapse to DIV.
    for col in ('child_1_html_tag', 'child_2_html_tag'):
        df[col] = df[col].apply(lambda x: "DIV" if isinstance(x, str) and '-' in x else x)

    # Normalise every tag column to upper case so category values line up.
    for col in ('tag', 'prev_sibling_html_tag', 'child_1_html_tag', 'child_2_html_tag'):
        df[col] = df[col].str.upper()

    # Make a folder for models
    os.makedirs(model_dir, exist_ok=True)

    # Train a model for each parent tag
    for parent_tag, subtags in self.tag_hierarchy.items():
        print(f"\n{'='*60}")
        print(f"Training {parent_tag} sub-classifier")
        print(f"{'='*60}")

        result = self.prepare_data_for_subtask(df, parent_tag, subtags)
        if result[0] is None:
            print(f"Skipping {parent_tag} due to insufficient data")
            continue
        X, y, preprocessors, cat_cols, cont_cols, label_encoder = result

        model, training_history, test_accuracy = self.train_subtask_model(
            X, y, preprocessors, parent_tag, epochs)

        # Keep trained artefacts on the instance for later prediction.
        self.models[parent_tag] = model
        self.preprocessors[parent_tag] = preprocessors
        self.label_encoders[parent_tag] = label_encoder

        model_path = f'{model_dir}/{parent_tag.lower()}_classifier.pth'
        torch.save({
            'model_state_dict': model.state_dict(),
            'input_size': X.shape[1],
            'output_size': len(np.unique(y)),
            'preprocessors': preprocessors,
            'test_accuracy': test_accuracy,
        }, model_path)
        print(f"Saved {parent_tag} model to {model_path}")

        self.plot_training_history(training_history, parent_tag)
def plot_training_history(self, history, parent_tag, save_dir='../models/sub_classifiers'):
    """Plot and save loss/accuracy curves for one sub-classifier.

    Args:
        history: Tuple ``(train_losses, val_losses, val_accuracies)`` as
            returned by ``train_subtask_model``.
        parent_tag: Parent tag name, used in plot titles and the filename.
        save_dir: Directory the PNG is written to (created if missing).
    """
    train_losses, val_losses, val_accuracies = history

    # Ensure the output directory exists so this also works when called
    # standalone, not only after train_all_models() created it.
    os.makedirs(save_dir, exist_ok=True)

    plt.figure(figsize=(12, 5))

    # Left panel: training vs validation loss.
    plt.subplot(1, 2, 1)
    plt.plot(train_losses, label='Training Loss')
    plt.plot(val_losses, label='Validation Loss')
    plt.title(f'{parent_tag} Model: Loss over epochs')
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.legend()

    # Right panel: validation accuracy.
    plt.subplot(1, 2, 2)
    plt.plot(val_accuracies, label='Validation Accuracy')
    plt.title(f'{parent_tag} Model: Accuracy over epochs')
    plt.xlabel('Epoch')
    plt.ylabel('Accuracy')
    plt.legend()

    plt.tight_layout()
    plt.savefig(f'{save_dir}/{parent_tag.lower()}_training_history.png')
    plt.close()
def load_models(self, model_dir='../models/sub_classifiers'):
    """Restore every saved sub-classifier checkpoint found in *model_dir*.

    For each parent tag in the hierarchy, looks for
    ``<model_dir>/<tag>_classifier.pth``; when present, rebuilds the network,
    loads its weights and registers the model, preprocessors and label
    encoder on the instance. Missing checkpoints are reported, not fatal.
    """
    for parent_tag in self.tag_hierarchy:
        model_path = f'{model_dir}/{parent_tag.lower()}_classifier.pth'
        if not os.path.exists(model_path):
            print(f"Model file {model_path} not found!")
            continue

        print(f"Loading {parent_tag} model from {model_path}")
        # NOTE(review): weights_only=False unpickles arbitrary objects (the
        # sklearn preprocessors live in the checkpoint) — only load trusted files.
        checkpoint = torch.load(model_path, map_location=self.device, weights_only=False)

        net = ImprovedTagClassifier(checkpoint['input_size'], checkpoint['output_size']).to(self.device)
        net.load_state_dict(checkpoint['model_state_dict'])
        net.eval()

        self.models[parent_tag] = net
        self.preprocessors[parent_tag] = checkpoint['preprocessors']
        self.label_encoders[parent_tag] = checkpoint['preprocessors']['label_encoder']
        print(f"Loaded {parent_tag} model (Test Accuracy: {checkpoint['test_accuracy']:.4f})")
def predict_hierarchical(self, sample_data, base_prediction):
    """Refine a base tag prediction with the matching sub-classifier.

    Args:
        sample_data: Mapping of feature name -> value for a single element.
        base_prediction: Tag predicted by the base classifier.

    Returns:
        Tuple ``(tag, confidence)``. When *base_prediction* has no
        sub-classifier (not a parent tag in the hierarchy, or its model was
        never trained/loaded), the base prediction is returned with
        confidence 1.0.
    """
    # Tags outside the hierarchy need no refinement.
    if base_prediction not in self.tag_hierarchy:
        return base_prediction, 1.0
    if base_prediction not in self.models:
        print(f"No sub-classifier found for {base_prediction}")
        return base_prediction, 1.0

    preprocessors = self.preprocessors[base_prediction]
    sample_df = pd.DataFrame([sample_data])
    cat_cols = preprocessors['categorical_cols']
    cont_cols = preprocessors['continuous_cols']

    # Add missing columns so the fitted transformers see the full schema.
    for col in cat_cols + cont_cols:
        if col not in sample_df.columns:
            sample_df[col] = 'unknown' if col in cat_cols else 0

    # BUGFIX: fill missing categoricals BEFORE casting to str. The original
    # astype(str).fillna(...) order converts NaN to the literal string 'nan',
    # so the fillna never fired and 'nan' leaked into the one-hot encoder.
    sample_df[cat_cols] = sample_df[cat_cols].fillna('unknown').astype(str)

    X_cat = preprocessors['ohe'].transform(sample_df[cat_cols])
    X_cont = preprocessors['imputer'].transform(sample_df[cont_cols])
    X_cont = preprocessors['scaler'].transform(X_cont)
    X_processed = np.concatenate([X_cat, X_cont], axis=1)
    X_tensor = torch.tensor(X_processed, dtype=torch.float32).to(self.device)

    model = self.models[base_prediction]
    with torch.no_grad():
        outputs = model(X_tensor)
        probabilities = torch.softmax(outputs, dim=1)
        _, predicted = torch.max(outputs, 1)

    predicted_label = preprocessors['label_encoder'].inverse_transform(
        [predicted.cpu().numpy()[0]])[0]
    confidence = probabilities.max().item()
    return predicted_label, confidence