Spaces:

AOZ2025
/

Semantic-Assigner

Sleeping

App Files Files Community

Semantic-Assigner / Utils /model_utils.py

AOZ2025

Upload 4 files

dd98fd2 verified 8 months ago

raw

history blame contribute delete

20.5 kB

	import torch
	import torch.nn as nn
	from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
	from sklearn.impute import SimpleImputer
	import pandas as pd
	import numpy as np
	import matplotlib.pyplot as plt
	from sklearn.model_selection import train_test_split
	from sklearn.metrics import accuracy_score, classification_report
	from sklearn.utils.class_weight import compute_class_weight
	from torch.amp import autocast, GradScaler
	from torch.utils.data import TensorDataset, DataLoader
	from torch.nn.utils import clip_grad_norm_
	from collections import Counter
	import torch
	import torch.nn as nn
	import os
	import torch.optim as optim

	# Define the ImprovedTagClassifier class for tag prediction
	class ImprovedTagClassifier(nn.Module):
	    """Feed-forward classifier: 512 -> 256 -> 128 hidden units plus a skip path.

	    Each hidden stage is Linear -> BatchNorm1d -> LeakyReLU(0.1) -> Dropout.
	    The output of the first stage is additionally projected to 128 units and
	    added to the third stage's pre-normalisation output. ``forward`` returns
	    raw logits (no softmax).

	    Args:
	        input_size: Number of input features per sample.
	        output_size: Number of classes (width of the logit vector).
	        dropout_rate: Drop probability shared by all three hidden stages.
	    """

	    def __init__(self, input_size, output_size, dropout_rate=0.4):
	        super().__init__()

	        # Attribute names and registration order are kept as-is: they define
	        # the state_dict keys and the order in which weights are initialised.
	        self.fc1 = nn.Linear(input_size, 512)
	        self.bn1 = nn.BatchNorm1d(512)

	        self.fc2 = nn.Linear(512, 256)
	        self.bn2 = nn.BatchNorm1d(256)

	        self.fc3 = nn.Linear(256, 128)
	        self.bn3 = nn.BatchNorm1d(128)

	        # Classification head.
	        self.fc4 = nn.Linear(128, output_size)

	        # Regularisation and activation shared by every hidden stage.
	        self.dropout = nn.Dropout(dropout_rate)
	        self.leaky_relu = nn.LeakyReLU(0.1)

	        # Projection used by the stage-1 -> stage-3 skip connection.
	        self.skip1_3 = nn.Linear(512, 128)

	        self._initialize_weights()

	    def _initialize_weights(self):
	        """Kaiming-normal init for Linear weights; identity init for BatchNorm."""
	        for layer in self.modules():
	            if isinstance(layer, nn.Linear):
	                nn.init.kaiming_normal_(layer.weight, mode='fan_out', nonlinearity='leaky_relu')
	                if layer.bias is not None:
	                    nn.init.constant_(layer.bias, 0)
	                continue
	            if isinstance(layer, nn.BatchNorm1d):
	                nn.init.constant_(layer.weight, 1)
	                nn.init.constant_(layer.bias, 0)

	    def _normalize_activate_drop(self, features, norm):
	        """Apply ``norm``, then LeakyReLU, then dropout to ``features``."""
	        return self.dropout(self.leaky_relu(norm(features)))

	    def forward(self, x):
	        """Return class logits for a batch ``x`` of feature vectors."""
	        stage1 = self._normalize_activate_drop(self.fc1(x), self.bn1)
	        stage2 = self._normalize_activate_drop(self.fc2(stage1), self.bn2)

	        # The skip projection is summed in before normalisation.
	        merged = self.fc3(stage2) + self.skip1_3(stage1)
	        stage3 = self._normalize_activate_drop(merged, self.bn3)

	        return self.fc4(stage3)

	class FocalLoss(nn.Module):
	    """Focal loss for multi-class classification with optional class weights.

	    Per sample the loss is ``(1 - p_t) ** gamma * w_t * CE``, where ``p_t`` is
	    the softmax probability of the target class, ``CE`` is the unweighted
	    cross-entropy and ``w_t`` is the target's class weight (1 when no weights
	    are given).

	    Args:
	        weight: Optional 1-D tensor of per-class weights.
	        gamma: Focusing exponent; larger values down-weight easy samples more.
	        reduction: ``'mean'`` or ``'sum'``; any other value returns the
	            unreduced per-sample losses.
	    """

	    def __init__(self, weight=None, gamma=2.0, reduction='mean'):
	        super().__init__()
	        self.weight = weight
	        self.gamma = gamma
	        self.reduction = reduction
	        # CrossEntropyLoss registers the weight as a buffer, so it follows the
	        # module across devices; it yields the weighted per-sample loss term.
	        self.ce_loss = nn.CrossEntropyLoss(weight=weight, reduction='none')

	    def forward(self, inputs, targets):
	        """Compute the focal loss of logits ``inputs`` against class ids ``targets``."""
	        weighted_ce = self.ce_loss(inputs, targets)

	        # p_t must come from the *unweighted* cross-entropy: exp(-w * CE)
	        # equals p_t ** w, which is not the predicted probability once w != 1.
	        if self.ce_loss.weight is None:
	            plain_ce = weighted_ce
	        else:
	            plain_ce = nn.functional.cross_entropy(inputs, targets, reduction='none')
	        pt = torch.exp(-plain_ce)

	        focal_loss = ((1 - pt) ** self.gamma) * weighted_ce

	        if self.reduction == 'mean':
	            return focal_loss.mean()
	        if self.reduction == 'sum':
	            return focal_loss.sum()
	        return focal_loss

	class MultiLevelTagClassifier:
	def __init__(self, device='cuda'):
	# Use GPU
	self.device = torch.device(device if torch.cuda.is_available() else 'cpu')
	self.models = {} # Store models for each parent tag
	self.preprocessors = {} # Store preprocessing tools
	self.label_encoders = {} # Store label encoders

	# Define tag groups
	self.tag_hierarchy = {
	'DIV': ['DIV', 'LIST', 'CARD'],
	'P': ['P', 'LI'],
	'INPUT': ['INPUT', 'DROPDOWN'],
	'ICON': ['ICON', 'CHECKBOX', 'RADIO'],
	}
	print(f"Using device: {self.device}")

	def prepare_data_for_subtask(self, df, parent_tag, subtags):
	# Get only the data for this parent tag’s subtags
	filtered_df = df[df['tag'].isin(subtags)].copy()
	print(f"\n=== Preparing data for {parent_tag} sub-classification ===")
	print(f"Subtags: {subtags}")
	print(f"Total samples: {len(filtered_df)}")
	print(f"Distribution: \n{filtered_df['tag'].value_counts()}")

	if len(filtered_df) == 0:
	print(f"No data found for {parent_tag} subtags!")
	return None, None, None, None, None, None

	y = filtered_df["tag"] # Target tags
	X = filtered_df.drop(columns=["tag"]) # Features

	# Define which columns are categories and numerical features
	categorical_cols = ['type', 'prev_sibling_html_tag', 'child_1_html_tag', 'child_2_html_tag', 'parent_tag_html']
	continuous_cols = [col for col in X.columns if col not in categorical_cols]

	# Add missing columns with default values
	missing_cols = [col for col in categorical_cols + continuous_cols if col not in X.columns]
	if missing_cols:
	print(f"Warning: Missing columns {missing_cols} in data for {parent_tag}")
	for col in missing_cols:
	X[col] = 'unknown' if col in categorical_cols else 0

	# Process categories
	X[categorical_cols] = X[categorical_cols].astype(str).fillna('unknown')
	ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
	X_cat_encoded = ohe.fit_transform(X[categorical_cols])

	# Process continous features
	imputer = SimpleImputer(strategy='median')
	X_continuous_imputed = imputer.fit_transform(X[continuous_cols])
	scaler = StandardScaler()
	X_continuous_scaled = scaler.fit_transform(X_continuous_imputed)
	X_processed = np.concatenate([X_cat_encoded, X_continuous_scaled], axis=1)

	# Encode target tags
	label_encoder = LabelEncoder()
	y_encoded = label_encoder.fit_transform(y)

	# Boost rare classes by copying them
	class_counts = Counter(y_encoded)
	min_samples_threshold = max(10, len(subtags) * 3)
	rare_classes = [cls for cls, count in class_counts.items() if count < min_samples_threshold]

	for cls in rare_classes:
	idx = np.where(y_encoded == cls)[0]
	original_class_name = label_encoder.inverse_transform([cls])[0]
	samples_needed = min_samples_threshold - len(idx)
	print(f"Adding {samples_needed} copies to class '{original_class_name}'")
	for _ in range(samples_needed):
	sample_idx = np.random.choice(idx)
	new_sample = X_processed[sample_idx].copy()
	continuous_start = X_cat_encoded.shape[1]
	noise = np.random.normal(0, 0.05, size=X_continuous_scaled.shape[1])
	new_sample[continuous_start:] += noise
	X_processed = np.vstack([X_processed, new_sample])
	y_encoded = np.append(y_encoded, cls)

	# Bundle up preprocessing models
	preprocessors = {
	'ohe': ohe,
	'imputer': imputer,
	'scaler': scaler,
	'label_encoder': label_encoder,
	'categorical_cols': categorical_cols,
	'continuous_cols': continuous_cols
	}
	return X_processed, y_encoded, preprocessors, categorical_cols, continuous_cols, label_encoder

	def train_subtask_model(self, X, y, preprocessors, parent_tag, epochs=100):
	# Split data into train, validation, and test sets
	print(f"\n=== Training {parent_tag} sub-classifier ===")
	X_temp, X_test, y_temp, y_test = train_test_split(X, y, test_size=0.15, random_state=42, stratify=y)
	X_train, X_val, y_train, y_val = train_test_split(X_temp, y_temp, test_size=0.15, random_state=42, stratify=y_temp)
	print(f"Training set size: {X_train.shape[0]}")
	print(f"Validation set size: {X_val.shape[0]}")
	print(f"Test set size: {X_test.shape[0]}")

	# Balance classes
	class_weights = compute_class_weight('balanced', classes=np.unique(y_train), y=y_train)

	# Turn data into tensors
	X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
	y_train_tensor = torch.tensor(y_train, dtype=torch.long)
	X_val_tensor = torch.tensor(X_val, dtype=torch.float32)
	y_val_tensor = torch.tensor(y_val, dtype=torch.long)
	X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
	y_test_tensor = torch.tensor(y_test, dtype=torch.long)
	class_weights_tensor = torch.tensor(class_weights, dtype=torch.float32).to(self.device)

	# Set up datasets and loaders
	train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
	val_dataset = TensorDataset(X_val_tensor, y_val_tensor)
	test_dataset = TensorDataset(X_test_tensor, y_test_tensor)
	train_loader = DataLoader(train_dataset, batch_size=256, shuffle=True, num_workers=2)
	val_loader = DataLoader(val_dataset, batch_size=256, shuffle=False, num_workers=2)
	test_loader = DataLoader(test_dataset, batch_size=256, shuffle=False, num_workers=2)

	# Create and set up the model
	input_size = X_train.shape[1]
	output_size = len(np.unique(y))
	model = ImprovedTagClassifier(input_size, output_size).to(self.device)
	criterion = FocalLoss(weight=class_weights_tensor, gamma=2.0)
	optimizer = optim.AdamW(model.parameters(), lr=0.001, weight_decay=1e-4)
	scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=5, verbose=True)
	scaler = GradScaler()

	# Training loop
	best_val_loss = float('inf')
	patience = 15
	counter = 0
	train_losses = []
	val_losses = []
	val_accuracies = []

	for epoch in range(epochs):
	model.train()
	running_loss = 0.0
	for batch_X, batch_y in train_loader:
	batch_X, batch_y = batch_X.to(self.device), batch_y.to(self.device)
	optimizer.zero_grad()
	with autocast(device_type=self.device.type):
	outputs = model(batch_X)
	loss = criterion(outputs, batch_y)
	scaler.scale(loss).backward()
	clip_grad_norm_(model.parameters(), max_norm=1.0)
	scaler.step(optimizer)
	scaler.update()
	running_loss += loss.item()

	train_loss = running_loss / len(train_loader)
	model.eval()
	val_running_loss = 0.0
	all_preds = []
	all_labels = []

	with torch.no_grad():
	for batch_X, batch_y in val_loader:
	batch_X, batch_y = batch_X.to(self.device), batch_y.to(self.device)
	with autocast(device_type=self.device.type):
	outputs = model(batch_X)
	loss = criterion(outputs, batch_y)
	val_running_loss += loss.item()
	_, preds = torch.max(outputs, 1)
	all_preds.extend(preds.cpu().numpy())
	all_labels.extend(batch_y.cpu().numpy())

	val_loss = val_running_loss / len(val_loader)
	val_accuracy = accuracy_score(all_labels, all_preds)
	scheduler.step(val_loss)

	# Track progress
	train_losses.append(train_loss)
	val_losses.append(val_loss)
	val_accuracies.append(val_accuracy)
	print(f"Epoch [{epoch+1}/{epochs}] - Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}, Val Accuracy: {val_accuracy:.4f}")

	if val_loss < best_val_loss:
	best_val_loss = val_loss
	counter = 0
	best_model_state = model.state_dict().copy()
	else:
	counter += 1
	if counter >= patience:
	print(f"Early stopping triggered after {epoch+1} epochs")
	break

	model.load_state_dict(best_model_state)
	model.eval()
	test_preds = []
	test_labels = []

	with torch.no_grad():
	for batch_X, batch_y in test_loader:
	batch_X, batch_y = batch_X.to(self.device), batch_y.to(self.device)
	outputs = model(batch_X)
	_, preds = torch.max(outputs, 1)
	test_preds.extend(preds.cpu().numpy())
	test_labels.extend(batch_y.cpu().numpy())

	test_accuracy = accuracy_score(test_labels, test_preds)
	print(f"\n{parent_tag} Model Test Accuracy: {test_accuracy:.4f}")
	print(f"\n{parent_tag} Classification Report:")
	print(classification_report(test_labels, test_preds, target_names=preprocessors['label_encoder'].classes_, zero_division=0))

	return model, (train_losses, val_losses, val_accuracies), test_accuracy

	def train_all_models(self, df_path, epochs=100):
	# Load and clean the main dataset
	print("Loading and cleaning data...")
	df = pd.read_csv(df_path)
	df.loc[(df["tag"] == "SPAN") & ((df["type"] == "RECTANGLE") \| (df["type"] == "GROUP")), "tag"] = "DIV"
	children_cols = ['child_1_html_tag', 'child_2_html_tag']
	for col in children_cols:
	df[col] = df[col].apply(lambda x: "DIV" if isinstance(x, str) and '-' in x else x)
	for col in ['tag', 'prev_sibling_html_tag', 'child_1_html_tag', 'child_2_html_tag']:
	df[col] = df[col].str.upper()

	# Make a folder for models
	os.makedirs('../models/sub_classifiers', exist_ok=True)

	# Train a model for each parent tag
	for parent_tag, subtags in self.tag_hierarchy.items():
	print(f"\n{'='*60}")
	print(f"Training {parent_tag} sub-classifier")
	print(f"{'='*60}")
	result = self.prepare_data_for_subtask(df, parent_tag, subtags)
	if result[0] is None:
	print(f"Skipping {parent_tag} due to insufficient data")
	continue
	X, y, preprocessors, cat_cols, cont_cols, label_encoder = result
	model, training_history, test_accuracy = self.train_subtask_model(X, y, preprocessors, parent_tag, epochs)
	self.models[parent_tag] = model
	self.preprocessors[parent_tag] = preprocessors
	self.label_encoders[parent_tag] = label_encoder
	model_path = f'../models/sub_classifiers/{parent_tag.lower()}_classifier.pth'
	torch.save({
	'model_state_dict': model.state_dict(),
	'input_size': X.shape[1],
	'output_size': len(np.unique(y)),
	'preprocessors': preprocessors,
	'test_accuracy': test_accuracy
	}, model_path)
	print(f"Saved {parent_tag} model to {model_path}")
	self.plot_training_history(training_history, parent_tag)

	def plot_training_history(self, history, parent_tag):
	# Plot training history (good function naming no need for commenting but here we go)
	train_losses, val_losses, val_accuracies = history
	plt.figure(figsize=(12, 5))
	plt.subplot(1, 2, 1)
	plt.plot(train_losses, label='Training Loss')
	plt.plot(val_losses, label='Validation Loss')
	plt.title(f'{parent_tag} Model: Loss over epochs')
	plt.xlabel('Epoch')
	plt.ylabel('Loss')
	plt.legend()
	plt.subplot(1, 2, 2)
	plt.plot(val_accuracies, label='Validation Accuracy')
	plt.title(f'{parent_tag} Model: Accuracy over epochs')
	plt.xlabel('Epoch')
	plt.ylabel('Accuracy')
	plt.legend()
	plt.tight_layout()
	plt.savefig(f'../models/sub_classifiers/{parent_tag.lower()}_training_history.png')
	plt.close()

	def load_models(self, model_dir='../models/sub_classifiers'):
	# Load saved models
	for parent_tag in self.tag_hierarchy.keys():
	model_path = f'{model_dir}/{parent_tag.lower()}_classifier.pth'
	if os.path.exists(model_path):
	print(f"Loading {parent_tag} model from {model_path}")
	checkpoint = torch.load(model_path, map_location=self.device,weights_only=False)
	model = ImprovedTagClassifier(checkpoint['input_size'], checkpoint['output_size']).to(self.device)
	model.load_state_dict(checkpoint['model_state_dict'])
	model.eval()
	self.models[parent_tag] = model
	self.preprocessors[parent_tag] = checkpoint['preprocessors']
	self.label_encoders[parent_tag] = checkpoint['preprocessors']['label_encoder']
	print(f"Loaded {parent_tag} model (Test Accuracy: {checkpoint['test_accuracy']:.4f})")
	else:
	print(f"Model file {model_path} not found!")

	def predict_hierarchical(self, sample_data, base_prediction):
	# Predict a tag using the right sub-classifier
	if base_prediction not in self.tag_hierarchy:
	return base_prediction, 1.0
	if base_prediction not in self.models:
	print(f"No sub-classifier found for {base_prediction}")
	return base_prediction, 1.0
	preprocessors = self.preprocessors[base_prediction]
	sample_df = pd.DataFrame([sample_data])
	cat_cols = preprocessors['categorical_cols']
	cont_cols = preprocessors['continuous_cols']

	# Add missing columns
	for col in cat_cols + cont_cols:
	if col not in sample_df.columns:
	sample_df[col] = 'unknown' if col in cat_cols else 0

	sample_df[cat_cols] = sample_df[cat_cols].astype(str).fillna('unknown')
	X_cat = preprocessors['ohe'].transform(sample_df[cat_cols])
	X_cont = preprocessors['imputer'].transform(sample_df[cont_cols])
	X_cont = preprocessors['scaler'].transform(X_cont)
	X_processed = np.concatenate([X_cat, X_cont], axis=1)
	X_tensor = torch.tensor(X_processed, dtype=torch.float32).to(self.device)

	model = self.models[base_prediction]
	with torch.no_grad():
	outputs = model(X_tensor)
	probabilities = torch.softmax(outputs, dim=1)
	_, predicted = torch.max(outputs, 1)
	predicted_label = preprocessors['label_encoder'].inverse_transform([predicted.cpu().numpy()[0]])[0]
	confidence = probabilities.max().item()
	return predicted_label, confidence