| import gradio as gr | |
| import os | |
| import unicodedata | |
| import logging | |
| import warnings | |
| import torch | |
| import torch.nn as nn | |
| import torch.nn.functional as F | |
| import time | |
| from transformers import get_scheduler, pipeline, AutoTokenizer | |
| from huggingface_hub import HfApi | |
| from tqdm import tqdm | |
| from torch.utils.tensorboard import SummaryWriter | |
| import numpy as np | |
| import psutil | |
| import signal | |
| import sys | |
| import gc | |
| from gradio_huggingfacehub_search import HuggingfaceHubSearch | |
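| # Gradio Space that fuses one or two teacher language models into a student model and then runs | |
| # multi-objective knowledge distillation on CPU before pushing the result to the Hugging Face Hub. | |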
| warnings.filterwarnings("ignore", message="cannot set number of interop threads after parallel work has started or set_num_interop_threads called") | |
| torch.set_num_threads(max(1, (os.cpu_count() or 2) // 2)) | |
| device = torch.device("cpu") | |
| logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') | |
| logger = logging.getLogger(__name__) | |
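| # Global runtime state (loaded pipelines, log buffer, Hub credentials) followed by the distillation configuration. | |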
| MODELS = {} | |
| LOG_TEXT = "" | |
| GRADIO_LOG_OUTPUT = None | |
| HF_API = HfApi() | |
| AUTH_TOKEN = None | |
| USER_NAME = None | |
| SAVE_ON_RESOURCE_TERMINATION = True | |
| CHECKPOINT_DIR = "./fusion_distillation_checkpoints" | |
| TENSORBOARD_LOG_DIR = "./tensorboard_logs" | |
| SAVE_CHECKPOINTS = True | |
| CHECKPOINT_INTERVAL = 1000 | |
| LOG_INTERVAL = 100 | |
| KD_STEPS = 10000 | |
| ACCUMULATION_STEPS = 1 | |
| FREEZE_STUDENT_STEPS = 2000 | |
| ENABLE_MIXED_PRECISION = False | |
| KD_LOSS_FACTOR = 1.0 | |
| CE_LOSS_FACTOR = 0.0 | |
| KD_TEMPERATURE = 2.0 | |
| ENABLE_ATTENTION_KD = True | |
| ENABLE_HIDDEN_STATE_KD = True | |
| ENABLE_INTERMEDIATE_KD = True | |
| ENABLE_LAYER_NORM_KD = True | |
| ENABLE_EMBEDDING_KD = True | |
| ENABLE_PARAMETER_KD = True | |
| ENABLE_ACTIVATION_KD = True | |
| ENABLE_LOGIT_MASKING_KD = True | |
| ENABLE_SPARCITY_REGULARIZATION = True | |
| ENABLE_FEATURE_MAP_KD = True | |
| ENABLE_OUTPUT_LOGIT_KD = True | |
| ENABLE_LAYERWISE_PARAMETER_KD = True | |
| ENABLE_VOCAB_PROJECTION_KD = True | |
| ENABLE_CONTRASTIVE_KD = True | |
| ENABLE_RDROP_KD = True | |
| ENABLE_ADAPTIVE_TEMPERATURE_KD = True | |
| ENABLE_LAYER_WISE_KD = True | |
| ENABLE_ACTIVATION_REGULARIZATION = True | |
| ENABLE_NEURON_SELECTIVITY_KD = True | |
| ENABLE_WEIGHTED_PARAMETER_KD = True | |
| ENABLE_FSP_KD = True | |
| ENABLE_ATTENTION_ALIGNMENT_KD = True | |
| ENABLE_GRAM_MATRIX_KD = True | |
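| # Per-technique loss weights. The output-logit term carries most of the signal (factor 1.0); the | |
| # auxiliary terms use small factors so they regularize rather than dominate the total loss. | |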
| ATTENTION_KD_FACTOR = 0.1 | |
| HIDDEN_STATE_KD_FACTOR = 0.1 | |
| INTERMEDIATE_KD_FACTOR = 0.1 | |
| LAYER_NORM_KD_FACTOR = 0.01 | |
| EMBEDDING_KD_FACTOR = 0.01 | |
| PARAMETER_KD_FACTOR = 0.001 | |
| ACTIVATION_KD_FACTOR = 0.0001 | |
| LOGIT_MASKING_FACTOR = 0.01 | |
| SPARCITY_REGULARIZATION_FACTOR = 1e-5 | |
| FEATURE_MAP_KD_FACTOR = 0.005 | |
| OUTPUT_LOGIT_KD_FACTOR = 1.0 | |
| LAYERWISE_PARAMETER_KD_FACTOR = 0.0005 | |
| VOCAB_PROJECTION_KD_FACTOR = 0.001 | |
| CONTRASTIVE_KD_FACTOR = 0.05 | |
| RDROP_KD_FACTOR = 0.02 | |
| ADAPTIVE_TEMPERATURE_KD_FACTOR = 0.01 | |
| LAYER_WISE_KD_FACTOR = 0.05 | |
| ACTIVATION_REG_LAMBDA = 1e-7 | |
| NEURON_SELECTIVITY_KD_FACTOR = 0.001 | |
| WEIGHTED_PARAMETER_KD_FACTOR = 0.0002 | |
| FSP_KD_FACTOR = 0.001 | |
| ATTENTION_ALIGNMENT_KD_FACTOR = 0.01 | |
| GRAM_MATRIX_KD_FACTOR = 0.0005 | |
| ATTENTION_KD_LOSS_TYPE = 'mse' | |
| HIDDEN_STATE_KD_LOSS_TYPE = 'mse' | |
| OUTPUT_LOGIT_KD_LOSS_TYPE = 'kl' | |
| CONTRASTIVE_KD_LOSS_TYPE = 'cosine' | |
| RDROP_KD_LOSS_TYPE = 'kl' | |
| LAYER_WISE_LOSS_TYPE = 'mse' | |
| NEURON_SELECTIVITY_LOSS_TYPE = 'mse' | |
| WEIGHTED_PARAMETER_LOSS_TYPE = 'mse' | |
| FSP_KD_LOSS_TYPE = 'mse' | |
| ATTENTION_ALIGNMENT_LOSS_TYPE = 'mse' | |
| GRAM_MATRIX_LOSS_TYPE = 'mse' | |
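| # Module selectors, matched against module class names or attribute paths. The defaults assume a | |
| # GPT-2-style layout (transformer.h, lm_head); adjust them for other architectures. Some selectors | |
| # (FSP, Gram-matrix, attention-alignment, weighted-parameter) are configured here but not yet | |
| # consumed by the training loop below. | |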
| INTERMEDIATE_LAYERS = [2, 5, 8] | |
| LAYER_NORM_MODULES = ['LayerNorm'] | |
| ACTIVATION_MODULES = ['Linear'] | |
| FEATURE_MAP_MODULES = ['Linear'] | |
| LAYERWISE_PARAMETER_MODULES = ['transformer.h'] | |
| VOCAB_PROJECTION_MODULES = ['lm_head'] | |
| LAYER_WISE_MODULES = ['transformer.h'] | |
| NEURON_SELECTIVITY_MODULES = ['Linear'] | |
| WEIGHTED_PARAMETER_MODULES = ['Linear'] | |
| FSP_MODULES = ['Linear'] | |
| ATTENTION_ALIGNMENT_MODULES = ['SelfAttention'] | |
| GRAM_MATRIX_MODULES = ['Linear'] | |
| ADAPTIVE_TEMPERATURE_INITIAL = 2.0 | |
| ADAPTIVE_TEMPERATURE_DECAY_RATE = 0.999 | |
| LR_INITIAL = 5e-5 | |
| WEIGHT_DECAY = 1e-4 | |
| SCHEDULER_TYPE = "linear" | |
| WARMUP_STEPS = 500 | |
| ENABLE_LR_SCHEDULER = True | |
| ENABLE_STUDENT_PARAMETER_FREEZE = True | |
| ENABLE_DYNAMIC_FREEZE = False | |
| FREEZE_THRESHOLD = 0.1 | |
| ENABLE_GRADIENT_CLIPPING = True | |
| GRAD_CLIP_VALUE = 1.0 | |
| ENABLE_EARLY_STOPPING = True | |
| EARLY_STOPPING_PATIENCE = 5 | |
| ENABLE_PARAMETER_COUNT_CHECK = False | |
| ENABLE_EMBEDDING_NOISE = True | |
| EMBEDDING_NOISE_STD = 1e-4 | |
| ENABLE_L2_REGULARIZATION = False | |
| L2_LAMBDA = 1e-6 | |
| ENABLE_L1_REGULARIZATION = False | |
| L1_LAMBDA = 1e-6 | |
| def cleanup_and_exit(signal_code): | |
| log_message("Initiating cleanup...", level="info") | |
| if MODELS.get('student') and SAVE_ON_RESOURCE_TERMINATION: | |
| try: | |
| student_model = MODELS['student'] | |
| os.makedirs(CHECKPOINT_DIR, exist_ok=True) | |
| checkpoint_path = os.path.join(CHECKPOINT_DIR, "student_model_terminated.pt") | |
| save_checkpoint(student_model, checkpoint_path) | |
| log_message(f"Model checkpoint saved to {checkpoint_path} due to termination signal.", level="info") | |
| except Exception as e: | |
| log_message(f"Error during checkpoint save on cleanup: {e}", level="error") | |
| if MODELS.get('writer'): | |
| try: | |
| MODELS['writer'].close() | |
| log_message("TensorBoard writer closed.", level="info") | |
| except Exception as e: | |
| log_message(f"Error closing TensorBoard writer during cleanup: {e}", level="error") | |
| log_message(f"Cleanup completed.", level="info") | |
| def signal_handler(sig, frame): | |
| log_message(f"Received signal {sig}. Continuing process...", level="warning") | |
| signal.signal(signal.SIGTERM, signal_handler) | |
| signal.signal(signal.SIGINT, signal_handler) | |
| def log_message(message, level="info"): | |
| global LOG_TEXT, GRADIO_LOG_OUTPUT | |
| log_str = f"{message}\n" | |
| LOG_TEXT += log_str | |
| if GRADIO_LOG_OUTPUT: | |
| GRADIO_LOG_OUTPUT.value = LOG_TEXT | |
| if level == "info": | |
| logger.info(message) | |
| elif level == "warning": | |
| logger.warning(message) | |
| elif level == "error": | |
| logger.error(message) | |
| tqdm.write(message) | |
| def log_to_file(message, log_file="training_log.txt"): | |
| try: | |
| with open(log_file, "a") as f: | |
| f.write(f"{message}\n") | |
| except Exception as e: | |
| log_message(f"Error writing to log file: {e}", level="warning") | |
| def unify_parameters(student_model, teacher_model, exclude_layers=None): | |
| try: | |
| teacher_state = teacher_model.model.state_dict() | |
| student_state = student_model.model.state_dict() | |
| excluded_names = exclude_layers or [] | |
| for name, param in student_state.items(): | |
| if any(excluded_name in name for excluded_name in excluded_names): | |
| continue | |
| if name in teacher_state: | |
| try: | |
| if student_state[name].shape == teacher_state[name].shape: | |
| student_state[name].copy_(teacher_state[name]) | |
| else: | |
| min_shape = [min(s, t) for s, t in zip(student_state[name].shape, teacher_state[name].shape)] | |
| student_slice = tuple([slice(0, s) for s in min_shape]) | |
| teacher_slice = tuple([slice(0, s) for s in min_shape]) | |
| student_state[name][student_slice].copy_(teacher_state[name][teacher_slice]) | |
| except Exception as e: | |
| log_message(f"Parameter copy error for {name}: {e}", level="warning") | |
| student_model.model.load_state_dict(student_state, strict=False) | |
| except Exception as e: | |
| log_message(f"Error in unify_parameters: {e}", level="warning") | |
| def unify_embeddings(student_model, teacher_model, project_embeddings=True, mean_resizing=False): | |
| try: | |
| if hasattr(student_model.model, "get_input_embeddings") and hasattr(teacher_model.model, "get_input_embeddings"): | |
| student_emb = student_model.model.get_input_embeddings() | |
| teacher_emb = teacher_model.model.get_input_embeddings() | |
| if project_embeddings: | |
| in_dim = teacher_emb.weight.shape[1] | |
| out_dim = student_emb.weight.shape[1] | |
| projection = nn.Linear(in_dim, out_dim).to(device) | |
| teacher_emb_projected = projection(teacher_emb.weight) | |
| student_vocab_size = student_emb.weight.shape[0] | |
| teacher_vocab_size_proj = teacher_emb_projected.shape[0] | |
| if mean_resizing and student_vocab_size < teacher_vocab_size_proj: | |
| try: | |
| student_model.model.resize_token_embeddings(teacher_vocab_size_proj) | |
| student_emb = student_model.model.get_input_embeddings() | |
| except Exception as e: | |
| log_message(f"Error resizing student embeddings: {e}", level="warning") | |
| min_vocab_size = min(student_emb.weight.shape[0], teacher_vocab_size_proj) | |
| try: | |
| student_emb.weight.data[:min_vocab_size].copy_(teacher_emb_projected.data[:min_vocab_size]) | |
| except Exception as e: | |
| log_message(f"Error copying projected embeddings: {e}", level="warning") | |
| else: | |
| min_vocab_size = min(student_emb.weight.shape[0], teacher_emb.weight.shape[0]) | |
| try: | |
| student_emb.weight.data[:min_vocab_size].copy_(teacher_emb.weight.data[:min_vocab_size]) | |
| except Exception as e: | |
| log_message(f"Error copying embeddings: {e}", level="warning") | |
| if hasattr(student_model.model, "get_output_embeddings") and hasattr(teacher_model.model, "get_output_embeddings"): | |
| student_out_emb = student_model.model.get_output_embeddings() | |
| teacher_out_emb = teacher_model.model.get_output_embeddings() | |
| if student_out_emb is not None and teacher_out_emb is not None: | |
| if project_embeddings: | |
| in_dim = teacher_out_emb.weight.shape[1] | |
| out_dim = student_out_emb.weight.shape[1] | |
| projection = nn.Linear(in_dim, out_dim).to(device) | |
| teacher_out_emb_projected = projection(teacher_out_emb.weight) | |
| student_vocab_size = student_out_emb.weight.shape[0] | |
| teacher_vocab_size_proj = teacher_out_emb_projected.shape[0] | |
| if mean_resizing and student_vocab_size < teacher_vocab_size_proj: | |
| try: | |
| student_model.model.resize_token_embeddings(teacher_vocab_size_proj) | |
| student_out_emb = student_model.model.get_output_embeddings() | |
| except Exception as e: | |
| log_message(f"Error resizing student output embeddings: {e}", level="warning") | |
| min_vocab_size = min(student_out_emb.weight.shape[0], teacher_vocab_size_proj) | |
| try: | |
| student_out_emb.weight.data[:min_vocab_size].copy_(teacher_out_emb_projected.data[:min_vocab_size]) | |
| except Exception as e: | |
| log_message(f"Error copying projected output embeddings: {e}", level="warning") | |
| else: | |
| min_vocab_size = min(student_out_emb.weight.shape[0], teacher_out_emb.weight.shape[0]) | |
| try: | |
| student_out_emb.weight.data[:min_vocab_size].copy_(teacher_out_emb.weight.data[:min_vocab_size]) | |
| except Exception as e: | |
| log_message(f"Error copying output embeddings: {e}", level="warning") | |
| except Exception as e: | |
| log_message(f"Error in unify_embeddings: {e}", level="warning") | |
| def unify_tokenizers(student_tokenizer, teacher_tokenizer, student_model): | |
| if teacher_tokenizer is None: | |
| return student_tokenizer | |
| try: | |
| teacher_vocab = teacher_tokenizer.get_vocab() | |
| student_vocab = student_tokenizer.get_vocab() | |
| new_tokens = [token for token in teacher_vocab if token not in student_vocab] | |
| if new_tokens: | |
| student_tokenizer.add_tokens(new_tokens) | |
| student_model.model.resize_token_embeddings(len(student_tokenizer)) | |
| except Exception as e: | |
| log_message(f"Tokenizer unification error: {e}", level="warning") | |
| return student_tokenizer | |
| def normalize_text(text): | |
| return unicodedata.normalize('NFKC', text) | |
| def generate_predictions(model, tokenizer, texts, max_length=150, **tokenizer_kwargs): | |
| try: | |
| inputs = tokenizer(texts, return_tensors="pt", truncation=True, padding=True, max_length=max_length, **tokenizer_kwargs).to(device) | |
| outputs = model.model.generate(**inputs) | |
| return list(map(lambda output: tokenizer.decode(output, skip_special_tokens=True), outputs)) | |
| except Exception as e: | |
| log_message(f"Prediction generation error: {e}", level="warning") | |
| return [""] * len(texts) | |
| fusion_functions = { | |
| 'geometric_mean_double': lambda teacher_states, layer_scale, key, device: geometric_mean_fusion_double(teacher_states, device, len(teacher_states), layer_scale, key), | |
| } | |
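| # Signed geometric mean across teachers, computed elementwise over the overlapping leading dimension: | |
| # sign(prod_i x_i) * exp(mean_i log|x_i|). | |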
| def geometric_mean_fusion_double(states, device, num_teachers, layer_scale, key): | |
| min_len = min(state[key].shape[0] for state in states) | |
| arrays = [state[key][:min_len].detach().cpu().numpy() for state in states] | |
| # np.abs/np.sign operate on the NumPy arrays; a tiny epsilon guards log(0), and the sign product is taken elementwise. | |
| log_abs_sum = sum(np.log(np.abs(arr) + 1e-12) for arr in arrays) | |
| sign_prod = np.prod(np.stack([np.sign(arr) for arr in arrays]), axis=0) | |
| geometric_mean_val = torch.tensor(np.exp(log_abs_sum / num_teachers) * sign_prod, device=device) * layer_scale | |
| return geometric_mean_val | |
| def complete_unify_teacher_models_double(teacher_models, fusion_method='geometric_mean_double', layer_scale=1.0, layer_weights=None, device=device): | |
| teacher_states = [teacher.model.state_dict() for teacher in teacher_models] | |
| unified_state = {} | |
| first_teacher_state = teacher_states[0] | |
| for key in first_teacher_state.keys(): | |
| teacher_layer_states = [] | |
| all_teachers_have_layer = True | |
| for state in teacher_states: | |
| if key in state: | |
| teacher_layer_states.append(state) | |
| else: | |
| all_teachers_have_layer = False | |
| break | |
| if all_teachers_have_layer: | |
| if fusion_method in fusion_functions: | |
| unified_state[key] = fusion_functions[fusion_method](teacher_layer_states, layer_scale, key, device=device) | |
| else: | |
| layer_sum = torch.stack([state[key] for state in teacher_layer_states]).sum(dim=0) | |
| unified_state[key] = layer_sum / len(teacher_models) | |
| else: | |
| unified_state[key] = first_teacher_state[key] | |
| return unified_state | |
| def unify_teacher_into_student(unified_teacher_state, student, force_parameter_copy=False): | |
| student_state = student.model.state_dict() | |
| new_state = {} | |
| for key, student_value in student_state.items(): | |
| if key in unified_teacher_state: | |
| teacher_value = unified_teacher_state[key] | |
| try: | |
| if student_value.shape == teacher_value.shape: | |
| new_state[key] = teacher_value | |
| else: | |
| min_shape = [min(s, t) for s, t in zip(student_value.shape, teacher_value.shape)] | |
| student_slice = tuple([slice(0, s) for s in min_shape]) | |
| teacher_slice = tuple([slice(0, s) for s in min_shape]) | |
| new_state[key] = student_value.clone() | |
| new_state[key][student_slice] = teacher_value[teacher_slice] | |
| except Exception as e: | |
| log_message(f"Parameter assignment error for {key}: {e}", level="warning") | |
| else: | |
| new_state[key] = student_value | |
| student.model.load_state_dict(new_state, strict=False) | |
| return student | |
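| # fuse_tokenizers builds the union of all teacher vocabularies on top of the student tokenizer; tokens | |
| # added here receive freshly initialized embedding rows when the student's embeddings are resized afterwards. | |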
| def fuse_tokenizers(teacher_tokenizers, student_tokenizer): | |
| unified_vocab = set(student_tokenizer.get_vocab().keys()) if student_tokenizer else set() | |
| for teacher_tokenizer in teacher_tokenizers: | |
| if teacher_tokenizer: | |
| teacher_vocab = set(teacher_tokenizer.get_vocab().keys()) | |
| unified_vocab = unified_vocab.union(teacher_vocab) | |
| if student_tokenizer: | |
| try: | |
| student_tokenizer.add_tokens(list(unified_vocab - set(student_tokenizer.get_vocab().keys()))) | |
| except Exception as e: | |
| log_message(f"Error fusing tokenizers: {e}", level="warning") | |
| return student_tokenizer | |
| def update_student_embeddings_double(student, teacher_models, fusion_type='geometric_mean_double', device=device, mean_resizing=False): | |
| emb_student = student.model.get_input_embeddings().weight.data | |
| student_vocab_size, student_emb_dim = emb_student.shape | |
| teacher_embeddings_proj = [] | |
| min_teacher_vocab_size = float('inf') | |
| for teacher in teacher_models: | |
| emb_teacher = teacher.model.get_input_embeddings().weight.data.detach().cpu() | |
| teacher_vocab_size, teacher_emb_dim = emb_teacher.shape | |
| min_teacher_vocab_size = min(min_teacher_vocab_size, teacher_vocab_size) | |
| if teacher_emb_dim != student_emb_dim: | |
| proj = nn.Linear(teacher_emb_dim, student_emb_dim, bias=False).to(device) | |
| emb_teacher_proj = proj(emb_teacher.to(device)).cpu() | |
| else: | |
| emb_teacher_proj = emb_teacher | |
| teacher_embeddings_proj.append(emb_teacher_proj) | |
| min_vocab = min(min_teacher_vocab_size, student_vocab_size) | |
| fusion_function = fusion_functions.get(fusion_type, fusion_functions['geometric_mean_double']) | |
| teacher_states = [{'embedding': emb} for emb in teacher_embeddings_proj] | |
| emb_updated = fusion_function(teacher_states, 1.0, 'embedding', device=device)[:min_vocab] | |
| if ENABLE_EMBEDDING_NOISE: | |
| noise = torch.randn_like(emb_updated) * EMBEDDING_NOISE_STD | |
| emb_updated = emb_updated + noise | |
| try: | |
| student.model.get_input_embeddings().weight.data[:min_vocab].copy_(emb_updated.to(device)) | |
| except Exception as e: | |
| log_message(f"Error updating student embeddings: {e}", level="warning") | |
| return student | |
| def save_checkpoint(model, checkpoint_path): | |
| try: | |
| torch.save(model.model.state_dict(), checkpoint_path) | |
| log_message(f"Checkpoint saved to: {checkpoint_path}", level="info") | |
| except Exception as e: | |
| log_message(f"Error saving checkpoint: {e}", level="warning") | |
| kd_loss_functions = { | |
| 'mse': F.mse_loss, | |
| 'kl': lambda student, teacher: F.kl_div(F.log_softmax(student, dim=-1), F.log_softmax(teacher.detach(), dim=-1), reduction='batchmean', log_target=True), | |
| 'cosine': lambda student, teacher: 1.0 - F.cosine_similarity(student.view(-1, student.size(-1)), teacher.detach().view(-1, teacher.size(-1))).mean(), | |
| } | |
| class ActivationSaver(nn.Module): | |
| # Wraps a module and records its most recent output so the activation / feature-map KD terms can read it back. | |
| def __init__(self, module): | |
| super().__init__() | |
| self.module = module | |
| self.activation_values = None | |
| self.feature_map_values = None | |
| def forward(self, *args, **kwargs): | |
| output = self.module(*args, **kwargs) | |
| if isinstance(output, tuple): | |
| self.activation_values = output[0] | |
| self.feature_map_values = output[1] if len(output) > 1 else None | |
| else: | |
| self.activation_values = output | |
| return output | |
| def attach_activation_saver(model, activation_modules, feature_map_modules): | |
| # `model` is the raw HF model. Dotted names from named_modules() must be resolved to their parent module, | |
| # otherwise setattr would only add a stray top-level attribute instead of replacing the child. Note that | |
| # wrapping nests the original weights under a `.module` prefix in the state_dict (applied to student and | |
| # teachers alike), an accepted trade-off of this simple wrapper. | |
| targets = [(name, module) for name, module in model.named_modules() if type(module).__name__ in activation_modules or type(module).__name__ in feature_map_modules] | |
| for name, module in targets: | |
| parent = model.get_submodule(name.rsplit(".", 1)[0]) if "." in name else model | |
| setattr(parent, name.rsplit(".", 1)[-1], ActivationSaver(module)) | |
| return model | |
| def attention_kd_loss(student_attention, teacher_attention, loss_type='mse'): | |
| try: | |
| if loss_type == 'mse': | |
| return F.mse_loss(student_attention, teacher_attention.detach()) | |
| elif loss_type == 'kl': | |
| return F.kl_div(F.log_softmax(student_attention, dim=-1), F.log_softmax(teacher_attention.detach(), dim=-1), reduction='batchmean', log_target=True) | |
| elif loss_type == 'cosine': | |
| return 1.0 - F.cosine_similarity(student_attention.view(-1), teacher_attention.detach().view(-1)).mean() | |
| except Exception as e: | |
| log_message(f"Attention KD Loss error: {e}", level="warning") | |
| return torch.tensor(0.0, device=device) | |
| def hidden_state_kd_loss(student_hidden, teacher_hidden, loss_type='mse'): | |
| try: | |
| if loss_type == 'mse': | |
| return F.mse_loss(student_hidden, teacher_hidden.detach()) | |
| elif loss_type == 'kl': | |
| return F.kl_div(F.log_softmax(student_hidden, dim=-1), F.log_softmax(teacher_hidden.detach(), dim=-1), reduction='batchmean', log_target=True) | |
| elif loss_type == 'cosine': | |
| return 1.0 - F.cosine_similarity(student_hidden.view(-1), teacher_hidden.detach().view(-1)).mean() | |
| except Exception as e: | |
| log_message(f"Hidden State KD Loss error: {e}", level="warning") | |
| return torch.tensor(0.0, device=device) | |
| def advanced_knowledge_distillation(teacher_models, student_model, device): | |
| for teacher_model in teacher_models: | |
| teacher_model.model.eval() | |
| student_model.model.train() | |
| total_steps = KD_STEPS | |
| accumulation_steps = ACCUMULATION_STEPS | |
| freeze_steps = FREEZE_STUDENT_STEPS | |
| use_mixed_precision = ENABLE_MIXED_PRECISION | |
| kd_loss_factor = KD_LOSS_FACTOR | |
| ce_loss_factor = CE_LOSS_FACTOR | |
| kd_temperature = KD_TEMPERATURE | |
| attention_kd_enabled = ENABLE_ATTENTION_KD | |
| hidden_kd_enabled = ENABLE_HIDDEN_STATE_KD | |
| intermediate_kd_enabled = ENABLE_INTERMEDIATE_KD | |
| layer_norm_kd_enabled = ENABLE_LAYER_NORM_KD | |
| embedding_kd_enabled = ENABLE_EMBEDDING_KD | |
| parameter_kd_enabled = ENABLE_PARAMETER_KD | |
| activation_kd_enabled = ENABLE_ACTIVATION_KD | |
| logit_masking_enabled = ENABLE_LOGIT_MASKING_KD | |
| sparsity_regularization_enabled = ENABLE_SPARCITY_REGULARIZATION | |
| feature_map_kd_enabled = ENABLE_FEATURE_MAP_KD | |
| output_logit_kd_enabled = ENABLE_OUTPUT_LOGIT_KD | |
| layerwise_parameter_kd_enabled = ENABLE_LAYERWISE_PARAMETER_KD | |
| vocab_projection_kd_enabled = ENABLE_VOCAB_PROJECTION_KD | |
| contrastive_kd_enabled = ENABLE_CONTRASTIVE_KD | |
| rdrop_kd_enabled = ENABLE_RDROP_KD | |
| adaptive_temperature_kd_enabled = ENABLE_ADAPTIVE_TEMPERATURE_KD | |
| layer_wise_kd_enabled = ENABLE_LAYER_WISE_KD | |
| activation_reg_enabled = ENABLE_ACTIVATION_REGULARIZATION | |
| neuron_selectivity_kd_enabled = ENABLE_NEURON_SELECTIVITY_KD | |
| weighted_parameter_kd_enabled = ENABLE_WEIGHTED_PARAMETER_KD | |
| fsp_kd_enabled = ENABLE_FSP_KD | |
| attention_alignment_kd_enabled = ENABLE_ATTENTION_ALIGNMENT_KD | |
| gram_matrix_kd_enabled = ENABLE_GRAM_MATRIX_KD | |
| attention_kd_factor = ATTENTION_KD_FACTOR | |
| hidden_kd_factor = HIDDEN_STATE_KD_FACTOR | |
| intermediate_kd_factor = INTERMEDIATE_KD_FACTOR | |
| layer_norm_kd_factor = LAYER_NORM_KD_FACTOR | |
| embedding_kd_factor = EMBEDDING_KD_FACTOR | |
| parameter_kd_factor = PARAMETER_KD_FACTOR | |
| activation_kd_factor = ACTIVATION_KD_FACTOR | |
| logit_masking_factor = LOGIT_MASKING_FACTOR | |
| sparsity_regularization_factor = SPARCITY_REGULARIZATION_FACTOR | |
| feature_map_kd_factor = FEATURE_MAP_KD_FACTOR | |
| output_logit_kd_factor = OUTPUT_LOGIT_KD_FACTOR | |
| layerwise_parameter_kd_factor = LAYERWISE_PARAMETER_KD_FACTOR | |
| vocab_projection_kd_factor = VOCAB_PROJECTION_KD_FACTOR | |
| contrastive_kd_factor = CONTRASTIVE_KD_FACTOR | |
| rdrop_kd_factor = RDROP_KD_FACTOR | |
| adaptive_temperature_kd_factor = ADAPTIVE_TEMPERATURE_KD_FACTOR | |
| layer_wise_kd_factor = LAYER_WISE_KD_FACTOR | |
| activation_reg_lambda = ACTIVATION_REG_LAMBDA | |
| neuron_selectivity_kd_factor = NEURON_SELECTIVITY_KD_FACTOR | |
| weighted_parameter_kd_factor = WEIGHTED_PARAMETER_KD_FACTOR | |
| fsp_kd_factor = FSP_KD_FACTOR | |
| attention_alignment_kd_factor = ATTENTION_ALIGNMENT_KD_FACTOR | |
| gram_matrix_kd_factor = GRAM_MATRIX_KD_FACTOR | |
| attention_kd_loss_type = ATTENTION_KD_LOSS_TYPE | |
| hidden_kd_loss_type = HIDDEN_STATE_KD_LOSS_TYPE | |
| output_logit_kd_loss_type = OUTPUT_LOGIT_KD_LOSS_TYPE | |
| contrastive_kd_loss_type = CONTRASTIVE_KD_LOSS_TYPE | |
| rdrop_kd_loss_type = RDROP_KD_LOSS_TYPE | |
| layer_wise_loss_type = LAYER_WISE_LOSS_TYPE | |
| neuron_selectivity_loss_type = NEURON_SELECTIVITY_LOSS_TYPE | |
| weighted_parameter_loss_type = WEIGHTED_PARAMETER_LOSS_TYPE | |
| fsp_kd_loss_type = FSP_KD_LOSS_TYPE | |
| attention_alignment_loss_type = ATTENTION_ALIGNMENT_LOSS_TYPE | |
| gram_matrix_loss_type = GRAM_MATRIX_LOSS_TYPE | |
| intermediate_layers = INTERMEDIATE_LAYERS | |
| layer_norm_modules = LAYER_NORM_MODULES | |
| activation_modules = ACTIVATION_MODULES | |
| feature_map_modules = FEATURE_MAP_MODULES | |
| layerwise_parameter_modules = LAYERWISE_PARAMETER_MODULES | |
| vocab_projection_modules = VOCAB_PROJECTION_MODULES | |
| layer_wise_modules = LAYER_WISE_MODULES | |
| neuron_selectivity_modules = NEURON_SELECTIVITY_MODULES | |
| weighted_parameter_modules = WEIGHTED_PARAMETER_MODULES | |
| fsp_modules = FSP_MODULES | |
| attention_alignment_modules = ATTENTION_ALIGNMENT_MODULES | |
| gram_matrix_modules = GRAM_MATRIX_MODULES | |
| adaptive_temperature_initial = ADAPTIVE_TEMPERATURE_INITIAL | |
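| # The distillation temperature decays multiplicatively every step and is clamped at a floor of 1.0 further below. | |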
| adaptive_temperature_decay_rate = ADAPTIVE_TEMPERATURE_DECAY_RATE | |
| adaptive_temperature = torch.tensor(adaptive_temperature_initial, device=device, requires_grad=False) | |
| student_model.model = attach_activation_saver(student_model.model, activation_modules, feature_map_modules) | |
| for teacher_model in teacher_models: | |
| teacher_model.model = attach_activation_saver(teacher_model.model, activation_modules, feature_map_modules) | |
| optimizer = torch.optim.AdamW( | |
| student_model.model.parameters(), | |
| lr=LR_INITIAL, | |
| weight_decay=WEIGHT_DECAY | |
| ) | |
| scheduler = get_scheduler( | |
| name=SCHEDULER_TYPE, | |
| optimizer=optimizer, | |
| num_warmup_steps=WARMUP_STEPS, | |
| num_training_steps=total_steps | |
| ) if ENABLE_LR_SCHEDULER else None | |
| scaler = torch.amp.GradScaler(device.type, enabled=use_mixed_precision) | |
| if ENABLE_STUDENT_PARAMETER_FREEZE: | |
| for param in student_model.model.parameters(): | |
| param.requires_grad = False | |
| log_to_file(f"Starting KD: freezing for {freeze_steps} steps.") | |
| else: | |
| log_to_file("KD started without initial freezing.") | |
| writer = SummaryWriter(log_dir=TENSORBOARD_LOG_DIR) | |
| MODELS['writer'] = writer | |
| best_loss = float('inf') | |
| patience_counter = 0 | |
| start_time = time.time() | |
| progress_bar = tqdm(range(KD_STEPS), desc="Knowledge Distillation Progress") | |
| for step in progress_bar: | |
| try: | |
| optimizer.zero_grad() | |
| loss_accum = 0.0 | |
| gc.collect() | |
| torch.cuda.empty_cache() | |
| for _ in range(ACCUMULATION_STEPS): | |
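| # NOTE: placeholder batch. These two hard-coded sentences stand in for a real training dataloader; | |
| # plug in your own corpus here for meaningful distillation. | |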
| batch_texts = ["This is a sample text for distillation.", "Another example sentence."] | |
| try: | |
| with torch.amp.autocast(device_type=device.type, enabled=ENABLE_MIXED_PRECISION): | |
| teacher_outputs_list = [] | |
| teacher_inputs_list = [] | |
| for teacher_model in MODELS['teacher']: | |
| with torch.no_grad(): | |
| teacher_inputs = teacher_model.tokenizer( | |
| batch_texts, | |
| return_tensors="pt", | |
| padding=True, | |
| truncation=True, | |
| max_length=128, | |
| ).to(device) | |
| teacher_inputs_list.append(teacher_inputs) | |
| teacher_outputs = teacher_model.model(**teacher_inputs, output_attentions=ENABLE_ATTENTION_KD or ENABLE_ATTENTION_ALIGNMENT_KD, output_hidden_states=ENABLE_HIDDEN_STATE_KD or ENABLE_INTERMEDIATE_KD or ENABLE_LAYER_WISE_KD) | |
| teacher_outputs_list.append(teacher_outputs) | |
| student_inputs = MODELS['student'].tokenizer( | |
| batch_texts, | |
| return_tensors="pt", | |
| padding=True, | |
| truncation=True, | |
| max_length=128, | |
| ).to(device) | |
| student_outputs = MODELS['student'].model(**student_inputs, output_attentions=ENABLE_ATTENTION_KD or ENABLE_ATTENTION_ALIGNMENT_KD, output_hidden_states=ENABLE_HIDDEN_STATE_KD or ENABLE_INTERMEDIATE_KD or ENABLE_LAYER_WISE_KD) | |
| student_logits = student_outputs.logits | |
| student_attentions = student_outputs.attentions if ENABLE_ATTENTION_KD or ENABLE_ATTENTION_ALIGNMENT_KD else None | |
| student_hidden_states = student_outputs.hidden_states if ENABLE_HIDDEN_STATE_KD or ENABLE_INTERMEDIATE_KD or ENABLE_LAYER_WISE_KD else None | |
| teacher_logits = torch.stack([teacher.logits for teacher in teacher_outputs_list]).mean(dim=0) | |
| teacher_attentions = [teacher.attentions for teacher in teacher_outputs_list] | |
| teacher_attentions = torch.stack([torch.stack(attn) for attn in teacher_attentions]).mean(dim=0) if teacher_attentions[0] is not None else None | |
| teacher_hidden_states = [teacher.hidden_states for teacher in teacher_outputs_list] | |
| teacher_hidden_states = torch.stack([torch.stack(hiddens) for hiddens in teacher_hidden_states]).mean(dim=0) if teacher_hidden_states[0] is not None else None | |
| min_vocab_size_logits = min(teacher_logits.size(-1), student_logits.size(-1)) | |
| teacher_logits_trimmed = teacher_logits[:, :, :min_vocab_size_logits] | |
| student_logits_trimmed = student_logits[:, :, :min_vocab_size_logits] | |
| # Embedding KD uses the first teacher's input embeddings as the reference target (teachers may differ in hidden size). | |
| teacher_embeds = MODELS['teacher'][0].model.get_input_embeddings()(teacher_inputs_list[0].input_ids) | |
| student_embeds = MODELS['student'].model.get_input_embeddings()(student_inputs.input_ids) | |
| ce_loss = torch.tensor(0.0).to(device) | |
| output_kd_loss = OUTPUT_LOGIT_KD_FACTOR * kd_loss_functions[OUTPUT_LOGIT_KD_LOSS_TYPE](student_logits_trimmed / adaptive_temperature, teacher_logits_trimmed / adaptive_temperature) if ENABLE_OUTPUT_LOGIT_KD else torch.tensor(0.0).to(device) | |
| loss = output_kd_loss | |
| loss = loss + CE_LOSS_FACTOR * ce_loss | |
| attn_loss = ATTENTION_KD_FACTOR * attention_kd_loss(student_attentions[-1], teacher_attentions[-1], ATTENTION_KD_LOSS_TYPE) if ENABLE_ATTENTION_KD and teacher_attentions is not None and student_attentions is not None else torch.tensor(0.0).to(device) | |
| loss += attn_loss | |
| hidden_loss = HIDDEN_STATE_KD_FACTOR * hidden_state_kd_loss(student_hidden_states[-1], teacher_hidden_states[-1], HIDDEN_STATE_KD_LOSS_TYPE) if ENABLE_HIDDEN_STATE_KD and teacher_hidden_states is not None and student_hidden_states is not None else torch.tensor(0.0).to(device) | |
| loss += hidden_loss | |
| intermediate_loss = 0.0 | |
| if ENABLE_INTERMEDIATE_KD and teacher_hidden_states is not None and student_hidden_states is not None: | |
| for layer_idx in INTERMEDIATE_LAYERS: | |
| if layer_idx < len(teacher_hidden_states) and layer_idx < len(student_hidden_states): | |
| teacher_layer_output = teacher_hidden_states[layer_idx] | |
| student_layer_output = student_hidden_states[layer_idx] | |
| layer_loss = hidden_state_kd_loss(student_layer_output, teacher_layer_output, HIDDEN_STATE_KD_LOSS_TYPE) | |
| intermediate_loss += layer_loss | |
| loss += INTERMEDIATE_KD_FACTOR * intermediate_loss | |
| layer_norm_loss = 0.0 | |
| if ENABLE_LAYER_NORM_KD: | |
| for layer_norm_module_name in LAYER_NORM_MODULES: | |
| if hasattr(MODELS['student'].model, layer_norm_module_name): | |
| student_ln = getattr(MODELS['student'].model, layer_norm_module_name) | |
| layer_norm_weights = [] | |
| layer_norm_biases = [] | |
| for teacher_model in MODELS['teacher']: | |
| if hasattr(teacher_model.model, layer_norm_module_name): | |
| teacher_ln = getattr(teacher_model.model, layer_norm_module_name) | |
| if isinstance(student_ln, nn.LayerNorm) and isinstance(teacher_ln, nn.LayerNorm): | |
| layer_norm_weights.append(teacher_ln.weight) | |
| layer_norm_biases.append(teacher_ln.bias) | |
| if layer_norm_weights: | |
| teacher_weight_mean = torch.stack(layer_norm_weights).mean(dim=0) | |
| teacher_bias_mean = torch.stack(layer_norm_biases).mean(dim=0) | |
| layer_norm_loss += F.mse_loss(student_ln.weight, teacher_weight_mean) | |
| layer_norm_loss += F.mse_loss(student_ln.bias, teacher_bias_mean) | |
| loss += LAYER_NORM_KD_FACTOR * layer_norm_loss | |
| embed_loss = EMBEDDING_KD_FACTOR * hidden_state_kd_loss(student_embeds, teacher_embeds, HIDDEN_STATE_KD_LOSS_TYPE) if ENABLE_EMBEDDING_KD else torch.tensor(0.0).to(device) | |
| loss += embed_loss | |
| parameter_loss = 0.0 | |
| if ENABLE_PARAMETER_KD: | |
| for name, student_param in MODELS['student'].model.named_parameters(): | |
| param_values = [] | |
| for teacher_model in MODELS['teacher']: | |
| if name in teacher_model.model.state_dict(): | |
| param_values.append(teacher_model.model.state_dict()[name]) | |
| if param_values: | |
| teacher_param_mean = torch.stack(param_values).mean(dim=0) | |
| parameter_loss += kd_loss_functions[WEIGHTED_PARAMETER_LOSS_TYPE](student_param, teacher_param_mean) | |
| loss += PARAMETER_KD_FACTOR * parameter_loss | |
| layerwise_parameter_loss = 0.0 | |
| if ENABLE_LAYERWISE_PARAMETER_KD: | |
| for layer_module_name in LAYERWISE_PARAMETER_MODULES: | |
| if hasattr(MODELS['student'].model, layer_module_name): | |
| student_layer_module = getattr(MODELS['student'].model, layer_module_name) | |
| for name, student_param in student_layer_module.named_parameters(): | |
| param_values = [] | |
| full_name = f"{layer_module_name}.{name}" | |
| for teacher_model in MODELS['teacher']: | |
| if hasattr(teacher_model.model, layer_module_name) and full_name in teacher_model.model.state_dict(): | |
| param_values.append(teacher_model.model.state_dict()[full_name]) | |
| if param_values: | |
| teacher_param_mean = torch.stack(param_values).mean(dim=0) | |
| layerwise_parameter_loss += kd_loss_functions[WEIGHTED_PARAMETER_LOSS_TYPE](student_param, teacher_param_mean) | |
| loss += LAYERWISE_PARAMETER_KD_FACTOR * layerwise_parameter_loss | |
| vocab_projection_loss = 0.0 | |
| if ENABLE_VOCAB_PROJECTION_KD: | |
| for vocab_module_name in VOCAB_PROJECTION_MODULES: | |
| if hasattr(MODELS['student'].model, vocab_module_name): | |
| student_vocab_module = getattr(MODELS['student'].model, vocab_module_name) | |
| projection_weights = [] | |
| for teacher_model in MODELS['teacher']: | |
| if hasattr(teacher_model.model, vocab_module_name): | |
| teacher_vocab_module = getattr(teacher_model.model, vocab_module_name) | |
| projection_weights.append(teacher_vocab_module.weight) | |
| if projection_weights: | |
| teacher_weight_mean = torch.stack(projection_weights).mean(dim=0) | |
| vocab_projection_loss += F.mse_loss(student_vocab_module.weight, teacher_weight_mean) | |
| loss += VOCAB_PROJECTION_KD_FACTOR * vocab_projection_loss | |
| activation_loss = 0.0 | |
| if ENABLE_ACTIVATION_KD: | |
| for name, module in MODELS['student'].model.named_modules(): | |
| if isinstance(module, ActivationSaver) and type(module.module).__name__ in ACTIVATION_MODULES: | |
| if module.activation_values is not None: | |
| activations = module.activation_values | |
| activation_loss += F.mse_loss(activations, torch.zeros_like(activations)) | |
| loss += ACTIVATION_KD_FACTOR * activation_loss | |
| logit_masking_loss = 0.0 | |
| if ENABLE_LOGIT_MASKING_KD: | |
| # Distill only over the shared (trimmed) vocabulary: everything outside the teacher's top-k tokens is masked out of the student logits. | |
| teacher_probs = F.softmax(teacher_logits_trimmed, dim=-1) | |
| top_k_indices = torch.topk(teacher_probs, k=10, dim=-1)[1] | |
| mask = torch.ones_like(student_logits_trimmed).scatter_(-1, top_k_indices, 0.0).bool() | |
| masked_student_logits = student_logits_trimmed.masked_fill(mask, -1e9) | |
| logit_masking_loss = kd_loss_functions['kl'](masked_student_logits / adaptive_temperature, teacher_logits_trimmed / adaptive_temperature) * (adaptive_temperature ** 2) | |
| loss += LOGIT_MASKING_FACTOR * logit_masking_loss | |
| sparsity_loss = 0.0 | |
| if ENABLE_SPARCITY_REGULARIZATION: | |
| for name, module in MODELS['student'].model.named_modules(): | |
| if isinstance(module, ActivationSaver) and type(module.module).__name__ in ACTIVATION_MODULES: | |
| if module.activation_values is not None: | |
| activations = module.activation_values | |
| sparsity_loss += torch.norm(activations, 1) | |
| loss += SPARCITY_REGULARIZATION_FACTOR * sparsity_loss | |
| feature_map_loss = 0.0 | |
| if ENABLE_FEATURE_MAP_KD: | |
| for name, student_module in MODELS['student'].model.named_modules(): | |
| if isinstance(student_module, ActivationSaver) and type(student_module.module).__name__ in FEATURE_MAP_MODULES: | |
| teacher_module = MODELS['teacher'][0].model.get_submodule(name) | |
| if isinstance(teacher_module, ActivationSaver) and teacher_module.feature_map_values is not None and student_module.feature_map_values is not None: | |
| student_feature_map = student_module.feature_map_values | |
| teacher_feature_map = teacher_module.feature_map_values | |
| feature_map_loss += F.mse_loss(student_feature_map, teacher_feature_map) | |
| loss += FEATURE_MAP_KD_FACTOR * feature_map_loss | |
| contrastive_loss = 0.0 | |
| if ENABLE_CONTRASTIVE_KD: | |
| student_vec = student_hidden_states[-1][:, 0, :] | |
| teacher_vec = teacher_hidden_states[-1][:, 0, :] | |
| contrastive_loss = CONTRASTIVE_KD_FACTOR * kd_loss_functions[CONTRASTIVE_KD_LOSS_TYPE](student_vec, teacher_vec) | |
| loss += contrastive_loss | |
| rdrop_loss = 0.0 | |
| if ENABLE_RDROP_KD: | |
| r_student_outputs = MODELS['student'].model(**student_inputs, output_attentions=ENABLE_ATTENTION_KD or ENABLE_ATTENTION_ALIGNMENT_KD, output_hidden_states=ENABLE_HIDDEN_STATE_KD or ENABLE_INTERMEDIATE_KD or ENABLE_LAYER_WISE_KD) | |
| r_student_logits = r_student_outputs.logits | |
| # kd_loss_functions['kl'] applies log-softmax internally, so the raw logits are passed in both directions. | |
| rdrop_loss = RDROP_KD_FACTOR * (kd_loss_functions['kl'](student_logits, r_student_logits) + kd_loss_functions['kl'](r_student_logits, student_logits)) | |
| loss += rdrop_loss | |
| layer_wise_loss = 0.0 | |
| if ENABLE_LAYER_WISE_KD and teacher_hidden_states is not None and student_hidden_states is not None: | |
| layer_wise_loss = 0.0 | |
| for layer_module_name in LAYER_WISE_MODULES: | |
| if hasattr(MODELS['student'].model, layer_module_name): | |
| student_layer_module = getattr(MODELS['student'].model, layer_module_name) | |
| for student_layer, teacher_layer in zip(student_layer_module.modules(), MODELS['teacher'][0].model.get_submodule(layer_module_name).modules()): | |
| if isinstance(student_layer, nn.Linear) and isinstance(teacher_layer, nn.Linear): | |
| student_output = student_layer(student_hidden_states[-1]) | |
| teacher_output = teacher_layer(teacher_hidden_states[-1]) | |
| layer_wise_loss += hidden_state_kd_loss(student_output, teacher_output, LAYER_WISE_LOSS_TYPE) | |
| loss += LAYER_WISE_KD_FACTOR * layer_wise_loss | |
| activation_regularization_loss = 0.0 | |
| if ENABLE_ACTIVATION_REGULARIZATION: | |
| activation_regularization_loss = 0.0 | |
| for name, module in MODELS['student'].model.named_modules(): | |
| if isinstance(module, ActivationSaver) and type(module.module).__name__ in ACTIVATION_MODULES: | |
| if module.activation_values is not None: | |
| activations = module.activation_values | |
| activation_regularization_loss += torch.norm(activations, p=2) | |
| loss += ACTIVATION_REG_LAMBDA * activation_regularization_loss | |
| neuron_selectivity_loss = 0.0 | |
| if ENABLE_NEURON_SELECTIVITY_KD: | |
| neuron_selectivity_loss = 0.0 | |
| for name, module in MODELS['student'].model.named_modules(): | |
| if isinstance(module, ActivationSaver) and type(module.module).__name__ in NEURON_SELECTIVITY_MODULES: | |
| if module.activation_values is not None: | |
| student_activations = module.activation_values | |
| teacher_module = MODELS['teacher'][0].model.get_submodule(name) | |
| if isinstance(teacher_module, ActivationSaver) and teacher_module.activation_values is not None: | |
| teacher_activations = teacher_module.activation_values | |
| neuron_selectivity_loss += NEURON_SELECTIVITY_KD_FACTOR * kd_loss_functions[NEURON_SELECTIVITY_LOSS_TYPE](student_activations.mean(dim=0), teacher_activations.mean(dim=0)) | |
| loss += neuron_selectivity_loss | |
| l2_reg_loss = 0.0 | |
| if ENABLE_L2_REGULARIZATION: | |
| for param in MODELS['student'].model.parameters(): | |
| l2_reg_loss += torch.norm(param, p=2) | |
| loss += L2_LAMBDA * l2_reg_loss | |
| l1_reg_loss = 0.0 | |
| if ENABLE_L1_REGULARIZATION: | |
| for param in MODELS['student'].model.parameters(): | |
| l1_reg_loss += torch.norm(param, p=1) | |
| loss += L1_LAMBDA * l1_reg_loss | |
| loss_accum += (kd_loss_factor * loss + ce_loss_factor * ce_loss) / accumulation_steps | |
| except Exception as e: | |
| log_message(f"Error in inner distillation loop: {e}", level="warning") | |
| scaler.scale(loss_accum).backward() | |
| if ENABLE_GRADIENT_CLIPPING: | |
| try: | |
| torch.nn.utils.clip_grad_norm_(MODELS['student'].model.parameters(), GRAD_CLIP_VALUE) | |
| except Exception as e: | |
| log_message(f"Gradient clipping error: {e}", level="warning") | |
| scaler.step(optimizer) | |
| scaler.update() | |
| if scheduler: | |
| scheduler.step() | |
| if ENABLE_ADAPTIVE_TEMPERATURE_KD: | |
| # Keep the temperature a tensor so the .item() logging below keeps working, and clamp it at a floor of 1.0. | |
| adaptive_temperature = torch.clamp(adaptive_temperature * adaptive_temperature_decay_rate, min=1.0) | |
| if ENABLE_STUDENT_PARAMETER_FREEZE: | |
| if ENABLE_DYNAMIC_FREEZE: | |
| if loss_accum.item() < FREEZE_THRESHOLD: | |
| for param in MODELS['student'].model.parameters(): | |
| param.requires_grad = True | |
| if step == 0: | |
| log_message("Dynamic unfreezing activated due to low loss.", level="info") | |
| elif step + 1 == FREEZE_STUDENT_STEPS: | |
| for param in MODELS['student'].model.parameters(): | |
| param.requires_grad = True | |
| log_message("Unfreezing after freeze_steps.", level="info") | |
| if SAVE_CHECKPOINTS and (step + 1) % CHECKPOINT_INTERVAL == 0: | |
| ckpt_path = os.path.join(CHECKPOINT_DIR, f"student_step_{step + 1}.pt") | |
| os.makedirs(CHECKPOINT_DIR, exist_ok=True) | |
| save_checkpoint(MODELS['student'], ckpt_path) | |
| log_to_file(f"Checkpoint saved to {ckpt_path}") | |
| if (step + 1) % LOG_INTERVAL == 0: | |
| elapsed = time.time() - start_time | |
| lr = optimizer.param_groups[0]['lr'] | |
| grad_norm = 0.0 | |
| num_params_with_grad = 0 | |
| for param in MODELS['student'].model.parameters(): | |
| if param.grad is not None: | |
| grad_norm += param.grad.data.norm(2).item() | |
| num_params_with_grad += 1 | |
| avg_grad_norm = grad_norm / num_params_with_grad if num_params_with_grad > 0 else 0.0 | |
| log_msg = (f"[KD] Step {step + 1}/{KD_STEPS}, Loss: {loss_accum.item():.4f}, " | |
| f"LR: {lr:.6f}, GradNorm: {avg_grad_norm:.4f}, Time: {elapsed:.2f}s, Temp: {adaptive_temperature.item():.3f}") | |
| print(log_msg) | |
| log_to_file(log_msg) | |
| writer.add_scalar("Loss/accumulated", loss_accum.item(), step + 1) | |
| writer.add_scalar("Learning_Rate", lr, step + 1) | |
| writer.add_scalar("GradNorm", avg_grad_norm, step + 1) | |
| writer.add_scalar("Output_Logit_KD_Loss", output_kd_loss.item(), step + 1) | |
| writer.add_scalar("Attention_KD_Loss", attn_loss.item(), step + 1) | |
| writer.add_scalar("HiddenState_KD_Loss", hidden_loss.item(), step + 1) | |
| writer.add_scalar("Intermediate_KD_Loss", intermediate_loss.item(), step + 1) | |
| writer.add_scalar("LayerNorm_KD_Loss", layer_norm_loss.item(), step + 1) | |
| writer.add_scalar("Embedding_KD_Loss", embed_loss.item(), step + 1) | |
| writer.add_scalar("Parameter_KD_Loss", parameter_loss.item(), step + 1) | |
| writer.add_scalar("Activation_KD_Loss", activation_loss.item(), step + 1) | |
| writer.add_scalar("LogitMasking_KD_Loss", logit_masking_loss.item(), step + 1) | |
| writer.add_scalar("Sparcity_Regularization_Loss", sparsity_loss.item(), step + 1) | |
| writer.add_scalar("FeatureMap_KD_Loss", feature_map_loss.item(), step + 1) | |
| writer.add_scalar("Layerwise_Parameter_KD_Loss", layerwise_parameter_loss.item(), step + 1) | |
| writer.add_scalar("VocabProjection_KD_Loss", vocab_projection_loss.item(), step + 1) | |
| writer.add_scalar("Contrastive_KD_Loss", contrastive_loss.item(), step + 1) | |
| writer.add_scalar("RDrop_KD_Loss", rdrop_loss.item(), step + 1) | |
| writer.add_scalar("Adaptive_Temperature", adaptive_temperature.item(), step + 1) | |
| writer.add_scalar("LayerWise_KD_Loss", layer_wise_loss.item(), step + 1) | |
| writer.add_scalar("Activation_Regularization_Loss", activation_regularization_loss.item(), step + 1) | |
| writer.add_scalar("NeuronSelectivity_KD_Loss", neuron_selectivity_loss.item(), step + 1) | |
| writer.add_scalar("WeightedParameter_KD_Loss", weighted_parameter_loss.item(), step + 1) | |
| if ENABLE_EARLY_STOPPING: | |
| if loss_accum.item() < best_loss: | |
| best_loss = loss_accum.item() | |
| patience_counter = 0 | |
| else: | |
| patience_counter += 1 | |
| if patience_counter >= EARLY_STOPPING_PATIENCE: | |
| log_message(f"Early stopping activated at step {step + 1} after {EARLY_STOPPING_PATIENCE} steps.", level="info") | |
| log_to_file(f"Early stopping activated at step {step + 1}.") | |
| break | |
| except Exception as e: | |
| log_message(f"Error in distillation step {step + 1}: {e}", level="warning") | |
| continue | |
| progress_bar.close() | |
| writer.close() | |
| MODELS['writer'] = None | |
| cleanup_and_exit(0) | |
| return MODELS['student'] | |
| def push_model_to_hub(model, tokenizer, quantization_method, repo_name, use_auth_token): | |
| try: | |
| log_message(f"Saving {model.__class__.__name__} to {repo_name} with method '{quantization_method}'...", level="info") | |
| model.model.save_pretrained(repo_name, push_to_hub=False) | |
| tokenizer.save_pretrained(repo_name, push_to_hub=False) | |
| model.model.push_to_hub(repo_name, token=use_auth_token) | |
| tokenizer.push_to_hub(repo_name, token=use_auth_token) | |
| log_message("Upload completed.", level="info") | |
| log_to_file(f"Model and tokenizer uploaded to {repo_name} with method {quantization_method}.") | |
| except Exception as e: | |
| log_message(f"Error during push to hub: {e}", level="warning") | |
| def login_to_huggingface(token): | |
| global AUTH_TOKEN, USER_NAME | |
| try: | |
| user_info = HfApi(token=token).whoami() | |
| AUTH_TOKEN = token | |
| USER_NAME = user_info['name'] | |
| log_message(f"Successfully logged in to Hugging Face as {USER_NAME}.", level="info") | |
| return AUTH_TOKEN, USER_NAME, None | |
| except Exception as e: | |
| log_message(f"Hugging Face Hub login error: {e}", level="warning") | |
| return None, None, "Invalid Hugging Face token." | |
| def run_fusion_distillation(teacher_model_ckpt_1, teacher_model_ckpt_2, student_model_ckpt, repo_name, disable_mean_resizing, huggingface_token): | |
| global MODELS, LOG_TEXT, GRADIO_LOG_OUTPUT, AUTH_TOKEN, USER_NAME | |
| LOG_TEXT = "" | |
| if GRADIO_LOG_OUTPUT is not None: | |
| # Mutating .value does not live-update a running Gradio UI; logs also go to the console and training_log.txt. | |
| GRADIO_LOG_OUTPUT.value = LOG_TEXT | |
| mean_resizing = not disable_mean_resizing | |
| token, username, error_message = login_to_huggingface(huggingface_token) | |
| if token is None: | |
| log_message("Hugging Face login failed.", level="warning") | |
| return error_message | |
| AUTH_TOKEN = token | |
| USER_NAME = username | |
| try: | |
| log_message(f"Authenticated with Hugging Face Hub as user: {USER_NAME}.", level="info") | |
| except Exception as e: | |
| log_message(f"Hub login error: {e}", level="warning") | |
| return "Hugging Face Hub login failed: " + str(e) | |
| try: | |
| student_tokenizer = AutoTokenizer.from_pretrained(student_model_ckpt) | |
| MODELS['student'] = pipeline(model=student_model_ckpt, tokenizer=student_tokenizer, device=device) | |
| if MODELS['student'].tokenizer is None: | |
| log_message("Student model tokenizer could not be loaded.", level="warning") | |
| return "Student model tokenizer could not be loaded." | |
| special_token = "[UNFILTERED]" | |
| if special_token not in MODELS['student'].tokenizer.get_vocab(): | |
| MODELS['student'].tokenizer.add_special_tokens({"additional_special_tokens": [special_token]}) | |
| log_message("Special token added to student tokenizer.", level="info") | |
| MODELS['student'].model.resize_token_embeddings(len(MODELS['student'].tokenizer)) | |
| except Exception as e: | |
| log_message(f"Error loading student model: {e}", level="warning") | |
| return "Error loading student model: " + str(e) | |
| teacher_models = [] | |
| teacher_tokenizers = [] | |
| teacher_model_checkpoints = [teacher_model_ckpt_1] | |
| if teacher_model_ckpt_2: | |
| teacher_model_checkpoints.append(teacher_model_ckpt_2) | |
| if not isinstance(teacher_model_checkpoints, list): | |
| log_message("Error loading teacher models: Teacher Model Checkpoints must be a list.", level="warning") | |
| return "Error loading teacher models: Teacher Model Checkpoints must be a list." | |
| for i, teacher_model_ckpt in enumerate(teacher_model_checkpoints): | |
| if not isinstance(teacher_model_ckpt, str): | |
| log_message(f"Error loading teacher models: not a string at index {i}", level="warning") | |
| return f"Error loading teacher models: not a string at index {i}" | |
| try: | |
| teacher_tokenizer = AutoTokenizer.from_pretrained(teacher_model_ckpt) | |
| teacher_pipeline = pipeline(model=teacher_model_ckpt, tokenizer=teacher_tokenizer, device=device) | |
| teacher_models.append(teacher_pipeline) | |
| teacher_tokenizers.append(teacher_tokenizer) | |
| log_message(f"Teacher model {i + 1} loaded: {teacher_model_ckpt}", level="info") | |
| except Exception as e: | |
| log_message(f"Error loading teacher model {i + 1} ({teacher_model_ckpt}): {e}", level="warning") | |
| return f"Error loading teacher model {i + 1}: {e}" | |
| MODELS['teacher'] = teacher_models | |
| try: | |
| unify_parameters(MODELS['student'], MODELS['teacher'][0]) | |
| unify_embeddings(MODELS['student'], MODELS['teacher'][0], mean_resizing=mean_resizing) | |
| MODELS['student'].tokenizer = unify_tokenizers(MODELS['student'].tokenizer, teacher_tokenizers[0], MODELS['student']) | |
| MODELS['student'].tokenizer = fuse_tokenizers(teacher_tokenizers, MODELS['student'].tokenizer) | |
| # Keep the student's embedding matrix in sync with the fused vocabulary. | |
| MODELS['student'].model.resize_token_embeddings(len(MODELS['student'].tokenizer)) | |
| except Exception as e: | |
| log_message(f"Model unification error: {e}", level="warning") | |
| fusion_method_name = 'geometric_mean_double' | |
| try: | |
| unified_teacher_state = complete_unify_teacher_models_double(MODELS['teacher'], fusion_method=fusion_method_name, layer_scale=1.0, device=device) | |
| MODELS['student'] = unify_teacher_into_student(unified_teacher_state, MODELS['student'], force_parameter_copy=True) | |
| MODELS['student'] = update_student_embeddings_double(MODELS['student'], MODELS['teacher'], fusion_type=fusion_method_name, device=device, mean_resizing=mean_resizing) | |
| except Exception as e: | |
| log_message(f"Teacher model fusion error: {e}", level="warning") | |
| try: | |
| unified_student = advanced_knowledge_distillation(MODELS['teacher'], MODELS['student'], device) | |
| except Exception as e: | |
| log_message(f"Knowledge distillation error: {e}", level="warning") | |
| return "Knowledge Distillation Failed, but Fusion might be partially completed. Check logs." | |
| if ENABLE_PARAMETER_COUNT_CHECK: | |
| student_param_count = sum(p.numel() for p in unified_student.model.parameters()) | |
| teacher_param_count = sum(p.numel() for p in MODELS['teacher'][0].model.parameters()) | |
| if student_param_count != teacher_param_count: | |
| log_message(f"Warning: student parameters ({student_param_count:,}) differ from teacher ({teacher_param_count:,}).", level="warning") | |
| else: | |
| log_message(f"Parameter count consistent: Student and Teacher ({teacher_param_count:,}).", level="info") | |
| sample_texts = ["What is the capital of France?", "Solve: 3 + 5 * 2", "Define a function in Python."] | |
| try: | |
| predictions = generate_predictions(unified_student, MODELS['student'].tokenizer, sample_texts) | |
| log_message("Sample Predictions after KD:\n" + "\n".join(predictions), level="info") | |
| except Exception as e: | |
| log_message(f"Sample prediction error: {e}", level="warning") | |
| api = HfApi() | |
| try: | |
| api.create_repo(repo_name, exist_ok=True, token=AUTH_TOKEN) | |
| push_model_to_hub(unified_student, MODELS['student'].tokenizer, "unified_teacher_kd_full_options_default_true_geometric_mean_double_fusion_v11", repo_name, AUTH_TOKEN) | |
| final_msg = f"Student model fused, distilled, and uploaded to Hub to repo: {repo_name} with all advanced options and 'geometric_mean_double' fusion!" | |
| print(final_msg) | |
| log_to_file(final_msg, "training_log.txt") | |
| return "Fusion and Distillation Completed! Check console and logs." | |
| except Exception as e: | |
| log_message(f"Final upload/repo creation error: {e}", level="warning") | |
| return "Error during final upload or repository creation: " + str(e) | |
| if __name__ == "__main__": | |
| with gr.Blocks(css=".gradio-container {padding: 20px}") as iface: | |
| gr.Markdown("# Fusion and Distillation Pipeline for Language Models") | |
| gr.Markdown( | |
| "This application fuses and distills knowledge from teacher language models into a student model. " | |
| "It supports advanced knowledge distillation techniques and model fusion strategies." | |
| ) | |
| huggingface_token_input = gr.Textbox(label="Hugging Face Token", type="password", visible=True) | |
| username_display = gr.Textbox(label="Hugging Face Username", interactive=False, visible=False) | |
| def process_token(huggingface_token): | |
| _, username, error_message = login_to_huggingface(huggingface_token) | |
| # Only one output component is wired up below, so return the username on success or the error text otherwise. | |
| return username if error_message is None else error_message | |
| huggingface_token_input.change( | |
| process_token, | |
| inputs=[huggingface_token_input], | |
| outputs=[username_display] | |
| ) | |
| with gr.Column(): | |
| gr.Markdown("## Model Selection") | |
| teacher_model_ckpt_1_input = HuggingfaceHubSearch(label="Teacher Model 1") | |
| teacher_model_ckpt_2_input = HuggingfaceHubSearch(label="Teacher Model 2 (Optional)") | |
| student_model_ckpt_input = HuggingfaceHubSearch(label="Student Model") | |
| repo_name_input = gr.Textbox( | |
| label="Repository Name", | |
| info="Enter the name of the Hugging Face repository to create or update with the distilled model." | |
| ) | |
| disable_mean_resizing_checkbox = gr.Checkbox( | |
| label="Disable Mean Resizing", | |
| info="Check to disable mean resizing of embeddings during unification." | |
| ) | |
| with gr.Column(): | |
| gr.Markdown("## Execution and Logs") | |
| run_button = gr.Button("Run Fusion and Distillation", variant="primary", interactive=True, elem_id='run-button') | |
| output_status_textbox = gr.Textbox( | |
| info="Real-time status and logs of the fusion and distillation process.", | |
| label="Status", | |
| lines=3 | |
| ) | |
| GRADIO_LOG_OUTPUT = gr.Textbox( | |
| value="", | |
| label="Detailed Log Output", | |
| lines=10, | |
| interactive=False | |
| ) | |
| def update_run_button_interactivity(token): | |
| # gr.Button.update() was removed in Gradio 4.x; gr.update() works across versions. | |
| return gr.update(interactive=True) | |
| huggingface_token_input.change( | |
| update_run_button_interactivity, | |
| inputs=[huggingface_token_input], | |
| outputs=[run_button] | |
| ) | |
| run_button.click( | |
| run_fusion_distillation, | |
| inputs=[ | |
| teacher_model_ckpt_1_input, | |
| teacher_model_ckpt_2_input, | |
| student_model_ckpt_input, | |
| repo_name_input, | |
| disable_mean_resizing_checkbox, | |
| huggingface_token_input | |
| ], | |
| outputs=output_status_textbox, | |
| ) | |
| iface.launch(server_name="0.0.0.0", server_port=7860) |