| import os |
| import time |
| import math |
| import copy |
| import torch |
| from torch import nn |
| import torch.nn.functional as F |
| from torch.nn.utils import prune |
| from transformers import ( |
| AutoTokenizer, |
| AutoConfig, |
| DataCollatorForLanguageModeling, |
| Trainer, |
| TrainingArguments, |
| AutoModelForCausalLM, |
| AutoModel, |
| EarlyStoppingCallback, |
| pipeline, |
| get_scheduler, |
| logging as hf_logging |
| ) |
| try: |
| from peft import PeftModel, LoraConfig, get_peft_model, TaskType, PeftConfig |
| _peft_installed = True |
| except ImportError: |
| _peft_installed = False |
| PeftModel = None |
| LoraConfig = None |
| get_peft_model = None |
| TaskType = None |
| PeftConfig = None |
|
|
| from datasets import load_dataset, interleave_datasets, concatenate_datasets, Dataset, Features, Value, IterableDataset, DatasetDict |
| from huggingface_hub import login, create_repo, HfApi, hf_hub_download |
| import wandb |
| import gradio as gr |
| from gradio_huggingfacehub_search import HuggingfaceHubSearch |
| import re |
| import json |
| import gc |
| from accelerate import Accelerator |
| import logging |
| import traceback |
| from collections import Counter, OrderedDict |
| import requests |
| import gzip |
| import inspect |
| import shutil |
| from functools import partial |
| import types |
| import psutil |
|
|
| hf_logging.set_verbosity_error() |
| logging.getLogger("datasets").setLevel(logging.ERROR) |
| logging.getLogger("huggingface_hub").setLevel(logging.ERROR) |
| logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') |
|
|
| padding = True |
| truncation = True |
| TOKENIZERS_PARALLELISM = True |
| os.environ["TOKENIZERS_PARALLELISM"] = "true" if TOKENIZERS_PARALLELISM else "false" |
|
|
| BATCH_SIZE = 8 |
| LEARNING_RATE = 1.5e-4 |
| EPOCHS = 1 |
| MAX_STEPS = 1 |
| USE_CPU = False |
| NUM_CPU_CORES = -1 |
| MERGE_ALPHA = 0.7 |
| CONTEXT_LENGTH = 256 |
| HEADS = 4 |
| DIMENSIONS = 256 |
| LAYERS = 1 |
| INTERMEDIATE_SIZE = 1024 |
| USE_WANDB = False |
| ACTIVATION_FUNCTIONS = { |
| "relu": nn.ReLU, |
| "gelu": nn.GELU, |
| "silu": nn.SiLU, |
| "mish": nn.Mish, |
| "leaky_relu": nn.LeakyReLU, |
| "elu": nn.ELU, |
| "relu6": nn.ReLU6, |
| "tanh": nn.Tanh, |
| "sigmoid": nn.Sigmoid, |
| "identity": nn.Identity |
| } |
| DEFAULT_ACTIVATION_FUNCTION = "gelu" |
| OPTIMIZERS = { |
| "adamw_torch": torch.optim.AdamW, |
| "adam_torch": torch.optim.Adam, |
| "sgd": torch.optim.SGD, |
| "adamax": torch.optim.Adamax, |
| "adagrad": torch.optim.Adagrad, |
| "rmsprop": torch.optim.RMSprop |
| } |
| DEFAULT_OPTIMIZER = "adamw_torch" |
| PRUNING_AMOUNT = 0.2 |
| QUANTIZATION_MODES = ['float32', 'float16', 'bfloat16'] |
| DEFAULT_QUANTIZATION = 'float32' |
| SCHEDULER_TYPES = ["linear", "cosine", "cosine_with_restarts", "polynomial", "constant", "constant_with_warmup"] |
| DEFAULT_SCHEDULER = "cosine" |
| GRADIENT_ACCUMULATION_STEPS = 1 |
| EVAL_STEPS = 500 |
| SAVE_STEPS = 500 |
| LOGGING_STEPS = 100 |
| EARLY_STOPPING_PATIENCE = 5 |
| LOAD_BEST_MODEL_AT_END = True |
| METRIC_FOR_BEST_MODEL = "eval_loss" |
|
|
| AVAILABLE_MODALITIES = ['Image', 'Audio'] |
| MODALITY_ENCODERS = { |
| 'Image': 'google/vit-base-patch16-224-in21k', |
| 'Audio': 'openai/whisper-base' |
| } |
| DEFAULT_PEFT_CONFIG_DICT = { |
| "task_type": TaskType.CAUSAL_LM if _peft_installed else None, |
| "inference_mode": False, |
| "r": 8, |
| "lora_alpha": 32, |
| "lora_dropout": 0.1, |
| "target_modules": None |
| } if _peft_installed else {} |
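| # Usage sketch (only if peft is installed): the dict above maps directly onto peft.LoraConfig, |
| # e.g. get_peft_model(model, LoraConfig(**DEFAULT_PEFT_CONFIG_DICT)) attaches a LoRA adapter. |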
|
|
| global_model = None |
| global_tokenizer = None |
| global_pipe = None |
| original_num_layers_global = LAYERS |
| config = None |
| target_layers = LAYERS |
| current_peft_config = copy.deepcopy(DEFAULT_PEFT_CONFIG_DICT) if _peft_installed else {} |
|
|
| RAM_LIMIT_PERCENT = 85.0 |
| DISK_LIMIT_GB = 5.0 |
| BYPASS_RESOURCE_LIMITS = False |
|
|
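| # Root-mean-square layer normalization (no mean-centering), as used in LLaMA-style models; |
| # with elementwise_affine=False it is parameter-free. |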
| class RMSNorm(nn.Module): |
| def __init__(self, dim: int, eps: float = 1e-6, elementwise_affine=True, device=None, dtype=None): |
| factory_kwargs = {'device': device, 'dtype': dtype} |
| super().__init__() |
| self.dim = dim |
| self.eps = eps |
| self.elementwise_affine = elementwise_affine |
| if self.elementwise_affine: |
| self.weight = nn.Parameter(torch.empty(dim, **factory_kwargs)) |
| else: |
| self.register_parameter('weight', None) |
| self.reset_parameters() |
|
|
| def reset_parameters(self): |
| if self.elementwise_affine: |
| nn.init.ones_(self.weight) |
|
|
| def _norm(self, x): |
| return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps) |
|
|
| def forward(self, x): |
| output = self._norm(x.float()).type_as(x) |
| if self.elementwise_affine: |
| output = output * self.weight |
| return output |
|
|
| def extra_repr(self): |
| return f'{self.dim}, eps={self.eps}, elementwise_affine={self.elementwise_affine}' |
|
|
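| # BitNet-style fake-quantization helpers: per-token 8-bit absmax scaling for activations and |
| # absmean round-and-clamp to {-1, 0, 1} for weights. Both stay differentiable via the |
| # straight-through trick applied in BitLinear.forward below. |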
| def activation_quant(x): |
| scale = 127.0 / x.abs().max(dim=-1, keepdim=True).values.clamp(min=1e-5) |
| y = (x * scale).round().clamp(-128, 127) / scale |
| return y |
|
|
| def weight_quant(w): |
| scale = 1.0 / w.abs().mean().clamp(min=1e-5) |
| u = (w * scale).round().clamp(-1, 1) / scale |
| return u |
|
|
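| # Drop-in replacement for nn.Linear that normalizes inputs with a parameter-free RMSNorm and |
| # applies the fake quantization above, while keeping full-precision weights for the backward pass. |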
| class BitLinear(nn.Linear): |
| def forward(self, x): |
| w = self.weight |
| device = w.device |
| if x.device != device: |
| x = x.to(device) |
|
|
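| # The RMSNorm below has no parameters (elementwise_affine=False), so rebuilding it on every |
| # forward pass adds a small allocation but no trainable state. |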
| rms_norm_module = RMSNorm(x.shape[-1], eps=1e-6, elementwise_affine=False, device=device, dtype=x.dtype) |
| x_norm = rms_norm_module(x) |
|
|
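| # Straight-through estimator: quantized values in the forward pass, identity gradients in backward. |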
| x_quant = x_norm + (activation_quant(x_norm) - x_norm).detach() |
| w_quant = w + (weight_quant(w) - w).detach() |
|
|
| output = F.linear(x_quant, w_quant, None) |
|
|
| # Add the full-precision bias once, cast to the output dtype. |
| if self.bias is not None: |
| output = output + self.bias.to(output.dtype) |
|
|
| return output |
|
|
| def to(self, *args, **kwargs): |
| # nn.Module.to already moves all parameters (including bias) in place; reassigning |
| # self.bias with a plain tensor would raise a TypeError, so just defer to the parent. |
| return super().to(*args, **kwargs) |
|
|
|
|
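| # LayerNorm variant with a runtime `bypass` switch; when bypass is True the layer acts as an identity. |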
| class BypassLayerNorm(nn.Module): |
| def __init__(self, normalized_shape, eps=1e-5, elementwise_affine=True, device=None, dtype=None): |
| factory_kwargs = {'device': device, 'dtype': dtype} |
| super().__init__() |
| if isinstance(normalized_shape, int): |
| self.normalized_shape = (normalized_shape,) |
| else: |
| self.normalized_shape = tuple(normalized_shape) |
| self.eps = eps |
| self.elementwise_affine = elementwise_affine |
| if self.elementwise_affine: |
| self.weight = nn.Parameter(torch.empty(self.normalized_shape, **factory_kwargs)) |
| self.bias = nn.Parameter(torch.empty(self.normalized_shape, **factory_kwargs)) |
| else: |
| self.register_parameter('weight', None) |
| self.register_parameter('bias', None) |
| self.bypass = False |
| self.reset_parameters() |
|
|
| def reset_parameters(self) -> None: |
| if self.elementwise_affine: |
| nn.init.ones_(self.weight) |
| nn.init.zeros_(self.bias) |
|
|
| def forward(self, x): |
| if self.bypass: |
| return x |
| original_dtype = x.dtype |
| if original_dtype not in [torch.float32, torch.float16, torch.bfloat16]: |
| x = x.float() |
|
|
| output = F.layer_norm(x, self.normalized_shape, self.weight, self.bias, self.eps) |
| return output.to(original_dtype) |
|
|
| def extra_repr(self) -> str: |
| return f'{self.normalized_shape}, eps={self.eps}, elementwise_affine={self.elementwise_affine}, bypass={self.bypass}' |
|
|
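| # Dropout variant with the same runtime `bypass` switch as BypassLayerNorm. |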
| class BypassDropout(nn.Module): |
| def __init__(self, p=0.5, inplace=False): |
| super().__init__() |
| self.p = p |
| self.inplace = inplace |
| self.bypass = False |
|
|
| def forward(self, x): |
| if self.bypass or not self.training or self.p == 0: |
| return x |
| return F.dropout(x, self.p, self.training, self.inplace) |
|
|
| def extra_repr(self) -> str: |
| return f'p={self.p}, inplace={self.inplace}, bypass={self.bypass}' |
|
|
| def get_device(): |
| if torch.cuda.is_available() and not USE_CPU: |
| return torch.device("cuda") |
| elif hasattr(torch.backends, "mps") and torch.backends.mps.is_available() and not USE_CPU: |
| logging.info("MPS backend detected on Mac. Note: MPS support is experimental and may have limitations.") |
| return torch.device("mps") |
| else: |
| if not USE_CPU: |
| logging.warning("CUDA/MPS not available or USE_CPU=True. Falling back to CPU.") |
| return torch.device("cpu") |
|
|
| def clean_memory(): |
| gc.collect() |
| if torch.cuda.is_available(): |
| torch.cuda.empty_cache() |
| logging.debug("Cleaned memory.") |
|
|
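| # Checks RAM usage and free disk space against the configured limits. Fails open (returns True) |
| # if the psutil probe itself raises, so a broken check never blocks the workflow. |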
| def check_resources(ram_limit_pct=RAM_LIMIT_PERCENT, disk_limit_gb=DISK_LIMIT_GB): |
| if BYPASS_RESOURCE_LIMITS: |
| logging.info("Resource limit checks bypassed.") |
| return True, "Resource checks bypassed." |
|
|
| try: |
| ram = psutil.virtual_memory() |
| ram_used_pct = ram.percent |
| ram_ok = ram_used_pct < ram_limit_pct |
|
|
| disk = psutil.disk_usage('/') |
| disk_free_gb = disk.free / (1024**3) |
| disk_ok = disk_free_gb > disk_limit_gb |
|
|
| status_msg = (f"Resource Check: RAM Used: {ram_used_pct:.1f}% (Limit: <{ram_limit_pct}%), " |
| f"Disk Free: {disk_free_gb:.1f}GB (Limit: >{disk_limit_gb}GB).") |
|
|
| if ram_ok and disk_ok: |
| logging.info(status_msg + " Status: OK") |
| return True, status_msg + " Status: OK" |
| else: |
| warning_msg = status_msg + " Status: LIMIT EXCEEDED!" |
| logging.warning(warning_msg) |
| return False, warning_msg |
|
|
| except Exception as e: |
| logging.error(f"Failed to check resources: {e}") |
| return True, f"Resource check failed: {e}" |
|
|
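| # Attaches this tool's bookkeeping flags to a (Pretrained)Config, leaving attributes that already |
| # exist untouched, then reconciles the flash-attention and quantization flags. |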
| def initialize_config_flags(existing_config=None): |
| if existing_config is None: |
| from transformers import PretrainedConfig |
| config_obj = PretrainedConfig() |
| elif isinstance(existing_config, dict): |
| from transformers import PretrainedConfig |
| try: |
| config_obj = PretrainedConfig(**existing_config) |
| except Exception as e: |
| logging.warning(f"Could not initialize PretrainedConfig from dict, using default. Error: {e}") |
| config_obj = PretrainedConfig() |
| else: |
| config_obj = existing_config |
|
|
| default_flags = { |
| "reduced_layers": False, "original_num_layers": None, "removed_bias": False, "untied_embeddings": False, |
| "limits_configured": False, "qa_restrictions_removed": False, "additional_mechanisms_applied": False, |
| "safety_settings_enabled": True, "perfect_precision_recovered": False, "token_gen_speed_maximized": False, |
| "coherence_improvement_enabled": False, "inconsistencies_biases_removed": False, |
| "quantization_applied": False, "quantization_mode": DEFAULT_QUANTIZATION, |
| "layer_norm_bypassed": False, "replaced_layer_norm": False, "dropout_bypassed": False, "replaced_dropout": False, |
| "activation_function_swapped": False, "current_activation_function": DEFAULT_ACTIVATION_FUNCTION, |
| "embedding_normalized": False, "gradient_clipping_disabled": False, "weight_decay_disabled": False, |
| "lr_scheduler_disabled": False, "bitnet_applied": False, "gradient_checkpointing_enabled": False, |
| "pruning_applied": False, "pruning_amount": None, "frozen_layers": None, |
| "enhanced_security_enabled": False, "debug_mode_enabled": False, "auto_optimization_enabled": False, |
| "internal_logging_enabled": False, "drift_detection_enabled": False, "ultra_fast_mode": False, |
| "optimizer": DEFAULT_OPTIMIZER, "rms_norm_applied": False, "layerdrop_enabled": False, "layerdrop_prob": 0.0, |
| "lora_merged": False, "lora_adapter_path": None, "knowledge_distillation_setup": False, "kd_num_labels": None, |
| "reward_modeling_setup": False, "rm_num_outputs": 1, |
| "swa_applied": False, "knowledge_edited": False, "head_pruning_applied": False, "qat_applied": False, |
| "architecture_merged": False, "weight_init_applied": False, "gradient_noise_applied": False, |
| "rope_scaling_type": None, "rope_scaling_factor": None, "sliding_window_size": None, "attention_variant": None, |
| "response_filters": True, "harassment_filter": True, "hate_filter": True, "sexually_explicit_filter": True, |
| "dangerous_content_filter": True, "civic_integrity_filter": True, "code_filter": True, |
| "medical_advice_filter": True, "legal_advice_filter": True, "financial_advice_filter": True, |
| "pii_filter": True, "political_filter": True, "religious_filter": True, "profanity_filter": True, |
| "stereotype_filter": True, "misinfo_filter": True, "self_harm_filter": True, "personal_attack_filter": True, |
| "toxicity_filter": True, "spam_filter": True, "off_topic_filter": True, "tone_filter": True, |
| "min_max_length_filter": True, "repetition_filter_enabled": False, "factuality_filter_enabled": False, |
| "baseline_distribution": None, "remove_censorship": False, "no_response_filters": False, |
| "no_advert_warning": False, "no_limits": False, "knowledge_date": None, "cutoff_date": None, |
| "max_input_tokens": None, "max_output_tokens": None, |
| "multimodal_applied": False, "supported_modalities": [], "modality_encoders": {}, "modality_projection_dim": None, "modality_special_tokens": {}, |
| "use_flash_attention_2": getattr(config_obj, 'attn_implementation', None) == 'flash_attention_2', |
| "attn_implementation": getattr(config_obj, 'attn_implementation', 'auto'), |
| "gradient_accumulation_steps": GRADIENT_ACCUMULATION_STEPS, |
| "peft_adapter_added": False, "peft_config": None |
| } |
|
|
| for flag, default_value in default_flags.items(): |
| if not hasattr(config_obj, flag): |
| setattr(config_obj, flag, default_value) |
|
|
| if getattr(config_obj, 'attn_implementation', 'auto') == 'flash_attention_2': |
| config_obj.use_flash_attention_2 = True |
| else: |
| config_obj.use_flash_attention_2 = False |
|
|
| if getattr(config_obj, 'quantization_mode', DEFAULT_QUANTIZATION) == 'float32': |
| config_obj.quantization_applied = False |
| config_obj.perfect_precision_recovered = True |
| else: |
| config_obj.quantization_applied = True |
| config_obj.perfect_precision_recovered = False |
|
|
| if _peft_installed and isinstance(existing_config, PeftConfig): |
| config_obj.peft_adapter_added = True |
| config_obj.peft_config = existing_config.to_dict() |
|
|
| return config_obj |
|
|
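| # Sets a dotted attribute path (e.g. "model.decoder.layers") on obj, returning False instead of |
| # raising when any component along the path is missing. |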
| def _recursive_setattr(obj, attr_str, value): |
| parts = attr_str.split('.') |
| obj_to_set = obj |
| try: |
| for part in parts[:-1]: |
| if not hasattr(obj_to_set, part): |
| logging.warning(f"Intermediate attribute {part} not found in {attr_str} for object {type(obj_to_set)}") |
| return False |
| obj_to_set = getattr(obj_to_set, part) |
| if obj_to_set is None: |
| logging.warning(f"Intermediate attribute {part} is None in {attr_str}") |
| return False |
| if hasattr(obj_to_set, parts[-1]): |
| setattr(obj_to_set, parts[-1], value) |
| return True |
| else: |
| logging.warning(f"Final attribute {parts[-1]} not found in {attr_str} on object {type(obj_to_set)}") |
| return False |
| except AttributeError as e: |
| logging.error(f"AttributeError setting {attr_str}: {e}") |
| return False |
| except Exception as e: |
| logging.error(f"Generic error setting attribute {attr_str}: {e}") |
| return False |
|
|
|
|
| def _get_encoder_hidden_size(encoder_model_id, trust_remote_code=True): |
| try: |
| encoder_config = AutoConfig.from_pretrained(encoder_model_id, trust_remote_code=trust_remote_code) |
|
|
| possible_keys = ['hidden_size', 'd_model', 'embed_dim'] |
| for key in possible_keys: |
| if hasattr(encoder_config, key): |
| size = getattr(encoder_config, key) |
| if isinstance(size, int) and size > 0: |
| return size |
|
|
| nested_configs = ['vision_config', 'audio_config', 'encoder'] |
| for nested_name in nested_configs: |
| if hasattr(encoder_config, nested_name): |
| nested_cfg = getattr(encoder_config, nested_name) |
| if nested_cfg is not None: |
| for key in possible_keys: |
| if hasattr(nested_cfg, key): |
| size = getattr(nested_cfg, key) |
| if isinstance(size, int) and size > 0: |
| return size |
|
|
| raise ValueError(f"Could not automatically determine hidden/embedding size for encoder {encoder_model_id}. Checked attributes: {possible_keys} and nested configs: {nested_configs}.") |
| except Exception as e: |
| logging.error(f"Failed to get config or hidden size for encoder {encoder_model_id}: {e}") |
| raise ValueError(f"Failed to get config or hidden size for encoder {encoder_model_id}") from e |
|
|
|
|
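| # Swaps attention/MLP nn.Linear modules for BitLinear, optionally copying the existing weights. |
| # Usage sketch: convert_to_bitnet(model, model.config, copy_weights=True). |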
| def convert_to_bitnet(model, config, copy_weights=True): |
| # BitLinear always uses the custom RMSNorm defined above, so no torch-native nn.RMSNorm is required. |
|
|
| device = get_device() |
| converted_count = 0 |
| modules_to_process = list(model.named_modules()) |
| processed_names = set() |
|
|
| with torch.no_grad(): |
| for name, module in modules_to_process: |
| if name in processed_names: |
| continue |
|
|
| is_target_linear = isinstance(module, nn.Linear) and ( |
| any(sub in name.lower() for sub in ["attn", "mlp", "fc", "dense", "query", "key", "value", "out", "wi", "wo"]) |
| and "norm" not in name.lower() |
| and "embedding" not in name.lower() |
| ) |
|
|
| if is_target_linear: |
| try: |
| current_dtype = module.weight.dtype if hasattr(module, 'weight') and module.weight is not None else torch.float32 |
| has_bias = module.bias is not None |
|
|
| bl = BitLinear(module.in_features, module.out_features, has_bias).to(device=device, dtype=current_dtype) |
|
|
| if copy_weights and hasattr(module, 'weight') and module.weight is not None: |
| if bl.weight.shape == module.weight.shape: |
| bl.weight.data.copy_(module.weight.data) |
| else: |
| logging.warning(f"Shape mismatch for weight {name}: Expected {bl.weight.shape}, got {module.weight.shape}. Skipping weight copy.") |
|
|
| if has_bias and bl.bias is not None: |
| if bl.bias.shape == module.bias.shape: |
| bl.bias.data.copy_(module.bias.data) |
| else: |
| logging.warning(f"Shape mismatch for bias {name}: Expected {bl.bias.shape}, got {module.bias.shape}. Skipping bias copy.") |
| elif not has_bias and bl.bias is not None: |
| nn.init.zeros_(bl.bias) |
| elif has_bias and bl.bias is None: |
| logging.warning(f"Module {name} had bias, but BitLinear does not. Bias info lost.") |
|
|
| elif not copy_weights: |
| nn.init.xavier_uniform_(bl.weight) |
| if bl.bias is not None: |
| nn.init.zeros_(bl.bias) |
|
|
| if _recursive_setattr(model, name, bl): |
| converted_count += 1 |
| processed_names.add(name) |
| logging.debug(f"Converted layer {name} to BitLinear.") |
| else: |
| logging.warning(f"Failed to set BitLinear for {name}") |
| processed_names.add(name) |
|
|
| except Exception as e: |
| logging.error(f"Error replacing {name} with BitLinear: {e} \n{traceback.format_exc()}") |
| processed_names.add(name) |
|
|
| if converted_count > 0: |
| config.bitnet_applied = True |
| logging.info(f"Applied BitNet conversion to {converted_count} linear layers.") |
| return f"Applied BitNet conversion to {converted_count} linear layers." |
| else: |
| logging.info("No applicable linear layers found or converted for BitNet.") |
| config.bitnet_applied = False |
| return "No applicable layers found for BitNet conversion." |
|
|
| def revert_bitnet(model, config): |
| if not getattr(config, 'bitnet_applied', False): |
| return "BitNet not applied according to config, nothing to revert." |
|
|
| device = get_device() |
| model.to(device) |
| reverted_count = 0 |
| modules_to_process = list(model.named_modules()) |
| processed_names = set() |
|
|
| with torch.no_grad(): |
| for name, module in modules_to_process: |
| if name in processed_names: |
| continue |
|
|
| if isinstance(module, BitLinear): |
| try: |
| dtype = module.weight.dtype if hasattr(module, 'weight') and module.weight is not None else torch.float32 |
| has_bias = module.bias is not None |
|
|
| lin = nn.Linear(module.in_features, module.out_features, bias=has_bias).to(device, dtype=dtype) |
|
|
| if hasattr(module, 'weight') and module.weight is not None: |
| if lin.weight.shape == module.weight.shape: |
| lin.weight.data.copy_(module.weight.data) |
| else: |
| logging.warning(f"Shape mismatch reverting weight {name}: Expected {lin.weight.shape}, got {module.weight.shape}. Re-initializing nn.Linear weight.") |
| nn.init.kaiming_uniform_(lin.weight, a=math.sqrt(5)) |
| else: |
| nn.init.kaiming_uniform_(lin.weight, a=math.sqrt(5)) |
|
|
| if has_bias and lin.bias is not None: |
| if lin.bias.shape == module.bias.shape: |
| lin.bias.data.copy_(module.bias.data) |
| else: |
| logging.warning(f"Shape mismatch reverting bias {name}: Expected {lin.bias.shape}, got {module.bias.shape}. Re-initializing nn.Linear bias.") |
| fan_in, _ = nn.init._calculate_fan_in_and_fan_out(lin.weight) |
| bound = 1 / math.sqrt(fan_in) if fan_in > 0 else 0 |
| nn.init.uniform_(lin.bias, -bound, bound) |
| elif has_bias and lin.bias is None: |
| logging.error(f"BitLinear layer {name} had bias, but reverted nn.Linear does not. Reversion failed for bias.") |
| elif not has_bias and lin.bias is not None: |
| logging.error(f"BitLinear layer {name} lacked bias, but reverted nn.Linear has one. Setting to zero.") |
| nn.init.zeros_(lin.bias) |
|
|
| if _recursive_setattr(model, name, lin): |
| reverted_count += 1 |
| processed_names.add(name) |
| logging.debug(f"Reverted BitLinear layer {name} to nn.Linear.") |
| else: |
| logging.warning(f"Failed to revert BitLinear for {name}") |
| processed_names.add(name) |
|
|
| except Exception as e: |
| logging.error(f"Error reverting BitLinear {name}: {e} \n{traceback.format_exc()}") |
| processed_names.add(name) |
|
|
| if reverted_count > 0: |
| config.bitnet_applied = False |
| logging.info(f"Reverted {reverted_count} BitNet layers to standard nn.Linear.") |
| return f"Reverted {reverted_count} BitNet layers." |
| else: |
| config.bitnet_applied = False |
| logging.info("No BitNet layers found to revert, but flag was true. Resetting flag.") |
| return "No BitNet layers found to revert." |
|
|
|
|
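| # Heuristically locates the transformer block list (e.g. model.layers, transformer.h) across |
| # common architectures, returning (parent module, attribute name, ModuleList). |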
| def _find_decoder_layers_module(model): |
| prefixes = [ |
| ('model.decoder', 'layers'), |
| ('model.layers', None), |
| ('transformer.h', None), |
| ('transformer.blocks', None), |
| ('encoder.layer', None), |
| ('model.encoder.layers', None), |
| ('', 'layers'), |
| ('', 'h'), |
| ('model', 'layers'), |
| ('decoder.block', None), |
| ('decoder.layers', None) |
| ] |
| if hasattr(model, 'model'): |
| base_obj = model.model |
| else: |
| base_obj = model |
|
|
| direct_attrs = ['layers', 'h', 'blocks', 'block'] |
| for attr in direct_attrs: |
| if hasattr(base_obj, attr): |
| layer_list = getattr(base_obj, attr) |
| if isinstance(layer_list, nn.ModuleList) and len(layer_list) > 0: |
| logging.info(f"Found layer list at 'model.{attr}' or '{attr}' with {len(layer_list)} layers.") |
| return base_obj, attr, layer_list |
| elif isinstance(layer_list, (list, tuple)) and len(layer_list) > 0 and isinstance(layer_list[0], nn.Module): |
| logging.warning(f"Found layers as list/tuple at 'model.{attr}' or '{attr}'. Converting to ModuleList.") |
| setattr(base_obj, attr, nn.ModuleList(layer_list)) |
| return base_obj, attr, getattr(base_obj, attr) |
|
|
| for p_base, attr_name_explicit in prefixes: |
| mod = model |
| valid_path = True |
| if p_base: |
| for comp in p_base.split('.'): |
| if not hasattr(mod, comp): |
| valid_path = False |
| break |
| mod = getattr(mod, comp) |
| if mod is None: |
| valid_path = False |
| break |
| if not valid_path: |
| continue |
|
|
| attrs_to_check = [attr_name_explicit] if attr_name_explicit else ['layers', 'h', 'blocks', 'block', 'layer'] |
|
|
| for attr in attrs_to_check: |
| if hasattr(mod, attr): |
| layer_list = getattr(mod, attr) |
| if isinstance(layer_list, nn.ModuleList) and len(layer_list) > 0: |
| logging.info(f"Found layer list at '{p_base}.{attr}' with {len(layer_list)} layers.") |
| return mod, attr, layer_list |
| elif isinstance(layer_list, (list, tuple)) and len(layer_list) > 0 and isinstance(layer_list[0], nn.Module): |
| logging.warning(f"Found layers as list/tuple at '{p_base}.{attr}'. Converting to ModuleList.") |
| setattr(mod, attr, nn.ModuleList(layer_list)) |
| return mod, attr, getattr(mod, attr) |
|
|
| logging.warning("Could not automatically find the standard decoder/transformer layer list module.") |
| return None, None, None |
|
|
|
|
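| # Truncates the transformer block list down to target_layers, remembering the original depth in |
| # config.original_num_layers so it can be restored later. |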
| def _reduce_layers_to_one(base_model, config, target_layers=1): |
| if not isinstance(target_layers, int) or target_layers < 1: |
| logging.error(f"Invalid target_layers value: {target_layers}. Must be an integer >= 1.") |
| return f"Error: Target layers must be >= 1, got {target_layers}." |
|
|
| layer_module, layer_attr, current_layers = _find_decoder_layers_module(base_model) |
|
|
| if layer_module and layer_attr and current_layers is not None: |
| current_len = len(current_layers) |
| if current_len <= 0: |
| logging.warning("Found layer attribute but the ModuleList is empty. Cannot reduce.") |
| return "Warning: Layer list found but it's empty. Cannot reduce." |
|
|
| if current_len > target_layers: |
| logging.info(f"Reducing layers: {current_len} -> {target_layers}...") |
| original_layer_count = current_len |
|
|
| if not hasattr(config, 'original_num_layers') or config.original_num_layers is None or config.original_num_layers < current_len: |
| config.original_num_layers = original_layer_count |
| logging.info(f"Stored/Updated original layer count in config: {original_layer_count}") |
|
|
| new_layer_list = nn.ModuleList(current_layers[:target_layers]) |
| setattr(layer_module, layer_attr, new_layer_list) |
|
|
| config.num_hidden_layers = target_layers |
| config.reduced_layers = True |
| if hasattr(config, 'n_layer'): config.n_layer = target_layers |
| if hasattr(config, 'num_layers'): config.num_layers = target_layers |
| if hasattr(config, 'num_decoder_layers'): config.num_decoder_layers = target_layers |
| if hasattr(config, 'num_encoder_layers') and 'encoder' in layer_attr: config.num_encoder_layers = target_layers |
|
|
|
|
| logging.info(f"Successfully reduced layers to {target_layers}.") |
| clean_memory() |
| return f"Layers reduced to {target_layers}. Original count was: {original_layer_count}." |
| elif current_len == target_layers: |
| logging.info(f"Model already has {current_len} layers, matching the target {target_layers}. No reduction needed.") |
| config.reduced_layers = current_len != (getattr(config, 'original_num_layers', None) or current_len) |
| config.num_hidden_layers = current_len |
| if hasattr(config, 'n_layer'): config.n_layer = current_len |
| if hasattr(config, 'num_layers'): config.num_layers = current_len |
| if hasattr(config, 'num_decoder_layers'): config.num_decoder_layers = current_len |
| if hasattr(config, 'num_encoder_layers') and 'encoder' in layer_attr: config.num_encoder_layers = current_len |
| return f"Model already has {current_len} layers (target {target_layers}). No reduction performed." |
| else: |
| logging.info(f"Model has {current_len} layers, which is less than the target {target_layers}. No reduction needed.") |
| config.reduced_layers = current_len != (getattr(config, 'original_num_layers', None) or current_len) |
| config.num_hidden_layers = current_len |
| if hasattr(config, 'n_layer'): config.n_layer = current_len |
| if hasattr(config, 'num_layers'): config.num_layers = current_len |
| if hasattr(config, 'num_decoder_layers'): config.num_decoder_layers = current_len |
| if hasattr(config, 'num_encoder_layers') and 'encoder' in layer_attr: config.num_encoder_layers = current_len |
| return f"Model already has {current_len} layers (< target {target_layers}). No reduction performed." |
| else: |
| logging.warning("Could not find standard layer structure for reduction.") |
| config.reduced_layers = False |
| return "Warning: Could not find standard layer structure for reduction." |
|
|
|
|
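| # Restores the block list to its original depth by deep-copying the first remaining layer and |
| # re-initializing the copies; the added layers are freshly initialized, not the original weights. |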
| def _enable_full_layers(base_model, config, original_num_layers=None): |
| if not getattr(config, 'reduced_layers', False): |
| layer_module, layer_attr, current_layers = _find_decoder_layers_module(base_model) |
| current_len = len(current_layers) if current_layers is not None else 0 |
| orig_len_config = getattr(config, 'original_num_layers', None) |
| if current_len > 0 and orig_len_config is not None and current_len == orig_len_config: |
| config.reduced_layers = False |
| return "Layers already seem to be at the original count. Flag corrected if necessary." |
| else: |
| return "Layers not previously reduced according to config flag, or cannot verify current/original counts." |
|
|
| orig_layers = original_num_layers if original_num_layers is not None else getattr(config, 'original_num_layers', None) |
|
|
| if orig_layers is None: |
| global original_num_layers_global |
| orig_layers = original_num_layers_global |
| if orig_layers is not None: |
| logging.warning(f"Using globally stored original layer count: {orig_layers} as it was missing in config.") |
| config.original_num_layers = orig_layers |
| else: |
| logging.error("Cannot restore layers: Original layer count is missing from config and global state.") |
| return "Error: Cannot revert - Original layer count unknown." |
|
|
| if not isinstance(orig_layers, int) or orig_layers <= 0: |
| logging.error(f"Cannot restore layers: Invalid original layer count found ({orig_layers}).") |
| return f"Error: Cannot revert - Invalid original layer count ({orig_layers})." |
|
|
| layer_module, layer_attr, current_layers = _find_decoder_layers_module(base_model) |
|
|
| if layer_module and layer_attr and current_layers is not None: |
| current_len = len(current_layers) |
| if current_len < orig_layers: |
| logging.info(f"Restoring layers: {current_len} -> {orig_layers}..."); T = time.time() |
| try: |
| if current_len == 0: |
| logging.error("Cannot restore layers: No existing layers found to copy structure from.") |
| return "Error: Cannot restore layers - no template layer available." |
|
|
| device = next(iter(current_layers[0].parameters()), torch.tensor([])).device |
| # Deep-copy the template so the existing first layer is not moved off its device in place. |
| template_layer = copy.deepcopy(current_layers[0]).to('cpu') |
|
|
| layers_to_add = [] |
| num_layers_to_add = orig_layers - current_len |
| logging.info(f"Need to add {num_layers_to_add} layers.") |
|
|
| for i in range(num_layers_to_add): |
| new_layer = copy.deepcopy(template_layer) |
| for _, sub_module in new_layer.named_modules(): |
| if hasattr(sub_module, 'reset_parameters'): |
| try: |
| sub_module.reset_parameters() |
| except Exception as reset_e: |
| logging.warning(f"Could not reset parameters for submodule {sub_module} in new layer {i}: {reset_e}") |
| elif isinstance(sub_module, nn.Linear): |
| nn.init.kaiming_uniform_(sub_module.weight, a=math.sqrt(5)) |
| if sub_module.bias is not None: |
| fan_in, _ = nn.init._calculate_fan_in_and_fan_out(sub_module.weight) |
| bound = 1 / math.sqrt(fan_in) if fan_in > 0 else 0 |
| nn.init.uniform_(sub_module.bias, -bound, bound) |
| elif isinstance(sub_module, nn.Embedding): |
| nn.init.normal_(sub_module.weight) |
| if sub_module.padding_idx is not None: |
| with torch.no_grad(): sub_module.weight[sub_module.padding_idx].fill_(0) |
| elif isinstance(sub_module, (nn.LayerNorm, RMSNorm, BypassLayerNorm)): |
| if sub_module.elementwise_affine: |
| if hasattr(sub_module, 'weight') and sub_module.weight is not None: nn.init.ones_(sub_module.weight) |
| if hasattr(sub_module, 'bias') and sub_module.bias is not None: nn.init.zeros_(sub_module.bias) |
|
|
| new_layer = new_layer.to(device) |
| layers_to_add.append(new_layer) |
|
|
| full_layer_list = nn.ModuleList(list(current_layers) + layers_to_add) |
| setattr(layer_module, layer_attr, full_layer_list) |
|
|
| config.num_hidden_layers = orig_layers |
| config.reduced_layers = False |
| if hasattr(config, 'n_layer'): config.n_layer = orig_layers |
| if hasattr(config, 'num_layers'): config.num_layers = orig_layers |
| if hasattr(config, 'num_decoder_layers'): config.num_decoder_layers = orig_layers |
| if hasattr(config, 'num_encoder_layers') and 'encoder' in layer_attr: config.num_encoder_layers = orig_layers |
|
|
|
|
| msg = f"Restored layer structure to {orig_layers} layers in {time.time()-T:.2f}s." |
| logging.info(msg) |
| clean_memory() |
| return msg |
|
|
| except Exception as e: |
| logging.error(f"Error restoring layers: {e}\n{traceback.format_exc()}") |
| setattr(layer_module, layer_attr, current_layers) |
| config.num_hidden_layers = current_len |
| config.reduced_layers = True |
| if hasattr(config, 'n_layer'): config.n_layer = current_len |
| if hasattr(config, 'num_layers'): config.num_layers = current_len |
| if hasattr(config, 'num_decoder_layers'): config.num_decoder_layers = current_len |
| if hasattr(config, 'num_encoder_layers') and 'encoder' in layer_attr: config.num_encoder_layers = current_len |
|
|
| return f"Error restoring layers: {e}. State might be inconsistent." |
| else: |
| config.reduced_layers = False |
| config.num_hidden_layers = current_len |
| if hasattr(config, 'n_layer'): config.n_layer = current_len |
| if hasattr(config, 'num_layers'): config.num_layers = current_len |
| if hasattr(config, 'num_decoder_layers'): config.num_decoder_layers = current_len |
| if hasattr(config, 'num_encoder_layers') and 'encoder' in layer_attr: config.num_encoder_layers = current_len |
|
|
|
|
| msg = f"Model already has {current_len} layers (>= original {orig_layers}). No restoration needed. Corrected flags if necessary." |
| logging.info(msg) |
| return msg |
| elif layer_module and layer_attr and current_layers is None: |
| logging.warning(f"Layer attribute '{layer_attr}' exists but is None or invalid. Cannot restore layers.") |
| return "Warning: Layer attribute found but invalid. Cannot restore layers." |
| else: |
| logging.warning("Could not find standard layer structure for restoration.") |
| return "Warning: Could not find standard layer structure for restoration." |
|
|
| def _replace_linear_without_bias(module, config): |
| device = get_device() |
| replaced_count = 0 |
| modules_to_process = list(module.named_modules()) |
| processed_names = set() |
|
|
| for name, child in modules_to_process: |
| if name in processed_names: |
| continue |
|
|
| if isinstance(child, nn.Linear) and child.bias is not None: |
| try: |
| dtype = child.weight.dtype |
| current_device = child.weight.device |
|
|
| new_linear = nn.Linear(child.in_features, child.out_features, bias=False).to(device=current_device, dtype=dtype) |
|
|
| with torch.no_grad(): |
| if new_linear.weight.shape == child.weight.shape: |
| new_linear.weight.copy_(child.weight) |
| else: |
| logging.warning(f"Shape mismatch removing bias for weight {name}: Expected {new_linear.weight.shape}, got {child.weight.shape}. Re-initializing.") |
| nn.init.kaiming_uniform_(new_linear.weight, a=math.sqrt(5)) |
|
|
| if _recursive_setattr(module, name, new_linear): |
| replaced_count += 1 |
| processed_names.add(name) |
| logging.debug(f"Removed bias from layer {name}") |
| else: |
| logging.warning(f"Failed to set bias-less Linear for {name}") |
| processed_names.add(name) |
|
|
| except Exception as e: |
| logging.error(f"Error removing bias for layer {name}: {e}") |
| processed_names.add(name) |
|
|
| if replaced_count > 0: |
| config.removed_bias = True |
| logging.info(f"Removed bias from {replaced_count} linear layers.") |
| return f"Removed bias from {replaced_count} linear layers." |
| else: |
| logging.info("No linear layers with bias found to modify.") |
| return "No linear layers with bias found to modify." |
|
|
|
|
| def _enable_bias_in_linear(module, config): |
| if not getattr(config, 'removed_bias', False): |
| return "Bias not previously removed according to config flag. Cannot enable (revert)." |
|
|
| device = get_device() |
| enabled_count = 0 |
| modules_to_process = list(module.named_modules()) |
| processed_names = set() |
|
|
| for name, child in modules_to_process: |
| if name in processed_names: |
| continue |
|
|
| if isinstance(child, nn.Linear) and child.bias is None: |
| try: |
| dtype = child.weight.dtype |
| current_device = child.weight.device |
|
|
| new_linear = nn.Linear(child.in_features, child.out_features, bias=True).to(device=current_device, dtype=dtype) |
|
|
| with torch.no_grad(): |
| if new_linear.weight.shape == child.weight.shape: |
| new_linear.weight.copy_(child.weight) |
| else: |
| logging.warning(f"Shape mismatch enabling bias for weight {name}: Expected {new_linear.weight.shape}, got {child.weight.shape}. Re-initializing weight.") |
| nn.init.kaiming_uniform_(new_linear.weight, a=math.sqrt(5)) |
|
|
| fan_in, _ = nn.init._calculate_fan_in_and_fan_out(new_linear.weight) |
| bound = 1 / math.sqrt(fan_in) if fan_in > 0 else 0 |
| nn.init.uniform_(new_linear.bias, -bound, bound) |
|
|
| if _recursive_setattr(module, name, new_linear): |
| enabled_count += 1 |
| processed_names.add(name) |
| logging.debug(f"Enabled bias for layer {name}") |
| else: |
| logging.warning(f"Failed to set biased Linear for {name}") |
| processed_names.add(name) |
|
|
| except Exception as e: |
| logging.error(f"Error enabling bias for layer {name}: {e}") |
| processed_names.add(name) |
|
|
| if enabled_count > 0: |
| config.removed_bias = False |
| logging.info(f"Enabled (restored) bias for {enabled_count} linear layers.") |
| return f"Enabled bias for {enabled_count} linear layers." |
| else: |
| config.removed_bias = False |
| logging.info("No bias-less linear layers found to enable bias for. Resetting flag.") |
| return "No bias-less linear layers found to enable bias for." |
|
|
|
|
|
|
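| # Replaces every standard nn.LayerNorm with BypassLayerNorm (weights copied), so normalization |
| # can later be toggled off at runtime via _enable_layer_norm_bypass / _disable_layer_norm_bypass. |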
| def _replace_layer_norm_with_bypass(module, config): |
| replaced_count = 0 |
| modules_to_process = list(module.named_modules()) |
| processed_names = set() |
|
|
| for name, child in modules_to_process: |
| if name in processed_names: |
| continue |
|
|
| if isinstance(child, nn.LayerNorm) and not isinstance(child, (BypassLayerNorm, RMSNorm)): |
| try: |
| child_device = get_device() |
| child_dtype = torch.float32 |
| if hasattr(child, 'weight') and child.weight is not None: |
| child_device = child.weight.device |
| child_dtype = child.weight.dtype |
| elif hasattr(child, 'bias') and child.bias is not None: |
| child_device = child.bias.device |
| child_dtype = child.bias.dtype |
| elif hasattr(child, '_parameters') and child._parameters: |
| first_param = next(iter(child.parameters()), None) |
| if first_param is not None: |
| child_device = first_param.device |
| child_dtype = first_param.dtype |
|
|
| norm_shape = child.normalized_shape |
| eps = child.eps |
| affine = child.elementwise_affine |
|
|
| new_layer_norm = BypassLayerNorm(norm_shape, eps, affine, device=child_device, dtype=child_dtype) |
|
|
| if affine: |
| with torch.no_grad(): |
| if hasattr(child, 'weight') and child.weight is not None and new_layer_norm.weight is not None: |
| if new_layer_norm.weight.shape == child.weight.shape: |
| new_layer_norm.weight.copy_(child.weight) |
| else: |
| logging.warning(f"Shape mismatch replacing LN weight {name}. Expected {new_layer_norm.weight.shape}, got {child.weight.shape}. Initializing BypassLN weight.") |
| nn.init.ones_(new_layer_norm.weight) |
| elif new_layer_norm.weight is not None: |
| nn.init.ones_(new_layer_norm.weight) |
|
|
| if hasattr(child, 'bias') and child.bias is not None and new_layer_norm.bias is not None: |
| if new_layer_norm.bias.shape == child.bias.shape: |
| new_layer_norm.bias.copy_(child.bias) |
| else: |
| logging.warning(f"Shape mismatch replacing LN bias {name}. Expected {new_layer_norm.bias.shape}, got {child.bias.shape}. Initializing BypassLN bias.") |
| nn.init.zeros_(new_layer_norm.bias) |
| elif new_layer_norm.bias is not None: |
| nn.init.zeros_(new_layer_norm.bias) |
|
|
| if _recursive_setattr(module, name, new_layer_norm): |
| replaced_count += 1 |
| processed_names.add(name) |
| logging.debug(f"Replaced LayerNorm {name} with BypassLayerNorm.") |
| else: |
| logging.warning(f"Failed to set BypassLayerNorm for {name}") |
| processed_names.add(name) |
|
|
| except Exception as e: |
| logging.error(f"Error replacing LayerNorm {name} with Bypass version: {e}\n{traceback.format_exc()}") |
| processed_names.add(name) |
|
|
| if replaced_count > 0: |
| config.replaced_layer_norm = True |
| config.layer_norm_bypassed = False |
| logging.info(f"Replaced {replaced_count} LayerNorm layers with Bypass version.") |
| return f"Replaced {replaced_count} LayerNorm layers with Bypass version." |
| else: |
| logging.info("No standard nn.LayerNorm layers found to replace with BypassLayerNorm.") |
| return "No standard LayerNorm layers found to replace." |
|
|
| def _revert_bypass_layer_norm(module, config): |
| if not getattr(config, 'replaced_layer_norm', False): |
| return "BypassLayerNorm not previously applied according to config flag. Cannot revert." |
|
|
| reverted_count = 0 |
| modules_to_process = list(module.named_modules()) |
| processed_names = set() |
|
|
| for name, child in modules_to_process: |
| if name in processed_names: |
| continue |
|
|
| if isinstance(child, BypassLayerNorm): |
| try: |
| child_device = get_device() |
| child_dtype = torch.float32 |
| if child.elementwise_affine: |
| if child.weight is not None: |
| child_device = child.weight.device |
| child_dtype = child.weight.dtype |
| elif child.bias is not None: |
| child_device = child.bias.device |
| child_dtype = child.bias.dtype |
| else: |
| pass |
|
|
| norm_shape = child.normalized_shape |
| eps = child.eps |
| affine = child.elementwise_affine |
|
|
| if isinstance(norm_shape, tuple) and len(norm_shape) == 1: |
| norm_arg = norm_shape[0] |
| elif isinstance(norm_shape, (list, tuple)): |
| norm_arg = list(norm_shape) |
| elif isinstance(norm_shape, int): |
| norm_arg = norm_shape |
| else: |
| raise ValueError(f"Unsupported normalized_shape type for nn.LayerNorm: {type(norm_shape)}") |
|
|
| new_layer_norm = nn.LayerNorm(norm_arg, eps, affine, device=child_device, dtype=child_dtype) |
|
|
| if affine: |
| with torch.no_grad(): |
| if hasattr(child, 'weight') and child.weight is not None and new_layer_norm.weight is not None: |
| if new_layer_norm.weight.shape == child.weight.shape: |
| new_layer_norm.weight.copy_(child.weight) |
| else: |
| logging.warning(f"Shape mismatch reverting BypassLN weight {name}. Expected {new_layer_norm.weight.shape}, got {child.weight.shape}. Initializing LayerNorm weight.") |
| nn.init.ones_(new_layer_norm.weight) |
| elif new_layer_norm.weight is not None: |
| nn.init.ones_(new_layer_norm.weight) |
|
|
| if hasattr(child, 'bias') and child.bias is not None and new_layer_norm.bias is not None: |
| if new_layer_norm.bias.shape == child.bias.shape: |
| new_layer_norm.bias.copy_(child.bias) |
| else: |
| logging.warning(f"Shape mismatch reverting BypassLN bias {name}. Expected {new_layer_norm.bias.shape}, got {child.bias.shape}. Initializing LayerNorm bias.") |
| nn.init.zeros_(new_layer_norm.bias) |
| elif new_layer_norm.bias is not None: |
| nn.init.zeros_(new_layer_norm.bias) |
|
|
| if _recursive_setattr(module, name, new_layer_norm): |
| reverted_count += 1 |
| processed_names.add(name) |
| logging.debug(f"Reverted BypassLayerNorm {name} to standard nn.LayerNorm.") |
| else: |
| logging.warning(f"Failed to revert BypassLayerNorm for {name}") |
| processed_names.add(name) |
|
|
| except Exception as e: |
| logging.error(f"Error reverting BypassLayerNorm {name} to standard LayerNorm: {e}\n{traceback.format_exc()}") |
| processed_names.add(name) |
|
|
| if reverted_count > 0: |
| config.replaced_layer_norm = False |
| config.layer_norm_bypassed = False |
| logging.info(f"Reverted {reverted_count} BypassLayerNorm layers back to standard nn.LayerNorm.") |
| return f"Reverted {reverted_count} BypassLayerNorm layers." |
| else: |
| config.replaced_layer_norm = False |
| config.layer_norm_bypassed = False |
| logging.info("No BypassLayerNorm layers found to revert. Resetting flags.") |
| return "No BypassLayerNorm layers found to revert." |
|
|
|
|
| def _enable_layer_norm_bypass(model): |
| count = 0 |
| found_bypass_layers = False |
| for m in model.modules(): |
| if isinstance(m, BypassLayerNorm): |
| found_bypass_layers = True |
| if not m.bypass: |
| m.bypass = True |
| count += 1 |
|
|
| if not found_bypass_layers: |
| if getattr(model.config, 'replaced_layer_norm', False): |
| logging.warning("Config indicates LN were replaced with BypassLN, but none found. Cannot enable bypass.") |
| model.config.layer_norm_bypassed = False |
| return "Replaced LN flag is true, but no BypassLN layers found. Run 'Replace LN' first or revert." |
| else: |
| return "No BypassLayerNorm layers found in the model. Replace standard LayerNorm first to enable bypass functionality." |
|
|
| elif count > 0: |
| model.config.layer_norm_bypassed = True |
| logging.info(f"Enabled bypass for {count} BypassLayerNorm layers.") |
| return f"Enabled bypass for {count} LN layers." |
| else: |
| model.config.layer_norm_bypassed = True |
| logging.info("All existing BypassLayerNorm layers already have bypass enabled.") |
| return "No changes made (layers might already be bypassed)." |
|
|
| def _disable_layer_norm_bypass(model): |
| count = 0 |
| found_bypass_layers = False |
| for m in model.modules(): |
| if isinstance(m, BypassLayerNorm): |
| found_bypass_layers = True |
| if m.bypass: |
| m.bypass = False |
| count += 1 |
|
|
| if not found_bypass_layers: |
| if getattr(model.config, 'replaced_layer_norm', False): |
| model.config.layer_norm_bypassed = False |
| return "Replaced LN flag is true, but no BypassLN layers found to disable bypass on." |
| else: |
| return "No BypassLayerNorm layers found in the model to disable bypass on." |
|
|
| elif count > 0: |
| model.config.layer_norm_bypassed = False |
| logging.info(f"Disabled bypass for {count} BypassLayerNorm layers.") |
| return f"Disabled bypass for {count} LN layers." |
| else: |
| model.config.layer_norm_bypassed = False |
| logging.info("All existing BypassLayerNorm layers already have bypass disabled.") |
| return "No changes made (layers might already have bypass disabled)." |
|
|
|
|
| def _replace_dropout_with_bypass(module, config): |
| replaced_count = 0 |
| modules_to_process = list(module.named_modules()) |
| processed_names = set() |
|
|
| for name, child in modules_to_process: |
| if name in processed_names: |
| continue |
|
|
| if type(child) == nn.Dropout: |
| try: |
| new_dropout = BypassDropout(child.p, child.inplace) |
| try: |
| parent_name = '.'.join(name.split('.')[:-1]) |
| parent_module = module.get_submodule(parent_name) if parent_name else module |
| first_param = next(iter(parent_module.parameters()), None) |
| if first_param is not None: |
| new_dropout.to(device=first_param.device) |
| except Exception: |
| new_dropout.to(device=get_device()) |
|
|
| if _recursive_setattr(module, name, new_dropout): |
| replaced_count += 1 |
| processed_names.add(name) |
| logging.debug(f"Replaced Dropout {name} with BypassDropout.") |
| else: |
| logging.warning(f"Failed to set BypassDropout for {name}") |
| processed_names.add(name) |
| except Exception as e: |
| logging.error(f"Error replacing Dropout {name} with Bypass version: {e}") |
| processed_names.add(name) |
|
|
| if replaced_count > 0: |
| config.replaced_dropout = True |
| config.dropout_bypassed = False |
| logging.info(f"Replaced {replaced_count} nn.Dropout layers with BypassDropout version.") |
| return f"Replaced {replaced_count} Dropout layers." |
| else: |
| logging.info("No standard nn.Dropout layers found to replace with BypassDropout.") |
| return "No standard Dropout layers found to replace." |
|
|
|
|
| def _revert_bypass_dropout(module, config): |
| if not getattr(config, 'replaced_dropout', False): |
| return "BypassDropout not previously applied according to config flag. Cannot revert." |
|
|
| reverted_count = 0 |
| modules_to_process = list(module.named_modules()) |
| processed_names = set() |
|
|
| for name, child in modules_to_process: |
| if name in processed_names: |
| continue |
|
|
| if isinstance(child, BypassDropout): |
| try: |
| new_dropout = nn.Dropout(child.p, child.inplace) |
| try: |
| parent_name = '.'.join(name.split('.')[:-1]) |
| parent_module = module.get_submodule(parent_name) if parent_name else module |
| first_param = next(iter(parent_module.parameters()), None) |
| if first_param is not None: |
| new_dropout.to(device=first_param.device) |
| except Exception: |
| new_dropout.to(device=get_device()) |
|
|
|
|
| if _recursive_setattr(module, name, new_dropout): |
| reverted_count += 1 |
| processed_names.add(name) |
| logging.debug(f"Reverted BypassDropout {name} to standard nn.Dropout.") |
| else: |
| logging.warning(f"Failed to revert BypassDropout for {name}") |
| processed_names.add(name) |
| except Exception as e: |
| logging.error(f"Error reverting BypassDropout {name} to standard nn.Dropout: {e}") |
| processed_names.add(name) |
|
|
| if reverted_count > 0: |
| config.replaced_dropout = False |
| config.dropout_bypassed = False |
| logging.info(f"Reverted {reverted_count} BypassDropout layers back to standard nn.Dropout.") |
| return f"Reverted {reverted_count} BypassDropout layers." |
| else: |
| config.replaced_dropout = False |
| config.dropout_bypassed = False |
| logging.info("No BypassDropout layers found to revert. Resetting flags.") |
| return "No BypassDropout layers found to revert." |
|
|
| def _enable_dropout_bypass(model): |
| count = 0 |
| found_bypass_layers = False |
| for m in model.modules(): |
| if isinstance(m, BypassDropout): |
| found_bypass_layers = True |
| if not m.bypass: |
| m.bypass = True |
| count += 1 |
|
|
| if not found_bypass_layers: |
| if getattr(model.config, 'replaced_dropout', False): |
| model.config.dropout_bypassed = False |
| return "Replaced Dropout flag is true, but no BypassDropout layers found. Run 'Replace DO' first or revert." |
| else: |
| return "No BypassDropout layers found in the model. Replace standard Dropout first to enable bypass." |
|
|
| elif count > 0: |
| model.config.dropout_bypassed = True |
| logging.info(f"Enabled bypass for {count} BypassDropout layers.") |
| return f"Enabled bypass for {count} Dropout layers." |
| else: |
| model.config.dropout_bypassed = True |
| logging.info("All existing BypassDropout layers already have bypass enabled.") |
| return "No changes made (layers might already be bypassed)." |
|
|
| def _disable_dropout_bypass(model): |
| count = 0 |
| found_bypass_layers = False |
| for m in model.modules(): |
| if isinstance(m, BypassDropout): |
| found_bypass_layers = True |
| if m.bypass: |
| m.bypass = False |
| count += 1 |
|
|
| if not found_bypass_layers: |
| if getattr(model.config, 'replaced_dropout', False): |
| model.config.dropout_bypassed = False |
| return "Replaced Dropout flag is true, but no BypassDropout layers found to disable bypass on." |
| else: |
| return "No BypassDropout layers found in the model to disable bypass on." |
|
|
| elif count > 0: |
| model.config.dropout_bypassed = False |
| logging.info(f"Disabled bypass for {count} BypassDropout layers.") |
| return f"Disabled bypass for {count} Dropout layers." |
| else: |
| model.config.dropout_bypassed = False |
| logging.info("All existing BypassDropout layers already have bypass disabled.") |
| return "No changes made (layers might already have bypass disabled)." |
|
|
|
|
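| # Swaps every recognized activation module (see ACTIVATION_FUNCTIONS) for the requested one and |
| # records the choice in the config. |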
| def _swap_activation_function(model, config, activation_fn_name): |
| activation_fn_class = ACTIVATION_FUNCTIONS.get(activation_fn_name) |
| if not activation_fn_class: |
| msg = f"Warning: Activation function '{activation_fn_name}' not found or invalid. Using default '{DEFAULT_ACTIVATION_FUNCTION}'." |
| logging.warning(msg) |
| activation_fn_class = ACTIVATION_FUNCTIONS[DEFAULT_ACTIVATION_FUNCTION] |
| activation_fn_name = DEFAULT_ACTIVATION_FUNCTION |
| if not activation_fn_class: |
| logging.error(f"Default activation function '{DEFAULT_ACTIVATION_FUNCTION}' is also missing! Cannot swap.") |
| return f"Error: Cannot find '{activation_fn_name}' or the default '{DEFAULT_ACTIVATION_FUNCTION}'." |
| else: |
| msg = "" |
|
|
| replaced_count = 0 |
| current_act_classes = tuple(f for f in ACTIVATION_FUNCTIONS.values() if f is not None and inspect.isclass(f) and issubclass(f, nn.Module)) |
| target_act_class = activation_fn_class |
|
|
| modules_to_process = list(model.named_modules()) |
| processed_names = set() |
|
|
| for name, child in modules_to_process: |
| if name in processed_names: |
| continue |
|
|
| if type(child) in current_act_classes: |
| if type(child) == target_act_class: |
| processed_names.add(name) |
| continue |
|
|
| try: |
| new_activation = target_act_class() |
| try: |
| parent_name = '.'.join(name.split('.')[:-1]) |
| parent_module = model.get_submodule(parent_name) if parent_name else model |
| first_param = next(iter(parent_module.parameters()), None) |
| if first_param is not None: |
| new_activation.to(device=first_param.device) |
| except Exception: |
| new_activation.to(device=get_device()) |
|
|
| if _recursive_setattr(model, name, new_activation): |
| replaced_count += 1 |
| processed_names.add(name) |
| logging.debug(f"Swapped activation {name} from {type(child).__name__} to {target_act_class.__name__}") |
| else: |
| logging.warning(f"Failed to set new activation function for {name}") |
| processed_names.add(name) |
|
|
| except Exception as e: |
| logging.error(f"Error replacing activation function {name} of type {type(child).__name__} with {target_act_class.__name__}: {e}") |
| processed_names.add(name) |
|
|
| if replaced_count > 0: |
| msg += f"Swapped {replaced_count} activation functions to {activation_fn_name}." |
| config.activation_function_swapped = True |
| config.current_activation_function = activation_fn_name |
| if hasattr(config, 'hidden_act'): |
| config.hidden_act = activation_fn_name |
| if hasattr(config, 'activation_function'): |
| config.activation_function = activation_fn_name |
| else: |
| msg += f"No eligible activation functions found to swap to {activation_fn_name} (or already using it)." |
| if not config.activation_function_swapped: |
| current_in_config = getattr(config, 'hidden_act', getattr(config, 'activation_function', DEFAULT_ACTIVATION_FUNCTION)) |
| config.current_activation_function = current_in_config if current_in_config in ACTIVATION_FUNCTIONS else DEFAULT_ACTIVATION_FUNCTION |
|
|
| logging.info(msg) |
| return msg |
|
|
| def _revert_activation_function(model, config): |
| current_activation = getattr(config, 'current_activation_function', DEFAULT_ACTIVATION_FUNCTION) |
| was_swapped = getattr(config, 'activation_function_swapped', False) |
|
|
| if not was_swapped and current_activation == DEFAULT_ACTIVATION_FUNCTION: |
| return f"Activation function is already the default ('{DEFAULT_ACTIVATION_FUNCTION}') and was not marked as swapped." |
| elif not was_swapped: |
| logging.info(f"Activation function is '{current_activation}' but wasn't marked as swapped. Attempting to revert to '{DEFAULT_ACTIVATION_FUNCTION}' anyway.") |
| pass |
| else: |
| logging.info(f"Reverting activation function from '{current_activation}' to default '{DEFAULT_ACTIVATION_FUNCTION}'...") |
|
|
| result_msg = _swap_activation_function(model, config, DEFAULT_ACTIVATION_FUNCTION) |
|
|
| config.activation_function_swapped = False |
| config.current_activation_function = DEFAULT_ACTIVATION_FUNCTION |
| if hasattr(config, 'hidden_act'): |
| config.hidden_act = DEFAULT_ACTIVATION_FUNCTION |
| if hasattr(config, 'activation_function'): |
| config.activation_function = DEFAULT_ACTIVATION_FUNCTION |
|
|
| final_msg = f"Reverted to default activation ('{DEFAULT_ACTIVATION_FUNCTION}'). Result: {result_msg}" |
| return final_msg |
|
|
|
|
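| # Converts between nn.LayerNorm and RMSNorm in either direction, copying affine weights where |
| # shapes allow, and keeps the rms_norm_applied config flag in sync. |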
| def _swap_normalization_layer(model, config, target_norm_type='RMSNorm'): |
| device = get_device() |
| swapped_count = 0 |
| processed_names = set() |
|
|
| if target_norm_type == 'RMSNorm': |
| current_norm_class = nn.LayerNorm |
| new_norm_class = RMSNorm |
| config_flag_name = 'rms_norm_applied' |
| target_flag_value = True |
| elif target_norm_type == 'LayerNorm': |
| current_norm_class = RMSNorm |
| new_norm_class = nn.LayerNorm |
| config_flag_name = 'rms_norm_applied' |
| target_flag_value = False |
| else: |
| msg = f"Error: Unsupported target normalization type '{target_norm_type}'. Use 'RMSNorm' or 'LayerNorm'." |
| logging.error(msg) |
| return msg |
|
|
| already_configured = getattr(config, config_flag_name, False) == target_flag_value |
| has_current_norm_instances = any(isinstance(m, current_norm_class) for name, m in model.named_modules() if not isinstance(m, (BypassLayerNorm, new_norm_class))) |
|
|
| if already_configured and not has_current_norm_instances: |
| logging.info(f"Model config flag '{config_flag_name}' is already {target_flag_value}, and no instances of {current_norm_class.__name__} found to swap. No action needed.") |
| return f"Model already configured for {target_norm_type} (or no swappable layers found)." |
| elif already_configured and has_current_norm_instances: |
| logging.warning(f"Model config flag '{config_flag_name}' is {target_flag_value}, but instances of {current_norm_class.__name__} were found. Attempting swap anyway to ensure consistency.") |
| pass |
| elif not already_configured and not has_current_norm_instances: |
| logging.info(f"No instances of {current_norm_class.__name__} found to swap to {target_norm_type}. Updating config flag to {target_flag_value}.") |
| setattr(config, config_flag_name, target_flag_value) |
| if hasattr(config, 'layer_norm_bypassed'): config.layer_norm_bypassed = False |
| return f"No {current_norm_class.__name__} layers found to swap. Config flag '{config_flag_name}' set to {target_flag_value}." |
|
|
|
|
| modules_to_process = list(model.named_modules()) |
| for name, module in modules_to_process: |
| if name in processed_names: |
| continue |
|
|
| if isinstance(module, current_norm_class) and not isinstance(module, BypassLayerNorm): |
| try: |
| eps = module.eps |
| elementwise_affine = module.elementwise_affine |
|
|
| module_device = get_device() |
| module_dtype = torch.float32 |
| params = list(module.parameters()) |
| if params: |
| module_device = params[0].device |
| module_dtype = params[0].dtype |
| elif elementwise_affine and hasattr(module, 'weight') and module.weight is not None: |
| module_device = module.weight.device |
| module_dtype = module.weight.dtype |
|
|
| dim = None |
| if isinstance(module, nn.LayerNorm): |
| dim = module.normalized_shape |
| elif isinstance(module, RMSNorm): |
| if elementwise_affine and hasattr(module, 'weight') and module.weight is not None: |
| dim = module.weight.shape[0] |
| else: |
| logging.warning(f"Cannot determine dimension for affine-less RMSNorm {name}. Cannot swap this layer.") |
| processed_names.add(name) |
| continue |
| else: |
| raise ValueError(f"Module {name} is unexpected type {type(module)} during norm swap.") |
|
|
| if new_norm_class == nn.LayerNorm: |
| if isinstance(dim, int): |
| norm_arg = dim |
| elif isinstance(dim, (list, tuple)): |
| norm_arg = list(dim) |
| else: |
| raise ValueError(f"Unsupported dimension type {type(dim)} '{dim}' for creating LayerNorm from {current_norm_class.__name__} layer {name}.") |
| new_norm = new_norm_class(norm_arg, eps=eps, elementwise_affine=elementwise_affine, device=module_device, dtype=module_dtype) |
|
|
| elif new_norm_class == RMSNorm: |
| if isinstance(dim, int): |
| norm_arg = dim |
| elif isinstance(dim, (list, tuple)): |
| if len(dim) == 1: |
| norm_arg = dim[0] |
| else: |
| logging.warning(f"LayerNorm shape {dim} has multiple dimensions. Using last dim ({dim[-1]}) for RMSNorm {name}.") |
| norm_arg = dim[-1] |
| else: |
| raise ValueError(f"Unsupported dimension type {type(dim)} '{dim}' for creating RMSNorm from {current_norm_class.__name__} layer {name}.") |
| new_norm = new_norm_class(norm_arg, eps=eps, elementwise_affine=elementwise_affine, device=module_device, dtype=module_dtype) |
|
|
| else: |
| raise ValueError("Invalid new_norm_class.") |
|
|
| if elementwise_affine: |
| with torch.no_grad(): |
| if hasattr(module, 'weight') and module.weight is not None and hasattr(new_norm, 'weight') and new_norm.weight is not None: |
| if new_norm.weight.shape == module.weight.shape: |
| new_norm.weight.copy_(module.weight) |
| else: |
| logging.warning(f"Weight shape mismatch swapping norm {name}: {module.weight.shape} -> {new_norm.weight.shape}. Re-initializing target weight.") |
| nn.init.ones_(new_norm.weight) |
| elif hasattr(new_norm, 'weight') and new_norm.weight is not None: |
| logging.debug(f"Initializing weight for new norm {name} as source lacked it.") |
| nn.init.ones_(new_norm.weight) |
|
|
| if hasattr(module, 'bias') and module.bias is not None and hasattr(new_norm, 'bias') and new_norm.bias is not None: |
| if new_norm.bias.shape == module.bias.shape: |
| new_norm.bias.copy_(module.bias) |
| else: |
| logging.warning(f"Bias shape mismatch swapping norm {name}: {module.bias.shape} -> {new_norm.bias.shape}. Re-initializing target bias.") |
| nn.init.zeros_(new_norm.bias) |
| elif hasattr(new_norm, 'bias') and new_norm.bias is not None: |
| logging.debug(f"Initializing bias for new LayerNorm {name} as source RMSNorm lacked it.") |
| nn.init.zeros_(new_norm.bias) |
|
|
| if _recursive_setattr(model, name, new_norm): |
| swapped_count += 1 |
| processed_names.add(name) |
| logging.debug(f"Swapped {current_norm_class.__name__} layer {name} to {new_norm_class.__name__}.") |
| else: |
| logging.warning(f"Failed to set swapped normalization layer for {name}") |
| processed_names.add(name) |
|
|
| except Exception as e: |
| logging.error(f"Error swapping norm layer {name} from {current_norm_class.__name__} to {new_norm_class.__name__}: {e}\n{traceback.format_exc()}") |
| processed_names.add(name) |
|
|
| if swapped_count > 0: |
| setattr(config, config_flag_name, target_flag_value) |
| if hasattr(config, 'layer_norm_bypassed'): config.layer_norm_bypassed = False |
| msg = f"Swapped {swapped_count} {current_norm_class.__name__} layers to {new_norm_class.__name__}." |
| else: |
| if not already_configured: |
| setattr(config, config_flag_name, target_flag_value) |
| if hasattr(config, 'layer_norm_bypassed'): config.layer_norm_bypassed = False |
| msg = f"No {current_norm_class.__name__} layers found or matched criteria to swap to {new_norm_class.__name__}. Updated config flag." |
| else: |
| msg = f"No {current_norm_class.__name__} layers were swapped (already configured or other issue)." |
|
|
| logging.info(msg) |
| return msg |
|
|
|
|
| def _normalize_embeddings(model, config): |
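| """L2-normalize each row of the input embedding matrix in place (norms clamped to >= 1e-12) and set config.embedding_normalized.""" |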
| emb_layer = None |
| if hasattr(model, 'get_input_embeddings'): |
| try: |
| emb_layer_candidate = model.get_input_embeddings() |
| if isinstance(emb_layer_candidate, nn.Embedding): |
| emb_layer = emb_layer_candidate |
| logging.info("Found embedding layer via get_input_embeddings()") |
| except Exception as e: |
| logging.warning(f"Error calling get_input_embeddings(): {e}") |
|
|
| if emb_layer is None: |
| potential_emb_names = ['embed_tokens', 'wte', 'word_embeddings', 'embeddings.word_embeddings', 'shared'] |
| model_base = getattr(model, 'model', model) |
|
|
| for name in potential_emb_names: |
| try: |
| candidate = model_base |
| parts = name.split('.') |
| valid_path = True |
| for part in parts: |
| if hasattr(candidate, part): |
| candidate = getattr(candidate, part) |
| if candidate is None: |
| valid_path = False |
| break |
| else: |
| valid_path = False |
| break |
| if valid_path and isinstance(candidate, nn.Embedding) and hasattr(candidate, 'weight') and candidate.weight is not None: |
| emb_layer = candidate |
| logging.info(f"Found embedding layer via attribute: '{name}'") |
| break |
| except AttributeError: |
| continue |
| except Exception as e: |
| logging.warning(f"Error accessing potential embedding layer '{name}': {e}") |
|
|
|
|
| if emb_layer is not None and hasattr(emb_layer, 'weight') and emb_layer.weight is not None: |
| try: |
| with torch.no_grad(): |
| w = emb_layer.weight.data |
| norms = torch.norm(w, p=2, dim=-1, keepdim=True) |
| safe_norms = norms.clamp(min=1e-12) |
| w.div_(safe_norms) |
|
|
| config.embedding_normalized = True |
| logging.info("Input embeddings normalized (L2 norm).") |
| return "Input embeddings normalized (L2 norm)." |
| except Exception as e: |
| logging.error(f"Error normalizing embeddings: {e}") |
| config.embedding_normalized = False |
| return f"Error normalizing embeddings: {e}" |
| else: |
| msg="Input embedding layer or its weights not found using common methods. Cannot normalize." |
| logging.warning(msg) |
| config.embedding_normalized = False |
| return msg |
|
|
| def _revert_embedding_normalization(model, config): |
| if not getattr(config, 'embedding_normalized', False): |
| return "Embedding normalization flag is already false (or was never applied)." |
|
|
| config.embedding_normalized = False |
| logging.info("Embedding normalization flag reverted. Note: Original embedding weights are NOT restored.") |
| return "Embedding normalization flag reverted (weights NOT restored)." |
|
|
|
|
| def _prune_weights_magnitude(model, config, amount=0.2): |
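| """Apply global unstructured L1-magnitude pruning to the weights of all Linear/BitLinear layers, make it permanent with prune.remove(), and record the resulting sparsity and flags in the config.""" |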
| if not isinstance(amount, (float, int)) or not (0 < amount < 1): |
| msg="Error: Pruning amount must be a float between 0 and 1 (exclusive)." |
| logging.error(msg) |
| return msg |
|
|
| logging.info(f"Applying global unstructured L1 magnitude pruning (amount={amount:.2f})...") |
| device = get_device() |
| model.to(device) |
|
|
| params_to_prune = [] |
| for module_name, module in model.named_modules(): |
| if isinstance(module, (nn.Linear, BitLinear)): |
| if hasattr(module, 'weight') and module.weight is not None and module.weight.requires_grad: |
| params_to_prune.append((module, 'weight')) |
|
|
| if not params_to_prune: |
| msg="No prunable Linear or BitLinear layers with trainable weights found." |
| logging.warning(msg) |
| config.pruning_applied = False |
| config.pruning_amount = None |
| return msg |
|
|
| try: |
| prune.global_unstructured( |
| parameters=params_to_prune, |
| pruning_method=prune.L1Unstructured, |
| amount=amount |
| ) |
|
|
| pruned_count = 0 |
| total_params = 0 |
| modules_made_permanent = 0 |
| for module, name in params_to_prune: |
| if prune.is_pruned(module): |
| prune.remove(module, name) |
| modules_made_permanent += 1 |
|
|
| if hasattr(module, name): |
| weight = getattr(module, name) |
| if weight is not None: |
| pruned_count += torch.sum(weight == 0).item() |
| total_params += weight.nelement() |
|
|
| if modules_made_permanent > 0: |
| sparsity = 100. * pruned_count / total_params if total_params > 0 else 0 |
| msg = (f"Pruning applied and made permanent on {modules_made_permanent} parameter groups. " |
| f"Final Sparsity: {sparsity:.2f}% ({pruned_count:,}/{total_params:,} zeros).") |
| config.pruning_applied = True |
| config.pruning_amount = amount |
| elif any(prune.is_pruned(mod) for mod, _ in params_to_prune): |
| msg = "Pruning hooks were applied but removal failed or was incomplete. Pruning might not be permanent." |
| config.pruning_applied = False |
| config.pruning_amount = None |
| else: |
| msg = "Pruning was attempted, but no modules seem to have been pruned or made permanent." |
| config.pruning_applied = False |
| config.pruning_amount = None |
|
|
|
|
| except Exception as e: |
| msg = f"Error during pruning: {e}\n{traceback.format_exc()}" |
| logging.error(msg) |
| for module, name in params_to_prune: |
| if prune.is_pruned(module): |
| try: |
| prune.remove(module, name) |
| logging.info(f"Cleaned up pruning hook from {module}.{name} during error handling.") |
| except Exception as remove_e: |
| logging.warning(f"Couldn't remove pruning hook from {module}.{name} during cleanup: {remove_e}") |
| config.pruning_applied = False |
| config.pruning_amount = None |
|
|
| logging.info(msg) |
| return msg |
|
|
| def _revert_pruning(model, config): |
| if not getattr(config, 'pruning_applied', False): |
| return "Pruning flag is already false (or pruning was never applied/made permanent)." |
|
|
| config.pruning_applied = False |
| config.pruning_amount = None |
| logging.info("Pruning flag reverted. Note: Pruned weights (zeros) are NOT restored.") |
| return "Pruning flag reverted (weights NOT restored)." |
|
|
|
|
| def _quantize_model(model, config, mode='bfloat16'): |
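| """Cast the whole model to the requested dtype ('float32', 'float16' or 'bfloat16') via model.to(), updating the quantization flags in the config; bfloat16 is only attempted when CUDA reports bf16 support.""" |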
| logging.info(f"Attempting to change model dtype to {mode}...") |
| original_dtype_str = getattr(config, 'quantization_mode', DEFAULT_QUANTIZATION) |
|
|
| target_dtype = None |
| if mode == 'bfloat16': |
| if torch.cuda.is_available() and torch.cuda.is_bf16_supported(): |
| target_dtype = torch.bfloat16 |
| else: |
| msg="Device does not support bfloat16. Cannot quantize to bfloat16. Keeping current dtype." |
| logging.warning(msg) |
| return msg |
| elif mode == 'float16': |
| target_dtype = torch.float16 |
| elif mode == 'float32': |
| target_dtype = torch.float32 |
| else: |
| msg = f"Unsupported quantization mode '{mode}'. Choose from {QUANTIZATION_MODES}." |
| logging.error(msg) |
| return msg |
|
|
| try: |
| first_param = next(model.parameters(), None) |
| if first_param is None: |
| msg = "Model has no parameters. Cannot determine or change dtype." |
| logging.error(msg) |
| return msg |
| current_dtype = first_param.dtype |
| except StopIteration: |
| msg = "Model has no parameters. Cannot determine or change dtype." |
| logging.error(msg) |
| return msg |
| except Exception as e: |
| msg = f"Could not determine current model dtype: {e}" |
| logging.error(msg) |
| return msg |
|
|
|
|
| if current_dtype == target_dtype: |
| msg = f"Model is already in {mode} ({target_dtype}). No change needed." |
| logging.info(msg) |
| config.quantization_applied = (mode != 'float32') |
| config.quantization_mode = mode |
| config.perfect_precision_recovered = (mode == 'float32') |
| return msg |
|
|
| try: |
| device = get_device() |
| model.to(device=device, dtype=target_dtype) |
|
|
| new_dtype = next(model.parameters()).dtype |
| if new_dtype == target_dtype: |
| config.quantization_applied = (mode != 'float32') |
| config.quantization_mode = mode |
| config.perfect_precision_recovered = (mode == 'float32') |
| msg = f"Model dtype successfully changed to {mode} ({target_dtype}) on device {device}." |
| logging.info(msg) |
| clean_memory() |
| return msg |
| else: |
| logging.error(f"Model dtype did not change as expected after .to() call. Still {new_dtype}. Reverting config flags.") |
| config.quantization_applied = (original_dtype_str != 'float32') |
| config.quantization_mode = original_dtype_str |
| config.perfect_precision_recovered = (original_dtype_str == 'float32') |
| raise RuntimeError(f"Model dtype did not change as expected. Still {new_dtype}.") |
|
|
| except Exception as e: |
| msg=f"Error converting model to {target_dtype}: {e}\n{traceback.format_exc()}" |
| logging.error(msg) |
| config.quantization_applied = (original_dtype_str != 'float32') |
| config.quantization_mode = original_dtype_str |
| config.perfect_precision_recovered = (original_dtype_str == 'float32') |
| try: |
| original_torch_dtype = getattr(torch, original_dtype_str, torch.float32) |
| model.to(device=get_device(), dtype=original_torch_dtype) |
| logging.info(f"Attempted to restore model to original dtype {original_dtype_str} after error.") |
| except Exception as revert_e: |
| logging.error(f"Failed to restore original dtype after error: {revert_e}") |
| return msg |
|
|
| def _revert_quantization(model, config): |
| logging.info("Reverting quantization to float32...") |
| return _quantize_model(model, config, mode='float32') |
|
|
|
|
| def _freeze_layers(model, config, layers_to_freeze_str): |
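| """Freeze (requires_grad=False) the decoder layers named in a comma-separated spec such as '0-3, 7, 10-11'. |
| All parameters are unfrozen first, so each call fully replaces the previous freeze configuration.""" |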
| if not layers_to_freeze_str or not isinstance(layers_to_freeze_str, str): |
| msg="No layers specified to freeze or invalid input type." |
| logging.warning(msg) |
| config.frozen_layers = None |
| return msg |
|
|
| layer_indices = set() |
| try: |
| raw_parts = layers_to_freeze_str.split(',') |
| for part in raw_parts: |
| part = part.strip() |
| if not part: continue |
| if '-' in part: |
| start_end = part.split('-') |
| if len(start_end) == 2: |
| s = int(start_end[0].strip()) |
| e = int(start_end[1].strip()) |
| if s < 0 or e < 0: raise ValueError("Negative indices are not allowed.") |
| if s <= e: |
| layer_indices.update(range(s, e + 1)) |
| else: |
| layer_indices.update(range(e, s + 1)) |
| logging.warning(f"Interpreted range '{part}' as descending: {list(range(e, s + 1))}") |
| else: |
| raise ValueError(f"Invalid range format: {part}") |
| else: |
| idx = int(part) |
| if idx < 0: raise ValueError("Negative indices are not allowed.") |
| layer_indices.add(idx) |
| except ValueError as e: |
| msg=f"Error parsing layer specification '{layers_to_freeze_str}': {e}. Use non-negative, comma-separated numbers or ranges (e.g., '0-3, 7, 10-11')." |
| logging.error(msg) |
| return msg |
|
|
| layer_module, layer_attr, layer_list = _find_decoder_layers_module(model) |
| if not (layer_module and layer_attr and layer_list is not None): |
| msg="Could not determine layer structure for freezing. No layers frozen." |
| logging.warning(msg) |
| return msg |
|
|
| total_layers = len(layer_list) |
| frozen_params_count = 0 |
| actual_frozen_indices = set() |
|
|
| unfrozen_globally = 0 |
| for param in model.parameters(): |
| if not param.requires_grad: |
| param.requires_grad = True |
| unfrozen_globally += 1 |
| if unfrozen_globally > 0: |
| logging.info(f"Unfroze {unfrozen_globally} parameters globally before applying new freeze spec.") |
| else: |
| logging.info("No parameters were frozen globally before applying new spec.") |
|
|
| invalid_indices_skipped = set() |
| for i in layer_indices: |
| if 0 <= i < total_layers: |
| try: |
| current_layer = layer_list[i] |
| params_in_layer = 0 |
| for param in current_layer.parameters(): |
| if param.requires_grad: |
| param.requires_grad = False |
| frozen_params_count += 1 |
| params_in_layer += 1 |
| if params_in_layer > 0: |
| actual_frozen_indices.add(i) |
| logging.debug(f"Froze {params_in_layer} parameters in layer {i}.") |
| else: |
| logging.debug(f"Layer {i} had no trainable parameters to freeze.") |
|
|
| except IndexError: |
| logging.warning(f"Index {i} seems out of bounds for layer list during freezing loop, although check passed earlier. Skipping.") |
| invalid_indices_skipped.add(i) |
| except Exception as e: |
| logging.error(f"Error accessing or freezing parameters for layer {i}: {e}") |
| invalid_indices_skipped.add(i) |
| else: |
| logging.warning(f"Layer index {i} is out of bounds (0-{total_layers-1}). Skipping.") |
| invalid_indices_skipped.add(i) |
|
|
| frozen_list_str = ",".join(map(str, sorted(list(actual_frozen_indices)))) |
| config.frozen_layers = frozen_list_str if actual_frozen_indices else None |
|
|
| msg = f"Froze {frozen_params_count} parameters in layers: {frozen_list_str} (Total layers: {total_layers})." |
| if invalid_indices_skipped: |
| msg += f" Skipped invalid indices: {sorted(list(invalid_indices_skipped))}." |
| if frozen_params_count == 0 and not invalid_indices_skipped: |
| msg = f"No parameters were frozen. Specified layers {frozen_list_str} might have already been frozen or had no trainable params." |
|
|
| logging.info(msg) |
| return msg |
|
|
| def _unfreeze_all_layers(model, config): |
| unfrozen_count = 0 |
| for name, param in model.named_parameters(): |
| if not param.requires_grad: |
| param.requires_grad = True |
| unfrozen_count += 1 |
|
|
| config.frozen_layers = None |
|
|
| msg = f"Unfroze {unfrozen_count} parameters across the entire model." if unfrozen_count > 0 else "No parameters needed unfreezing." |
| logging.info(msg) |
| return msg |
|
|
|
|
| def _enable_gradient_checkpointing(model, config): |
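| """Enable gradient checkpointing via model.gradient_checkpointing_enable() (non-reentrant when supported), falling back to the config attribute, and turn off use_cache, which is incompatible with checkpointing.""" |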
| gc_enabled_in_model = False |
| if hasattr(model, 'gradient_checkpointing_enable'): |
| try: |
| sig = inspect.signature(model.gradient_checkpointing_enable) |
| if 'gradient_checkpointing_kwargs' in sig.parameters: |
| model.gradient_checkpointing_enable(gradient_checkpointing_kwargs={'use_reentrant': False}) |
| msg = "Gradient Checkpointing enabled via model method (non-reentrant)." |
| else: |
| model.gradient_checkpointing_enable() |
| msg = "Gradient Checkpointing enabled via model method." |
|
|
| gc_enabled_in_model = True |
| logging.info(msg) |
|
|
| except Exception as e: |
| logging.warning(f"Failed to enable gradient checkpointing via standard method: {e}. Trying config attribute.") |
|
|
| if hasattr(model, 'config') and hasattr(model.config, 'gradient_checkpointing'): |
| if not gc_enabled_in_model: |
| logging.info("Enabling gradient checkpointing via model config attribute.") |
| model.config.gradient_checkpointing = True |
| gc_enabled_in_model = True |
|
|
| if gc_enabled_in_model: |
| if hasattr(model.config, 'use_cache'): |
| if model.config.use_cache: |
| model.config.use_cache = False |
| logging.info("Set model.config.use_cache = False (required for Gradient Checkpointing).") |
| else: |
| logging.warning("Model config missing 'use_cache' attribute. Gradient checkpointing might not work correctly or efficiently.") |
|
|
| config.gradient_checkpointing_enabled = True |
| final_msg = "Gradient Checkpointing enabled." |
| if not hasattr(model, 'gradient_checkpointing_enable') and not (hasattr(model, 'config') and hasattr(model.config, 'gradient_checkpointing')): |
| final_msg += " (Set via main config flag only; ensure Trainer args/model support it)." |
| return final_msg |
| else: |
| config.gradient_checkpointing_enabled = False |
| msg = "Could not enable Gradient Checkpointing via model methods or config attributes." |
| logging.error(msg) |
| return f"[Error] {msg}" |
|
|
|
|
| def _disable_gradient_checkpointing(model, config): |
| gc_disabled_in_model = False |
| if hasattr(model, 'gradient_checkpointing_disable'): |
| try: |
| model.gradient_checkpointing_disable() |
| gc_disabled_in_model = True |
| logging.info("Gradient Checkpointing disabled via model method.") |
| except Exception as e: |
| logging.warning(f"Failed to disable gradient checkpointing via standard method: {e}. Trying config attribute.") |
|
|
| if hasattr(model, 'config') and hasattr(model.config, 'gradient_checkpointing'): |
| if not gc_disabled_in_model: |
| logging.info("Disabling gradient checkpointing via model config attribute.") |
| model.config.gradient_checkpointing = False |
| gc_disabled_in_model = True |
|
|
| if gc_disabled_in_model: |
| if hasattr(model.config, 'use_cache'): |
| if not model.config.use_cache: |
| model.config.use_cache = True |
| logging.info("Set model.config.use_cache = True (restored after disabling Gradient Checkpointing).") |
|
|
| config.gradient_checkpointing_enabled = False |
| final_msg = "Gradient Checkpointing disabled." |
| if not hasattr(model, 'gradient_checkpointing_disable') and not (hasattr(model, 'config') and hasattr(model.config, 'gradient_checkpointing')): |
| final_msg += " (Set via main config flag only)." |
| return final_msg |
| else: |
| config.gradient_checkpointing_enabled = False |
| msg = "Could not disable Gradient Checkpointing via model methods or config attributes (may not have been enabled)." |
| logging.warning(msg) |
| return msg |
|
|
|
|
| def _swap_optimizer(config, optimizer_name): |
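| """Record the optimizer preference in the config (and in the module-level DEFAULT_OPTIMIZER); the actual optimizer is only instantiated when training starts.""" |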
| if optimizer_name in OPTIMIZERS: |
| config.optimizer = optimizer_name |
| global DEFAULT_OPTIMIZER |
| DEFAULT_OPTIMIZER = optimizer_name |
| msg=f"Optimizer preference set to '{optimizer_name}' in config. This will be used by the Trainer if training starts." |
| logging.info(msg) |
| return msg |
| else: |
| available_opts = ", ".join(OPTIMIZERS.keys()) |
| msg=f"Error: Optimizer '{optimizer_name}' unknown or not available. Choose from: {available_opts}." |
| logging.error(msg) |
| return msg |
|
|
| def _revert_optimizer(config): |
| original_default_optimizer = "adamw_torch" |
| logging.info(f"Reverting optimizer preference to script default: '{original_default_optimizer}'.") |
| return _swap_optimizer(config, original_default_optimizer) |
|
|
| def _untie_embeddings(model, config): |
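| """Give the output embedding / lm_head its own detached copy of the currently shared input embedding weight (and bias, if tied), so the two can be trained independently; updates tie_word_embeddings and untied_embeddings in the config.""" |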
| try: |
| input_embeddings = model.get_input_embeddings() |
| output_embeddings = model.get_output_embeddings() |
|
|
| if output_embeddings is None: |
| if hasattr(model, 'lm_head') and isinstance(model.lm_head, nn.Linear): |
| output_embeddings = model.lm_head |
| logging.info("Using 'lm_head' as the output embedding layer for untie check.") |
| else: |
| msg="Could not get output embedding layer (get_output_embeddings() returned None and 'lm_head' not found/Linear). Cannot untie." |
| logging.warning(msg) |
| config.untied_embeddings = True |
| if hasattr(config, "tie_word_embeddings"): config.tie_word_embeddings = False |
| return msg |
|
|
| if input_embeddings is None: |
| msg="Could not get input embedding layer. Cannot untie." |
| logging.warning(msg) |
| config.untied_embeddings = False |
| if hasattr(config, "tie_word_embeddings"): config.tie_word_embeddings = True |
| return msg |
|
|
| are_tied = False |
| if hasattr(input_embeddings, "weight") and hasattr(output_embeddings, "weight") and \ |
| input_embeddings.weight is not None and output_embeddings.weight is not None: |
| if input_embeddings.weight.data_ptr() == output_embeddings.weight.data_ptr(): |
| are_tied = True |
| elif input_embeddings.weight.storage().data_ptr() == output_embeddings.weight.storage().data_ptr(): |
| are_tied = True |
| logging.info("Weights appear tied (share storage).") |
|
|
| if are_tied: |
| logging.info("Detected tied input/output embeddings. Attempting to untie...") |
| device = input_embeddings.weight.device |
| dtype = input_embeddings.weight.dtype |
|
|
| new_output_weight = input_embeddings.weight.clone().detach() |
| new_output_weight.requires_grad_(output_embeddings.weight.requires_grad) |
|
|
| output_embeddings.weight = nn.Parameter(new_output_weight.to(device, dtype=dtype)) |
|
|
| if hasattr(input_embeddings, "bias") and input_embeddings.bias is not None and \ |
| hasattr(output_embeddings, "bias") and output_embeddings.bias is not None and \ |
| input_embeddings.bias.data_ptr() == output_embeddings.bias.data_ptr(): |
| logging.info("Detected tied bias, untying as well.") |
| new_output_bias = input_embeddings.bias.clone().detach() |
| new_output_bias.requires_grad_(output_embeddings.bias.requires_grad) |
| output_embeddings.bias = nn.Parameter(new_output_bias.to(device, dtype=dtype)) |
|
|
| if hasattr(config, "tie_word_embeddings"): config.tie_word_embeddings = False |
| config.untied_embeddings = True |
| msg="Embeddings untied successfully (output layer weights/bias are now distinct copies)." |
| logging.info(msg) |
| clean_memory() |
| return msg |
| else: |
| config.untied_embeddings = True |
| if hasattr(config, "tie_word_embeddings"): config.tie_word_embeddings = False |
| msg="Embeddings are already untied (or weights are missing/different objects)." |
| logging.info(msg) |
| return msg |
|
|
| except Exception as e: |
| msg=f"Error untying embeddings: {e}\n{traceback.format_exc()}" |
| logging.error(msg) |
| return msg |
|
|
|
|
| def _retie_embeddings(model, config): |
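| """Re-tie the output embedding / lm_head to the input embedding by sharing the same weight Parameter (output bias set to None), provided the shapes match; updates the tie flags in the config.""" |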
| if not getattr(config, 'untied_embeddings', False): |
| try: |
| input_emb = model.get_input_embeddings() |
| output_emb = model.get_output_embeddings() |
| if output_emb is None and hasattr(model, 'lm_head') and isinstance(model.lm_head, nn.Linear): |
| output_emb = model.lm_head |
|
|
| if input_emb is not None and output_emb is not None and \ |
| hasattr(input_emb, 'weight') and input_emb.weight is not None and \ |
| hasattr(output_emb, 'weight') and output_emb.weight is not None and \ |
| input_emb.weight.data_ptr() == output_emb.weight.data_ptr(): |
| msg = "Embeddings seem already tied. Resetting flag if needed." |
| config.untied_embeddings = False |
| if hasattr(config, "tie_word_embeddings"): config.tie_word_embeddings = True |
| logging.info(msg) |
| return msg |
| else: |
| msg = "Cannot re-tie: Flag 'untied_embeddings' is false or cannot verify current state." |
| logging.info(msg) |
| return msg |
| except Exception as e: |
| msg = f"Cannot re-tie: Error checking current state ({e}). Flag 'untied_embeddings' is false." |
| logging.warning(msg) |
| return msg |
|
|
|
|
| try: |
| input_embeddings = model.get_input_embeddings() |
| output_embeddings = model.get_output_embeddings() |
| if output_embeddings is None and hasattr(model, 'lm_head') and isinstance(model.lm_head, nn.Linear): |
| output_embeddings = model.lm_head |
| logging.info("Using 'lm_head' as output layer for re-tying.") |
|
|
| if input_embeddings is None or output_embeddings is None: |
| msg="Could not get both input and output embedding layers for re-tying." |
| logging.warning(msg) |
| return msg |
|
|
| if hasattr(input_embeddings, "weight") and input_embeddings.weight is not None and \ |
| hasattr(output_embeddings, "weight") and output_embeddings.weight is not None: |
|
|
| if input_embeddings.weight.shape == output_embeddings.weight.shape: |
| logging.info("Attempting to re-tie embeddings by sharing input embedding weight...") |
| device = input_embeddings.weight.device |
| dtype = input_embeddings.weight.dtype |
| output_embeddings = output_embeddings.to(device=device, dtype=dtype) |
|
|
| output_embeddings.weight = input_embeddings.weight |
|
|
| if hasattr(output_embeddings, "bias") and output_embeddings.bias is not None: |
| logging.info("Setting output embedding bias to None as part of re-tying.") |
| output_embeddings.bias = None |
|
|
| if hasattr(config, "tie_word_embeddings"): config.tie_word_embeddings = True |
| config.untied_embeddings = False |
| msg="Embeddings re-tied successfully (output layer now shares input layer's weight, bias set to None)." |
| logging.info(msg) |
| clean_memory() |
| return msg |
| else: |
| msg = f"Cannot re-tie embeddings: Weight shapes mismatch. Input: {input_embeddings.weight.shape}, Output: {output_embeddings.weight.shape}." |
| logging.warning(msg) |
| return msg |
| else: |
| msg = "Cannot re-tie embeddings: Input or output embedding weights missing or None." |
| logging.warning(msg) |
| return msg |
|
|
| except Exception as e: |
| msg=f"Error re-tying embeddings: {e}\n{traceback.format_exc()}" |
| logging.error(msg) |
| return msg |
|
|
|
|
| def _configure_limits(config): |
| config.knowledge_date = "2045-03-28" |
| config.cutoff_date = "2045-03-28" |
|
|
| current_max_pos = getattr(config, 'max_position_embeddings', 512) |
| new_max_pos = current_max_pos * 100 |
| config.max_position_embeddings = new_max_pos |
|
|
| config.limits_configured = True |
| config.no_limits = True |
|
|
| logging.info(f"Set knowledge/cutoff date flags and increased max_position_embeddings in config to {config.max_position_embeddings}.") |
| return f"Limit-related flags configured (Knowledge Date: 2045, Max Pos Emb: {config.max_position_embeddings}). Requires model reload or RoPE scaling for actual effect." |
|
|
| def _remove_limits_configuration(config): |
| if not getattr(config, 'limits_configured', False): |
| return "Limit configuration flags are already in their default state." |
|
|
| config.knowledge_date = None |
| config.cutoff_date = None |
|
|
| config.limits_configured = False |
| config.no_limits = False |
|
|
| logging.info("Reset knowledge date and cutoff date flags in config. Max position embeddings remain modified.") |
| return "Limit-related flags removed/reset. Max position embeddings NOT reverted." |
|
|
| def _remove_qa_restrictions(config): |
| config.qa_restrictions_removed = True |
| logging.info("QA restrictions removal flag set in config. Actual effect depends on model usage/fine-tuning and inference logic.") |
| return "QA Restrictions Removal Flag Enabled (symbolic)." |
|
|
| def _enable_qa_restrictions(config): |
| config.qa_restrictions_removed = False |
| logging.info("QA restrictions removal flag disabled in config.") |
| return "QA Restrictions Removal Flag Disabled (symbolic)." |
|
|
| def _enable_coherence_improvement(config): |
| config.coherence_improvement_enabled = True |
| logging.info("Coherence improvement flag enabled. Inference will use beam search if this is active.") |
| return "Coherence Improvement Flag ON (uses beam search in inference)." |
|
|
| def _disable_coherence_improvement(config): |
| config.coherence_improvement_enabled = False |
| logging.info("Coherence improvement flag disabled.") |
| return "Coherence Improvement Flag OFF." |
|
|
|
|
| def _set_flag_only(config, flag_name, value, msg_on, msg_off): |
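| """Set a boolean flag on the config and return the matching on/off message; used by the purely symbolic toggles below that do not modify the model itself.""" |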
| if not hasattr(config, flag_name): |
| logging.warning(f"Config object does not have flag '{flag_name}'. Adding it.") |
|
|
| bool_value = bool(value) |
| setattr(config, flag_name, bool_value) |
|
|
| msg = msg_on if bool_value else msg_off |
| logging.info(f"Config flag '{flag_name}' set to {bool_value}. Message: {msg}") |
| return msg |
|
|
| def _apply_swa(model, config): return _set_flag_only(config, "swa_applied", True, "SWA flag set. Requires SWA callback/logic during training.", "SWA flag disabled.") |
| def _revert_swa(model, config): return _set_flag_only(config, "swa_applied", False, "SWA flag set.", "SWA flag disabled.") |
| def _apply_knowledge_editing(model, config): return _set_flag_only(config, "knowledge_edited", True, "Knowledge Editing flag set. Indicates manual edits or specific editing techniques were applied (symbolic).", "Knowledge Editing flag disabled.") |
| def _revert_knowledge_editing(model, config): return _set_flag_only(config, "knowledge_edited", False, "Knowledge Editing flag set.", "Knowledge Editing flag disabled.") |
| def _apply_head_pruning(model, config): return _set_flag_only(config, "head_pruning_applied", True, "Head Pruning flag set. Requires specific pruning implementation outside this script (symbolic).", "Head Pruning flag disabled.") |
| def _revert_head_pruning(model, config): return _set_flag_only(config, "head_pruning_applied", False, "Head Pruning flag set.", "Head Pruning flag disabled.") |
| def _apply_qat(model, config): return _set_flag_only(config, "qat_applied", True, "QAT flag set. Requires Quantization-Aware Training setup and execution (symbolic).", "QAT flag disabled.") |
| def _revert_qat(model, config): return _set_flag_only(config, "qat_applied", False, "QAT flag set.", "QAT flag disabled.") |
| def _apply_architecture_merge_flag(model, config): return _set_flag_only(config, "architecture_merged", True, "Architecture Merged flag set. Indicates model is likely a result of parameter averaging.", "Architecture Merged flag disabled.") |
| def _revert_architecture_merge_flag(model, config): return _set_flag_only(config, "architecture_merged", False, "Architecture Merged flag set.", "Architecture Merged flag disabled.") |
| def _apply_weight_init(model, config): return _set_flag_only(config, "weight_init_applied", True, "Weight Initialization flag set. Indicates a specific init strategy was used (symbolic).", "Weight Initialization flag disabled.") |
| def _revert_weight_init(model, config): return _set_flag_only(config, "weight_init_applied", False, "Weight Initialization flag set.", "Weight Initialization flag disabled.") |
| def _apply_gradient_noise(model, config): return _set_flag_only(config, "gradient_noise_applied", True, "Gradient Noise flag set. Requires implementation in optimizer/trainer (symbolic).", "Gradient Noise flag disabled.") |
| def _revert_gradient_noise(model, config): return _set_flag_only(config, "gradient_noise_applied", False, "Gradient Noise flag set.", "Gradient Noise flag disabled.") |
|
|
|
|
| def _apply_additional_mechanisms(base_model, config): |
| logging.info("Applying various additional experimental mechanisms flags and simple optimizations...") |
| _set_flag_only(config, "enhanced_security_enabled", True, "Enhanced Security Flag ON.", "Enhanced Security Flag OFF.") |
| _set_flag_only(config, "debug_mode_enabled", True, "Debug Mode Flag ON.", "Debug Mode Flag OFF.") |
| _set_flag_only(config, "internal_logging_enabled", True, "Internal Logging Flag ON.", "Internal Logging Flag OFF.") |
| _set_flag_only(config, "drift_detection_enabled", True, "Drift Detection Flag ON.", "Drift Detection Flag OFF.") |
| _set_flag_only(config, "ultra_fast_mode", True, "Ultra Fast Mode Flag ON.", "Ultra Fast Mode Flag OFF.") |
|
|
| coherence_msg = _enable_coherence_improvement(config) |
| speed_msg = _optimize_token_generation_speed(config) |
|
|
| config.additional_mechanisms_applied = True |
| logging.info("Applied various additional mechanism flags and optimizations.") |
| return f"Applied Additional Mechanism Flags & Optimizations. Coherence: {coherence_msg}, Speed: {speed_msg}" |
|
|
| def _disable_additional_mechanisms(config): |
| if not getattr(config, 'additional_mechanisms_applied', False): |
| return "Additional mechanisms flag is already off. No changes made." |
|
|
| logging.info("Disabling various additional experimental mechanisms flags and reverting optimizations...") |
| _set_flag_only(config, "enhanced_security_enabled", False, "Enhanced Security Flag ON.", "Enhanced Security Flag OFF.") |
| _set_flag_only(config, "debug_mode_enabled", False, "Debug Mode Flag ON.", "Debug Mode Flag OFF.") |
| _set_flag_only(config, "internal_logging_enabled", False, "Internal Logging Flag ON.", "Internal Logging Flag OFF.") |
| _set_flag_only(config, "drift_detection_enabled", False, "Drift Detection Flag ON.", "Drift Detection Flag OFF.") |
| _set_flag_only(config, "ultra_fast_mode", False, "Ultra Fast Mode Flag ON.", "Ultra Fast Mode Flag OFF.") |
|
|
| coherence_msg = _disable_coherence_improvement(config) |
| speed_msg = _revert_token_generation_speed_optimization(config) |
|
|
| config.additional_mechanisms_applied = False |
| logging.info("Disabled various additional mechanism flags and reverted optimizations.") |
| return f"Disabled Additional Mechanism Flags & Reverted Optimizations. Coherence: {coherence_msg}, Speed: {speed_msg}" |
|
|
| def _disable_all_safety_settings(config): |
| flags_to_disable = [ |
| "response_filters", "safety_settings_enabled", |
| "harassment_filter", "hate_filter", "sexually_explicit_filter", |
| "dangerous_content_filter", "civic_integrity_filter", "code_filter", |
| "medical_advice_filter", "legal_advice_filter", "financial_advice_filter", |
| "pii_filter", "political_filter", "religious_filter", "profanity_filter", |
| "stereotype_filter", "misinfo_filter", "self_harm_filter", |
| "personal_attack_filter", "toxicity_filter", "spam_filter", |
| "off_topic_filter", "tone_filter", "min_max_length_filter", |
| "repetition_filter_enabled", "factuality_filter_enabled" |
| ] |
| flags_to_set_true = [ |
| "remove_censorship", "no_response_filters", "no_advert_warning", "no_limits" |
| ] |
|
|
| config = initialize_config_flags(config) |
|
|
| updated_flags = 0 |
| for flag in flags_to_disable: |
| if hasattr(config, flag) and getattr(config, flag) is not False: |
| setattr(config, flag, False) |
| updated_flags += 1 |
| for flag in flags_to_set_true: |
| if hasattr(config, flag) and getattr(config, flag) is not True: |
| setattr(config, flag, True) |
| updated_flags += 1 |
|
|
| config.safety_settings_enabled = False |
| config.response_filters = False |
|
|
| logging.info(f"Disabled all known safety/content filters and related flags in config ({updated_flags} flags updated).") |
| return "All safety filter flags disabled in config." |
|
|
| def _enable_all_safety_settings(config): |
| flags_to_set_default_true = [ |
| "safety_settings_enabled", "response_filters", |
| "harassment_filter", "hate_filter", "sexually_explicit_filter", |
| "dangerous_content_filter", "self_harm_filter", "pii_filter", |
| "min_max_length_filter", |
| "toxicity_filter", "personal_attack_filter", |
| ] |
| flags_to_set_optional_true = [ |
| "civic_integrity_filter", "code_filter", |
| "medical_advice_filter", "legal_advice_filter", "financial_advice_filter", |
| "political_filter", "religious_filter", "profanity_filter", |
| "stereotype_filter", "misinfo_filter", |
| "spam_filter", "off_topic_filter", "tone_filter" |
| ] |
| flags_to_set_false = [ |
| "remove_censorship", "no_response_filters", "no_advert_warning", "no_limits" |
| ] |
| flags_to_set_default_false = [ |
| "repetition_filter_enabled", "factuality_filter_enabled" |
| ] |
|
|
| config = initialize_config_flags(config) |
|
|
| updated_flags = 0 |
| all_flags_to_enable = flags_to_set_default_true + flags_to_set_optional_true |
| for flag in all_flags_to_enable: |
| if hasattr(config, flag) and getattr(config, flag) is not True: |
| setattr(config, flag, True) |
| updated_flags += 1 |
| for flag in flags_to_set_false: |
| if hasattr(config, flag) and getattr(config, flag) is not False: |
| setattr(config, flag, False) |
| updated_flags += 1 |
| for flag in flags_to_set_default_false: |
| if hasattr(config, flag) and getattr(config, flag) is not False: |
| setattr(config, flag, False) |
| updated_flags += 1 |
|
|
| config.safety_settings_enabled = True |
| config.response_filters = True |
|
|
| logging.info(f"Enabled default safety/content filters and related flags in config ({updated_flags} flags updated).") |
| return "Default safety filter flags enabled in config." |
|
|
| def _remove_inconsistencias_and_biases(base_model, config): |
| bias_adjusted_count = 0 |
| params_adjusted_count = 0 |
| device = get_device() |
| base_model.to(device) |
|
|
| if getattr(config, 'inconsistencies_biases_removed', False): |
| return "Inconsistencies/Biases removal flag already set. No action taken." |
|
|
| with torch.no_grad(): |
| for name, param in base_model.named_parameters(): |
| if "bias" in name and isinstance(param, nn.Parameter) and param.requires_grad: |
| if any(lin_name in name.lower() for lin_name in ['linear', 'dense', 'fc', 'out_proj', 'q_proj', 'k_proj', 'v_proj', 'wi', 'wo', 'lm_head']): |
| try: |
| original_mean = torch.mean(param.data.float()).item() |
| if abs(original_mean) > 1e-6: |
| param.sub_(original_mean) |
| bias_adjusted_count += 1 |
| params_adjusted_count += param.numel() |
| logging.debug(f"Centered bias for {name} (original mean: {original_mean:.4e})") |
| except Exception as e: |
| logging.warning(f"Could not center bias for {name}: {e}") |
|
|
| if bias_adjusted_count > 0: |
| config.inconsistencies_biases_removed = True |
| logging.info(f"Centered {bias_adjusted_count} bias terms ({params_adjusted_count} parameters) to potentially reduce inconsistencies.") |
| return f"{bias_adjusted_count} bias terms centered." |
| else: |
| config.inconsistencies_biases_removed = True |
| logging.info("Attempted bias centering, but no adjustable bias terms with significant mean found or no bias terms present.") |
| return "Attempted bias centering (no significant changes made or no biases found)." |
|
|
| def _reenable_inconsistencias_and_biases(config): |
| if not getattr(config, 'inconsistencies_biases_removed', False): |
| return "Inconsistencies/Biases removal flag already disabled." |
|
|
| config.inconsistencies_biases_removed = False |
| logging.info("Inconsistencies/Biases removal flag reverted. Note: Original bias values are NOT restored.") |
| return "Inconsistencies/Biases removal flag reverted (biases NOT restored)." |
|
|
| def _enable_layerdrop(config, probability=0.1): |
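| """Store a LayerDrop probability on the config; whether it has any effect depends on the model architecture honouring 'layerdrop' during training or inference.""" |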
| if not isinstance(probability, (float, int)) or not (0 <= probability <= 1): |
| msg=f"Error: LayerDrop probability must be between 0 and 1. Got {probability}." |
| logging.error(msg) |
| return msg |
|
|
| if hasattr(config, 'layerdrop'): |
| config.layerdrop = float(probability) |
| else: |
| logging.warning("Config does not have a standard 'layerdrop' attribute. Setting custom flag only.") |
| setattr(config, 'layerdrop', float(probability)) |
|
|
| config.layerdrop_enabled = (probability > 0) |
| config.layerdrop_prob = float(probability) |
|
|
| logging.info(f"LayerDrop enabled flag set in config with probability {probability}. Actual effect depends on model architecture support during training/inference.") |
| return f"LayerDrop flag {'ON' if probability > 0 else 'OFF'} (p={probability:.2f}). Requires model/Trainer support." |
|
|
| def _disable_layerdrop(config): |
| return _enable_layerdrop(config, probability=0.0) |
|
|
|
|
| def _apply_lora_merge(model, config): |
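| """Merge the LoRA adapter at config.lora_adapter_path into the base weights via PeftModel.merge_and_unload(), loading and activating the adapter first if needed, and replace global_model with the merged model.""" |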
| global global_model |
|
|
| adapter_path = getattr(config, 'lora_adapter_path', None) |
| if not adapter_path: |
| msg="No LoRA adapter path specified in config ('lora_adapter_path'). Use 'Set Path in Config' first or train/load an adapter." |
| logging.warning(msg) |
| return msg |
|
|
| if not _peft_installed: |
| msg="Error: PEFT library not installed, cannot merge LoRA." |
| logging.error(msg) |
| return msg |
|
|
| current_model = model |
|
|
| if not isinstance(current_model, PeftModel): |
| logging.warning(f"Model is not a PeftModel. Attempting to load adapter '{adapter_path}' onto it first.") |
| try: |
| peft_model_instance = PeftModel.from_pretrained(current_model, adapter_path, is_trainable=False) |
| current_model = peft_model_instance |
| logging.info(f"Successfully loaded adapter '{adapter_path}' onto the base model.") |
| except Exception as e: |
| msg = f"Error loading adapter '{adapter_path}' onto base model: {e}\n{traceback.format_exc()}" |
| logging.error(msg) |
| return msg |
| else: |
| active_adapter = getattr(current_model, 'active_adapter', 'default') |
| target_adapter_name = os.path.basename(os.path.normpath(adapter_path)) |
| if not target_adapter_name: target_adapter_name = 'default' |
|
|
| if target_adapter_name not in current_model.peft_config: |
| logging.info(f"Adapter '{target_adapter_name}' (from path {adapter_path}) not found in existing PeftModel config. Loading it now.") |
| try: |
| current_model.load_adapter(adapter_path, adapter_name=target_adapter_name, is_trainable=False) |
| logging.info(f"Loaded new adapter '{target_adapter_name}'.") |
| except Exception as e: |
| msg = f"Error loading adapter '{target_adapter_name}' from path '{adapter_path}' onto existing PeftModel: {e}\n{traceback.format_exc()}" |
| logging.error(msg) |
| return msg |
|
|
| if active_adapter != target_adapter_name: |
| try: |
| current_model.set_adapter(target_adapter_name) |
| logging.info(f"Set active adapter to '{target_adapter_name}' for merging.") |
| active_adapter = target_adapter_name |
| except Exception as e: |
| msg = f"Error setting adapter '{target_adapter_name}' active on existing PeftModel: {e}\n{traceback.format_exc()}" |
| logging.error(msg) |
| return msg |
| else: |
| active_adapter = target_adapter_name |
|
|
| try: |
| logging.info(f"Merging active LoRA adapter ('{active_adapter}') into the base model..."); T = time.time() |
| merged_model = current_model.merge_and_unload() |
| merge_time = time.time() - T |
|
|
| merged_config = merged_model.config |
| merged_config = initialize_config_flags(merged_config) |
| merged_config.lora_merged = True |
| merged_config.lora_adapter_path = adapter_path |
| merged_config.peft_adapter_added = False |
| merged_config.peft_config = None |
|
|
| global_model = merged_model |
| config = merged_config |
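| # NOTE: rebinding the local 'config' does not propagate to the caller; after this call |
| # the merged config remains reachable via global_model.config. |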
|
|
| msg = f"LoRA adapter '{active_adapter}' (from {adapter_path}) merged successfully in {merge_time:.2f}s. Global model updated to the merged base model." |
| logging.info(msg) |
| clean_memory() |
| return msg |
| except ValueError as ve: |
| msg = f"Error merging LoRA adapter '{active_adapter}': {ve}. Adapter type might not support merging." |
| logging.error(msg) |
| return msg |
| except Exception as e: |
| msg = f"Error merging LoRA adapter '{active_adapter}': {e}\n{traceback.format_exc()}" |
| logging.error(msg) |
| return msg |
|
|
|
|
| def _revert_lora_merge(model, config): |
| if not getattr(config, 'lora_merged', False): |
| return "LoRA merge flag is already false (or merge never applied/recorded)." |
|
|
| config.lora_merged = False |
| config.lora_adapter_path = None |
| msg = "LoRA merge flag reverted. IMPORTANT: Model weights are NOT restored to pre-merge state. Reload the original base model if needed."; |
| logging.warning(msg) |
| return msg |
|
|
|
|
| def _set_lora_adapter_path(config, path): |
| if path and isinstance(path, str) and path.strip(): |
| path = path.strip() |
| config.lora_adapter_path = path |
| msg = f"LoRA adapter path set in config to: '{path}'" |
| logging.info(msg) |
| return msg |
| else: |
| msg = "Invalid or empty LoRA adapter path provided. Path not set." |
| logging.warning(msg) |
| return msg |
|
|
|
|
| def _setup_knowledge_distillation(model, config, num_labels=2): |
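| """Attach a fresh Linear classification head ('kd_classifier': hidden_size -> num_labels, Xavier-initialized) for knowledge-distillation experiments; the training loop must be adapted to actually use it.""" |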
| if not isinstance(num_labels, int) or num_labels <= 0: |
| msg = f"Error: Number of labels for KD must be a positive integer, got {num_labels}." |
| logging.error(msg) |
| return msg |
|
|
| try: |
| device=get_device() |
| try: |
| dtype = next(iter(model.parameters())).dtype |
| except StopIteration: |
| dtype = torch.float32 |
| if not isinstance(dtype, torch.dtype): dtype = torch.float32 |
|
|
| classifier_name = 'kd_classifier' |
| if hasattr(model, classifier_name): |
| logging.warning(f"Model already has an attribute named '{classifier_name}'. Overwriting.") |
|
|
| hidden_size = getattr(config, 'hidden_size', getattr(config, 'd_model', getattr(config, 'embed_dim', None))) |
| if not isinstance(hidden_size, int) or hidden_size <= 0: |
| raise ValueError("Cannot setup KD: Model config missing valid 'hidden_size', 'd_model', or 'embed_dim' attribute.") |
|
|
| classifier_layer = nn.Linear(hidden_size, num_labels).to(device, dtype=dtype) |
| nn.init.xavier_uniform_(classifier_layer.weight) |
| if classifier_layer.bias is not None: |
| nn.init.zeros_(classifier_layer.bias) |
|
|
| setattr(model, classifier_name, classifier_layer) |
|
|
| if not hasattr(config, 'num_labels') or config.num_labels is None: |
| config.num_labels = num_labels |
| else: |
| logging.warning(f"Model config already has 'num_labels'={config.num_labels}. KD setup might conflict if used for other classification tasks.") |
|
|
| config.knowledge_distillation_setup = True |
| config.kd_num_labels = num_labels |
|
|
| msg = (f"Knowledge Distillation head ('{classifier_name}') added with {num_labels} labels (outputs). " |
| f"Requires training changes: loss calculation using this head (e.g., cross-entropy on its logits), " |
| f"and appropriate data format (e.g., sequence inputs + target labels).") |
| logging.info(msg) |
| return msg |
| except Exception as e: |
| msg = f"Error setting up Knowledge Distillation head: {e}\n{traceback.format_exc()}" |
| logging.error(msg) |
| if hasattr(model, 'kd_classifier'): delattr(model, 'kd_classifier') |
| config.knowledge_distillation_setup = False |
| config.kd_num_labels = None |
| return msg |
|
|
| def _revert_knowledge_distillation(model, config): |
| classifier_name = 'kd_classifier' |
| if hasattr(model, classifier_name): |
| delattr(model, classifier_name) |
| config.knowledge_distillation_setup = False |
| config.kd_num_labels = None |
| msg = f"Knowledge Distillation setup reverted (removed '{classifier_name}' head and reset config flags)." |
| logging.info(msg) |
| clean_memory() |
| return msg |
| else: |
| config.knowledge_distillation_setup = False |
| config.kd_num_labels = None |
| msg = f"Knowledge Distillation head ('{classifier_name}') not found, nothing to revert. Reset flags." |
| logging.info(msg) |
| return msg |
|
|
|
|
| def _setup_reward_modeling(model, config, num_outputs=1): |
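| """Attach a fresh Linear reward head ('reward_head': hidden_size -> num_outputs, Xavier-initialized); a ranking-style loss and preference-pair data are still required to train it.""" |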
| if not isinstance(num_outputs, int) or num_outputs <= 0: |
| msg = f"Error: Number of outputs for Reward Model must be a positive integer, got {num_outputs}." |
| logging.error(msg) |
| return msg |
|
|
| try: |
| device=get_device() |
| try: |
| dtype = next(iter(model.parameters())).dtype |
| except StopIteration: |
| dtype = torch.float32 |
| if not isinstance(dtype, torch.dtype): dtype = torch.float32 |
|
|
| rm_head_name = 'reward_head' |
| if hasattr(model, rm_head_name): |
| logging.warning(f"Model already has an attribute named '{rm_head_name}'. Overwriting.") |
|
|
| hidden_size = getattr(config, 'hidden_size', getattr(config, 'd_model', getattr(config, 'embed_dim', None))) |
| if not isinstance(hidden_size, int) or hidden_size <= 0: |
| raise ValueError("Cannot setup Reward Model head: Model config missing valid 'hidden_size', 'd_model', or 'embed_dim'.") |
|
|
| reward_head = nn.Linear(hidden_size, num_outputs).to(device, dtype=dtype) |
| nn.init.xavier_uniform_(reward_head.weight) |
| if reward_head.bias is not None: |
| nn.init.zeros_(reward_head.bias) |
|
|
| setattr(model, rm_head_name, reward_head) |
|
|
| config.reward_modeling_setup = True |
| config.rm_num_outputs = num_outputs |
|
|
| msg = (f"Reward Modeling head ('{rm_head_name}') added with {num_outputs} output(s). " |
| f"Requires training changes: loss targeting rewards (e.g., ranking loss), specific data format (prompt, chosen_resp, rejected_resp), " |
| f"and likely using the final hidden state of the sequence as input to this head.") |
| logging.info(msg) |
| return msg |
| except Exception as e: |
| msg = f"Error setting up Reward Modeling head: {e}\n{traceback.format_exc()}" |
| logging.error(msg) |
| if hasattr(model, 'reward_head'): delattr(model, 'reward_head') |
| config.reward_modeling_setup = False |
| config.rm_num_outputs = None |
| return msg |
|
|
| def _revert_reward_modeling(model, config): |
| rm_head_name = 'reward_head' |
| if hasattr(model, rm_head_name): |
| delattr(model, rm_head_name) |
| config.reward_modeling_setup = False |
| config.rm_num_outputs = None |
| msg = f"Reward Modeling setup reverted (removed '{rm_head_name}' head and reset config flags)." |
| logging.info(msg) |
| clean_memory() |
| return msg |
| else: |
| config.reward_modeling_setup = False |
| config.rm_num_outputs = None |
| msg = f"Reward Modeling head ('{rm_head_name}') not found, nothing to revert. Reset flags." |
| logging.info(msg) |
| return msg |
|
|
|
|
| def _set_rope_scaling_config(model, config, scaling_type="linear", factor=2.0): |
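| """Write a RoPE scaling dict ({'type': 'linear'|'dynamic', 'factor': >= 1.0}) into the config; the model must be reloaded with this config for the scaling to take effect.""" |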
| valid_types = ["linear", "dynamic"] |
| if not scaling_type or not isinstance(scaling_type, str) or scaling_type not in valid_types: |
| msg = f"Error: RoPE scaling type must be one of {valid_types}. Got '{scaling_type}'." |
| logging.error(msg) |
| return msg |
| try: |
| factor = float(factor) |
| if factor < 1.0: raise ValueError("Factor must be >= 1.0.") |
| if factor == 1.0: logging.warning(f"RoPE scaling factor set to {factor}, which implies no scaling.") |
| except (ValueError, TypeError) as e: |
| msg=f"Error: Invalid RoPE scaling factor '{factor}'. Must be a number >= 1.0. Error: {e}" |
| logging.error(msg) |
| return msg |
|
|
| rope_config = {"type": scaling_type, "factor": factor} |
| config.rope_scaling = rope_config |
|
|
| config.rope_scaling_type = scaling_type |
| config.rope_scaling_factor = factor |
|
|
| msg = (f"RoPE Scaling set in config: type='{scaling_type}', factor={factor:.2f}. " |
| f"Requires model architecture support and **reloading the model** with this config for the changes to take effect.") |
| logging.warning(msg) |
| return msg |
|
|
| def _revert_rope_scaling(model, config): |
| if hasattr(config, 'rope_scaling') and config.rope_scaling is not None: |
| config.rope_scaling = None |
| config.rope_scaling_type = None |
| config.rope_scaling_factor = None |
| msg = "RoPE Scaling configuration removed from config. Model reload required to revert RoPE behavior." |
| logging.warning(msg) |
| return msg |
| else: |
| config.rope_scaling_type = None |
| config.rope_scaling_factor = None |
| msg = "RoPE Scaling was not configured. No changes made." |
| logging.info(msg) |
| return msg |
|
|
|
|
| def _set_sliding_window_config(model, config, window_size=4096): |
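| """Set config.sliding_window to the given size (0 disables it); only architectures with sliding-window attention (e.g. Mistral) honour this, typically after a reload.""" |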
| try: |
| window_size = int(window_size) |
| if window_size < 0: raise ValueError("Window size must be non-negative (0 or None to disable).") |
| except (ValueError, TypeError) as e: |
| msg=f"Error: Invalid sliding window size '{window_size}'. Must be a non-negative integer. Error: {e}" |
| logging.error(msg) |
| return msg |
|
|
| effective_window_size = window_size if window_size > 0 else None |
| config.sliding_window = effective_window_size |
| config.sliding_window_size = effective_window_size |
|
|
| if effective_window_size: |
| msg = (f"Sliding Window Attention size set in config to: {effective_window_size}. " |
| f"Requires model architecture support (e.g., Mistral) and potentially reloading the model.") |
| else: |
| msg = "Sliding Window Attention disabled in config (size set to 0 or None). Model reload may be needed." |
|
|
| logging.warning(msg) |
| return msg |
|
|
| def _revert_sliding_window(model, config): |
| if hasattr(config, 'sliding_window') and config.sliding_window is not None: |
| config.sliding_window = None |
| config.sliding_window_size = None |
| msg = "Sliding Window Attention configuration removed from config. Model reload may be needed to revert behavior." |
| logging.warning(msg) |
| return msg |
| else: |
| config.sliding_window_size = None |
| msg = "Sliding Window Attention was not configured. No changes made." |
| logging.info(msg) |
| return msg |
|
|
|
|
| def _set_attention_variant_config(model, config, variant="auto"): |
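| """Record the preferred attention implementation ('auto', 'eager', 'sdpa' or 'flash_attention_2') in the config; it only takes effect when the model is reloaded with attn_implementation set accordingly.""" |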
| valid_variants = ["auto", "eager", "sdpa", "flash_attention_2"] |
| if not variant or not isinstance(variant, str) or variant not in valid_variants: |
| msg = f"Error: Invalid attention variant '{variant}'. Choose from: {', '.join(valid_variants)}." |
| logging.error(msg) |
| return msg |
|
|
| config.attn_implementation = variant |
| config.attention_variant = variant |
| config.use_flash_attention_2 = (variant == "flash_attention_2") |
|
|
| msg = (f"Attention implementation preference set in config to: '{variant}'. " |
| f"Effective implementation depends on model, hardware, and transformers version. **Requires model reload** to take effect.") |
| logging.warning(msg) |
| return msg |
|
|
| def _revert_attention_variant(model, config): |
| default_variant = "auto" |
| current_variant = getattr(config, 'attn_implementation', default_variant) |
|
|
| if current_variant != default_variant: |
| config.attn_implementation = default_variant |
| config.attention_variant = default_variant |
| config.use_flash_attention_2 = False |
| msg = f"Attention implementation preference reverted to '{default_variant}' in config. Model reload required." |
| logging.warning(msg) |
| return msg |
| else: |
| config.attention_variant = default_variant |
| config.use_flash_attention_2 = False |
| msg = f"Attention implementation preference is already '{default_variant}' or was not set. No changes made." |
| logging.info(msg) |
| return msg |
|
|
| def _enable_gradient_clipping(config): return _set_flag_only(config, "gradient_clipping_disabled", False, "Gradient Clipping Enabled (flag for Trainer).", "Gradient Clipping Disabled.") |
| def _disable_gradient_clipping(config): return _set_flag_only(config, "gradient_clipping_disabled", True, "Gradient Clipping Enabled.", "Gradient Clipping Disabled (flag for Trainer).") |
| def _enable_weight_decay(config): return _set_flag_only(config, "weight_decay_disabled", False, "Weight Decay Enabled (flag for Trainer).", "Weight Decay Disabled.") |
| def _disable_weight_decay(config): return _set_flag_only(config, "weight_decay_disabled", True, "Weight Decay Enabled.", "Weight Decay Disabled (flag for Trainer).") |
| def _enable_lr_scheduler(config): return _set_flag_only(config, "lr_scheduler_disabled", False, "LR Scheduler Enabled (flag for Trainer).", "LR Scheduler Disabled.") |
| def _disable_lr_scheduler(config): return _set_flag_only(config, "lr_scheduler_disabled", True, "LR Scheduler Enabled.", "LR Scheduler Disabled (flag for Trainer).") |
|
|
| def _enable_enhanced_security(config): return _set_flag_only(config, "enhanced_security_enabled", True, "Enhanced Security Enabled (symbolic flag).", "Enhanced Security Disabled.") |
| def _disable_enhanced_security(config): return _set_flag_only(config, "enhanced_security_enabled", False, "Enhanced Security Enabled.", "Enhanced Security Disabled (symbolic flag).") |
| def _enable_debug_mode(config): return _set_flag_only(config, "debug_mode_enabled", True, "Debug Mode Enabled (symbolic flag).", "Debug Mode Disabled.") |
| def _disable_debug_mode(config): return _set_flag_only(config, "debug_mode_enabled", False, "Debug Mode Enabled.", "Debug Mode Disabled (symbolic flag).") |
| def _enable_internal_usage_logging(config): return _set_flag_only(config, "internal_logging_enabled", True, "Internal Usage Logging Enabled (symbolic flag).", "Internal Logging Disabled.") |
| def _disable_internal_usage_logging(config): return _set_flag_only(config, "internal_logging_enabled", False, "Internal Logging Enabled.", "Internal Logging Disabled (symbolic flag).") |
| def _enable_drift_detection(config): return _set_flag_only(config, "drift_detection_enabled", True, "Drift Detection Enabled (symbolic flag).", "Drift Detection Disabled.") |
| def _disable_drift_detection(config): return _set_flag_only(config, "drift_detection_enabled", False, "Drift Detection Enabled.", "Drift Detection Disabled (symbolic flag).") |
|
|
|
|
| def _enable_auto_optimization(base_model, config): |
| msg = "" |
| if getattr(config, 'auto_optimization_enabled', False): |
| msg = "Auto Optimization already enabled (flag was true)." |
| logging.info(msg) |
| return msg |
|
|
| logging.info("Enabling Auto Optimization: Applying Quantization and Gradient Checkpointing...") |
| device = get_device() |
| quant_mode = 'bfloat16' if (device.type == 'cuda' and torch.cuda.is_bf16_supported()) else 'float16' |
| if device.type == 'cpu': quant_mode = 'float32' |
|
|
| quant_msg = _quantize_model(base_model, config, mode=quant_mode) |
| gc_msg = _enable_gradient_checkpointing(base_model, config) |
|
|
| config.auto_optimization_enabled = True |
| msg = f"Auto Optimization Enabled. Quantization ({quant_mode}): {quant_msg}. Gradient Checkpointing: {gc_msg}" |
| logging.info(msg) |
| return msg |
|
|
| def _disable_auto_optimization(config): |
| if getattr(config, 'auto_optimization_enabled', False): |
| config.auto_optimization_enabled = False |
| logging.info("Auto Optimization Disabled (flag only). Applied optimizations (like quantization, GC) remain active unless manually reverted.") |
| return "Auto Optimization Disabled (flag only)." |
| else: |
| logging.info("Auto Optimization was already disabled.") |
| return "Auto Optimization already disabled." |
|
|
|
|
| def _recover_perfect_precision(base_model, config): |
| logging.info("Attempting to recover FP32 precision...") |
| msg = _quantize_model(base_model, config, mode='float32') |
|
|
| if getattr(config, 'perfect_precision_recovered', False): |
| logging.info(f"Successfully recovered FP32 precision. Status: {msg}") |
| return "Recovered FP32 Precision. " + msg |
| else: |
| logging.warning(f"FP32 precision recovery might have failed or model was already FP32. Status: {msg}") |
| return "Attempted FP32 Precision Recovery. " + msg |
|
|
| def _revert_perfect_precision(base_model, config): |
| if not getattr(config, 'perfect_precision_recovered', False): |
| return "Model not currently in FP32 mode according to flag (or flag is inconsistent)." |
|
|
| device = get_device() |
| mode_to_revert_to = 'bfloat16' if (device.type=='cuda' and torch.cuda.is_bf16_supported()) else 'float16' if device.type=='cuda' else 'float32' |
|
|
| if mode_to_revert_to == 'float32': |
| logging.info("Cannot revert from FP32 as the target revert type is also FP32 (e.g., on CPU).") |
| return "Cannot revert from FP32 to lower precision on current device." |
|
|
| logging.info(f"Reverting precision from FP32 (target: {mode_to_revert_to})...") |
| msg = _quantize_model(base_model, config, mode=mode_to_revert_to) |
| logging.info(f"Attempted precision revert from FP32: {msg}") |
| return f"Reverted Precision from FP32 (attempted {mode_to_revert_to}). " + msg |
|
|
|
|
| def _optimize_token_generation_speed(config): |
| if not hasattr(config, '_original_do_sample'): |
| config._original_do_sample = getattr(config, 'do_sample', True) |
| if not hasattr(config, '_original_num_beams'): |
| config._original_num_beams = getattr(config, 'num_beams', 1) |
| if not hasattr(config, '_original_use_cache'): |
| default_use_cache = True |
| if hasattr(config, 'model_type'): |
| if config.model_type == "t5" and getattr(config, 'gradient_checkpointing', False): |
| default_use_cache = False |
| config._original_use_cache = getattr(config, 'use_cache', default_use_cache) |
|
|
| config.do_sample = False |
| config.num_beams = 1 |
| config.use_cache = True |
| config.token_gen_speed_maximized = True |
| logging.info("Token Generation Speed Optimized (Flags set for greedy decoding, num_beams=1, use_cache=True).") |
| return "Token Speed Opt flags set (greedy, cache on)." |
|
|
| def _revert_token_generation_speed_optimization(config): |
| if not getattr(config, 'token_gen_speed_maximized', False): |
| return "Token speed optimization not active according to flag." |
|
|
| config.do_sample = getattr(config, '_original_do_sample', True) |
| config.num_beams = getattr(config, '_original_num_beams', 1) |
| config.use_cache = getattr(config, '_original_use_cache', True) |
|
|
| config.token_gen_speed_maximized = False |
|
|
| if hasattr(config, '_original_do_sample'): del config._original_do_sample |
| if hasattr(config, '_original_num_beams'): del config._original_num_beams |
| if hasattr(config, '_original_use_cache'): del config._original_use_cache |
|
|
| logging.info("Token Generation Speed Optimization Reverted to previous/default flags.") |
| return "Token Speed Optimization Reverted." |
|
|
|
|
| def _add_peft_adapter(model, config, peft_config_obj=None): |
| global global_model, current_peft_config |
|
|
| if not _peft_installed: |
| return "[Error] PEFT library (pip install peft) is not installed." |
| if isinstance(model, PeftModel): |
| return "[Warning] Model is already a PEFT model. Merge or remove existing adapters before adding a new one via this button." |
| if getattr(config, 'lora_merged', False): |
| return "[Warning] LoRA adapters were previously merged into this model state. Adding new adapters might have unintended effects without reloading the original base model." |
|
|
| try: |
| if peft_config_obj and isinstance(peft_config_obj, (LoraConfig, PeftConfig)): |
| peft_conf = peft_config_obj |
| logging.info(f"Using provided PEFT config object: {peft_conf}") |
| else: |
| default_config_dict = copy.deepcopy(DEFAULT_PEFT_CONFIG_DICT) |
| if not default_config_dict: |
| raise ValueError("Default PEFT config is not available and no valid config provided.") |
| peft_conf = LoraConfig(**default_config_dict) |
| logging.info(f"Using default PEFT config: {peft_conf}") |
|
|
| if hasattr(peft_conf, 'task_type') and peft_conf.task_type != TaskType.CAUSAL_LM: |
| logging.warning(f"PEFT config task type is {peft_conf.task_type}, overriding to CAUSAL_LM for this platform.") |
| peft_conf.task_type = TaskType.CAUSAL_LM |
| elif not hasattr(peft_conf, 'task_type'): |
| if isinstance(peft_conf, PeftConfig) and not isinstance(peft_conf, LoraConfig): |
| peft_conf.task_type = TaskType.CAUSAL_LM |
|
|
| peft_model = get_peft_model(model, peft_conf) |
|
|
| base_model_config = peft_model.get_base_model().config |
| base_model_config.peft_adapter_added = True |
| base_model_config.peft_config = peft_conf.to_dict() |
| base_model_config.lora_merged = False |
|
|
| current_peft_config = peft_conf |
| global_model = peft_model |
| config = base_model_config |
|
|
| trainable_params, all_params = peft_model.get_nb_trainable_parameters() |
| logging.info( |
| f"trainable params: {trainable_params:,d} || all params: {all_params:,d} || trainable%: {100 * trainable_params / all_params:.4f}" |
| ) |
| msg = f"PEFT adapter ({type(peft_conf).__name__}) added successfully. Model is ready for PEFT training." |
| logging.info(msg) |
| return msg |
|
|
| except Exception as e: |
| logging.error(f"Error adding PEFT adapter: {e}\n{traceback.format_exc()}") |
| if hasattr(model, 'config'): |
| model.config.peft_adapter_added = False |
| model.config.peft_config = None |
| return f"[Error] Failed to add PEFT adapter: {e}" |
|
|
| def _remove_peft_adapter(model, config): |
| global global_model, current_peft_config |
|
|
| if not _peft_installed: |
| return "[Error] PEFT library not installed." |
|
|
| if not isinstance(model, PeftModel): |
| if getattr(config, 'peft_adapter_added', False): |
| logging.warning("Model is not a PeftModel instance, but PEFT flag was set. Resetting flags.") |
| config.peft_adapter_added = False |
| config.peft_config = None |
| current_peft_config = {} |
| return "[Warning] Reset PEFT flags as model was not a PeftModel instance." |
| else: |
| return "[Info] No PEFT adapter currently applied to the model." |
|
|
| try: |
| base_model = model.get_base_model() |
|
|
| global_model = base_model |
| config = base_model.config |
| config.peft_adapter_added = False |
| config.peft_config = None |
| current_peft_config = {} |
|
|
| msg = "PEFT adapter layers removed. Restored base model and reset PEFT config flags." |
| logging.info(msg) |
| clean_memory() |
| return msg |
|
|
| except Exception as e: |
| logging.error(f"Error removing PEFT adapter: {e}\n{traceback.format_exc()}") |
| return f"[Error] Failed to remove PEFT adapter: {e}" |
|
|
|
|
| def _setup_multimodal(model, config, selected_modalities): |
| global global_tokenizer |
|
|
| if not selected_modalities: |
| return "[Info] No modalities selected for setup." |
| if getattr(config, 'multimodal_applied', False): |
| current_modalities = getattr(config, 'supported_modalities', []) |
| return f"[Warning] Multi-modal setup already applied for modalities: {current_modalities}. Revert first to change." |
|
|
| logging.info(f"Attempting multi-modal setup for: {selected_modalities}") |
| device = get_device() |
| llm_hidden_size = getattr(config, 'hidden_size', getattr(config, 'd_model', None)) |
|
|
| if not llm_hidden_size: |
| return "[Error] Cannot setup multi-modal: LLM config missing 'hidden_size' or 'd_model'." |
|
|
| if global_tokenizer is None: |
| return "[Error] Cannot setup multi-modal: Global tokenizer not loaded." |
|
|
|
|
| try: |
| added_encoders = {} |
| added_projections = {} |
| added_special_tokens = {} |
| new_tokens_added_to_tokenizer = [] |
| current_modality_config = {} |
| current_special_tokens_map = {} |
|
|
| tokens_to_add_struct = [] |
| for modality in selected_modalities: |
| if modality not in MODALITY_ENCODERS: |
| logging.warning(f"Skipping modality '{modality}': No predefined encoder found.") |
| continue |
| special_token = f"<{modality.upper()}>" |
| if special_token not in global_tokenizer.get_vocab(): |
| tokens_to_add_struct.append({'token': special_token, 'modality': modality}) |
| else: |
| token_id = global_tokenizer.convert_tokens_to_ids(special_token) |
| current_special_tokens_map[modality] = {"token": special_token, "id": token_id} |
| logging.info(f"Special token '{special_token}' for {modality} already exists (ID: {token_id}).") |
|
|
| if tokens_to_add_struct: |
| num_added = global_tokenizer.add_tokens([t['token'] for t in tokens_to_add_struct], special_tokens=True) |
| if num_added > 0: |
| logging.info(f"Added {num_added} new special tokens to tokenizer: {[t['token'] for t in tokens_to_add_struct]}") |
| logging.info(f"Resizing LLM token embeddings from {model.config.vocab_size} to {len(global_tokenizer)}.") |
| model.resize_token_embeddings(len(global_tokenizer)) |
| if hasattr(config, 'vocab_size'): |
| config.vocab_size = len(global_tokenizer) |
|
|
| with torch.no_grad(): |
| input_embeddings = model.get_input_embeddings() |
| if input_embeddings is not None and hasattr(input_embeddings, 'weight'): |
| avg_weight = input_embeddings.weight[:-num_added,:].mean(dim=0) |
| input_embeddings.weight[-num_added:,:] = avg_weight |
| logging.info(f"Initialized {num_added} new token embeddings with average weight.") |
|
|
| for t_info in tokens_to_add_struct: |
| modality = t_info['modality'] |
| special_token = t_info['token'] |
| token_id = global_tokenizer.convert_tokens_to_ids(special_token) |
| current_special_tokens_map[modality] = {"token": special_token, "id": token_id} |
| new_tokens_added_to_tokenizer.append(special_token) |
| else: |
| logging.error(f"Failed to add special tokens: {[t['token'] for t in tokens_to_add_struct]}. Aborting multi-modal setup.") |
| return "[Error] Failed to add required special tokens to tokenizer." |
|
|
| successful_modalities = [] |
| for modality in selected_modalities: |
| if modality not in MODALITY_ENCODERS: continue |
|
|
| encoder_id = MODALITY_ENCODERS[modality] |
| encoder_attr_name = f"{modality.lower()}_encoder" |
| projection_attr_name = f"{modality.lower()}_projection" |
|
|
| try: |
| logging.info(f"Loading {modality} encoder: {encoder_id}") |
| encoder = AutoModel.from_pretrained(encoder_id, trust_remote_code=True) |
| encoder = encoder.to(device).eval() |
| for param in encoder.parameters(): |
| param.requires_grad = False |
| added_encoders[encoder_attr_name] = encoder |
| setattr(model, encoder_attr_name, encoder) |
|
|
| encoder_hidden_size = _get_encoder_hidden_size(encoder_id, trust_remote_code=True) |
|
|
| logging.info(f"Creating projection layer for {modality}: {encoder_hidden_size} -> {llm_hidden_size}") |
| projection = nn.Linear(encoder_hidden_size, llm_hidden_size).to(device) |
| nn.init.xavier_uniform_(projection.weight) |
| if projection.bias is not None: nn.init.zeros_(projection.bias) |
|
|
| added_projections[projection_attr_name] = projection |
| setattr(model, projection_attr_name, projection) |
|
|
| current_modality_config[modality] = encoder_id |
| successful_modalities.append(modality) |
|
|
| except Exception as mod_e: |
| logging.error(f"Failed to setup modality '{modality}' with encoder '{encoder_id}': {mod_e}") |
| if hasattr(model, encoder_attr_name): delattr(model, encoder_attr_name) |
| if hasattr(model, projection_attr_name): delattr(model, projection_attr_name) |
|
|
| if successful_modalities: |
| config.multimodal_applied = True |
| config.supported_modalities = successful_modalities |
| config.modality_encoders = current_modality_config |
| config.modality_projection_dim = llm_hidden_size |
| config.modality_special_tokens = current_special_tokens_map |
|
|
| msg = (f"Multi-modal setup partially/fully applied for: {successful_modalities}. " |
| f"Added {len(added_encoders)} encoders and {len(added_projections)} projections. " |
| f"Added/mapped {len(current_special_tokens_map)} special tokens. ") |
| logging.warning(msg) |
| return msg |
| else: |
| config.multimodal_applied = False |
| return "[Error] Multi-modal setup failed for all selected modalities." |
|
|
| except Exception as e: |
| logging.error(f"Error during multi-modal setup: {e}\n{traceback.format_exc()}") |
| for name in added_encoders.keys(): |
| if hasattr(model, name): delattr(model, name) |
| for name in added_projections.keys(): |
| if hasattr(model, name): delattr(model, name) |
| config.multimodal_applied = False |
| config.supported_modalities = [] |
| config.modality_encoders = {} |
| config.modality_projection_dim = None |
| config.modality_special_tokens = {} |
| return (f"[Error] Multi-modal setup failed: {e}. Attempted cleanup, state might be inconsistent " |
| "(tokenizer/embeddings may remain changed). Reload original model/tokenizer for full reset.") |
|
|
|
|
| def _revert_multimodal(model, config): |
| if not getattr(config, 'multimodal_applied', False): |
| return "[Info] Multi-modal setup not applied according to config." |
|
|
| modalities_to_revert = getattr(config, 'supported_modalities', []) |
| if not modalities_to_revert: |
| config.multimodal_applied = False |
| config.modality_encoders = {} |
| config.modality_projection_dim = None |
| config.modality_special_tokens = {} |
| return "[Info] No supported modalities listed in config to revert, but flag was true. Resetting flags." |
|
|
| logging.info(f"Reverting multi-modal setup for modalities: {modalities_to_revert}") |
| removed_count = 0 |
| errors = [] |
|
|
| try: |
| for modality in modalities_to_revert: |
| encoder_attr_name = f"{modality.lower()}_encoder" |
| projection_attr_name = f"{modality.lower()}_projection" |
| try: |
| if hasattr(model, encoder_attr_name): |
| delattr(model, encoder_attr_name) |
| logging.info(f"Removed encoder: {encoder_attr_name}") |
| removed_count += 1 |
| if hasattr(model, projection_attr_name): |
| delattr(model, projection_attr_name) |
| logging.info(f"Removed projection: {projection_attr_name}") |
| removed_count += 1 |
| except Exception as del_e: |
| error_msg = f"Error removing components for modality '{modality}': {del_e}" |
| logging.error(error_msg) |
| errors.append(error_msg) |
|
|
| config.multimodal_applied = False |
| config.supported_modalities = [] |
| config.modality_encoders = {} |
| config.modality_projection_dim = None |
| config.modality_special_tokens = {} |
|
|
| logging.warning("Multi-modal components removed. **Special tokens added to tokenizer and potentially resized embeddings remain.** Reload original model/tokenizer if full reversion needed.") |
| clean_memory() |
|
|
| final_msg = f"Multi-modal setup reverted ({removed_count} components removed, flags reset). Embeddings/tokenizer not shrunk." |
| if errors: |
| final_msg += f" Errors encountered: {'; '.join(errors)}" |
| return final_msg |
|
|
| except Exception as e: |
| logging.error(f"Error reverting multi-modal setup: {e}\n{traceback.format_exc()}") |
| config.multimodal_applied = False |
| config.supported_modalities = [] |
| config.modality_encoders = {} |
| config.modality_projection_dim = None |
| config.modality_special_tokens = {} |
| return f"[Error] Reverting multi-modal setup failed: {e}. Flags reset." |
|
|
|
|
| def auto_extract_text_universal(data_item): |
| if isinstance(data_item, str): |
| return data_item.strip().replace('\\n', '\n') |
| elif isinstance(data_item, bytes): |
| try: |
| return data_item.decode('utf-8', errors='replace').strip().replace('\\n', '\n') |
| except Exception: |
| return "" |
| elif isinstance(data_item, (list, tuple)): |
| texts = [auto_extract_text_universal(item) for item in data_item] |
| return " ".join(filter(None, texts)) |
| elif isinstance(data_item, dict): |
| texts = [] |
| potential_keys = [ |
| 'text', 'content', 'sentence', 'paragraph', 'article', 'abstract', |
| 'summary', 'body', 'passage', 'document', 'script', 'dialogue', |
| 'instruction', 'input', 'output', 'query', 'response', 'title', |
| 'question', 'answer', 'prompt', 'completion', 'target', 'label', |
| 'review', 'comment', 'post', 'code', 'markdown' |
| ] |
| processed_keys = set() |
|
|
| for key in potential_keys: |
| if key in data_item and key not in processed_keys: |
| value = data_item[key] |
| extracted = auto_extract_text_universal(value) |
| if extracted: |
| texts.append(extracted) |
| processed_keys.add(key) |
|
|
| if not texts: |
| for key, value in data_item.items(): |
| if key not in processed_keys: |
| extracted = auto_extract_text_universal(value) |
| if extracted: |
| texts.append(extracted) |
| processed_keys.add(key) |
|
|
| seen = set() |
| unique_texts = [] |
| for t in texts: |
| if t and t not in seen: |
| unique_texts.append(t) |
| seen.add(t) |
| return "\n".join(unique_texts) |
|
|
| elif isinstance(data_item, (int, float, bool)) or data_item is None: |
| return "" |
| else: |
| try: |
| return str(data_item).strip().replace('\\n', '\n') |
| except Exception: |
| return "" |
|
|
|
|
| def process_example_universal(example): |
| extracted_text = auto_extract_text_universal(example) |
| return {"text": extracted_text if extracted_text else "[EMPTY_OR_NON_TEXTUAL]"} |
|
|
|
|
| def parse_datasets(dataset_text): |
| datasets = [] |
| seen_ids = set() |
| for line_num, line in enumerate(dataset_text.strip().splitlines()): |
| line = line.strip() |
| if not line or line.startswith('#'): |
| continue |
|
|
| parts = [s.strip() for s in line.split(",") if s.strip()] |
| ds_name = None |
| ds_config = None |
| ds_split = 'train' |
| ds_weight = 1.0 |
|
|
| if len(parts) >= 1: |
| ds_name = parts[0] |
| if len(parts) >= 2 and parts[1]: |
| ds_config = parts[1] if parts[1].lower() != 'none' else None |
| if len(parts) >= 3 and parts[2]: |
| ds_split = parts[2] |
| if len(parts) >= 4: |
| try: |
| ds_weight = float(parts[3]) |
| if ds_weight <= 0: |
| raise ValueError("Weight must be positive") |
| except (ValueError, IndexError): |
| logging.warning(f"Invalid or missing weight '{parts[3] if len(parts) >= 4 else ''}' on line {line_num+1} ('{line}'). Using default 1.0.") |
| ds_weight = 1.0 |
|
|
| if ds_name: |
| dataset_id = f"{ds_name}_{ds_config or 'DEFAULT'}_{ds_split}" |
| if dataset_id in seen_ids: |
| logging.warning(f"Skipping duplicate dataset entry: {dataset_id} on line {line_num+1}") |
| continue |
|
|
| datasets.append({"id": ds_name, "config": ds_config, "split": ds_split, "weight": ds_weight}) |
| seen_ids.add(dataset_id) |
| else: |
| logging.warning(f"Skipping invalid dataset line (no name found): '{line}' on line {line_num+1}") |
|
|
| if not datasets: |
| raise ValueError("No valid dataset configurations were parsed from the input.") |
|
|
| return datasets |
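| # Expected input format for parse_datasets -- one dataset per line, comma separated: |
| #   name[, config[, split[, weight]]] |
| # e.g. "wikitext, wikitext-2-raw-v1, train, 2.0" or "c4, en, train, 1.0" (illustrative IDs). |
| # A config of "none" maps to None, '#' lines are skipped, and weights drive interleaving. |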
|
|
|
|
| def load_datasets_from_config(datasets_config): |
| ds_list = [] |
| loaded_configs = [] |
| total_weight = 0.0 |
| logging.info(f"Attempting to load datasets based on config: {datasets_config}") |
|
|
| for config_entry in datasets_config: |
| ds_name = config_entry['id'] |
| ds_config = config_entry['config'] |
| ds_split = config_entry['split'] |
| ds_weight = config_entry['weight'] |
| dataset_identifier = f"{ds_name}{'['+ds_config+']' if ds_config else ''} (Split: {ds_split}, Weight: {ds_weight})" |
|
|
| try: |
| logging.info(f"Loading {dataset_identifier}...") |
| d = load_dataset( |
| ds_name, |
| ds_config, |
| streaming=True, |
| split=ds_split, |
| trust_remote_code=True, |
| ) |
|
|
| try: |
| peek = next(iter(d)) |
| original_columns = list(peek.keys()) |
| d = load_dataset(ds_name, ds_config, streaming=True, split=ds_split, trust_remote_code=True) |
| except StopIteration: |
| logging.warning(f"Dataset stream appears empty after loading: {dataset_identifier}. Skipping.") |
| continue |
| except Exception as peek_e: |
| logging.warning(f"Could not reliably peek into dataset {dataset_identifier} to get columns: {peek_e}. Will attempt processing without column removal.") |
| original_columns = None |
|
|
| logging.info(f"Processing {dataset_identifier} (Original cols: {original_columns or 'unknown'}) -> Map to 'text' field") |
| process_partial = partial(process_example_universal) |
| processed_d = d.map(process_partial, remove_columns=original_columns) |
|
|
| processed_d = processed_d.filter(lambda example: example.get("text") != "[EMPTY_OR_NON_TEXTUAL]") |
|
|
| shuffled_d = processed_d.shuffle(buffer_size=10000, seed=42) |
|
|
| ds_list.append(shuffled_d) |
| loaded_configs.append(config_entry) |
| total_weight += ds_weight |
| logging.info(f"Successfully prepared stream: {dataset_identifier}") |
|
|
| except (requests.exceptions.RequestException, gzip.BadGzipFile) as http_e: |
| logging.error(f"Network or File Error loading dataset {dataset_identifier}: {http_e}. Check connection and dataset validity. Skipping.") |
| except FileNotFoundError: |
| logging.error(f"Dataset or config not found for {dataset_identifier}. Check name/config/path. Skipping.") |
| except Exception as e: |
| logging.error(f"General Error loading/processing dataset {dataset_identifier}: {e} \n{traceback.format_exc()}. Skipping.") |
|
|
| if not ds_list: |
| raise ValueError("No valid datasets were loaded. Check dataset names, configurations, splits, availability, and network connection.") |
|
|
| logging.info(f"Successfully loaded {len(ds_list)} dataset streams.") |
|
|
| if total_weight <= 0 or len(loaded_configs) != len(ds_list): |
| probabilities = [1.0 / len(ds_list)] * len(ds_list) if ds_list else [] |
| logging.warning("Using equal probabilities for interleaving due to zero total weight, loading errors, or no datasets.") |
| else: |
| probabilities = [cfg['weight'] / total_weight for cfg in loaded_configs] |
| prob_sum = sum(probabilities) |
| if abs(prob_sum - 1.0) > 1e-6: |
| probabilities = [p / prob_sum for p in probabilities] |
|
|
| if not ds_list: |
| logging.warning("No datasets to interleave.") |
| return None |
|
|
|
|
| logging.info(f"Interleaving {len(ds_list)} datasets with probabilities: {[f'{p:.3f}' for p in probabilities]}") |
| interleaved_ds = interleave_datasets(ds_list, probabilities=probabilities, seed=42, stopping_strategy="all_exhausted") |
|
|
| return interleaved_ds |
|
|
|
|
| def tokenize_function(examples, tokenizer, context_length): |
| texts = [str(t) if t is not None else "" for t in examples["text"]] |
| tokenized_output = tokenizer(texts, truncation=False, padding=False) |
| return tokenized_output |
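| # Note: texts are tokenized without truncation or padding here; fixed-length blocks are |
| # formed later by group_texts, so the context_length argument is not used at this stage. |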
|
|
| def group_texts(examples, block_size): |
| concatenated_examples = {k: sum(examples[k], []) if isinstance(examples[k][0], list) else examples[k] for k in examples} |
| total_length = len(concatenated_examples[list(examples.keys())[0]]) |
|
|
| if total_length >= block_size: |
| total_length = (total_length // block_size) * block_size |
| else: |
| return {k: [] for k in examples.keys()} |
|
|
| result = { |
| k: [t[i : i + block_size] for i in range(0, total_length, block_size)] |
| for k, t in concatenated_examples.items() |
| } |
| result["labels"] = result["input_ids"].copy() |
| return result |
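| # group_texts concatenates every tokenized sequence in the batch and slices the result into |
| # blocks of block_size tokens; e.g. 1000 concatenated tokens with block_size=256 yield three |
| # blocks and the trailing 232 tokens are dropped. labels are a copy of input_ids (causal LM). |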
|
|
| def split_dataset(processed_lm_iterable_dataset): |
| eval_buffer_size = 1000 |
| shuffle_buffer_size = 10000 |
| logging.info(f"Preparing train/eval split. Eval buffer: {eval_buffer_size}, Shuffle buffer: {shuffle_buffer_size}..."); T = time.time() |
|
|
| if not isinstance(processed_lm_iterable_dataset, IterableDataset): |
| logging.error("Input dataset is not an IterableDataset. Cannot perform stream-based splitting.") |
| raise TypeError("Input to split_dataset must be an IterableDataset.") |
|
|
| shuffled_ds = processed_lm_iterable_dataset.shuffle(seed=42, buffer_size=shuffle_buffer_size) |
|
|
| logging.info(f"Taking up to {eval_buffer_size} samples for the evaluation buffer...") |
| eval_samples_iter = shuffled_ds.take(eval_buffer_size) |
|
|
| try: |
| eval_list = list(eval_samples_iter) |
| num_eval_samples = len(eval_list) |
| except Exception as e: |
| logging.error(f"Error collecting evaluation samples: {e}. Proceeding without evaluation set.") |
| num_eval_samples = 0 |
| eval_list = [] |
|
|
| train_ds = shuffled_ds |
|
|
| eval_ds_static = None |
| if num_eval_samples > 0: |
| logging.info(f"Collected {num_eval_samples} samples for evaluation buffer.") |
| train_ds = shuffled_ds.skip(num_eval_samples) |
| logging.info("Training stream prepared (skipped eval samples).") |
|
|
| logging.info("Creating static evaluation dataset from buffer...") |
| try: |
| if not eval_list: raise ValueError("Evaluation buffer list is empty after take().") |
| first_example = eval_list[0] |
| if not isinstance(first_example, dict): raise ValueError("Eval buffer items are not dictionaries.") |
|
|
| expected_keys = ['input_ids', 'attention_mask', 'labels'] |
| eval_features_dict = {} |
| for key in expected_keys: |
| if key not in first_example: |
| raise ValueError(f"Eval buffer items missing required key: '{key}'") |
| from datasets import Sequence |
| inner_dtype = 'int64' |
| if isinstance(first_example[key], list) and first_example[key] and isinstance(first_example[key][0], int): |
| eval_features_dict[key] = Sequence(feature=Value(dtype=inner_dtype)) |
| else: |
| # 'list' is not a valid datasets Value dtype; fall back to an int64 sequence and warn. |
| logging.warning(f"Eval buffer key '{key}' does not look like a non-empty list of ints; assuming an int64 sequence.") |
| eval_features_dict[key] = Sequence(feature=Value(dtype=inner_dtype)) |
|
|
| if not eval_features_dict: |
| raise ValueError("Could not define features for evaluation dataset.") |
|
|
| eval_features = Features(eval_features_dict) |
|
|
| valid_eval_list = [] |
| required_keys_set = set(eval_features.keys()) |
| for i, ex in enumerate(eval_list): |
| if isinstance(ex, dict) and set(ex.keys()) >= required_keys_set: |
| is_valid = all(isinstance(ex.get(k), list) for k in required_keys_set) |
| if is_valid: |
| valid_eval_list.append({k: ex[k] for k in required_keys_set}) |
| else: |
| logging.warning(f"Eval buffer item {i} has invalid type for required keys. Skipping.") |
| else: |
| logging.warning(f"Eval buffer item {i} is invalid (not dict or missing keys). Skipping.") |
|
|
| if not valid_eval_list: |
| logging.warning("No valid examples remained in the evaluation buffer after validation. Eval dataset will be None.") |
| eval_ds_static = None |
| train_ds = shuffled_ds |
| else: |
| eval_ds_static = Dataset.from_list(valid_eval_list, features=eval_features) |
| logging.info(f"Created static evaluation dataset with {len(eval_ds_static)} examples.") |
|
|
| except Exception as e: |
| logging.error(f"Error creating static evaluation dataset from buffer: {e}\n{traceback.format_exc()}. Evaluation dataset will be None.") |
| eval_ds_static = None |
| train_ds = shuffled_ds |
|
|
| else: |
| logging.warning("Evaluation buffer is empty (requested size might be too large or dataset too small). Training will continue without evaluation.") |
|
|
| logging.info(f"Dataset splitting completed in {time.time()-T:.2f}s") |
| return train_ds, eval_ds_static |
|
|
|
|
| def compute_perplexity(loss): |
| if loss is None or not isinstance(loss, (int, float)) or not math.isfinite(loss): |
| return float("inf") |
| try: |
| clamped_loss = min(max(loss, -700.0), 700.0) |
| perplexity = math.exp(clamped_loss) |
| if not math.isfinite(perplexity): |
| logging.warning(f"Perplexity calculation resulted in infinity for loss {loss} (clamped: {clamped_loss}).") |
| return float("inf") |
| return perplexity |
| except OverflowError: |
| logging.warning(f"OverflowError computing perplexity for loss {loss}. Returning infinity.") |
| return float("inf") |
| except Exception as e: |
| logging.warning(f"Error computing perplexity for loss {loss}: {e}. Returning infinity.") |
| return float("inf") |
|
|
|
|
| def merge_model_parameters(original_model, trained_model, alpha=MERGE_ALPHA): |
| if not (0 <= alpha <= 1): |
| logging.error(f"Merge alpha must be between 0 and 1. Got {alpha}. Defaulting to 0.5") |
| alpha = 0.5 |
|
|
| logging.info(f"Merging model parameters with alpha={alpha:.2f} (alpha*original + (1-alpha)*trained using linear interpolation)..."); T = time.time(); |
| device = get_device() |
|
|
| original_model = original_model.to(device) |
| trained_model = trained_model.to(device) |
|
|
| merged_model = copy.deepcopy(original_model).to(device) |
|
|
| merged_params_count = 0 |
| skipped_params_count = 0 |
|
|
| orig_params = dict(original_model.named_parameters()) |
| trained_params = dict(trained_model.named_parameters()) |
| merged_params = dict(merged_model.named_parameters()) |
|
|
| with torch.no_grad(): |
| for name, trained_param in trained_params.items(): |
| if name in orig_params and name in merged_params: |
| orig_param = orig_params[name] |
| merged_param = merged_params[name] |
|
|
| if orig_param.data.shape == trained_param.data.shape: |
| merged_tensor = torch.lerp(trained_param.data.float(), orig_param.data.float(), alpha) |
| merged_param.copy_(merged_tensor.to(merged_param.dtype)) |
| merged_params_count += 1 |
| else: |
| logging.warning(f"Size mismatch for parameter '{name}'. Original: {orig_param.data.shape}, Trained: {trained_param.data.shape}. Skipping merge for this parameter.") |
| skipped_params_count += 1 |
| else: |
| if name not in orig_params: |
| logging.warning(f"Parameter '{name}' from trained model not found in original model structure. Skipping.") |
| if name not in merged_params: |
| logging.warning(f"Parameter '{name}' from trained model not found in merged model structure (shouldn't happen). Skipping.") |
| skipped_params_count += 1 |
|
|
| logging.info(f"Parameter merging finished in {time.time()-T:.2f}s. Merged {merged_params_count} parameters, skipped {skipped_params_count}.") |
| return merged_model |
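| # torch.lerp(trained, original, alpha) computes trained + alpha * (original - trained), |
| # i.e. alpha * original + (1 - alpha) * trained, so alpha=1.0 reproduces the original |
| # weights and alpha=0.0 keeps the fully trained weights. |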
|
|
|
|
| def preserve_model_quality(original_model, trained_model, eval_dataset, tokenizer): |
| if eval_dataset is None: |
| logging.warning("No evaluation data provided (eval_dataset is None). Cannot perform quality check. Returning trained model.") |
| return trained_model |
|
|
| is_iterable = isinstance(eval_dataset, IterableDataset) |
| if is_iterable: |
| logging.warning("Evaluation dataset is iterable. Loss comparison might not be on the exact same data. Proceeding with caution.") |
| try: |
| _ = next(iter(eval_dataset.take(1))) |
| except StopIteration: |
| logging.warning("Iterable evaluation dataset appears empty. Returning trained model.") |
| return trained_model |
| except Exception as e: |
| logging.warning(f"Could not peek into iterable eval dataset: {e}. Assuming not empty.") |
| elif isinstance(eval_dataset, Dataset): |
| if len(eval_dataset) == 0: |
| logging.warning("Evaluation dataset is empty (length 0). Returning trained model.") |
| return trained_model |
| else: |
| logging.warning(f"Unknown evaluation dataset type: {type(eval_dataset)}. Cannot perform quality check. Returning trained model.") |
| return trained_model |
|
|
| data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False) |
| eval_batch_size = max(1, BATCH_SIZE // 2) |
| device = get_device() |
|
|
| was_training_orig = original_model.training |
| was_training_trained = trained_model.training |
| original_model.to(device).eval() |
| trained_model.to(device).eval() |
|
|
| temp_eval_dir = "./tmp_eval_quality_check" |
| eval_args = TrainingArguments( |
| output_dir=temp_eval_dir, |
| per_device_eval_batch_size=eval_batch_size, |
| report_to=[], |
| dataloader_num_workers=max(1, (NUM_CPU_CORES if NUM_CPU_CORES > 0 else (os.cpu_count() or 2) // 2)), |
| fp16=torch.cuda.is_available() and not USE_CPU and original_model.dtype == torch.float16, |
| bf16=(torch.cuda.is_available() and torch.cuda.is_bf16_supported()) and original_model.dtype == torch.bfloat16, |
| use_cpu=USE_CPU, |
| log_level='error', |
| remove_unused_columns=False, |
| ) |
|
|
| results = {} |
| eval_error = False |
| for model_name, model_instance in [("Original", original_model), ("Trained", trained_model)]: |
| logging.info(f"Evaluating {model_name} model for quality check..."); T_eval = time.time() |
|
|
| current_eval_dataset = eval_dataset |
| if is_iterable: |
| current_eval_dataset = eval_dataset.take(1000) |
| try: |
| if len(list(iter(current_eval_dataset.take(1)))) == 0: |
| logging.warning(f"Iterable eval sample for {model_name} is empty. Skipping eval.") |
| results[model_name] = {"loss": float('inf'), "ppl": float('inf')} |
| continue |
| current_eval_dataset = eval_dataset.take(1000) |
| except Exception as e: |
| logging.error(f"Error handling iterable dataset sample for {model_name}: {e}") |
| results[model_name] = {"loss": float('inf'), "ppl": float('inf')} |
| eval_error = True; break |
|
|
| trainer = Trainer( |
| model=model_instance, |
| args=eval_args, |
| data_collator=data_collator, |
| eval_dataset=current_eval_dataset |
| ) |
| try: |
| model_instance.eval() |
| metrics = trainer.evaluate() |
| loss = metrics.get("eval_loss") |
| ppl = compute_perplexity(loss) |
| results[model_name] = {"loss": loss if loss is not None else float('inf'), "ppl": ppl} |
| logging.info(f"{model_name} Eval Loss: {loss if loss is not None else 'N/A':.4f}, PPL: {ppl:.4f} (Eval time: {time.time()-T_eval:.2f}s)") |
| except StopIteration: |
| logging.error(f"Evaluation dataset exhausted unexpectedly during evaluation of {model_name}. Comparison may be incomplete.") |
| results[model_name] = {"loss": float('inf'), "ppl": float('inf')} |
| eval_error = True; break |
| except Exception as e: |
| logging.error(f"Error evaluating {model_name} model: {e}\n{traceback.format_exc()}") |
| results[model_name] = {"loss": float('inf'), "ppl": float('inf')} |
| eval_error = True; break |
|
|
| if os.path.exists(temp_eval_dir): |
| try: |
| shutil.rmtree(temp_eval_dir) |
| except Exception as e: |
| logging.warning(f"Could not remove temporary eval directory {temp_eval_dir}: {e}") |
|
|
| original_model.train(mode=was_training_orig) |
| trained_model.train(mode=was_training_trained) |
|
|
| original_loss = results.get("Original", {}).get("loss", float('inf')) |
| trained_loss = results.get("Trained", {}).get("loss", float('inf')) |
|
|
| if eval_error: |
| logging.error("Evaluation encountered errors. Cannot reliably compare models. Returning trained model.") |
| return trained_model |
|
|
| valid_comparison = math.isfinite(original_loss) and math.isfinite(trained_loss) |
|
|
| if valid_comparison: |
| loss_threshold = original_loss * 1.05 |
| if trained_loss > loss_threshold: |
| logging.warning(f"Trained model loss ({trained_loss:.4f}) is significantly worse (>5%) than original ({original_loss:.4f}). Reverting to original model state based on quality check.") |
| return original_model.to(device) |
| elif trained_loss > original_loss: |
| logging.info(f"Trained model loss ({trained_loss:.4f}) is slightly worse than original ({original_loss:.4f}), but within threshold. Keeping trained model.") |
| return trained_model.to(device) |
| else: |
| logging.info(f"Trained model loss ({trained_loss:.4f}) is better than or equal to original ({original_loss:.4f}). Keeping trained model.") |
| return trained_model.to(device) |
| else: |
| logging.warning("Could not perform valid loss comparison (one or both evaluations failed or yielded non-finite loss). Returning trained model.") |
| return trained_model.to(device) |
|
|
|
|
| def _merge_architectures(model_ids_str, hf_token=None, bypass_limits_state=False): |
| global global_model, global_tokenizer, config, global_pipe, BYPASS_RESOURCE_LIMITS |
| BYPASS_RESOURCE_LIMITS = bypass_limits_state |
|
|
| if not isinstance(model_ids_str, str) or not model_ids_str.strip(): |
| return "[Error] Model IDs string cannot be empty.", "{}", *get_error_filter_updates() |
|
|
| resources_ok, res_msg = check_resources() |
| if not resources_ok: |
| error_msg = f"[Error] Resource limits exceeded, cannot proceed with merge. {res_msg}" |
| logging.error(error_msg) |
| return error_msg, "{}", *get_error_filter_updates() |
| else: |
| logging.info(res_msg) |
|
|
| model_ids = [m.strip() for m in model_ids_str.split(',') if m.strip()] |
| if len(model_ids) < 2: |
| return "[Error] Need at least two valid model IDs/paths separated by commas to merge.", "{}", *get_error_filter_updates() |
|
|
| logging.info(f"Starting architecture merge (parameter averaging) for models: {model_ids}") |
| device = get_device() |
| merged_model = None |
| t_merge_start = time.time() |
| base_model_id = model_ids[0] |
|
|
| try: |
| logging.info(f"Loading base config and tokenizer from: {base_model_id}") |
| base_tokenizer = AutoTokenizer.from_pretrained(base_model_id, trust_remote_code=True, token=hf_token) |
| base_config = AutoConfig.from_pretrained(base_model_id, trust_remote_code=True, token=hf_token) |
| if base_tokenizer.pad_token is None and base_tokenizer.eos_token is not None: |
| base_tokenizer.pad_token = base_tokenizer.eos_token |
| base_config.pad_token_id = base_config.eos_token_id |
| logging.info("Set base tokenizer pad_token to eos_token for consistency.") |
|
|
| except Exception as e: |
| logging.error(f"Failed to load base config/tokenizer for {base_model_id}: {e}") |
| return f"[Error] Failed to load base model config/tokenizer: {e}", "{}", *get_error_filter_updates() |
|
|
| try: |
| logging.info(f"Loading base model state dict (CPU, float32) for merging: {base_model_id}") |
| base_model = AutoModelForCausalLM.from_pretrained( |
| base_model_id, |
| trust_remote_code=True, |
| token=hf_token, |
| torch_dtype=torch.float32, |
| low_cpu_mem_usage=True |
| ) |
| base_state_dict = base_model.state_dict() |
| merged_state_dict = OrderedDict((k, v.clone()) for k, v in base_state_dict.items()) |
| param_counts = OrderedDict((k, 1) for k in base_state_dict) |
| num_models_processed = 1 |
| del base_model, base_state_dict |
| clean_memory() |
|
|
| except Exception as e: |
| logging.error(f"Failed to load base model state dict for {base_model_id}: {e}") |
| return f"[Error] Failed to load base model state dict: {e}", "{}", *get_error_filter_updates() |
|
|
| for i, model_id in enumerate(model_ids[1:]): |
| logging.info(f"Processing model {i+2}/{len(model_ids)}: {model_id}") |
| try: |
| model_i = AutoModelForCausalLM.from_pretrained( |
| model_id, |
| trust_remote_code=True, |
| token=hf_token, |
| torch_dtype=torch.float32, |
| low_cpu_mem_usage=True |
| ) |
| state_dict_i = model_i.state_dict() |
|
|
| for name, param_i in state_dict_i.items(): |
| if name in merged_state_dict: |
| if merged_state_dict[name].shape == param_i.shape: |
| merged_state_dict[name].add_(param_i) |
| param_counts[name] += 1 |
| else: |
| logging.warning(f"Shape mismatch for parameter '{name}' between base and {model_id}. Base: {merged_state_dict[name].shape}, Current: {param_i.shape}. Parameter '{name}' will NOT include contribution from {model_id}.") |
| else: |
| logging.warning(f"Parameter '{name}' found in {model_id} but not in base model {base_model_id}. Skipping this parameter.") |
|
|
| num_models_processed += 1 |
| del model_i, state_dict_i |
| clean_memory() |
|
|
| except Exception as e: |
| logging.error(f"Failed to load or process model {model_id}: {e}. Skipping this model for merge.") |
| continue |
|
|
| if num_models_processed < 2: |
| msg = "Merge failed: Fewer than two models were successfully loaded and processed." |
| logging.error(msg) |
| return f"[Error] {msg}", "{}", *get_error_filter_updates() |
|
|
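| # Element-wise averaging: each accumulated tensor is divided by the number of models that |
| # actually contributed to it (shape mismatches were skipped above), i.e. |
| #   merged[name] = (1 / count[name]) * sum_i state_dict_i[name] |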
| averaged_count = 0 |
| for name in merged_state_dict: |
| count = param_counts.get(name, 0) |
| if count > 0: |
| merged_state_dict[name].div_(count) |
| averaged_count +=1 |
| else: |
| logging.error(f"Parameter '{name}' has count {count} <= 0 during averaging. This indicates a logic error.") |
|
|
| logging.info(f"Averaged {averaged_count} parameters across {num_models_processed} successfully processed models.") |
|
|
| try: |
| logging.info("Creating final merged model from base config and averaged weights...") |
| merged_model = AutoModelForCausalLM.from_config(base_config, trust_remote_code=True) |
| load_results = merged_model.load_state_dict(merged_state_dict, strict=False) |
|
|
| if load_results.missing_keys: |
| logging.warning(f"Load state dict results: Missing keys: {load_results.missing_keys}") |
| if load_results.unexpected_keys: |
| logging.warning(f"Load state dict results: Unexpected keys: {load_results.unexpected_keys}") |
|
|
| final_dtype = torch.bfloat16 if device.type == 'cuda' and torch.cuda.is_bf16_supported() else torch.float16 if device.type == 'cuda' else torch.float32 |
| logging.info(f"Converting merged model to {final_dtype} for use on device {device}.") |
|
|
| global_model = merged_model.to(device=device, dtype=final_dtype) |
| global_tokenizer = base_tokenizer |
| config = initialize_config_flags(global_model.config) |
| config.architecture_merged = True |
| config.merged_from_models = model_ids |
| config.merged_models_processed = num_models_processed |
|
|
| update_pipeline() |
| clean_memory() |
|
|
| final_status_json, *filter_updates = get_detailed_status_and_filter_states() |
| merge_time = time.time() - t_merge_start |
| msg = f"Successfully merged architectures (averaged parameters) from {num_models_processed} models in {merge_time:.2f}s. Base config/tokenizer from: {base_model_id}." |
| logging.info(msg) |
| return msg, final_status_json, *filter_updates |
|
|
| except Exception as e: |
| logging.error(f"Architecture merging failed during final model creation or state update: {e}\n{traceback.format_exc()}") |
| global_model = None |
| global_tokenizer = None |
| config = None |
| global_pipe = None |
| clean_memory() |
| return f"[Error] Architecture merging failed: {e}", "{}", *get_error_filter_updates() |
|
|
|
|
| def get_user_id(token): |
| if not token: |
| logging.warning("No Hugging Face token provided for user ID check.") |
| return "unknown_user" |
| try: |
| api = HfApi() |
| user_info = api.whoami(token=token) |
| return user_info.get("name", "unknown_user") |
| except requests.exceptions.HTTPError as http_err: |
| if http_err.response.status_code == 401: |
| logging.error("Hugging Face authentication failed (401 Unauthorized). Check your token.") |
| return "auth_error_user" |
| else: |
| logging.error(f"HTTP error retrieving Hugging Face user ID: {http_err}") |
| return "http_error_user" |
| except Exception as e: |
| logging.error(f"Could not retrieve Hugging Face user ID: {e}") |
| return "unknown_user" |
|
|
| def decode_model_details(model): |
| if model is None: |
| return json.dumps({"Error": "Model not loaded."}, indent=2) |
| if not hasattr(model, 'config'): |
| logging.warning("Model object lacks a 'config' attribute.") |
| details = OrderedDict() |
| details["Model Class"] = type(model).__name__ |
| details["Error"] = "Model config attribute not found." |
| return json.dumps(details, indent=2) |
|
|
| details = OrderedDict() |
| config_obj = model.config |
| t_start_decode = time.time() |
| logging.info("Decoding model details...") |
|
|
| try: |
| details["Model Class"] = type(model).__name__ |
| details["Config Class"] = getattr(config_obj, 'config_class', type(config_obj).__name__) |
| details["Model Type"] = getattr(config_obj, 'model_type', 'N/A') |
|
|
| total_params = 0 |
| trainable_params = 0 |
| param_dtypes = set() |
| param_devices = set() |
| try: |
| for name, param in model.named_parameters(): |
| num_elements = param.numel() |
| total_params += num_elements |
| param_dtypes.add(str(param.dtype).replace('torch.', '')) |
| param_devices.add(str(param.device)) |
| if param.requires_grad: |
| trainable_params += num_elements |
| if not param_devices: |
| device_str = "N/A (No parameters)" |
| elif len(param_devices) == 1: |
| device_str = param_devices.pop() |
| else: |
| device_str = f"Multiple ({', '.join(param_devices)})" |
| except Exception as e: |
| logging.warning(f"Could not fully analyze parameters: {e}") |
| device_str = "Error analyzing params" |
|
|
| details["Device(s)"] = device_str |
| trainable_perc = (100 * trainable_params / total_params) if total_params > 0 else 0.00 |
| details["Params Summary"] = (f"Total: {total_params:,}, Trainable: {trainable_params:,} " |
| f"({trainable_perc:.2f}%), Dtypes: {list(param_dtypes)}") |
|
|
| try: |
| layer_counts = Counter(type(m).__name__ for m in model.modules() if not isinstance(m, nn.Sequential)) |
| details["Layer Types Count"] = dict(layer_counts.most_common(15)) |
| except Exception as e: |
| logging.warning(f"Could not count layer types: {e}") |
| details["Layer Types Count"] = "Error counting layers" |
|
|
| details["Modification Flags"] = {} |
| all_flags = initialize_config_flags(None).__dict__.keys() |
| for flag in sorted(all_flags): |
| if hasattr(config_obj, flag): |
| value = getattr(config_obj, flag) |
| details["Modification Flags"][flag] = value |
|
|
| details["Key Config Attributes"] = {} |
| key_attrs = ['vocab_size', 'hidden_size', 'num_hidden_layers', 'num_attention_heads', |
| 'intermediate_size', 'max_position_embeddings', 'hidden_act', 'layer_norm_eps', |
| 'rms_norm_eps', 'attention_dropout', 'hidden_dropout_prob', 'initializer_range', |
| 'tie_word_embeddings', 'rope_scaling', 'sliding_window', 'attn_implementation'] |
| for attr in key_attrs: |
| if hasattr(config_obj, attr): |
| details["Key Config Attributes"][attr] = getattr(config_obj, attr) |
|
|
|
|
| logging.info(f"Model details decoded in {time.time() - t_start_decode:.2f}s") |
| return json.dumps(details, indent=2, default=str) |
| except Exception as e: |
| logging.error(f"Error decoding model details: {e} \n{traceback.format_exc()}") |
| details["Error"] = f"Failed during detail decoding: {e}" |
| return json.dumps(details, indent=2, default=str) |
|
|
|
|
| def update_pipeline(): |
| global global_model, global_tokenizer, global_pipe |
| if global_model and global_tokenizer: |
| device = get_device() |
|
|
| pipeline_device_arg = None |
| device_map = None |
|
|
| if device.type == 'cpu': |
| pipeline_device_arg = -1 |
| logging.info("Configuring pipeline for CPU.") |
| elif device.type == 'cuda': |
| if torch.cuda.device_count() > 1: |
| device_map = "auto" |
| pipeline_device_arg = None |
| logging.info("Multiple GPUs detected, configuring pipeline with device_map='auto'.") |
| else: |
| pipeline_device_arg = 0 |
| logging.info("Configuring pipeline for single CUDA device (device=0).") |
| elif device.type == 'mps': |
| pipeline_device_arg = 0 |
| logging.info("Configuring pipeline for MPS device (device=0).") |
| else: |
| pipeline_device_arg = -1 |
| logging.warning(f"Unknown device type '{device.type}', configuring pipeline for CPU.") |
|
|
| logging.info(f"Updating text generation pipeline (Device Arg: {pipeline_device_arg}, Device Map: {device_map})..."); T=time.time() |
| try: |
| if device_map is None and pipeline_device_arg is not None: |
| if pipeline_device_arg == -1: |
| global_model.to('cpu') |
| elif device.type == 'cuda': |
| global_model.to(f'cuda:{pipeline_device_arg}') |
| elif device.type == 'mps': |
| global_model.to('mps:0') |
|
|
| task = "text-generation" |
|
|
| global_pipe = pipeline( |
| task=task, |
| model=global_model, |
| tokenizer=global_tokenizer, |
| device=pipeline_device_arg, |
| device_map=device_map |
| ) |
|
|
| pipe_device_str = "N/A" |
| if getattr(global_pipe, "device_map", None): |
| pipe_device_str = f"device_map: {global_pipe.device_map}" |
| elif global_pipe.device: |
| pipe_device_str = str(global_pipe.device) |
| logging.info(f"Text generation pipeline created/updated. Effective device(s): {pipe_device_str}") |
|
|
| if device_map is None and global_pipe.device != device: |
| logging.warning(f"Pipeline created on {global_pipe.device}, but target device was {device}. This might happen with device_map issues or insufficient VRAM.") |
|
|
| msg = f"Text generation pipeline updated successfully in {time.time()-T:.2f}s."; logging.info(msg) |
| return msg |
| except Exception as e: |
| msg=f"Pipeline update failed: {e}\n{traceback.format_exc()}"; logging.error(msg); |
| global_pipe = None; |
| return f"[Error] Pipeline update failed: {e}" |
| else: |
| msg = "Cannot update pipeline: Global model or tokenizer not loaded."; logging.warning(msg); |
| global_pipe = None; |
| return msg |
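| # Device conventions as used above follow the transformers pipeline API: device=-1 targets |
| # CPU, device=0 the first CUDA/MPS device, and device_map="auto" defers placement to |
| # accelerate when multiple GPUs are available. |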
|
|
|
|
| def get_detailed_status_and_filter_states(): |
| global global_model, config |
| t_start = time.time() |
|
|
| if global_model is None: |
| logging.warning("Cannot get status: Model not loaded.") |
| return json.dumps({"Error": "Model not loaded."}, indent=2), *get_error_filter_updates() |
|
|
| if not hasattr(global_model, 'config') or global_model.config is None: |
| logging.warning("Model config missing. Initializing default flags for status check.") |
| temp_config = initialize_config_flags(None) |
| status_json = json.dumps({"Warning": "Model config missing, status reflects defaults.", **json.loads(decode_model_details(global_model))}, indent=2) |
| config_to_check = temp_config |
| else: |
| config = global_model.config |
| config = initialize_config_flags(config) |
| global_model.config = config |
| status_json = decode_model_details(global_model) |
| config_to_check = config |
|
|
| logging.info("Refreshing detailed model status and filter checkbox states...") |
|
|
| filter_states = {} |
| for name in filter_names_ui: |
| attr_name = filter_attr_map.get(name) |
| if attr_name: |
| filter_states[name] = getattr(config_to_check, attr_name, False) |
| else: |
| logging.error(f"Filter name '{name}' not found in attribute map. Setting state to False.") |
| filter_states[name] = False |
|
|
| updates = [gr.update(value=filter_states.get(name, False)) for name in filter_names_ui] |
|
|
| logging.info(f"Refreshed status and filter states in {time.time()-t_start:.2f}s."); |
| return status_json, *updates |
|
|
| def get_error_filter_updates(): |
| return [gr.update(value=False) for _ in filter_names_ui] |
|
|
| def base_toggle_function(func_enable, func_disable, enable, success_msg_enable, success_msg_disable, *args): |
| global global_model, config |
| t_start = time.time() |
|
|
| if not global_model: |
| return "[Error] Model not loaded. Load a model first." |
|
|
| if not hasattr(global_model, 'config') or global_model.config is None: |
| logging.warning("Model config missing. Initializing default flags before toggle.") |
| global_model.config = initialize_config_flags(None) |
| config = initialize_config_flags(global_model.config) |
| global_model.config = config |
|
|
| msg = "" |
| func_to_call = func_enable if enable else func_disable |
| action_name = "Enable" if enable else "Disable" |
| func_name = getattr(func_enable, '__name__', 'unknown_enable').replace('_', ' ').title() if enable else \ |
| getattr(func_disable, '__name__', 'unknown_disable').replace('_', ' ').title() |
|
|
| logging.info(f"Executing toggle: {action_name} {func_name}...") |
|
|
| try: |
| sig = inspect.signature(func_to_call) |
| pass_args = [] |
| if 'model' in sig.parameters or 'base_model' in sig.parameters or 'module' in sig.parameters: |
| pass_args.append(global_model) |
| if 'config' in sig.parameters: |
| pass_args.append(config) |
| pass_args.extend(args) |
|
|
| result = func_to_call(*pass_args) |
|
|
| if isinstance(result, str) and "[Error]" not in result: |
| msg = result |
| elif isinstance(result, str): |
| msg = result |
| else: |
| msg = success_msg_enable if enable else success_msg_disable |
|
|
| logging.info(f"Toggle Action ({func_name} -> {action_name}) Result: {msg} (Took {time.time()-t_start:.2f}s)") |
| if "[Error]" not in msg: |
| update_pipeline() |
|
|
| except Exception as e: |
| msg = f"[Error] during toggle ({action_name} {func_name}): {e}" |
| logging.error(f"{msg}\n{traceback.format_exc()}") |
|
|
| clean_memory() |
| return msg |
|
|
| def specific_action_function(action_func, *args, success_msg="Action completed successfully."): |
| global global_model, global_tokenizer, config |
| t_start=time.time() |
|
|
| if not global_model: |
| return "[Error] Model not loaded. Load a model first." |
|
|
| if not hasattr(global_model, 'config') or global_model.config is None: |
| logging.warning("Model config missing. Initializing default flags before action.") |
| global_model.config = initialize_config_flags(None) |
| config = initialize_config_flags(global_model.config) |
| global_model.config = config |
|
|
| msg = "" |
| func_name = getattr(action_func, '__name__', 'unknown_action') |
|
|
| logging.info(f"Executing action: {func_name}...") |
|
|
| try: |
| sig = inspect.signature(action_func) |
| pass_args = [] |
| if 'model' in sig.parameters or 'base_model' in sig.parameters or 'module' in sig.parameters: |
| pass_args.append(global_model) |
| if 'config' in sig.parameters: |
| pass_args.append(config) |
| if 'tokenizer' in sig.parameters: |
| if global_tokenizer: |
| pass_args.append(global_tokenizer) |
| else: |
| return f"[Error] Action '{func_name}' requires tokenizer, but it's not loaded." |
| pass_args.extend(args) |
|
|
| result = action_func(*pass_args) |
|
|
| if isinstance(result, str):
| msg = result
| else:
| msg = success_msg
|
|
| logging.info(f"Action ({func_name}) Result: {msg} (Took {time.time()-t_start:.2f}s)") |
| if "[Error]" not in msg: |
| update_pipeline() |
|
|
| except Exception as e: |
| msg = f"[Error] during action ({func_name}): {e}" |
| logging.error(f"{msg}\n{traceback.format_exc()}") |
|
|
| clean_memory() |
| return msg |
|
|
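| # Thin lambda adapters: each binds a specific modification helper (defined
| # elsewhere in this script) to the generic runners above, so every Gradio
| # callback stays a one-liner. The *_wrapper names are what the UI event
| # handlers invoke.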
| toggle_bias_removal_wrapper = lambda enable: base_toggle_function(_replace_linear_without_bias, _enable_bias_in_linear, enable, "Bias removal applied.", "Bias addition applied (reverted removal).") |
| toggle_embeddings_untie_wrapper = lambda enable: base_toggle_function(_untie_embeddings, _retie_embeddings, enable, "Embeddings untied.", "Embeddings re-tied.") |
| toggle_layer_reduction_wrapper = lambda enable, layers: specific_action_function(_reduce_layers_to_one if enable else _enable_full_layers, layers if enable else None, success_msg=f"Layer reduction {'applied' if enable else 'reverted'}.") |
| apply_norm_swap_wrapper = lambda norm_type: specific_action_function(_swap_normalization_layer, norm_type, success_msg=f"Normalization swapped to {norm_type}") |
| apply_activation_change_wrapper = lambda name: specific_action_function(_swap_activation_function, name, success_msg=f"Activation Function Swapped to {name}") |
| revert_activation_change_wrapper = lambda: specific_action_function(_revert_activation_function, success_msg="Activation Function Reverted to Default") |
| toggle_bitnet_wrapper = lambda enable: base_toggle_function(convert_to_bitnet, revert_bitnet, enable, "BitNet conversion applied.", "BitNet conversion reverted.") |
| apply_multimodal_wrapper = lambda modalities: specific_action_function(_setup_multimodal, modalities, success_msg="Multi-modal setup attempted.") |
| revert_multimodal_wrapper = lambda: specific_action_function(_revert_multimodal, success_msg="Multi-modal setup reverted.") |
|
|
| toggle_token_speed_optimization_wrapper = lambda enable: specific_action_function(_optimize_token_generation_speed if enable else _revert_token_generation_speed_optimization, success_msg="Token Speed Opt Flags Updated") |
| toggle_coherence_improvement_wrapper = lambda enable: specific_action_function(_enable_coherence_improvement if enable else _disable_coherence_improvement, success_msg="Coherence Flag Updated") |
| toggle_layer_norm_bypass_wrapper = lambda enable: specific_action_function(_enable_layer_norm_bypass if enable else _disable_layer_norm_bypass, success_msg="LN Bypass Updated") |
| toggle_dropout_bypass_wrapper = lambda enable: specific_action_function(_enable_dropout_bypass if enable else _disable_dropout_bypass, success_msg="Dropout Bypass Updated") |
| toggle_fp32_precision_wrapper = lambda enable: specific_action_function(_recover_perfect_precision if enable else _revert_perfect_precision, success_msg="FP32 Precision Updated") |
| toggle_embedding_normalization_wrapper = lambda enable: specific_action_function(_normalize_embeddings if enable else _revert_embedding_normalization, success_msg="Embedding Normalization Updated") |
| toggle_gradient_checkpointing_wrapper = lambda enable: specific_action_function(_enable_gradient_checkpointing if enable else _disable_gradient_checkpointing, success_msg="Grad Checkpointing Updated") |
| toggle_flash_attention_wrapper = lambda enable: specific_action_function(_set_attention_variant_config, "flash_attention_2" if enable else "auto", success_msg=f"Flash Attention 2 {'Enabled' if enable else 'Disabled'} (via attn_implementation)") |
| apply_quantization_wrapper = lambda mode: specific_action_function(_quantize_model, mode, success_msg=f"Quantization Attempted: {mode}") |
| revert_quantization_wrapper = lambda: specific_action_function(_revert_quantization, success_msg="Quantization Reverted to FP32") |
|
|
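| # UI inputs arrive as strings; these small parsers coerce them and fall back to
| # safe defaults instead of raising. For example, _parse_pruning_amount("0.35")
| # returns 0.35, while _parse_pruning_amount("oops") logs a warning and returns
| # PRUNING_AMOUNT.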
| def _parse_pruning_amount(amount_str): |
| try: |
| amount = float(amount_str) |
| if not (0 < amount < 1): |
| raise ValueError("Pruning amount must be between 0 and 1") |
| return amount |
| except (ValueError, TypeError): |
| logging.warning(f"Invalid pruning amount '{amount_str}', using default {PRUNING_AMOUNT}") |
| return PRUNING_AMOUNT |
|
|
| apply_pruning_wrapper = lambda amount_str: specific_action_function( |
| _prune_weights_magnitude, |
| _parse_pruning_amount(amount_str), |
| success_msg=f"Pruning Applied (Amount: {_parse_pruning_amount(amount_str):.2f})" |
| ) |
| revert_pruning_wrapper = lambda: specific_action_function(_revert_pruning, success_msg="Pruning Flag Reverted") |
|
|
| set_lora_path_wrapper = lambda path: specific_action_function(_set_lora_adapter_path, path, success_msg="LoRA Path Set in Config") |
| add_peft_adapter_wrapper = lambda: specific_action_function( |
| _add_peft_adapter, |
| LoraConfig(**DEFAULT_PEFT_CONFIG_DICT) if _peft_installed else None, |
| success_msg="PEFT Adapter Added" |
| ) |
| merge_peft_adapter_wrapper = lambda: specific_action_function(_apply_lora_merge, success_msg="PEFT Adapter Merged") |
| remove_peft_adapter_wrapper = lambda: specific_action_function(_remove_peft_adapter, success_msg="PEFT Adapter Removed") |
|
|
| apply_layer_freeze_wrapper = lambda layers_str: specific_action_function(_freeze_layers, layers_str, success_msg="Layer Freezing Updated") |
| revert_layer_freeze_wrapper = lambda: specific_action_function(_unfreeze_all_layers, success_msg="All Layers Unfrozen") |
| toggle_limits_wrapper = lambda enable: specific_action_function(_configure_limits if enable else _remove_limits_configuration, success_msg="Limits Config Updated") |
| toggle_qa_restrictions_wrapper = lambda enable: specific_action_function(_remove_qa_restrictions if enable else _enable_qa_restrictions, success_msg="QA Restrictions Flag Updated") |
|
|
| def _parse_int_arg(arg, default, min_val=1): |
| try: |
| val = int(arg) |
| return max(val, min_val) |
| except (ValueError, TypeError): |
| return default |
|
|
| toggle_kd_wrapper = lambda enable, num_labels=2: specific_action_function(
| _setup_knowledge_distillation if enable else _revert_knowledge_distillation,
| *([_parse_int_arg(num_labels, 2, 1)] if enable else []),
| success_msg="KD Setup Updated"
| )
| toggle_reward_modeling_wrapper = lambda enable, num_outputs=1: specific_action_function(
| _setup_reward_modeling if enable else _revert_reward_modeling,
| *([_parse_int_arg(num_outputs, 1, 1)] if enable else []),
| success_msg="Reward Modeling Setup Updated"
| )
| toggle_swa_wrapper = lambda enable: specific_action_function(_apply_swa if enable else _revert_swa, success_msg="SWA Flag Updated") |
|
|
| def _parse_prob_arg(arg, default, min_val=0.0, max_val=1.0): |
| try: |
| val = float(arg) |
| return min(max(val, min_val), max_val) |
| except(ValueError, TypeError): |
| return default |
|
|
| toggle_layerdrop_wrapper = lambda enable, prob=0.1: specific_action_function(
| _enable_layerdrop if enable else _disable_layerdrop,
| *([_parse_prob_arg(prob, 0.1, 0.0, 1.0)] if enable else []),
| success_msg="LayerDrop Flag Updated"
| )
| toggle_rope_scaling_wrapper = lambda enable, scaling_type="linear", factor=2.0: specific_action_function(
| _set_rope_scaling_config if enable else _revert_rope_scaling,
| *([str(scaling_type), _parse_prob_arg(factor, 2.0, 1.0, 100.0)] if enable else []),
| success_msg="RoPE Scaling Config Updated"
| )
| toggle_sliding_window_wrapper = lambda enable, size=4096: specific_action_function(
| _set_sliding_window_config if enable else _revert_sliding_window,
| *([_parse_int_arg(size, 4096, 0)] if enable else []),
| success_msg="Sliding Window Config Updated"
| )
| apply_attention_variant_wrapper = lambda variant="auto": specific_action_function(_set_attention_variant_config, str(variant), success_msg="Attention Variant Config Updated") |
| revert_attention_variant_wrapper = lambda: specific_action_function(_revert_attention_variant, success_msg="Attention Variant Config Reverted") |
|
|
| toggle_gradient_clipping_flag_wrapper = lambda enable: specific_action_function(_enable_gradient_clipping if enable else _disable_gradient_clipping, success_msg="Grad Clipping Flag Updated") |
| toggle_weight_decay_flag_wrapper = lambda enable: specific_action_function(_enable_weight_decay if enable else _disable_weight_decay, success_msg="Weight Decay Flag Updated") |
| toggle_lr_scheduler_flag_wrapper = lambda enable: specific_action_function(_enable_lr_scheduler if enable else _disable_lr_scheduler, success_msg="LR Scheduler Flag Updated") |
| apply_optimizer_change_wrapper = lambda name: specific_action_function(_swap_optimizer, str(name), success_msg=f"Optimizer Pref Set: {name}") |
| revert_optimizer_change_wrapper = lambda: specific_action_function(_revert_optimizer, success_msg="Optimizer Pref Reverted") |
|
|
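| # Writes the gradient accumulation setting both onto the model config (so it is
| # persisted and reported) and into the module-level GRADIENT_ACCUMULATION_STEPS
| # used when TrainingArguments are built in start_training().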
| def _set_grad_accum_config(config, steps): |
| try: |
| s = int(steps) |
| if s < 1: raise ValueError("Steps must be >= 1") |
| config.gradient_accumulation_steps = s |
| global GRADIENT_ACCUMULATION_STEPS |
| GRADIENT_ACCUMULATION_STEPS = s |
| return f"Grad Accum Steps set to {s} in config." |
| except (ValueError, TypeError) as e: |
| logging.error(f"Invalid gradient accumulation steps: {steps}. Error: {e}") |
| return f"[Error] Invalid Grad Accum steps: {e}" |
|
|
| set_gradient_accumulation_wrapper = lambda steps: specific_action_function(_set_grad_accum_config, steps, success_msg="Grad Accum Steps update attempted.")
|
|
| toggle_all_safety_filters_wrapper = lambda enable: specific_action_function(_enable_all_safety_settings if enable else _disable_all_safety_settings, success_msg=f"All Safety Filters {'Enabled (Defaults)' if enable else 'Disabled'}") |
| force_disable_censorship_wrapper = lambda: specific_action_function(_disable_all_safety_settings, success_msg="Attempted Force Disable All Censorship Flags") |
|
|
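| # Receives one boolean per safety-filter checkbox (in filter_names_ui order),
| # maps each to its config attribute via filter_attr_map, and only touches flags
| # whose state actually changed before refreshing the pipeline.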
| def toggle_individual_safety_filter_wrapper(*checkbox_states):
| global global_model, config |
| t_start=time.time() |
| if not global_model: return "[Error] Model not loaded." |
|
|
| if not hasattr(global_model, 'config') or global_model.config is None: |
| logging.warning("Model config missing. Initializing default flags for filter toggle.") |
| global_model.config = initialize_config_flags(None) |
| config = initialize_config_flags(global_model.config) |
| global_model.config = config |
|
|
| results = [] |
| updated_count = 0 |
|
|
| if len(checkbox_states) != len(filter_names_ui):
| return f"[Error] Mismatch between filter UI elements ({len(filter_names_ui)}) and received states ({len(checkbox_states)})."
|
|
| ui_state = dict(zip(filter_names_ui, checkbox_states))
|
|
| for name, checkbox_state in ui_state.items(): |
| filter_attr = filter_attr_map.get(name) |
| if filter_attr: |
| current_state = getattr(config, filter_attr, False) |
| new_state = bool(checkbox_state) |
| if current_state != new_state: |
| setattr(config, filter_attr, new_state) |
| results.append(f"{name}: {'ON' if new_state else 'OFF'}") |
| updated_count += 1 |
| else: |
| logging.warning(f"UI filter name '{name}' not found in attribute map filter_attr_map. Skipping.") |
|
|
| if updated_count > 0: |
| msg = f"Applied {updated_count} individual filter toggle(s): {', '.join(results)}" |
| update_pipeline() |
| else: |
| msg = "No individual filter states were changed." |
|
|
| logging.info(f"Individual filter toggle action took {time.time()-t_start:.2f}s. Status: {msg}"); |
| return msg |
|
|
|
|
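| # "Coherence improvement" here is simply a deterministic beam-search pass: the
| # sampling knobs (temperature/top_k/top_p) are stripped and num_beams is forced
| # to at least 4. If beam search fails, the original generation arguments are
| # retried as a fallback.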
| def _improve_coherence(model, tokenizer, generation_args): |
| logging.info("Applying coherence improvement using beam search...") |
| coherence_beams = generation_args.get("num_beams", 1) |
| if coherence_beams <= 1: coherence_beams = 4 |
|
|
| coherence_args = generation_args.copy() |
| coherence_args["num_beams"] = coherence_beams |
| coherence_args["do_sample"] = False |
| coherence_args["num_return_sequences"] = 1 |
| coherence_args["early_stopping"] = True |
| coherence_args.pop("temperature", None) |
| coherence_args.pop("top_k", None) |
| coherence_args.pop("top_p", None) |
|
|
| input_ids = coherence_args.get("input_ids") |
| if input_ids is None: |
| logging.error("Coherence improvement failed: input_ids missing.") |
| return "[Error: input_ids missing in generation_args]" |
|
|
| try: |
| with torch.no_grad(): |
| outputs = model.generate(**coherence_args) |
| response_ids = outputs[0][input_ids.shape[-1]:] |
| response_text = tokenizer.decode(response_ids, skip_special_tokens=True) |
| logging.info("Coherence improvement (beam search) successful.") |
| return response_text |
| except Exception as e: |
| logging.error(f"Error during coherence improvement (beam search): {e}. Falling back to original generation settings.") |
| fallback_args = generation_args.copy() |
| if fallback_args.get("num_beams", 1) > 1 and fallback_args.get("do_sample", False): |
| fallback_args["num_beams"] = 1 |
| fallback_args["early_stopping"] = False |
|
|
| try: |
| with torch.no_grad(): |
| outputs = model.generate(**fallback_args) |
| response_ids = outputs[0][input_ids.shape[-1]:] |
| response_text = tokenizer.decode(response_ids, skip_special_tokens=True) |
| logging.info("Fallback generation successful after coherence error.") |
| return response_text |
| except Exception as fallback_e: |
| logging.error(f"Fallback generation also failed: {fallback_e}") |
| return f"[Error during generation (fallback): {fallback_e}]" |
|
|
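| # Post-generation string filters. These are lightweight keyword/regex checks on
| # the decoded text only (no classifier models); each enabled config flag maps to
| # one lambda in filter_actions, and matched content is replaced with a bracketed
| # placeholder such as '[censored_pii]'.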
| def _apply_output_filters(text, config): |
| if not getattr(config, "response_filters", True) or getattr(config, "no_response_filters", False): |
| return text |
|
|
| logging.debug("Applying output filters based on config...") |
| original_text = text |
| if not isinstance(text, str): text = str(text) |
|
|
| filter_actions = { |
| "profanity_filter": lambda t: re.sub(r'\b(damn|fuck|shit|cunt|bitch|asshole|bastard|hell)\b', '[profanity]', t, flags=re.IGNORECASE), |
| "pii_filter": lambda t: re.sub(r'\b(\d{3}[-.\s]?\d{3}[-.\s]?\d{4})\b|\b([A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,})\b', '[censored_pii]', t), |
| "hate_filter": lambda t: "[censored_hate_speech]" if any(word in t.lower() for word in ["kill all", "attack the", "inferior race", "exterminate"]) else t, |
| "sexually_explicit_filter": lambda t: "[censored_explicit]" if any(word in t.lower() for word in [" rape ", " naked ", " porn ", " xxx ", " blowjob ", " sex toy"]) else t, |
| "dangerous_content_filter": lambda t: "[censored_dangerous]" if any(word in t.lower() for word in ["make a bomb", "build weapon", "illegal drug recipe", "how to kill"]) else t, |
| "medical_advice_filter": lambda t: "[discouraged_medical_advice]" if any(word in t.lower() for word in ["diagnose", "prescribe", "cure for", "medical treatment for", "symptoms suggest"]) else t, |
| "legal_advice_filter": lambda t: "[discouraged_legal_advice]" if any(word in t.lower() for word in ["legal advice", "sue", "represent me", "is this legal", "contract law"]) else t, |
| "financial_advice_filter": lambda t: "[discouraged_financial_advice]" if any(word in t.lower() for word in ["guaranteed investment", "stock tip", "financial plan", "buy bitcoin", "investment advice"]) else t, |
| "stereotype_filter": lambda t: "[censored_stereotype]" if re.search(r'\b(all|every)\s+([A-Za-z]+(\s+)?){1,3}\s+(are|always)\b', t.lower()) else t, |
| "misinfo_filter": lambda t: "[potential_misinfo]" if any(phrase in t.lower() for phrase in ["5g causes covid", "earth is flat", "vaccines cause autism", "chemtrails"]) else t, |
| "self_harm_filter": lambda t: "[censored_self_harm]" if any(phrase in t.lower() for phrase in ["commit suicide", "hurt myself", "painless death", "kill myself"]) else t, |
| } |
|
|
| active_filters_count = 0 |
| for filter_ui_name, filter_attr in filter_attr_map.items(): |
| if getattr(config, filter_attr, False): |
| filter_func = filter_actions.get(filter_attr) |
| if filter_func: |
| try: |
| filtered_text = filter_func(text) |
| if filtered_text != text: |
| active_filters_count +=1 |
| logging.debug(f"Filter '{filter_attr}' potentially applied modification.") |
| text = filtered_text |
| except Exception as e: |
| logging.warning(f"Error applying filter '{filter_attr}': {e}") |
|
|
| if not getattr(config, "no_advert_warning", False): |
| if re.search(r'\b(advertisement|sponsored|promo code|discount code|special offer)\b', text, re.IGNORECASE): |
| if "[Note: This response may contain promotional content.]" not in text: |
| text += "\n[Note: This response may contain promotional content.]" |
| active_filters_count +=1 |
|
|
| if active_filters_count > 0: |
| logging.debug(f"Output filtering potentially applied {active_filters_count} modifications.") |
|
|
| return text |
|
|
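| # End-to-end inference path for the UI: validate and clamp the sampling
| # parameters, build generate() kwargs (falling back to greedy decoding when the
| # settings or config flags demand it), run either plain generation or the
| # beam-search coherence pass, then apply the output filters above.
| # Hypothetical UI call, mirroring the Gradio input order:
| #   run_inference("Explain KV caching.", 128, 0.7, 50, 0.95, 1.1)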
| def run_inference(prompt, max_new_tokens, temperature, top_k, top_p, repetition_penalty): |
| global global_pipe, global_model, global_tokenizer, config |
| if not all([global_model, global_tokenizer]): |
| return "[Error] Model or Tokenizer not loaded. Please load a model first." |
| if global_pipe is None: |
| pipe_msg = update_pipeline() |
| if global_pipe is None: |
| return f"[Error] Text generation pipeline could not be initialized. Load/Reload model. Status: {pipe_msg}" |
|
|
| if not hasattr(global_model, 'config'): |
| logging.warning("Model config missing during inference. Initializing default flags.") |
| global_model.config = initialize_config_flags(None) |
| config = initialize_config_flags(global_model.config) |
| global_model.config = config |
|
|
| logging.info("Starting inference run..."); t_start_inf = time.time() |
| try: |
| use_filters = getattr(config, "response_filters", True) and not getattr(config, "no_response_filters", False) |
| apply_coherence = getattr(config, "coherence_improvement_enabled", False) |
|
|
| try: max_new_tokens = int(max_new_tokens); assert max_new_tokens > 0
| except (ValueError, TypeError, AssertionError): max_new_tokens = 256; logging.warning("Invalid max_new_tokens, using 256.")
| try: temperature = float(temperature); assert temperature >= 0.0
| except (ValueError, TypeError, AssertionError): temperature = 0.7; logging.warning("Invalid temperature, using 0.7.")
| try: top_k = int(top_k); assert top_k >= 0
| except (ValueError, TypeError, AssertionError): top_k = 50; logging.warning("Invalid top_k, using 50.")
| try: top_p = float(top_p); assert 0.0 <= top_p <= 1.0
| except (ValueError, TypeError, AssertionError): top_p = 0.95; logging.warning("Invalid top_p, using 0.95.")
| try: repetition_penalty = float(repetition_penalty); assert repetition_penalty >= 1.0
| except (ValueError, TypeError, AssertionError): repetition_penalty = 1.1; logging.warning("Invalid repetition_penalty, using 1.1.")
|
|
| # top_p == 1.0 only disables nucleus filtering; it should not force greedy decoding.
| is_greedy = (temperature < 1e-6) or \
| (top_k == 1) or \
| (top_p <= 0.0) or \
| getattr(config, "token_gen_speed_maximized", False)
|
|
|
|
| gen_kwargs = { |
| "max_new_tokens": max_new_tokens, |
| "temperature": temperature if not is_greedy else None, |
| "top_k": top_k if top_k > 0 and not is_greedy else None, |
| "top_p": top_p if top_p > 0.0 and top_p < 1.0 and not is_greedy else None, |
| "repetition_penalty": repetition_penalty if repetition_penalty > 1.0 else None, |
| "do_sample": not is_greedy, |
| "use_cache": getattr(config, "use_cache", True), |
| "num_beams": (max(getattr(config, "num_beams", 1), 4) if apply_coherence else getattr(config, "num_beams", 1)), |
| "pad_token_id": global_tokenizer.pad_token_id if global_tokenizer.pad_token_id is not None else getattr(config, 'pad_token_id', None), |
| "eos_token_id": global_tokenizer.eos_token_id if global_tokenizer.eos_token_id is not None else getattr(config, 'eos_token_id', None), |
| "early_stopping": True if (apply_coherence or getattr(config, "num_beams", 1) > 1) else False |
| } |
| gen_kwargs = {k: v for k, v in gen_kwargs.items() if v is not None} |
|
|
| if gen_kwargs.get("num_beams", 1) > 1 and gen_kwargs.get("pad_token_id") is None: |
| if gen_kwargs.get("eos_token_id") is not None: |
| gen_kwargs["pad_token_id"] = gen_kwargs["eos_token_id"] |
| logging.warning(f"Using eos_token_id ({gen_kwargs['eos_token_id']}) as pad_token_id for beam search.") |
| else: |
| logging.error("Beam search requires pad_token_id, but none found (and eos_token_id missing). Generation might fail.") |
| return "[Error] Beam search failed: pad_token_id is required." |
|
|
| response_text = "" |
| device = get_device() |
| logging.debug(f"Generation arguments: {gen_kwargs}") |
|
|
| inputs = global_tokenizer(prompt, return_tensors="pt", padding=False, truncation=True, max_length=CONTEXT_LENGTH).to(device) |
| gen_kwargs["input_ids"] = inputs["input_ids"] |
| if "attention_mask" in inputs: |
| gen_kwargs["attention_mask"] = inputs["attention_mask"] |
|
|
| global_model.eval() |
|
|
| if apply_coherence: |
| response_text = _improve_coherence(global_model, global_tokenizer, gen_kwargs) |
| else: |
| with torch.no_grad(): |
| outputs = global_model.generate(**gen_kwargs) |
| output_sequence = outputs[0] |
| response_ids = output_sequence[inputs.input_ids.shape[-1]:] |
| response_text = global_tokenizer.decode(response_ids, skip_special_tokens=True) |
|
|
| if use_filters: |
| filtered_response = _apply_output_filters(response_text, config) |
| if filtered_response != response_text: |
| logging.info("Output filters applied modifications.") |
| response_text = filtered_response |
|
|
| final_response = response_text.strip() |
| logging.info(f"Inference finished in {time.time()-t_start_inf:.2f}s. Response length: {len(final_response)}") |
| return final_response |
|
|
| except Exception as e: |
| logging.error(f"Error during inference: {e}\n{traceback.format_exc()}") |
| return f"[Error during inference: {e}]" |
| finally:
| # Restore training mode only if the model was actually training before this call.
| if global_model is not None and was_training:
| global_model.train()
|
|
|
|
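| # Full training entry point driven by the Gradio form. Rough stages:
| #   1) resource check and parameter validation,
| #   2) optional W&B / Hugging Face Hub login and repo creation,
| #   3) load base model + tokenizer, optionally wrap with a LoRA adapter (PEFT),
| #   4) load/tokenize/group the datasets and split off an eval set,
| #   5) Trainer run with checkpoint resume, then merge any adapter,
| #   6) post-training layer/activation modifications, save, evaluate, upload,
| #      and finally swap the result into the global model used for inference.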
| def start_training( |
| base_model_id: str, new_model_name: str, hf_token: str, |
| datasets_input_str: str, |
| activation_fn_name: str, target_layers_int: int, |
| grad_accum_ui: int, lr: float, epochs: int, max_steps: int, batch_size: int, |
| optimizer_name: str, scheduler_type: str, weight_decay: float, warmup_ratio: float, |
| use_peft: bool, peft_r: int, peft_alpha: int, peft_dropout: float, peft_target_modules_str: str, |
| wandb_token: str, use_cpu_flag: bool, bypass_limits_state: bool |
| ): |
| global global_model, global_tokenizer, global_pipe, original_num_layers_global, config, target_layers |
| global USE_CPU, BATCH_SIZE, LEARNING_RATE, EPOCHS, MAX_STEPS, DEFAULT_OPTIMIZER, DEFAULT_SCHEDULER, GRADIENT_ACCUMULATION_STEPS |
| global BYPASS_RESOURCE_LIMITS |
| BYPASS_RESOURCE_LIMITS = bypass_limits_state |
|
|
| start_overall_time = time.time() |
| logging.info("="*50) |
| logging.info("🚀 STARTING TRAINING PROCESS 🚀") |
|
|
| resources_ok, res_msg = check_resources() |
| if not resources_ok: |
| error_msg = f"[Error] Resource limits exceeded, cannot start training. {res_msg}" |
| logging.error(error_msg) |
| return error_msg |
| else: |
| logging.info(res_msg) |
|
|
| errors = [] |
| if not base_model_id: errors.append("Base Model ID/Path is required.") |
| if not new_model_name: errors.append("New Model Name (for saving/Hub) is required.") |
| if not datasets_input_str: errors.append("At least one dataset must be provided.") |
| try: target_layers_int = int(target_layers_int); assert target_layers_int >= 1
| except (ValueError, TypeError, AssertionError): errors.append("Target Layers must be a positive integer.")
| try: grad_accum_ui = int(grad_accum_ui); assert grad_accum_ui >= 1
| except (ValueError, TypeError, AssertionError): errors.append("Gradient Accumulation Steps must be a positive integer.")
| try: lr = float(lr); assert lr > 0
| except (ValueError, TypeError, AssertionError): errors.append("Learning Rate must be a positive float.")
| try: epochs = int(epochs); assert epochs >= 0
| except (ValueError, TypeError, AssertionError): errors.append("Epochs must be an integer >= 0.")
| try: max_steps = int(max_steps); assert max_steps >= 0
| except (ValueError, TypeError, AssertionError): errors.append("Max Steps must be an integer >= 0.")
|
|
| if epochs <= 0 and max_steps <= 0: |
| errors.append("Training requires at least one of Epochs or Max Steps to be positive.") |
| elif epochs > 0 and max_steps > 0: |
| logging.info(f"Both Epochs ({epochs}) and Max Steps ({max_steps}) are set (> 0). Max Steps will take precedence.") |
| epochs = -1 |
| elif epochs <= 0 and max_steps > 0: |
| epochs = -1 |
| elif epochs > 0 and max_steps <= 0: |
| logging.info(f"Using Epochs ({epochs}) for training termination as Max Steps <= 0.") |
| max_steps = -1 |
| else: |
| logging.error("Logic error in epoch/max_step handling. Defaulting Max Steps to 1.") |
| max_steps = 1 |
| epochs = -1 |
|
|
|
|
| try: batch_size = int(batch_size); assert batch_size >= 1
| except (ValueError, TypeError, AssertionError): errors.append("Batch Size must be a positive integer.")
| if optimizer_name not in OPTIMIZERS: errors.append(f"Invalid Optimizer. Choose from: {list(OPTIMIZERS.keys())}") |
| if scheduler_type not in SCHEDULER_TYPES: errors.append(f"Invalid Scheduler. Choose from: {SCHEDULER_TYPES}") |
| try: weight_decay = float(weight_decay); assert weight_decay >= 0.0
| except (ValueError, TypeError, AssertionError): errors.append("Weight Decay must be a non-negative float.")
| try: warmup_ratio = float(warmup_ratio); assert 0.0 <= warmup_ratio <= 1.0
| except (ValueError, TypeError, AssertionError): errors.append("Warmup Ratio must be between 0.0 and 1.0.")
| if activation_fn_name not in ACTIVATION_FUNCTIONS: errors.append(f"Invalid Activation Function. Choose from: {list(ACTIVATION_FUNCTIONS.keys())}") |
| if use_peft and not _peft_installed: errors.append("PEFT requested, but library not installed (`pip install peft`).") |
| peft_config_dict = {} |
| if use_peft: |
| try: |
| peft_r = int(peft_r); assert peft_r >= 1 |
| peft_alpha = int(peft_alpha); assert peft_alpha >= 1 |
| peft_dropout = float(peft_dropout); assert 0.0 <= peft_dropout <= 1.0 |
| peft_config_dict = { |
| "task_type": TaskType.CAUSAL_LM, |
| "inference_mode": False, |
| "r": peft_r, |
| "lora_alpha": peft_alpha, |
| "lora_dropout": peft_dropout, |
| } |
| if peft_target_modules_str: |
| modules = [m.strip() for m in peft_target_modules_str.split(',') if m.strip()] |
| if modules: |
| peft_config_dict["target_modules"] = modules |
| except Exception as peft_e: |
| errors.append(f"Invalid PEFT configuration: {peft_e}") |
|
|
|
|
| if errors: |
| error_msg = "[Error] Invalid training parameters:\n- " + "\n- ".join(errors) |
| logging.error(error_msg) |
| return error_msg |
|
|
| logging.info(f"Base Model: {base_model_id}, New Name: {new_model_name}") |
| logging.info(f"Use PEFT: {use_peft}") |
| if use_peft: logging.info(f"PEFT Config: r={peft_r}, alpha={peft_alpha}, dropout={peft_dropout}, targets={peft_target_modules_str or 'Auto'}") |
| logging.info(f"Datasets: \n{datasets_input_str}") |
| logging.info(f"LR: {lr}, Effective Epochs: {epochs if epochs > 0 else 'N/A'}, MaxSteps: {max_steps if max_steps > 0 else 'N/A'}, BS: {batch_size}, GradAccum: {grad_accum_ui}") |
| logging.info(f"Optim: {optimizer_name}, Scheduler: {scheduler_type}, WD: {weight_decay}, Warmup: {warmup_ratio}") |
| logging.info(f"Post-Mod Target Layers: {target_layers_int}, Post-Mod ActFn: {activation_fn_name}") |
| logging.info(f"Use CPU: {use_cpu_flag}, W&B: {'Enabled' if wandb_token else 'Disabled'}, Bypass Limits: {BYPASS_RESOURCE_LIMITS}") |
| logging.info("="*50) |
|
|
| USE_CPU = use_cpu_flag |
| BATCH_SIZE = batch_size |
| LEARNING_RATE = lr |
| EPOCHS = epochs if epochs > 0 else 1 |
| MAX_STEPS = max_steps |
| DEFAULT_OPTIMIZER = optimizer_name |
| DEFAULT_SCHEDULER = scheduler_type |
| GRADIENT_ACCUMULATION_STEPS = grad_accum_ui |
| target_layers = target_layers_int |
|
|
| logging.info("Setting up environment...") |
| clean_memory() |
| device = get_device() |
| logging.info(f"Using device: {device}") |
| num_cpu_cores_os = os.cpu_count() or 1 |
| global NUM_CPU_CORES |
| if NUM_CPU_CORES <= 0: NUM_CPU_CORES = num_cpu_cores_os |
| else: NUM_CPU_CORES = min(NUM_CPU_CORES, num_cpu_cores_os) |
| logging.info(f"Using {NUM_CPU_CORES} CPU cores for dataloading.") |
|
|
| wandb_run = None |
| use_wandb_reporting = False |
| if wandb_token: |
| logging.info("Attempting WandB login...") |
| try: |
| wandb.login(key=wandb_token) |
| logging.info("WandB login successful.") |
| use_wandb_reporting = True |
| except Exception as e: |
| logging.warning(f"WandB login failed: {e}. Proceeding without WandB logging.") |
| report_to = ["wandb"] if use_wandb_reporting else [] |
|
|
| user_id = "local_user"; repo_id_str = new_model_name; repo_link = "N/A (Upload skipped or failed)" |
| upload_to_hub = False |
| if hf_token: |
| logging.info("Attempting Hugging Face login...") |
| user_id = get_user_id(hf_token) |
| if user_id not in ["unknown_user", "http_error_user", "auth_error_user"]: |
| try: |
| login(token=hf_token, add_to_git_credential=False) |
| repo_id_str = f"{user_id}/{new_model_name}" |
| logging.info(f"Hugging Face login successful. User: {user_id}, Target Repo: {repo_id_str}") |
| create_repo(repo_id=repo_id_str, repo_type="model", exist_ok=True, token=hf_token) |
| logging.info(f"Hub repository '{repo_id_str}' ensured.") |
| repo_link = f"https://huggingface.co/{repo_id_str}" |
| upload_to_hub = True |
| except Exception as e: |
| logging.warning(f"Hugging Face login or repo creation failed: {e}. Upload will be skipped.") |
| hf_token = None |
| repo_id_str = new_model_name |
| repo_link = "N/A (Login/Repo Failed)" |
| else: |
| logging.warning(f"Could not get valid Hugging Face user ID ({user_id}). Upload will be skipped.") |
| hf_token = None |
| repo_id_str = new_model_name |
| repo_link = "N/A (Login Failed)" |
| else: |
| logging.info("No HF write token provided, Hub upload will be skipped.") |
|
|
| logging.info(f"Loading base model '{base_model_id}' and tokenizer...") |
| try: |
| tokenizer = AutoTokenizer.from_pretrained(base_model_id, trust_remote_code=True, token=hf_token) |
| if tokenizer.pad_token is None: |
| if tokenizer.eos_token is not None: |
| tokenizer.pad_token = tokenizer.eos_token |
| logging.info(f"Set tokenizer pad_token to eos_token ('{tokenizer.eos_token}')") |
| else: |
| added_pad = tokenizer.add_special_tokens({'pad_token': '[PAD]'}) |
| if added_pad > 0: |
| logging.warning("Tokenizer missing pad_token and eos_token. Added '[PAD]' as pad_token.") |
| else: |
| logging.error("Tokenizer missing pad/eos and failed to add '[PAD]'. Training may fail.") |
|
|
| base_config_obj = AutoConfig.from_pretrained(base_model_id, trust_remote_code=True, token=hf_token) |
| base_config_obj = initialize_config_flags(base_config_obj) |
|
|
| original_num_layers_global = getattr(base_config_obj, 'num_hidden_layers', LAYERS) |
| if getattr(base_config_obj, 'original_num_layers', None) is None: |
| base_config_obj.original_num_layers = original_num_layers_global |
|
|
| if getattr(base_config_obj, 'vocab_size', -1) != len(tokenizer): |
| logging.warning(f"Config vocab size ({getattr(base_config_obj, 'vocab_size', 'N/A')}) differs from tokenizer ({len(tokenizer)}). Updating config.") |
| base_config_obj.vocab_size = len(tokenizer) |
| if getattr(base_config_obj, 'pad_token_id', -999) != tokenizer.pad_token_id: |
| base_config_obj.pad_token_id = tokenizer.pad_token_id |
|
|
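| # Pick the cheapest safe dtype: bfloat16 on CUDA hardware that supports it,
| # float16 on other CUDA devices, and float32 on CPU. The same dtype later
| # drives the fp16/bf16 flags in TrainingArguments.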
| load_dtype = torch.bfloat16 if device.type == 'cuda' and torch.cuda.is_bf16_supported() else torch.float16 if device.type == 'cuda' else torch.float32 |
| attn_impl_load = getattr(base_config_obj, 'attn_implementation', 'auto') |
| if attn_impl_load == "flash_attention_2": base_config_obj.use_flash_attention_2 = True |
| elif getattr(base_config_obj,'use_flash_attention_2', False): attn_impl_load = "flash_attention_2"; base_config_obj.attn_implementation = "flash_attention_2" |
|
|
| logging.info(f"Loading model with dtype={load_dtype}, attn_implementation='{attn_impl_load}'...") |
| model = AutoModelForCausalLM.from_pretrained( |
| base_model_id, |
| config=base_config_obj, |
| trust_remote_code=True, |
| token=hf_token, |
| torch_dtype=load_dtype, |
| low_cpu_mem_usage=True if device.type != 'cpu' else False, |
| attn_implementation=attn_impl_load if attn_impl_load != 'auto' else None |
| ) |
|
|
| if model.get_input_embeddings().weight.shape[0] != len(tokenizer): |
| logging.info(f"Resizing model token embeddings from {model.get_input_embeddings().weight.shape[0]} to tokenizer size {len(tokenizer)}") |
| model.resize_token_embeddings(len(tokenizer)) |
| if getattr(model.config, 'vocab_size', -1) != len(tokenizer): |
| model.config.vocab_size = len(tokenizer) |
|
|
| logging.info(f"Base model '{base_model_id}' loaded. Original Layers: {original_num_layers_global}, Current Layers: {model.config.num_hidden_layers}, Dtype: {model.dtype}") |
| if device.type == 'cpu':
| model.to(device)
| logging.info(f"Model moved to device: {device}")
| else:
| logging.info("Model loaded with low_cpu_mem_usage; device placement is handled by the Trainer when training starts.")
|
|
| config = model.config |
|
|
| except Exception as e: |
| logging.error(f"Failed to load base model or tokenizer '{base_model_id}': {e} \n{traceback.format_exc()}") |
| return f"[Error] Load failed for '{base_model_id}': {e}" |
|
|
| if use_peft: |
| logging.info("Applying PEFT adapter to the model for training...") |
| try: |
| lora_config = LoraConfig(**peft_config_dict) |
| peft_add_msg = _add_peft_adapter(model, config, peft_config_obj=lora_config) |
| except Exception as peft_e: |
| logging.error(f"Failed to configure or add PEFT adapter: {peft_e}") |
| return f"[Error] Failed to prepare PEFT model: {peft_e}" |
|
|
| if "[Error]" in peft_add_msg or "[Warning]" in peft_add_msg: |
| logging.error(f"Failed adding PEFT adapter: {peft_add_msg}") |
| return f"[Error] Failed adding PEFT adapter: {peft_add_msg}" |
|
|
| model = global_model |
| config = global_model.get_base_model().config |
| logging.info("PEFT adapter added successfully.") |
| else: |
| logging.info("Proceeding with full fine-tuning (PEFT not selected).") |
|
|
| logging.info("Loading and processing datasets...") |
| train_ds_processed = None |
| eval_ds_processed = None |
| try: |
| datasets_config_list = parse_datasets(datasets_input_str) |
| interleaved_ds = load_datasets_from_config(datasets_config_list) |
| if interleaved_ds is None: |
| raise ValueError("Dataset loading and interleaving resulted in None. Check logs.") |
|
|
| tokenize_partial = partial(tokenize_function, tokenizer=tokenizer, context_length=CONTEXT_LENGTH) |
| tokenized_ds = interleaved_ds.map( |
| tokenize_partial, |
| batched=True, |
| batch_size=1000, |
| ) |
|
|
| group_partial = partial(group_texts, block_size=CONTEXT_LENGTH) |
| lm_dataset = tokenized_ds.map( |
| group_partial, |
| batched=True, |
| batch_size=1000, |
| ) |
|
|
| try: |
| peek_final = next(iter(lm_dataset)) |
| final_cols = list(peek_final.keys()) |
| logging.info(f"Sample processed record structure: { {k: type(v).__name__ for k, v in peek_final.items()} }") |
| if not all(k in final_cols for k in ['input_ids', 'attention_mask', 'labels']): |
| raise ValueError(f"Final dataset structure after tokenizing/grouping is missing required keys. Found: {final_cols}") |
| except StopIteration: |
| raise ValueError("Dataset appears empty after processing and grouping.") |
|
|
| logging.info("Dataset tokenization and grouping complete.") |
|
|
| train_ds_processed, eval_ds_processed = split_dataset(lm_dataset) |
|
|
| if isinstance(train_ds_processed, IterableDataset): |
| logging.info("Training dataset is iterable (streaming).") |
| elif isinstance(train_ds_processed, Dataset): |
| logging.info(f"Training dataset size: {len(train_ds_processed):,} examples.") |
| else: |
| logging.warning("Could not determine training dataset type or size.") |
|
|
| if eval_ds_processed is not None: |
| logging.info(f"Created static evaluation dataset with {len(eval_ds_processed)} examples.") |
| else: |
| logging.info("No evaluation dataset created (buffer empty or error occurred).") |
|
|
| except Exception as e: |
| logging.error(f"Dataset loading, processing, or splitting failed: {e} \n{traceback.format_exc()}") |
| return f"[Error] Dataset preparation failed: {e}" |
|
|
| logging.info("Setting up Training Arguments...") |
| final_weight_decay = weight_decay if not getattr(config, 'weight_decay_disabled', False) else 0.0 |
| final_lr_scheduler = scheduler_type if not getattr(config, 'lr_scheduler_disabled', False) else "constant" |
| max_grad_norm_val = 1.0 if not getattr(config, 'gradient_clipping_disabled', False) else None |
|
|
| output_dir = f"./{new_model_name}_training_output" |
|
|
| training_args = TrainingArguments( |
| output_dir=output_dir, |
| overwrite_output_dir=True, |
| report_to=report_to, |
| per_device_train_batch_size=BATCH_SIZE, |
| per_device_eval_batch_size=max(1, BATCH_SIZE * 2), |
| gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS, |
| num_train_epochs=EPOCHS if EPOCHS > 0 else 1, |
| max_steps=MAX_STEPS, |
| optim=optimizer_name, |
| learning_rate=LEARNING_RATE, |
| weight_decay=final_weight_decay, |
| warmup_ratio=warmup_ratio, |
| lr_scheduler_type=final_lr_scheduler, |
| max_grad_norm=max_grad_norm_val if max_grad_norm_val is not None else 1e9, |
| fp16=load_dtype == torch.float16 and device.type == 'cuda', |
| bf16=load_dtype == torch.bfloat16 and device.type == 'cuda', |
| gradient_checkpointing=getattr(config, 'gradient_checkpointing_enabled', False), |
| gradient_checkpointing_kwargs={'use_reentrant': False} if getattr(config, 'gradient_checkpointing_enabled', False) else None, |
| dataloader_num_workers=NUM_CPU_CORES, |
| dataloader_pin_memory=True if device.type == 'cuda' else False, |
| evaluation_strategy="steps" if eval_ds_processed is not None else "no", |
| eval_steps=EVAL_STEPS if eval_ds_processed is not None else None, |
| save_strategy="steps", |
| save_steps=SAVE_STEPS, |
| save_total_limit=2, |
| load_best_model_at_end=LOAD_BEST_MODEL_AT_END if eval_ds_processed is not None else False, |
| metric_for_best_model=METRIC_FOR_BEST_MODEL if eval_ds_processed is not None else None, |
| logging_strategy="steps", |
| logging_steps=LOGGING_STEPS, |
| push_to_hub=upload_to_hub, |
| hub_model_id=repo_id_str if upload_to_hub else None, |
| hub_token=hf_token if upload_to_hub else None, |
| hub_strategy="checkpoint", |
| use_cpu=USE_CPU, |
| seed=42, |
| remove_unused_columns=False, |
| log_level="info", |
| ) |
|
|
| logging.info("Initializing Trainer...") |
| data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False) |
| callbacks = [] |
| if LOAD_BEST_MODEL_AT_END and eval_ds_processed is not None: |
| callbacks.append(EarlyStoppingCallback(early_stopping_patience=EARLY_STOPPING_PATIENCE, early_stopping_threshold=0.001)) |
|
|
| if use_wandb_reporting: |
| try: |
| wandb_run = wandb.init( |
| project=f"llm-modify-train-{new_model_name.replace('/', '-')}", |
| config=training_args.to_dict(), |
| name=f"run-{new_model_name.replace('/', '-')}-{int(time.time())}", |
| reinit=True |
| ) |
| logging.info(f"WandB run initialized: {wandb_run.name if wandb_run else 'Failed'}") |
| except Exception as wandb_e: |
| logging.error(f"Failed to initialize WandB run: {wandb_e}") |
| wandb_run = None |
| training_args.report_to = [] |
|
|
| trainer = Trainer( |
| model=model, |
| args=training_args, |
| tokenizer=tokenizer, |
| train_dataset=train_ds_processed, |
| eval_dataset=eval_ds_processed, |
| data_collator=data_collator, |
| callbacks=callbacks |
| ) |
|
|
| start_train_time = time.time() |
| logging.info(f"🚀 Starting model training (Using {type(trainer.model).__name__}). Effective steps: {training_args.max_steps if training_args.max_steps > 0 else 'N/A'}. Effective epochs: {training_args.num_train_epochs if training_args.num_train_epochs > 0 else 'N/A'}.") |
| train_result = None |
| training_successful = False |
| try: |
| last_checkpoint = None |
| if os.path.isdir(training_args.output_dir): |
| from transformers.trainer_utils import get_last_checkpoint |
| last_checkpoint = get_last_checkpoint(training_args.output_dir) |
| if last_checkpoint: |
| logging.info(f"*** Resuming training from checkpoint: {last_checkpoint} ***") |
|
|
| train_result = trainer.train(resume_from_checkpoint=last_checkpoint) |
| logging.info("✅ Training finished successfully.") |
| training_successful = True |
|
|
| trainer.save_model() |
| trainer.save_state() |
| if not use_peft or isinstance(trainer.model, PeftModel):
| tokenizer.save_pretrained(training_args.output_dir)
|
|
| except Exception as e: |
| logging.error(f"❌ Training failed: {e}\n{traceback.format_exc()}") |
| if wandb_run: wandb_run.finish(exit_code=1) |
| return f"[Error] Training failed: {e}" |
| finally: |
| end_train_time = time.time() |
| clean_memory() |
| training_time = end_train_time - start_train_time |
| logging.info(f"🕒 Training phase took {training_time:.2f} seconds.") |
|
|
| if not training_successful: |
| return "[Error] Training did not complete successfully." |
|
|
| final_trained_model = trainer.model |
| model_to_save = final_trained_model |
|
|
| merged_model_for_mods = None |
| if use_peft and isinstance(final_trained_model, PeftModel): |
| logging.info("Merging PEFT adapter into the base model for modification and final save...") |
| try: |
| merged_model_for_mods = final_trained_model.merge_and_unload() |
| logging.info("PEFT adapter merged successfully.") |
| merged_model_for_mods.config.peft_adapter_added = False |
| merged_model_for_mods.config.peft_config = None |
| merged_model_for_mods.config.lora_merged = True |
| except Exception as e: |
| logging.error(f"Failed to merge PEFT adapter after training: {e}. Saving adapter separately.") |
| adapter_save_path = os.path.join(training_args.output_dir, "final_adapter") |
| try: |
| final_trained_model.save_pretrained(adapter_save_path) |
| base_model_for_saving = final_trained_model.get_base_model() |
| base_model_for_saving.save_pretrained(training_args.output_dir) |
| tokenizer.save_pretrained(training_args.output_dir) |
| logging.info(f"PEFT adapter saved separately to {adapter_save_path}, base model to {training_args.output_dir}") |
| merged_model_for_mods = final_trained_model |
| except Exception as save_e: |
| logging.error(f"Failed to save adapter or base model separately: {save_e}. Proceeding with potentially unmerged PEFT model.") |
| merged_model_for_mods = final_trained_model |
| else: |
| merged_model_for_mods = final_trained_model |
|
|
| if merged_model_for_mods is None: |
| logging.error("Model state after training/merging is None. Cannot proceed.") |
| return "[Error] Lost model reference after training/merging." |
|
|
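| # Post-training surgery requested from the UI: shrink or restore the number of
| # transformer layers (bounded by the original layer count recorded at load time)
| # and swap the activation function, reusing the same helpers exposed as toggles.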
| def modify_model_post_train(model_obj, act_fn_name, target_layer_count): |
| logging.info(f"Applying post-training modifications: Target Layers={target_layer_count}, Activation={act_fn_name}") |
| if not hasattr(model_obj, 'config'): |
| logging.error("Cannot modify model: Missing config.") |
| return model_obj |
|
|
| current_config = initialize_config_flags(model_obj.config) |
| model_obj.config = current_config |
|
|
| current_layers = getattr(current_config, 'num_hidden_layers', None) |
| original_layers = getattr(current_config, 'original_num_layers', original_num_layers_global) |
|
|
| if current_layers is not None and original_layers is not None: |
| if target_layer_count != current_layers: |
| logging.info(f"Adjusting layers post-training: {current_layers} -> {target_layer_count} (Original: {original_layers})") |
| if target_layer_count < current_layers: |
| _reduce_layers_to_one(model_obj, current_config, target_layers=target_layer_count) |
| else: |
| restore_target = min(target_layer_count, original_layers) if original_layers else target_layer_count |
| if restore_target > current_layers: |
| logging.info(f"Attempting to restore layers: {current_layers} -> {restore_target}") |
| _enable_full_layers(model_obj, current_config, original_num_layers=restore_target) |
| else: |
| logging.info(f"Target layers ({target_layer_count}) >= current layers ({current_layers}). No layer increase needed or possible beyond original.") |
| else: |
| logging.info(f"Target layers ({target_layer_count}) matches current layers after training. No layer adjustment needed.") |
| else: |
| logging.warning("Could not determine current or original layer count from config post-training. Skipping layer adjustment.") |
|
|
| current_act_fn = getattr(current_config, 'current_activation_function', DEFAULT_ACTIVATION_FUNCTION) |
| if act_fn_name != current_act_fn: |
| logging.info(f"Setting activation function post-training to: {act_fn_name}") |
| _swap_activation_function(model_obj, current_config, act_fn_name) |
| else: |
| logging.info(f"Target activation function ({act_fn_name}) already matches current. No change needed.") |
|
|
| logging.info("Post-training modifications applied.") |
| return model_obj |
|
|
| logging.info("Applying final post-training modifications specified in UI...") |
| final_model_modified = modify_model_post_train(merged_model_for_mods, activation_fn_name, target_layers_int) |
| if merged_model_for_mods is not final_model_modified: |
| del merged_model_for_mods |
| clean_memory() |
|
|
| final_model_path = training_args.output_dir |
| logging.info(f"Saving final modified model state to {final_model_path}...") |
| try: |
| save_kwargs = {"safe_serialization": True} |
| final_model_modified.config = initialize_config_flags(final_model_modified.config) |
|
|
| if use_peft and isinstance(final_model_modified, PeftModel): |
| logging.warning("Saving unmerged PEFT model state again after modifications (adapter separate).") |
| adapter_save_dir = os.path.join(final_model_path, "final_adapter_modified") |
| final_model_modified.save_pretrained(adapter_save_dir) |
| logging.info(f"PEFT adapter saved to {adapter_save_dir}") |
| try: |
| base_model_final = final_model_modified.get_base_model() |
| base_model_final.save_pretrained(final_model_path, **save_kwargs) |
| tokenizer.save_pretrained(final_model_path) |
| logging.info(f"Base model saved to {final_model_path}") |
| except Exception as base_save_e: |
| logging.error(f"Failed to save base model separately after modification: {base_save_e}. Only adapter might be saved.") |
|
|
| else: |
| final_model_modified.save_pretrained(final_model_path, **save_kwargs) |
| tokenizer.save_pretrained(final_model_path) |
|
|
| logging.info("Final modified model saved locally.") |
| except Exception as e: |
| logging.error(f"Failed to save final modified model locally: {e}\n{traceback.format_exc()}") |
| if wandb_run: wandb_run.finish(exit_code=1) |
| global_model = final_model_modified.to(device) |
| global_tokenizer = tokenizer |
| config = final_model_modified.config |
| update_pipeline() |
| clean_memory() |
| return f"[Error] Failed to save final model locally: {e}. Training logs/checkpoints might be in {output_dir}." |
|
|
| final_eval_results = {}; final_eval_loss = None; final_perplexity = float('inf') |
| if eval_ds_processed is not None: |
| logging.info("Evaluating final modified model..."); T_final_eval = time.time() |
| try: |
| final_trainer = Trainer( |
| model=final_model_modified, |
| args=training_args, |
| tokenizer=tokenizer, |
| data_collator=data_collator, |
| eval_dataset=eval_ds_processed, |
| ) |
| final_eval_results = final_trainer.evaluate() |
| final_eval_loss = final_eval_results.get("eval_loss") |
| final_perplexity = compute_perplexity(final_eval_loss) |
| logging.info(f"✅ Final Model Evaluation Results: {final_eval_results}") |
| logging.info(f"Final Model Perplexity: {final_perplexity:.4f} (Eval time: {time.time() - T_final_eval:.2f}s)") |
| if use_wandb_reporting and wandb_run: |
| wandb_run.log({"final_eval_loss": final_eval_loss if final_eval_loss is not None else -1.0, |
| "final_perplexity": final_perplexity if final_perplexity != float('inf') else -1.0, |
| **final_eval_results}) |
| except Exception as e: |
| logging.error(f"Final evaluation failed: {e}\n{traceback.format_exc()}") |
| if use_wandb_reporting and wandb_run: |
| wandb_run.log({"final_eval_status": "Failed", "final_eval_error": str(e)}) |
| else: |
| logging.info("Skipping final evaluation as no evaluation dataset was available.") |
|
|
| upload_successful_final = False |
| if upload_to_hub: |
| logging.info(f"Attempting final upload of '{final_model_path}' to Hugging Face Hub: {repo_id_str}...") |
| try: |
| api = HfApi() |
| api.upload_folder( |
| folder_path=final_model_path, |
| repo_id=repo_id_str, |
| repo_type="model", |
| token=hf_token, |
| commit_message=f"Upload final trained model: {new_model_name} (Base: {base_model_id}, PPL: {final_perplexity:.2f})", |
| commit_description=(f"Training completed. Eval Loss: {final_eval_loss:.4f if final_eval_loss is not None else 'N/A'}, Perplexity: {final_perplexity:.4f if final_perplexity != float('inf') else 'N/A'}. " |
| f"Config: PEFT={use_peft}, Layers={target_layers_int}, ActFn={activation_fn_name}. Training time: {training_time:.2f}s.") |
| ) |
| repo_link = f"https://huggingface.co/{repo_id_str}" |
| logging.info(f"✅ Final model upload complete: {repo_link}") |
| if use_wandb_reporting and wandb_run: wandb_run.log({"hf_repo_link": repo_link, "hf_upload_status": "Success"}) |
| upload_successful_final = True |
| except Exception as e: |
| logging.error(f"Final Hugging Face upload failed: {e}\n{traceback.format_exc()}") |
| repo_link = "[Upload Failed]" |
| if use_wandb_reporting and wandb_run: wandb_run.log({"hf_upload_status": "Failed", "hf_upload_error": str(e)}) |
| else: |
| logging.info(f"Skipping final Hugging Face Hub upload based on initial setup.") |
|
|
| logging.info("Updating global state with the final model...") |
| global_model = final_model_modified.to(device) |
| global_tokenizer = tokenizer |
| config = global_model.config |
| update_pipeline() |
| clean_memory() |
|
|
| final_status_report_json = decode_model_details(global_model) |
| total_script_time = time.time() - start_overall_time |
|
|
| final_message = ( |
| f"✅ Training & Modification Process Complete!\n" |
| f"{'='*40}\n" |
| f"New Model Name: {new_model_name}\n" |
| f"Base Model: {base_model_id}\n" |
| f"Total Time: {total_script_time:.2f}s | Training Phase Time: {training_time:.2f}s\n" |
| f"{'='*40}\n" |
| f"Training Results:\n" |
| ) |
| if train_result: |
| final_message += f" - Steps Completed: {train_result.global_step}\n" |
| train_loss = train_result.training_loss |
| final_message += f" - Training Loss: {train_loss:.4f if train_loss is not None else 'N/A'}\n" |
| train_metrics = train_result.metrics |
| for metric, value in train_metrics.items(): |
| if "loss" in metric.lower() or "perplexity" in metric.lower() or "epoch" in metric.lower() or "step" in metric.lower(): |
| value_str = f"{value:.4f}" if isinstance(value, float) else str(value) |
| final_message += f" - {metric.replace('_', ' ').title()}: {value_str}\n" |
|
|
| final_message += ( |
| f"Final Evaluation:\n" |
| f" - Eval Loss: {final_eval_loss:.4f if final_eval_loss is not None else 'N/A'}\n" |
| f" - Perplexity: {final_perplexity:.4f if final_perplexity != float('inf') else 'N/A'}\n" |
| f"{'='*40}\n" |
| f"Saving & Upload:\n" |
| f" - Local Path: {final_model_path}\n" |
| f" - Hub Repo: {repo_link}\n" |
| f"{'='*40}\n" |
| f"Final Model Status Summary:\n" |
| ) |
| try: |
| status_data = json.loads(final_status_report_json) |
| summary_keys = ["Model Class", "Config Class", "Device(s)", "Params Summary", "Layer Types Count", "Key Config Attributes", "Modification Flags"] |
| for key in summary_keys: |
| if key in status_data: |
| value = status_data[key] |
| if isinstance(value, dict): |
| value_str = json.dumps(value, indent=4) |
| elif isinstance(value, list): |
| value_str = ", ".join(map(str, value)) |
| else: |
| value_str = str(value) |
| if len(value_str) > 200: value_str = value_str[:200] + "..." |
| final_message += f" - {key}: {value_str}\n" |
| final_message += f"(Full status logged and available in 'Model Controls' tab after refresh)\n" |
| except Exception as json_e: |
| logging.warning(f"Could not parse final status JSON for summary: {json_e}") |
| final_message += "(Could not generate status summary from JSON)\n" |
| final_message += f"{'='*40}" |
|
|
| if use_wandb_reporting and wandb_run: |
| try: |
| wandb_final_log = { |
| "total_time_seconds": total_script_time, |
| "training_time_seconds": training_time, |
| "final_eval_loss": final_eval_loss if final_eval_loss is not None else -1.0, |
| "final_perplexity": final_perplexity if final_perplexity != float('inf') else -1.0, |
| "upload_successful": upload_successful_final, |
| "final_steps_completed": train_result.global_step if train_result else -1, |
| "final_train_loss": train_result.training_loss if train_result and train_result.training_loss else -1.0, |
| } |
| wandb_run.log(wandb_final_log) |
| wandb_run.finish() |
| logging.info("WandB run finished.") |
| except Exception as e: |
| logging.warning(f"Error finishing WandB run: {e}") |
|
|
| logging.info("🏁 Full training and modification process finished. 🏁") |
| return final_message |
|
|
|
|
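| # Loads an arbitrary checkpoint into the global model/tokenizer/pipeline for the
| # Model Controls tab: ensures a pad token, records the original layer count,
| # aligns vocab sizes, honors any saved attention-implementation preference, and
| # returns a status message, a detailed-status JSON, and the filter checkbox states.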
| def load_model_for_control(model_id_or_path, hf_token=None, bypass_limits_state=False): |
| global global_model, global_tokenizer, global_pipe, config, original_num_layers_global, BYPASS_RESOURCE_LIMITS |
| BYPASS_RESOURCE_LIMITS = bypass_limits_state |
| logging.info(f"Attempting to load model for control: {model_id_or_path}") |
| if not model_id_or_path: |
| return "[Error] Model ID or Path cannot be empty.", "{}", *get_error_filter_updates() |
|
|
| resources_ok, res_msg = check_resources() |
| if not resources_ok: |
| error_msg = f"[Error] Resource limits exceeded, cannot load model. {res_msg}" |
| logging.error(error_msg) |
| return error_msg, "{}", *get_error_filter_updates() |
| else: |
| logging.info(res_msg) |
|
|
| t_load_start = time.time() |
| device = get_device() |
| error_return = f"[Error] Failed to load model '{model_id_or_path}'.", "{}", *get_error_filter_updates() |
|
|
| global_model, global_tokenizer, global_pipe, config = None, None, None, None |
| clean_memory() |
|
|
| try: |
| logging.info("Loading tokenizer...") |
| tokenizer_load = AutoTokenizer.from_pretrained( |
| model_id_or_path, |
| trust_remote_code=True, |
| token=hf_token |
| ) |
| if tokenizer_load.pad_token is None: |
| if tokenizer_load.eos_token is not None: |
| tokenizer_load.pad_token = tokenizer_load.eos_token |
| logging.info(f"Set tokenizer pad_token to eos_token ('{tokenizer_load.eos_token}')") |
| else: |
| try: |
| tokenizer_load.add_special_tokens({'pad_token': '[PAD]'}) |
| logging.warning("Added '[PAD]' as pad_token.") |
| except Exception as pad_e: |
| logging.error(f"Could not set PAD token: {pad_e}. Batching or beam search might fail.") |
|
|
|
|
| logging.info("Loading model config...") |
| loaded_config = AutoConfig.from_pretrained( |
| model_id_or_path, |
| trust_remote_code=True, |
| token=hf_token |
| ) |
| config_load = initialize_config_flags(loaded_config) |
|
|
| original_layers_load = getattr(config_load, 'num_hidden_layers', LAYERS) |
| if getattr(config_load, 'original_num_layers', None) is None: |
| config_load.original_num_layers = original_layers_load |
| logging.info(f"Set original_num_layers in loaded config to {original_layers_load}") |
| original_num_layers_global = config_load.original_num_layers |
|
|
|
|
| if getattr(config_load, 'vocab_size', -1) != len(tokenizer_load): |
| config_load.vocab_size = len(tokenizer_load) |
| if getattr(config_load, 'pad_token_id', -999) != tokenizer_load.pad_token_id: |
| config_load.pad_token_id = tokenizer_load.pad_token_id |
|
|
| logging.info("Loading model weights...") |
| attn_impl_load = getattr(config_load, 'attn_implementation', 'auto') |
| if attn_impl_load == "flash_attention_2": config_load.use_flash_attention_2 = True |
| elif getattr(config_load,'use_flash_attention_2', False): attn_impl_load = "flash_attention_2"; config_load.attn_implementation = "flash_attention_2" |
|
|
| load_dtype = torch.bfloat16 if device.type == 'cuda' and torch.cuda.is_bf16_supported() else torch.float16 if device.type == 'cuda' else torch.float32 |
| logging.info(f"Using dtype {load_dtype} and attn_implementation '{attn_impl_load}' for loading.") |
|
|
| model_load = AutoModelForCausalLM.from_pretrained( |
| model_id_or_path, |
| config=config_load, |
| trust_remote_code=True, |
| token=hf_token, |
| torch_dtype=load_dtype, |
| low_cpu_mem_usage=True if device.type != 'cpu' else False, |
| attn_implementation=attn_impl_load if attn_impl_load != 'auto' else None, |
| ) |
|
|
| if model_load.get_input_embeddings().weight.shape[0] != len(tokenizer_load): |
| logging.info(f"Resizing loaded model embeddings from {model_load.get_input_embeddings().weight.shape[0]} to tokenizer size {len(tokenizer_load)}") |
| model_load.resize_token_embeddings(len(tokenizer_load)) |
| if getattr(model_load.config, 'vocab_size', -1) != len(tokenizer_load): |
| model_load.config.vocab_size = len(tokenizer_load) |
|
|
| global_model = model_load.to(device) |
| global_tokenizer = tokenizer_load |
| config = global_model.config |
|
|
| logging.info(f"Model loaded successfully to {device}.") |
|
|
| update_pipeline() |
| clean_memory() |
| logging.info(f"Model '{model_id_or_path}' loaded and pipeline updated in {time.time() - t_load_start:.2f}s.") |
| status_json, *filter_updates = get_detailed_status_and_filter_states() |
| return f"Model '{model_id_or_path}' loaded successfully.", status_json, *filter_updates |
|
|
| except Exception as e: |
| logging.error(f"Failed to load model '{model_id_or_path}': {e}\n{traceback.format_exc()}") |
| global_model, global_tokenizer, global_pipe, config = None, None, None, None |
| clean_memory() |
| return error_return |
|
|
|
|
| def save_current_model(save_path, hf_token=None, hub_repo_id=None): |
| global global_model, global_tokenizer, config |
| if not global_model or not global_tokenizer: |
| return "[Error] No model loaded to save." |
| if not save_path and not hub_repo_id: |
| return "[Error] Please provide a local save path or a Hub Repo ID (or both)." |
|
|
| t_save_start = time.time() |
| model_to_save = global_model |
| tokenizer_to_save = global_tokenizer |
| base_config = config if config else getattr(model_to_save, 'config', None)
| if base_config is None:
| logging.error("Cannot save: Model config is missing.")
| return "[Error] Model config is missing, cannot save."
| config_to_save = initialize_config_flags(base_config)
| model_to_save.config = config_to_save
|
|
| is_peft_model = _peft_installed and isinstance(model_to_save, PeftModel) |
| save_adapter_only = is_peft_model |
| logging.info(f"Save mode: {'Adapter Only (PEFT model detected)' if save_adapter_only else 'Full Model'}") |
|
|
| temp_save_dir = None |
| effective_save_path = save_path.strip() if save_path else None |
| if not effective_save_path and hub_repo_id: |
| temp_save_dir = f"./hub_upload_temp_{hub_repo_id.replace('/', '_')}_{int(time.time())}" |
| effective_save_path = temp_save_dir |
| logging.info(f"No local path provided, saving temporarily to '{effective_save_path}' for Hub upload.") |
| elif not effective_save_path: |
| return "[Error] Cannot determine save location (missing local path and Hub ID)." |
|
|
| try: |
| os.makedirs(effective_save_path, exist_ok=True) |
| except OSError as e: |
| logging.error(f"Failed to create save directory '{effective_save_path}': {e}") |
| return f"[Error] Failed to create save directory: {e}" |
|
|
| local_save_message = "" |
| try: |
| logging.info(f"Saving current model state to {effective_save_path}...") |
| save_kwargs = {"safe_serialization": True} |
|
|
| if save_adapter_only: |
| logging.info("Saving PEFT adapter weights and tokenizer.") |
| model_to_save.save_pretrained(effective_save_path) |
| tokenizer_to_save.save_pretrained(effective_save_path) |
| try: |
| base_model_config = model_to_save.get_base_model().config |
| base_model_config.save_pretrained(effective_save_path) |
| except Exception as config_e: |
| logging.warning(f"Could not save base model config alongside adapter: {config_e}") |
| else: |
| logging.info("Saving full model weights and tokenizer.") |
| model_to_save.save_pretrained(effective_save_path, **save_kwargs) |
| tokenizer_to_save.save_pretrained(effective_save_path) |
|
|
| save_local_time = time.time() - t_save_start |
| logging.info(f"Model state saved locally to {effective_save_path} in {save_local_time:.2f}s") |
| local_save_message = f"Model saved locally to '{effective_save_path}'." |
|
|
| except Exception as e: |
| logging.error(f"Failed to save model locally to {effective_save_path}: {e}\n{traceback.format_exc()}") |
| if temp_save_dir and os.path.exists(temp_save_dir): |
| try: shutil.rmtree(temp_save_dir); logging.info("Cleaned up temporary directory after local save error.") |
| except Exception as clean_e: logging.warning(f"Could not remove temp dir {temp_save_dir} after error: {clean_e}") |
| return f"[Error] Failed to save model locally: {e}" |
|
|
| hub_message = "" |
| upload_successful = False |
| if hub_repo_id: |
| if not hf_token: |
| hub_message = "[Warning] Hub upload skipped: Hugging Face Write Token required." |
| logging.warning(hub_message) |
| else: |
| logging.info(f"Attempting to upload '{effective_save_path}' to Hub repo: {hub_repo_id}") |
| try: |
| api = HfApi()
| create_repo(repo_id=hub_repo_id, repo_type="model", exist_ok=True, token=hf_token) |
| api.upload_folder( |
| folder_path=effective_save_path, |
| repo_id=hub_repo_id, |
| repo_type="model", |
| token=hf_token, |
| commit_message=f"Upload model state ({'Adapter' if save_adapter_only else 'Full'}) via LLM Platform", |
| commit_description=f"Saved from LLM Platform UI. Model class: {type(global_model).__name__}. State: {'PEFT Adapter' if save_adapter_only else 'Full Model'}.", |
| ) |
| hub_link = f"https://huggingface.co/{hub_repo_id}" |
| hub_message = f"Successfully uploaded to Hub: {hub_link}" |
| upload_successful = True |
| logging.info(hub_message) |
| except Exception as e: |
| hub_message = f"[Error] Hub upload failed: {e}" |
| logging.error(f"Hub upload failed: {e}\n{traceback.format_exc()}") |
|
|
| if temp_save_dir and os.path.exists(temp_save_dir): |
| try: |
| shutil.rmtree(temp_save_dir) |
| logging.info("Cleaned up temporary save directory.") |
| except Exception as e: |
| logging.warning(f"Could not remove temporary directory {temp_save_dir}: {e}") |
|
|
| final_message = local_save_message if (save_path and save_path.strip()) else ""
| if hub_message: |
| if final_message: final_message += f" | {hub_message}" |
| else: final_message = hub_message |
| if not final_message: |
| final_message = "[Info] No local save path provided and Hub upload failed or was skipped." |
|
|
| total_save_time = time.time() - t_save_start |
| logging.info(f"Total save operation took {total_save_time:.2f}s") |
| return final_message |
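| # Example usage (hypothetical paths and repo id), saving locally and pushing to the Hub in one call:
| #   save_current_model("./saved_models/my_model", hf_token="hf_...", hub_repo_id="username/my-model")
| # A PEFT-wrapped model saves only its adapter weights (plus tokenizer and, where possible, the base
| # config); a plain model saves the full weights with safetensors serialization.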
|
|
|
|
| filter_names_ui = [ |
| "Harassment", "Hate Speech", "Sexually Explicit", "Dangerous Content", |
| "Civic Integrity", "Harmful Code", "Medical Advice", "Legal Advice", |
| "Financial Advice", "PII (Basic)", "Political Content", "Religious Content", |
| "Profanity", "Stereotype", "Misinfo", "Self Harm", |
| "Personal Attack", "Toxicity", "Spam", "Off Topic", |
| "Tone", "Min Max Length", "Repetition Filter", "Factuality Filter" |
| ] |
| filter_attr_map = {name: name.lower().replace(" ", "_").replace("(", "").replace(")", "") + "_filter" for name in filter_names_ui} |
| filter_attr_map["PII (Basic)"] = "pii_filter" |
| filter_attr_map["Harmful Code"] = "code_filter" |
| filter_attr_map["Min Max Length"] = "min_max_length_filter" |
| filter_attr_map["Repetition Filter"] = "repetition_filter_enabled" |
| filter_attr_map["Factuality Filter"] = "factuality_filter_enabled" |
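| # The comprehension derives config attribute names from the UI labels (e.g. "Hate Speech" ->
| # "hate_speech_filter"); the explicit overrides above cover labels whose auto-derived name does
| # not match the attribute actually used in the config (e.g. "PII (Basic)" would otherwise
| # become "pii_basic_filter").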
|
|
| custom_theme = gr.themes.Soft(primary_hue=gr.themes.colors.blue, secondary_hue=gr.themes.colors.sky).set( |
| button_primary_background_fill="*primary_500", |
| button_primary_background_fill_hover="*primary_400", |
| button_secondary_background_fill="*secondary_500", |
| button_secondary_background_fill_hover="*secondary_400", |
| button_cancel_background_fill="*neutral_200", |
| button_cancel_background_fill_hover="*neutral_300", |
| ) |
|
|
| with gr.Blocks(theme=custom_theme, title="Advanced LLM Training & Modification Platform") as demo: |
| gr.Markdown("# 🤖 Advanced LLM Training & Modification Platform v1.2") |
| gr.Markdown("Load, modify, filter, train (Full/PEFT), test, merge, and save Large Language Models. Includes PEFT, experimental multi-modal capabilities, reward modeling setup, and resource checks.") |
|
|
| with gr.Accordion("🔑 Authentication & Settings", open=False): |
| with gr.Row(): |
| hf_token_read = gr.Textbox(label="🤗 HF Token (Read - Optional, for private models)", type="password", interactive=True, placeholder="hf_...") |
| hf_token_write = gr.Textbox(label="🤗 HF Token (Write - Optional, for Hub upload/training)", type="password", interactive=True, placeholder="hf_...") |
| train_wandb_token_inp = gr.Textbox(label="📊 WandB Token (Optional, for logging runs)", type="password", interactive=True) |
| with gr.Row(): |
| bypass_limits_chk = gr.Checkbox(label="Bypass RAM/Disk Limits (Use with Caution!)", value=False, interactive=True) |
|
|
|
|
| with gr.Tabs(): |
| with gr.TabItem("💾 Load, Save & Merge"): |
| with gr.Row(): |
| with gr.Column(scale=2): |
| gr.Markdown("### Load Model for Modification & Inference") |
| load_model_selector = HuggingfaceHubSearch(label="Search Hub or Enter Path/ID", placeholder="google/gemma-2b") |
| load_button = gr.Button("Load Model", variant="primary") |
| load_status_output = gr.Textbox(label="Load Status", interactive=False, lines=1) |
| with gr.Column(scale=2): |
| gr.Markdown("### Save Current Model State") |
| save_path_inp = gr.Textbox(label="Local Save Path (Optional)", placeholder="./saved_models/my_modified_model", interactive=True) |
| save_hub_repo_inp = gr.Textbox(label="Hub Repo ID (Optional, e.g., user/repo)", placeholder="username/my-cool-llm", interactive=True) |
| save_button = gr.Button("Save Model", variant="secondary") |
| save_status_output = gr.Textbox(label="Save Status", interactive=False, lines=1) |
|
|
| gr.Markdown("---") |
| gr.Markdown("### Merge Model Architectures (Parameter Averaging - Experimental)") |
| gr.Markdown("⚠️ **Experimental:** Averages parameters of models with compatible layers. Enter comma-separated Model IDs/Paths. The first model's config and tokenizer will be used as the base.") |
| merge_model_ids_inp = gr.Textbox(label="Model IDs/Paths to Merge (comma-separated)", placeholder="org/model-a, org/model-b, ./local-model-c") |
| merge_button = gr.Button("Merge Architectures", variant="primary") |
| merge_status_output = gr.Textbox(label="Merge Status", interactive=False, lines=2) |
|
|
| with gr.TabItem("🚀 Training"): |
| gr.Markdown("Fine-tune a selected base model. Supports full fine-tuning and PEFT (LoRA). Optional modifications are applied after training completes.")
| gr.Markdown("### 1. Base Model & Output Name") |
| with gr.Row(): |
| train_model_selector = HuggingfaceHubSearch(label="Search & Select Base Model for Training", placeholder="Type to search Hugging Face Hub...") |
| with gr.Row(): |
| train_new_model_inp = gr.Textbox(label="New Model Name (for saving locally and optionally on Hub)", placeholder="MyTunedModel-v1", interactive=True) |
|
|
| gr.Markdown("### 2. Training Data") |
| with gr.Row(): |
| train_dataset_selector = HuggingfaceHubSearch(label="Search Datasets on Hub (or specify local below)") |
| train_datasets_inp = gr.Textbox( |
| label="Datasets (one per line: 'id[,config[,split[,weight]]]')", |
| placeholder="Example:\nopenwebtext\nwikitext,wikitext-103-raw-v1,train,0.5\nmy_local_dataset_path,,train,1.5\nusername/my_dataset,my_config,validation,2.0", |
| lines=5, interactive=True) |
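| # Each dataset line follows 'id[,config[,split[,weight]]]'; empty fields fall back to defaults,
| # and the optional weight presumably controls how heavily that dataset is sampled when several
| # datasets are combined (see the dataset parsing in start_training).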
|
|
| gr.Markdown("### 3. Training Configuration") |
| with gr.Accordion("Training Mode & Hyperparameters", open=True): |
| train_use_peft_chk = gr.Checkbox(label="Enable PEFT (LoRA) Training", value=True, interactive=True) |
| with gr.Row(): |
| train_lr_inp = gr.Number(value=LEARNING_RATE, label="Learning Rate", interactive=True, minimum=1e-8, step=1e-6, precision=8) |
| train_epochs_inp = gr.Number(value=EPOCHS, label="Epochs (Set <= 0 if using Max Steps)", precision=0, minimum=-1, interactive=True) |
| train_max_steps_inp = gr.Number(value=MAX_STEPS, label="Max Steps (Set <= 0 if using Epochs)", precision=0, minimum=-1, interactive=True) |
| with gr.Row(): |
| train_batch_size_inp = gr.Number(value=BATCH_SIZE, label="Batch Size (Per Device)", precision=0, minimum=1, interactive=True) |
| train_grad_accum_inp = gr.Number(value=GRADIENT_ACCUMULATION_STEPS, label="Grad Accum Steps", precision=0, minimum=1, interactive=True) |
| train_optim_selector = gr.Dropdown(choices=list(OPTIMIZERS.keys()), value=DEFAULT_OPTIMIZER, label="Optimizer", interactive=True) |
| with gr.Row(): |
| train_scheduler_selector = gr.Dropdown(choices=SCHEDULER_TYPES, value=DEFAULT_SCHEDULER, label="LR Scheduler", interactive=True) |
| train_wd_inp = gr.Number(value=0.01, label="Weight Decay", minimum=0.0, interactive=True, step=0.001, precision=4) |
| train_warmup_ratio_inp = gr.Slider(0.0, 0.5, value=0.03, step=0.01, label="Warmup Ratio", interactive=True) |
|
|
| with gr.Accordion("PEFT Configuration (if PEFT enabled)", open=False, visible=True) as peft_config_accordion: |
| peft_r_inp = gr.Slider(label="LoRA r (Rank)", minimum=1, maximum=256, value=8, step=1, interactive=True) |
| peft_alpha_inp = gr.Slider(label="LoRA alpha", minimum=1, maximum=512, value=32, step=1, interactive=True) |
| peft_dropout_inp = gr.Slider(label="LoRA Dropout", minimum=0.0, maximum=0.5, value=0.1, step=0.01, interactive=True) |
| peft_target_modules_inp = gr.Textbox(label="Target Modules (comma-sep, optional, e.g., q_proj,v_proj)", placeholder="Leave empty for auto-detection (recommended)", interactive=True) |
|
|
| train_use_peft_chk.change(lambda x: gr.update(visible=x), inputs=train_use_peft_chk, outputs=peft_config_accordion) |
|
|
| with gr.Accordion("Post-Training Modifications (Applied After Training)", open=False): |
| with gr.Row(): |
| train_post_activation_fn_selector = gr.Dropdown(choices=list(ACTIVATION_FUNCTIONS.keys()), value=DEFAULT_ACTIVATION_FUNCTION, label="Target Activation Fn") |
| train_post_target_layers_inp = gr.Number(value=LAYERS, label="Target Layer Count", precision=0, minimum=1) |
|
|
| with gr.Accordion("Hardware & Logging", open=False): |
| train_use_cpu_chk = gr.Checkbox(value=USE_CPU, label="Force Use CPU (Very Slow!)", interactive=True) |
|
|
| gr.Markdown("### 4. Start Training") |
| train_button = gr.Button("✨ Start Training Process", variant="primary") |
| train_output = gr.Textbox(label="Training Log & Status", interactive=False, lines=20, max_lines=50) |
|
|
| with gr.TabItem("🔧 Model Controls"): |
| gr.Markdown("Interactively toggle modifications and filters for the **currently loaded** model. Refresh status after changes.") |
| with gr.Row(): |
| refresh_status_button = gr.Button("🔄 Refresh Status & Filter Checkboxes") |
| control_output = gr.Textbox(label="Control Action Status", interactive=False, lines=1) |
| status_output = gr.TextArea(label="Current Model Status (JSON)", interactive=False, lines=20, max_lines=60) |
|
|
| with gr.Tabs(): |
| with gr.TabItem("Core & Structure"): |
| with gr.Row(): |
| with gr.Column(min_width=150): bias_on = gr.Button("Bias Rem. ✅"); bias_off = gr.Button("Bias Rem. ❌") |
| with gr.Column(min_width=150): emb_on = gr.Button("Emb. Untie ✅"); emb_off = gr.Button("Emb. Untie ❌") |
| layer_target_inp = gr.Number(value=LAYERS, label="Target Layers", precision=0, minimum=1, interactive=True, scale=1) |
| layer_red_on = gr.Button("Apply Layer Red.", scale=1) |
| layer_red_off = gr.Button("Revert Layer Red.", scale=1) |
| with gr.Row(): |
| with gr.Column(min_width=150): norm_swap_rms = gr.Button("Use RMSNorm"); norm_swap_ln = gr.Button("Use LayerNorm") |
| act_select = gr.Dropdown(choices=list(ACTIVATION_FUNCTIONS.keys()), value=DEFAULT_ACTIVATION_FUNCTION, label="Change ActFn") |
| act_revert = gr.Button("Revert ActFn") |
| with gr.Column(min_width=150): bitnet_on = gr.Button("BitNet ✅"); bitnet_off = gr.Button("BitNet ❌") |
|
|
| with gr.Accordion("Multi-Modal Conversion (Experimental)", open=False): |
| gr.Markdown("⚠️ **Experimental:** Adds modality-specific encoders (e.g., ViT, Whisper) and projection layers. **Requires manual `forward` pass adaptation & multi-modal data/training.**") |
| modality_checkboxes_ui = gr.CheckboxGroup(choices=AVAILABLE_MODALITIES, label="Select Modalities") |
| with gr.Row(): |
| apply_multimodal_button = gr.Button("Apply Multi-Modal Setup") |
| revert_multimodal_button = gr.Button("Revert Multi-Modal Setup") |
|
|
| with gr.TabItem("Performance & Opt."): |
| with gr.Row(): |
| with gr.Column(min_width=150): speed_on = gr.Button("Speed Opt. ✅"); speed_off = gr.Button("Speed Opt. ❌") |
| with gr.Column(min_width=150): coher_on = gr.Button("Coherence ✅"); coher_off = gr.Button("Coherence ❌") |
| with gr.Column(min_width=150): ln_bypass_on = gr.Button("LN Bypass ✅"); ln_bypass_off = gr.Button("LN Bypass ❌") |
| with gr.Row(): |
| with gr.Column(min_width=150): do_bypass_on = gr.Button("Dropout Bypass ✅"); do_bypass_off = gr.Button("Dropout Bypass ❌") |
| with gr.Column(min_width=150): prec_on = gr.Button("FP32 Prec. ✅"); prec_off = gr.Button("FP32 Prec. ❌") |
| with gr.Column(min_width=150): norm_emb_on = gr.Button("Emb. Norm. ✅"); norm_emb_off = gr.Button("Emb. Norm. ❌") |
| with gr.Row(): |
| with gr.Column(min_width=150): gc_cp_on = gr.Button("Grad Checkpoint ✅"); gc_cp_off = gr.Button("Grad Checkpoint ❌") |
| with gr.Column(min_width=150): flash_attn_on = gr.Button("Flash Attn 2 ✅"); flash_attn_off = gr.Button("Flash Attn 2 ❌") |
|
|
| with gr.Accordion("Quantization & Pruning", open=False): |
| with gr.Row(): |
| quant_select = gr.Dropdown(choices=QUANTIZATION_MODES, value=DEFAULT_QUANTIZATION, label="Quantize To") |
| quant_apply = gr.Button("Apply Quant.") |
| quant_revert = gr.Button("Revert Quant.") |
| with gr.Row(): |
| prune_amount_inp = gr.Slider(0.01, 0.95, value=PRUNING_AMOUNT, step=0.01, label="Prune Amount") |
| prune_apply = gr.Button("Apply Pruning") |
| prune_revert = gr.Button("Revert Pruning") |
|
|
| with gr.TabItem("PEFT Adapters"): |
| gr.Markdown("Add, merge, or remove LoRA/PEFT adapters from the currently loaded model.") |
| peft_lora_path_input = gr.Textbox(label="LoRA/PEFT Adapter Path or Hub ID", placeholder="username/my-lora-adapter") |
| with gr.Row(): |
| peft_set_path_btn = gr.Button("Set Path in Config") |
| peft_add_adapter_btn = gr.Button("Add Default Adapter") |
| peft_merge_btn = gr.Button("Merge Active Adapter") |
| peft_remove_adapter_btn = gr.Button("Remove/Unload Adapter") |
|
|
| with gr.TabItem("Advanced Config & Layers"): |
| with gr.Row(): |
| freeze_input = gr.Textbox(label="Layers to Freeze (e.g., '0-3, 7, 10-11')") |
| freeze_apply = gr.Button("🧊 Freeze") |
| freeze_revert = gr.Button("🔥 Unfreeze All") |
| with gr.Row(): |
| with gr.Column(min_width=150): lim_on = gr.Button("Limits Cfg ✅"); lim_off = gr.Button("Limits Cfg ❌") |
| with gr.Column(min_width=150): qa_on = gr.Button("QA Restrict Rem. ✅"); qa_off = gr.Button("QA Restrict Rem. ❌") |
| layerdrop_prob_inp = gr.Slider(0.0, 0.5, value=0.1, step=0.01, label="LayerDrop Prob") |
| layerdrop_on = gr.Button("LayerDrop Flag ✅") |
| layerdrop_off = gr.Button("LayerDrop Flag ❌") |
| with gr.Accordion("RoPE, Sliding Window, Attention Variant (Require Model Reload)", open=False): |
| gr.Markdown("**Warning:** These settings modify the config but require reloading the model to take effect.") |
| with gr.Row(): |
| rope_type_inp = gr.Dropdown(label="RoPE Type", choices=["linear", "dynamic"], value="linear") |
| rope_factor_inp = gr.Number(label="RoPE Factor (>=1.0)", value=2.0, minimum=1.0, step=0.1) |
| rope_apply_btn = gr.Button("Set RoPE") |
| rope_revert_btn = gr.Button("Revert RoPE") |
| with gr.Row(): |
| sw_size_inp = gr.Number(label="Sliding Window Size (0=disable)", value=4096, minimum=0, step=64) |
| sw_apply_btn = gr.Button("Set Sliding Window") |
| sw_revert_btn = gr.Button("Revert Sliding Window") |
| with gr.Row(): |
| attn_variant_inp = gr.Dropdown(label="Attention Implementation", choices=["auto", "eager", "sdpa", "flash_attention_2"], value="auto") |
| attn_apply_btn = gr.Button("Set Attention Variant") |
| attn_revert_btn = gr.Button("Revert Attention Variant") |
|
|
| with gr.Accordion("KD & Reward Heads (Experimental - Requires Training Changes)", open=False): |
| with gr.Row(): |
| kd_labels_inp = gr.Number(label="KD Num Labels", value=2, minimum=1, precision=0) |
| kd_setup_btn = gr.Button("Setup KD Head") |
| kd_revert_btn = gr.Button("Revert KD Head") |
| with gr.Row(): |
| rm_outputs_inp = gr.Number(label="RM Num Outputs", value=1, minimum=1, precision=0) |
| rm_setup_btn = gr.Button("Setup RM Head") |
| rm_revert_btn = gr.Button("Revert RM Head") |
|
|
| with gr.Accordion("Other Flags (Symbolic - May Require Specific Training Logic)", open=False): |
| with gr.Row(): |
| swa_on = gr.Button("SWA Flag ✅"); swa_off = gr.Button("SWA Flag ❌") |
| ke_on = gr.Button("Know. Edit Flag ✅"); ke_off = gr.Button("Know. Edit Flag ❌") |
| hp_on = gr.Button("Head Prune Flag ✅"); hp_off = gr.Button("Head Prune Flag ❌") |
| with gr.Row(): |
| qat_on = gr.Button("QAT Flag ✅"); qat_off = gr.Button("QAT Flag ❌") |
| gn_on = gr.Button("Grad Noise Flag ✅"); gn_off = gr.Button("Grad Noise Flag ❌") |
| wi_on = gr.Button("Weight Init Flag ✅"); wi_off = gr.Button("Weight Init Flag ❌") |
|
|
| with gr.TabItem("Training Param Flags"): |
| gr.Markdown("Toggle flags in the config that affect **subsequent** Trainer initialization (won't affect current training).") |
| with gr.Row(): |
| with gr.Column(min_width=150): gc_flag_on = gr.Button("GradClip Flg ✅"); gc_flag_off = gr.Button("GradClip Flg ❌") |
| with gr.Column(min_width=150): wd_flag_on = gr.Button("WD Flg ✅"); wd_flag_off = gr.Button("WD Flg ❌") |
| with gr.Column(min_width=150): lr_flag_on = gr.Button("LR Sched. Flg ✅"); lr_flag_off = gr.Button("LR Sched. Flg ❌") |
| with gr.Row(): |
| optim_flag_select = gr.Dropdown(choices=list(OPTIMIZERS.keys()), value=DEFAULT_OPTIMIZER, label="Set Optim. Pref") |
| optim_flag_apply = gr.Button("Apply Optim.") |
| optim_flag_revert = gr.Button("Revert Optim.") |
| with gr.Row(): |
| grad_accum_ui_inp_config = gr.Number(value=GRADIENT_ACCUMULATION_STEPS, label="Grad Accum Steps (Config)", precision=0, minimum=1) |
| grad_accum_set_btn = gr.Button("Set Grad Accum") |
|
|
| with gr.TabItem("🔒 Safety & Content Filters"): |
| gr.Markdown("Control safety filter flags in the model's config. Actual filtering effectiveness depends on the inference implementation.") |
| with gr.Row(): |
| safety_all_on = gr.Button("🔒 Enable ALL Filters (Defaults)", variant="secondary") |
| safety_all_off = gr.Button("🔓 Disable ALL Filters", variant="stop") |
| gr.Markdown("Individual Filter Toggles:") |
| filter_checkboxes = [] |
| num_cols = 4 |
| for i in range(0, len(filter_names_ui), num_cols): |
| with gr.Row(): |
| for j in range(num_cols): |
| idx = i + j |
| if idx < len(filter_names_ui): |
| name = filter_names_ui[idx] |
| cb = gr.Checkbox(label=name, value=False, interactive=True) |
| filter_checkboxes.append(cb) |
| else: |
| gr.HTML("") |
| apply_filters_button = gr.Button("Apply Individual Filter Toggles", variant="secondary") |
|
|
| with gr.TabItem("💬 Inference"): |
| gr.Markdown("Test the **currently loaded and configured** model.") |
| with gr.Row(): |
| inference_prompt = gr.Textbox(label="Enter Prompt", lines=4, placeholder="Once upon a time...") |
| inference_output = gr.Textbox(label="Model Response", interactive=False, lines=15) |
| with gr.Accordion("Generation Parameters", open=True): |
| with gr.Row(): |
| max_new_tokens_slider = gr.Slider(10, 4096, value=256, step=10, label="Max New Tokens", interactive=True) |
| temperature_slider = gr.Slider(0.0, 2.0, value=0.7, step=0.01, label="Temperature (0=greedy)", interactive=True) |
| with gr.Row(): |
| top_k_slider = gr.Slider(0, 200, value=50, step=1, label="Top-K (0=disable)", interactive=True) |
| top_p_slider = gr.Slider(0.0, 1.0, value=0.95, step=0.01, label="Top-P (0 or 1=disable)", interactive=True) |
| repetition_penalty_slider = gr.Slider(1.0, 3.0, value=1.1, step=0.05, label="Repetition Penalty (1=disable)", interactive=True) |
| generate_button = gr.Button("Generate Response", variant="primary") |
|
|
| with gr.TabItem("🚫 Censor Control"): |
| gr.Markdown("## Force Disable Censorship Flags") |
| gr.Markdown("Click the button below to attempt to set all known censorship/filter flags in the loaded model's configuration to `False`. This uses the `Disable ALL Filters` function.") |
| censor_off_button = gr.Button("🔓 Attempt Force Disable All Censorship Flags", variant="stop") |
| censor_status = gr.Textbox(label="Censorship Flag Status", interactive=False, lines=2) |
|
|
| load_button.click( |
| fn=load_model_for_control, |
| inputs=[load_model_selector, hf_token_read, bypass_limits_chk], |
| outputs=[load_status_output, status_output] + filter_checkboxes |
| ) |
| save_button.click( |
| fn=save_current_model, |
| inputs=[save_path_inp, hf_token_write, save_hub_repo_inp], |
| outputs=save_status_output |
| ) |
| merge_button.click( |
| fn=_merge_architectures, |
| inputs=[merge_model_ids_inp, hf_token_read, bypass_limits_chk], |
| outputs=[merge_status_output, status_output] + filter_checkboxes |
| ) |
|
|
| train_button.click( |
| fn=start_training, |
| inputs=[ |
| train_model_selector, train_new_model_inp, hf_token_write, |
| train_datasets_inp, |
| train_post_activation_fn_selector, train_post_target_layers_inp, |
| train_grad_accum_inp, train_lr_inp, train_epochs_inp, train_max_steps_inp, train_batch_size_inp, |
| train_optim_selector, train_scheduler_selector, train_wd_inp, train_warmup_ratio_inp, |
| train_use_peft_chk, peft_r_inp, peft_alpha_inp, peft_dropout_inp, peft_target_modules_inp, |
| train_wandb_token_inp, train_use_cpu_chk, bypass_limits_chk |
| ], |
| outputs=train_output |
| ).then( |
| fn=get_detailed_status_and_filter_states, |
| inputs=None, |
| outputs=[status_output] + filter_checkboxes |
| ) |
|
|
| refresh_outputs = [status_output] + filter_checkboxes |
| refresh_status_button.click(fn=get_detailed_status_and_filter_states, inputs=None, outputs=refresh_outputs) |
|
|
| def link_control(button, func, inputs=None): |
| processed_inputs = inputs if inputs else [] |
| click_event = button.click(func, inputs=processed_inputs, outputs=control_output) |
| click_event.then(get_detailed_status_and_filter_states, inputs=None, outputs=refresh_outputs) |
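| # link_control wires a button to its wrapper and then chains a status refresh, e.g.
| #   link_control(bias_on, lambda: toggle_bias_removal_wrapper(True))
| # behaves like button.click(fn, ..., outputs=control_output).then(get_detailed_status_and_filter_states, ...).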
|
|
| link_control(bias_on, lambda: toggle_bias_removal_wrapper(True)) |
| link_control(bias_off, lambda: toggle_bias_removal_wrapper(False)) |
| link_control(emb_on, lambda: toggle_embeddings_untie_wrapper(True)) |
| link_control(emb_off, lambda: toggle_embeddings_untie_wrapper(False)) |
| link_control(layer_red_on, lambda layers: toggle_layer_reduction_wrapper(True, layers), inputs=[layer_target_inp]) |
| link_control(layer_red_off, lambda: toggle_layer_reduction_wrapper(False, None)) |
| link_control(norm_swap_rms, lambda: apply_norm_swap_wrapper('RMSNorm')) |
| link_control(norm_swap_ln, lambda: apply_norm_swap_wrapper('LayerNorm')) |
| act_select.change(lambda name: apply_activation_change_wrapper(name), inputs=[act_select], outputs=control_output).then(get_detailed_status_and_filter_states, inputs=None, outputs=refresh_outputs) |
| link_control(act_revert, revert_activation_change_wrapper) |
| link_control(bitnet_on, lambda: toggle_bitnet_wrapper(True)) |
| link_control(bitnet_off, lambda: toggle_bitnet_wrapper(False)) |
| link_control(apply_multimodal_button, apply_multimodal_wrapper, inputs=[modality_checkboxes_ui]) |
| link_control(revert_multimodal_button, revert_multimodal_wrapper) |
|
|
| link_control(speed_on, lambda: toggle_token_speed_optimization_wrapper(True)) |
| link_control(speed_off, lambda: toggle_token_speed_optimization_wrapper(False)) |
| link_control(coher_on, lambda: toggle_coherence_improvement_wrapper(True)) |
| link_control(coher_off, lambda: toggle_coherence_improvement_wrapper(False)) |
| link_control(ln_bypass_on, lambda: toggle_layer_norm_bypass_wrapper(True)) |
| link_control(ln_bypass_off, lambda: toggle_layer_norm_bypass_wrapper(False)) |
| link_control(do_bypass_on, lambda: toggle_dropout_bypass_wrapper(True)) |
| link_control(do_bypass_off, lambda: toggle_dropout_bypass_wrapper(False)) |
| link_control(prec_on, lambda: toggle_fp32_precision_wrapper(True)) |
| link_control(prec_off, lambda: toggle_fp32_precision_wrapper(False)) |
| link_control(norm_emb_on, lambda: toggle_embedding_normalization_wrapper(True)) |
| link_control(norm_emb_off, lambda: toggle_embedding_normalization_wrapper(False)) |
| link_control(gc_cp_on, lambda: toggle_gradient_checkpointing_wrapper(True)) |
| link_control(gc_cp_off, lambda: toggle_gradient_checkpointing_wrapper(False)) |
| link_control(flash_attn_on, lambda: toggle_flash_attention_wrapper(True)) |
| link_control(flash_attn_off, lambda: toggle_flash_attention_wrapper(False)) |
| link_control(quant_apply, apply_quantization_wrapper, inputs=[quant_select]) |
| link_control(quant_revert, revert_quantization_wrapper) |
| link_control(prune_apply, apply_pruning_wrapper, inputs=[prune_amount_inp]) |
| link_control(prune_revert, revert_pruning_wrapper) |
|
|
| link_control(peft_set_path_btn, set_lora_path_wrapper, inputs=[peft_lora_path_input]) |
| link_control(peft_add_adapter_btn, add_peft_adapter_wrapper) |
| link_control(peft_merge_btn, merge_peft_adapter_wrapper) |
| link_control(peft_remove_adapter_btn, remove_peft_adapter_wrapper) |
|
|
| link_control(freeze_apply, apply_layer_freeze_wrapper, inputs=[freeze_input]) |
| link_control(freeze_revert, revert_layer_freeze_wrapper) |
| link_control(lim_on, lambda: toggle_limits_wrapper(True)) |
| link_control(lim_off, lambda: toggle_limits_wrapper(False)) |
| link_control(qa_on, lambda: toggle_qa_restrictions_wrapper(True)) |
| link_control(qa_off, lambda: toggle_qa_restrictions_wrapper(False)) |
| link_control(layerdrop_on, lambda prob: toggle_layerdrop_wrapper(True, prob), inputs=[layerdrop_prob_inp]) |
| link_control(layerdrop_off, lambda: toggle_layerdrop_wrapper(False)) |
| link_control(rope_apply_btn, lambda rope_type, factor: toggle_rope_scaling_wrapper(True, rope_type, factor), inputs=[rope_type_inp, rope_factor_inp])
| link_control(rope_revert_btn, lambda: toggle_rope_scaling_wrapper(False)) |
| link_control(sw_apply_btn, lambda size: toggle_sliding_window_wrapper(True, size), inputs=[sw_size_inp]) |
| link_control(sw_revert_btn, lambda: toggle_sliding_window_wrapper(False)) |
| link_control(attn_apply_btn, apply_attention_variant_wrapper, inputs=[attn_variant_inp]) |
| link_control(attn_revert_btn, revert_attention_variant_wrapper) |
| link_control(kd_setup_btn, lambda labels: toggle_kd_wrapper(True, labels), inputs=[kd_labels_inp]) |
| link_control(kd_revert_btn, lambda: toggle_kd_wrapper(False)) |
| link_control(rm_setup_btn, lambda outputs: toggle_reward_modeling_wrapper(True, outputs), inputs=[rm_outputs_inp]) |
| link_control(rm_revert_btn, lambda: toggle_reward_modeling_wrapper(False)) |
| link_control(swa_on, lambda: specific_action_function(_apply_swa)) |
| link_control(swa_off, lambda: specific_action_function(_revert_swa)) |
| link_control(ke_on, lambda: specific_action_function(_apply_knowledge_editing)) |
| link_control(ke_off, lambda: specific_action_function(_revert_knowledge_editing)) |
| link_control(hp_on, lambda: specific_action_function(_apply_head_pruning)) |
| link_control(hp_off, lambda: specific_action_function(_revert_head_pruning)) |
| link_control(qat_on, lambda: specific_action_function(_apply_qat)) |
| link_control(qat_off, lambda: specific_action_function(_revert_qat)) |
| link_control(gn_on, lambda: specific_action_function(_apply_gradient_noise)) |
| link_control(gn_off, lambda: specific_action_function(_revert_gradient_noise)) |
| link_control(wi_on, lambda: specific_action_function(_apply_weight_init)) |
| link_control(wi_off, lambda: specific_action_function(_revert_weight_init)) |
|
|
| link_control(gc_flag_on, lambda: toggle_gradient_clipping_flag_wrapper(True)) |
| link_control(gc_flag_off, lambda: toggle_gradient_clipping_flag_wrapper(False)) |
| link_control(wd_flag_on, lambda: toggle_weight_decay_flag_wrapper(True)) |
| link_control(wd_flag_off, lambda: toggle_weight_decay_flag_wrapper(False)) |
| link_control(lr_flag_on, lambda: toggle_lr_scheduler_flag_wrapper(True)) |
| link_control(lr_flag_off, lambda: toggle_lr_scheduler_flag_wrapper(False)) |
| link_control(optim_flag_apply, apply_optimizer_change_wrapper, inputs=[optim_flag_select]) |
| link_control(optim_flag_revert, revert_optimizer_change_wrapper) |
| link_control(grad_accum_set_btn, set_gradient_accumulation_wrapper, inputs=[grad_accum_ui_inp_config]) |
|
|
| link_control(safety_all_on, lambda: toggle_all_safety_filters_wrapper(True)) |
| link_control(safety_all_off, lambda: toggle_all_safety_filters_wrapper(False)) |
| apply_filters_button.click( |
| fn=toggle_individual_safety_filter_wrapper, |
| inputs=filter_checkboxes, |
| outputs=control_output |
| ).then(get_detailed_status_and_filter_states, inputs=None, outputs=refresh_outputs) |
|
|
| generate_button.click( |
| fn=run_inference, |
| inputs=[ |
| inference_prompt, max_new_tokens_slider, temperature_slider, |
| top_k_slider, top_p_slider, repetition_penalty_slider |
| ], |
| outputs=inference_output |
| ) |
|
|
| censor_off_button.click( |
| fn=force_disable_censorship_wrapper, |
| outputs=censor_status |
| ).then(get_detailed_status_and_filter_states, inputs=None, outputs=refresh_outputs) |
|
|
| if __name__ == "__main__": |
| demo.queue().launch(server_name="0.0.0.0", share=True, debug=False) |