|
|
"""
|
|
|
D1337 CIPHER - Custom Training Script
|
|
|
=====================================
|
|
|
Optimized QLoRA training for 31B model on 4x L40S (192GB VRAM)
|
|
|
|
|
|
Brand: D1337 SOVEREIGN LABS
|
|
|
Model: GLM-4.7-Flash-abliterated (31B) -> D1337 CIPHER
|
|
|
"""
|
|
|
|
|
|
import os
|
|
|
import sys
|
|
|
import torch
|
|
|
import gradio as gr
|
|
|
from threading import Thread
|
|
|
from dataclasses import dataclass
|
|
|
from typing import Optional
|
|
|
|
|
|
|
|
|
from transformers import (
|
|
|
AutoTokenizer,
|
|
|
AutoModelForCausalLM,
|
|
|
TrainingArguments,
|
|
|
BitsAndBytesConfig,
|
|
|
)
|
|
|
from peft import (
|
|
|
LoraConfig,
|
|
|
get_peft_model,
|
|
|
TaskType,
|
|
|
)
|
|
|
from datasets import load_dataset
|
|
|
from trl import SFTTrainer, SFTConfig
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@dataclass
class TrainingConfig:
    """Hyperparameters and model/dataset identifiers for D1337 CIPHER training.

    Defaults are tuned for QLoRA fine-tuning of a 31B model on 4x L40S
    (192GB VRAM total): 4-bit NF4 quantization, bf16 compute, rank-32 LoRA.
    """

    # Model identifiers (Hugging Face Hub repo ids).
    base_model: str = "huihui-ai/Huihui-GLM-4.7-Flash-abliterated"
    output_model: str = "Desorden1337/d1337-cipher-v1"

    # Dataset location.
    dataset_name: str = "Desorden1337/d1337-cipher-dataset"
    dataset_split: str = "train"

    # LoRA adapter settings. `target_modules` defaults to None and is filled
    # in __post_init__ — a mutable default is not allowed on a dataclass field.
    # BUGFIX: annotation was `list` despite the None default; use Optional[list].
    lora_r: int = 32
    lora_alpha: int = 64
    lora_dropout: float = 0.05
    target_modules: Optional[list] = None

    # Optimizer / schedule settings. Effective batch size is
    # batch_size * gradient_accumulation * num_gpus.
    num_epochs: int = 5
    batch_size: int = 1
    gradient_accumulation: int = 8
    learning_rate: float = 2e-4
    max_seq_length: int = 2048
    warmup_ratio: float = 0.1
    weight_decay: float = 0.01

    # Precision / quantization flags.
    use_4bit: bool = True
    use_bf16: bool = True

    def __post_init__(self):
        # Default LoRA targets: all attention and MLP projection layers.
        if self.target_modules is None:
            self.target_modules = [
                "q_proj", "k_proj", "v_proj", "o_proj",
                "gate_proj", "up_proj", "down_proj"
            ]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class D1337CipherTrainer:
    """QLoRA fine-tuning pipeline for D1337 CIPHER.

    Loads the 4-bit quantized base model, attaches LoRA adapters, trains on
    the chat dataset with trl's SFTTrainer, and pushes the result to the Hub.
    `training_status` and `training_log` are plain attributes polled by the
    Gradio UI while `train()` runs on a background thread.
    """

    def __init__(self, config: Optional["TrainingConfig"] = None):
        # Fall back to default hyperparameters when no config is supplied.
        self.config = config or TrainingConfig()
        self.model = None
        self.tokenizer = None
        self.trainer = None
        self.training_status = "Idle"  # polled by the UI status box
        self.training_log = []         # rolling log, bounded to 100 entries

    def log(self, message: str):
        """Log message to console and internal log"""
        print(f"[D1337] {message}")
        self.training_log.append(message)
        # Keep memory bounded: retain only the most recent 100 entries.
        if len(self.training_log) > 100:
            self.training_log = self.training_log[-100:]

    def setup_quantization(self):
        """Return a 4-bit NF4 BitsAndBytesConfig, or None when 4-bit is disabled."""
        if self.config.use_4bit:
            return BitsAndBytesConfig(
                load_in_4bit=True,
                bnb_4bit_quant_type="nf4",
                # bf16 compute when enabled (supported on L40S), else fp16.
                bnb_4bit_compute_dtype=torch.bfloat16 if self.config.use_bf16 else torch.float16,
                bnb_4bit_use_double_quant=True,
            )
        return None

    def setup_lora(self):
        """Build the LoRA adapter configuration from the training config."""
        return LoraConfig(
            r=self.config.lora_r,
            lora_alpha=self.config.lora_alpha,
            lora_dropout=self.config.lora_dropout,
            target_modules=self.config.target_modules,
            bias="none",
            task_type=TaskType.CAUSAL_LM,
        )

    def load_model(self):
        """Load tokenizer and quantized base model, then wrap with LoRA adapters."""
        self.training_status = "Loading model..."
        self.log(f"Loading model: {self.config.base_model}")

        self.tokenizer = AutoTokenizer.from_pretrained(
            self.config.base_model,
            trust_remote_code=True,
            padding_side="right",  # right padding for causal-LM SFT batching
        )
        # Some chat models ship without a pad token; reuse EOS so batching works.
        if self.tokenizer.pad_token is None:
            self.tokenizer.pad_token = self.tokenizer.eos_token

        bnb_config = self.setup_quantization()
        self.model = AutoModelForCausalLM.from_pretrained(
            self.config.base_model,
            quantization_config=bnb_config,
            device_map="auto",  # shard layers across all visible GPUs
            trust_remote_code=True,
            torch_dtype=torch.bfloat16 if self.config.use_bf16 else torch.float16,
        )

        # Gradient checkpointing trades compute for memory; input grads must be
        # enabled so checkpointing backprops through frozen quantized weights.
        self.model.gradient_checkpointing_enable()
        self.model.enable_input_require_grads()

        lora_config = self.setup_lora()
        self.model = get_peft_model(self.model, lora_config)

        trainable_params = sum(p.numel() for p in self.model.parameters() if p.requires_grad)
        total_params = sum(p.numel() for p in self.model.parameters())
        self.log(f"Trainable parameters: {trainable_params:,} / {total_params:,} ({100 * trainable_params / total_params:.2f}%)")
        self.log(f"Model loaded on {torch.cuda.device_count()} GPU(s)")

    def load_dataset(self):
        """Download the configured dataset split and return it."""
        self.training_status = "Loading dataset..."
        self.log(f"Loading dataset: {self.config.dataset_name}")

        # Calls the module-level datasets.load_dataset (not this method).
        dataset = load_dataset(self.config.dataset_name, split=self.config.dataset_split)
        self.log(f"Dataset loaded: {len(dataset)} samples")

        return dataset

    def format_messages(self, example):
        """Render one chat example (list of role/content dicts) as ChatML text."""
        messages = example["messages"]

        text = ""
        for msg in messages:
            role = msg["role"]
            content = msg["content"]
            text += f"<|im_start|>{role}\n{content}<|im_end|>\n"

        return {"text": text}

    def train(self):
        """Run the full pipeline; return True on success, False on failure.

        Never raises: any exception is caught, logged with a traceback, and
        surfaced via `training_status` so the UI polling thread sees it.
        """
        try:
            self.training_status = "Initializing..."
            self.log("=" * 60)
            self.log("D1337 CIPHER TRAINING - INITIATED")
            self.log("=" * 60)

            self.load_model()
            dataset = self.load_dataset()

            self.log("Formatting dataset...")
            dataset = dataset.map(self.format_messages, remove_columns=dataset.column_names)

            self.training_status = "Setting up training..."
            # BUGFIX: use SFTConfig (imported but previously unused) instead of
            # plain TrainingArguments so config.max_seq_length and the "text"
            # field are actually honored, and use config.warmup_ratio instead
            # of the previously hard-coded warmup_steps=14.
            training_args = SFTConfig(
                output_dir="./d1337-cipher-output",
                num_train_epochs=self.config.num_epochs,
                per_device_train_batch_size=self.config.batch_size,
                gradient_accumulation_steps=self.config.gradient_accumulation,
                learning_rate=self.config.learning_rate,
                weight_decay=self.config.weight_decay,
                warmup_ratio=self.config.warmup_ratio,
                lr_scheduler_type="cosine",
                logging_steps=1,
                save_steps=50,
                save_total_limit=2,
                bf16=self.config.use_bf16,
                fp16=not self.config.use_bf16,
                gradient_checkpointing=True,
                max_grad_norm=1.0,
                group_by_length=True,
                dataloader_num_workers=4,
                remove_unused_columns=False,
                push_to_hub=True,
                hub_model_id=self.config.output_model,
                hub_private_repo=True,
                report_to="none",
                max_seq_length=self.config.max_seq_length,
                dataset_text_field="text",
            )

            self.trainer = SFTTrainer(
                model=self.model,
                args=training_args,
                train_dataset=dataset,
            )

            self.training_status = "Training in progress..."
            self.log("Training started!")
            self.trainer.train()

            self.training_status = "Saving model..."
            self.log("Saving model...")
            self.trainer.save_model()
            self.trainer.push_to_hub()

            self.training_status = "Complete!"
            self.log("=" * 60)
            self.log("D1337 CIPHER TRAINING - COMPLETE!")
            self.log(f"Model saved to: {self.config.output_model}")
            self.log("=" * 60)

            return True

        except Exception as e:
            self.training_status = f"Error: {str(e)}"
            self.log(f"Training failed: {str(e)}")
            import traceback
            self.log(traceback.format_exc())
            return False
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def create_ui(trainer: "D1337CipherTrainer"):
    """Create Gradio UI for monitoring.

    The UI polls trainer.training_status / training_log and launches the
    training loop on a background thread so the interface stays responsive.
    """

    # Mutable holder for the active background thread, shared by closures.
    active_thread: list = []

    def get_status():
        return trainer.training_status

    def get_logs():
        # Only the tail of the log keeps the textbox readable.
        return "\n".join(trainer.training_log[-50:])

    def start_training():
        # BUGFIX: guard against double-start — a second concurrent train()
        # would corrupt shared model/trainer state.
        if active_thread and active_thread[0].is_alive():
            return "Training already in progress."
        trainer.training_log = []
        thread = Thread(target=trainer.train)
        active_thread[:] = [thread]
        thread.start()
        return "Training started! Check logs for progress."

    def get_gpu_info():
        if not torch.cuda.is_available():
            return "No GPU available"
        info = []
        for i in range(torch.cuda.device_count()):
            props = torch.cuda.get_device_properties(i)
            mem_total = props.total_memory / (1024**3)
            mem_used = torch.cuda.memory_allocated(i) / (1024**3)
            info.append(f"GPU {i}: {props.name} - {mem_used:.1f}GB / {mem_total:.1f}GB")
        return "\n".join(info)

    with gr.Blocks(title="D1337 CIPHER Training", theme=gr.themes.Soft()) as demo:
        gr.Markdown("""
        # 🔥 D1337 CIPHER - Training Console
        ### D1337 SOVEREIGN LABS

        Custom training environment for GLM-4.7-Flash-abliterated → D1337 CIPHER
        """)

        with gr.Row():
            # Left column: read-only configuration summary.
            with gr.Column(scale=1):
                gr.Markdown("### Configuration")
                gr.Textbox(
                    label="Base Model",
                    value=trainer.config.base_model,
                    interactive=False
                )
                gr.Textbox(
                    label="Dataset",
                    value=trainer.config.dataset_name,
                    interactive=False
                )
                gr.Textbox(
                    label="Output Model",
                    value=trainer.config.output_model,
                    interactive=False
                )

                gr.Markdown("### Training Parameters")
                gr.Textbox(
                    label="LoRA Rank",
                    value=str(trainer.config.lora_r),
                    interactive=False
                )
                gr.Textbox(
                    label="Epochs",
                    value=str(trainer.config.num_epochs),
                    interactive=False
                )
                gr.Textbox(
                    label="Learning Rate",
                    value=str(trainer.config.learning_rate),
                    interactive=False
                )

            # Right column: live status, GPU usage, launch button, and logs.
            with gr.Column(scale=2):
                gr.Markdown("### Status")
                status_box = gr.Textbox(
                    label="Current Status",
                    value=get_status,
                    every=2  # re-poll every 2 seconds
                )

                gr.Textbox(
                    label="GPU Info",
                    value=get_gpu_info,
                    every=5
                )

                start_btn = gr.Button("🚀 Start Training", variant="primary", size="lg")

                gr.Markdown("### Training Logs")
                gr.Textbox(
                    label="Logs",
                    value=get_logs,
                    every=3,
                    lines=15,
                    max_lines=20
                )

        start_btn.click(fn=start_training, outputs=status_box)

    return demo
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def main():
    """Entry point: report the GPU environment, then either auto-start
    training (AUTO_START_TRAINING=true) or launch the monitoring UI."""
    banner = "=" * 60
    print(banner)
    print("D1337 CIPHER - Custom Training Environment")
    print("D1337 SOVEREIGN LABS")
    print(banner)

    # Summarize available CUDA devices before doing anything expensive.
    if not torch.cuda.is_available():
        print("WARNING: No GPU detected!")
    else:
        gpu_count = torch.cuda.device_count()
        print(f"GPUs available: {gpu_count}")
        for idx in range(gpu_count):
            props = torch.cuda.get_device_properties(idx)
            print(f"  GPU {idx}: {props.name} ({props.total_memory / (1024**3):.1f} GB)")

    trainer = D1337CipherTrainer(TrainingConfig())

    # Headless mode: begin training immediately when the env flag is set.
    if os.environ.get("AUTO_START_TRAINING", "false").lower() == "true":
        print("Auto-starting training...")
        trainer.train()
    else:
        print("Launching Gradio UI...")
        demo = create_ui(trainer)
        demo.launch(
            server_name="0.0.0.0",
            server_port=7860,
            share=False
        )
|
|
|
|
|
|
|
|
|
# Run only when executed as a script, not when imported as a module.
if __name__ == "__main__":
    main()
|
|
|
|