# d1337-cipher-simple / d1337_cipher_complete.jsonl
# Darin Leonhart
# Upload d1337_cipher_complete.jsonl with huggingface_hub
# 8bb3088 verified
"""
D1337 CIPHER - Fine-tuning Setup Script
Auto-deploy training ke HuggingFace GPU
"""
import torch
from transformers import (
AutoTokenizer,
AutoModelForCausalLM,
TrainingArguments,
Trainer,
DataCollatorForLanguageModeling
)
from datasets import load_dataset
import json
from pathlib import Path
class D1337Training:
    """Fine-tuning pipeline for the D1337 cipher model.

    Loads a base causal LM, formats a chat dataset into ChatML text,
    tokenizes it, runs a short Trainer loop, and pushes the result to the
    HuggingFace Hub.
    """

    def __init__(self, base_model="huihui-ai/Huihui-GLM-4.7-Flash-abliterated"):
        # Hub id of the base checkpoint to fine-tune.
        self.base_model = base_model
        # Name used for the fine-tuned output model.
        self.output_model = "d1337-cipher-v1"

    def setup_model(self):
        """Load the base model and tokenizer onto the available GPUs."""
        print(f"[+] Loading {self.base_model}...")
        # Right-side padding matches causal-LM training.
        self.tokenizer = AutoTokenizer.from_pretrained(
            self.base_model,
            trust_remote_code=True,
            padding_side="right"
        )
        if self.tokenizer.pad_token is None:
            # Some causal-LM tokenizers ship without a pad token; reuse EOS.
            self.tokenizer.pad_token = self.tokenizer.eos_token
        # Load model sharded across all visible GPUs.
        self.model = AutoModelForCausalLM.from_pretrained(
            self.base_model,
            torch_dtype=torch.bfloat16,
            device_map="auto",
            trust_remote_code=True,
            attn_implementation="flash_attention_2"
        )
        print(f"[+] Model loaded on {torch.cuda.device_count()} GPUs")

    def prepare_dataset(self, dataset_file="d1337_cipher_complete.jsonl"):
        """Load, format (ChatML) and tokenize the training dataset.

        Tries the local JSONL file first and falls back to the Hub copy on
        any load failure. Returns a tokenized dataset ready for the Trainer.
        """
        print(f"[+] Loading dataset: {dataset_file}")
        # Prefer the local file; fall back to the Hub dataset on load error.
        # (Fix: was a bare `except:`, which also swallowed KeyboardInterrupt.)
        try:
            dataset = load_dataset("json", data_files=dataset_file)["train"]
        except Exception:
            dataset = load_dataset("Desorden1337/d1337-cipher-dataset")["train"]

        def format_prompt(sample):
            """Render one `messages` list into a single ChatML-style string."""
            messages = sample["messages"]
            conversation = ""
            for msg in messages:
                if msg["role"] == "system":
                    conversation += f"<|im_start|>system\n{msg['content']}<|im_end|>\n"
                elif msg["role"] == "user":
                    conversation += f"<|im_start|>user\n{msg['content']}<|im_end|>\n"
                elif msg["role"] == "assistant":
                    conversation += f"<|im_start|>assistant\n{msg['content']}<|im_end|>\n"
            return {"text": conversation}

        # Replace the raw columns with the formatted text.
        dataset = dataset.map(format_prompt, remove_columns=dataset.column_names)

        pad_id = self.tokenizer.pad_token_id

        def tokenize(examples):
            tokenized = self.tokenizer(
                examples["text"],
                truncation=True,
                max_length=2048,
                padding="max_length"
            )
            # Fix: mask padding positions with -100 so no loss is computed
            # on pad tokens (the original copied input_ids verbatim and
            # trained on padding). NOTE(review): when pad_token == eos_token
            # this also masks genuine EOS tokens — confirm that is acceptable.
            tokenized["labels"] = [
                [tok if tok != pad_id else -100 for tok in seq]
                for seq in tokenized["input_ids"]
            ]
            return tokenized

        # Fix: drop the raw "text" column here — with
        # remove_unused_columns=False the Trainer would otherwise pass
        # string features straight to the data collator.
        dataset = dataset.map(tokenize, batched=True, remove_columns=["text"])
        print(f"[+] Dataset prepared: {len(dataset)} samples")
        return dataset

    def setup_training(self, output_dir="./d1337-cipher-output"):
        """Build the TrainingArguments for a short multi-GPU run."""
        training_args = TrainingArguments(
            output_dir=output_dir,
            # Multi-GPU throughput settings (comments in the original
            # disagreed on the hardware: "8x A100" vs "4x L40S").
            num_train_epochs=2,
            per_device_train_batch_size=8,   # per-GPU batch; total = 8 * n_gpus
            gradient_accumulation_steps=1,   # no accumulation needed
            gradient_checkpointing=False,    # disabled for speed
            # Optimizer settings.
            learning_rate=1e-4,              # higher LR for faster convergence
            weight_decay=0.01,
            warmup_steps=10,                 # minimal warmup
            lr_scheduler_type="linear",
            # Minimal logging / infrequent checkpoints.
            logging_steps=20,
            save_steps=500,
            eval_steps=500,
            # Memory settings.
            dataloader_pin_memory=False,
            bf16=True,
            tf32=True,
            dataloader_num_workers=4,        # parallel data loading
            # Hub / misc.
            remove_unused_columns=False,
            push_to_hub=True,
            hub_model_id="Desorden1337/d1337-cipher-v1",
            hub_private_repo=True,
            # Fix: None means "use the defaults" (all integrations);
            # the documented value to disable reporting is the string "none".
            report_to="none",
            # Hard step cap keeps the run short regardless of epochs.
            max_steps=100,
            ddp_find_unused_parameters=False,    # DDP optimization
            dataloader_persistent_workers=True   # keep workers alive
        )
        return training_args

    def train(self):
        """Execute the full pipeline: setup, train, save, push to Hub."""
        print("=" * 60)
        print("D1337 CIPHER - TRAINING INITIATED")
        print("=" * 60)
        # Setup.
        self.setup_model()
        dataset = self.prepare_dataset()
        training_args = self.setup_training()
        # Causal-LM collator (mlm=False).
        data_collator = DataCollatorForLanguageModeling(
            tokenizer=self.tokenizer,
            mlm=False
        )
        trainer = Trainer(
            model=self.model,
            args=training_args,
            train_dataset=dataset,
            data_collator=data_collator
        )
        print("[+] TRAINING STARTED...")
        trainer.train()
        # Save final model + tokenizer locally, then push to the Hub.
        trainer.save_model()
        self.tokenizer.save_pretrained(training_args.output_dir)
        trainer.push_to_hub()
        print("=" * 60)
        print("D1337 CIPHER TRAINING COMPLETE!")
        print("Model saved: Desorden1337/d1337-cipher-v1")
        print("=" * 60)
if __name__ == "__main__":
trainer = D1337Training(base_model="huihui-ai/Huihui-GLM-4.7-Flash-abliterated")
trainer.train()