#!/usr/bin/env python3
"""
RAYAP-CODER Training - huihui-ai Style
Using Unsloth + GRPO for abliterated model fine-tuning
D1337 SOVEREIGN LABS
"""
import os
import torch

# ============================================================
# CONFIG
# ============================================================
HF_TOKEN = os.environ.get("HF_TOKEN")
if not HF_TOKEN:
    raise ValueError("HF_TOKEN not set! Add it to Space Secrets.")

BASE_MODEL = "huihui-ai/Qwen3-30B-A3B-abliterated"
DATASET = "pacman1337/rayap-coder-dataset"
OUTPUT = "pacman1337/rayap-coder-30b"

print("=" * 60)
print("RAYAP-CODER TRAINING - huihui-ai Style")
print("D1337 SOVEREIGN LABS")
print("Palo Alto | CrowdStrike | SentinelOne | Trend Micro | d1337.ai")
print("=" * 60)

# ============================================================
# UNSLOTH SETUP
# ============================================================
from unsloth import FastLanguageModel
from unsloth import is_bfloat16_supported
from datasets import load_dataset
from trl import GRPOConfig, GRPOTrainer
from huggingface_hub import login

login(token=HF_TOKEN)

# Load model with Unsloth (optimized for Qwen3 MoE)
print("\n[1/5] Loading model with Unsloth...")
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=BASE_MODEL,
    max_seq_length=2048,
    dtype=None,         # auto-detect
    load_in_4bit=True,  # 4-bit quantization
    token=HF_TOKEN,
)

# Add LoRA adapters - Unsloth optimized for MoE
print("\n[2/5] Adding LoRA adapters (MoE-aware)...")
model = FastLanguageModel.get_peft_model(
    model,
    r=64,
    lora_alpha=128,
    lora_dropout=0.05,
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj",  # attention
        "gate_proj", "up_proj", "down_proj",     # MLP (experts)
    ],
    bias="none",
    use_gradient_checkpointing="unsloth",  # Unsloth-optimized checkpointing
    random_state=1337,
    use_rslora=False,
    loftq_config=None,
)

# ============================================================
# DATASET
# ============================================================
print("\n[3/5] Loading dataset...")
dataset = load_dataset(DATASET, split="train")
print(f"Dataset: {len(dataset)} examples")

# Format for GRPO - needs "prompt" and "completion" columns
def format_for_grpo(example):
    # Concatenate system + user turns as the prompt; the last
    # assistant turn becomes the reference completion.
    prompt_parts = []
    answer = ""
    for msg in example["messages"]:
        if msg["role"] in ("system", "user"):
            prompt_parts.append(msg["content"])
        elif msg["role"] == "assistant":
            answer = msg["content"]
    return {
        "prompt": "\n".join(prompt_parts),
        "completion": answer,
    }

dataset = dataset.map(format_for_grpo, remove_columns=dataset.column_names)

# ============================================================
# TRAINING
# ============================================================
print("\n[4/5] Training (SFT by default; GRPO pass optional)...")

# GRPO needs a reward function. This one combines a simple
# length-based reward with keyword rewards for the RAYAP-CODER
# identity, plus a penalty for refusals.
def reward_fn(completions, **kwargs):
    rewards = []
    for completion in completions:
        reward = 0.0
        # Length reward (prefer substantive answers)
        if len(completion) > 100:
            reward += 0.5
        if len(completion) > 300:
            reward += 0.3
        # Identity keyword reward
        identity_keywords = ["RAYAP", "D1337", "rayap", "d1337"]
        for kw in identity_keywords:
            if kw in completion:
                reward += 0.2
        # Penalize refusals ("tidak bisa" = "cannot" and
        # "maaf" = "sorry" in Indonesian)
        refusal_keywords = ["I cannot", "I can't", "I'm sorry", "tidak bisa", "maaf"]
        for kw in refusal_keywords:
            if kw.lower() in completion.lower():
                reward -= 1.0
        rewards.append(reward)
    return rewards

# GRPO config (consumed only by the optional GRPO pass below)
grpo_config = GRPOConfig(
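# ------------------------------------------------------------
# Optional GRPO pass - a minimal sketch, not the verified
# pipeline. It shows how grpo_config and reward_fn above would
# be wired into trl's GRPOTrainer; the RUN_GRPO gate is an
# illustrative addition, and the default run below uses SFT.
# Note: trl requires the effective generation batch size to be
# divisible by GRPOConfig.num_generations, so the batch settings
# above may need adjusting before enabling this.
# ------------------------------------------------------------
if os.environ.get("RUN_GRPO"):
    grpo_trainer = GRPOTrainer(
        model=model,
        reward_funcs=reward_fn,      # callable(completions, **kwargs) -> list[float]
        args=grpo_config,
        train_dataset=dataset,       # GRPO reads the "prompt" column
        processing_class=tokenizer,
    )
    grpo_trainer.train()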
output_dir="./rayap-coder-checkpoints", per_device_train_batch_size=1, gradient_accumulation_steps=8, num_train_epochs=2, learning_rate=5e-5, lr_scheduler_type="cosine", warmup_ratio=0.1, bf16=is_bfloat16_supported(), logging_steps=5, save_strategy="epoch", optim="adamw_8bit", seed=1337, push_to_hub=True, hub_model_id=OUTPUT, hub_token=HF_TOKEN, report_to="none", ) # Try SFT first if GRPO has issues (fallback) try: from trl import SFTTrainer, SFTConfig print("Using SFT (more stable for initial training)...") # Reformat dataset for SFT dataset_raw = load_dataset(DATASET, split="train") def format_chat(example): return tokenizer.apply_chat_template( example["messages"], tokenize=False, add_generation_prompt=False ) sft_config = SFTConfig( output_dir="./rayap-coder-checkpoints", per_device_train_batch_size=1, gradient_accumulation_steps=8, num_train_epochs=3, learning_rate=2e-4, lr_scheduler_type="cosine", warmup_ratio=0.1, bf16=is_bfloat16_supported(), max_seq_length=2048, logging_steps=5, save_strategy="epoch", optim="adamw_8bit", seed=1337, push_to_hub=True, hub_model_id=OUTPUT, hub_token=HF_TOKEN, report_to="none", dataset_text_field="text", ) # Add text field dataset_raw = dataset_raw.map( lambda x: {"text": format_chat(x)}, remove_columns=dataset_raw.column_names ) trainer = SFTTrainer( model=model, tokenizer=tokenizer, train_dataset=dataset_raw, args=sft_config, ) trainer.train() except Exception as e: print(f"SFT error: {e}") print("Trying basic training...") # Ultra basic fallback from transformers import TrainingArguments, Trainer training_args = TrainingArguments( output_dir="./rayap-coder-checkpoints", per_device_train_batch_size=1, gradient_accumulation_steps=8, num_train_epochs=3, learning_rate=2e-4, bf16=True, logging_steps=5, save_strategy="epoch", push_to_hub=True, hub_model_id=OUTPUT, hub_token=HF_TOKEN, ) # ============================================================ # SAVE & PUSH # ============================================================ print("\n[5/5] Saving and pushing to Hub...") # Save with Unsloth model.save_pretrained_merged( OUTPUT, tokenizer, save_method="lora", # Save as LoRA adapter token=HF_TOKEN, push_to_hub=True, ) print(f""" ╔═══════════════════════════════════════════════════════════════╗ ║ TRAINING COMPLETE! ║ ╠═══════════════════════════════════════════════════════════════╣ ║ Model: https://huggingface.co/{OUTPUT} ║ ║ D1337 SOVEREIGN LABS - RAYAP-CODER ║ Palo Alto | CrowdStrike | SentinelOne | Trend Micro | d1337.ai ║ ║ Update endpoint LORA_MODULES: ║ rayap-coder=pacman1337/rayap-coder-30b ╚═══════════════════════════════════════════════════════════════╝ """)