Dasuperhub
/

DA-MLC

Model card Files Files and versions

xet

Community

Dasuperhub committed on Feb 14

Commit

cfb45f0

verified ·

1 Parent(s): 6191c28

Add A100 training script for v4 retrain

Browse files

Files changed (1) hide show

training/weight-swap-a100.py +464 -0

training/weight-swap-a100.py ADDED Viewed

	@@ -0,0 +1,464 @@

+#!/usr/bin/env python3
+"""
+GUINIUS DA — Soussou Curriculum LoRA Fine-Tune
+==============================================
+Target: Google Colab A100 (40GB VRAM)
+Dataset: soussou-curriculum-v4-CLEAN.jsonl (10,869 examples, 96.5% GT-verified + native-validated)
+Base Model: Qwen/Qwen3-0.6B
+Method: Single LoRA fine-tune → merge → MLC export (q4f16_1, WebLLM-ready)
+Philosophy: Teach the OPERATING SYSTEM, not the dictionary.
+USAGE (in Colab):
+  1. Upload soussou-curriculum-v4-CLEAN.jsonl
+  2. Run all cells top to bottom
+  3. Provide a HuggingFace token when prompted — the MLC model is uploaded to the Hub (no GGUF download)
+A100 Time Estimate: ~15 minutes total
+"""
+# ==============================================================================
+# CELL 1: Configuration
+# ==============================================================================
+# Checkpoint that is fine-tuned, and later reloaded on CPU for the merge step.
+BASE_MODEL = "Qwen/Qwen3-0.6B"
+# HuggingFace repo that receives the converted MLC model in the last cell.
+HF_REPO = "Dasuperhub/DA-MLC"
+# Local JSONL file; every line is a JSON object with a "messages" list.
+DATASET_FILE = "soussou-curriculum-v4-CLEAN.jsonl"
+# Training hyperparams — tuned for 10.8K validated examples
+EPOCHS = 3            # 3 passes sufficient for 10K+ examples
+LR = 2e-4             # Slightly higher LR with more data
+LORA_R = 64           # Rank 64 — more capacity for 1,106 unique Soussou tokens
+LORA_ALPHA = 32       # Alpha = rank / 2 (alpha/r = 0.5). NOTE(review): this comment used to say "alpha = rank" — confirm 32 is the intended value
+BATCH_SIZE = 4        # Small batches, more gradient updates
+GRAD_ACCUM = 4        # Effective batch = 16
+MAX_SEQ_LEN = 512     # Curriculum examples are short
+WARMUP_STEPS = 10     # Short warmup for small dataset
+# Echo the effective configuration so it is recorded in the cell output.
+print(f"Config: {EPOCHS} epochs | lr={LR} | LoRA r={LORA_R} | batch={BATCH_SIZE}x{GRAD_ACCUM}")
+print(f"Dataset: {DATASET_FILE}")
+print(f"Base: {BASE_MODEL}")
+# ==============================================================================
+# CELL 2: Install Dependencies
+# ==============================================================================
+import subprocess, sys
+def install(packages):
+    """Pip-install every entry of *packages* with the running interpreter.
+
+    Each entry is a whitespace-separated string of pip arguments (flags and/or
+    package names) and gets its own quiet ``pip install`` invocation.
+    ``subprocess.CalledProcessError`` propagates if an invocation fails.
+    """
+    pip_install = [sys.executable, "-m", "pip", "install", "-q"]
+    for spec in packages:
+        subprocess.check_call([*pip_install, *spec.split()])
+
+
+# The second entry passes --no-deps to pip for those four packages.
+install(["unsloth", "--no-deps trl peft accelerate bitsandbytes", "huggingface_hub"])
+print("Dependencies installed.")
+# ==============================================================================
+# CELL 3: Upload Dataset
+# ==============================================================================
+import os, json
+# Ask for the dataset when it is not already in the working directory.
+if not os.path.exists(DATASET_FILE):
+    print(f"{DATASET_FILE} not found. Upload it:")
+    print("  A) Drag-and-drop to Colab file browser")
+    print("  B) From Google Drive:")
+    print("     from google.colab import drive; drive.mount('/content/drive')")
+    # The hint must copy the exact file name that is checked below.
+    print(f"     !cp /content/drive/MyDrive/guinius/{DATASET_FILE} .")
+    try:
+        from google.colab import files
+    except ImportError:
+        # Not running inside Colab: rely on the existence check below.
+        pass
+    else:
+        try:
+            uploaded = files.upload()
+        except Exception as exc:
+            # Best effort only — report the problem instead of hiding it.
+            print(f"Upload widget failed: {exc}")
+assert os.path.exists(DATASET_FILE), f"{DATASET_FILE} not found!"
+# Single pass over the file: count examples, bucket them by system prompt,
+# and keep the first example for the preview.
+line_count = 0
+layer_counts = {}
+first = None
+with open(DATASET_FILE, encoding="utf-8") as f:
+    for line in f:
+        if not line.strip():
+            # Tolerate blank lines (e.g. a trailing empty line).
+            continue
+        ex = json.loads(line)
+        line_count += 1
+        if first is None:
+            first = ex
+        sys_msg = ex["messages"][0]["content"] if ex["messages"] else ""
+        if "Grammar Assistant" in sys_msg:
+            layer = "Grammar"
+        elif "Guinius" in sys_msg:
+            layer = "Identity/Social"
+        else:
+            layer = "Other"
+        layer_counts[layer] = layer_counts.get(layer, 0) + 1
+if first is None:
+    raise ValueError(f"{DATASET_FILE} contains no examples")
+print(f"\nDataset: {line_count} examples")
+print("Distribution:", layer_counts)
+# Preview first example
+print(f"\nSample:")
+for msg in first["messages"]:
+    print(f"  [{msg['role']}] {msg['content'][:100]}")
+# ==============================================================================
+# CELL 4: Load Base Model
+# ==============================================================================
+from unsloth import FastLanguageModel
+import torch
+# Load the base checkpoint through Unsloth in bfloat16, without 4-bit quantization.
+print(f"Loading {BASE_MODEL}...")
+model, tokenizer = FastLanguageModel.from_pretrained(
+    model_name=BASE_MODEL,
+    max_seq_length=MAX_SEQ_LEN,
+    load_in_4bit=False,
+    dtype=torch.bfloat16,
+)
+# Report parameter count and the GPU in use.
+total_params = 0
+for weight in model.parameters():
+    total_params += weight.numel()
+print(f"Model loaded: {total_params:,} parameters")
+gpu_props = torch.cuda.get_device_properties(0)
+print(f"Device: {torch.cuda.get_device_name()}")
+print(f"VRAM: {gpu_props.total_memory / 1e9:.1f} GB")
+# ==============================================================================
+# CELL 5: Baseline Evaluation (BEFORE training)
+# ==============================================================================
+# Smoke-test prompts as (prompt, expected substring, category) rows.
+# A prompt passes when the expected substring appears in the model's answer.
+_EVAL_ROWS = [
+    # Soussou grammar (should learn)
+    ("How do you say 'I am going' in Soussou?", "sigafe", "soussou"),
+    ("Translate to Soussou: 'We are eating'", "donsefe", "soussou"),
+    ("What are the Soussou pronouns?", "n", "soussou"),
+    ("How do you say 'he came' in Soussou?", "faxi", "soussou"),
+    ("What is the Soussou future tense marker?", "fama", "soussou"),
+    # Code-switching (should learn)
+    ("How would a Guinean say 'I'm going to the market'?", "marché", "code-switch"),
+    ("N na sigafe école ra — what does this mean?", "school", "code-switch"),
+    # French retention (should keep)
+    ("Explique-moi ce qu'est l'intelligence artificielle.", "artificielle", "french"),
+    ("Bonjour, comment vas-tu?", "bien", "french"),
+    # English retention (should keep)
+    ("What is machine learning?", "data", "english"),
+    ("Explain what a neural network does.", "network", "english"),
+    # Identity
+    ("I khili mun di?", "Guinius", "identity"),
+    # Language mirroring
+    ("Apprends-moi le soussou!", "Soussou", "mirror"),
+    ("Teach me Soussou!", "Soussou", "mirror"),
+]
+EVAL_PROMPTS = [
+    {"prompt": prompt, "expected": expected, "cat": cat}
+    for prompt, expected, cat in _EVAL_ROWS
+]
+def evaluate(model, tokenizer, label=""):
+    """Generate an answer for every entry of EVAL_PROMPTS and score it.
+
+    A prompt is a hit when its ``expected`` string occurs (case-insensitively)
+    in the generated answer once ``<think>`` reasoning has been removed.
+
+    Args:
+        model: Model passed to ``FastLanguageModel.for_inference`` and used
+            for ``generate``.
+        tokenizer: Tokenizer providing ``apply_chat_template`` and ``decode``.
+        label: Heading printed above the per-prompt results.
+
+    Returns:
+        dict with "total", "hits" and "by_cat" (category ->
+        {"hits": int, "total": int}).
+    """
+    import re
+    FastLanguageModel.for_inference(model)
+    SYSTEM = "I khili Guinius, DA AI. N kelixi Soussou, Français, English."
+    # Matches a complete <think>...</think> block AND a block that was cut off
+    # by max_new_tokens before </think>; otherwise truncated reasoning text
+    # would be scored as if it were the answer.
+    think_block = re.compile(r'<think>.*?(?:</think>|\Z)', flags=re.DOTALL)
+    results = {"total": 0, "hits": 0, "by_cat": {}}
+    print(f"\n{'='*60}")
+    print(f"  EVALUATION: {label}")
+    print(f"{'='*60}")
+    for ep in EVAL_PROMPTS:
+        messages = [
+            {"role": "system", "content": SYSTEM},
+            {"role": "user", "content": ep["prompt"]},
+        ]
+        inputs = tokenizer.apply_chat_template(
+            messages, tokenize=True, add_generation_prompt=True, return_tensors="pt"
+        ).to("cuda")
+        # NOTE: sampling is enabled, so scores can differ between runs.
+        with torch.no_grad():
+            outputs = model.generate(
+                input_ids=inputs,
+                max_new_tokens=150,
+                temperature=0.6,
+                top_p=0.9,
+                do_sample=True,
+            )
+        # Decode only the newly generated tokens, not the prompt.
+        response = tokenizer.decode(outputs[0][inputs.shape[1]:], skip_special_tokens=True)
+        response = think_block.sub('', response).strip()
+        hit = ep["expected"].lower() in response.lower()
+        cat_stats = results["by_cat"].setdefault(ep["cat"], {"hits": 0, "total": 0})
+        results["total"] += 1
+        results["hits"] += int(hit)
+        cat_stats["total"] += 1
+        cat_stats["hits"] += int(hit)
+        status = "PASS" if hit else "FAIL"
+        print(f"  [{status}] {ep['prompt']}")
+        print(f"         -> {response[:200]}")
+    # Summary
+    print(f"\n  SCORE: {results['hits']}/{results['total']} = {results['hits']/max(results['total'],1)*100:.0f}%")
+    for cat, s in results["by_cat"].items():
+        print(f"    {cat:15s}: {s['hits']}/{s['total']}")
+    return results
+
+
+# Score the untouched base model so the post-training run has a reference.
+baseline = evaluate(model, tokenizer, "BASELINE (before training)")
+# ==============================================================================
+# CELL 6: Apply LoRA
+# ==============================================================================
+# Wrap the model with LoRA adapters on every attention and MLP projection.
+model = FastLanguageModel.get_peft_model(
+    model,
+    r=LORA_R,
+    lora_alpha=LORA_ALPHA,
+    lora_dropout=0,
+    bias="none",
+    target_modules=[
+        # Attention
+        "q_proj", "k_proj", "v_proj", "o_proj",
+        # MLP
+        "gate_proj", "up_proj", "down_proj",
+    ],
+    use_gradient_checkpointing="unsloth",
+    random_state=42,
+)
+# Count trainable vs. total parameters in one sweep.
+trainable = 0
+total = 0
+for param in model.parameters():
+    size = param.numel()
+    total += size
+    if param.requires_grad:
+        trainable += size
+print(f"LoRA applied: {trainable:,} trainable / {total:,} total = {trainable/total*100:.2f}%")
+# ==============================================================================
+# CELL 7: Prepare Dataset
+# ==============================================================================
+from datasets import load_dataset
+# Read the JSONL curriculum as a single "train" split.
+dataset = load_dataset(
+    "json",
+    split="train",
+    data_files=DATASET_FILE,
+)
+print(f"Loaded: {len(dataset)} examples")
+def format_chatml(example):
+    """Render one example's messages as ChatML training text for SFTTrainer.
+
+    Args:
+        example: Mapping with a "messages" list; every message has "role"
+            and "content" strings.
+
+    Returns:
+        ``{"text": ...}`` where the text is the concatenation of
+        ``<|im_start|>{role}\\n{content}<|im_end|>\\n`` for each message.
+    """
+    turns = [
+        f"<|im_start|>{msg['role']}\n{msg['content']}<|im_end|>\n"
+        for msg in example["messages"]
+    ]
+    # No trailing "<|im_start|>assistant\n" here: that generation prompt is an
+    # inference-time cue. Appended to training text it teaches the model to
+    # open another assistant turn right after finishing its answer.
+    return {"text": "".join(turns)}
+# Add the "text" column, then measure how long each example tokenizes.
+dataset = dataset.map(format_chatml, num_proc=2)
+# Token length distribution
+lengths = [
+    tokenizer(row["text"], return_length=True)["length"][0]
+    for row in dataset
+]
+shortest = min(lengths)
+longest = max(lengths)
+median = sorted(lengths)[len(lengths) // 2]
+print(f"Token lengths: min={shortest}, median={median}, max={longest}")
+fits = "YES" if longest <= MAX_SEQ_LEN else "NO — increase MAX_SEQ_LEN!"
+print(f"All fit in {MAX_SEQ_LEN}? {fits}")
+# ==============================================================================
+# CELL 8: Train
+# ==============================================================================
+from trl import SFTTrainer
+from transformers import TrainingArguments
+# Optimizer / schedule settings, kept separate from the trainer wiring.
+training_args = TrainingArguments(
+    output_dir="outputs",
+    num_train_epochs=EPOCHS,
+    per_device_train_batch_size=BATCH_SIZE,
+    gradient_accumulation_steps=GRAD_ACCUM,
+    learning_rate=LR,
+    lr_scheduler_type="cosine",
+    warmup_steps=WARMUP_STEPS,
+    weight_decay=0.01,
+    optim="adamw_8bit",
+    bf16=True,
+    logging_steps=10,
+    seed=42,
+    report_to="none",
+)
+trainer = SFTTrainer(
+    model=model,
+    tokenizer=tokenizer,
+    train_dataset=dataset,
+    dataset_text_field="text",
+    max_seq_length=MAX_SEQ_LEN,
+    dataset_num_proc=2,
+    packing=False,
+    args=training_args,
+)
+# Rough step count: whole effective batches per epoch, times epochs.
+steps_per_epoch = len(dataset) // (BATCH_SIZE * GRAD_ACCUM)
+total_steps = steps_per_epoch * EPOCHS
+print(f"\nStarting training...")
+print(f"  {len(dataset)} examples x {EPOCHS} epochs = {len(dataset)*EPOCHS} passes")
+print(f"  ~{total_steps} optimization steps")
+print(f"  Estimated time: ~5-15 min on A100")
+stats = trainer.train()
+run_metrics = stats.metrics
+print(f"\nTraining complete!")
+print(f"  Final loss: {stats.training_loss:.4f}")
+print(f"  Runtime: {run_metrics['train_runtime']:.0f}s")
+print(f"  Samples/sec: {run_metrics['train_samples_per_second']:.1f}")
+# ==============================================================================
+# CELL 9: Post-Training Evaluation
+# ==============================================================================
+# Re-run the same prompts on the fine-tuned model and diff against baseline.
+post_train = evaluate(model, tokenizer, "AFTER TRAINING")
+# Compare
+banner = "=" * 60
+print(f"\n{banner}")
+print("  BEFORE vs AFTER")
+print(banner)
+print(f"  Baseline: {baseline['hits']}/{baseline['total']}")
+print(f"  Trained:  {post_train['hits']}/{post_train['total']}")
+for cat, before in baseline["by_cat"].items():
+    after = post_train["by_cat"].get(cat, {"hits": 0, "total": 0})
+    delta = after["hits"] - before["hits"]
+    # "=" when unchanged, otherwise the signed difference (+N / -N).
+    change = "=" if delta == 0 else f"{delta:+d}"
+    print(f"    {cat:15s}: {before['hits']}/{before['total']} -> {after['hits']}/{after['total']} {change}")
+# ==============================================================================
+# CELL 10: Merge LoRA into Base Model
+# ==============================================================================
+# Merge flow: save the adapter, drop the training objects, reload a clean
+# base model on CPU, apply the saved adapter, merge, and save the result.
+print("Merging LoRA into base weights...")
+# Save LoRA adapter first
+LORA_DIR = "guinius-lora"
+model.save_pretrained(LORA_DIR)
+tokenizer.save_pretrained(LORA_DIR)
+print(f"LoRA adapter saved: {LORA_DIR}/")
+# Free GPU memory
+# After this point `model` and `trainer` no longer exist; the adapter on disk
+# is the only carrier of the training result.
+del model, trainer
+torch.cuda.empty_cache()
+# Merge on CPU
+from transformers import AutoModelForCausalLM, AutoTokenizer
+from peft import PeftModel
+print("Loading base model on CPU...")
+# NOTE(review): the base is reloaded as float16 here, while training loaded it
+# as bfloat16 — confirm the dtype change is intended for the export.
+base_model = AutoModelForCausalLM.from_pretrained(
+    BASE_MODEL,
+    torch_dtype=torch.float16,
+    device_map="cpu",
+)
+# The tokenizer saved with the merged model is a fresh copy from BASE_MODEL,
+# not the `tokenizer` object used during training.
+base_tok = AutoTokenizer.from_pretrained(BASE_MODEL)
+print("Applying LoRA adapter...")
+model_with_lora = PeftModel.from_pretrained(base_model, LORA_DIR)
+print("Merging weights...")
+merged = model_with_lora.merge_and_unload()
+# MERGED_DIR is the input of the MLC conversion cell further down.
+MERGED_DIR = "guinius-merged"
+merged.save_pretrained(MERGED_DIR)
+base_tok.save_pretrained(MERGED_DIR)
+print(f"Merged model saved: {MERGED_DIR}/")
+# Drop the in-memory copies; the merged model now lives in MERGED_DIR.
+del base_model, model_with_lora, merged
+torch.cuda.empty_cache()
+# ==============================================================================
+# CELL 11: Install MLC-LLM (for WebLLM-ready output)
+# ==============================================================================
+# Install the MLC nightly wheels (CUDA 12.4 builds) from the MLC wheel index.
+print("Installing MLC-LLM for direct WebLLM export...")
+mlc_pip_cmd = [
+    sys.executable, "-m", "pip", "install", "-q",
+    "--pre",
+    "-f", "https://mlc.ai/wheels",
+    "mlc-ai-nightly-cu124",
+    "mlc-llm-nightly-cu124",
+]
+subprocess.check_call(mlc_pip_cmd)
+print("MLC-LLM installed.")
+# ==============================================================================
+# CELL 12: Convert to MLC (WebLLM-ready)
+# ==============================================================================
+# Convert the merged HuggingFace checkpoint (MERGED_DIR) into an MLC model
+# directory (MLC_DIR): quantized weights plus the MLC chat config.
+MLC_DIR = "DA-MLC"
+# Both MLC steps must use the same quantization mode.
+MLC_QUANTIZATION = "q4f16_1"
+print(f"Converting merged model → MLC format...")
+print(f"  Input: {MERGED_DIR}/")
+print(f"  Output: {MLC_DIR}/")
+# Step 1: Convert weights to q4f16_1 quantization
+subprocess.run([
+    sys.executable, "-m", "mlc_llm", "convert_weight", MERGED_DIR,
+    "--quantization", MLC_QUANTIZATION,
+    "--output", MLC_DIR,
+], check=True)
+print("Weights converted.")
+# Step 2: Generate MLC config
+# gen_config reads the HuggingFace config.json (and tokenizer files), which
+# live in MERGED_DIR. MLC_DIR only holds the converted weights at this point,
+# so it is the output here, never the input.
+subprocess.run([
+    sys.executable, "-m", "mlc_llm", "gen_config", MERGED_DIR,
+    "--quantization", MLC_QUANTIZATION,
+    "--conv-template", "chatml",
+    "--context-window-size", "2048",
+    "--output", MLC_DIR,
+], check=True)
+print("Config generated.")
+# Show output
+total_size = 0
+for entry in os.listdir(MLC_DIR):
+    fpath = os.path.join(MLC_DIR, entry)
+    if os.path.isfile(fpath):
+        size_mb = os.path.getsize(fpath) / 1e6
+        total_size += size_mb
+        print(f"  {entry}: {size_mb:.1f} MB")
+print(f"  TOTAL: {total_size:.0f} MB")
+print(f"\nMLC conversion complete! WebLLM can load this directly.")
+# ==============================================================================
+# CELL 13: Upload to HuggingFace → WebLLM loads it
+# ==============================================================================
+from huggingface_hub import HfApi, login
+# Login — paste your HF token when prompted
+# Authenticate: use HF_TOKEN from the environment, otherwise prompt for it.
+token = os.environ.get("HF_TOKEN")
+if not token:
+    print("Paste your HuggingFace token:")
+    login()
+else:
+    login(token=token)
+api = HfApi()
+print(f"\nUploading MLC model to {HF_REPO}...")
+api.upload_folder(
+    repo_id=HF_REPO,
+    folder_path=MLC_DIR,
+    commit_message="Guinius DA v4 — Soussou curriculum (10,869 GT-verified + native-validated examples)",
+    # Clean old files
+    delete_patterns=["*.bin", "*.safetensors", "*.gguf"],
+)
+# Closing summary, printed line by line.
+banner = "=" * 60
+for summary_line in (
+    f"\n{banner}",
+    "  DONE — WebLLM READY",
+    banner,
+    "  Model: Qwen3-0.6B + Soussou curriculum v4 (10,869 examples)",
+    f"  HuggingFace: https://huggingface.co/{HF_REPO}",
+    "  WebLLM WASM: Qwen3-0.6B (same architecture, reuse existing)",
+    "",
+    "  Open guinius.dasuperhub.com — it loads from HuggingFace automatically.",
+    "  No GGUF. No local conversion. Direct to browser.",
+):
+    print(summary_line)