Day 3 COMPLETE: Full model architecture

Files added:
- src/model/architecture.py (Qwen2.5-Coder-7B + LoRA)
- src/model/vision_encoder.py (CLIP ViT-L/14, 256 tokens)
- src/model/fusion_layer.py (Linear+LayerNorm prepend)
- src/model/mindi_model.py (MINDI15 complete model)
- src/training/mindi_trainer.py (3-phase MI300X trainer)
- scripts/train.py (master training script)
- configs/training_config.yaml (MI300X config)
- setup_mi300x.sh (MI300X setup script)
- scripts/quality_filter.py, split_data.py, data_stats.py
- scripts/upload_everything_to_hf.py

Model: Qwen2.5-Coder-7B + LoRA + CLIP Vision
Ready for MI300X training!

Files changed (14) hide show

configs/training_config.yaml +96 -46
data/processed/filter_report.json +54 -0
data/processed/split_meta.json +66 -0
scripts/data_stats.py +273 -0
scripts/quality_filter.py +472 -0
scripts/split_data.py +207 -0
scripts/train.py +326 -17
scripts/upload_everything_to_hf.py +354 -0
setup_mi300x.sh +145 -0
src/model/architecture.py +289 -0
src/model/fusion_layer.py +221 -0
src/model/mindi_model.py +620 -0
src/model/vision_encoder.py +196 -37
src/training/mindi_trainer.py +745 -0

configs/training_config.yaml CHANGED Viewed

@@ -1,57 +1,107 @@
 # ==========================================
 # MINDI 1.5 Vision-Coder — Training Configuration
 # ==========================================
 training:
-  # Hardware targets
-  local_device: "cuda"          # RTX 4060 8GB — for dev/testing only
-  cloud_device: "cuda"          # MI300X 192GB — for actual training
-  precision: "bf16"
-  # Hyperparameters
-  epochs: 3
-  batch_size: 4
-  gradient_accumulation_steps: 8
-  effective_batch_size: 32      # batch_size * grad_accum
-  learning_rate: 2.0e-4
-  weight_decay: 0.01
-  warmup_ratio: 0.03
-  lr_scheduler: "cosine"
   max_grad_norm: 1.0
-  # Sequence settings
-  max_seq_length: 8192
-  packing: true                 # Pack short examples together
-  # Checkpointing
-  save_strategy: "steps"
-  save_steps: 500
-  save_total_limit: 5
-  checkpoint_dir: "./checkpoints"
-  resume_from_checkpoint: null
-  # Logging
-  logging_steps: 10
-  log_dir: "./logs/training"
-  report_to: "wandb"
-  # Evaluation
-  eval_strategy: "steps"
-  eval_steps: 250
-  eval_samples: 1000
-  # Memory optimization (for RTX 4060 local testing)
-  local_overrides:
-    batch_size: 1
-    gradient_accumulation_steps: 16
-    max_seq_length: 2048
-    gradient_checkpointing: true
-    optim: "adamw_8bit"
-wandb:
-  project: "mindi-1.5-vision-coder"
-  entity: "mindigenous"
   tags:
     - "mindi-1.5"
     - "lora"
     - "vision-coder"

 # ==========================================
 # MINDI 1.5 Vision-Coder — Training Configuration
+# Optimized for AMD MI300X 192GB VRAM
 # ==========================================
+# ── Model ──────────────────────────────────────────────────────
+model:
+  name: "Qwen/Qwen2.5-Coder-7B-Instruct"
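+  hidden_size: 4096               # NOTE(review): the published Qwen2.5-7B config lists hidden_size 3584 — verify (vision.projection_size must match)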
+  dtype: "bf16"                   # bf16 required for MI300X stability (NOT fp16)
+  use_compile: true               # torch.compile() works on ROCm
+  gradient_checkpointing: true    # Save VRAM even with 192GB
+# ── LoRA ───────────────────────────────────────────────────────
+lora:
+  r: 64
+  alpha: 128
+  dropout: 0.05
+  bias: "none"
+  task_type: "CAUSAL_LM"
+  target_modules:
+    - q_proj
+    - k_proj
+    - v_proj
+    - o_proj
+    - gate_proj
+    - up_proj
+    - down_proj
+# ── Vision ─────────────────────────────────────────────────────
+vision:
+  clip_model: "openai/clip-vit-large-patch14"
+  visual_tokens: 256              # 16×16 patches from ViT-L/14
+  projection_size: 4096           # Must match model.hidden_size
+  freeze_clip: true               # Freeze CLIP backbone
+# ── Training Phases ────────────────────────────────────────────
 training:
+  # Phase 1: LoRA only — teach coding patterns
+  phase1:
+    steps: 5000
+    lr: 2.0e-4
+    batch_size: 16                # MI300X can handle large batches
+    warmup_steps: 100
+    data_filter: "code_only"
+  # Phase 2: Vision bridge only — align visual tokens
+  phase2:
+    steps: 2500
+    lr: 1.0e-5
+    batch_size: 8                 # Smaller batch for vision bridge
+    warmup_steps: 50
+    data_filter: "websight_only"
+  # Phase 3: All trainable — joint fine-tuning
+  phase3:
+    steps: 2500
+    lr: 5.0e-5
+    batch_size: 12
+    warmup_steps: 50
+    data_filter: "all"
+  # Shared training settings
+  grad_accumulation: 4
   max_grad_norm: 1.0
+  eval_every: 250
+  save_every: 500
+# ── Data ───────────────────────────────────────────────────────
+data:
+  train_file: "data/processed/train.jsonl"     # 4.18GB, 1,304,486 examples
+  val_file: "data/processed/val.jsonl"         # 0.23GB, 72,471 examples
+  max_length: 4096
+  shuffle_buffer: 10000           # Streaming shuffle buffer size
+  num_workers: 4                  # DataLoader workers
+  pin_memory: true
+  prefetch_factor: 2
+# ── Logging ────────────────────────────────────────────────────
+logging:
+  wandb_project: "mindi-1.5-vision-coder"
+  wandb_entity: "mindigenous"
+  log_every: 10                   # Log metrics every N steps
+  log_dir: "logs/training"
+  sample_every: 500               # Generate sample outputs every N steps
   tags:
     - "mindi-1.5"
     - "lora"
     - "vision-coder"
+    - "mi300x"
+# ── Output ─────────────────────────────────────────────────────
+output:
+  checkpoint_dir: "checkpoints/training"
+  best_model: "checkpoints/best"
+  hf_repo: "Mindigenous/MINDI-1.5-Vision-Coder"
+  push_every_phase: true
+# ── Local Dev Overrides (RTX 4060 8GB) ────────────────────────
+# Apply these when testing locally with --dry_run
+local_overrides:
+  batch_size: 1
+  gradient_accumulation_steps: 16
+  max_length: 2048
+  gradient_checkpointing: true
+  use_compile: false
+  num_workers: 0

data/processed/filter_report.json ADDED Viewed

	@@ -0,0 +1,54 @@

+{
+  "input_count": 1481497,
+  "kept_count": 1449428,
+  "rejected_count": 32069,
+  "kept_pct": 97.84,
+  "avg_tokens": 593.1,
+  "avg_quality": 6.487,
+  "total_tokens": 859694776,
+  "elapsed_seconds": 1023.6,
+  "filter_settings": {
+    "min_tokens": 50,
+    "max_tokens": 4096,
+    "min_quality": 5.0
+  },
+  "rejection_breakdown": {
+    "too_many_tokens": 30637,
+    "boilerplate_content": 1373,
+    "duplicate_content": 59
+  },
+  "source_kept": {
+    "codealpaca": 59241,
+    "codefeedback": 149865,
+    "websight": 250987,
+    "synthetic_nextjs": 90000,
+    "search_examples": 15000,
+    "sandbox_examples": 9000,
+    "starcoderdata": 569350,
+    "evol_code": 155998,
+    "magicoder": 149987
+  },
+  "source_rejected": {
+    "codealpaca": 741,
+    "codefeedback": 132,
+    "starcoderdata": 30650,
+    "evol_code": 518,
+    "magicoder": 13,
+    "websight": 15
+  },
+  "type_distribution": {
+    "code_generation": 1183441,
+    "vision_code": 250987,
+    "search": 15000
+  },
+  "language_distribution": {
+    "unknown": 490305,
+    "typescript": 375859,
+    "javascript": 298497,
+    "python": 211842,
+    "html": 36371,
+    "java": 32458,
+    "rust": 3709,
+    "go": 387
+  }
+}

data/processed/split_meta.json ADDED Viewed

	@@ -0,0 +1,66 @@

+{
+  "total": 1449428,
+  "train_count": 1304486,
+  "val_count": 72471,
+  "test_count": 72471,
+  "train_pct": 90.0,
+  "val_pct": 5.0,
+  "test_pct": 5.0,
+  "seed": 42,
+  "source_breakdown": {
+    "codealpaca": {
+      "total": 59241,
+      "train": 53317,
+      "val": 2962,
+      "test": 2962
+    },
+    "codefeedback": {
+      "total": 149865,
+      "train": 134879,
+      "val": 7493,
+      "test": 7493
+    },
+    "evol_code": {
+      "total": 155998,
+      "train": 140398,
+      "val": 7800,
+      "test": 7800
+    },
+    "magicoder": {
+      "total": 149987,
+      "train": 134989,
+      "val": 7499,
+      "test": 7499
+    },
+    "sandbox_examples": {
+      "total": 9000,
+      "train": 8100,
+      "val": 450,
+      "test": 450
+    },
+    "search_examples": {
+      "total": 15000,
+      "train": 13500,
+      "val": 750,
+      "test": 750
+    },
+    "starcoderdata": {
+      "total": 569350,
+      "train": 512414,
+      "val": 28468,
+      "test": 28468
+    },
+    "synthetic_nextjs": {
+      "total": 90000,
+      "train": 81000,
+      "val": 4500,
+      "test": 4500
+    },
+    "websight": {
+      "total": 250987,
+      "train": 225889,
+      "val": 12549,
+      "test": 12549
+    }
+  }
+}

scripts/data_stats.py ADDED Viewed

	@@ -0,0 +1,273 @@

+#!/usr/bin/env python3
+"""
+MINDI 1.5 Vision-Coder — Dataset Statistics Report
+Generates comprehensive statistics for the final train/val/test splits:
+  - Total counts and sizes
+  - Token distribution (min, max, mean, median, p95, p99)
+  - Quality score distribution
+  - Source breakdown
+  - Type breakdown
+  - Language breakdown
+  - Special token usage
+Usage:
+    python scripts/data_stats.py                  # Full report
+    python scripts/data_stats.py --split train    # Stats for train only
+"""
+from __future__ import annotations
+import argparse
+import json
+import statistics
+import sys
+import time
+from collections import Counter
+from pathlib import Path
# ── Paths ─────────────────────────────────────────────────────────────
# The project root is two directory levels above this script.
PROJECT_ROOT = Path(__file__).resolve().parent.parent
PROCESSED_DIR = PROJECT_ROOT.joinpath("data", "processed")
# One JSONL file per split, all stored in the processed-data directory.
SPLIT_FILES = {
    split_name: PROCESSED_DIR / f"{split_name}.jsonl"
    for split_name in ("train", "val", "test")
}
REPORT_FILE = PROCESSED_DIR / "dataset_stats.json"
# ── Special tokens to check ──────────────────────────────────────────
# Each tag contributes a <|tag_start|> marker immediately followed by its
# <|tag_end|> marker, in the tag order given below.
SPECIAL_TOKENS = [
    f"<|{tag}_{edge}|>"
    for tag in (
        "think",
        "code",
        "critique",
        "suggest",
        "file",
        "search",
        "sandbox",
        "vision",
        "error",
        "fix",
    )
    for edge in ("start", "end")
]
def percentile(sorted_data: list[int | float], p: float) -> float:
    """Return the p-th percentile of already-sorted data.

    Uses linear interpolation between the two closest ranks.

    Args:
        sorted_data: Values in ascending order. The list is not sorted here.
        p: Percentile to compute, between 0 and 100 inclusive.

    Returns:
        The interpolated percentile as a float, or 0.0 for empty input.

    Raises:
        ValueError: If ``p`` is outside [0, 100] (including NaN).
    """
    # Without this guard p > 100 indexes past the end, and p < 0 silently
    # reads from the wrong end of the list via negative indexing.
    if not 0.0 <= p <= 100.0:
        raise ValueError(f"percentile must be within [0, 100], got {p!r}")
    if not sorted_data:
        return 0.0
    rank = (len(sorted_data) - 1) * (p / 100.0)
    lower = int(rank)
    upper = lower + 1
    if upper >= len(sorted_data):
        # rank points at the last element, so there is nothing to interpolate.
        return float(sorted_data[lower])
    low_value = sorted_data[lower]
    return float(low_value + (rank - lower) * (sorted_data[upper] - low_value))
def compute_stats(file_path: Path, split_name: str) -> dict:
    """Compute statistics for a single JSONL split file.

    Blank lines, lines that are not valid JSON, and JSON values that are not
    objects are skipped and not counted.

    Args:
        file_path: Path of the JSONL split file (one example per line).
        split_name: Label stored under ``"split"`` in the result.

    Returns:
        ``{"error": ...}`` if the file does not exist. Otherwise a dict with
        the example count, token and quality summaries, and the source, type,
        language, framework, messages-per-example and special-token
        distributions. Special-token counts are the number of *examples*
        whose assistant content contains the token.
    """
    if not file_path.exists():
        return {"error": f"File not found: {file_path}"}
    tokens_list: list[int] = []
    quality_list: list[float] = []
    source_counts: Counter = Counter()
    type_counts: Counter = Counter()
    lang_counts: Counter = Counter()
    framework_counts: Counter = Counter()
    has_vision_count = 0
    special_token_counts: Counter = Counter()
    msg_count_dist: Counter = Counter()  # number of messages per example
    total_chars = 0
    count = 0
    with open(file_path, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            try:
                ex = json.loads(line)
            except json.JSONDecodeError:
                continue
            if not isinstance(ex, dict):
                # Valid JSON but not an example object (e.g. a bare list).
                continue
            count += 1
            meta = ex.get("metadata", {})
            if not isinstance(meta, dict):
                meta = {}
            # Token count and quality score
            tokens_list.append(meta.get("tokens", 0))
            quality_list.append(meta.get("quality_score", 0.0))
            # Source, type, language, framework
            source_counts[ex.get("source", "unknown")] += 1
            type_counts[ex.get("type", "unknown")] += 1
            lang_counts[meta.get("language", "unknown")] += 1
            framework_counts[meta.get("framework", "none")] += 1
            # Vision
            if meta.get("has_vision", False):
                has_vision_count += 1
            # Messages
            messages = ex.get("messages", [])
            if not isinstance(messages, list):
                messages = []
            msg_count_dist[len(messages)] += 1
            # Gather the assistant text of this example first, so each special
            # token is counted at most once per example. Counting per message
            # would inflate the "examples containing token" percentages for
            # multi-turn examples.
            assistant_texts = []
            for msg in messages:
                if not isinstance(msg, dict) or msg.get("role") != "assistant":
                    continue
                content = msg.get("content", "")
                if not isinstance(content, str):
                    continue
                total_chars += len(content)
                assistant_texts.append(content)
            for tok in SPECIAL_TOKENS:
                if any(tok in text for text in assistant_texts):
                    special_token_counts[tok] += 1
    # Sort once; min/max are then the first and last elements.
    tokens_sorted = sorted(tokens_list)
    quality_sorted = sorted(quality_list)
    file_size_mb = file_path.stat().st_size / (1024 * 1024)
    stats = {
        "split": split_name,
        "file": file_path.name,
        "file_size_mb": round(file_size_mb, 1),
        "count": count,
        "total_tokens": sum(tokens_list),
        "total_chars_assistant": total_chars,
        "has_vision": has_vision_count,
        "tokens": {
            "min": tokens_sorted[0] if tokens_sorted else 0,
            "max": tokens_sorted[-1] if tokens_sorted else 0,
            "mean": round(statistics.mean(tokens_list), 1) if tokens_list else 0,
            "median": round(statistics.median(tokens_list), 1) if tokens_list else 0,
            # statistics.stdev needs at least two data points.
            "stdev": round(statistics.stdev(tokens_list), 1) if len(tokens_list) > 1 else 0,
            "p5": round(percentile(tokens_sorted, 5), 1),
            "p25": round(percentile(tokens_sorted, 25), 1),
            "p75": round(percentile(tokens_sorted, 75), 1),
            "p95": round(percentile(tokens_sorted, 95), 1),
            "p99": round(percentile(tokens_sorted, 99), 1),
        },
        "quality_score": {
            "min": round(quality_sorted[0], 2) if quality_sorted else 0,
            "max": round(quality_sorted[-1], 2) if quality_sorted else 0,
            "mean": round(statistics.mean(quality_list), 2) if quality_list else 0,
            "median": round(statistics.median(quality_list), 2) if quality_list else 0,
        },
        "source_distribution": dict(source_counts.most_common()),
        "type_distribution": dict(type_counts.most_common()),
        "language_distribution": dict(lang_counts.most_common(30)),
        "framework_distribution": dict(framework_counts.most_common(15)),
        "messages_per_example": dict(sorted(msg_count_dist.items())),
        "special_token_usage": dict(special_token_counts.most_common()),
    }
    return stats
def print_stats(stats: dict) -> None:
    """Write a human-readable report for one split to stdout.

    ``stats`` is a dict shaped like the return value of ``compute_stats``.
    When it carries an ``"error"`` key, only that message is printed.
    """

    def show_breakdown(heading: str, breakdown: dict, limit: int | None = None) -> None:
        # Print a heading, one "label count (share%)" row per entry
        # (optionally only the first `limit` entries), then a blank line.
        print(heading)
        rows = list(breakdown.items())
        if limit is not None:
            rows = rows[:limit]
        for label, n in rows:
            share = n / stats["count"] * 100
            print(f"    {label:<25s} {n:>10,} ({share:5.1f}%)")
        print()

    if "error" in stats:
        print(f"  ERROR: {stats['error']}")
        return

    # Header
    print(f"  Split: {stats['split']}")
    print(f"  File:  {stats['file']} ({stats['file_size_mb']:.1f} MB)")
    print(f"  Count: {stats['count']:,}")
    print(f"  Total tokens: {stats['total_tokens']:,}")
    print(f"  Vision examples: {stats['has_vision']:,}")
    print()

    # Token-length summary
    tok = stats["tokens"]
    print("  Token distribution:")
    print(f"    Min:    {tok['min']:>8,}    P5:     {tok['p5']:>8,.0f}")
    print(f"    P25:    {tok['p25']:>8,.0f}    Median: {tok['median']:>8,.0f}")
    print(f"    Mean:   {tok['mean']:>8,.0f}    P75:    {tok['p75']:>8,.0f}")
    print(f"    P95:    {tok['p95']:>8,.0f}    P99:    {tok['p99']:>8,.0f}")
    print(f"    Max:    {tok['max']:>8,}    Stdev:  {tok['stdev']:>8,.0f}")
    print()

    # Quality summary
    qual = stats["quality_score"]
    print(
        f"  Quality score: min={qual['min']:.1f}  mean={qual['mean']:.1f}"
        f"  median={qual['median']:.1f}  max={qual['max']:.1f}"
    )
    print()

    # Categorical breakdowns
    show_breakdown("  Source distribution:", stats["source_distribution"])
    show_breakdown("  Type distribution:", stats["type_distribution"], limit=10)
    show_breakdown("  Language distribution (top 15):", stats["language_distribution"], limit=15)
    # The special-token section is omitted entirely when no token was seen.
    if stats["special_token_usage"]:
        show_breakdown(
            "  Special token usage (examples containing token):",
            stats["special_token_usage"],
        )
def run_stats(split: str | None = None) -> None:
    """Compute, print and save statistics for one split or for all splits.

    Args:
        split: Name of a single split to analyse. When falsy, every entry of
            ``SPLIT_FILES`` is processed. An unknown name prints an error and
            exits the process with status 1.

    The collected statistics are written as JSON to ``REPORT_FILE``; the file
    contains only the splits processed in this run.
    """
    started_at = time.time()
    if not split:
        targets = SPLIT_FILES
    else:
        chosen_path = SPLIT_FILES.get(split)
        if chosen_path is None:
            print(f"ERROR: Unknown split '{split}'. Choose from: {list(SPLIT_FILES.keys())}")
            sys.exit(1)
        targets = {split: chosen_path}
    banner = "=" * 60
    report = {}
    for split_label, split_path in targets.items():
        print(banner)
        print(f"  Computing stats for: {split_label}")
        print(banner)
        split_stats = compute_stats(split_path, split_label)
        report[split_label] = split_stats
        print_stats(split_stats)
    # Save JSON report
    REPORT_FILE.parent.mkdir(parents=True, exist_ok=True)
    with open(REPORT_FILE, "w", encoding="utf-8") as report_fh:
        json.dump(report, report_fh, indent=2)
    print(f"Full report saved to: {REPORT_FILE.name}")
    print(f"Stats generated in {time.time() - started_at:.1f}s")
+# ── CLI ───────────────────────────────────────────────────────────────
def main():
    """Parse the command line and run the statistics report."""
    cli = argparse.ArgumentParser(
        description="MINDI Dataset Statistics — comprehensive split analysis",
    )
    cli.add_argument(
        "--split",
        type=str,
        choices=["train", "val", "test"],
        help="Compute stats for a single split only",
    )
    options = cli.parse_args()
    run_stats(split=options.split)


if __name__ == "__main__":
    main()

scripts/quality_filter.py ADDED Viewed

	@@ -0,0 +1,472 @@

+#!/usr/bin/env python3
+"""
+MINDI 1.5 Vision-Coder — Quality Filter Pipeline
+Filters mindi_all.jsonl to remove low-quality examples:
+  1. Token length filter   — drop if <50 tokens or >4096 tokens
+  2. Duplicate detection   — SHA-256 hash of assistant content
+  3. JSON structure check  — valid schema with required fields
+  4. Special token check   — assistant must have code_start/code_end pair
+  5. Quality score filter  — keep only quality_score >= 5.0
+  6. Content heuristics    — drop empty/trivial/boilerplate responses
+Usage:
+    python scripts/quality_filter.py                  # Full run
+    python scripts/quality_filter.py --dry-run        # Preview only
+    python scripts/quality_filter.py --min-tokens 100 # Custom min tokens
+    python scripts/quality_filter.py --max-tokens 8192 # Custom max tokens
+    python scripts/quality_filter.py --min-quality 7.0 # Stricter quality
+"""
+from __future__ import annotations
+import argparse
+import hashlib
+import json
+import sys
+import time
+from collections import Counter, defaultdict
+from pathlib import Path
# ── Paths ─────────────────────────────────────────────────────────────
# The project root is two directory levels above this script.
PROJECT_ROOT = Path(__file__).resolve().parent.parent
INPUT_FILE = PROJECT_ROOT.joinpath("data", "processed", "mindi_all.jsonl")
OUTPUT_FILE = PROJECT_ROOT.joinpath("data", "processed", "mindi_filtered.jsonl")
REJECT_FILE = PROJECT_ROOT.joinpath("data", "processed", "mindi_rejected.jsonl")
REPORT_FILE = PROJECT_ROOT.joinpath("data", "processed", "filter_report.json")
# ── Required schema fields ────────────────────────────────────────────
REQUIRED_FIELDS = {"id", "type", "source", "messages", "metadata"}
REQUIRED_METADATA = {"language", "tokens"}
VALID_ROLES = {"system", "user", "assistant"}
# ── Protected sources (hand-crafted gold data — lighter filtering) ─────
PROTECTED_SOURCES = {"sandbox_examples", "search_examples", "synthetic_nextjs"}
# ── MINDI agentic token scoring bonuses ───────────────────────────────
#   Examples with these tokens teach the model to be an *agent*.
#   A token that appears in the assistant content adds its bonus ONCE to the
#   quality_score before the threshold is applied (presence, not occurrences).
MINDI_TOKEN_BONUSES = {
    f"<|{tag}_start|>": bonus
    for tag, bonus in (
        ("think", 2.0),
        ("search", 3.0),
        ("error", 3.0),
        ("sandbox", 3.0),
        ("critique", 2.0),
        ("suggest", 1.0),
    )
}
# ── Special token pairs that assistant messages should contain ─────────
CODE_TOKEN_PAIRS = [
    ("<|code_start|>", "<|code_end|>"),
]
# Optional pairs: none of them is required, but when one is used its start and
# end markers must occur the same number of times (check_unmatched_tokens).
OPTIONAL_TOKEN_PAIRS = [
    (f"<|{tag}_start|>", f"<|{tag}_end|>")
    for tag in (
        "think",
        "critique",
        "suggest",
        "file",
        "search",
        "sandbox",
        "error",
        "fix",
    )
]
+# ── Rejection reasons ─────────────────────────────────────────────────
class Reason:
    """String codes naming why an example was rejected.

    The values are plain strings; the pipeline uses them as counter keys and
    they appear verbatim in the rejection breakdown of the filter report.
    """

    # Parsing / schema problems
    INVALID_JSON = "invalid_json"
    MISSING_FIELDS = "missing_fields"
    MISSING_METADATA = "missing_metadata"
    NO_MESSAGES = "no_messages"
    BAD_ROLES = "bad_message_roles"

    # Assistant message problems
    NO_ASSISTANT = "no_assistant_message"
    EMPTY_ASSISTANT = "empty_assistant_content"

    # Length limits
    TOO_SHORT = "too_few_tokens"
    TOO_LONG = "too_many_tokens"

    # Content quality
    DUPLICATE = "duplicate_content"
    LOW_QUALITY = "low_quality_score"
    NO_CODE_TOKENS = "missing_code_tokens"
    BOILERPLATE = "boilerplate_content"
    UNMATCHED_TOKENS = "unmatched_special_tokens"
+# ── Filter functions ──────────────────────────────────────────────────
def validate_schema(example: dict) -> str | None:
    """Check required fields and structure.

    Args:
        example: One parsed JSONL record.

    Returns:
        A ``Reason`` code describing the first structural problem found, or
        ``None`` when the record has all required top-level fields, the
        required metadata keys, and a non-empty list of messages that each
        carry a valid ``role`` and a ``content`` key.
    """
    # A line can be valid JSON without being an object (e.g. a list or a
    # number); reject it instead of failing on attribute access.
    if not isinstance(example, dict):
        return Reason.MISSING_FIELDS
    # Top-level fields
    if not REQUIRED_FIELDS.issubset(example):
        return Reason.MISSING_FIELDS
    # Metadata fields
    meta = example.get("metadata", {})
    if not isinstance(meta, dict) or not REQUIRED_METADATA.issubset(meta):
        return Reason.MISSING_METADATA
    # Messages array
    messages = example.get("messages", [])
    if not isinstance(messages, list) or len(messages) == 0:
        return Reason.NO_MESSAGES
    # Role validation
    for msg in messages:
        if not isinstance(msg, dict) or "role" not in msg or "content" not in msg:
            return Reason.BAD_ROLES
        role = msg["role"]
        # The isinstance test comes first: an unhashable role (list/dict)
        # would raise TypeError in the set membership test.
        if not isinstance(role, str) or role not in VALID_ROLES:
            return Reason.BAD_ROLES
    return None
def get_assistant_content(example: dict) -> str:
    """Return the content of all assistant messages joined by newlines.

    Messages that are not dicts, and assistant messages whose ``content`` is
    not a string (for example JSON ``null``), are skipped so a single
    malformed record cannot crash the join. An assistant message without a
    ``content`` key contributes an empty string.

    Args:
        example: One parsed record; its ``"messages"`` entry may be missing
            or ``None``.

    Returns:
        The concatenated assistant text, or ``""`` if there is none.
    """
    parts = []
    for msg in example.get("messages") or []:
        if not isinstance(msg, dict) or msg.get("role") != "assistant":
            continue
        content = msg.get("content", "")
        if isinstance(content, str):
            parts.append(content)
    return "\n".join(parts)
def check_assistant_exists(example: dict) -> str | None:
    """Require assistant content of meaningful length.

    Returns ``Reason.NO_ASSISTANT`` when the combined assistant text is empty,
    ``Reason.EMPTY_ASSISTANT`` when it has fewer than 10 characters after
    stripping whitespace, and ``None`` otherwise.
    """
    assistant_text = get_assistant_content(example)
    if not assistant_text:
        return Reason.NO_ASSISTANT
    return Reason.EMPTY_ASSISTANT if len(assistant_text.strip()) < 10 else None
def check_token_length(example: dict, min_tokens: int, max_tokens: int) -> str | None:
    """Reject examples whose ``metadata["tokens"]`` is outside the bounds.

    A missing token count is treated as 0. Returns ``Reason.TOO_SHORT`` below
    ``min_tokens``, ``Reason.TOO_LONG`` above ``max_tokens``, else ``None``.
    """
    metadata = example.get("metadata", {})
    n_tokens = metadata.get("tokens", 0)
    if n_tokens < min_tokens:
        return Reason.TOO_SHORT
    return Reason.TOO_LONG if n_tokens > max_tokens else None
def compute_mindi_bonus(example: dict) -> float:
    """Sum the bonuses of the MINDI agentic tokens present in the assistant text.

    Each token listed in ``MINDI_TOKEN_BONUSES`` contributes its value once if
    it occurs anywhere in the assistant content, however often it occurs.
    """
    assistant_text = get_assistant_content(example)
    # Start from 0.0 so the result is a float even when nothing matches.
    return sum(
        (value for token, value in MINDI_TOKEN_BONUSES.items() if token in assistant_text),
        0.0,
    )
def check_quality_score(example: dict, min_quality: float) -> str | None:
    """Reject examples whose boosted quality score is below ``min_quality``.

    The score is ``metadata["quality_score"]`` (0.0 when absent) plus the
    bonus from ``compute_mindi_bonus``.
    """
    base_score = example.get("metadata", {}).get("quality_score", 0.0)
    boosted_score = base_score + compute_mindi_bonus(example)
    return Reason.LOW_QUALITY if boosted_score < min_quality else None
def check_code_tokens(example: dict) -> str | None:
    """Require a code start/end marker pair in the assistant content.

    A pair from ``CODE_TOKEN_PAIRS`` is accepted when both markers are present
    and the first start marker comes before the last end marker. Returns
    ``None`` on success and ``Reason.NO_CODE_TOKENS`` otherwise.
    """
    assistant_text = get_assistant_content(example)
    for opener, closer in CODE_TOKEN_PAIRS:
        first_open = assistant_text.find(opener)
        last_close = assistant_text.rfind(closer)
        # find/rfind return -1 when the marker is absent.
        if first_open != -1 and last_close != -1 and first_open < last_close:
            return None
    return Reason.NO_CODE_TOKENS
def check_unmatched_tokens(example: dict) -> str | None:
    """Verify that every special-token pair is balanced.

    For each pair in ``CODE_TOKEN_PAIRS`` and ``OPTIONAL_TOKEN_PAIRS`` the
    start marker must occur exactly as often as the end marker in the
    assistant content. Only counts are compared, not nesting or order.
    Returns ``Reason.UNMATCHED_TOKENS`` on the first mismatch, else ``None``.
    """
    assistant_text = get_assistant_content(example)
    balanced = all(
        assistant_text.count(opener) == assistant_text.count(closer)
        for opener, closer in CODE_TOKEN_PAIRS + OPTIONAL_TOKEN_PAIRS
    )
    return None if balanced else Reason.UNMATCHED_TOKENS
def check_boilerplate(example: dict) -> str | None:
    """Detect boilerplate/placeholder assistant responses.

    Two heuristics are applied to the assistant content:

    1. The first code block (text between the first ``<|code_start|>`` and
       the next ``<|code_end|>`` after it) has fewer than 5 characters once
       stripped.
    2. Ignoring spaces and newlines, the lower-cased content is longer than
       20 characters but uses fewer than 5 distinct characters.

    Returns:
        ``Reason.BOILERPLATE`` if either heuristic fires, otherwise ``None``.
    """
    content = get_assistant_content(example)
    # Very short code blocks (just placeholder)
    open_tok, close_tok = "<|code_start|>", "<|code_end|>"
    open_at = content.find(open_tok)
    if open_at != -1:
        body_start = open_at + len(open_tok)
        # Search for the end marker only AFTER the start marker. Searching
        # from the beginning lets a stray earlier end marker produce an empty
        # slice and a false boilerplate rejection.
        close_at = content.find(close_tok, body_start)
        if close_at != -1 and len(content[body_start:close_at].strip()) < 5:
            return Reason.BOILERPLATE
    # Repetitive content (same few characters repeated)
    stripped = content.lower().strip().replace(" ", "").replace("\n", "")
    if len(stripped) > 20 and len(set(stripped)) < 5:
        return Reason.BOILERPLATE
    return None
def content_hash(example: dict) -> str:
    """Return the SHA-256 hex digest of the assistant content (dedup key).

    The text is encoded as UTF-8; characters that cannot be encoded are
    replaced rather than raising.
    """
    payload = get_assistant_content(example).encode("utf-8", errors="replace")
    digest = hashlib.sha256()
    digest.update(payload)
    return digest.hexdigest()
+# ── Main pipeline ─────────────────────────────────────────────────────
+def run_filter(
+    dry_run: bool = False,
+    min_tokens: int = 50,
+    max_tokens: int = 4096,
+    min_quality: float = 5.0,
+) -> None:
+    """Run the full quality filter pipeline."""
+    if not INPUT_FILE.exists():
+        print(f"ERROR: Input file not found: {INPUT_FILE}")
+        sys.exit(1)
+    # Count input lines
+    print(f"Counting input examples from {INPUT_FILE.name} ...")
+    total_input = sum(1 for _ in open(INPUT_FILE, "r", encoding="utf-8"))
+    print(f"  Total input: {total_input:,} examples")
+    print()
+    # Filter settings
+    print("Filter settings:")
+    print(f"  Min tokens:   {min_tokens}")
+    print(f"  Max tokens:   {max_tokens}")
+    print(f"  Min quality:  {min_quality}")
+    print(f"  Dry run:      {dry_run}")
+    print()
+    # Stats tracking
+    kept = 0
+    rejected = 0
+    reject_reasons: Counter = Counter()
+    source_kept: Counter = Counter()
+    source_rejected: Counter = Counter()
+    seen_hashes: set[str] = set()
+    token_sum = 0
+    quality_sum = 0.0
+    # Type distribution
+    type_counts: Counter = Counter()
+    # Language distribution
+    lang_counts: Counter = Counter()
+    start_time = time.time()
+    out_f = None
+    rej_f = None
+    if not dry_run:
+        OUTPUT_FILE.parent.mkdir(parents=True, exist_ok=True)
+        out_f = open(OUTPUT_FILE, "w", encoding="utf-8")
+        rej_f = open(REJECT_FILE, "w", encoding="utf-8")
+    try:
+        with open(INPUT_FILE, "r", encoding="utf-8") as f:
+            for line_num, line in enumerate(f, 1):
+                line = line.strip()
+                if not line:
+                    continue
+                # Parse JSON
+                try:
+                    example = json.loads(line)
+                except json.JSONDecodeError:
+                    reject_reasons[Reason.INVALID_JSON] += 1
+                    rejected += 1
+                    if rej_f:
+                        rej_f.write(line + "\n")
+                    continue
+                source = example.get("source", "unknown")
+                is_protected = source in PROTECTED_SOURCES
+                # Run filter chain (order matters: cheapest first)
+                # Protected sources: schema + assistant + token length + unmatched only
+                # Regular sources: full chain + dedup
+                if is_protected:
+                    rejection = (
+                        validate_schema(example)
+                        or check_assistant_exists(example)
+                        or check_token_length(example, min_tokens, max_tokens)
+                        or check_unmatched_tokens(example)
+                    )
+                else:
+                    rejection = (
+                        validate_schema(example)
+                        or check_assistant_exists(example)
+                        or check_token_length(example, min_tokens, max_tokens)
+                        or check_quality_score(example, min_quality)
+                        or check_code_tokens(example)
+                        or check_unmatched_tokens(example)
+                        or check_boilerplate(example)
+                    )
+                if rejection is None and not is_protected:
+                    # Dedup check (skip for protected sources)
+                    h = content_hash(example)
+                    if h in seen_hashes:
+                        rejection = Reason.DUPLICATE
+                if rejection is not None:
+                    reject_reasons[rejection] += 1
+                    source_rejected[source] += 1
+                    rejected += 1
+                    if rej_f:
+                        rej_f.write(line + "\n")
+                    continue
+                # Passed all filters
+                if not is_protected:
+                    seen_hashes.add(h)
+                kept += 1
+                source_kept[source] += 1
+                token_sum += example.get("metadata", {}).get("tokens", 0)
+                quality_sum += example.get("metadata", {}).get("quality_score", 0.0)
+                type_counts[example.get("type", "unknown")] += 1
+                lang_counts[example.get("metadata", {}).get("language", "unknown")] += 1
+                if out_f:
+                    out_f.write(line + "\n")
+                # Progress
+                if line_num % 50000 == 0:
+                    elapsed = time.time() - start_time
+                    rate = line_num / elapsed if elapsed > 0 else 0
+                    pct = (line_num / total_input) * 100
+                    print(f"  [{pct:5.1f}%] Processed {line_num:>10,} | Kept {kept:>10,} | Rejected {rejected:>10,} | {rate:,.0f} ex/s")
+    finally:
+        if out_f:
+            out_f.close()
+        if rej_f:
+            rej_f.close()
+    elapsed = time.time() - start_time
+    # ── Summary report ────────────────────────────────────────────
+    print()
+    print("=" * 60)
+    print("  QUALITY FILTER REPORT")
+    print("=" * 60)
+    print(f"  Input:       {total_input:>10,} examples")
+    print(f"  Kept:        {kept:>10,} examples ({kept/total_input*100:.1f}%)")
+    print(f"  Rejected:    {rejected:>10,} examples ({rejected/total_input*100:.1f}%)")
+    print(f"  Time:        {elapsed:>10.1f} seconds")
+    print(f"  Rate:        {total_input/elapsed:>10,.0f} examples/sec")
+    print()
+    if kept > 0:
+        print(f"  Avg tokens:  {token_sum/kept:>10.0f}")
+        print(f"  Avg quality: {quality_sum/kept:>10.2f}")
+        print(f"  Total tokens:{token_sum:>10,}")
+        print()
+    # Rejection breakdown
+    print("  Rejection breakdown:")
+    for reason, count in reject_reasons.most_common():
+        pct = count / total_input * 100
+        print(f"    {reason:<30s} {count:>10,} ({pct:.1f}%)")
+    print()
+    # Source breakdown
+    print("  Source breakdown (kept / total):")
+    all_sources = sorted(set(list(source_kept.keys()) + list(source_rejected.keys())))
+    for src in all_sources:
+        k = source_kept.get(src, 0)
+        total = k + source_rejected.get(src, 0)
+        pct = k / total * 100 if total > 0 else 0
+        print(f"    {src:<25s} {k:>8,} / {total:>8,} ({pct:.1f}%)")
+    print()
+    # Type distribution
+    print("  Type distribution (kept):")
+    for t, c in type_counts.most_common(10):
+        print(f"    {t:<25s} {c:>8,}")
+    print()
+    # Language distribution (top 15)
+    print("  Language distribution (kept, top 15):")
+    for lang, c in lang_counts.most_common(15):
+        print(f"    {lang:<25s} {c:>8,}")
+    print()
+    if not dry_run:
+        print(f"  Output:  {OUTPUT_FILE}")
+        print(f"  Rejects: {REJECT_FILE}")
+        # Save machine-readable report
+        report = {
+            "input_count": total_input,
+            "kept_count": kept,
+            "rejected_count": rejected,
+            "kept_pct": round(kept / total_input * 100, 2),
+            "avg_tokens": round(token_sum / kept, 1) if kept > 0 else 0,
+            "avg_quality": round(quality_sum / kept, 3) if kept > 0 else 0,
+            "total_tokens": token_sum,
+            "elapsed_seconds": round(elapsed, 1),
+            "filter_settings": {
+                "min_tokens": min_tokens,
+                "max_tokens": max_tokens,
+                "min_quality": min_quality,
+            },
+            "rejection_breakdown": dict(reject_reasons.most_common()),
+            "source_kept": dict(source_kept),
+            "source_rejected": dict(source_rejected),
+            "type_distribution": dict(type_counts.most_common()),
+            "language_distribution": dict(lang_counts.most_common(30)),
+        }
+        with open(REPORT_FILE, "w", encoding="utf-8") as rf:
+            json.dump(report, rf, indent=2)
+        print(f"  Report:  {REPORT_FILE}")
+    print("=" * 60)
+# ── CLI ───────────────────────────────────────────────────────────────
+def main():
+    """Parse the command-line thresholds and hand them to ``run_filter``."""
+    cli = argparse.ArgumentParser(
+        description="MINDI Quality Filter — remove low-quality training examples",
+    )
+    cli.add_argument(
+        "--dry-run",
+        action="store_true",
+        help="Preview counts without writing output",
+    )
+    # The numeric thresholds share one shape: (flag, type, default, help).
+    for flag, caster, default, help_text in (
+        ("--min-tokens", int, 50, "Minimum token count (default: 50)"),
+        ("--max-tokens", int, 4096, "Maximum token count (default: 4096)"),
+        ("--min-quality", float, 5.0, "Minimum quality_score (default: 5.0)"),
+    ):
+        cli.add_argument(flag, type=caster, default=default, help=help_text)
+    opts = cli.parse_args()
+    run_filter(
+        dry_run=opts.dry_run,
+        min_tokens=opts.min_tokens,
+        max_tokens=opts.max_tokens,
+        min_quality=opts.min_quality,
+    )
+if __name__ == "__main__":
+    main()

scripts/split_data.py ADDED Viewed

	@@ -0,0 +1,207 @@

+#!/usr/bin/env python3
+"""
+MINDI 1.5 Vision-Coder — Train / Validation / Test Split
+Splits mindi_filtered.jsonl into:
+  - train.jsonl      (90%)
+  - val.jsonl         (5%)
+  - test.jsonl        (5%)
+Stratified by source to ensure proportional representation.
+Deterministic with a fixed random seed.
+Usage:
+    python scripts/split_data.py                        # Default 90/5/5
+    python scripts/split_data.py --train 0.85 --val 0.10 --test 0.05
+    python scripts/split_data.py --seed 42
+    python scripts/split_data.py --dry-run
+"""
+from __future__ import annotations
+import argparse
+import json
+import random
+import sys
+import time
+from collections import Counter
+from pathlib import Path
+# ── Paths ─────────────────────────────────────────────────────────────
+# PROJECT_ROOT is the parent of the directory containing this script.
+# The filtered input and the three split files all live in data/processed.
+PROJECT_ROOT = Path(__file__).resolve().parent.parent
+INPUT_FILE = PROJECT_ROOT / "data" / "processed" / "mindi_filtered.jsonl"
+OUTPUT_DIR = PROJECT_ROOT / "data" / "processed"
+TRAIN_FILE = OUTPUT_DIR / "train.jsonl"
+VAL_FILE = OUTPUT_DIR / "val.jsonl"
+TEST_FILE = OUTPUT_DIR / "test.jsonl"
+def _split_counts(n: int, val_ratio: float, test_ratio: float) -> tuple[int, int, int]:
+    """Return (n_train, n_val, n_test) for one source holding ``n`` examples."""
+    # Sources with fewer than 3 examples cannot populate all three splits,
+    # so they go entirely to train.
+    if n < 3:
+        return n, 0, 0
+    # Guarantee at least one held-out example per source, but only for splits
+    # that were actually requested (a ratio of 0 means "no such split").
+    n_val = max(1, round(n * val_ratio)) if val_ratio > 0 else 0
+    n_test = max(1, round(n * test_ratio)) if test_ratio > 0 else 0
+    # Rounding can push val + test past n (e.g. n=3 with 0.5/0.5). Clamp so the
+    # train count never goes negative, which would make the slices overlap and
+    # leak the same example into more than one split.
+    n_val = min(n_val, n)
+    n_test = min(n_test, n - n_val)
+    return n - n_val - n_test, n_val, n_test
+def run_split(
+    train_ratio: float = 0.90,
+    val_ratio: float = 0.05,
+    test_ratio: float = 0.05,
+    seed: int = 42,
+    dry_run: bool = False,
+) -> None:
+    """Split filtered data into train/val/test with stratification by source.
+    Reads INPUT_FILE line by line, groups the lines by their "source" field
+    (lines that fail to parse as JSON are grouped under "unknown"), shuffles
+    each group with a seeded RNG and slices it into the three splits.
+    Args:
+        train_ratio: Fraction of examples for the training split.
+        val_ratio: Fraction of examples for the validation split.
+        test_ratio: Fraction of examples for the test split.
+        seed: Seed for the random.Random instance that drives every shuffle.
+        dry_run: When true, print the summary but write no files.
+    Exits with status 1 when the ratios are negative or do not sum to 1.0,
+    when the input file is missing, or when it contains no examples.
+    """
+    # Validate ratios
+    if min(train_ratio, val_ratio, test_ratio) < 0:
+        print("ERROR: Ratios must be non-negative")
+        sys.exit(1)
+    total_ratio = train_ratio + val_ratio + test_ratio
+    if abs(total_ratio - 1.0) > 0.001:
+        print(f"ERROR: Ratios must sum to 1.0, got {total_ratio:.3f}")
+        sys.exit(1)
+    if not INPUT_FILE.exists():
+        print(f"ERROR: Input file not found: {INPUT_FILE}")
+        print("  Run quality_filter.py first to generate mindi_filtered.jsonl")
+        sys.exit(1)
+    print(f"Loading examples from {INPUT_FILE.name} ...")
+    start = time.time()
+    # Group lines by source for stratified splitting
+    source_lines: dict[str, list[str]] = {}
+    total = 0
+    with open(INPUT_FILE, "r", encoding="utf-8") as f:
+        for line in f:
+            line = line.strip()
+            if not line:
+                continue
+            total += 1
+            try:
+                example = json.loads(line)
+                source = example.get("source", "unknown")
+            except json.JSONDecodeError:
+                source = "unknown"
+            source_lines.setdefault(source, []).append(line)
+    load_time = time.time() - start
+    print(f"  Loaded {total:,} examples in {load_time:.1f}s")
+    print(f"  Sources: {len(source_lines)}")
+    print()
+    # Every percentage below divides by total, so stop before an empty input
+    # turns into a ZeroDivisionError.
+    if total == 0:
+        print(f"ERROR: No examples found in {INPUT_FILE}")
+        sys.exit(1)
+    # Split settings
+    print(f"Split ratios: train={train_ratio:.0%} / val={val_ratio:.0%} / test={test_ratio:.0%}")
+    print(f"Random seed:  {seed}")
+    print(f"Dry run:      {dry_run}")
+    print()
+    rng = random.Random(seed)
+    train_lines: list[str] = []
+    val_lines: list[str] = []
+    test_lines: list[str] = []
+    source_stats: dict[str, dict[str, int]] = {}
+    # Sources are visited in sorted order so the RNG is consumed identically
+    # on every run with the same seed.
+    for source in sorted(source_lines):
+        lines = source_lines[source]
+        rng.shuffle(lines)
+        n = len(lines)
+        n_train, n_val, n_test = _split_counts(n, val_ratio, test_ratio)
+        val_end = n_train + n_val
+        train_lines.extend(lines[:n_train])
+        val_lines.extend(lines[n_train:val_end])
+        test_lines.extend(lines[val_end:])
+        source_stats[source] = {
+            "total": n,
+            "train": n_train,
+            "val": n_val,
+            "test": n_test,
+        }
+    # Shuffle final lists (so sources are interleaved)
+    rng.shuffle(train_lines)
+    rng.shuffle(val_lines)
+    rng.shuffle(test_lines)
+    # ── Summary ───────────────────────────────────────────────────
+    print("=" * 60)
+    print("  SPLIT SUMMARY")
+    print("=" * 60)
+    print(f"  Total:       {total:>10,}")
+    print(f"  Train:       {len(train_lines):>10,} ({len(train_lines)/total*100:.1f}%)")
+    print(f"  Validation:  {len(val_lines):>10,} ({len(val_lines)/total*100:.1f}%)")
+    print(f"  Test:        {len(test_lines):>10,} ({len(test_lines)/total*100:.1f}%)")
+    print()
+    print("  Per-source breakdown:")
+    print(f"    {'Source':<25s} {'Total':>8s} {'Train':>8s} {'Val':>8s} {'Test':>8s}")
+    print(f"    {'-'*25} {'-'*8} {'-'*8} {'-'*8} {'-'*8}")
+    for source in sorted(source_stats):
+        s = source_stats[source]
+        print(f"    {source:<25s} {s['total']:>8,} {s['train']:>8,} {s['val']:>8,} {s['test']:>8,}")
+    print()
+    if not dry_run:
+        OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
+        print("Writing files ...")
+        for path, lines, name in [
+            (TRAIN_FILE, train_lines, "train"),
+            (VAL_FILE, val_lines, "val"),
+            (TEST_FILE, test_lines, "test"),
+        ]:
+            with open(path, "w", encoding="utf-8") as f:
+                for line in lines:
+                    f.write(line + "\n")
+            size_mb = path.stat().st_size / (1024 * 1024)
+            print(f"  {name:<12s} → {path.name:<20s} ({len(lines):>10,} examples, {size_mb:>8.1f} MB)")
+        # Save split metadata
+        meta = {
+            "total": total,
+            "train_count": len(train_lines),
+            "val_count": len(val_lines),
+            "test_count": len(test_lines),
+            "train_pct": round(len(train_lines) / total * 100, 2),
+            "val_pct": round(len(val_lines) / total * 100, 2),
+            "test_pct": round(len(test_lines) / total * 100, 2),
+            "seed": seed,
+            "source_breakdown": source_stats,
+        }
+        meta_path = OUTPUT_DIR / "split_meta.json"
+        with open(meta_path, "w", encoding="utf-8") as f:
+            json.dump(meta, f, indent=2)
+        print(f"  Metadata    → {meta_path.name}")
+    elapsed = time.time() - start
+    print(f"\n  Done in {elapsed:.1f}s")
+    print("=" * 60)
+# ── CLI ───────────────────────────────────────────────────────────────
+def main():
+    """Command-line entry point: read the split options and call ``run_split``."""
+    cli = argparse.ArgumentParser(
+        description="MINDI Data Splitter — stratified train/val/test split",
+    )
+    # The three ratio flags differ only in name, default and help text.
+    for flag, default, help_text in (
+        ("--train", 0.90, "Train ratio (default: 0.90)"),
+        ("--val", 0.05, "Validation ratio (default: 0.05)"),
+        ("--test", 0.05, "Test ratio (default: 0.05)"),
+    ):
+        cli.add_argument(flag, type=float, default=default, help=help_text)
+    cli.add_argument("--seed", type=int, default=42, help="Random seed (default: 42)")
+    cli.add_argument("--dry-run", action="store_true", help="Preview split without writing files")
+    opts = cli.parse_args()
+    run_split(
+        train_ratio=opts.train,
+        val_ratio=opts.val,
+        test_ratio=opts.test,
+        seed=opts.seed,
+        dry_run=opts.dry_run,
+    )
+if __name__ == "__main__":
+    main()

scripts/train.py CHANGED Viewed

@@ -1,39 +1,348 @@
 """
-MINDI 1.5 Vision-Coder — Training Launch Script
-Entry point for starting LoRA fine-tuning.
-Loads config, initializes model + dataset, and runs training.
 """
 from __future__ import annotations
 import argparse
 from pathlib import Path
-def main() -> None:
-    """Parse args and launch training."""
-    parser = argparse.ArgumentParser(description="MINDI 1.5 — Launch LoRA Training")
     parser.add_argument(
-        "--config", type=str, default="./configs/training_config.yaml",
         help="Path to training config YAML",
     )
     parser.add_argument(
-        "--local", action="store_true", default=True,
-        help="Use local GPU overrides (RTX 4060 mode)",
     )
     parser.add_argument(
-        "--cloud", action="store_true",
-        help="Use cloud GPU settings (MI300X mode)",
     )
-    args = parser.parse_args()
-    local_mode = not args.cloud
-    config_path = Path(args.config)
-    print(f"[MINDI Training] Config: {config_path}")
-    print(f"[MINDI Training] Mode: {'local (RTX 4060)' if local_mode else 'cloud (MI300X)'}")
-    print("[MINDI Training] Pipeline will be wired after Phase 3 setup.")
 if __name__ == "__main__":

+#!/usr/bin/env python3
 """
+MINDI 1.5 Vision-Coder — Master Training Script
+Usage:
+    python scripts/train.py --phase 1              # Run phase 1 only
+    python scripts/train.py --phase all             # Run all 3 phases
+    python scripts/train.py --phase 2 --resume checkpoints/training/phase1_lora_step5000
+    python scripts/train.py --dry_run               # Test 10 steps only
+    python scripts/train.py --push_to_hub           # Upload after training
+Handles Ctrl+C gracefully: saves checkpoint before exit.
 """
 from __future__ import annotations
 import argparse
+import signal
+import sys
+import traceback
 from pathlib import Path
+# Resolve project root (scripts/ is one level deep)
+PROJECT_ROOT = Path(__file__).resolve().parent.parent
+sys.path.insert(0, str(PROJECT_ROOT))
+import torch
+import yaml
+def parse_args() -> argparse.Namespace:
+    """Define and parse the training CLI.
+    Flags: --phase (1, 2, 3 or all), --resume (checkpoint directory),
+    --config (YAML path, defaulting to configs/training_config.yaml under
+    PROJECT_ROOT), --dry_run, --push_to_hub and --no_wandb.
+    """
+    parser = argparse.ArgumentParser(
+        description="MINDI 1.5 Vision-Coder — Training",
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+    )
+    parser.add_argument(
+        "--phase", type=str, default="all",
+        choices=["1", "2", "3", "all"],
+        help="Which phase(s) to run: 1, 2, 3, or all (default: all)",
+    )
+    parser.add_argument(
+        "--resume", type=str, default=None,
+        help="Path to checkpoint directory to resume from",
+    )
     parser.add_argument(
+        "--config", type=str,
+        default=str(PROJECT_ROOT / "configs" / "training_config.yaml"),
         help="Path to training config YAML",
     )
     parser.add_argument(
+        "--dry_run", action="store_true",
+        help="Test run: only 10 steps per phase",
     )
     parser.add_argument(
+        "--push_to_hub", action="store_true",
+        help="Push checkpoints to HuggingFace after each phase",
+    )
+    parser.add_argument(
+        "--no_wandb", action="store_true",
+        help="Disable WandB logging",
+    )
+    return parser.parse_args()
+def load_config(config_path: str) -> dict:
+    """Load the training config YAML and return it as a dict.
+    Args:
+        config_path: Path to the YAML file.
+    Returns:
+        The parsed mapping; an empty dict when the file holds no document.
+    Raises:
+        FileNotFoundError: If ``config_path`` does not exist.
+        ValueError: If the top-level YAML value is not a mapping.
+    """
+    path = Path(config_path)
+    if not path.exists():
+        raise FileNotFoundError(f"Config not found: {path}")
+    with open(path, "r", encoding="utf-8") as f:
+        loaded = yaml.safe_load(f)
+    # safe_load returns None for an empty document; callers immediately call
+    # .get() on the result, so normalise that case to an empty mapping.
+    if loaded is None:
+        return {}
+    if not isinstance(loaded, dict):
+        raise ValueError(
+            f"Config must be a YAML mapping, got {type(loaded).__name__}: {path}"
+        )
+    return loaded
+def build_training_config(raw: dict, dry_run: bool = False):
+    """Translate the parsed YAML mapping into a ``TrainingConfig``.
+    The three phases are laid out back to back on one cumulative step axis.
+    With ``dry_run`` every phase is cut to 10 steps and the eval/save/log
+    intervals are shortened to 5/10/1.
+    """
+    from src.training.mindi_trainer import PhaseConfig, TrainingConfig
+    train_section = raw.get("training", {})
+    data_section = raw.get("data", {})
+    output_section = raw.get("output", {})
+    log_section = raw.get("logging", {})
+    model_section = raw.get("model", {})
+    # (yaml key, phase name, then the lora / vision_projection / fusion flags
+    # handed to PhaseConfig)
+    phase_table = (
+        ("phase1", "phase1_lora", True, False, False),
+        ("phase2", "phase2_vision_bridge", False, True, True),
+        ("phase3", "phase3_all", True, True, True),
+    )
+    phase_list = []
+    next_start = 0
+    for yaml_key, phase_name, use_lora, use_vision, use_fusion in phase_table:
+        phase_yaml = train_section.get(yaml_key, {})
+        n_steps = 10 if dry_run else phase_yaml.get("steps", 2500)
+        phase_end = next_start + n_steps
+        phase_list.append(
+            PhaseConfig(
+                name=phase_name,
+                start_step=next_start,
+                end_step=phase_end,
+                learning_rate=float(phase_yaml.get("lr", 2e-4)),
+                batch_size=phase_yaml.get("batch_size", 8),
+                gradient_accumulation_steps=train_section.get("grad_accumulation", 4),
+                lora=use_lora,
+                vision_projection=use_vision,
+                fusion=use_fusion,
+            )
+        )
+        next_start = phase_end
+    # Paths from the YAML are interpreted relative to the project root.
+    settings = {
+        "train_file": PROJECT_ROOT / data_section.get("train_file", "data/processed/train.jsonl"),
+        "val_file": PROJECT_ROOT / data_section.get("val_file", "data/processed/val.jsonl"),
+        "output_dir": PROJECT_ROOT / output_section.get("checkpoint_dir", "checkpoints/training"),
+        "log_dir": PROJECT_ROOT / log_section.get("log_dir", "logs/training"),
+        "max_seq_length": data_section.get("max_length", 4096),
+        "use_compile": model_section.get("use_compile", False),
+        "gradient_checkpointing": model_section.get("gradient_checkpointing", True),
+        "dtype": model_section.get("dtype", "bf16"),
+        "num_workers": data_section.get("num_workers", 4),
+        "pin_memory": True,
+        "prefetch_factor": 2,
+        "weight_decay": 0.01,
+        "warmup_ratio": 0.03,
+        "max_grad_norm": float(train_section.get("max_grad_norm", 1.0)),
+        "seed": 42,
+        "log_every_n_steps": log_section.get("log_every", 10),
+        "eval_every_n_steps": train_section.get("eval_every", 250),
+        "save_every_n_steps": train_section.get("save_every", 500),
+        "phases": phase_list,
+    }
+    built = TrainingConfig(**settings)
+    if dry_run:
+        # Shrink the intervals so a 10-step run still logs, evaluates and saves.
+        built.eval_every_n_steps = 5
+        built.save_every_n_steps = 10
+        built.log_every_n_steps = 1
+    return built
+def init_wandb(raw_config: dict, phase: str, disabled: bool = False):
+    """Start a WandB run named after ``phase``.
+    Returns the run object, or None when logging is disabled, the wandb
+    package cannot be imported, or initialisation fails for any reason.
+    """
+    if disabled:
+        return None
+    try:
+        import wandb
+        log_section = raw_config.get("logging", {})
+        init_kwargs = {
+            "project": log_section.get("wandb_project", "mindi-1.5-vision-coder"),
+            "entity": log_section.get("wandb_entity", "mindigenous"),
+            "name": f"mindi15-{phase}",
+            "config": raw_config,
+            "tags": ["mindi-1.5", "training", f"phase-{phase}"],
+            "reinit": True,
+        }
+        active_run = wandb.init(**init_kwargs)
+        print(f"[train.py] WandB initialized: {active_run.url}")
+        return active_run
+    except ImportError:
+        print("[train.py] WandB not installed — logging disabled")
+        return None
+    except Exception as err:
+        # Training should proceed even when the logging backend is unavailable.
+        print(f"[train.py] WandB init failed: {err} — continuing without logging")
+        return None
+def push_checkpoint_to_hub(checkpoint_dir: Path, raw_config: dict) -> None:
+    """Upload ``checkpoint_dir`` to the configured HuggingFace model repo.
+    The folder lands under ``checkpoints/<directory name>`` in the repo.
+    A missing huggingface_hub package or a failed upload is reported on
+    stdout and otherwise ignored.
+    """
+    repo_id = raw_config.get("output", {}).get(
+        "hf_repo", "Mindigenous/MINDI-1.5-Vision-Coder"
+    )
+    try:
+        from huggingface_hub import HfApi
+        import os
+        hub = HfApi(token=os.environ.get("HF_TOKEN"))
+        print(f"[train.py] Pushing checkpoint to {repo_id} ...")
+        destination = f"checkpoints/{checkpoint_dir.name}"
+        hub.upload_folder(
+            repo_id=repo_id,
+            repo_type="model",
+            folder_path=str(checkpoint_dir),
+            path_in_repo=destination,
+        )
+        print(f"[train.py] Pushed to https://huggingface.co/{repo_id}")
+    except ImportError:
+        print("[train.py] huggingface_hub not installed — skipping push")
+    except Exception as err:
+        print(f"[train.py] Push to hub failed: {err}")
+def log_wandb_phase_complete(wandb_run, summary: dict) -> None:
+    """Record a phase-completion event on ``wandb_run``.
+    Args:
+        wandb_run: The run returned by ``init_wandb``, or None when logging
+            is disabled (in which case nothing happens).
+        summary: Phase summary; the keys "phase", "total_steps",
+            "best_val_loss" and "elapsed_minutes" are read, each with a
+            fallback when absent.
+    Logging is best-effort: a failure is reported on stdout and never
+    propagates to the caller.
+    """
+    if wandb_run is None:
+        return
+    payload = {
+        "phase_complete": True,
+        "phase": summary.get("phase", "unknown"),
+        "total_steps": summary.get("total_steps", 0),
+        "best_val_loss": summary.get("best_val_loss", 0),
+        "elapsed_minutes": summary.get("elapsed_minutes", 0),
+    }
+    try:
+        # Log on the run we were handed rather than on wandb's global state.
+        wandb_run.log(payload)
+    except Exception as e:
+        print(f"[train.py] WandB phase log failed: {e}")
+def main() -> None:
+    """Run the training CLI end to end.
+    Prints the launch banner, loads the YAML config, builds the model and
+    trainer, optionally resumes from a checkpoint, then runs either all
+    phases or the single selected one. Ctrl+C and unexpected errors both
+    attempt to save a checkpoint before the process exits.
+    """
+    args = parse_args()
+    print()
+    print("=" * 60)
+    print("  MINDI 1.5 Vision-Coder — Training Launch")
+    print("  MINDIGENOUS.AI")
+    print("=" * 60)
+    print()
+    print(f"  Phase:       {args.phase}")
+    print(f"  Config:      {args.config}")
+    print(f"  Resume:      {args.resume or 'None'}")
+    print(f"  Dry run:     {args.dry_run}")
+    print(f"  Push to hub: {args.push_to_hub}")
+    print(f"  Device:      {'cuda' if torch.cuda.is_available() else 'cpu'}")
+    if torch.cuda.is_available():
+        print(f"  GPU:         {torch.cuda.get_device_name(0)}")
+        # The device-properties attribute is total_memory (bytes).
+        vram_gb = torch.cuda.get_device_properties(0).total_memory / (1024 ** 3)
+        print(f"  VRAM:        {vram_gb:.1f} GB")
+    print()
+    # Load config
+    raw_config = load_config(args.config)
+    config = build_training_config(raw_config, dry_run=args.dry_run)
+    # Filter phases based on --phase arg
+    if args.phase != "all":
+        phase_idx = int(args.phase) - 1
+        if phase_idx < 0 or phase_idx >= len(config.phases):
+            print(f"ERROR: Invalid phase {args.phase}. Available: 1-{len(config.phases)}")
+            sys.exit(1)
+        selected_phase = config.phases[phase_idx]
+        # Adjust to start from 0 for single-phase run
+        step_count = selected_phase.end_step - selected_phase.start_step
+        selected_phase.start_step = 0
+        selected_phase.end_step = step_count
+        config.phases = [selected_phase]
+    # Initialize model
+    print("[train.py] Initializing MINDI 1.5 model ...")
+    from src.model.mindi_model import MINDI15
+    model_cfg = raw_config.get("model", {})
+    vision_cfg = raw_config.get("vision", {})
+    model = MINDI15(
+        model_name=model_cfg.get("name", "Qwen/Qwen2.5-Coder-7B-Instruct"),
+        clip_model=vision_cfg.get("clip_model", "openai/clip-vit-large-patch14"),
+        hidden_size=model_cfg.get("hidden_size", 4096),
+        num_visual_tokens=vision_cfg.get("visual_tokens", 256),
+        torch_dtype=config.torch_dtype,
     )
+    # Initialize trainer
+    from src.training.mindi_trainer import MINDITrainer
+    trainer = MINDITrainer(model=model, config=config)
+    # Resume from checkpoint
+    if args.resume:
+        resume_path = Path(args.resume)
+        # Relative checkpoint paths are resolved against the project root.
+        if not resume_path.is_absolute():
+            resume_path = PROJECT_ROOT / resume_path
+        trainer.resume_from_checkpoint(resume_path)
+    # Initialize WandB
+    wandb_run = init_wandb(raw_config, args.phase, disabled=args.no_wandb)
+    # Graceful Ctrl+C handler
+    interrupted = False
+    def signal_handler(sig, frame):
+        nonlocal interrupted
+        # A second Ctrl+C while the emergency save is running aborts at once.
+        if interrupted:
+            print("\n[train.py] Forced exit!")
+            sys.exit(1)
+        interrupted = True
+        print("\n[train.py] Ctrl+C received — saving checkpoint before exit ...")
+        try:
+            emergency_dir = config.output_dir / "emergency_checkpoint"
+            emergency_dir.mkdir(parents=True, exist_ok=True)
+            model.save(emergency_dir)
+            print(f"[train.py] Emergency checkpoint saved: {emergency_dir}")
+        except Exception as e:
+            print(f"[train.py] Emergency save failed: {e}")
+        sys.exit(0)
+    signal.signal(signal.SIGINT, signal_handler)
+    # Run training
+    try:
+        if args.phase == "all":
+            summary = trainer.train()
+            final_dir = config.output_dir / "final"
+            if args.push_to_hub:
+                push_checkpoint_to_hub(final_dir, raw_config)
+            log_wandb_phase_complete(wandb_run, summary)
+        else:
+            phase = config.phases[0]
+            summary = trainer.train_phase(phase)
+            ckpt_dir = config.output_dir / f"{phase.name}_step{phase.end_step}"
+            if args.push_to_hub:
+                push_checkpoint_to_hub(ckpt_dir, raw_config)
+            log_wandb_phase_complete(wandb_run, summary)
+    except KeyboardInterrupt:
+        signal_handler(None, None)
+    except Exception as e:
+        print(f"\n[train.py] ERROR: {e}")
+        traceback.print_exc()
+        try:
+            crash_dir = config.output_dir / "crash_checkpoint"
+            crash_dir.mkdir(parents=True, exist_ok=True)
+            model.save(crash_dir)
+            print(f"[train.py] Crash checkpoint saved: {crash_dir}")
+        except Exception as save_err:
+            # Still exit with the original failure, but say that nothing was saved.
+            print(f"[train.py] Crash checkpoint save failed: {save_err}")
+        sys.exit(1)
+    finally:
+        # Runs on success and on the sys.exit paths above, so the run is closed.
+        if wandb_run is not None:
+            try:
+                import wandb
+                wandb.finish()
+            except Exception as finish_err:
+                print(f"[train.py] WandB finish failed: {finish_err}")
+    # Final summary
+    hf_repo = raw_config.get("output", {}).get("hf_repo", "Mindigenous/MINDI-1.5-Vision-Coder")
+    print()
+    print("=" * 60)
+    print("  Training complete!")
+    print(f"  Best val loss:  {trainer.best_val_loss:.4f}")
+    print(f"  Checkpoint at:  {config.output_dir}")
+    if args.push_to_hub:
+        print(f"  HuggingFace:    https://huggingface.co/{hf_repo}")
+    print("=" * 60)
+    print()
 if __name__ == "__main__":

scripts/upload_everything_to_hf.py ADDED Viewed

	@@ -0,0 +1,354 @@

+#!/usr/bin/env python3
+"""
+Upload ENTIRE MINDI 1.5 Vision-Coder project to HuggingFace.
+REPO 1 (model):   Mindigenous/MINDI-1.5-Vision-Coder
+REPO 2 (dataset):  Mindigenous/MINDI-1.5-training-data
+Both private.  On MI300X we will clone these repos directly.
+"""
+import os
+import sys
+import time
+from pathlib import Path
+from dotenv import load_dotenv
+from huggingface_hub import HfApi, create_repo
+# ── Paths ──────────────────────────────────────────────────────────────
+# PROJECT_ROOT is the parent of the directory containing this script;
+# the .env file holding the HuggingFace token is expected directly under it.
+PROJECT_ROOT = Path(__file__).resolve().parent.parent
+ENV_FILE = PROJECT_ROOT / ".env"
+# ── Repo names ─────────────────────────────────────────────────────────
+# Target repositories on the Hub: one "model" repo and one "dataset" repo.
+MODEL_REPO  = "Mindigenous/MINDI-1.5-Vision-Coder"
+DATASET_REPO = "Mindigenous/MINDI-1.5-training-data"
+# ── Model card (written to repo as README.md) ─────────────────────────
+MODEL_CARD = """\
+---
+license: apache-2.0
+language:
+- en
+tags:
+- code-generation
+- nextjs
+- react
+- typescript
+- vision
+- multimodal
+- mindi
+- mindigenous
+base_model: Qwen/Qwen2.5-Coder-7B-Instruct
+---
+# MINDI 1.5 Vision-Coder
+Built by MINDIGENOUS.AI
+## Model Description
+MINDI 1.5 is an agentic AI coding model
+that sees its own output and critiques it.
+## Key Features
+- Generates Next.js 14 + Tailwind + TypeScript
+- Sees screenshots via CLIP ViT-L/14
+- Critiques its own UI/UX output
+- Searches internet for latest packages
+- Tests code in sandbox environment
+- Self-fixes errors automatically
+## Training
+- Base: Qwen/Qwen2.5-Coder-7B-Instruct
+- Method: LoRA fine-tuning
+- Hardware: AMD MI300X 192GB VRAM
+- Dataset: 1,449,428 examples
+- Tokens: 859,694,776
+- Status: Training in progress
+## Built By
+Faaz - MINDIGENOUS.AI
+Mumbai, India
+April 2026
+"""
+# ── Dataset card ───────────────────────────────────────────────────────
+DATASET_CARD = """\
+---
+license: apache-2.0
+language:
+- en
+tags:
+- code-generation
+- nextjs
+- react
+- typescript
+- vision
+- multimodal
+- mindi
+- mindigenous
+size_categories:
+- 1M<n<10M
+---
+# MINDI 1.5 Training Data
+Training dataset for **MINDI 1.5 Vision-Coder** by MINDIGENOUS.AI
+## Dataset Statistics
+| Metric | Value |
+|--------|-------|
+| Total examples | 1,449,428 |
+| Total tokens | 859,694,776 |
+| Avg tokens/example | 593 |
+| Avg quality score | 6.49 |
+| Sources | 9 |
+## Splits
+| Split | Examples | Percentage |
+|-------|----------|------------|
+| Train | 1,304,486 | 90.0% |
+| Validation | 72,471 | 5.0% |
+| Test | 72,471 | 5.0% |
+## Sources
+| Source | Examples | Kept % |
+|--------|----------|--------|
+| starcoderdata | 569,350 | 94.9% |
+| websight | 250,987 | 99.99% |
+| evol_code | 155,998 | 99.7% |
+| codefeedback | 149,865 | 99.9% |
+| magicoder | 149,987 | 99.99% |
+| synthetic_nextjs | 90,000 | 100% (protected) |
+| codealpaca | 59,241 | 98.8% |
+| search_examples | 15,000 | 100% (protected) |
+| sandbox_examples | 9,000 | 100% (protected) |
+## Type Distribution
+| Type | Examples |
+|------|----------|
+| code_generation | 1,183,441 |
+| vision_code | 250,987 |
+| search | 15,000 |
+## Language Distribution
+| Language | Examples |
+|----------|----------|
+| unknown | 490,305 |
+| typescript | 375,859 |
+| javascript | 298,497 |
+| python | 211,842 |
+| html | 36,371 |
+| java | 32,458 |
+| rust | 3,709 |
+| go | 387 |
+## Format
+Each example is a JSON object with:
+- `conversations`: list of `{"role": ..., "content": ...}` turns
+- `source`: dataset origin
+- `type`: code_generation / vision_code / search
+- `language`: programming language
+- `quality_score`: heuristic quality (0-10+)
+- `token_count`: number of tokens
+## Quality Filtering
+- Protected sources (sandbox, search, synthetic_nextjs) bypass aggressive filters
+- MINDI special token bonuses boost agentic examples
+- Dedup via SHA-256 content hashing
+- Rejection reasons: too_many_tokens (30,637), boilerplate (1,373), duplicate (59)
+## Built By
+Faaz - MINDIGENOUS.AI
+Mumbai, India — April 2026
+"""
+# ────────────────────────────────────────────────────────────────────────
+def load_token() -> str:
+    """Return the HuggingFace token after loading ``.env``.
+    HUGGINGFACE_TOKEN takes precedence over HF_TOKEN; when neither is set
+    to a non-empty value, an error is printed and the process exits with
+    status 1.
+    """
+    load_dotenv(ENV_FILE)
+    for variable in ("HUGGINGFACE_TOKEN", "HF_TOKEN"):
+        candidate = os.getenv(variable)
+        if candidate:
+            return candidate
+    print("ERROR: No HUGGINGFACE_TOKEN or HF_TOKEN found in .env")
+    sys.exit(1)
+def ensure_repo(api: HfApi, repo_id: str, repo_type: str, token: str):
+    """Create the private repo ``repo_id`` unless it already exists.
+    Any error from the Hub is printed and swallowed so the caller continues.
+    """
+    request = {
+        "repo_id": repo_id,
+        "repo_type": repo_type,
+        "private": True,
+        "token": token,
+        "exist_ok": True,  # an already-existing repo is not an error
+    }
+    try:
+        create_repo(**request)
+        print(f"  Repo ready: {repo_id} ({repo_type})")
+    except Exception as err:
+        print(f"  Repo create/check: {err}")
+def upload_folder(api: HfApi, local: Path, remote: str, repo_id: str,
+                  repo_type: str, token: str):
+    """Push the directory ``local`` to ``remote`` inside the given Hub repo.
+    A directory that does not exist is reported and skipped. Progress is
+    printed with the path shown relative to PROJECT_ROOT, followed by the
+    elapsed time once the upload returns.
+    """
+    if local.exists():
+        shown_path = str(local.relative_to(PROJECT_ROOT))
+        print(f"  Uploading {shown_path}/ to {repo_type} repo ... ", end="", flush=True)
+        started = time.time()
+        api.upload_folder(
+            folder_path=str(local),
+            path_in_repo=remote,
+            repo_id=repo_id,
+            repo_type=repo_type,
+            token=token,
+            ignore_patterns=["__pycache__", "*.pyc", ".git"],
+        )
+        print(f"done ({time.time() - started:.1f}s)")
+    else:
+        print(f"  SKIP (not found): {local}")
+def upload_file(api: HfApi, local: Path, remote: str, repo_id: str,
+                repo_type: str, token: str):
+    """Push the single file ``local`` to ``remote`` inside the given Hub repo.
+    A file that does not exist is reported and skipped. The progress line
+    shows the path relative to PROJECT_ROOT and the size in MB.
+    """
+    if not local.exists():
+        print(f"  SKIP (not found): {local.name}")
+        return
+    megabytes = local.stat().st_size / (1024 * 1024)
+    shown_path = str(local.relative_to(PROJECT_ROOT))
+    print(f"  Uploading {shown_path} ({megabytes:.1f} MB) to {repo_type} repo ... ",
+          end="", flush=True)
+    started = time.time()
+    request = {
+        "repo_id": repo_id,
+        "repo_type": repo_type,
+        "path_or_fileobj": str(local),
+        "path_in_repo": remote,
+        "token": token,
+    }
+    api.upload_file(**request)
+    print(f"done ({time.time() - started:.1f}s)")
+def upload_readme(api: HfApi, content: str, repo_id: str,
+                  repo_type: str, token: str):
+    """Write ``content`` to the repo as README.md (sent as UTF-8 bytes)."""
+    print(f"  Uploading README.md to {repo_type} repo ... ", end="", flush=True)
+    readme_bytes = content.encode("utf-8")
+    api.upload_file(
+        path_or_fileobj=readme_bytes,
+        path_in_repo="README.md",
+        repo_id=repo_id,
+        repo_type=repo_type,
+        token=token,
+    )
+    print("done")
+# ────────────────────────────────────────────────────────────────────────
+def main():
+    print("=" * 60)
+    print("  MINDI 1.5 — Upload Everything to HuggingFace")
+    print("=" * 60)
+    print()
+    token = load_token()
+    api = HfApi()
+    # ── Create repos ───────────────────────────────────────────────
+    print("[1/4] Creating repositories ...")
+    ensure_repo(api, MODEL_REPO, "model", token)
+    ensure_repo(api, DATASET_REPO, "dataset", token)
+    print()
+    # ── REPO 1: Model (code + configs) ─────────────────────────────
+    print("[2/4] Uploading to MODEL repo:", MODEL_REPO)
+    print("-" * 50)
+    # Folders
+    model_folders = [
+        (PROJECT_ROOT / "src",            "src"),
+        (PROJECT_ROOT / "scripts",        "scripts"),
+        (PROJECT_ROOT / "configs",        "configs"),
+        (PROJECT_ROOT / "data" / "tokenizer", "data/tokenizer"),
+        (PROJECT_ROOT / "tests",          "tests"),
+        (PROJECT_ROOT / "api",            "api"),
+    ]
+    for local, remote in model_folders:
+        upload_folder(api, local, remote, MODEL_REPO, "model", token)
+    # Single files
+    model_files = [
+        (PROJECT_ROOT / "requirements.txt",   "requirements.txt"),
+        (PROJECT_ROOT / "setup.py",           "setup.py"),
+        (PROJECT_ROOT / "activate_mindi.bat", "activate_mindi.bat"),
+        (PROJECT_ROOT / ".env.example",       ".env.example"),
+    ]
+    for local, remote in model_files:
+        upload_file(api, local, remote, MODEL_REPO, "model", token)
+    # setup_mi300x.sh
+    mi300x_sh = PROJECT_ROOT / "setup_mi300x.sh"
+    if mi300x_sh.exists():
+        upload_file(api, mi300x_sh, "setup_mi300x.sh", MODEL_REPO, "model", token)
+    # Model card replaces README.md
+    upload_readme(api, MODEL_CARD, MODEL_REPO, "model", token)
+    print()
+    # ── REPO 2: Dataset ────────────────────────────────────────────
+    print("[3/4] Uploading to DATASET repo:", DATASET_REPO)
+    print("-" * 50)
+    processed = PROJECT_ROOT / "data" / "processed"
+    dataset_files = [
+        (processed / "train.jsonl",        "processed/train.jsonl"),
+        (processed / "val.jsonl",          "processed/val.jsonl"),
+        (processed / "test.jsonl",         "processed/test.jsonl"),
+        (processed / "mindi_filtered.jsonl", "processed/mindi_filtered.jsonl"),
+        (processed / "filter_report.json", "processed/filter_report.json"),
+        (processed / "split_meta.json",    "processed/split_meta.json"),
+    ]
+    for local, remote in dataset_files:
+        upload_file(api, local, remote, DATASET_REPO, "dataset", token)
+    # Raw data folder
+    upload_folder(
+        api, PROJECT_ROOT / "data" / "raw", "raw",
+        DATASET_REPO, "dataset", token,
+    )
+    # Tokenizer copy in dataset repo
+    upload_folder(
+        api, PROJECT_ROOT / "data" / "tokenizer", "tokenizer",
+        DATASET_REPO, "dataset", token,
+    )
+    # Dataset card
+    upload_readme(api, DATASET_CARD, DATASET_REPO, "dataset", token)
+    print()
+    # ── Done ───────────────────────────────────────────────────────
+    print("[4/4] Upload complete!")
+    print()
+    print("╔══════════════════════════════════════╗")
+    print("║ UPLOAD COMPLETE!                     ║")
+    print("║                                      ║")
+    print("║ Model repo:                          ║")
+    print("║ huggingface.co/Mindigenous/           ║")
+    print("║ MINDI-1.5-Vision-Coder               ║")
+    print("║                                      ║")
+    print("║ Dataset repo:                        ║")
+    print("║ huggingface.co/datasets/             ║")
+    print("║ Mindigenous/MINDI-1.5-training-data  ║")
+    print("║                                      ║")
+    print("║ On MI300X just run:                  ║")
+    print("║ git clone https://huggingface.co/    ║")
+    print("║ Mindigenous/MINDI-1.5-Vision-Coder   ║")
+    print("║                                      ║")
+    print("║ Ready to train! 🚀                   ║")
+    print("╚══════════════════════════════════════╝")
+# Run the upload only when this file is executed as a script, not on import.
+if __name__ == "__main__":
+    main()

setup_mi300x.sh ADDED Viewed

	@@ -0,0 +1,145 @@

+#!/bin/bash
+# ============================================================
+# MINDI 1.5 Vision-Coder — MI300X Setup Script
+# One command to set up everything on DigitalOcean AMD MI300X
+#
+# Usage:  export HF_TOKEN=hf_...  &&  bash setup_mi300x.sh
+# Steps:  ROCm PyTorch → clone project → pip requirements →
+#         download dataset → environment/.env → GPU check →
+#         output directories.
+# ============================================================
+# Abort on the first failing command so a broken step is never skipped.
+set -e
+echo "============================================================"
+echo "  MINDI 1.5 Vision-Coder — MI300X Setup"
+echo "  MINDIGENOUS.AI"
+echo "============================================================"
+echo ""
+# ── Check HF_TOKEN ─────────────────────────────────────────────
+if [ -z "$HF_TOKEN" ]; then
+    echo "ERROR: Set HF_TOKEN environment variable first!"
+    echo "  export HF_TOKEN=hf_your_token_here"
+    exit 1
+fi
+# ── Step 1: Install ROCm PyTorch ───────────────────────────────
+echo "[1/7] Installing ROCm PyTorch (ROCm 6.0) ..."
+pip install torch torchvision torchaudio \
+    --index-url https://download.pytorch.org/whl/rocm6.0
+# ── Step 2: Clone the full project from HF ────────────────────
+echo ""
+echo "[2/7] Cloning MINDI 1.5 from HuggingFace ..."
+if [ -d "MINDI-1.5-Vision-Coder" ]; then
+    echo "  Directory exists — pulling latest ..."
+    cd MINDI-1.5-Vision-Coder
+    git pull
+else
+    # NOTE(review): the token is embedded in the remote URL, so git stores it
+    # in plain text in .git/config of the clone — consider a credential helper.
+    git clone https://${HF_TOKEN}@huggingface.co/Mindigenous/MINDI-1.5-Vision-Coder
+    cd MINDI-1.5-Vision-Coder
+fi
+# ── Step 3: Install Python requirements ────────────────────────
+echo ""
+echo "[3/7] Installing Python requirements ..."
+# NOTE(review): if requirements.txt pins torch, this may replace the ROCm
+# build installed in step 1 — confirm against requirements.txt.
+pip install -r requirements.txt
+# Additional training dependencies
+pip install wandb huggingface_hub accelerate
+# ── Step 4: Download training data from HF ─────────────────────
+echo ""
+echo "[4/7] Downloading training dataset ..."
+python -c "
+from huggingface_hub import snapshot_download
+import os
+snapshot_download(
+    repo_id='Mindigenous/MINDI-1.5-training-data',
+    repo_type='dataset',
+    local_dir='data/',
+    token=os.environ['HF_TOKEN']
+)
+print('Dataset downloaded!')
+"
+# Verify data files exist
+echo "  Checking data files ..."
+if [ ! -f "data/processed/train.jsonl" ]; then
+    echo "  ERROR: train.jsonl not found!"
+    exit 1
+fi
+if [ ! -f "data/processed/val.jsonl" ]; then
+    echo "  ERROR: val.jsonl not found!"
+    exit 1
+fi
+TRAIN_SIZE=$(du -sh data/processed/train.jsonl | cut -f1)
+VAL_SIZE=$(du -sh data/processed/val.jsonl | cut -f1)
+echo "  train.jsonl: ${TRAIN_SIZE}"
+echo "  val.jsonl:   ${VAL_SIZE}"
+# ── Step 5: Set environment variables ──────────────────────────
+echo ""
+echo "[5/7] Setting environment variables ..."
+# ROCm / PyTorch settings.  These exports only affect this script and the
+# processes it starts; the .env file below carries them to later runs.
+# HSA_OVERRIDE_GFX_VERSION is deliberately NOT set: the MI300X is gfx942 and
+# forcing 11.0.0 would make ROCm treat it as a gfx1100 (RDNA3) device.
+unset HSA_OVERRIDE_GFX_VERSION
+export PYTORCH_ROCM_ARCH="gfx942"
+export HIP_VISIBLE_DEVICES=0
+export TOKENIZERS_PARALLELISM=false
+export WANDB_PROJECT="mindi-1.5-vision-coder"
+# Create .env file
+cat > .env << EOF
+HF_TOKEN=${HF_TOKEN}
+PYTORCH_ROCM_ARCH=gfx942
+HIP_VISIBLE_DEVICES=0
+TOKENIZERS_PARALLELISM=false
+WANDB_PROJECT=mindi-1.5-vision-coder
+EOF
+# The file contains the HF token, so restrict it to the current user.
+chmod 600 .env
+echo "  .env file created"
+# ── Step 6: Verify GPU detected ───────────────────────────────
+echo ""
+echo "[6/7] Verifying GPU ..."
+# ROCm builds of PyTorch expose the GPU through the torch.cuda API.
+python -c "
+import torch
+print(f'  PyTorch version: {torch.__version__}')
+print(f'  CUDA available:  {torch.cuda.is_available()}')
+if torch.cuda.is_available():
+    print(f'  GPU name:        {torch.cuda.get_device_name(0)}')
+    vram = torch.cuda.get_device_properties(0).total_memory / (1024**3)
+    print(f'  VRAM:            {vram:.1f} GB')
+    print(f'  ROCm backend:    {torch.version.hip is not None}')
+else:
+    print('  WARNING: No GPU detected!')
+    raise SystemExit(1)
+"
+# Quick bf16 test
+python -c "
+import torch
+x = torch.randn(100, 100, dtype=torch.bfloat16, device='cuda')
+y = torch.matmul(x, x.T)
+print(f'  bf16 matmul test: PASSED (shape={y.shape})')
+"
+# ── Step 7: Create output directories ─────────────────────────
+echo ""
+echo "[7/7] Creating output directories ..."
+mkdir -p checkpoints/training
+mkdir -p checkpoints/best
+mkdir -p logs/training
+# ── Done ───────────────────────────────────────────────────────
+echo ""
+echo "============================================================"
+echo "  MINDI 1.5 Vision-Coder — MI300X Ready!"
+echo ""
+echo "  Project:  $(pwd)"
+echo "  Data:     ${TRAIN_SIZE} train / ${VAL_SIZE} val"
+echo "  GPU:      $(python -c 'import torch; print(torch.cuda.get_device_name(0))' 2>/dev/null || echo 'N/A')"
+echo ""
+echo "  Ready to train!"
+echo "  Run:  python scripts/train.py --phase 1"
+echo ""
+echo "  Or dry run first:"
+echo "  Run:  python scripts/train.py --dry_run --no_wandb"
+echo "============================================================"

src/model/architecture.py ADDED Viewed

	@@ -0,0 +1,289 @@

+"""
+MINDI 1.5 Vision-Coder — Model Architecture
+Loads Qwen/Qwen2.5-Coder-7B-Instruct with LoRA adapters.
+Handles model initialization, LoRA application, save/load,
+and parameter counting for the base LLM component.
+"""
+from __future__ import annotations
+from pathlib import Path
+from typing import Optional
+import torch
+from peft import LoraConfig, PeftModel, TaskType, get_peft_model
+from transformers import AutoModelForCausalLM, AutoTokenizer
+class MINDIArchitecture:
+    """Qwen2.5-Coder-7B-Instruct with LoRA for MINDI 1.5 fine-tuning."""
+    DEFAULT_TARGET_MODULES: list[str] = [
+        "q_proj", "k_proj", "v_proj", "o_proj",
+        "gate_proj", "up_proj", "down_proj",
+    ]
+    def __init__(
+        self,
+        model_name: str = "Qwen/Qwen2.5-Coder-7B-Instruct",
+        device: Optional[str] = None,
+        cache_dir: Optional[Path] = None,
+        torch_dtype: torch.dtype = torch.bfloat16,
+    ) -> None:
+        """
+        Initialize the architecture wrapper.
+        Args:
+            model_name: HuggingFace model identifier.
+            device: Target device ('cuda', 'cpu', or None for auto).
+            cache_dir: Local directory for model weight cache.
+            torch_dtype: Data type for model weights.
+        """
+        self.model_name = model_name
+        self.device = device or ("cuda" if torch.cuda.is_available() else "cpu")
+        self.cache_dir = Path(cache_dir) if cache_dir else Path("./checkpoints/base")
+        self.cache_dir.mkdir(parents=True, exist_ok=True)
+        self.torch_dtype = torch_dtype
+        self.model: Optional[AutoModelForCausalLM] = None
+        self.peft_model: Optional[PeftModel] = None
+        self.tokenizer: Optional[AutoTokenizer] = None
+        self._load_model()
+    def _load_model(self) -> None:
+        """Load the base model and tokenizer from HuggingFace or cache."""
+        print(f"[MINDIArchitecture] Loading {self.model_name} ...")
+        self.model = AutoModelForCausalLM.from_pretrained(
+            self.model_name,
+            cache_dir=str(self.cache_dir),
+            torch_dtype=self.torch_dtype,
+            device_map="auto" if self.device == "cuda" else None,
+            trust_remote_code=True,
+        )
+        self.tokenizer = AutoTokenizer.from_pretrained(
+            self.model_name,
+            cache_dir=str(self.cache_dir),
+            trust_remote_code=True,
+        )
+        print(f"[MINDIArchitecture] Loaded on {self.device} "
+              f"({self._fmt_params(self._total_params())} params)")
+    def apply_lora(
+        self,
+        r: int = 64,
+        lora_alpha: int = 128,
+        lora_dropout: float = 0.05,
+        target_modules: Optional[list[str]] = None,
+    ) -> PeftModel:
+        """
+        Apply LoRA adapters to the base model.
+        Args:
+            r: LoRA rank.
+            lora_alpha: LoRA scaling factor.
+            lora_dropout: Dropout probability for LoRA layers.
+            target_modules: List of module names to apply LoRA to.
+        Returns:
+            The PEFT-wrapped model.
+        """
+        if self.model is None:
+            raise RuntimeError("Base model not loaded.")
+        if target_modules is None:
+            target_modules = self.DEFAULT_TARGET_MODULES
+        lora_config = LoraConfig(
+            r=r,
+            lora_alpha=lora_alpha,
+            lora_dropout=lora_dropout,
+            target_modules=target_modules,
+            bias="none",
+            task_type=TaskType.CAUSAL_LM,
+        )
+        self.peft_model = get_peft_model(self.model, lora_config)
+        info = self.get_trainable_params()
+        print(f"[MINDIArchitecture] LoRA applied (r={r}, alpha={lora_alpha})")
+        print(f"  Trainable:  {info['trainable']:>14,}  ({info['trainable_pct']:.2f}%)")
+        print(f"  Frozen:     {info['frozen']:>14,}")
+        print(f"  Total:      {info['total']:>14,}")
+        return self.peft_model
+    def get_trainable_params(self) -> dict:
+        """
+        Count trainable, frozen, and total parameters.
+        Returns:
+            Dictionary with 'trainable', 'frozen', 'total', 'trainable_pct'.
+        """
+        model = self.peft_model or self.model
+        if model is None:
+            return {"trainable": 0, "frozen": 0, "total": 0, "trainable_pct": 0.0}
+        trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
+        total = sum(p.numel() for p in model.parameters())
+        frozen = total - trainable
+        pct = 100.0 * trainable / total if total > 0 else 0.0
+        return {
+            "trainable": trainable,
+            "frozen": frozen,
+            "total": total,
+            "trainable_pct": round(pct, 4),
+        }
+    def print_model_info(self) -> None:
+        """Print detailed model architecture and parameter information."""
+        model = self.peft_model or self.model
+        if model is None:
+            print("[MINDIArchitecture] No model loaded.")
+            return
+        info = self.get_trainable_params()
+        print()
+        print("=" * 60)
+        print("  MINDI 1.5 — Model Architecture Info")
+        print("=" * 60)
+        print(f"  Base model:     {self.model_name}")
+        print(f"  Device:         {self.device}")
+        print(f"  Dtype:          {self.torch_dtype}")
+        print(f"  LoRA active:    {self.peft_model is not None}")
+        print(f"  Total params:   {self._fmt_params(info['total'])}")
+        print(f"  Trainable:      {self._fmt_params(info['trainable'])} "
+              f"({info['trainable_pct']:.2f}%)")
+        print(f"  Frozen:         {self._fmt_params(info['frozen'])}")
+        if self.peft_model is not None:
+            config = self.peft_model.peft_config.get("default")
+            if config is not None:
+                print(f"  LoRA rank:      {config.r}")
+                print(f"  LoRA alpha:     {config.lora_alpha}")
+                print(f"  LoRA dropout:   {config.lora_dropout}")
+                print(f"  Target modules: {config.target_modules}")
+        print("=" * 60)
+        print()
+    def save_lora(self, path: Optional[Path] = None) -> Path:
+        """
+        Save LoRA adapter weights to disk.
+        Args:
+            path: Directory to save to. Defaults to checkpoints/lora.
+        Returns:
+            Path where weights were saved.
+        """
+        if self.peft_model is None:
+            raise RuntimeError("No LoRA adapter to save. Call apply_lora() first.")
+        save_path = Path(path) if path else Path("./checkpoints/lora")
+        save_path.mkdir(parents=True, exist_ok=True)
+        self.peft_model.save_pretrained(str(save_path))
+        print(f"[MINDIArchitecture] LoRA saved to {save_path}")
+        return save_path
+    def load_lora(self, path: Path) -> PeftModel:
+        """
+        Load LoRA adapter weights from disk.
+        Args:
+            path: Directory containing saved adapter weights.
+        Returns:
+            The PEFT-wrapped model with loaded adapter.
+        """
+        path = Path(path)
+        if not path.exists():
+            raise FileNotFoundError(f"LoRA adapter not found: {path}")
+        if self.model is None:
+            raise RuntimeError("Base model not loaded.")
+        self.peft_model = PeftModel.from_pretrained(
+            self.model, str(path)
+        )
+        print(f"[MINDIArchitecture] LoRA loaded from {path}")
+        return self.peft_model
+    def resize_embeddings(self, new_vocab_size: int) -> None:
+        """Resize model embeddings for new special tokens."""
+        model = self.peft_model or self.model
+        if model is None:
+            raise RuntimeError("No model loaded.")
+        old_size = model.get_input_embeddings().weight.shape[0]
+        if new_vocab_size != old_size:
+            model.resize_token_embeddings(new_vocab_size)
+            print(f"[MINDIArchitecture] Resized embeddings: {old_size} → {new_vocab_size}")
+    def get_model(self) -> AutoModelForCausalLM | PeftModel:
+        """Return the active model (PEFT if LoRA applied, else base)."""
+        model = self.peft_model or self.model
+        if model is None:
+            raise RuntimeError("No model loaded.")
+        return model
+    # ── helpers ───────────────────────────────────────────────────
+    def _total_params(self) -> int:
+        model = self.peft_model or self.model
+        if model is None:
+            return 0
+        return sum(p.numel() for p in model.parameters())
+    @staticmethod
+    def _fmt_params(n: int) -> str:
+        if n >= 1_000_000_000:
+            return f"{n / 1_000_000_000:.2f}B"
+        if n >= 1_000_000:
+            return f"{n / 1_000_000:.2f}M"
+        if n >= 1_000:
+            return f"{n / 1_000:.1f}K"
+        return str(n)
+# ── Test block ────────────────────────────────────────────────────────
+if __name__ == "__main__":
+    print("=" * 60)
+    print("  MINDI 1.5 — Architecture Test")
+    print("=" * 60)
+    print()
+    # 1. Load base model
+    arch = MINDIArchitecture(
+        model_name="Qwen/Qwen2.5-Coder-7B-Instruct",
+    )
+    # 2. Apply LoRA
+    peft_model = arch.apply_lora(
+        r=64,
+        lora_alpha=128,
+        lora_dropout=0.05,
+    )
+    # 3. Print full info
+    arch.print_model_info()
+    # 4. Verify trainable params
+    info = arch.get_trainable_params()
+    assert info["trainable"] > 0, "No trainable parameters!"
+    assert info["frozen"] > info["trainable"], "More trainable than frozen — LoRA may not be applied!"
+    # 5. Verify LoRA modules exist
+    lora_modules = [name for name, _ in peft_model.named_parameters() if "lora_" in name]
+    print(f"  LoRA modules found: {len(lora_modules)}")
+    assert len(lora_modules) > 0, "No LoRA modules found!"
+    # 6. Quick forward pass test (small input)
+    print("\n  Running forward pass test ...")
+    test_input = arch.tokenizer("Hello MINDI!", return_tensors="pt")
+    test_input = {k: v.to(arch.device) for k, v in test_input.items()}
+    with torch.no_grad():
+        output = peft_model(**test_input)
+    print(f"  Output logits shape: {output.logits.shape}")
+    print(f"  Loss: {output.loss}")
+    print("\n  ✓ All architecture tests passed!")
+    print("=" * 60)

src/model/fusion_layer.py ADDED Viewed

	@@ -0,0 +1,221 @@

+"""
+MINDI 1.5 Vision-Coder — Vision-Language Fusion Layer
+Prepends projected visual tokens (256 × 4096) to text token embeddings
+and extends the attention mask accordingly.  Uses Linear + LayerNorm
+for the visual projection gate.
+"""
+from __future__ import annotations
+from typing import Optional
+import torch
+import torch.nn as nn
+class VisionLanguageFusion(nn.Module):
+    """
+    Fuses visual and text embeddings by prepending visual tokens.
+    Pipeline:
+        1. visual_tokens (batch, 256, 4096) → Linear → LayerNorm
+        2. Prepend to text_embeds (batch, seq_len, 4096)
+        3. Extend attention_mask to cover the extra 256 visual positions
+    All trainable parameters live in the gate projection + LayerNorm.
+    """
+    def __init__(
+        self,
+        hidden_size: int = 4096,
+        num_visual_tokens: int = 256,
+    ) -> None:
+        """
+        Initialize the fusion layer.
+        Args:
+            hidden_size: Dimension of both visual and text embeddings (must match).
+            num_visual_tokens: Number of visual tokens prepended (default 256).
+        """
+        super().__init__()
+        self.hidden_size = hidden_size
+        self.num_visual_tokens = num_visual_tokens
+        # Gate projection: Linear + LayerNorm to align visual features
+        self.gate_proj = nn.Linear(hidden_size, hidden_size)
+        self.layer_norm = nn.LayerNorm(hidden_size)
+    def forward(
+        self,
+        text_embeds: torch.Tensor,
+        visual_tokens: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+    ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
+        """
+        Fuse visual tokens into text embeddings.
+        Args:
+            text_embeds: Text token embeddings (batch, seq_len, hidden_size).
+            visual_tokens: Projected visual tokens (batch, 256, hidden_size), or None
+                           for text-only inputs.
+            attention_mask: Text attention mask (batch, seq_len), or None.
+        Returns:
+            fused_embeds: (batch, 256 + seq_len, hidden_size) if visual, else unchanged.
+            fused_mask: Extended attention mask, or None if input mask was None.
+        """
+        # Text-only path — no vision tokens to fuse
+        if visual_tokens is None:
+            return text_embeds, attention_mask
+        batch_size = text_embeds.shape[0]
+        v_batch = visual_tokens.shape[0]
+        # Handle batch size mismatch (single image broadcast to batch)
+        if v_batch == 1 and batch_size > 1:
+            visual_tokens = visual_tokens.expand(batch_size, -1, -1)
+        # Gate projection + LayerNorm
+        gated_visual = self.gate_proj(visual_tokens)   # (batch, 256, hidden_size)
+        gated_visual = self.layer_norm(gated_visual)    # (batch, 256, hidden_size)
+        # Prepend visual tokens to text embeddings
+        fused_embeds = torch.cat([gated_visual, text_embeds], dim=1)
+        # Extend attention mask
+        fused_mask = self._extend_attention_mask(attention_mask, batch_size, text_embeds.device)
+        return fused_embeds, fused_mask
+    def _extend_attention_mask(
+        self,
+        attention_mask: Optional[torch.Tensor],
+        batch_size: int,
+        device: torch.device,
+    ) -> Optional[torch.Tensor]:
+        """
+        Extend attention mask to include visual token positions (all attended).
+        Args:
+            attention_mask: Original text mask (batch, seq_len) or None.
+            batch_size: Current batch size.
+            device: Target device.
+        Returns:
+            Extended mask (batch, 256 + seq_len) or None.
+        """
+        if attention_mask is None:
+            return None
+        # Visual tokens are always fully attended
+        visual_mask = torch.ones(
+            batch_size,
+            self.num_visual_tokens,
+            dtype=attention_mask.dtype,
+            device=device,
+        )
+        return torch.cat([visual_mask, attention_mask], dim=1)
+    def get_trainable_params(self) -> dict:
+        """
+        Count trainable parameters in the fusion layer.
+        Returns:
+            Dictionary with 'trainable', 'total', and 'trainable_pct'.
+        """
+        trainable = sum(p.numel() for p in self.parameters() if p.requires_grad)
+        total = sum(p.numel() for p in self.parameters())
+        pct = 100.0 * trainable / total if total > 0 else 0.0
+        return {
+            "trainable": trainable,
+            "total": total,
+            "trainable_pct": round(pct, 4),
+        }
+    def extra_repr(self) -> str:
+        return (
+            f"hidden_size={self.hidden_size}, "
+            f"num_visual_tokens={self.num_visual_tokens}"
+        )
+# ── Test block ────────────────────────────────────────────────────────
+if __name__ == "__main__":
+    print("=" * 60)
+    print("  MINDI 1.5 — Fusion Layer Test")
+    print("=" * 60)
+    print()
+    BATCH = 2
+    SEQ_LEN = 128
+    HIDDEN = 4096
+    N_VIS = 256
+    fusion = VisionLanguageFusion(hidden_size=HIDDEN, num_visual_tokens=N_VIS)
+    print(f"  Fusion layer:\n  {fusion}\n")
+    # ── Test 1: Vision + Text fusion ─────────────────────────────
+    print("  Test 1: Vision + Text fusion")
+    text_embeds = torch.randn(BATCH, SEQ_LEN, HIDDEN)
+    visual_tokens = torch.randn(BATCH, N_VIS, HIDDEN)
+    attention_mask = torch.ones(BATCH, SEQ_LEN, dtype=torch.long)
+    fused_embeds, fused_mask = fusion(text_embeds, visual_tokens, attention_mask)
+    expected_seq = N_VIS + SEQ_LEN  # 256 + 128 = 384
+    assert fused_embeds.shape == (BATCH, expected_seq, HIDDEN), \
+        f"Expected ({BATCH}, {expected_seq}, {HIDDEN}), got {fused_embeds.shape}"
+    assert fused_mask is not None and fused_mask.shape == (BATCH, expected_seq), \
+        f"Expected mask ({BATCH}, {expected_seq}), got {fused_mask.shape}"
+    print(f"    fused_embeds: {fused_embeds.shape} ✓")
+    print(f"    fused_mask:   {fused_mask.shape} ✓")
+    # ── Test 2: Text-only (no vision) ────────────────────────────
+    print("\n  Test 2: Text-only (no vision)")
+    text_only, mask_only = fusion(text_embeds, None, attention_mask)
+    assert text_only.shape == (BATCH, SEQ_LEN, HIDDEN)
+    assert mask_only is not None and mask_only.shape == (BATCH, SEQ_LEN)
+    print(f"    text_only:  {text_only.shape} ✓")
+    print(f"    mask_only:  {mask_only.shape} ✓")
+    # ── Test 3: No attention mask ────────────────────────────────
+    print("\n  Test 3: Vision fusion without attention mask")
+    fused_no_mask, none_mask = fusion(text_embeds, visual_tokens, None)
+    assert fused_no_mask.shape == (BATCH, expected_seq, HIDDEN)
+    assert none_mask is None
+    print(f"    fused_embeds: {fused_no_mask.shape} ✓")
+    print(f"    fused_mask:   None ✓")
+    # ── Test 4: Single-image broadcast ───────────────────────────
+    print("\n  Test 4: Single-image broadcast to batch")
+    single_visual = torch.randn(1, N_VIS, HIDDEN)
+    fused_bc, mask_bc = fusion(text_embeds, single_visual, attention_mask)
+    assert fused_bc.shape == (BATCH, expected_seq, HIDDEN)
+    print(f"    fused_embeds: {fused_bc.shape} ✓ (broadcast 1 → {BATCH})")
+    # ── Test 5: Trainable params ─────────────────────────────────
+    print("\n  Test 5: Parameter counts")
+    info = fusion.get_trainable_params()
+    # gate_proj: 4096*4096 + 4096 = 16,781,312
+    # layer_norm: 4096 + 4096 = 8,192
+    expected_params = HIDDEN * HIDDEN + HIDDEN + HIDDEN + HIDDEN  # Linear(w+b) + LN(w+b)
+    assert info["trainable"] == expected_params, \
+        f"Expected {expected_params}, got {info['trainable']}"
+    print(f"    Trainable: {info['trainable']:,}")
+    print(f"    Total:     {info['total']:,}")
+    print(f"    Pct:       {info['trainable_pct']}%")
+    # ── Test 6: Gradient flow ────────────────────────────────────
+    print("\n  Test 6: Gradient flow through fusion")
+    fusion.zero_grad()
+    fused_embeds, _ = fusion(text_embeds, visual_tokens, attention_mask)
+    loss = fused_embeds.sum()
+    loss.backward()
+    assert fusion.gate_proj.weight.grad is not None, "No gradient on gate_proj!"
+    assert fusion.layer_norm.weight.grad is not None, "No gradient on layer_norm!"
+    print("    gate_proj gradient:  ✓")
+    print("    layer_norm gradient: ✓")
+    print("\n  ✓ All fusion layer tests passed!")
+    print("=" * 60)

src/model/mindi_model.py ADDED Viewed

	@@ -0,0 +1,620 @@

+"""
+MINDI 1.5 Vision-Coder — Complete Model
+Combines MINDIArchitecture (Qwen2.5-Coder + LoRA), VisionEncoder (CLIP ViT-L/14),
+and VisionLanguageFusion into a single MINDI15 class with forward(), generate(),
+parse_output(), save(), and load() methods.
+Uses the MINDI custom tokenizer (data/tokenizer/mindi_tokenizer/) with 22 special
+tokens for agentic code generation capabilities.
+"""
+from __future__ import annotations
+import re
+from pathlib import Path
+from typing import Optional
+import torch
+import torch.nn as nn
+from PIL import Image
+from transformers import AutoTokenizer, PreTrainedTokenizerFast
+from src.model.architecture import MINDIArchitecture
+from src.model.fusion_layer import VisionLanguageFusion
+from src.model.vision_encoder import VisionEncoder
+# ── MINDI special token pairs ────────────────────────────────────────
+MINDI_SECTION_TOKENS: dict[str, tuple[str, str]] = {
+    "thinking":  ("<|think_start|>",    "<|think_end|>"),
+    "file":      ("<|file_start|>",     "<|file_end|>"),
+    "code":      ("<|code_start|>",     "<|code_end|>"),
+    "critique":  ("<|critique_start|>", "<|critique_end|>"),
+    "suggest":   ("<|suggest_start|>",  "<|suggest_end|>"),
+    "search":    ("<|search_start|>",   "<|search_end|>"),
+    "error":     ("<|error_start|>",    "<|error_end|>"),
+    "fix":       ("<|fix_start|>",      "<|fix_end|>"),
+}
+# Project root (resolved relative to this file)
+PROJECT_ROOT = Path(__file__).resolve().parent.parent.parent
+DEFAULT_TOKENIZER_PATH = PROJECT_ROOT / "data" / "tokenizer" / "mindi_tokenizer"
+class MINDI15(nn.Module):
+    """
+    MINDI 1.5 Vision-Coder — complete multimodal coding model.
+    Components:
+        - architecture: Qwen2.5-Coder-7B-Instruct + LoRA
+        - vision_encoder: CLIP ViT-L/14 (frozen) → 256 tokens × 4096
+        - fusion: Linear + LayerNorm prepend fusion
+        - tokenizer: MINDI custom tokenizer with 22 special tokens
+    """
+    def __init__(
+        self,
+        model_name: str = "Qwen/Qwen2.5-Coder-7B-Instruct",
+        clip_model: str = "openai/clip-vit-large-patch14",
+        hidden_size: int = 4096,
+        num_visual_tokens: int = 256,
+        tokenizer_path: Optional[Path] = None,
+        device: Optional[str] = None,
+        torch_dtype: torch.dtype = torch.bfloat16,
+        cache_dir: Optional[Path] = None,
+    ) -> None:
+        """
+        Initialize MINDI 1.5 with all components.
+        Args:
+            model_name: HuggingFace base LLM identifier.
+            clip_model: HuggingFace CLIP vision model identifier.
+            hidden_size: LLM hidden dimension (must match Qwen config).
+            num_visual_tokens: Number of visual tokens from CLIP (256).
+            tokenizer_path: Path to MINDI custom tokenizer directory.
+            device: Target device ('cuda', 'cpu', or None for auto).
+            torch_dtype: Data type for model weights.
+            cache_dir: Base directory for model weight caches.
+        """
+        super().__init__()
+        self.device = device or ("cuda" if torch.cuda.is_available() else "cpu")
+        self.hidden_size = hidden_size
+        self.num_visual_tokens = num_visual_tokens
+        self.torch_dtype = torch_dtype
+        cache_base = Path(cache_dir) if cache_dir else PROJECT_ROOT / "checkpoints"
+        print("=" * 60)
+        print("  MINDI 1.5 Vision-Coder — Initializing")
+        print("=" * 60)
+        # 1. Load MINDI custom tokenizer (NOT the base Qwen tokenizer)
+        tok_path = Path(tokenizer_path) if tokenizer_path else DEFAULT_TOKENIZER_PATH
+        print(f"\n[MINDI15] Loading MINDI tokenizer from {tok_path} ...")
+        self.tokenizer: PreTrainedTokenizerFast = AutoTokenizer.from_pretrained(
+            str(tok_path),
+            trust_remote_code=True,
+        )
+        print(f"  Vocab size: {len(self.tokenizer)}")
+        # 2. LLM backbone with LoRA
+        self.architecture = MINDIArchitecture(
+            model_name=model_name,
+            device=self.device,
+            cache_dir=cache_base / "base",
+            torch_dtype=torch_dtype,
+        )
+        # Resize embeddings to match MINDI tokenizer (includes 22 special tokens)
+        self.architecture.resize_embeddings(len(self.tokenizer))
+        # Apply LoRA
+        self.architecture.apply_lora()
+        # 3. Vision encoder (frozen CLIP + trainable projection)
+        self.vision_encoder = VisionEncoder(
+            model_name=clip_model,
+            llm_hidden_size=hidden_size,
+            device=self.device,
+            cache_dir=cache_base / "vision",
+        )
+        # 4. Fusion layer
+        self.fusion = VisionLanguageFusion(
+            hidden_size=hidden_size,
+            num_visual_tokens=num_visual_tokens,
+        )
+        self.fusion.to(self.device)
+        # Cache special token IDs
+        self._special_ids: dict[str, int] = {}
+        for section, (start_tok, end_tok) in MINDI_SECTION_TOKENS.items():
+            sid = self.tokenizer.convert_tokens_to_ids(start_tok)
+            eid = self.tokenizer.convert_tokens_to_ids(end_tok)
+            self._special_ids[f"{section}_start"] = sid
+            self._special_ids[f"{section}_end"] = eid
+        self._print_summary()
+    def _print_summary(self) -> None:
+        """Print initialization summary."""
+        llm_info = self.architecture.get_trainable_params()
+        vis_info = {
+            "trainable": sum(p.numel() for p in self.vision_encoder.parameters() if p.requires_grad),
+            "total": sum(p.numel() for p in self.vision_encoder.parameters()),
+        }
+        fus_info = self.fusion.get_trainable_params()
+        total_trainable = llm_info["trainable"] + vis_info["trainable"] + fus_info["trainable"]
+        total_all = llm_info["total"] + vis_info["total"] + fus_info["total"]
+        print()
+        print("=" * 60)
+        print("  MINDI 1.5 — Initialization Complete")
+        print("=" * 60)
+        print(f"  LLM trainable (LoRA):   {llm_info['trainable']:>14,}")
+        print(f"  Vision trainable:       {vis_info['trainable']:>14,}")
+        print(f"  Fusion trainable:       {fus_info['trainable']:>14,}")
+        print(f"  ─────────────────────────────────────")
+        print(f"  Total trainable:        {total_trainable:>14,}")
+        print(f"  Total params:           {total_all:>14,}")
+        print(f"  Tokenizer vocab:        {len(self.tokenizer):>14,}")
+        print("=" * 60)
+        print()
+    # ── Forward pass ──────────────────────────────────────────────
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        labels: Optional[torch.Tensor] = None,
+        image: Optional[Image.Image] = None,
+    ) -> dict:
+        """
+        Forward pass with optional vision input.
+        Args:
+            input_ids: Token IDs (batch, seq_len).
+            attention_mask: Attention mask (batch, seq_len).
+            labels: Target token IDs for loss computation (batch, seq_len).
+            image: Optional PIL image for multimodal input.
+        Returns:
+            Dict with 'loss', 'logits', and optionally 'visual_tokens'.
+        """
+        model = self.architecture.get_model()
+        # Get text embeddings from the LLM's embedding layer
+        text_embeds = model.get_input_embeddings()(input_ids)
+        # Encode vision if image provided
+        visual_tokens = None
+        if image is not None:
+            visual_tokens = self.vision_encoder.encode_image(image)
+        # Fuse vision + text
+        fused_embeds, fused_mask = self.fusion(text_embeds, visual_tokens, attention_mask)
+        # Extend labels if vision tokens were prepended
+        if visual_tokens is not None and labels is not None:
+            batch_size = labels.shape[0]
+            # -100 = ignore index for cross-entropy on visual positions
+            visual_labels = torch.full(
+                (batch_size, self.num_visual_tokens),
+                fill_value=-100,
+                dtype=labels.dtype,
+                device=labels.device,
+            )
+            labels = torch.cat([visual_labels, labels], dim=1)
+        # Forward through LLM with embeddings (bypass tokenization)
+        outputs = model(
+            inputs_embeds=fused_embeds,
+            attention_mask=fused_mask,
+            labels=labels,
+        )
+        result = {
+            "loss": outputs.loss,
+            "logits": outputs.logits,
+        }
+        if visual_tokens is not None:
+            result["visual_tokens"] = visual_tokens
+        return result
+    # ── Generation ────────────────────────────────────────────────
+    @torch.no_grad()
+    def generate(
+        self,
+        prompt: str,
+        image: Optional[Image.Image] = None,
+        max_new_tokens: int = 2048,
+        temperature: float = 0.7,
+        top_p: float = 0.9,
+        top_k: int = 50,
+        do_sample: bool = True,
+        repetition_penalty: float = 1.1,
+    ) -> str:
+        """
+        Generate text from a prompt, optionally conditioned on an image.
+        Uses the MINDI custom tokenizer (with special tokens) for both
+        encoding the prompt and decoding the output.
+        Args:
+            prompt: Input text prompt.
+            image: Optional PIL image for multimodal generation.
+            max_new_tokens: Maximum tokens to generate.
+            temperature: Sampling temperature.
+            top_p: Nucleus sampling threshold.
+            top_k: Top-k sampling threshold.
+            do_sample: Whether to sample (False = greedy).
+            repetition_penalty: Penalty for repeated tokens.
+        Returns:
+            Generated text string (decoded with MINDI tokenizer).
+        """
+        model = self.architecture.get_model()
+        model.eval()
+        # Tokenize with MINDI tokenizer
+        inputs = self.tokenizer(prompt, return_tensors="pt")
+        input_ids = inputs["input_ids"].to(self.device)
+        attention_mask = inputs["attention_mask"].to(self.device)
+        # If image provided, build fused embeddings
+        if image is not None:
+            text_embeds = model.get_input_embeddings()(input_ids)
+            visual_tokens = self.vision_encoder.encode_image(image)
+            fused_embeds, fused_mask = self.fusion(text_embeds, visual_tokens, attention_mask)
+            output_ids = model.generate(
+                inputs_embeds=fused_embeds,
+                attention_mask=fused_mask,
+                max_new_tokens=max_new_tokens,
+                temperature=temperature,
+                top_p=top_p,
+                top_k=top_k,
+                do_sample=do_sample,
+                repetition_penalty=repetition_penalty,
+                pad_token_id=self.tokenizer.pad_token_id or self.tokenizer.eos_token_id,
+            )
+        else:
+            # Text-only generation (direct input_ids)
+            output_ids = model.generate(
+                input_ids=input_ids,
+                attention_mask=attention_mask,
+                max_new_tokens=max_new_tokens,
+                temperature=temperature,
+                top_p=top_p,
+                top_k=top_k,
+                do_sample=do_sample,
+                repetition_penalty=repetition_penalty,
+                pad_token_id=self.tokenizer.pad_token_id or self.tokenizer.eos_token_id,
+            )
+        # Decode only the newly generated tokens
+        generated_ids = output_ids[:, input_ids.shape[1]:]
+        text = self.tokenizer.decode(generated_ids[0], skip_special_tokens=False)
+        return text.strip()
+    # ── Output parsing ────────────────────────────────────────────
+    @staticmethod
+    def parse_output(text: str) -> dict[str, list[str]]:
+        """
+        Parse generated text and extract ALL MINDI special-token sections.
+        Extracts content between each pair of special tokens:
+            <|think_start|> ... <|think_end|>     → "thinking"
+            <|file_start|> ... <|file_end|>       → "file"
+            <|code_start|> ... <|code_end|>       → "code"
+            <|critique_start|> ... <|critique_end|> → "critique"
+            <|suggest_start|> ... <|suggest_end|>   → "suggest"
+            <|search_start|> ... <|search_end|>     → "search"
+            <|error_start|> ... <|error_end|>       → "error"
+            <|fix_start|> ... <|fix_end|>           → "fix"
+        Each section may appear multiple times; all occurrences are captured.
+        Args:
+            text: Raw generated text potentially containing special tokens.
+        Returns:
+            Dict mapping section name → list of extracted content strings.
+            Empty list if section not found. Also includes "raw" with full text.
+        """
+        result: dict[str, list[str]] = {"raw": [text]}
+        for section, (start_tok, end_tok) in MINDI_SECTION_TOKENS.items():
+            # Escape the pipe characters for regex
+            pattern = re.escape(start_tok) + r"(.*?)" + re.escape(end_tok)
+            matches = re.findall(pattern, text, flags=re.DOTALL)
+            result[section] = [m.strip() for m in matches]
+        return result
+    # ── Phase control (for 3-phase training) ──────────────────────
+    def set_trainable_components(
+        self,
+        lora: bool = False,
+        vision_projection: bool = False,
+        fusion: bool = False,
+    ) -> dict[str, int]:
+        """
+        Enable/disable training for specific components.
+        Used by the trainer to implement 3-phase training:
+            Phase 1: lora=True,  vision_projection=False, fusion=False
+            Phase 2: lora=False, vision_projection=True,  fusion=True
+            Phase 3: lora=True,  vision_projection=True,  fusion=True
+        Args:
+            lora: Whether LoRA adapter parameters should be trainable.
+            vision_projection: Whether the vision projection layer should train.
+            fusion: Whether the fusion layer should be trainable.
+        Returns:
+            Dict with trainable param counts per component.
+        """
+        counts = {}
+        # LoRA parameters
+        peft_model = self.architecture.peft_model
+        if peft_model is not None:
+            for name, param in peft_model.named_parameters():
+                if "lora_" in name:
+                    param.requires_grad = lora
+        counts["lora"] = sum(
+            p.numel() for n, p in (peft_model or self.architecture.model).named_parameters()
+            if "lora_" in n and p.requires_grad
+        )
+        # Vision projection
+        for param in self.vision_encoder.projection.parameters():
+            param.requires_grad = vision_projection
+        counts["vision_projection"] = sum(
+            p.numel() for p in self.vision_encoder.projection.parameters() if p.requires_grad
+        )
+        # Fusion layer
+        for param in self.fusion.parameters():
+            param.requires_grad = fusion
+        counts["fusion"] = sum(
+            p.numel() for p in self.fusion.parameters() if p.requires_grad
+        )
+        counts["total_trainable"] = counts["lora"] + counts["vision_projection"] + counts["fusion"]
+        print(f"[MINDI15] Trainable: LoRA={counts['lora']:,} | "
+              f"VisionProj={counts['vision_projection']:,} | "
+              f"Fusion={counts['fusion']:,} | "
+              f"Total={counts['total_trainable']:,}")
+        return counts
+    # ── Save / Load ───────────────────────────────────────────────
+    def save(self, save_dir: Optional[Path] = None) -> Path:
+        """
+        Save all trainable weights (LoRA + vision projection + fusion).
+        Args:
+            save_dir: Root directory for saving. Defaults to checkpoints/mindi15.
+        Returns:
+            Path to save directory.
+        """
+        save_path = Path(save_dir) if save_dir else PROJECT_ROOT / "checkpoints" / "mindi15"
+        save_path.mkdir(parents=True, exist_ok=True)
+        # LoRA adapter
+        self.architecture.save_lora(save_path / "lora")
+        # Vision projection
+        self.vision_encoder.save_projection(save_path / "vision")
+        # Fusion layer
+        fusion_path = save_path / "fusion"
+        fusion_path.mkdir(parents=True, exist_ok=True)
+        torch.save(self.fusion.state_dict(), fusion_path / "fusion.pt")
+        print(f"[MINDI15] All weights saved to {save_path}")
+        return save_path
+    def load(self, load_dir: Path) -> None:
+        """
+        Load all trainable weights (LoRA + vision projection + fusion).
+        Args:
+            load_dir: Root directory containing saved weights.
+        """
+        load_path = Path(load_dir)
+        if not load_path.exists():
+            raise FileNotFoundError(f"Checkpoint not found: {load_path}")
+        # LoRA adapter
+        lora_path = load_path / "lora"
+        if lora_path.exists():
+            self.architecture.load_lora(lora_path)
+        # Vision projection
+        vision_path = load_path / "vision"
+        if vision_path.exists():
+            self.vision_encoder.load_projection(vision_path)
+        # Fusion layer
+        fusion_file = load_path / "fusion" / "fusion.pt"
+        if fusion_file.exists():
+            state_dict = torch.load(fusion_file, map_location=self.device, weights_only=True)
+            self.fusion.load_state_dict(state_dict)
+            print(f"[MINDI15] Fusion loaded from {fusion_file.parent}")
+        print(f"[MINDI15] All weights loaded from {load_path}")
+    # ── Utilities ─────────────────────────────────────────────────
+    def get_all_trainable_params(self) -> dict:
+        """Get combined trainable parameter counts across all components."""
+        llm = self.architecture.get_trainable_params()
+        vis_trainable = sum(
+            p.numel() for p in self.vision_encoder.parameters() if p.requires_grad
+        )
+        fus = self.fusion.get_trainable_params()
+        total_trainable = llm["trainable"] + vis_trainable + fus["trainable"]
+        total_all = llm["total"] + sum(p.numel() for p in self.vision_encoder.parameters()) + fus["total"]
+        return {
+            "llm_trainable": llm["trainable"],
+            "llm_total": llm["total"],
+            "vision_trainable": vis_trainable,
+            "fusion_trainable": fus["trainable"],
+            "total_trainable": total_trainable,
+            "total_params": total_all,
+            "trainable_pct": round(100.0 * total_trainable / total_all, 4) if total_all > 0 else 0.0,
+        }
+    def print_info(self) -> None:
+        """Print complete model information."""
+        self.architecture.print_model_info()
+        info = self.get_all_trainable_params()
+        print("  MINDI 1.5 Combined Trainable Parameters:")
+        print(f"    LLM (LoRA):       {info['llm_trainable']:>14,}")
+        print(f"    Vision proj:      {info['vision_trainable']:>14,}")
+        print(f"    Fusion:           {info['fusion_trainable']:>14,}")
+        print(f"    Total trainable:  {info['total_trainable']:>14,}")
+        print(f"    Total params:     {info['total_params']:>14,}")
+        print(f"    Trainable %:      {info['trainable_pct']:>13.2f}%")
+        print()
+# ── Test block ────────────────────────────────────────────────────────
+if __name__ == "__main__":
+    # Self-test script. Tests 1-4 only exercise parse_output() and the section
+    # token table, so they run on any machine; tests 5-12 build the full model
+    # and are gated on torch.cuda.is_available().
+    print("=" * 60)
+    print("  MINDI 1.5 — Complete Model Test")
+    print("=" * 60)
+    print()
+    # ── Test 1: parse_output (no GPU needed) ─────────────────────
+    print("  Test 1: parse_output()")
+    # One occurrence of every section, plus a second "thinking" section to
+    # check that repeated sections are all captured
+    sample_output = (
+        "<|think_start|>The user wants a Python function.<|think_end|>"
+        "<|file_start|>main.py<|file_end|>"
+        "<|code_start|>def hello():\n    print('Hello MINDI!')<|code_end|>"
+        "<|critique_start|>Missing type hints and docstring.<|critique_end|>"
+        "<|suggest_start|>Add return type annotation.<|suggest_end|>"
+        "<|search_start|>python type hints best practices<|search_end|>"
+        "<|error_start|>NameError: name 'x' is not defined<|error_end|>"
+        "<|fix_start|>Add x = 0 before the loop.<|fix_end|>"
+        "<|think_start|>Let me also add error handling.<|think_end|>"
+    )
+    parsed = MINDI15.parse_output(sample_output)
+    assert len(parsed["thinking"]) == 2, f"Expected 2 thinking sections, got {len(parsed['thinking'])}"
+    assert parsed["thinking"][0] == "The user wants a Python function."
+    assert parsed["thinking"][1] == "Let me also add error handling."
+    assert parsed["file"] == ["main.py"]
+    assert parsed["code"] == ["def hello():\n    print('Hello MINDI!')"]
+    assert parsed["critique"] == ["Missing type hints and docstring."]
+    assert parsed["suggest"] == ["Add return type annotation."]
+    assert parsed["search"] == ["python type hints best practices"]
+    assert parsed["error"] == ["NameError: name 'x' is not defined"]
+    assert parsed["fix"] == ["Add x = 0 before the loop."]
+    assert "raw" in parsed
+    print("    All 8 section types extracted correctly ✓")
+    print(f"    Sections found: {[k for k, v in parsed.items() if k != 'raw' and v]}")
+    # ── Test 2: parse_output with missing sections ───────────────
+    print("\n  Test 2: parse_output() with partial output")
+    partial = "<|code_start|>print('hi')<|code_end|>"
+    parsed2 = MINDI15.parse_output(partial)
+    assert parsed2["code"] == ["print('hi')"]
+    assert parsed2["thinking"] == []
+    assert parsed2["file"] == []
+    assert parsed2["fix"] == []
+    print("    Missing sections return empty lists ✓")
+    # ── Test 3: parse_output with empty input ────────────────────
+    print("\n  Test 3: parse_output() with empty string")
+    parsed3 = MINDI15.parse_output("")
+    # "raw" always holds the input text, so it is excluded from the emptiness check
+    assert all(v == [] for k, v in parsed3.items() if k != "raw")
+    print("    Empty input returns all empty lists ✓")
+    # ── Test 4: Verify MINDI_SECTION_TOKENS covers all 8 ────────
+    print("\n  Test 4: Token coverage")
+    expected_sections = {"thinking", "file", "code", "critique", "suggest", "search", "error", "fix"}
+    assert set(MINDI_SECTION_TOKENS.keys()) == expected_sections
+    print(f"    All 8 sections defined: {sorted(expected_sections)} ✓")
+    # ── GPU-dependent tests (skip if no CUDA) ────────────────────
+    if torch.cuda.is_available():
+        print("\n  Test 5: Full model initialization (GPU)")
+        model = MINDI15()
+        model.print_info()
+        # Test set_trainable_components (Phase 1)
+        print("\n  Test 6: Phase 1 — LoRA only")
+        counts = model.set_trainable_components(lora=True, vision_projection=False, fusion=False)
+        assert counts["lora"] > 0
+        assert counts["vision_projection"] == 0
+        assert counts["fusion"] == 0
+        # Test set_trainable_components (Phase 2)
+        print("\n  Test 7: Phase 2 — Vision bridge only")
+        counts = model.set_trainable_components(lora=False, vision_projection=True, fusion=True)
+        assert counts["lora"] == 0
+        assert counts["vision_projection"] > 0
+        assert counts["fusion"] > 0
+        # Test set_trainable_components (Phase 3)
+        # Leaves every component trainable for the forward passes below
+        print("\n  Test 8: Phase 3 — All trainable")
+        counts = model.set_trainable_components(lora=True, vision_projection=True, fusion=True)
+        assert counts["lora"] > 0
+        assert counts["vision_projection"] > 0
+        assert counts["fusion"] > 0
+        # Test forward (text only)
+        print("\n  Test 9: Forward pass (text only)")
+        tokens = model.tokenizer("Hello MINDI!", return_tensors="pt")
+        input_ids = tokens["input_ids"].to(model.device)
+        attn_mask = tokens["attention_mask"].to(model.device)
+        # The input ids double as labels; only checks that a loss is produced
+        result = model(input_ids=input_ids, attention_mask=attn_mask, labels=input_ids)
+        assert result["loss"] is not None
+        print(f"    Loss: {result['loss'].item():.4f}")
+        print(f"    Logits: {result['logits'].shape}")
+        # Test forward (with image)
+        print("\n  Test 10: Forward pass (with dummy image)")
+        dummy_img = Image.new("RGB", (224, 224), color=(100, 150, 200))
+        result_v = model(input_ids=input_ids, attention_mask=attn_mask, labels=input_ids, image=dummy_img)
+        assert result_v["loss"] is not None
+        assert "visual_tokens" in result_v
+        print(f"    Loss: {result_v['loss'].item():.4f}")
+        print(f"    Visual tokens: {result_v['visual_tokens'].shape}")
+        # Test generate (text only)
+        # NOTE(review): image-conditioned generate() is not exercised by any test here
+        print("\n  Test 11: Generate (text only, short)")
+        output = model.generate("Write a hello world in Python:", max_new_tokens=50)
+        print(f"    Output: {output[:100]}...")
+        # NOTE(review): labelled "round-trip", but only save() is called and the
+        # output files are checked; load() is never exercised in this test
+        print("\n  Test 12: Save/load round-trip")
+        import tempfile
+        with tempfile.TemporaryDirectory() as tmp:
+            model.save(Path(tmp))
+            # Verify files exist
+            assert (Path(tmp) / "lora").exists()
+            assert (Path(tmp) / "vision" / "projection.pt").exists()
+            assert (Path(tmp) / "fusion" / "fusion.pt").exists()
+            print("    Save ✓")
+    else:
+        print("\n  [SKIP] GPU tests (no CUDA available)")
+        print("  Tests 5-12 require GPU with ~20GB VRAM")
+    # Printed on both paths, including when the GPU tests were skipped
+    print("\n  ✓ All MINDI 1.5 model tests passed!")
+    print("=" * 60)

src/model/vision_encoder.py CHANGED Viewed

@@ -1,8 +1,9 @@
 """
 MINDI 1.5 Vision-Coder — Vision Encoder
-Uses CLIP ViT-L/14 to encode UI screenshots into embeddings
-that the coding model can understand and critique.
 """
 from __future__ import annotations
@@ -13,79 +14,237 @@ from typing import Optional
 import torch
 import torch.nn as nn
 from PIL import Image
-from transformers import CLIPModel, CLIPProcessor
 class VisionEncoder(nn.Module):
-    """CLIP-based vision encoder for UI screenshot understanding."""
     def __init__(
         self,
         model_name: str = "openai/clip-vit-large-patch14",
-        projection_dim: int = 768,
         device: Optional[str] = None,
         cache_dir: Optional[Path] = None,
     ) -> None:
         super().__init__()
         self.device = device or ("cuda" if torch.cuda.is_available() else "cpu")
-        self.cache_dir = cache_dir or Path("./checkpoints/vision")
         self.cache_dir.mkdir(parents=True, exist_ok=True)
-        # Load CLIP model and processor
-        self.clip: CLIPModel = CLIPModel.from_pretrained(
-            model_name, cache_dir=str(self.cache_dir)
         )
-        self.processor: CLIPProcessor = CLIPProcessor.from_pretrained(
-            model_name, cache_dir=str(self.cache_dir)
         )
-        # Freeze CLIP backbone — we only train the projection layer
         for param in self.clip.parameters():
             param.requires_grad = False
-        # Trainable projection: CLIP hidden → LLM embedding space
-        clip_hidden_size: int = self.clip.config.vision_config.hidden_size  # 1024
-        self.projection = nn.Sequential(
-            nn.Linear(clip_hidden_size, projection_dim),
-            nn.GELU(),
-            nn.Linear(projection_dim, projection_dim),
-        )
         self.to(self.device)
-    def encode_image(self, image: Image.Image) -> torch.Tensor:
-        """Encode a PIL image into a projected embedding tensor."""
-        inputs = self.processor(images=image, return_tensors="pt")
-        inputs = {k: v.to(self.device) for k, v in inputs.items()}
         with torch.no_grad():
-            vision_outputs = self.clip.vision_model(**inputs)
-            # Use [CLS] token embedding
-            cls_embedding = vision_outputs.last_hidden_state[:, 0, :]
-        # Project into LLM embedding space (this part IS trainable)
-        projected = self.projection(cls_embedding)
         return projected
-    def encode_screenshot(self, screenshot_path: Path) -> torch.Tensor:
-        """Load a screenshot from disk and encode it."""
-        if not screenshot_path.exists():
-            raise FileNotFoundError(f"Screenshot not found: {screenshot_path}")
-        image = Image.open(screenshot_path).convert("RGB")
         return self.encode_image(image)
     def save_projection(self, save_dir: Optional[Path] = None) -> Path:
-        """Save only the trainable projection weights."""
-        save_path = save_dir or self.cache_dir / "projection"
         save_path.mkdir(parents=True, exist_ok=True)
         torch.save(self.projection.state_dict(), save_path / "projection.pt")
         return save_path
     def load_projection(self, load_dir: Path) -> None:
-        """Load projection weights from disk."""
-        weights_path = load_dir / "projection.pt"
         if not weights_path.exists():
             raise FileNotFoundError(f"Projection weights not found: {weights_path}")
         state_dict = torch.load(weights_path, map_location=self.device, weights_only=True)
         self.projection.load_state_dict(state_dict)

 """
 MINDI 1.5 Vision-Coder — Vision Encoder
+Uses CLIP ViT-L/14 (frozen) to encode UI screenshots into 256 visual
+tokens projected from 1024 → 4096 to match the Qwen hidden dimension.
+Output shape: (batch, 256, 4096).
 """
 from __future__ import annotations
 import torch
 import torch.nn as nn
 from PIL import Image
+from transformers import CLIPImageProcessor, CLIPVisionModel
 class VisionEncoder(nn.Module):
+    """
+    CLIP ViT-L/14 vision encoder for MINDI 1.5.
+    Extracts ALL 256 patch tokens (excludes CLS) from CLIP and
+    projects them from 1024 → 4096 to match Qwen2.5 hidden_size.
+    The CLIP backbone is frozen; only the projection layer trains.
+    The shapes quoted in this class's docstrings are for the default
+    model_name and llm_hidden_size.
+    """
+    NUM_PATCHES: int = 256  # ViT-L/14: 16×16 patches from 224×224
     def __init__(
         self,
         model_name: str = "openai/clip-vit-large-patch14",
+        llm_hidden_size: int = 4096,
         device: Optional[str] = None,
         cache_dir: Optional[Path] = None,
+        torch_dtype: torch.dtype = torch.float32,
     ) -> None:
+        """
+        Initialize the vision encoder.
+        Args:
+            model_name: HuggingFace CLIP vision model identifier.
+            llm_hidden_size: Target projection dimension (must match LLM hidden_size).
+            device: Target device ('cuda', 'cpu', or None for auto).
+            cache_dir: Local directory for model weight cache.
+            torch_dtype: Data type for CLIP weights (projection always float32).
+        """
         super().__init__()
+        self.model_name = model_name
+        self.llm_hidden_size = llm_hidden_size
         self.device = device or ("cuda" if torch.cuda.is_available() else "cpu")
+        self.cache_dir = Path(cache_dir) if cache_dir else Path("./checkpoints/vision")
         self.cache_dir.mkdir(parents=True, exist_ok=True)
+        # Load CLIP vision model (no text tower) and image processor
+        print(f"[VisionEncoder] Loading {model_name} ...")
+        self.clip = CLIPVisionModel.from_pretrained(
+            model_name,
+            cache_dir=str(self.cache_dir),
+            torch_dtype=torch_dtype,
         )
+        self.image_processor = CLIPImageProcessor.from_pretrained(
+            model_name,
+            cache_dir=str(self.cache_dir),
         )
+        # Freeze entire CLIP backbone
         for param in self.clip.parameters():
             param.requires_grad = False
+        # NOTE(review): eval mode is set once here; a later .train() call on this
+        # module recurses into self.clip and flips it back — confirm whether that matters
+        self.clip.eval()
+        # Trainable projection: CLIP hidden (1024) → LLM hidden (4096)
+        clip_hidden_size: int = self.clip.config.hidden_size  # 1024
+        self.projection = nn.Linear(clip_hidden_size, self.llm_hidden_size)
         self.to(self.device)
+        trainable = sum(p.numel() for p in self.parameters() if p.requires_grad)
+        total = sum(p.numel() for p in self.parameters())
+        print(f"[VisionEncoder] Loaded — {clip_hidden_size} → {self.llm_hidden_size}")
+        print(f"  Trainable: {trainable:,}  |  Total: {total:,}")
+    def encode_image(self, image: Optional[Image.Image]) -> Optional[torch.Tensor]:
+        """
+        Encode a single PIL image into projected patch token embeddings.
+        Args:
+            image: A PIL Image (RGB), or None.
+        Returns:
+            Tensor of shape (1, 256, 4096) or None if input is None.
+        """
+        if image is None:
+            return None
+        inputs = self.image_processor(images=image, return_tensors="pt")
+        # Match CLIP's device and dtype (CLIP may have been loaded in a non-float32 torch_dtype)
+        pixel_values = inputs["pixel_values"].to(device=self.device, dtype=self.clip.dtype)
+        # CLIP is frozen, so its forward pass runs without building an autograd graph
         with torch.no_grad():
+            vision_outputs = self.clip(pixel_values=pixel_values)
+            # last_hidden_state: (batch, 257, 1024) — 1 CLS + 256 patches
+            patch_tokens = vision_outputs.last_hidden_state[:, 1:, :]  # (1, 256, 1024)
+        # Project into LLM embedding space (trainable)
+        # .float() upcasts the CLIP output before it reaches the projection layer
+        projected = self.projection(patch_tokens.float())  # (1, 256, 4096)
         return projected
+    def encode_batch(self, images: list[Optional[Image.Image]]) -> list[Optional[torch.Tensor]]:
+        """
+        Encode a batch of images. None entries pass through as None.
+        Args:
+            images: List of PIL Images or Nones.
+        Returns:
+            List of tensors (1, 256, 4096) or Nones matching input order.
+        """
+        results: list[Optional[torch.Tensor]] = [None] * len(images)
+        # Only non-None entries go through CLIP; their original positions are
+        # remembered so the outputs can be scattered back in input order
+        valid_indices = [i for i, img in enumerate(images) if img is not None]
+        if not valid_indices:
+            return results
+        valid_images = [images[i] for i in valid_indices]
+        inputs = self.image_processor(images=valid_images, return_tensors="pt")
+        pixel_values = inputs["pixel_values"].to(device=self.device, dtype=self.clip.dtype)
+        with torch.no_grad():
+            vision_outputs = self.clip(pixel_values=pixel_values)
+            patch_tokens = vision_outputs.last_hidden_state[:, 1:, :]  # (N, 256, 1024)
+        projected = self.projection(patch_tokens.float())  # (N, 256, 4096)
+        for batch_idx, orig_idx in enumerate(valid_indices):
+            # Re-add the batch dimension so each entry has the same shape as encode_image() output
+            results[orig_idx] = projected[batch_idx].unsqueeze(0)  # (1, 256, 4096)
+        return results
+    def encode_screenshot(self, screenshot_path: Path) -> Optional[torch.Tensor]:
+        """
+        Load a screenshot from disk and encode it.
+        Args:
+            screenshot_path: Path to image file.
+        Returns:
+            Tensor of shape (1, 256, 4096).
+        Raises:
+            FileNotFoundError: If screenshot_path does not exist.
+        """
+        path = Path(screenshot_path)
+        if not path.exists():
+            raise FileNotFoundError(f"Screenshot not found: {path}")
+        image = Image.open(path).convert("RGB")
         return self.encode_image(image)
     def save_projection(self, save_dir: Optional[Path] = None) -> Path:
+        """
+        Save only the trainable projection weights.
+        Args:
+            save_dir: Directory to save to. Defaults to cache_dir/projection.
+        Returns:
+            Path where weights were saved.
+        """
+        save_path = Path(save_dir) if save_dir else self.cache_dir / "projection"
         save_path.mkdir(parents=True, exist_ok=True)
         torch.save(self.projection.state_dict(), save_path / "projection.pt")
+        print(f"[VisionEncoder] Projection saved to {save_path}")
         return save_path
     def load_projection(self, load_dir: Path) -> None:
+        """
+        Load projection weights from disk.
+        Args:
+            load_dir: Directory containing projection.pt.
+        Raises:
+            FileNotFoundError: If projection.pt is missing from load_dir.
+        """
+        weights_path = Path(load_dir) / "projection.pt"
         if not weights_path.exists():
             raise FileNotFoundError(f"Projection weights not found: {weights_path}")
         state_dict = torch.load(weights_path, map_location=self.device, weights_only=True)
         self.projection.load_state_dict(state_dict)
+        print(f"[VisionEncoder] Projection loaded from {load_dir}")
+    def get_num_visual_tokens(self) -> int:
+        """Return the number of visual tokens produced per image (256)."""
+        # Returns the class constant; it is not derived from the loaded CLIP config
+        return self.NUM_PATCHES
+# ── Test block ────────────────────────────────────────────────────────
+if __name__ == "__main__":
+    # Self-test script: builds a real VisionEncoder (loads the CLIP weights),
+    # then checks output shapes, None handling, the trainable-parameter
+    # count and a save/load round-trip of the projection layer.
+    print("=" * 60)
+    print("  MINDI 1.5 — Vision Encoder Test")
+    print("=" * 60)
+    print()
+    # 1. Initialize encoder
+    encoder = VisionEncoder(
+        model_name="openai/clip-vit-large-patch14",
+        llm_hidden_size=4096,
+    )
+    # 2. Create a dummy image (224×224 RGB)
+    dummy_image = Image.new("RGB", (224, 224), color=(128, 128, 128))
+    # 3. Encode single image
+    print("\n  Encoding single image ...")
+    output = encoder.encode_image(dummy_image)
+    assert output is not None
+    print(f"  Output shape: {output.shape}")
+    assert output.shape == (1, 256, 4096), f"Expected (1, 256, 4096), got {output.shape}"
+    # 4. Encode None → should return None
+    none_output = encoder.encode_image(None)
+    assert none_output is None, "Expected None for None input"
+    print("  None input → None output ✓")
+    # 5. Encode batch (mixed with None)
+    print("\n  Encoding batch [image, None, image] ...")
+    batch_results = encoder.encode_batch([dummy_image, None, dummy_image])
+    assert batch_results[0] is not None and batch_results[0].shape == (1, 256, 4096)
+    assert batch_results[1] is None
+    assert batch_results[2] is not None and batch_results[2].shape == (1, 256, 4096)
+    print(f"  Batch results: [{batch_results[0].shape}, None, {batch_results[2].shape}]")
+    # 6. Check trainable params (only projection should train)
+    trainable = sum(p.numel() for p in encoder.parameters() if p.requires_grad)
+    frozen = sum(p.numel() for p in encoder.parameters() if not p.requires_grad)
+    print(f"\n  Trainable: {trainable:,}")
+    print(f"  Frozen:    {frozen:,}")
+    # Expected count = weight (1024 × 4096) + bias (4096) of the single Linear projection
+    assert trainable == 1024 * 4096 + 4096, f"Unexpected trainable count: {trainable}"
+    assert frozen > trainable, "CLIP backbone should be frozen"
+    # 7. Save and reload projection
+    print("\n  Testing save/load projection ...")
+    import tempfile
+    with tempfile.TemporaryDirectory() as tmp:
+        save_path = encoder.save_projection(Path(tmp))
+        old_weight = encoder.projection.weight.clone()
+        # Perturb weights
+        # Zeroing the live weights ensures the reload below visibly changes them back
+        encoder.projection.weight.data.fill_(0.0)
+        assert not torch.equal(encoder.projection.weight, old_weight)
+        # Reload
+        encoder.load_projection(Path(tmp))
+        assert torch.equal(encoder.projection.weight, old_weight), "Weights not restored!"
+    print("  Save/load round-trip ✓")
+    print("\n  ✓ All vision encoder tests passed!")
+    print("=" * 60)

src/training/mindi_trainer.py ADDED Viewed

	@@ -0,0 +1,745 @@

+"""
+MINDI 1.5 Vision-Coder — Trainer
+Production-ready 3-phase training loop optimized for AMD MI300X (192GB VRAM).
+Streams training data from disk (4.18GB train.jsonl) to avoid RAM exhaustion.
+Phases:
+    Phase 1 (steps 0–5000):     LoRA only,           LR 2e-4, batch 16
+    Phase 2 (steps 5000–7500):  Vision bridge only,   LR 1e-5, batch 8
+    Phase 3 (steps 7500–10000): All trainable,        LR 5e-5, batch 12
+MI300X specifics:
+    - ROCm presents as CUDA to PyTorch (torch.cuda.* works)
+    - bf16 (NOT fp16) for AMD stability
+    - torch.compile() optional (works on ROCm)
+    - Gradient checkpointing enabled
+    - DataLoader: num_workers=4, pin_memory=True, prefetch_factor=2
+"""
+from __future__ import annotations
+import json
+import math
+import time
+from dataclasses import dataclass, field
+from pathlib import Path
+from typing import Any, Iterator, Optional
+import torch
+import torch.nn as nn
+from torch.optim import AdamW
+from torch.optim.lr_scheduler import CosineAnnealingLR, LinearLR, SequentialLR
+from torch.utils.data import DataLoader, IterableDataset
+# ── Configuration ─────────────────────────────────────────────────────
+# Repository root: three directory levels above this file (src/training/mindi_trainer.py).
+PROJECT_ROOT = Path(__file__).resolve().parent.parent.parent
+@dataclass
+class PhaseConfig:
+    """Configuration for a single training phase."""
+    # Label used in console output, log records and checkpoint directory names.
+    name: str
+    # The trainer runs (end_step - start_step) optimizer steps for this phase.
+    start_step: int
+    end_step: int
+    # Initial AdamW learning rate; the cosine stage decays to 10% of this value.
+    learning_rate: float
+    # Batch size handed to the DataLoader (one micro-batch per forward pass).
+    batch_size: int
+    # Number of micro-batches whose gradients are accumulated per optimizer step.
+    gradient_accumulation_steps: int = 4
+    # Component toggles
+    # Forwarded as-is to model.set_trainable_components() at the start of the phase.
+    lora: bool = False
+    vision_projection: bool = False
+    fusion: bool = False
+@dataclass
+class TrainingConfig:
+    """Run-level settings: data and output paths, hardware, logging cadence and phases."""
+    # Data paths (factories defer the PROJECT_ROOT lookup to instantiation time)
+    train_file: Path = field(
+        default_factory=lambda: PROJECT_ROOT.joinpath("data", "processed", "train.jsonl")
+    )
+    val_file: Path = field(
+        default_factory=lambda: PROJECT_ROOT.joinpath("data", "processed", "val.jsonl")
+    )
+    # Output
+    output_dir: Path = field(
+        default_factory=lambda: PROJECT_ROOT.joinpath("checkpoints", "training")
+    )
+    log_dir: Path = field(
+        default_factory=lambda: PROJECT_ROOT.joinpath("logs", "training")
+    )
+    # Model
+    max_seq_length: int = 8192
+    use_compile: bool = False
+    gradient_checkpointing: bool = True
+    # Hardware (MI300X defaults)
+    dtype: str = "bf16"
+    num_workers: int = 4
+    pin_memory: bool = True
+    prefetch_factor: int = 2
+    # Training
+    weight_decay: float = 0.01
+    warmup_ratio: float = 0.03
+    max_grad_norm: float = 1.0
+    seed: int = 42
+    # Logging
+    log_every_n_steps: int = 10
+    eval_every_n_steps: int = 250
+    save_every_n_steps: int = 500
+    # Phases (contiguous step ranges, executed in list order)
+    phases: list[PhaseConfig] = field(default_factory=lambda: [
+        PhaseConfig(
+            name="phase1_lora",
+            start_step=0,
+            end_step=5000,
+            learning_rate=2e-4,
+            batch_size=16,
+            lora=True,
+            vision_projection=False,
+            fusion=False,
+        ),
+        PhaseConfig(
+            name="phase2_vision_bridge",
+            start_step=5000,
+            end_step=7500,
+            learning_rate=1e-5,
+            batch_size=8,
+            lora=False,
+            vision_projection=True,
+            fusion=True,
+        ),
+        PhaseConfig(
+            name="phase3_all",
+            start_step=7500,
+            end_step=10000,
+            learning_rate=5e-5,
+            batch_size=12,
+            lora=True,
+            vision_projection=True,
+            fusion=True,
+        ),
+    ])
+    @property
+    def total_steps(self) -> int:
+        """End step of the last phase, or 0 when no phases are configured."""
+        if not self.phases:
+            return 0
+        return self.phases[-1].end_step
+    @property
+    def torch_dtype(self) -> torch.dtype:
+        """Translate the ``dtype`` string into a torch dtype (KeyError if unknown)."""
+        dtype_by_name = {
+            "bf16": torch.bfloat16,
+            "fp16": torch.float16,
+            "fp32": torch.float32,
+        }
+        return dtype_by_name[self.dtype]
+# ── Streaming Dataset ─────────────────────────────────────────────────
+class StreamingJSONLDataset(IterableDataset):
+    """
+    Streams JSONL training data from disk line by line.
+    Tokenizes on-the-fly to avoid loading 4+ GB into RAM.
+    When iterated inside DataLoader worker processes, records are dealt out
+    round-robin so each record is produced by exactly one worker per pass
+    (without this every worker would replay the whole file).
+    Expected JSONL format:
+        {"id": "...", "type": "...", "source": "...",
+         "messages": [{"role": "system", "content": "..."},
+                      {"role": "user", "content": "..."},
+                      {"role": "assistant", "content": "..."}],
+         "metadata": {...}}
+    """
+    def __init__(
+        self,
+        file_path: Path,
+        tokenizer: Any,
+        max_length: int = 8192,
+        shuffle_buffer: int = 10000,
+        seed: int = 42,
+    ) -> None:
+        """
+        Args:
+            file_path: JSONL file to stream.
+            tokenizer: Callable tokenizer; its chat template is used when present.
+            max_length: Every example is truncated/padded to this many tokens.
+            shuffle_buffer: Number of records shuffled together as one block.
+            seed: Seed for the block shuffle (the order repeats on every pass).
+        Raises:
+            FileNotFoundError: If ``file_path`` does not exist.
+        """
+        super().__init__()
+        self.file_path = Path(file_path)
+        self.tokenizer = tokenizer
+        self.max_length = max_length
+        self.shuffle_buffer = shuffle_buffer
+        self.seed = seed
+        if not self.file_path.exists():
+            raise FileNotFoundError(f"Training data not found: {self.file_path}")
+    @staticmethod
+    def _worker_shard() -> tuple[int, int]:
+        """Return ``(worker_id, num_workers)``; ``(0, 1)`` outside DataLoader workers."""
+        info = torch.utils.data.get_worker_info()
+        if info is None:
+            return 0, 1
+        return info.id, info.num_workers
+    def _format_messages(self, messages: list[dict[str, str]]) -> str:
+        """Format chat messages into a single training string."""
+        # Use the tokenizer's chat template if available
+        if hasattr(self.tokenizer, "apply_chat_template"):
+            return self.tokenizer.apply_chat_template(
+                messages, tokenize=False, add_generation_prompt=False
+            )
+        # Fallback: simple "<|role|>\ncontent" concatenation
+        return "\n".join(
+            f"<|{msg.get('role', 'user')}|>\n{msg.get('content', '')}"
+            for msg in messages
+        )
+    def _tokenize(self, text: str) -> Optional[dict[str, torch.Tensor]]:
+        """Tokenize text to fixed length and build labels (padding masked with -100)."""
+        encoded = self.tokenizer(
+            text,
+            max_length=self.max_length,
+            truncation=True,
+            padding="max_length",
+            return_tensors="pt",
+        )
+        # return_tensors="pt" adds a leading batch axis of size 1; drop it
+        input_ids = encoded["input_ids"].squeeze(0)
+        attention_mask = encoded["attention_mask"].squeeze(0)
+        # Labels = input_ids, with padding positions excluded from the loss
+        labels = input_ids.clone()
+        labels[attention_mask == 0] = -100
+        return {
+            "input_ids": input_ids,
+            "attention_mask": attention_mask,
+            "labels": labels,
+        }
+    def _line_iterator(self) -> Iterator[dict]:
+        """Yield this worker's share of the non-empty JSONL records, in file order."""
+        worker_id, num_workers = self._worker_shard()
+        record_idx = 0
+        with open(self.file_path, "r", encoding="utf-8") as f:
+            for line in f:
+                line = line.strip()
+                if not line:
+                    continue
+                # Only parse the records that belong to this worker's shard
+                if record_idx % num_workers == worker_id:
+                    yield json.loads(line)
+                record_idx += 1
+    def _shuffled_iterator(self) -> Iterator[dict]:
+        """
+        Block shuffle for streaming data: fill a buffer of ``shuffle_buffer``
+        records, shuffle it, emit it, repeat. Records never move between blocks.
+        """
+        import random
+        rng = random.Random(self.seed)
+        buffer: list[dict] = []
+        for item in self._line_iterator():
+            buffer.append(item)
+            if len(buffer) >= self.shuffle_buffer:
+                rng.shuffle(buffer)
+                yield from buffer
+                buffer.clear()
+        # Flush the final, partially filled block
+        if buffer:
+            rng.shuffle(buffer)
+            yield from buffer
+    def __iter__(self) -> Iterator[dict[str, torch.Tensor]]:
+        """Yield tokenized examples; records without ``messages`` are skipped."""
+        for example in self._shuffled_iterator():
+            messages = example.get("messages", [])
+            if not messages:
+                continue
+            tokenized = self._tokenize(self._format_messages(messages))
+            if tokenized is not None:
+                yield tokenized
+    def count_lines(self) -> int:
+        """Count total lines, blank ones included (for progress estimation). Reads file once."""
+        with open(self.file_path, "r", encoding="utf-8") as f:
+            return sum(1 for _ in f)
+# ── Trainer ───────────────────────────────────────────────────────────
+class MINDITrainer:
+    """
+    3-phase trainer for MINDI 1.5 Vision-Coder.
+    Optimized for AMD MI300X 192GB:
+        - bf16 mixed precision
+        - Gradient checkpointing
+        - Streaming data from disk
+        - Optional torch.compile()
+        - Phase-based component freezing/unfreezing
+    Checkpoints hold what ``model.save`` writes plus step counters. Optimizer
+    state, scheduler state and the data-stream position are not saved, so a
+    resumed run rebuilds the optimizer and restarts the stream from the top.
+    """
+    def __init__(
+        self,
+        model: nn.Module,
+        config: TrainingConfig,
+    ) -> None:
+        """
+        Initialize the trainer.
+        Args:
+            model: MINDI15 model instance (already initialized).
+            config: Training configuration.
+        """
+        self.model = model
+        self.config = config
+        self.device = "cuda" if torch.cuda.is_available() else "cpu"
+        self.global_step = 0
+        self.best_val_loss = float("inf")
+        # Create output directories
+        self.config.output_dir.mkdir(parents=True, exist_ok=True)
+        self.config.log_dir.mkdir(parents=True, exist_ok=True)
+        # Gradient checkpointing
+        if config.gradient_checkpointing:
+            base_model = self.model.architecture.get_model()
+            if hasattr(base_model, "gradient_checkpointing_enable"):
+                base_model.gradient_checkpointing_enable()
+                print("[MINDITrainer] Gradient checkpointing enabled")
+        # Optional torch.compile (works on ROCm)
+        if config.use_compile:
+            print("[MINDITrainer] Compiling model with torch.compile() ...")
+            self.model.architecture.peft_model = torch.compile(
+                self.model.architecture.peft_model
+            )
+            print("[MINDITrainer] Compilation complete")
+        # Mixed precision via autocast only (no GradScaler is used)
+        self.use_amp = config.dtype in ("bf16", "fp16")
+        self.amp_dtype = config.torch_dtype
+        # Training log
+        self.log_file = config.log_dir / "training_log.jsonl"
+        self.metrics_history: list[dict] = []
+        print(f"[MINDITrainer] Device: {self.device}")
+        print(f"[MINDITrainer] Dtype: {config.dtype}")
+        print(f"[MINDITrainer] Total steps: {config.total_steps}")
+        print(f"[MINDITrainer] Phases: {len(config.phases)}")
+    def _build_optimizer(self, phase: PhaseConfig) -> AdamW:
+        """
+        Build optimizer for the current phase (only trainable params).
+        Raises:
+            RuntimeError: If no model parameter currently requires grad.
+        """
+        params = [p for p in self.model.parameters() if p.requires_grad]
+        if not params:
+            raise RuntimeError(f"No trainable parameters in phase '{phase.name}'")
+        return AdamW(
+            params,
+            lr=phase.learning_rate,
+            weight_decay=self.config.weight_decay,
+            betas=(0.9, 0.95),
+        )
+    def _build_scheduler(
+        self, optimizer: AdamW, phase: PhaseConfig
+    ) -> torch.optim.lr_scheduler.LRScheduler:
+        """Build LR scheduler: linear warmup from 1% of the phase LR, then cosine decay to 10%."""
+        phase_steps = phase.end_step - phase.start_step
+        warmup_steps = max(1, int(phase_steps * self.config.warmup_ratio))
+        decay_steps = max(1, phase_steps - warmup_steps)
+        warmup = LinearLR(
+            optimizer,
+            start_factor=0.01,
+            end_factor=1.0,
+            total_iters=warmup_steps,
+        )
+        cosine = CosineAnnealingLR(
+            optimizer,
+            T_max=decay_steps,
+            eta_min=phase.learning_rate * 0.1,
+        )
+        return SequentialLR(
+            optimizer,
+            schedulers=[warmup, cosine],
+            milestones=[warmup_steps],
+        )
+    def _build_dataloader(
+        self, file_path: Path, batch_size: int, shuffle_buffer: int = 10000
+    ) -> DataLoader:
+        """Build a streaming DataLoader (incomplete final batches are dropped)."""
+        dataset = StreamingJSONLDataset(
+            file_path=file_path,
+            tokenizer=self.model.tokenizer,
+            max_length=self.config.max_seq_length,
+            shuffle_buffer=shuffle_buffer,
+            seed=self.config.seed,
+        )
+        return DataLoader(
+            dataset,
+            batch_size=batch_size,
+            num_workers=self.config.num_workers,
+            pin_memory=self.config.pin_memory,
+            # prefetch_factor is only passed when worker processes are used
+            prefetch_factor=self.config.prefetch_factor if self.config.num_workers > 0 else None,
+            drop_last=True,
+        )
+    def _log_metrics(self, metrics: dict) -> None:
+        """Append metrics to log file and history."""
+        self.metrics_history.append(metrics)
+        with open(self.log_file, "a", encoding="utf-8") as f:
+            f.write(json.dumps(metrics) + "\n")
+    def _forward_loss(self, batch: dict[str, torch.Tensor]) -> Optional[torch.Tensor]:
+        """Move one batch to the training device and return the model's loss (may be None)."""
+        input_ids = batch["input_ids"].to(self.device)
+        attention_mask = batch["attention_mask"].to(self.device)
+        labels = batch["labels"].to(self.device)
+        # Autocast targets the device actually in use rather than a hard-coded
+        # "cuda", so CPU-only hosts get a matching autocast context.
+        with torch.autocast(device_type=self.device, dtype=self.amp_dtype, enabled=self.use_amp):
+            result = self.model(
+                input_ids=input_ids,
+                attention_mask=attention_mask,
+                labels=labels,
+            )
+        return result["loss"]
+    @torch.no_grad()
+    def evaluate(self, val_loader: DataLoader, max_batches: int = 50) -> float:
+        """
+        Run validation and return average loss.
+        Args:
+            val_loader: Validation DataLoader.
+            max_batches: Maximum batches to evaluate (for speed).
+        Returns:
+            Average validation loss, or ``inf`` when no batch produced a loss
+            (so an empty validation set can never register as a new best).
+        """
+        self.model.eval()
+        total_loss = 0.0
+        count = 0
+        try:
+            for batch_idx, batch in enumerate(val_loader):
+                if batch_idx >= max_batches:
+                    break
+                loss = self._forward_loss(batch)
+                if loss is not None:
+                    total_loss += loss.item()
+                    count += 1
+        finally:
+            # Always hand the model back in training mode, even on error
+            self.model.train()
+        if count == 0:
+            print("[MINDITrainer] WARNING: validation produced no batches; reporting inf")
+            return float("inf")
+        return total_loss / count
+    def _save_checkpoint(self, phase_name: str, step: int, val_loss: float) -> Path:
+        """Save model weights and trainer counters to ``<output_dir>/<phase>_step<step>``."""
+        ckpt_dir = self.config.output_dir / f"{phase_name}_step{step}"
+        ckpt_dir.mkdir(parents=True, exist_ok=True)
+        # Save model weights
+        self.model.save(ckpt_dir)
+        # Save trainer state
+        state = {
+            "global_step": self.global_step,
+            "phase": phase_name,
+            "step_in_phase": step,
+            "val_loss": val_loss,
+            "best_val_loss": self.best_val_loss,
+        }
+        torch.save(state, ckpt_dir / "trainer_state.pt")
+        print(f"[MINDITrainer] Checkpoint saved: {ckpt_dir}")
+        return ckpt_dir
+    def train_phase(self, phase: PhaseConfig, start_step_in_phase: int = 0) -> dict:
+        """
+        Execute a single training phase.
+        Args:
+            phase: Phase configuration.
+            start_step_in_phase: Optimizer steps of this phase that are already
+                done (used when resuming). Clamped to ``[0, phase length]``; the
+                LR schedule is advanced by the same amount before training.
+        Returns:
+            Dict with phase training metrics.
+        Raises:
+            RuntimeError: If the training stream yields no batches at all.
+        """
+        print()
+        print("=" * 60)
+        print(f"  Phase: {phase.name}")
+        print(f"  Steps: {phase.start_step} → {phase.end_step}")
+        print(f"  LR: {phase.learning_rate}  |  Batch: {phase.batch_size}")
+        print(f"  Components: LoRA={phase.lora}, Vision={phase.vision_projection}, "
+              f"Fusion={phase.fusion}")
+        print("=" * 60)
+        # Set trainable components
+        self.model.set_trainable_components(
+            lora=phase.lora,
+            vision_projection=phase.vision_projection,
+            fusion=phase.fusion,
+        )
+        phase_steps = phase.end_step - phase.start_step
+        resumed_from = min(max(start_step_in_phase, 0), phase_steps)
+        grad_accum = phase.gradient_accumulation_steps
+        # Build optimizer and scheduler for this phase
+        optimizer = self._build_optimizer(phase)
+        scheduler = self._build_scheduler(optimizer, phase)
+        if resumed_from > 0:
+            # Replay the schedule so the LR matches the resumed position.
+            # Stepping a scheduler before the optimizer triggers a UserWarning
+            # that is expected here, so it is silenced for this loop only.
+            import warnings
+            with warnings.catch_warnings():
+                warnings.simplefilter("ignore", UserWarning)
+                for _ in range(resumed_from):
+                    scheduler.step()
+            print(f"  [{phase.name}] Resuming at step {resumed_from}/{phase_steps}")
+        # Build data loaders
+        train_loader = self._build_dataloader(
+            self.config.train_file, phase.batch_size
+        )
+        val_loader = self._build_dataloader(
+            self.config.val_file, batch_size=max(phase.batch_size // 2, 1),
+            shuffle_buffer=1000,
+        )
+        self.model.train()
+        step_in_phase = resumed_from
+        accum_loss = 0.0
+        accum_count = 0
+        last_saved_step = -1  # avoids writing the same checkpoint dir twice
+        phase_start_time = time.time()
+        train_iter = iter(train_loader)
+        while step_in_phase < phase_steps:
+            # Get next batch (restart iterator if exhausted = new epoch)
+            try:
+                batch = next(train_iter)
+            except StopIteration:
+                train_iter = iter(train_loader)
+                try:
+                    batch = next(train_iter)
+                except StopIteration:
+                    # A fresh iterator that is already empty would loop forever
+                    raise RuntimeError(
+                        f"Training data yielded no batches: {self.config.train_file}"
+                    ) from None
+            # Forward pass with mixed precision
+            loss = self._forward_loss(batch)
+            if loss is None:
+                continue
+            # Scale loss for gradient accumulation
+            loss = loss / grad_accum
+            # Backward pass
+            loss.backward()
+            accum_loss += loss.item() * grad_accum
+            accum_count += 1
+            # Optimizer step (every gradient_accumulation_steps micro-batches)
+            if accum_count % grad_accum != 0:
+                continue
+            # Gradient clipping
+            torch.nn.utils.clip_grad_norm_(
+                [p for p in self.model.parameters() if p.requires_grad],
+                self.config.max_grad_norm,
+            )
+            optimizer.step()
+            scheduler.step()
+            optimizer.zero_grad()
+            step_in_phase += 1
+            self.global_step += 1
+            avg_loss = accum_loss / grad_accum
+            accum_loss = 0.0
+            # Logging
+            if step_in_phase % self.config.log_every_n_steps == 0:
+                elapsed = time.time() - phase_start_time
+                # Speed counts only steps executed in this call, not resumed ones
+                steps_done = step_in_phase - resumed_from
+                steps_per_sec = steps_done / elapsed if elapsed > 0 else 0.0
+                eta_sec = (phase_steps - step_in_phase) / steps_per_sec if steps_per_sec > 0 else 0.0
+                metrics = {
+                    "phase": phase.name,
+                    "global_step": self.global_step,
+                    "step_in_phase": step_in_phase,
+                    "loss": round(avg_loss, 4),
+                    "lr": optimizer.param_groups[0]["lr"],
+                    "steps_per_sec": round(steps_per_sec, 3),
+                    "eta_minutes": round(eta_sec / 60, 1),
+                    "elapsed_minutes": round(elapsed / 60, 1),
+                }
+                self._log_metrics(metrics)
+                print(f"  [{phase.name}] step {step_in_phase}/{phase_steps} | "
+                      f"loss={avg_loss:.4f} | "
+                      f"lr={optimizer.param_groups[0]['lr']:.2e} | "
+                      f"speed={steps_per_sec:.2f} steps/s | "
+                      f"ETA={eta_sec / 60:.1f}min")
+            # Evaluation
+            if step_in_phase % self.config.eval_every_n_steps == 0:
+                val_loss = self.evaluate(val_loader)
+                print(f"  [{phase.name}] EVAL step {step_in_phase} | val_loss={val_loss:.4f}")
+                self._log_metrics({
+                    "phase": phase.name,
+                    "global_step": self.global_step,
+                    "val_loss": round(val_loss, 4),
+                    "type": "eval",
+                })
+                # Save best model
+                if val_loss < self.best_val_loss:
+                    self.best_val_loss = val_loss
+                    self._save_checkpoint(phase.name, step_in_phase, val_loss)
+                    last_saved_step = step_in_phase
+                    print(f"  [{phase.name}] New best val_loss: {val_loss:.4f}")
+            # Periodic save (skipped when the best-model save just wrote this step)
+            if (step_in_phase % self.config.save_every_n_steps == 0
+                    and last_saved_step != step_in_phase):
+                self._save_checkpoint(phase.name, step_in_phase, self.best_val_loss)
+                last_saved_step = step_in_phase
+        # End-of-phase save (skipped when the final step is already on disk)
+        phase_elapsed = time.time() - phase_start_time
+        if last_saved_step != step_in_phase:
+            self._save_checkpoint(phase.name, step_in_phase, self.best_val_loss)
+        phase_summary = {
+            "phase": phase.name,
+            "total_steps": step_in_phase,
+            "elapsed_minutes": round(phase_elapsed / 60, 1),
+            "best_val_loss": round(self.best_val_loss, 4),
+            "type": "phase_complete",
+        }
+        self._log_metrics(phase_summary)
+        print(f"\n  [{phase.name}] Complete — {step_in_phase} steps in "
+              f"{phase_elapsed / 60:.1f} min")
+        return phase_summary
+    def train(self) -> dict:
+        """
+        Run the configured training phases sequentially.
+        Phases whose ``end_step`` is at or below ``global_step`` are skipped and
+        a partially completed phase continues from its offset, so calling this
+        after ``resume_from_checkpoint`` picks up where the checkpoint left off.
+        Returns:
+            Dict with complete training summary.
+        """
+        print()
+        print("=" * 60)
+        print("  MINDI 1.5 — Training Start")
+        print(f"  Total phases: {len(self.config.phases)}")
+        print(f"  Total steps:  {self.config.total_steps}")
+        print(f"  Device:       {self.device}")
+        print(f"  Dtype:        {self.config.dtype}")
+        print(f"  Output:       {self.config.output_dir}")
+        print("=" * 60)
+        torch.manual_seed(self.config.seed)
+        if torch.cuda.is_available():
+            torch.cuda.manual_seed_all(self.config.seed)
+        training_start = time.time()
+        phase_summaries = []
+        for phase in self.config.phases:
+            if self.global_step >= phase.end_step:
+                print(f"[MINDITrainer] Skipping completed phase '{phase.name}' "
+                      f"(global_step={self.global_step})")
+                continue
+            # Non-zero only for the phase a resumed run lands in
+            offset = max(self.global_step - phase.start_step, 0)
+            summary = self.train_phase(phase, start_step_in_phase=offset)
+            phase_summaries.append(summary)
+        total_elapsed = time.time() - training_start
+        # Final save
+        final_dir = self.config.output_dir / "final"
+        final_dir.mkdir(parents=True, exist_ok=True)
+        self.model.save(final_dir)
+        training_summary = {
+            "total_steps": self.global_step,
+            "total_minutes": round(total_elapsed / 60, 1),
+            "best_val_loss": round(self.best_val_loss, 4),
+            "phases": phase_summaries,
+            "type": "training_complete",
+        }
+        self._log_metrics(training_summary)
+        print()
+        print("=" * 60)
+        print("  MINDI 1.5 — Training Complete")
+        print(f"  Total steps:     {self.global_step}")
+        print(f"  Total time:      {total_elapsed / 60:.1f} minutes")
+        print(f"  Best val loss:   {self.best_val_loss:.4f}")
+        print(f"  Final saved to:  {final_dir}")
+        print("=" * 60)
+        return training_summary
+    def resume_from_checkpoint(self, checkpoint_dir: Path) -> None:
+        """
+        Restore model weights and step counters from a checkpoint.
+        Call this before ``train()``; training then continues from the
+        restored ``global_step``.
+        Args:
+            checkpoint_dir: Directory containing saved checkpoint.
+        Raises:
+            FileNotFoundError: If ``trainer_state.pt`` is missing from the directory.
+        """
+        checkpoint_dir = Path(checkpoint_dir)
+        state_file = checkpoint_dir / "trainer_state.pt"
+        if not state_file.exists():
+            raise FileNotFoundError(f"Trainer state not found: {state_file}")
+        # Load model weights
+        self.model.load(checkpoint_dir)
+        # Load trainer state (plain ints/floats/strings, so weights_only is sufficient)
+        state = torch.load(state_file, map_location=self.device, weights_only=True)
+        self.global_step = state["global_step"]
+        self.best_val_loss = state["best_val_loss"]
+        print(f"[MINDITrainer] Resumed from step {self.global_step} "
+              f"(val_loss={self.best_val_loss:.4f})")
+# ── Test block ────────────────────────────────────────────────────────
+if __name__ == "__main__":
+    # Lightweight self-test: config defaults, phase table, optional streaming
+    # check against the real train file, and phase/accumulation invariants.
+    print("=" * 60)
+    print("  MINDI 1.5 — Trainer Test")
+    print("=" * 60)
+    print()
+    # ── Test 1: Config defaults ──────────────────────────────────
+    print("  Test 1: TrainingConfig defaults")
+    config = TrainingConfig()
+    assert config.total_steps == 10000
+    assert config.dtype == "bf16"
+    assert config.torch_dtype == torch.bfloat16
+    assert len(config.phases) == 3
+    assert config.gradient_checkpointing is True
+    assert config.num_workers == 4
+    assert config.pin_memory is True
+    assert config.prefetch_factor == 2
+    print(f"    Total steps: {config.total_steps}")
+    print(f"    Dtype: {config.dtype}")
+    print(f"    Phases: {[p.name for p in config.phases]}")
+    print("    ✓ Config defaults correct")
+    # ── Test 2: Phase configs ────────────────────────────────────
+    print("\n  Test 2: Phase configurations")
+    p1, p2, p3 = config.phases
+    assert p1.name == "phase1_lora"
+    assert p1.batch_size == 16
+    assert p1.learning_rate == 2e-4
+    assert p1.lora is True and p1.vision_projection is False and p1.fusion is False
+    assert p2.name == "phase2_vision_bridge"
+    assert p2.batch_size == 8
+    assert p2.learning_rate == 1e-5
+    assert p2.lora is False and p2.vision_projection is True and p2.fusion is True
+    assert p3.name == "phase3_all"
+    assert p3.batch_size == 12
+    assert p3.learning_rate == 5e-5
+    assert p3.lora is True and p3.vision_projection is True and p3.fusion is True
+    print("    Phase 1: LoRA only, batch=16, lr=2e-4 ✓")
+    print("    Phase 2: Vision bridge, batch=8, lr=1e-5 ✓")
+    print("    Phase 3: All, batch=12, lr=5e-5 ✓")
+    # ── Test 3: Streaming dataset (if data exists) ───────────────
+    print("\n  Test 3: StreamingJSONLDataset")
+    train_path = config.train_file
+    if train_path.exists():
+        from transformers import AutoTokenizer
+        tok = AutoTokenizer.from_pretrained(
+            str(PROJECT_ROOT / "data" / "tokenizer" / "mindi_tokenizer"),
+            trust_remote_code=True,
+        )
+        dataset = StreamingJSONLDataset(
+            file_path=train_path,
+            tokenizer=tok,
+            max_length=512,  # small for test
+            shuffle_buffer=100,
+        )
+        count = 0
+        # Tracked explicitly so the summary line works even if nothing streams
+        last_shape = None
+        for item in dataset:
+            assert "input_ids" in item
+            assert "attention_mask" in item
+            assert "labels" in item
+            assert item["input_ids"].shape[0] == 512
+            last_shape = item["input_ids"].shape
+            count += 1
+            if count >= 5:
+                break
+        print(f"    Streamed {count} examples, shape={last_shape} ✓")
+    else:
+        print(f"    [SKIP] Train file not found: {train_path}")
+    # ── Test 4: PhaseConfig step ranges ──────────────────────────
+    print("\n  Test 4: Phase step continuity")
+    for prev, curr in zip(config.phases, config.phases[1:]):
+        assert prev.end_step == curr.start_step, \
+            f"Gap between {prev.name} and {curr.name}"
+    print("    All phases are contiguous ✓")
+    # ── Test 5: Gradient accumulation ────────────────────────────
+    print("\n  Test 5: Gradient accumulation steps")
+    for phase in config.phases:
+        assert phase.gradient_accumulation_steps == 4
+    print("    All phases: grad_accum=4 ✓")
+    print("\n  ✓ All trainer tests passed!")
+    print("=" * 60)