#!/usr/bin/env python3
"""
MINDI 1.5 Vision-Coder — Master Training Script

Usage:
    python scripts/train.py --phase 1        # Run phase 1 only
    python scripts/train.py --phase all      # Run all 3 phases
    python scripts/train.py --phase 2 --resume checkpoints/training/phase1_lora_step5000
    python scripts/train.py --dry_run        # Test 10 steps only
    python scripts/train.py --push_to_hub    # Upload after training

Handles Ctrl+C gracefully: saves checkpoint before exit.
"""

from __future__ import annotations

import argparse
import signal
import sys
import traceback
from pathlib import Path

# Resolve project root (scripts/ is one level deep)
PROJECT_ROOT = Path(__file__).resolve().parent.parent
sys.path.insert(0, str(PROJECT_ROOT))

import torch
import yaml


def parse_args() -> argparse.Namespace:
    parser = argparse.ArgumentParser(
        description="MINDI 1.5 Vision-Coder — Training",
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    parser.add_argument(
        "--phase",
        type=str,
        default="all",
        choices=["1", "2", "3", "all"],
        help="Which phase(s) to run: 1, 2, 3, or all (default: all)",
    )
    parser.add_argument(
        "--resume",
        type=str,
        default=None,
        help="Path to checkpoint directory to resume from",
    )
    parser.add_argument(
        "--config",
        type=str,
        default=str(PROJECT_ROOT / "configs" / "training_config.yaml"),
        help="Path to training config YAML",
    )
    parser.add_argument(
        "--dry_run",
        action="store_true",
        help="Test run: only 10 steps per phase",
    )
    parser.add_argument(
        "--push_to_hub",
        action="store_true",
        help="Push checkpoints to HuggingFace after each phase",
    )
    parser.add_argument(
        "--no_wandb",
        action="store_true",
        help="Disable WandB logging",
    )
    return parser.parse_args()


def load_config(config_path: str) -> dict:
    """Load and return the training config YAML."""
    path = Path(config_path)
    if not path.exists():
        raise FileNotFoundError(f"Config not found: {path}")
    with open(path, "r", encoding="utf-8") as f:
        return yaml.safe_load(f)


def build_training_config(raw: dict, dry_run: bool = False):
    """Build TrainingConfig from parsed YAML."""
    from src.training.mindi_trainer import PhaseConfig, TrainingConfig

    training = raw.get("training", {})
    data = raw.get("data", {})
    output = raw.get("output", {})
    logging_cfg = raw.get("logging", {})
    model_cfg = raw.get("model", {})

    # Build phase configs from YAML
    phases = []
    phase_defs = [
        ("phase1", "phase1_lora", True, False, False, "text"),
        ("phase2", "phase2_vision_bridge", False, True, True, "vision"),
        ("phase3", "phase3_all", True, True, True, "mixed"),
    ]
    cumulative_step = 0
    for key, name, lora, vision, fusion, data_type in phase_defs:
        pcfg = training.get(key, {})
        steps = pcfg.get("steps", 2500)
        if dry_run:
            steps = 10
        start = cumulative_step
        end = cumulative_step + steps
        phases.append(PhaseConfig(
            name=name,
            start_step=start,
            end_step=end,
            learning_rate=float(pcfg.get("lr", 2e-4)),
            batch_size=pcfg.get("batch_size", 8),
            gradient_accumulation_steps=training.get("grad_accumulation", 4),
            lora=lora,
            vision_projection=vision,
            fusion=fusion,
            data_type=data_type,
        ))
        cumulative_step = end

    config = TrainingConfig(
        train_file=PROJECT_ROOT / data.get("train_file", "data/processed/train.jsonl"),
        val_file=PROJECT_ROOT / data.get("val_file", "data/processed/val.jsonl"),
        vision_train_file=PROJECT_ROOT / data.get("vision_train_file", "data/websight/train.jsonl"),
        vision_val_file=PROJECT_ROOT / data.get("vision_val_file", "data/websight/val.jsonl"),
        output_dir=PROJECT_ROOT / output.get("checkpoint_dir", "checkpoints/training"),
        log_dir=PROJECT_ROOT / logging_cfg.get("log_dir", "logs/training"),
        max_seq_length=data.get("max_length", 4096),
        use_compile=model_cfg.get("use_compile", False),
        gradient_checkpointing=model_cfg.get("gradient_checkpointing", True),
        dtype=model_cfg.get("dtype", "bf16"),
        num_workers=data.get("num_workers", 4),
        pin_memory=True,
        prefetch_factor=2,
        weight_decay=0.01,
        warmup_ratio=0.03,
        max_grad_norm=float(training.get("max_grad_norm", 1.0)),
        seed=42,
        log_every_n_steps=logging_cfg.get("log_every", 10),
        eval_every_n_steps=training.get("eval_every", 250),
        save_every_n_steps=training.get("save_every", 500),
        phases=phases,
    )

    if dry_run:
        config.eval_every_n_steps = 5
        config.save_every_n_steps = 10
        config.log_every_n_steps = 1

    return config


def init_wandb(raw_config: dict, phase: str, disabled: bool = False):
    """Initialize WandB logging."""
    if disabled:
        return None
    try:
        import wandb

        logging_cfg = raw_config.get("logging", {})
        run = wandb.init(
            project=logging_cfg.get("wandb_project", "mindi-1.5-vision-coder"),
            entity=logging_cfg.get("wandb_entity", "mindigenous"),
            name=f"mindi15-{phase}",
            config=raw_config,
            tags=["mindi-1.5", "training", f"phase-{phase}"],
            reinit=True,
        )
        print(f"[train.py] WandB initialized: {run.url}")
        return run
    except ImportError:
        print("[train.py] WandB not installed — logging disabled")
        return None
    except Exception as e:
        print(f"[train.py] WandB init failed: {e} — continuing without logging")
        return None


def push_checkpoint_to_hub(checkpoint_dir: Path, raw_config: dict) -> None:
    """Push a checkpoint to HuggingFace Hub."""
    output = raw_config.get("output", {})
    repo_id = output.get("hf_repo", "Mindigenous/MINDI-1.5-Vision-Coder")
    try:
        from huggingface_hub import HfApi
        import os

        api = HfApi(token=os.environ.get("HF_TOKEN"))
        print(f"[train.py] Pushing checkpoint to {repo_id} ...")
        api.upload_folder(
            folder_path=str(checkpoint_dir),
            repo_id=repo_id,
            path_in_repo=f"checkpoints/{checkpoint_dir.name}",
            repo_type="model",
        )
        print(f"[train.py] Pushed to https://huggingface.co/{repo_id}")
    except ImportError:
        print("[train.py] huggingface_hub not installed — skipping push")
    except Exception as e:
        print(f"[train.py] Push to hub failed: {e}")


def log_wandb_phase_complete(wandb_run, summary: dict) -> None:
    """Log phase completion to WandB."""
    if wandb_run is None:
        return
    try:
        import wandb

        wandb.log({
            "phase_complete": True,
            "phase": summary.get("phase", "unknown"),
            "total_steps": summary.get("total_steps", 0),
            "best_val_loss": summary.get("best_val_loss", 0),
            "elapsed_minutes": summary.get("elapsed_minutes", 0),
        })
    except Exception:
        pass


def main() -> None:
    args = parse_args()

    print()
    print("=" * 60)
    print(" MINDI 1.5 Vision-Coder — Training Launch")
    print(" MINDIGENOUS.AI")
    print("=" * 60)
    print()
    print(f" Phase: {args.phase}")
    print(f" Config: {args.config}")
    print(f" Resume: {args.resume or 'None'}")
    print(f" Dry run: {args.dry_run}")
    print(f" Push to hub: {args.push_to_hub}")
    print(f" Device: {'cuda' if torch.cuda.is_available() else 'cpu'}")
    if torch.cuda.is_available():
        print(f" GPU: {torch.cuda.get_device_name(0)}")
        vram_gb = torch.cuda.get_device_properties(0).total_memory / (1024 ** 3)
        print(f" VRAM: {vram_gb:.1f} GB")
    print()

    # Load config
    raw_config = load_config(args.config)
    config = build_training_config(raw_config, dry_run=args.dry_run)

    # Filter phases based on --phase arg
    if args.phase != "all":
        phase_idx = int(args.phase) - 1
        if phase_idx < 0 or phase_idx >= len(config.phases):
            print(f"ERROR: Invalid phase {args.phase}. Available: 1-{len(config.phases)}")
            sys.exit(1)
        selected_phase = config.phases[phase_idx]
        # Adjust to start from 0 for single-phase run
        step_count = selected_phase.end_step - selected_phase.start_step
        selected_phase.start_step = 0
        selected_phase.end_step = step_count
        config.phases = [selected_phase]

    # Initialize model
    print("[train.py] Initializing MINDI 1.5 model ...")
    from src.model.mindi_model import MINDI15

    model_cfg = raw_config.get("model", {})
    vision_cfg = raw_config.get("vision", {})
    model = MINDI15(
        model_name=model_cfg.get("name", "Qwen/Qwen2.5-Coder-7B-Instruct"),
        clip_model=vision_cfg.get("clip_model", "openai/clip-vit-large-patch14"),
        hidden_size=model_cfg.get("hidden_size", 3584),
        num_visual_tokens=vision_cfg.get("visual_tokens", 256),
        torch_dtype=config.torch_dtype,
    )

    # Initialize trainer
    from src.training.mindi_trainer import MINDITrainer

    trainer = MINDITrainer(model=model, config=config)

    # Resume from checkpoint
    if args.resume:
        resume_path = Path(args.resume)
        if not resume_path.is_absolute():
            resume_path = PROJECT_ROOT / resume_path
        trainer.resume_from_checkpoint(resume_path)

    # Initialize WandB
    wandb_run = init_wandb(raw_config, args.phase, disabled=args.no_wandb)

    # Graceful Ctrl+C handler
    interrupted = False

    def signal_handler(sig, frame):
        nonlocal interrupted
        if interrupted:
            print("\n[train.py] Forced exit!")
            sys.exit(1)
        interrupted = True
        print("\n[train.py] Ctrl+C received — saving checkpoint before exit ...")
        try:
            emergency_dir = config.output_dir / "emergency_checkpoint"
            emergency_dir.mkdir(parents=True, exist_ok=True)
            model.save(emergency_dir)
            print(f"[train.py] Emergency checkpoint saved: {emergency_dir}")
        except Exception as e:
            print(f"[train.py] Emergency save failed: {e}")
        sys.exit(0)

    signal.signal(signal.SIGINT, signal_handler)

    # Run training
    try:
        if args.phase == "all":
            summary = trainer.train()
            final_dir = config.output_dir / "final"
            if args.push_to_hub:
                push_checkpoint_to_hub(final_dir, raw_config)
            log_wandb_phase_complete(wandb_run, summary)
        else:
            phase = config.phases[0]
            summary = trainer.train_phase(phase)
            ckpt_dir = config.output_dir / f"{phase.name}_step{phase.end_step}"
            if args.push_to_hub:
                push_checkpoint_to_hub(ckpt_dir, raw_config)
            log_wandb_phase_complete(wandb_run, summary)
    except KeyboardInterrupt:
        signal_handler(None, None)
    except Exception as e:
        print(f"\n[train.py] ERROR: {e}")
        traceback.print_exc()
        try:
            crash_dir = config.output_dir / "crash_checkpoint"
            crash_dir.mkdir(parents=True, exist_ok=True)
            model.save(crash_dir)
            print(f"[train.py] Crash checkpoint saved: {crash_dir}")
        except Exception:
            pass
        sys.exit(1)
    finally:
        if wandb_run is not None:
            try:
                import wandb

                wandb.finish()
            except Exception:
                pass

    # Final summary
    hf_repo = raw_config.get("output", {}).get("hf_repo", "Mindigenous/MINDI-1.5-Vision-Coder")
    print()
    print("=" * 60)
    print(" Training complete!")
    print(f" Best val loss: {trainer.best_val_loss:.4f}")
    print(f" Checkpoint at: {config.output_dir}")
    if args.push_to_hub:
        print(f" HuggingFace: https://huggingface.co/{hf_repo}")
    print("=" * 60)
    print()


if __name__ == "__main__":
    main()
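# ---------------------------------------------------------------------------
# Illustrative sketch of the YAML schema this script reads via
# build_training_config(). This is NOT the shipped configs/training_config.yaml;
# key names mirror the .get() calls above, and the values shown are the
# in-code fallbacks, so treat the whole layout as an assumption.
#
#   model:
#     name: Qwen/Qwen2.5-Coder-7B-Instruct
#     hidden_size: 3584
#     dtype: bf16
#     gradient_checkpointing: true
#     use_compile: false
#   vision:
#     clip_model: openai/clip-vit-large-patch14
#     visual_tokens: 256
#   training:
#     grad_accumulation: 4
#     max_grad_norm: 1.0
#     eval_every: 250
#     save_every: 500
#     phase1: {steps: 2500, lr: 2.0e-4, batch_size: 8}
#     phase2: {steps: 2500, lr: 2.0e-4, batch_size: 8}
#     phase3: {steps: 2500, lr: 2.0e-4, batch_size: 8}
#   data:
#     train_file: data/processed/train.jsonl
#     val_file: data/processed/val.jsonl
#     vision_train_file: data/websight/train.jsonl
#     vision_val_file: data/websight/val.jsonl
#     max_length: 4096
#     num_workers: 4
#   output:
#     checkpoint_dir: checkpoints/training
#     hf_repo: Mindigenous/MINDI-1.5-Vision-Coder
#   logging:
#     log_dir: logs/training
#     log_every: 10
#     wandb_project: mindi-1.5-vision-coder
#     wandb_entity: mindigenous
# ---------------------------------------------------------------------------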