#!/usr/bin/env python3 """ Humigence CLI - Main entry point for all Humigence commands """ import typer from typing import Optional, Dict, Any from rich.console import Console from rich.panel import Panel from rich.table import Table from pathlib import Path import sys import os from datetime import datetime # Add the current directory to the path for imports sys.path.insert(0, str(Path(__file__).parent)) from training.train_wikitext import run_training, run_training_from_config from training.autodetect import detect_family, suggested_lora_targets from validation.matrix import ( get_gpu_info, precision_supported, estimate_model_params, estimate_memory_bytes, tokenizer_ok, PRECISIONS, ) from validation.dryrun import dry_run from validation.fallback import FallbackSimulator, ConfigCandidate from config.schema import ValidationConfig, TrainingConfig, ConfigMetadata, save_config, validation_to_training_config app = typer.Typer( name="humigence", help="Your AI. Your pipeline. Zero code.", add_completion=False, rich_markup_mode="rich" ) console = Console() @app.command() def train_wikitext( model: str = typer.Option( "", "--model", "-m", help="Path or Hugging Face model name (e.g., 'gpt2' or 'microsoft/DialoGPT-small')" ), output_dir: str = typer.Option( ..., "--output-dir", "-o", help="Directory where checkpoints will be saved" ), epochs: int = typer.Option( 1, "--epochs", "-e", help="Number of training epochs" ), batch_size: int = typer.Option( 2, "--batch-size", "-b", help="Per-device batch size" ), learning_rate: float = typer.Option( 5e-5, "--learning-rate", "-lr", help="Learning rate for training" ), dataset: str = typer.Option( "wikitext", "--dataset", help="Dataset name (default: wikitext)" ), dataset_config: str = typer.Option( "wikitext-2-raw-v1", "--dataset-config", help="Dataset configuration (default: wikitext-2-raw-v1)" ), max_steps: Optional[int] = typer.Option( None, "--max-steps", help="Maximum training steps (overrides epochs if set)" ), block_size: int = typer.Option( 1024, "--block-size", help="Maximum sequence length" ), grad_accum: int = typer.Option( 4, "--grad-accum", help="Gradient accumulation steps" ), warmup_steps: int = typer.Option( 100, "--warmup-steps", help="Number of warmup steps" ), logging_steps: int = typer.Option( 10, "--logging-steps", help="Logging frequency in steps" ), save_steps: int = typer.Option( 200, "--save-steps", help="Model saving frequency in steps" ), eval_steps: int = typer.Option( 200, "--eval-steps", help="Evaluation frequency in steps" ), lora_r: int = typer.Option( 8, "--lora-r", help="LoRA rank" ), lora_alpha: int = typer.Option( 32, "--lora-alpha", help="LoRA alpha parameter" ), lora_dropout: float = typer.Option( 0.05, "--lora-dropout", help="LoRA dropout rate" ), config: Optional[str] = typer.Option( None, "--config", help="Load configuration from YAML file" ), ): """ Train a model on Wikitext dataset using LoRA fine-tuning. This command fine-tunes a language model on the Wikitext dataset using LoRA (Low-Rank Adaptation) for efficient parameter updates. The training runs on a single GPU by default. Examples: # Basic training with GPT-2 humigence train-wikitext --model gpt2 --output-dir ./out # Training with custom parameters humigence train-wikitext --model microsoft/DialoGPT-small --output-dir ./out --epochs 2 --batch-size 4 --learning-rate 1e-4 # Training with specific steps instead of epochs humigence train-wikitext --model gpt2 --output-dir ./out --max-steps 1000 --batch-size 2 # Training with config file humigence train-wikitext --config ./myconfig.yaml --output-dir ./out """ # Validate that either model or config is provided if not config and not model: console.print("[bold red]❌ Error: Either --model or --config must be provided[/bold red]") raise typer.Exit(1) # Load config from file if provided if config: try: from config.schema import load_config, validation_to_training_config # Try to load as TrainingConfig first, then ValidationConfig try: loaded_config, metadata = load_config(config, TrainingConfig) except Exception: # If it fails, try loading as ValidationConfig and convert validation_config, metadata = load_config(config, ValidationConfig) loaded_config = validation_to_training_config(validation_config, output_dir) # Override with CLI arguments (CLI takes precedence) config_dict = loaded_config.dict() # Update with CLI values (only if they're not default values) if model != "": # If model was provided via CLI config_dict["model"] = model if output_dir != "": # If output_dir was provided via CLI config_dict["output_dir"] = output_dir if epochs != 1: config_dict["epochs"] = epochs if batch_size != 2: config_dict["batch_size"] = batch_size if learning_rate != 5e-5: config_dict["learning_rate"] = learning_rate if dataset != "wikitext": config_dict["dataset"] = dataset if dataset_config != "wikitext-2-raw-v1": config_dict["dataset_config"] = dataset_config if max_steps is not None: config_dict["max_steps"] = max_steps if block_size != 1024: config_dict["block_size"] = block_size if grad_accum != 4: config_dict["grad_accum"] = grad_accum if warmup_steps != 100: config_dict["warmup_steps"] = warmup_steps if logging_steps != 10: config_dict["logging_steps"] = logging_steps if save_steps != 200: config_dict["save_steps"] = save_steps if eval_steps != 200: config_dict["eval_steps"] = eval_steps if lora_r != 8: config_dict["lora_r"] = lora_r if lora_alpha != 32: config_dict["lora_alpha"] = lora_alpha if lora_dropout != 0.05: config_dict["lora_dropout"] = lora_dropout # Create new config with merged values final_config = TrainingConfig(**config_dict) # Extract values for display and function call model = final_config.model output_dir = final_config.output_dir dataset = final_config.dataset dataset_config = final_config.dataset_config epochs = final_config.epochs batch_size = final_config.batch_size learning_rate = final_config.learning_rate max_steps = final_config.max_steps block_size = final_config.block_size grad_accum = final_config.grad_accum warmup_steps = final_config.warmup_steps logging_steps = final_config.logging_steps save_steps = final_config.save_steps eval_steps = final_config.eval_steps lora_r = final_config.lora_r lora_alpha = final_config.lora_alpha lora_dropout = final_config.lora_dropout console.print(f"[bold blue]📁 Loaded configuration from {config}[/bold blue]") # Display provenance information if metadata is available if metadata: provenance_info = f"Created: {metadata.created}" if metadata.gpu: provenance_info += f" | GPU: {metadata.gpu}" if metadata.auto_heal and metadata.fallback_chain: provenance_info += f" | Auto-healed: {' → '.join(metadata.fallback_chain)}" elif metadata.auto_heal: provenance_info += " | Auto-healed: (no fallbacks needed)" else: provenance_info += " | Direct validation (no auto-healing)" console.print(f"[dim]📋 {provenance_info}[/dim]") except Exception as e: console.print(f"[bold red]❌ Failed to load config from {config}: {e}[/bold red]") raise typer.Exit(1) # Display training configuration config_panel = Panel( f"""[bold blue]Training Configuration[/bold blue] [cyan]Model:[/cyan] {model} [cyan]Output Directory:[/cyan] {output_dir} [cyan]Epochs:[/cyan] {epochs} [cyan]Batch Size:[/cyan] {batch_size} [cyan]Learning Rate:[/cyan] {learning_rate} [cyan]Dataset:[/cyan] {dataset}/{dataset_config} [cyan]Max Steps:[/cyan] {max_steps if max_steps else 'Auto-calculated'} [cyan]Block Size:[/cyan] {block_size} [cyan]Gradient Accumulation:[/cyan] {grad_accum} [cyan]LoRA Rank:[/cyan] {lora_r} [cyan]LoRA Alpha:[/cyan] {lora_alpha} [cyan]LoRA Dropout:[/cyan] {lora_dropout}""", title="🚀 Starting Wikitext Training", border_style="green" ) console.print(config_panel) # Create output directory if it doesn't exist Path(output_dir).mkdir(parents=True, exist_ok=True) # Run training try: if config: # Use config-based training with launcher from training.launcher import launch_training result = launch_training(final_config) else: # Use individual parameters - convert to TrainingConfig and use launcher from config.schema import TrainingConfig from training.launcher import launch_training training_config = TrainingConfig( model=model, output_dir=output_dir, dataset=dataset, dataset_config=dataset_config, precision="fp16", seq_len=block_size, batch_size=batch_size, epochs=epochs, learning_rate=learning_rate, max_steps=max_steps, block_size=block_size, grad_accum=grad_accum, warmup_steps=warmup_steps, logging_steps=logging_steps, save_steps=save_steps, eval_steps=eval_steps, lora=True, lora_r=lora_r, lora_alpha=lora_alpha, lora_dropout=lora_dropout, gradient_checkpointing=True, text_field="text", schema="plain", gpu_mode="single", gpu_ids=[0] ) result = launch_training(training_config) if result["status"] == "success": console.print(Panel( f"""[bold green]✅ Training Completed Successfully![/bold green] [cyan]Output Directory:[/cyan] {result['output_dir']} [cyan]Model Path:[/cyan] {result['model_path']} [bold blue]Final Metrics:[/bold blue] [cyan]Train Loss:[/cyan] {result['metrics'].get('train_loss', 'N/A')} [cyan]Eval Loss:[/cyan] {result['metrics'].get('eval_loss', 'N/A')} [cyan]Total Steps:[/cyan] {result['metrics'].get('total_steps', 'N/A')} [cyan]Epochs:[/cyan] {result['metrics'].get('epochs', 'N/A')} [cyan]Train Runtime:[/cyan] {result['metrics'].get('train_runtime', 'N/A')}s [cyan]Samples/Second:[/cyan] {result['metrics'].get('train_samples_per_second', 'N/A')}""", title="🎉 Training Results", border_style="green" )) return else: console.print(Panel( f"""[bold red]❌ Training Failed[/bold red] [red]Error:[/red] {result.get('error', 'Unknown error')} [cyan]Output Directory:[/cyan] {result.get('output_dir', 'N/A')}""", title="💥 Training Error", border_style="red" )) raise typer.Exit(1) except Exception as e: console.print(Panel( f"""[bold red]❌ Unexpected Error[/bold red] [red]Error:[/red] {str(e)}""", title="💥 Unexpected Error", border_style="red" )) raise typer.Exit(1) @app.command() def train( config: str = typer.Option(..., "--config", "-c", help="Path to YAML configuration file"), output_dir: Optional[str] = typer.Option(None, "--output-dir", "-o", help="Override output directory"), epochs: Optional[int] = typer.Option(None, "--epochs", "-e", help="Override number of epochs"), batch_size: Optional[int] = typer.Option(None, "--batch-size", "-b", help="Override batch size"), learning_rate: Optional[float] = typer.Option(None, "--learning-rate", "-lr", help="Override learning rate"), max_steps: Optional[int] = typer.Option(None, "--max-steps", help="Override maximum training steps"), dataset: Optional[str] = typer.Option(None, "--dataset", help="Override dataset specification"), text_field: Optional[str] = typer.Option(None, "--text-field", help="Override text field for HF datasets"), schema: Optional[str] = typer.Option(None, "--schema", help="Override schema for JSONL datasets"), gradient_checkpointing: Optional[bool] = typer.Option(None, "--gradient-checkpointing/--no-gradient-checkpointing", help="Override gradient checkpointing"), flash_attn: Optional[bool] = typer.Option(None, "--flash-attn/--no-flash-attn", help="Override flash attention"), dtype: Optional[str] = typer.Option(None, "--dtype", help="Override data type: fp32|fp16|bf16"), gpu_mode: Optional[str] = typer.Option(None, "--gpu-mode", help="Override GPU mode: single|multi"), gpu_ids: Optional[str] = typer.Option(None, "--gpu-ids", help="Override GPU IDs (comma-separated, e.g., '0,1,2')"), ): """ Train a model using a configuration file with dataset-agnostic support. This command supports training on: - Wikitext datasets (wikitext) - JSONL SFT datasets (jsonl:path/to/file.jsonl) - Hugging Face datasets (hf:dataset_name or dataset_name) Examples: # Train with Wikitext humigence train --config gpt2_wikitext.yaml # Train with JSONL SFT dataset humigence train --config my_sft_config.yaml # Train with Hugging Face dataset humigence train --config imdb_config.yaml # Override specific parameters humigence train --config my_config.yaml --epochs 3 --batch-size 4 """ # Load configuration try: from config.schema import load_config, validation_to_training_config # Try to load as TrainingConfig first, then ValidationConfig try: loaded_config, metadata = load_config(config, TrainingConfig) except Exception: # If it fails, try loading as ValidationConfig and convert validation_config, metadata = load_config(config, ValidationConfig) if not output_dir: console.print("[bold red]❌ Error: --output-dir is required when using ValidationConfig[/bold red]") raise typer.Exit(1) loaded_config = validation_to_training_config(validation_config, output_dir) # Override with CLI arguments (CLI takes precedence) config_dict = loaded_config.dict() if output_dir: config_dict["output_dir"] = output_dir if epochs is not None: config_dict["epochs"] = epochs if batch_size is not None: config_dict["batch_size"] = batch_size if learning_rate is not None: config_dict["learning_rate"] = learning_rate if max_steps is not None: config_dict["max_steps"] = max_steps if dataset: config_dict["dataset"] = dataset if text_field: config_dict["text_field"] = text_field if schema: config_dict["schema"] = schema if gradient_checkpointing is not None: config_dict["gradient_checkpointing"] = gradient_checkpointing if flash_attn is not None: config_dict["flash_attn"] = flash_attn if dtype: config_dict["dtype"] = dtype if gpu_mode: config_dict["gpu_mode"] = gpu_mode if gpu_ids: # Parse comma-separated GPU IDs try: gpu_ids_list = [int(x.strip()) for x in gpu_ids.split(",")] config_dict["gpu_ids"] = gpu_ids_list except ValueError: console.print(f"[red]❌ Invalid GPU IDs format: {gpu_ids}. Use comma-separated integers (e.g., '0,1,2')[/red]") raise typer.Exit(1) # Create final config final_config = TrainingConfig(**config_dict) console.print(f"[bold blue]📁 Loaded configuration from {config}[/bold blue]") # Display provenance information if metadata is available if metadata: provenance_info = f"Created: {metadata.created}" if metadata.gpu: provenance_info += f" | GPU: {metadata.gpu}" if metadata.auto_heal and metadata.fallback_chain: provenance_info += f" | Auto-healed: {' → '.join(metadata.fallback_chain)}" elif metadata.auto_heal: provenance_info += " | Auto-healed: (no fallbacks needed)" else: provenance_info += " | Direct validation (no auto-healing)" console.print(f"[dim]📋 {provenance_info}[/dim]") # Display dataset provenance if available if metadata.dataset: dataset_info = f"📁 Dataset: {metadata.dataset.get('file_path', metadata.dataset.get('dataset_name', 'N/A'))}" if metadata.dataset.get('schema'): dataset_info += f" ({metadata.dataset['schema']})" console.print(f"[dim]{dataset_info}[/dim]") if 'train_size' in metadata.dataset and 'eval_size' in metadata.dataset: size_info = f"🔢 Train size: {metadata.dataset['train_size']} | Eval size: {metadata.dataset['eval_size']}" console.print(f"[dim]{size_info}[/dim]") if 'sha256' in metadata.dataset: sha256 = metadata.dataset['sha256'] if len(sha256) > 12: sha256 = sha256[:12] + "..." console.print(f"[dim]🔑 SHA256: {sha256}[/dim]") else: console.print("[yellow]⚠️ Config missing dataset metadata. Consider re-running validate to persist provenance.[/yellow]") except Exception as e: console.print(f"[bold red]❌ Failed to load config from {config}: {e}[/bold red]") raise typer.Exit(1) # Display training configuration dataset_info = f"{final_config.dataset.type}" if final_config.dataset.path: dataset_info += f" ({final_config.dataset.path})" elif final_config.dataset.name: dataset_info += f" ({final_config.dataset.name})" config_panel = Panel( f"""[bold blue]Training Configuration[/bold blue] [cyan]Model:[/cyan] {final_config.model} [cyan]Output Directory:[/cyan] {final_config.output_dir} [cyan]Dataset:[/cyan] {dataset_info} [cyan]Schema:[/cyan] {final_config.dataset.schema_type or 'auto'} [cyan]Text Field:[/cyan] {final_config.dataset.text_field or 'auto'} [cyan]Epochs:[/cyan] {final_config.epochs} [cyan]Batch Size:[/cyan] {final_config.batch_size} [cyan]Learning Rate:[/cyan] {final_config.learning_rate} [cyan]Max Steps:[/cyan] {final_config.max_steps if final_config.max_steps else 'Auto-calculated'} [cyan]Block Size:[/cyan] {final_config.block_size} [cyan]Gradient Accumulation:[/cyan] {final_config.grad_accum} [cyan]LoRA Rank:[/cyan] {final_config.lora_r} [cyan]LoRA Alpha:[/cyan] {final_config.lora_alpha} [cyan]LoRA Dropout:[/cyan] {final_config.lora_dropout} [cyan]Gradient Checkpointing:[/cyan] {final_config.gradient_checkpointing} [cyan]Flash Attention:[/cyan] {final_config.flash_attn} [cyan]Data Type:[/cyan] {final_config.dtype}""", title="🚀 Starting Dataset-Agnostic Training", border_style="green" ) console.print(config_panel) # Create output directory if it doesn't exist Path(final_config.output_dir).mkdir(parents=True, exist_ok=True) # Run training try: from training.launcher import launch_training result = launch_training(final_config) if result["status"] == "success": console.print(Panel( f"""[bold green]✅ Training Completed Successfully![/bold green] [cyan]Output Directory:[/cyan] {result['output_dir']} [cyan]Model Path:[/cyan] {result['model_path']} [bold blue]Final Metrics:[/bold blue] [cyan]Train Loss:[/cyan] {result['metrics'].get('train_loss', 'N/A')} [cyan]Eval Loss:[/cyan] {result['metrics'].get('eval_loss', 'N/A')} [cyan]Total Steps:[/cyan] {result['metrics'].get('total_steps', 'N/A')} [cyan]Epochs:[/cyan] {result['metrics'].get('epochs', 'N/A')} [cyan]Train Runtime:[/cyan] {result['metrics'].get('train_runtime', 'N/A')}s [cyan]Samples/Second:[/cyan] {result['metrics'].get('train_samples_per_second', 'N/A')}""", title="🎉 Training Results", border_style="green" )) return else: console.print(Panel( f"""[bold red]❌ Training Failed[/bold red] [red]Error:[/red] {result.get('error', 'Unknown error')} [cyan]Output Directory:[/cyan] {result.get('output_dir', 'N/A')}""", title="💥 Training Error", border_style="red" )) raise typer.Exit(1) except Exception as e: console.print(Panel( f"""[bold red]❌ Unexpected Error[/bold red] [red]Error:[/red] {str(e)}""", title="💥 Unexpected Error", border_style="red" )) raise typer.Exit(1) @app.command() def validate( model: str = typer.Option(..., help="HF model id or local path"), dataset: str = typer.Option("wikitext", help="Dataset specification: wikitext | jsonl: | hf:"), precision: str = typer.Option("fp16", help="fp32|fp16|bf16|qlora4bit"), seq_len: int = typer.Option(1024, help="Sequence length"), batch_size: int = typer.Option(2, help="Batch size"), lora: bool = typer.Option(True, help="Enable LoRA"), max_samples: int = typer.Option(128, help="Max samples for schema sniff"), text_field: Optional[str] = typer.Option(None, help="Text field for generic HF datasets"), schema: Optional[str] = typer.Option(None, help="Schema for JSONL datasets: sft | dialogue | plain | auto"), role_markers: bool = typer.Option(True, "--role-markers/--no-role-markers", help="Use role markers for dialogue datasets"), user_marker: str = typer.Option("", help="User role marker"), assistant_marker: str = typer.Option("", help="Assistant role marker"), eval_split: Optional[float] = typer.Option(None, help="Fraction of data to use for evaluation (0.0-1.0)"), eval_file: Optional[str] = typer.Option(None, help="Path to separate evaluation file (for JSONL)"), gradient_checkpointing: bool = typer.Option(False, "--gradient-checkpointing/--no-gradient-checkpointing", help="Enable gradient checkpointing"), flash_attn: bool = typer.Option(False, "--flash-attn/--no-flash-attn", help="Enable flash attention"), dtype: str = typer.Option("fp16", help="Data type: fp32|fp16|bf16"), dry_run_flag: bool = typer.Option(True, "--dry-run/--no-dry-run", help="Do the 1-batch fwd+bwd"), auto_heal: bool = typer.Option(True, "--auto-heal/--no-auto-heal", help="Enable auto-healing fallback simulation"), max_attempts: int = typer.Option(10, help="Maximum fallback attempts for auto-healing"), save_config_path: Optional[str] = typer.Option(None, "--save-config", help="Save auto-healed config to YAML file"), overwrite: bool = typer.Option(False, "--overwrite", help="Overwrite existing config file instead of versioning"), ): """ Validate model, dataset, and training configuration before training. This command performs comprehensive validation including: - Model family detection and LoRA target module validation - GPU capability and precision support checks - Memory estimation and OOM prevention - Tokenizer validation - Optional 1-batch dry-run to test actual training setup Examples: # Basic validation with GPT-2 humigence validate --model gpt2 --dataset wikitext --precision fp16 # Validate with BF16 (will fail on non-BF16 GPUs) humigence validate --model gpt2 --precision bf16 # Validate with 4-bit quantization humigence validate --model gpt2 --precision qlora4bit # Validate without dry-run humigence validate --model gpt2 --no-dry-run """ if precision not in PRECISIONS: typer.secho(f"Unsupported precision: {precision}", fg=typer.colors.RED, err=True) raise typer.Exit(1) # Detect model family and get config family, cfg = detect_family(model) gpu = get_gpu_info() tok_ok, tok_msg = tokenizer_ok(model) prec_ok, prec_msg = precision_supported(precision, gpu) # Detect dataset type and validate dataset_type = _detect_dataset_type(dataset) dataset_ok, dataset_msg = _validate_dataset(dataset, dataset_type, text_field, schema) # Create dataset configuration with eval split support dataset_config = _create_dataset_config(dataset, text_field, schema, role_markers, user_marker, assistant_marker, eval_split, eval_file) # GPU-aware defaults and warnings _apply_gpu_aware_defaults(gpu, precision, batch_size, seq_len, gradient_checkpointing, flash_attn, dtype) # Load dataset to capture metadata dataset_metadata = None if dataset_ok: try: from training.data_loader import create_dataset_loader loader = create_dataset_loader( dataset, text_field=text_field, schema=schema or "auto", role_markers=role_markers, user_marker=user_marker, assistant_marker=assistant_marker, eval_split=eval_split, eval_file=eval_file ) # Load dataset to get metadata train_dataset, eval_dataset = loader.load() dataset_metadata = loader.get_metadata() except Exception as e: console.print(f"[yellow]⚠️ Could not load dataset metadata: {e}[/yellow]") dataset_metadata = None # Estimate parameters and memory params = estimate_model_params(cfg) mem_est = estimate_memory_bytes(params, precision, adam=True, lora=lora) mem_info = f"est ~{mem_est/1e9:.2f} GB" if mem_est else "n/a" # Collect warnings warns = [] if not tok_ok: warns.append(f"Tokenizer: {tok_msg}") if not prec_ok: warns.append(f"Precision: {prec_msg}") if not dataset_ok: warns.append(f"Dataset: {dataset_msg}") # Check sequence length against model limits max_pos = getattr(cfg, "max_position_embeddings", None) if max_pos and seq_len > max_pos: warns.append(f"seq_len {seq_len} > model limit {max_pos}. Suggest <= {max_pos}.") # Create summary table tbl = Table(title="Humigence Validation Summary") tbl.add_column("Item", style="cyan") tbl.add_column("Value", style="white") tbl.add_row("Model", model) tbl.add_row("Family", family) tbl.add_row("Dataset Type", dataset_config.type) tbl.add_row("Dataset Path/Name", dataset_config.path or dataset_config.name or "N/A") tbl.add_row("Schema", dataset_config.schema_type or "auto") tbl.add_row("Text Field", dataset_config.text_field or "auto") if dataset_config.type == "jsonl" and dataset_config.schema_type == "dialogue": tbl.add_row("Role Markers", f"{dataset_config.user_marker} / {dataset_config.assistant_marker}") # Add dataset metadata if available if dataset_metadata: tbl.add_row("Train Size", str(dataset_metadata.get("train_size", "N/A"))) tbl.add_row("Eval Size", str(dataset_metadata.get("eval_size", "N/A"))) if "sha256" in dataset_metadata: sha256 = dataset_metadata["sha256"] if len(sha256) > 12: sha256 = sha256[:12] + "..." tbl.add_row("SHA256", sha256) tbl.add_row("Precision", precision) tbl.add_row("GPU", f"{gpu.name} (bf16={gpu.bf16_supported}, cc={gpu.cc_major}.{gpu.cc_minor})" if gpu.available else "CPU") tbl.add_row("Params (est.)", f"{params:,}" if params else "unknown") tbl.add_row("Memory (est.)", mem_info) tbl.add_row("Seq Len", str(seq_len)) tbl.add_row("Batch Size", str(batch_size)) tbl.add_row("LoRA", str(lora)) tbl.add_row("Tokenizer", "OK" if tok_ok else f"ISSUE: {tok_msg}") tbl.add_row("Precision Support", "OK" if prec_ok else f"ISSUE: {prec_msg}") tbl.add_row("Dataset", "OK" if dataset_ok else f"ISSUE: {dataset_msg}") console.print(tbl) # Display warnings if warns: console.print("\n[yellow]Warnings:[/yellow]") for w in warns: console.print(f" - {w}") # Check precision support if not prec_ok: console.print("\n[bold red]FAIL[/bold red]: Precision not supported.") _print_fallback(precision, gpu, lora, seq_len, batch_size) raise typer.Exit(2) # Perform dry run if requested if dry_run_flag: console.print("\n[bold]Running 1-batch dry-run...[/bold]") lora_targets = suggested_lora_targets(family) if lora else None res = dry_run( model_id_or_path=model, precision=precision, seq_len=seq_len, batch_size=batch_size, lora=lora, lora_targets=lora_targets, ) if res.ok: console.print(f"[green]PASS[/green]: dry-run completed. loss={res.details.get('loss'):.4f}") # Save config if requested (even without auto-healing) if save_config_path: validation_config = ValidationConfig( model=model, dataset=dataset_config, precision=precision, seq_len=seq_len, batch_size=batch_size, lora=lora, lora_targets=lora_targets, gradient_checkpointing=gradient_checkpointing, flash_attn=flash_attn, dtype=dtype, max_samples=max_samples ) # Create runtime metadata runtime_metadata = _create_runtime_metadata(gpu) # Create metadata metadata = ConfigMetadata( created=datetime.now().isoformat(), gpu=f"{gpu.name} (bf16={gpu.bf16_supported}, cc={gpu.cc_major}.{gpu.cc_minor})" if gpu.available else "CPU", precision_supported=[p for p in ["fp32", "fp16", "bf16", "qlora4bit"] if precision_supported(p, gpu)[0]], validator_version="0.3", auto_heal=False, fallback_chain=[], original_config={ "model": model, "precision": precision, "seq_len": seq_len, "batch_size": batch_size, "lora": lora, "gradient_checkpointing": gradient_checkpointing, "flash_attn": flash_attn, "dtype": dtype }, dataset=dataset_metadata, runtime=runtime_metadata ) saved_path = save_config(validation_config, save_config_path, metadata, overwrite) console.print(f"\n[bold green]✅ Config saved to {saved_path}[/bold green]") raise typer.Exit(0) else: console.print(f"[red]FAIL[/red]: dry-run error: {res.error}") # Auto-healing fallback simulation if auto_heal: console.print(f"[yellow]Auto-healing enabled. Attempting fallback simulation...[/yellow]") # Create initial config candidate initial_config = ConfigCandidate( model=model, precision=precision, seq_len=seq_len, batch_size=batch_size, lora=lora, lora_targets=lora_targets, gradient_checkpointing=False, dataset=dataset, text_field=text_field ) # Run fallback simulation simulator = FallbackSimulator() success, final_config = simulator.simulate_fallbacks(initial_config, max_attempts) if success: console.print(f"\n[bold green]🎉 AUTO-HEALING SUCCESSFUL![/bold green]") console.print(f"[dim]Found working configuration after {len(simulator.attempts)} attempts[/dim]") # Generate and display YAML config yaml_config = simulator.generate_yaml_config(final_config) console.print(f"\n[bold blue]AUTO-HEALED CONFIG PATCH[/bold blue]") console.print(f"[dim]```yaml[/dim]") console.print(yaml_config) console.print(f"[dim]```[/dim]") # Save config if requested if save_config_path: # Create ValidationConfig from final_config validation_config = ValidationConfig( model=final_config.model, dataset=final_config.dataset, precision=final_config.precision, seq_len=final_config.seq_len, batch_size=final_config.batch_size, lora=final_config.lora, lora_targets=final_config.lora_targets, gradient_checkpointing=final_config.gradient_checkpointing, text_field=final_config.text_field, schema=getattr(final_config, 'schema', schema), max_samples=max_samples ) # Create fallback chain from simulator attempts fallback_chain = [] for attempt in simulator.attempts[1:]: # Skip initial attempt if attempt.notes: fallback_chain.append(attempt.notes) else: # Generate fallback description from config changes prev_config = simulator.attempts[attempt.attempt_num - 2].config curr_config = attempt.config changes = [] if prev_config.precision != curr_config.precision: changes.append(f"precision {prev_config.precision} → {curr_config.precision}") if prev_config.seq_len != curr_config.seq_len: changes.append(f"seq_len {prev_config.seq_len} → {curr_config.seq_len}") if prev_config.batch_size != curr_config.batch_size: changes.append(f"batch_size {prev_config.batch_size} → {curr_config.batch_size}") if prev_config.gradient_checkpointing != curr_config.gradient_checkpointing: changes.append(f"gradient_checkpointing {prev_config.gradient_checkpointing} → {curr_config.gradient_checkpointing}") if changes: fallback_chain.append(", ".join(changes)) # Create metadata with fallback chain metadata = ConfigMetadata( created=datetime.now().isoformat(), gpu=f"{gpu.name} (bf16={gpu.bf16_supported}, cc={gpu.cc_major}.{gpu.cc_minor})" if gpu.available else "CPU", precision_supported=[p for p in ["fp32", "fp16", "bf16", "qlora4bit"] if precision_supported(p, gpu)[0]], validator_version="0.3", auto_heal=True, fallback_chain=fallback_chain, original_config={ "model": model, "precision": precision, "seq_len": seq_len, "batch_size": batch_size, "lora": lora }, dataset=dataset_metadata ) saved_path = save_config(validation_config, save_config_path, metadata, overwrite) console.print(f"\n[bold green]✅ Auto-healed config saved to {saved_path}[/bold green]") raise typer.Exit(0) else: console.print(f"\n[bold red]❌ AUTO-HEALING FAILED[/bold red]") console.print(f"[dim]Could not find working configuration after {max_attempts} attempts[/dim]") _print_fallback(precision, gpu, lora, seq_len, batch_size, res.oom) raise typer.Exit(3) else: # No auto-healing, just show fallback suggestions if res.oom: console.print("[yellow]Detected OOM. Proposing fallback...[/yellow]") _print_fallback(precision, gpu, lora, seq_len, batch_size, res.oom) raise typer.Exit(3) else: # No dry-run; rely on static checks if warns: console.print("[yellow]COMPLETE WITH WARNINGS[/yellow]") raise typer.Exit(0) console.print("[green]PASS[/green]") raise typer.Exit(0) def _detect_dataset_type(dataset_spec: str) -> str: """Detect dataset type from specification""" if dataset_spec == "wikitext": return "wikitext" elif dataset_spec.startswith("jsonl:"): return "jsonl" elif dataset_spec.startswith("hf:"): return "hf" else: # Assume it's a direct HF dataset name return "hf" def _create_dataset_config(dataset_spec: str, text_field: Optional[str], schema: Optional[str], role_markers: bool, user_marker: str, assistant_marker: str, eval_split: Optional[float] = None, eval_file: Optional[str] = None): """Create DatasetConfig from CLI parameters""" from config.schema import DatasetConfig dataset_type = _detect_dataset_type(dataset_spec) if dataset_type == "wikitext": return DatasetConfig(type="wikitext", name="wikitext") elif dataset_type == "jsonl": file_path = dataset_spec[6:] # Remove "jsonl:" prefix return DatasetConfig( type="jsonl", path=file_path, schema_type=schema or "auto", role_markers=role_markers, user_marker=user_marker, assistant_marker=assistant_marker, eval_split=eval_split, eval_file=eval_file ) elif dataset_type == "hf": dataset_name = dataset_spec[3:] if dataset_spec.startswith("hf:") else dataset_spec return DatasetConfig( type="hf", name=dataset_name, text_field=text_field or "text", eval_split=eval_split ) else: raise ValueError(f"Unknown dataset type: {dataset_type}") def _apply_gpu_aware_defaults(gpu, precision: str, batch_size: int, seq_len: int, gradient_checkpointing: bool, flash_attn: bool, dtype: str): """Apply GPU-aware defaults and warnings""" if not gpu.available: console.print("[yellow]⚠️ No GPU detected - using CPU mode[/yellow]") return # Get GPU memory info try: import torch if torch.cuda.is_available(): gpu_memory_gb = torch.cuda.get_device_properties(0).total_memory / (1024**3) console.print(f"[blue]🔧 GPU Memory: {gpu_memory_gb:.1f}GB[/blue]") # Warn about potential OOM issues if precision == "fp32" and gpu_memory_gb < 24: console.print(f"[yellow]⚠️ Detected {gpu_memory_gb:.1f}GB GPU — fp32 may OOM, recommend fp16 with batch_size<=4[/yellow]") elif precision == "bf16" and not gpu.bf16_supported: console.print(f"[yellow]⚠️ GPU doesn't support BF16, recommend fp16[/yellow]") elif batch_size > 4 and gpu_memory_gb < 16: console.print(f"[yellow]⚠️ Large batch size ({batch_size}) on {gpu_memory_gb:.1f}GB GPU may cause OOM[/yellow]") except Exception as e: console.print(f"[yellow]⚠️ Could not get GPU memory info: {e}[/yellow]") def _create_runtime_metadata(gpu) -> Dict[str, Any]: """Create runtime environment metadata""" runtime_metadata = {} try: import torch import platform # GPU info if gpu.available: runtime_metadata["gpu"] = gpu.name runtime_metadata["vram_gb"] = torch.cuda.get_device_properties(0).total_memory / (1024**3) runtime_metadata["cuda"] = torch.version.cuda else: runtime_metadata["gpu"] = "CPU" runtime_metadata["vram_gb"] = 0 runtime_metadata["cuda"] = None # PyTorch version runtime_metadata["torch"] = torch.__version__ # System info runtime_metadata["platform"] = platform.platform() runtime_metadata["python"] = platform.python_version() except Exception as e: console.print(f"[yellow]⚠️ Could not collect runtime metadata: {e}[/yellow]") runtime_metadata["error"] = str(e) return runtime_metadata def _validate_dataset(dataset_spec: str, dataset_type: str, text_field: Optional[str], schema: Optional[str]) -> tuple[bool, str]: """Validate dataset specification and accessibility""" try: if dataset_type == "wikitext": # Wikitext is always valid return True, "OK" elif dataset_type == "jsonl": file_path = dataset_spec[6:] # Remove "jsonl:" prefix if not os.path.exists(file_path): return False, f"File not found: {file_path}" # Try to read first line to validate JSON format try: with open(file_path, 'r', encoding='utf-8') as f: first_line = f.readline().strip() if first_line: import json json.loads(first_line) return True, "OK" except json.JSONDecodeError: return False, f"Invalid JSON format in {file_path}" except Exception as e: return False, f"Error reading {file_path}: {e}" elif dataset_type == "hf": dataset_name = dataset_spec[3:] if dataset_spec.startswith("hf:") else dataset_spec # Try to load dataset info (without actually downloading) try: from datasets import get_dataset_infos infos = get_dataset_infos(dataset_name) if not infos: return False, f"Dataset {dataset_name} not found" return True, "OK" except Exception as e: return False, f"Error accessing dataset {dataset_name}: {e}" else: return False, f"Unknown dataset type: {dataset_type}" except Exception as e: return False, f"Dataset validation error: {e}" def _print_fallback(precision: str, gpu, lora: bool, seq_len: int, batch_size: int, oom: bool = False): """Print fallback configuration recommendations""" console.print("\n[bold]RECOMMENDED CONFIG PATCH[/bold]") suggest = { "precision": precision, "seq_len": seq_len, "batch_size": batch_size, "lora": lora, "gradient_checkpointing": False, } # Precision fallback if precision == "bf16" and not gpu.bf16_supported: suggest["precision"] = "fp16" if precision == "qlora4bit" and not gpu.available: suggest["precision"] = "fp16" # OOM mitigations if oom: if batch_size > 1: suggest["batch_size"] = max(1, batch_size // 2) else: suggest["gradient_checkpointing"] = True if seq_len > 1024: suggest["seq_len"] = min(1024, seq_len // 2) if precision in ("bf16", "fp32"): suggest["precision"] = "fp16" for k, v in suggest.items(): console.print(f" - {k}: {v}") @app.command() def gpu_info(): """Show detailed GPU information and selection options.""" from validation.matrix import get_all_gpu_info multi_gpu_info = get_all_gpu_info() if not multi_gpu_info.gpus: console.print(Panel( "[bold red]❌ No GPUs detected[/bold red]\n" "[dim]Training will run on CPU[/dim]", title="GPU Information", border_style="red" )) return # Create GPU information table table = Table(title="Available GPUs") table.add_column("Index", style="cyan", width=6) table.add_column("Name", style="white", width=40) table.add_column("VRAM", style="green", width=12) table.add_column("Compute Capability", style="blue", width=15) table.add_column("BF16 Support", style="yellow", width=12) for gpu in multi_gpu_info.gpus: vram_gb = gpu.total_bytes / (1024**3) cc = f"{gpu.cc_major}.{gpu.cc_minor}" bf16_support = "✅ Yes" if gpu.bf16_supported else "❌ No" table.add_row( str(gpu.device_index), gpu.name, f"{vram_gb:.1f} GB", cc, bf16_support ) console.print(table) # Show selection examples console.print(Panel( f"""[bold blue]GPU Selection Examples[/bold blue] [cyan]Single GPU Training:[/cyan] humigence train --config my_config.yaml --gpu-mode single --gpu-ids 0 [cyan]Multi-GPU Training (all GPUs):[/cyan] humigence train --config my_config.yaml --gpu-mode multi --gpu-ids 0,1 [cyan]Multi-GPU Training (specific GPUs):[/cyan] humigence train --config my_config.yaml --gpu-mode multi --gpu-ids 1,2 [dim]Total VRAM: {multi_gpu_info.total_vram_gb:.1f} GB across {multi_gpu_info.count} GPUs[/dim]""", title="Usage Examples", border_style="green" )) @app.command() def version(): """Show version information.""" console.print("[bold blue]Humigence v1.0.0[/bold blue]") console.print("[dim]Your AI. Your pipeline. Zero code.[/dim]") @app.callback() def main( version: bool = typer.Option( False, "--version", "-v", help="Show version and exit" ) ): """ Humigence - Your AI. Your pipeline. Zero code. A complete MLOps suite built for makers, teams, and enterprises. """ if version: console.print("[bold blue]Humigence v1.0.0[/bold blue]") console.print("[dim]Your AI. Your pipeline. Zero code.[/dim]") raise typer.Exit(0) if __name__ == "__main__": app()