File size: 12,512 Bytes
7275aef c4b369c 7275aef c4b369c 7275aef c4b369c 7275aef c4b369c 7275aef c4b369c 7275aef |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 |
# cli/main.py
import sys
import time
from pathlib import Path
# Add the parent directory to the path so we can import from pipelines
sys.path.insert(0, str(Path(__file__).parent.parent))
# DO NOT import Unsloth here - delay until after wizard completion
UNSLOTH_AVAILABLE = None # Will be checked later
from cli.config_wizard import collect_training_config
from cli.atomic_eval import app as atomic_eval_app
from rich import print
from rich.console import Console
import inquirer
console = Console()
# Removed download functionality - system now only works with local datasets
def check_unsloth_availability():
    """Lazily probe for the ``unsloth`` package and memoize the outcome.

    The import is deliberately deferred until after the config wizard has
    completed.  The result is cached in the module-global
    ``UNSLOTH_AVAILABLE`` flag, so the import is attempted at most once.

    Returns:
        True if ``unsloth`` can be imported, False otherwise.
    """
    global UNSLOTH_AVAILABLE
    if UNSLOTH_AVAILABLE is not None:
        # Already probed - reuse the cached answer.
        return UNSLOTH_AVAILABLE
    try:
        import unsloth  # noqa: F401 - availability probe only
    except ImportError:
        UNSLOTH_AVAILABLE = False
    else:
        UNSLOTH_AVAILABLE = True
    return UNSLOTH_AVAILABLE
def detect_gpus():
    """Enumerate the CUDA devices visible to torch.

    Returns:
        ``(gpu_count, gpu_info)`` where ``gpu_info`` is a list of dicts with
        ``index``, ``name`` and a human-readable ``memory`` string.  Yields
        ``(0, [])`` when torch is not installed or no CUDA device exists.
    """
    try:
        import torch
    except ImportError:
        return 0, []
    if not torch.cuda.is_available():
        return 0, []
    gpus = [
        {
            "index": idx,
            "name": torch.cuda.get_device_name(idx),
            # total_memory is in bytes; report as GiB with one decimal.
            "memory": f"{torch.cuda.get_device_properties(idx).total_memory / 1024**3:.1f}GB",
        }
        for idx in range(torch.cuda.device_count())
    ]
    return len(gpus), gpus
def choose_training_mode(gpu_count, gpus):
    """Pick a training mode based on the detected GPUs.

    Args:
        gpu_count: Number of CUDA devices detected.
        gpus: GPU info dicts (``index``/``name``/``memory``) from
            ``detect_gpus()``.

    Returns:
        ``None`` when no GPU is usable or the user aborts a prompt,
        ``"single"`` for exactly one GPU, ``"multi"`` for all GPUs, or
        ``"single_<index>"`` when the user picks a specific device.
    """
    if gpu_count == 0:
        console.print("[yellow]β οΈ No GPUs detected - CPU training not supported[/yellow]")
        return None
    if gpu_count == 1:
        console.print(f"[blue]π§ Single GPU detected - using GPU 0: {gpus[0]['name']}[/blue]")
        return "single"

    # Multiple GPUs - show them and let the user choose a mode.
    console.print(f"[blue]π§ {gpu_count} GPUs detected - choose training mode[/blue]")
    from rich.table import Table
    gpu_table = Table(show_header=True, box=None)
    gpu_table.add_column("Index", style="cyan", width=6)
    gpu_table.add_column("Name", style="white", width=40)
    gpu_table.add_column("VRAM", style="green", width=10)
    for gpu in gpus:
        gpu_table.add_row(str(gpu['index']), gpu['name'], gpu['memory'])
    console.print(gpu_table)

    choices = [
        "Multi-GPU Training (all available GPUs)",
        "Single GPU Training (choose specific GPU)"
    ]
    questions = [
        inquirer.List('training_mode',
                      message="π§ Training Mode: (Use arrow keys)",
                      choices=choices,
                      default=choices[0])
    ]
    answers = inquirer.prompt(questions)
    # BUGFIX: inquirer.prompt returns None when the prompt is aborted
    # (Ctrl-C / ESC); the original crashed with TypeError here.
    if answers is None:
        return None
    if "Multi-GPU" in answers['training_mode']:
        return "multi"

    # Single GPU - let the user pick which device to use.
    gpu_choices = [f"GPU{gpu['index']}: {gpu['name']} ({gpu['memory']})" for gpu in gpus]
    questions = [
        inquirer.List('gpu_selection',
                      message="Choose GPU: (Use arrow keys)",
                      choices=gpu_choices,
                      default=gpu_choices[0])
    ]
    answers = inquirer.prompt(questions)
    if answers is None:  # aborted prompt - bail out gracefully
        return None
    selected_gpu = answers['gpu_selection']
    # The choice string starts with "GPU<index>:", so parse the index back out.
    gpu_index = int(selected_gpu.split("GPU")[1].split(":")[0])
    console.print(f"[blue]Selected GPU {gpu_index}: {gpus[gpu_index]['name']}[/blue]")
    return f"single_{gpu_index}"
def show_menu():
    """Render the top-level Humigence menu on the console."""
    console.rule("[bold cyan]Humigence β Your AI. Your pipeline. Zero code.")
    print("[dim]A complete MLOps suite built for makers, teams, and enterprises.[/dim]\n")
    print("Options:")
    menu_lines = (
        "[bold green]1.[/bold green] Supervised Fine-Tuning π",
        "[bold yellow]2.[/bold yellow] RAG Implementation (coming soon)",
        "[bold yellow]3.[/bold yellow] EnterpriseGPT (coming soon)",
        "[bold yellow]4.[/bold yellow] Batch Inference (coming soon)",
        "[bold yellow]5.[/bold yellow] Context Length (coming soon)",
        "[bold red]6.[/bold red] Exit\n",
    )
    for line in menu_lines:
        print(line)
def _resolve_model(config):
    """Map the configured base model to its Unsloth-optimized equivalent, if any."""
    model_mapping = {
        "Qwen/Qwen2.5-0.5B": "unsloth/Qwen2.5-0.5B-Instruct",
        "microsoft/Phi-2": "unsloth/Phi-2",
        "TinyLlama/TinyLlama-1.1B-Chat-v1.0": "unsloth/TinyLlama-1.1B-Chat-v1.0",
    }
    base_model = config.get("base_model", config.get("model_name", "Qwen/Qwen2.5-0.5B"))
    return model_mapping.get(base_model, base_model)


def _resolve_dataset(config):
    """Return ``(dataset_name, dataset_config)`` for the training script.

    A ``local:`` prefix marks a local JSONL file; anything else falls back
    to the wikitext demo dataset.
    """
    dataset_path = config["dataset_path"]
    if dataset_path.startswith("local:"):
        return "jsonl", dataset_path[len("local:"):]
    return "wikitext", "wikitext-2-raw-v1"


def _resolve_precision(config):
    """Translate the wizard's recipe label into a ``--precision`` flag value."""
    recipe = config.get("training_recipe", "QLoRA (4-bit NF4)")
    if "QLoRA" in recipe:
        return "qlora_4bit"
    if "BF16" in recipe:
        return "lora_bf16"
    return "lora_fp16"


def launch_training(config, training_mode, gpus):
    """Launch fine-tuning as a subprocess based on the selected mode.

    Args:
        config: Wizard-produced config dict (model, dataset, recipe keys).
        training_mode: ``"multi"``, ``"single"``, or ``"single_<gpu_index>"``.
        gpus: GPU info dicts from ``detect_gpus()``.

    Returns:
        True when the training subprocess exits successfully, else False.
        A failed multi-GPU launch falls back to single-GPU automatically.
    """
    import os
    import subprocess

    # Run everything relative to the humigence project root.
    humigence_dir = Path(__file__).parent.parent
    os.chdir(humigence_dir)

    model_name = _resolve_model(config)
    dataset_name, dataset_config = _resolve_dataset(config)
    precision = _resolve_precision(config)
    timestamp = time.strftime("%Y%m%d_%H%M%S")

    # Flags shared by both launch modes.
    common_args = [
        "--model", model_name,
        "--dataset", dataset_name,
        "--dataset_config", dataset_config,
        "--max_steps", "1000",
        "--learning_rate", "2e-4",
        "--block_size", "1024",
        "--lora_r", "16",
        "--lora_alpha", "32",
        "--lora_dropout", "0.0",
        "--precision", precision,
    ]

    if training_mode == "multi":
        output_dir = f"./runs/humigence/out_lora_dual_{timestamp}"
        console.print("[bold green]π Launching multi-GPU training with Unsloth...[/bold green]")
        cmd = [
            "torchrun",
            # BUGFIX: was hard-coded to 2 processes; "all available GPUs"
            # means one rank per detected device.
            f"--nproc_per_node={len(gpus)}",
            "training/unsloth/train_lora_dual.py",
            "--out_dir", output_dir,
            "--per_device_batch", "2",
            "--grad_accum", "4",
            *common_args,
        ]
        console.print(f"[dim]Command: {' '.join(cmd)}[/dim]")
        try:
            subprocess.run(cmd, check=True, cwd=humigence_dir)
        except subprocess.CalledProcessError as e:
            console.print(f"[bold red]β Multi-GPU training failed with return code: {e.returncode}[/bold red]")
            console.print("[yellow]π Falling back to single-GPU training...[/yellow]")
            training_mode = "single"  # fall through to the single-GPU path
        except FileNotFoundError:
            # torchrun missing from PATH - same graceful fallback instead of
            # crashing the menu loop.
            console.print("[bold red]β torchrun not found on PATH[/bold red]")
            console.print("[yellow]π Falling back to single-GPU training...[/yellow]")
            training_mode = "single"
        else:
            console.print("[bold green]β Multi-GPU training completed successfully![/bold green]")
            console.print(f"[blue]π Output saved to: {output_dir}[/blue]")
            return True

    if training_mode == "single" or training_mode.startswith("single_"):
        if training_mode.startswith("single_"):
            gpu_index = int(training_mode.split("_")[1])
            output_dir = f"./runs/humigence/out_lora_single_{timestamp}_gpu{gpu_index}"
        else:
            gpu_index = 0
            output_dir = f"./runs/humigence/out_lora_single_{timestamp}"
        console.print("[bold green]π Launching single-GPU training with Unsloth...[/bold green]")
        console.print(f"[blue]Using GPU {gpu_index}: {gpus[gpu_index]['name']}[/blue]")
        cmd = [
            "python3",
            "training/unsloth/train_lora_dual.py",
            "--out_dir", output_dir,
            "--per_device_batch", "4",  # larger batch: the GPU is not shared
            "--grad_accum", "2",  # keeps the effective batch size unchanged
            *common_args,
        ]
        console.print(f"[dim]Command: {' '.join(cmd)}[/dim]")
        # Pin the subprocess to the chosen device only.
        env = os.environ.copy()
        env["CUDA_VISIBLE_DEVICES"] = str(gpu_index)
        try:
            subprocess.run(cmd, check=True, cwd=humigence_dir, env=env)
        except subprocess.CalledProcessError as e:
            console.print(f"[bold red]β Single-GPU training failed with return code: {e.returncode}[/bold red]")
            return False
        except Exception as e:  # defensive: surface unexpected launch errors
            console.print(f"[bold red]β Single-GPU training failed: {e}[/bold red]")
            return False
        console.print("[bold green]β Single-GPU training completed successfully![/bold green]")
        console.print(f"[blue]π Output saved to: {output_dir}[/blue]")
        return True

    return False
def _run_finetuning_flow():
    """Drive the SFT path: wizard -> config load -> deps check -> GPUs -> training.

    Returns:
        True when control should return to the main menu loop,
        False when the user declines another session (caller exits).
    """
    console.print("[bold green]Starting Supervised Fine-Tuning...[/bold green]")
    # Step 1: Run the configuration wizard (no Unsloth import yet).
    config_path = collect_training_config()
    if config_path is None:
        console.print("[bold red]β Training cancelled. Returning to main menu.[/bold red]")
        time.sleep(2)
        return True
    # Step 2: Load the configuration produced by the wizard.
    import json
    with open(config_path, 'r') as f:
        config = json.load(f)
    # Step 3: NOW check Unsloth availability (after wizard completion).
    if not check_unsloth_availability():
        console.print("[bold red]β Missing required dependencies: No module named 'unsloth'[/bold red]")
        console.print("[yellow]β‘ To install, run:[/yellow]")
        console.print("[cyan]python3 training/unsloth/setup_humigence_unsloth.py[/cyan]")
        time.sleep(2)
        return True
    # Step 4: Detect GPUs BEFORE importing Unsloth (avoids interference).
    gpu_count, gpus = detect_gpus()
    training_mode = choose_training_mode(gpu_count, gpus)
    if training_mode is None:
        console.print("[bold red]β No suitable training mode available. Returning to main menu.[/bold red]")
        time.sleep(2)
        return True
    # Step 5: Launch training.
    if not launch_training(config, training_mode, gpus):
        console.print("[bold red]β Training failed. Check the logs above for details.[/bold red]")
    # Offer another round regardless of outcome (matches historical flow).
    console.print("\n[bold cyan]Training completed![/bold cyan]")
    reply = console.input("[bold blue]Start another training session? (y/N)[/bold blue]: ")
    return reply.lower() in ['y', 'yes']


def main():
    """Top-level menu loop for the Humigence CLI."""
    while True:
        show_menu()
        choice = console.input("[bold blue]Select an option[/bold blue]: ")
        if choice == "1":
            if not _run_finetuning_flow():
                break
        elif choice == "6":
            console.print("[bold red]Exiting Humigence CLI. Goodbye![/bold red]")
            time.sleep(1)
            sys.exit()
        else:
            console.print("[yellow]β οΈ Option not implemented yet. Try 1 or 6.[/yellow]\n")
            time.sleep(1)


if __name__ == "__main__":
    main()