"""Interactive Humigence CLI: menu-driven launcher for supervised fine-tuning runs."""

import sys
import time
from pathlib import Path

# Make the project root importable so the `cli` package resolves when this
# script is run directly rather than as an installed module.
sys.path.insert(0, str(Path(__file__).parent.parent))

# Lazily resolved in check_unsloth_availability(); None means "not checked yet".
UNSLOTH_AVAILABLE = None

from cli.config_wizard import collect_training_config
from cli.atomic_eval import app as atomic_eval_app
from rich import print
from rich.console import Console
import inquirer

console = Console()


def check_unsloth_availability():
    """Check if Unsloth is available (delayed import)."""
    global UNSLOTH_AVAILABLE
    if UNSLOTH_AVAILABLE is None:
        try:
            import unsloth  # probe only; the import itself is the availability check
            UNSLOTH_AVAILABLE = True
        except ImportError:
            UNSLOTH_AVAILABLE = False
    return UNSLOTH_AVAILABLE


def detect_gpus():
    """Detect available GPUs"""
    try:
        import torch
        if torch.cuda.is_available():
            gpu_count = torch.cuda.device_count()
            gpus = []
            for i in range(gpu_count):
                gpus.append({
                    "index": i,
                    "name": torch.cuda.get_device_name(i),
                    "memory": f"{torch.cuda.get_device_properties(i).total_memory / 1024**3:.1f}GB"
                })
            return gpu_count, gpus
        else:
            return 0, []
    except ImportError:
        return 0, []


def choose_training_mode(gpu_count, gpus):
    """Choose a training mode based on the available GPUs.

    Returns "multi", "single", "single_<index>", or None if no GPU is usable.
    """
    if gpu_count == 0:
        console.print("[yellow]⚠️ No GPUs detected - CPU training not supported[/yellow]")
        return None
    elif gpu_count == 1:
        console.print(f"[blue]🔧 Single GPU detected - using GPU 0: {gpus[0]['name']}[/blue]")
        return "single"
    else:
        console.print(f"[blue]🔧 {gpu_count} GPUs detected - choose training mode[/blue]")

        from rich.table import Table
        gpu_table = Table(show_header=True, box=None)
        gpu_table.add_column("Index", style="cyan", width=6)
        gpu_table.add_column("Name", style="white", width=40)
        gpu_table.add_column("VRAM", style="green", width=10)

        for gpu in gpus:
            gpu_table.add_row(str(gpu['index']), gpu['name'], gpu['memory'])

        console.print(gpu_table)

        choices = [
            "Multi-GPU Training (all available GPUs)",
            "Single GPU Training (choose specific GPU)"
        ]

        questions = [
            inquirer.List('training_mode',
                          message="🔧 Training Mode: (Use arrow keys)",
                          choices=choices,
                          default=choices[0])
        ]

        answers = inquirer.prompt(questions)
        selected_mode = answers['training_mode']

        if "Multi-GPU" in selected_mode:
            return "multi"
        else:
            gpu_choices = []
            for gpu in gpus:
                gpu_choices.append(f"GPU{gpu['index']}: {gpu['name']} ({gpu['memory']})")

            questions = [
                inquirer.List('gpu_selection',
                              message="Choose GPU: (Use arrow keys)",
                              choices=gpu_choices,
                              default=gpu_choices[0])
            ]

            answers = inquirer.prompt(questions)
            selected_gpu = answers['gpu_selection']

            # Parse the index back out of a label like "GPU1: <name> (<memory>)".
            gpu_index = int(selected_gpu.split("GPU")[1].split(":")[0])
            console.print(f"[blue]Selected GPU {gpu_index}: {gpus[gpu_index]['name']}[/blue]")
            return f"single_{gpu_index}"


def show_menu():
    console.rule("[bold cyan]Humigence – Your AI. Your pipeline. Zero code.")
    print("[dim]A complete MLOps suite built for makers, teams, and enterprises.[/dim]\n")
    print("Options:")
    print("[bold green]1.[/bold green] Supervised Fine-Tuning 🚀")
    print("[bold yellow]2.[/bold yellow] RAG Implementation (coming soon)")
    print("[bold yellow]3.[/bold yellow] EnterpriseGPT (coming soon)")
    print("[bold yellow]4.[/bold yellow] Batch Inference (coming soon)")
    print("[bold yellow]5.[/bold yellow] Context Length (coming soon)")
    print("[bold red]6.[/bold red] Exit\n")


def launch_training(config, training_mode, gpus):
    """Launch training based on the selected mode."""
    import os
    import subprocess

    # Run everything from the repository root so relative paths resolve.
    humigence_dir = Path(__file__).parent.parent
    os.chdir(humigence_dir)

    # Map Hugging Face model IDs from the wizard to their Unsloth equivalents.
    model_mapping = {
        "Qwen/Qwen2.5-0.5B": "unsloth/Qwen2.5-0.5B-Instruct",
        "microsoft/Phi-2": "unsloth/Phi-2",
        "TinyLlama/TinyLlama-1.1B-Chat-v1.0": "unsloth/TinyLlama-1.1B-Chat-v1.0"
    }

    base_model = config.get("base_model", config.get("model_name", "Qwen/Qwen2.5-0.5B"))
    model_name = model_mapping.get(base_model, base_model)

    # "local:<path>" means a local JSONL dataset; anything else falls back to wikitext.
    dataset_path = config["dataset_path"]
    if dataset_path.startswith("local:"):
        dataset_name = "jsonl"
        dataset_config = dataset_path[len("local:"):]
    else:
        dataset_name = "wikitext"
        dataset_config = "wikitext-2-raw-v1"

    # Translate the wizard's recipe label into the trainer's precision flag.
    training_recipe = config.get("training_recipe", "QLoRA (4-bit NF4)")
    if "QLoRA" in training_recipe:
        precision = "qlora_4bit"
    elif "BF16" in training_recipe:
        precision = "lora_bf16"
    else:
        precision = "lora_fp16"

    timestamp = time.strftime("%Y%m%d_%H%M%S")
if training_mode == "multi": |
|
|
|
|
|
output_dir = f"./runs/humigence/out_lora_dual_{timestamp}" |
|
|
console.print("[bold green]π Launching multi-GPU training with Unsloth...[/bold green]") |
|
|
|
|
|
cmd = [ |
|
|
"torchrun", |
|
|
"--nproc_per_node=2", |
|
|
"training/unsloth/train_lora_dual.py", |
|
|
"--model", model_name, |
|
|
"--dataset", dataset_name, |
|
|
"--dataset_config", dataset_config, |
|
|
"--out_dir", output_dir, |
|
|
"--max_steps", "1000", |
|
|
"--per_device_batch", "2", |
|
|
"--grad_accum", "4", |
|
|
"--learning_rate", "2e-4", |
|
|
"--block_size", "1024", |
|
|
"--lora_r", "16", |
|
|
"--lora_alpha", "32", |
|
|
"--lora_dropout", "0.0", |
|
|
"--precision", precision |
|
|
] |
|
|
|
|
|
console.print(f"[dim]Command: {' '.join(cmd)}[/dim]") |
|
|
|
|
|
try: |
|
|
result = subprocess.run(cmd, check=True, cwd=humigence_dir) |
|
|
console.print("[bold green]β
Multi-GPU training completed successfully![/bold green]") |
|
|
console.print(f"[blue]π Output saved to: {output_dir}[/blue]") |
|
|
return True |
|
|
except subprocess.CalledProcessError as e: |
|
|
console.print(f"[bold red]β Multi-GPU training failed with return code: {e.returncode}[/bold red]") |
|
|
console.print("[yellow]π Falling back to single-GPU training...[/yellow]") |
|
|
|
|
|
training_mode = "single" |
|
|
|
|
|
if training_mode == "single" or training_mode.startswith("single_"): |
|
|
|
|
|
if training_mode.startswith("single_"): |
|
|
gpu_index = int(training_mode.split("_")[1]) |
|
|
output_dir = f"./runs/humigence/out_lora_single_{timestamp}_gpu{gpu_index}" |
|
|
else: |
|
|
gpu_index = 0 |
|
|
output_dir = f"./runs/humigence/out_lora_single_{timestamp}" |
|
|
|
|
|
console.print(f"[bold green]π Launching single-GPU training with Unsloth...[/bold green]") |
|
|
console.print(f"[blue]Using GPU {gpu_index}: {gpus[gpu_index]['name']}[/blue]") |
|
|
|
|
|
cmd = [ |
|
|
"python3", |
|
|
"training/unsloth/train_lora_dual.py", |
|
|
"--model", model_name, |
|
|
"--dataset", dataset_name, |
|
|
"--dataset_config", dataset_config, |
|
|
"--out_dir", output_dir, |
|
|
"--max_steps", "1000", |
|
|
"--per_device_batch", "4", |
|
|
"--grad_accum", "2", |
|
|
"--learning_rate", "2e-4", |
|
|
"--block_size", "1024", |
|
|
"--lora_r", "16", |
|
|
"--lora_alpha", "32", |
|
|
"--lora_dropout", "0.0", |
|
|
"--precision", precision |
|
|
] |
|
|
|
|
|
console.print(f"[dim]Command: {' '.join(cmd)}[/dim]") |
|
|
|
|
|
|
|
|
env = os.environ.copy() |
|
|
env["CUDA_VISIBLE_DEVICES"] = str(gpu_index) |
|
|
|
|
|
try: |
|
|
result = subprocess.run(cmd, check=True, cwd=humigence_dir, env=env) |
|
|
console.print("[bold green]β
Single-GPU training completed successfully![/bold green]") |
|
|
console.print(f"[blue]π Output saved to: {output_dir}[/blue]") |
|
|
return True |
|
|
except subprocess.CalledProcessError as e: |
|
|
console.print(f"[bold red]β Single-GPU training failed with return code: {e.returncode}[/bold red]") |
|
|
return False |
|
|
except Exception as e: |
|
|
console.print(f"[bold red]β Single-GPU training failed: {e}[/bold red]") |
|
|
return False |
|
|
|
|
|
return False |
|
|
|
|
|


def main():
    while True:
        show_menu()
        choice = console.input("[bold blue]Select an option[/bold blue]: ")

        if choice == "1":
            console.print("[bold green]Starting Supervised Fine-Tuning...[/bold green]")

            # Run the interactive wizard; it returns the path to the saved config,
            # or None if the user cancelled.
            config_path = collect_training_config()

            if config_path is None:
                console.print("[bold red]❌ Training cancelled. Returning to main menu.[/bold red]")
                time.sleep(2)
                continue

            import json
            with open(config_path, 'r') as f:
                config = json.load(f)

            if not check_unsloth_availability():
                console.print("[bold red]❌ Missing required dependencies: No module named 'unsloth'[/bold red]")
                console.print("[yellow]💡 To install, run:[/yellow]")
                console.print("[cyan]python3 training/unsloth/setup_humigence_unsloth.py[/cyan]")
                time.sleep(2)
                continue

            gpu_count, gpus = detect_gpus()
            training_mode = choose_training_mode(gpu_count, gpus)

            if training_mode is None:
                console.print("[bold red]❌ No suitable training mode available. Returning to main menu.[/bold red]")
                time.sleep(2)
                continue

            success = launch_training(config, training_mode, gpus)

            if success:
                console.print("\n[bold cyan]Training completed![/bold cyan]")
            else:
                console.print("[bold red]❌ Training failed. Check the logs above for details.[/bold red]")

            if console.input("[bold blue]Start another training session? (y/N)[/bold blue]: ").lower() in ['y', 'yes']:
                continue
            else:
                break
        elif choice == "6":
            console.print("[bold red]Exiting Humigence CLI. Goodbye![/bold red]")
            time.sleep(1)
            sys.exit()
        else:
            console.print("[yellow]⚠️ Option not implemented yet. Try 1 or 6.[/yellow]\n")
            time.sleep(1)


if __name__ == "__main__":
    main()
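
# ---------------------------------------------------------------------------
# Usage sketch (assumptions, not part of the original script):
#
# Assuming this file lives under the project's `cli/` directory (it imports
# `cli.config_wizard` via the sys.path insert above), it would typically be
# launched from the repository root as:
#
#     python3 cli/<this_script>.py        # <this_script> is a placeholder
#
# The wizard-produced config is loaded as JSON; based only on the keys this
# script reads, a minimal example could look like:
#
#     {
#       "base_model": "Qwen/Qwen2.5-0.5B",
#       "dataset_path": "local:data/train.jsonl",
#       "training_recipe": "QLoRA (4-bit NF4)"
#     }
#
# The exact filename and any additional config keys are assumptions; see
# cli/config_wizard.py for the authoritative schema.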