File size: 12,512 Bytes
7275aef
 
c4b369c
7275aef
c4b369c
 
7275aef
c4b369c
 
7275aef
 
 
 
 
 
 
 
c4b369c
 
 
7275aef
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c4b369c
 
7275aef
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
# cli/main.py
# Interactive CLI entry point for the Humigence suite: shows a menu,
# runs the configuration wizard, detects GPUs, and launches LoRA
# fine-tuning in a subprocess.

import sys
import time
from pathlib import Path

# Add the parent directory to the path so we can import from pipelines
sys.path.insert(0, str(Path(__file__).parent.parent))

# DO NOT import Unsloth here - delay until after wizard completion
# Tri-state flag: None = not yet probed; True/False once
# check_unsloth_availability() has attempted the import.
UNSLOTH_AVAILABLE = None  # Will be checked later

from cli.config_wizard import collect_training_config
from cli.atomic_eval import app as atomic_eval_app
from rich import print
from rich.console import Console
import inquirer

# Shared Rich console used by every function in this module.
console = Console()

def check_unsloth_availability():
    """Lazily probe whether the Unsloth package can be imported.

    Caches the outcome in the module-level ``UNSLOTH_AVAILABLE`` flag:
    the first call attempts the import and records True/False; later
    calls return the cached value without importing again.
    """
    global UNSLOTH_AVAILABLE
    if UNSLOTH_AVAILABLE is not None:
        return UNSLOTH_AVAILABLE
    try:
        import unsloth  # noqa: F401 - import is the availability probe
        UNSLOTH_AVAILABLE = True
    except ImportError:
        UNSLOTH_AVAILABLE = False
    return UNSLOTH_AVAILABLE

def detect_gpus():
    """Return ``(gpu_count, gpu_infos)`` for all visible CUDA devices.

    Each entry of ``gpu_infos`` is a dict with keys ``index``, ``name``
    and a human-readable ``memory`` string (GiB).  Returns ``(0, [])``
    when torch is not installed or CUDA is unavailable.
    """
    try:
        import torch
    except ImportError:
        return 0, []
    if not torch.cuda.is_available():
        return 0, []
    infos = [
        {
            "index": idx,
            "name": torch.cuda.get_device_name(idx),
            "memory": f"{torch.cuda.get_device_properties(idx).total_memory / 1024**3:.1f}GB",
        }
        for idx in range(torch.cuda.device_count())
    ]
    return len(infos), infos

def choose_training_mode(gpu_count, gpus):
    """Choose training mode based on available GPUs.

    Args:
        gpu_count: number of CUDA devices (from detect_gpus()).
        gpus: per-device info dicts with 'index', 'name', 'memory'.

    Returns:
        "single" when exactly one GPU exists, "multi" for all-GPU
        training, "single_<index>" when the user picks one of several
        GPUs, or None when no GPU is usable or the user cancels a
        prompt.
    """
    if gpu_count == 0:
        console.print("[yellow]⚠️ No GPUs detected - CPU training not supported[/yellow]")
        return None

    if gpu_count == 1:
        console.print(f"[blue]πŸ”§ Single GPU detected - using GPU 0: {gpus[0]['name']}[/blue]")
        return "single"

    # Multiple GPUs - prompt user to choose
    console.print(f"[blue]πŸ”§ {gpu_count} GPUs detected - choose training mode[/blue]")

    # Display available GPUs
    from rich.table import Table
    gpu_table = Table(show_header=True, box=None)
    gpu_table.add_column("Index", style="cyan", width=6)
    gpu_table.add_column("Name", style="white", width=40)
    gpu_table.add_column("VRAM", style="green", width=10)

    for gpu in gpus:
        gpu_table.add_row(str(gpu['index']), gpu['name'], gpu['memory'])

    console.print(gpu_table)

    choices = [
        "Multi-GPU Training (all available GPUs)",
        "Single GPU Training (choose specific GPU)"
    ]

    questions = [
        inquirer.List('training_mode',
                     message="πŸ”§ Training Mode: (Use arrow keys)",
                     choices=choices,
                     default=choices[0])
    ]

    answers = inquirer.prompt(questions)
    # BUG FIX: inquirer.prompt() returns None when the user aborts the
    # prompt (Ctrl+C) - the original crashed with TypeError on
    # answers['training_mode']. Treat abort as cancellation.
    if answers is None:
        return None
    selected_mode = answers['training_mode']

    if "Multi-GPU" in selected_mode:
        return "multi"

    # Single GPU - let user choose which one
    gpu_choices = [
        f"GPU{gpu['index']}: {gpu['name']} ({gpu['memory']})" for gpu in gpus
    ]

    questions = [
        inquirer.List('gpu_selection',
                     message="Choose GPU: (Use arrow keys)",
                     choices=gpu_choices,
                     default=gpu_choices[0])
    ]

    answers = inquirer.prompt(questions)
    if answers is None:
        # Same abort-handling as above.
        return None
    selected_gpu = answers['gpu_selection']

    # Extract GPU index from the "GPU<n>: ..." label built above
    gpu_index = int(selected_gpu.split("GPU")[1].split(":")[0])
    console.print(f"[blue]Selected GPU {gpu_index}: {gpus[gpu_index]['name']}[/blue]")
    return f"single_{gpu_index}"

def show_menu():
    """Print the Humigence banner and the numbered top-level menu."""
    console.rule("[bold cyan]Humigence β€” Your AI. Your pipeline. Zero code.")
    print("[dim]A complete MLOps suite built for makers, teams, and enterprises.[/dim]\n")
    print("Options:")
    menu_lines = (
        "[bold green]1.[/bold green] Supervised Fine-Tuning πŸš€",
        "[bold yellow]2.[/bold yellow] RAG Implementation (coming soon)",
        "[bold yellow]3.[/bold yellow] EnterpriseGPT (coming soon)",
        "[bold yellow]4.[/bold yellow] Batch Inference (coming soon)",
        "[bold yellow]5.[/bold yellow] Context Length (coming soon)",
        "[bold red]6.[/bold red] Exit\n",
    )
    for entry in menu_lines:
        print(entry)

def launch_training(config, training_mode, gpus):
    """Launch LoRA training in a subprocess for the selected mode.

    Args:
        config: wizard config dict; reads "dataset_path" (required) and
            optionally "base_model"/"model_name" and "training_recipe".
        training_mode: "multi", "single", or "single_<gpu_index>".
        gpus: GPU info dicts from detect_gpus() (used for logging and
            for sizing the multi-GPU launch).

    Returns:
        True if the training subprocess exited successfully, else False.

    Note:
        Changes the current working directory to the humigence root as
        a side effect.
    """
    import os
    import subprocess

    # Change to the humigence directory
    humigence_dir = Path(__file__).parent.parent
    os.chdir(humigence_dir)

    # Map model names to Unsloth equivalents
    model_mapping = {
        "Qwen/Qwen2.5-0.5B": "unsloth/Qwen2.5-0.5B-Instruct",
        "microsoft/Phi-2": "unsloth/Phi-2",
        "TinyLlama/TinyLlama-1.1B-Chat-v1.0": "unsloth/TinyLlama-1.1B-Chat-v1.0"
    }

    # Use Unsloth model if available, otherwise use original
    base_model = config.get("base_model", config.get("model_name", "Qwen/Qwen2.5-0.5B"))
    model_name = model_mapping.get(base_model, base_model)

    # Determine dataset parameters
    dataset_path = config["dataset_path"]
    if dataset_path.startswith("local:"):
        # Local dataset - use as custom dataset
        dataset_name = "jsonl"
        dataset_config = dataset_path[len("local:"):]  # strip the "local:" prefix
    else:
        # Default to wikitext for demo
        dataset_name = "wikitext"
        dataset_config = "wikitext-2-raw-v1"

    # Map training recipe to precision
    training_recipe = config.get("training_recipe", "QLoRA (4-bit NF4)")
    if "QLoRA" in training_recipe:
        precision = "qlora_4bit"
    elif "BF16" in training_recipe:
        precision = "lora_bf16"
    else:
        precision = "lora_fp16"

    # Create output directory with timestamp
    timestamp = time.strftime("%Y%m%d_%H%M%S")

    if training_mode == "multi":
        # Multi-GPU training with TorchRun
        output_dir = f"./runs/humigence/out_lora_dual_{timestamp}"
        console.print("[bold green]πŸš€ Launching multi-GPU training with Unsloth...[/bold green]")

        cmd = [
            "torchrun",
            # BUG FIX: was hard-coded to 2 processes; the menu promises
            # "all available GPUs", so size the launch from the detected
            # device list.
            f"--nproc_per_node={len(gpus)}",
            "training/unsloth/train_lora_dual.py",
            "--model", model_name,
            "--dataset", dataset_name,
            "--dataset_config", dataset_config,
            "--out_dir", output_dir,
            "--max_steps", "1000",
            "--per_device_batch", "2",
            "--grad_accum", "4",
            "--learning_rate", "2e-4",
            "--block_size", "1024",
            "--lora_r", "16",
            "--lora_alpha", "32",
            "--lora_dropout", "0.0",
            "--precision", precision
        ]

        console.print(f"[dim]Command: {' '.join(cmd)}[/dim]")

        try:
            subprocess.run(cmd, check=True, cwd=humigence_dir)
            console.print("[bold green]βœ… Multi-GPU training completed successfully![/bold green]")
            console.print(f"[blue]πŸ“ Output saved to: {output_dir}[/blue]")
            return True
        except subprocess.CalledProcessError as e:
            console.print(f"[bold red]❌ Multi-GPU training failed with return code: {e.returncode}[/bold red]")
            console.print("[yellow]πŸ”„ Falling back to single-GPU training...[/yellow]")
            # Fall through to single-GPU fallback below
            training_mode = "single"
        except FileNotFoundError:
            # torchrun binary missing from PATH - fall back to a plain
            # python3 single-GPU launch instead of crashing.
            console.print("[bold red]❌ torchrun not found on PATH[/bold red]")
            console.print("[yellow]πŸ”„ Falling back to single-GPU training...[/yellow]")
            training_mode = "single"

    if training_mode == "single" or training_mode.startswith("single_"):
        # Single-GPU training; "single_<n>" carries the chosen device.
        if training_mode.startswith("single_"):
            gpu_index = int(training_mode.split("_")[1])
            output_dir = f"./runs/humigence/out_lora_single_{timestamp}_gpu{gpu_index}"
        else:
            gpu_index = 0
            output_dir = f"./runs/humigence/out_lora_single_{timestamp}"

        console.print("[bold green]πŸš€ Launching single-GPU training with Unsloth...[/bold green]")
        console.print(f"[blue]Using GPU {gpu_index}: {gpus[gpu_index]['name']}[/blue]")

        cmd = [
            "python3",
            "training/unsloth/train_lora_dual.py",
            "--model", model_name,
            "--dataset", dataset_name,
            "--dataset_config", dataset_config,
            "--out_dir", output_dir,
            "--max_steps", "1000",
            "--per_device_batch", "4",  # Larger batch for single GPU
            "--grad_accum", "2",        # Less accumulation for single GPU
            "--learning_rate", "2e-4",
            "--block_size", "1024",
            "--lora_r", "16",
            "--lora_alpha", "32",
            "--lora_dropout", "0.0",
            "--precision", precision
        ]

        console.print(f"[dim]Command: {' '.join(cmd)}[/dim]")

        # Pin the subprocess to the chosen GPU only.
        env = os.environ.copy()
        env["CUDA_VISIBLE_DEVICES"] = str(gpu_index)

        try:
            subprocess.run(cmd, check=True, cwd=humigence_dir, env=env)
            console.print("[bold green]βœ… Single-GPU training completed successfully![/bold green]")
            console.print(f"[blue]πŸ“ Output saved to: {output_dir}[/blue]")
            return True
        except subprocess.CalledProcessError as e:
            console.print(f"[bold red]❌ Single-GPU training failed with return code: {e.returncode}[/bold red]")
            return False
        except Exception as e:
            console.print(f"[bold red]❌ Single-GPU training failed: {e}[/bold red]")
            return False

    return False

def main():
    """Top-level interactive loop: show the menu and dispatch choices.

    Only option 1 (Supervised Fine-Tuning) and option 6 (Exit) are
    implemented; any other input prints a notice and re-shows the menu.
    """
    while True:
        show_menu()
        choice = console.input("[bold blue]Select an option[/bold blue]: ")

        if choice == "1":
            console.print("[bold green]Starting Supervised Fine-Tuning...[/bold green]")
            
            # Step 1: Run the configuration wizard (no Unsloth import yet)
            config_path = collect_training_config()
            
            if config_path is None:
                # User cancelled or error occurred
                console.print("[bold red]❌ Training cancelled. Returning to main menu.[/bold red]")
                time.sleep(2)
                continue
            
            # Step 2: Load the configuration from the wizard
            import json
            with open(config_path, 'r') as f:
                config = json.load(f)
            
            # Step 3: NOW check if Unsloth dependencies are available (after wizard completion)
            if not check_unsloth_availability():
                console.print("[bold red]❌ Missing required dependencies: No module named 'unsloth'[/bold red]")
                console.print("[yellow]➑ To install, run:[/yellow]")
                console.print("[cyan]python3 training/unsloth/setup_humigence_unsloth.py[/cyan]")
                time.sleep(2)
                continue
            
            # Step 4: Detect GPUs BEFORE importing Unsloth (to avoid interference)
            gpu_count, gpus = detect_gpus()
            training_mode = choose_training_mode(gpu_count, gpus)
            
            if training_mode is None:
                console.print("[bold red]❌ No suitable training mode available. Returning to main menu.[/bold red]")
                time.sleep(2)
                continue
            
            # Step 5: Launch training
            success = launch_training(config, training_mode, gpus)
            
            if not success:
                console.print("[bold red]❌ Training failed. Check the logs above for details.[/bold red]")
            
            # Ask if user wants to start another training session
            # NOTE(review): this banner prints even when training failed;
            # the failure message above is the only distinction - confirm
            # this is intentional before changing it.
            console.print("\n[bold cyan]Training completed![/bold cyan]")
            if console.input("[bold blue]Start another training session? (y/N)[/bold blue]: ").lower() in ['y', 'yes']:
                continue
            else:
                break
        elif choice == "6":
            console.print("[bold red]Exiting Humigence CLI. Goodbye![/bold red]")
            time.sleep(1)
            sys.exit()
        else:
            console.print("[yellow]⚠️ Option not implemented yet. Try 1 or 6.[/yellow]\n")
            time.sleep(1)

# Standard script entry guard so importing this module has no side effects.
if __name__ == "__main__":
    main()