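"""Interactive wizard for configuring a single-GPU LoRA fine-tuning run.

The wizard prompts for a base model, a dataset, a LoRA preset (or custom
r / alpha / dropout values), and basic training parameters, then writes the
resulting configuration to lora_config.json together with a reproduce.sh
script inside a timestamped output directory.
"""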
from InquirerPy import prompt
from rich.console import Console
from rich.table import Table
from utils.device import get_system_info
from utils.validators import detect_datasets
import os
import json
from pathlib import Path
import datetime
import re

console = Console()
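
# Prompt-validation helpers: return False on non-numeric input so the
# prompts below re-ask instead of raising ValueError inside the callback.
def _is_positive_float(value):
    """True if the input parses as a float greater than zero."""
    try:
        return float(value) > 0
    except ValueError:
        return False

def _is_valid_dropout(value):
    """True if the input parses as a float between 0 and 1 (inclusive)."""
    try:
        return 0 <= float(value) <= 1
    except ValueError:
        return False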

def display_system_summary():
    info = get_system_info()

    table = Table(title="🖥️ System Detection Summary", show_lines=True)
    table.add_column("Property", style="cyan", no_wrap=True)
    table.add_column("Value", style="green")

    for key, val in info.items():
        if key == "GPUs":
            for i, gpu in enumerate(val):
                table.add_row(f"GPU {i} Name", gpu['name'])
                table.add_row(f"GPU {i} Memory", gpu['memory'])
        else:
            table.add_row(key, str(val))

    console.print("\n")
    console.print(table)

def get_available_models():
    """Get available models for LoRA training with auto-detection."""
    # Default Hugging Face cache path; cached repos live in directories
    # named "models--<org>--<name>" under this folder.
    hf_cache = os.path.expanduser("~/.cache/huggingface/hub")
    model_choices = []

    if os.path.isdir(hf_cache):
        for entry in os.listdir(hf_cache):
            if entry.startswith("models--") and os.path.isdir(os.path.join(hf_cache, entry)):
                # "models--meta-llama--Meta-Llama-3-8B" -> "meta-llama/Meta-Llama-3-8B"
                model_choices.append(entry[len("models--"):].replace("--", "/"))
    
    # Add popular models for LoRA training
    model_choices += [
        "meta-llama/Meta-Llama-3-8B-Instruct",
        "meta-llama/Meta-Llama-3-70B-Instruct", 
        "mistralai/Mistral-7B-Instruct-v0.1",
        "mistralai/Mixtral-8x7B-Instruct-v0.1",
        "microsoft/Phi-2",
        "microsoft/Phi-3-mini-4k-instruct",
        "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
        "Qwen/Qwen1.5-0.5B",
        "Qwen/Qwen1.5-1.8B",
        "Qwen/Qwen1.5-7B",
        "google/gemma-2-2b-it",
        "google/gemma-2-9b-it",
        "manual-entry (custom path/repo)"
    ]

    # De-dupe and sort
    return sorted(set(model_choices))

def get_available_datasets():
    """Get available datasets for LoRA training."""
    # Detect local datasets
    local_datasets = detect_datasets()
    
    # Add popular Hugging Face datasets
    hf_datasets = [
        ("wikitext-2-raw-v1", "Hugging Face - WikiText-2 (Raw)"),
        ("wikitext-103-raw-v1", "Hugging Face - WikiText-103 (Raw)"),
        ("openwebtext", "Hugging Face - OpenWebText"),
        ("c4", "Hugging Face - C4 (Common Crawl)"),
        ("bookcorpus", "Hugging Face - BookCorpus"),
    ]
    
    # Combine local and HF datasets
    all_datasets = []
    
    # Add local datasets first
    for name, path in local_datasets:
        all_datasets.append((f"Local - {name}", f"local:{path}"))
    
    # Add HF datasets
    for dataset_id, display_name in hf_datasets:
        all_datasets.append((display_name, f"hf:{dataset_id}"))
    
    return all_datasets

def generate_output_directory(model_name, dataset_name):
    """Generate a meaningful output directory name, e.g. out_lora_<model>_<dataset>_<timestamp>."""
    # Replace path separators and any other characters that are awkward in
    # directory names or unquoted shell arguments
    model_clean = re.sub(r"[^A-Za-z0-9._-]+", "_", model_name)
    dataset_clean = re.sub(r"[^A-Za-z0-9._-]+", "_", dataset_name)

    # Timestamp keeps repeated runs from overwriting each other
    timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")

    return f"out_lora_{model_clean}_{dataset_clean}_{timestamp}"

def get_lora_presets():
    """Get LoRA configuration presets."""
    return [
        {
            "name": "Efficient (r=8, Ξ±=16)",
            "description": "Fast training, lower memory usage",
            "r": 8,
            "alpha": 16,
            "dropout": 0.05
        },
        {
            "name": "Balanced (r=16, Ξ±=32)", 
            "description": "Good balance of performance and speed",
            "r": 16,
            "alpha": 32,
            "dropout": 0.05
        },
        {
            "name": "High Quality (r=32, Ξ±=64)",
            "description": "Better performance, more parameters",
            "r": 32,
            "alpha": 64,
            "dropout": 0.1
        },
        {
            "name": "Custom Configuration",
            "description": "Set your own LoRA parameters",
            "r": 16,
            "alpha": 32,
            "dropout": 0.05
        }
    ]

def run():
    """Run the LoRA training wizard."""
    console.print("\n[bold magenta]πŸ”§ Single-GPU LoRA Training Setup[/bold magenta]")

    # Setup mode selection
    questions = [
        {
            "type": "list",
            "name": "setup_mode",
            "message": "Choose Setup Mode:",
            "choices": [
                "Quick Start – Recommended settings for most users",
                "Custom Setup – Full control over all parameters"
            ],
        }
    ]

    answers = prompt(questions)
    setup_mode = answers.get("setup_mode").split(" ")[0].lower()  # 'quick' or 'custom'
    
    console.print(f"\n[green]βœ… You selected:[/green] [yellow]{answers.get('setup_mode')}[/yellow]")

    # Display system summary
    display_system_summary()

    # Model selection
    console.print("\n[bold blue]🧠 Model Selection[/bold blue]")
    available_models = get_available_models()
    
    model_question = [
        {
            "type": "list",
            "name": "base_model",
            "message": "Choose Base Model for LoRA Training:",
            "choices": available_models
        }
    ]

    model_answer = prompt(model_question)
    selected_model = model_answer.get("base_model")

    # Handle manual entry
    if selected_model == "manual-entry (custom path/repo)":
        manual_input = prompt([
            {
                "type": "input",
                "name": "custom_model",
                "message": "Enter Hugging Face repo or local model path:",
                "validate": lambda x: len(x.strip()) > 0
            }
        ])
        selected_model = manual_input.get("custom_model")

    console.print(f"\n[green]βœ… Selected model:[/green] [yellow]{selected_model}[/yellow]")

    # Dataset selection
    console.print("\n[bold blue]πŸ“š Dataset Selection[/bold blue]")
    available_datasets = get_available_datasets()
    
    if not available_datasets:
        console.print("[bold red]⚠️ No datasets found! Please ensure you have datasets available.[/bold red]")
        return None

    dataset_question = [
        {
            "type": "list",
            "name": "dataset",
            "message": "Choose Dataset for Training:",
            "choices": [name for name, _ in available_datasets]
        }
    ]

    dataset_answer = prompt(dataset_question)
    selected_dataset_display = dataset_answer.get("dataset")
    
    # Find the actual dataset path
    selected_dataset = None
    for name, path in available_datasets:
        if name == selected_dataset_display:
            selected_dataset = path
            break

    console.print(f"\n[green]βœ… Selected dataset:[/green] [yellow]{selected_dataset_display}[/yellow]")

    # Generate output directory
    output_dir = generate_output_directory(selected_model, selected_dataset_display)
    console.print(f"\n[green]πŸ“ Output directory:[/green] [yellow]{output_dir}[/yellow]")

    # LoRA configuration
    console.print("\n[bold blue]βš™οΈ LoRA Configuration[/bold blue]")
    lora_presets = get_lora_presets()
    
    lora_question = [
        {
            "type": "list",
            "name": "lora_preset",
            "message": "Choose LoRA Configuration:",
            "choices": [f"{preset['name']} - {preset['description']}" for preset in lora_presets]
        }
    ]

    lora_answer = prompt(lora_question)
    selected_preset = lora_answer.get("lora_preset").split(" - ")[0]
    
    # Find the preset
    selected_lora_config = None
    for preset in lora_presets:
        if preset['name'] == selected_preset:
            selected_lora_config = preset
            break

    console.print(f"\n[green]βœ… LoRA config:[/green] [yellow]{selected_preset}[/yellow]")

    # Training parameters
    if setup_mode == "custom":
        console.print("\n[bold blue]🎯 Training Parameters[/bold blue]")
        
        param_questions = [
            {
                "type": "input",
                "name": "max_steps",
                "message": "Maximum training steps:",
                "default": "1000",
                "validate": lambda x: x.isdigit() and int(x) > 0
            },
            {
                "type": "input", 
                "name": "batch_size",
                "message": "Per-device batch size:",
                "default": "4",
                "validate": lambda x: x.isdigit() and int(x) > 0
            },
            {
                "type": "input",
                "name": "grad_accum",
                "message": "Gradient accumulation steps:",
                "default": "4", 
                "validate": lambda x: x.isdigit() and int(x) > 0
            },
            {
                "type": "input",
                "name": "learning_rate",
                "message": "Learning rate:",
                "default": "2e-4",
                "validate": lambda x: float(x) > 0
            },
            {
                "type": "input",
                "name": "block_size",
                "message": "Block size for text grouping:",
                "default": "512",
                "validate": lambda x: x.isdigit() and int(x) > 0
            }
        ]
        
        param_answers = prompt(param_questions)
    else:
        # Quick start defaults
        param_answers = {
            "max_steps": "1000",
            "batch_size": "4", 
            "grad_accum": "4",
            "learning_rate": "2e-4",
            "block_size": "512"
        }

    # Custom LoRA parameters if needed
    if selected_preset == "Custom Configuration":
        console.print("\n[bold blue]πŸ”§ Custom LoRA Parameters[/bold blue]")
        
        custom_lora_questions = [
            {
                "type": "input",
                "name": "lora_r",
                "message": "LoRA rank (r):",
                "default": "16",
                "validate": lambda x: x.isdigit() and int(x) > 0
            },
            {
                "type": "input",
                "name": "lora_alpha", 
                "message": "LoRA alpha:",
                "default": "32",
                "validate": lambda x: x.isdigit() and int(x) > 0
            },
            {
                "type": "input",
                "name": "lora_dropout",
                "message": "LoRA dropout:",
                "default": "0.05",
                "validate": lambda x: 0 <= float(x) <= 1
            }
        ]
        
        custom_lora_answers = prompt(custom_lora_questions)
        selected_lora_config.update({
            "r": int(custom_lora_answers["lora_r"]),
            "alpha": int(custom_lora_answers["lora_alpha"]),
            "dropout": float(custom_lora_answers["lora_dropout"])
        })

    # Parse dataset type
    if selected_dataset.startswith("local:"):
        dataset_name = "jsonl"
        dataset_config = selected_dataset[6:]  # Remove "local:" prefix
    elif selected_dataset.startswith("hf:"):
        dataset_name = "wikitext"
        dataset_config = selected_dataset[3:]  # Remove "hf:" prefix
    else:
        dataset_name = "wikitext"
        dataset_config = selected_dataset

    # Combine all configuration
    final_config = {
        "setup_mode": setup_mode,
        "base_model": selected_model,
        "dataset_name": dataset_name,
        "dataset_config": dataset_config,
        "dataset_display": selected_dataset_display,
        "output_dir": output_dir,
        "lora_config": selected_lora_config,
        "max_steps": int(param_answers["max_steps"]),
        "batch_size": int(param_answers["batch_size"]),
        "grad_accum": int(param_answers["grad_accum"]),
        "learning_rate": float(param_answers["learning_rate"]),
        "block_size": int(param_answers["block_size"]),
        "timestamp": datetime.datetime.now().isoformat()
    }

    # Display configuration summary
    console.print("\n[bold cyan]πŸ“‹ Configuration Summary[/bold cyan]")
    summary_table = Table(show_header=True, header_style="bold magenta")
    summary_table.add_column("Parameter", style="cyan")
    summary_table.add_column("Value", style="green")
    
    summary_table.add_row("Model", selected_model)
    summary_table.add_row("Dataset", selected_dataset_display)
    summary_table.add_row("Output Directory", output_dir)
    summary_table.add_row("LoRA Rank (r)", str(selected_lora_config["r"]))
    summary_table.add_row("LoRA Alpha", str(selected_lora_config["alpha"]))
    summary_table.add_row("LoRA Dropout", str(selected_lora_config["dropout"]))
    summary_table.add_row("Max Steps", str(final_config["max_steps"]))
    summary_table.add_row("Batch Size", str(final_config["batch_size"]))
    summary_table.add_row("Grad Accumulation", str(final_config["grad_accum"]))
    summary_table.add_row("Learning Rate", str(final_config["learning_rate"]))
    summary_table.add_row("Block Size", str(final_config["block_size"]))
    
    console.print(summary_table)

    # Create output directory
    Path(output_dir).mkdir(parents=True, exist_ok=True)

    # Save configuration
    config_path = Path(output_dir) / "lora_config.json"
    with open(config_path, "w") as f:
        json.dump(final_config, f, indent=2)

    console.print(f"\n[bold green]βœ… Configuration saved to:[/bold green] [cyan]{config_path}[/cyan]")

    # Generate reproduction script
    reproduce_script = f"""#!/bin/bash
# Re-run this exact LoRA training config
cd {Path.cwd()}
python3 cli/train_lora_single.py \\
    --model {selected_model} \\
    --output-dir {output_dir} \\
    --max-steps {final_config["max_steps"]} \\
    --batch-size {final_config["batch_size"]} \\
    --grad-accum {final_config["grad_accum"]} \\
    --learning-rate {final_config["learning_rate"]} \\
    --block-size {final_config["block_size"]} \\
    --lora-r {selected_lora_config["r"]} \\
    --lora-alpha {selected_lora_config["alpha"]} \\
    --lora-dropout {selected_lora_config["dropout"]} \\
    --dataset {dataset_name} \\
    --dataset-config {dataset_config}
"""

    reproduce_path = Path(output_dir) / "reproduce.sh"
    with open(reproduce_path, "w") as f:
        f.write(reproduce_script)
    reproduce_path.chmod(0o755)

    console.print(f"[bold green]βœ… Reproduction script saved to:[/bold green] [cyan]{reproduce_path}[/cyan]")

    # Final confirmation
    final_prompt = prompt([
        {
            "type": "confirm",
            "name": "confirm_training",
            "message": "πŸš€ Start LoRA training now?",
            "default": True
        }
    ])

    if not final_prompt["confirm_training"]:
        console.print("[bold yellow]❌ Training cancelled.[/bold yellow]")
        return None
    else:
        console.print("[bold green]πŸš€ Starting LoRA training...[/bold green]")
        return final_config

if __name__ == "__main__":
    run()