Prithvik-1 committed on
Commit d048517 · verified · 1 Parent(s): 7d4b5f8

Upload scripts/training/finetune_codellama.py with huggingface_hub

scripts/training/finetune_codellama.py ADDED
@@ -0,0 +1,516 @@
+ #!/usr/bin/env python3
+ """
+ Enhanced Fine-tuning script for CodeLlama with optimized hyperparameters
+ Supports:
+ - Resume from checkpoint (automatic detection)
+ - Incremental fine-tuning (continue from existing adapter)
+ - Fresh training option
+ """
+
+ import os
+ import sys
+ import torch
+ import json
+ from pathlib import Path
+ from datasets import Dataset
+ from transformers import (
+     AutoModelForCausalLM,
+     AutoTokenizer,
+     TrainingArguments,
+     BitsAndBytesConfig,
+     Trainer,
+     DataCollatorForLanguageModeling,
+     EarlyStoppingCallback,
+ )
+ from peft import (
+     LoraConfig,
+     PeftModel,
+     get_peft_model,
+     prepare_model_for_kbit_training,
+     TaskType,
+ )
+
+ def get_device_info():
+     """Detect and return available compute device"""
+     device_info = {
+         "device": "cpu",
+         "device_type": "cpu",
+         "use_quantization": False,
+         "dtype": torch.float32
+     }
+
+     if torch.cuda.is_available():
+         device_info["device"] = "cuda"
+         device_info["device_type"] = "cuda"
+         device_info["use_quantization"] = True
+         device_info["dtype"] = torch.float16
+         device_info["device_count"] = torch.cuda.device_count()
+         device_info["device_name"] = torch.cuda.get_device_name(0)
+         print(f"✓ CUDA GPU detected: {device_info['device_name']} (Count: {device_info['device_count']})")
+     else:
+         print("⚠ No GPU detected, using CPU (training will be very slow)")
+
+     return device_info
+
+ def get_bitsandbytes_config():
+     """Get BitsAndBytes config if CUDA is available"""
+     if torch.cuda.is_available():
+         return BitsAndBytesConfig(
+             load_in_4bit=True,
+             bnb_4bit_quant_type="nf4",
+             bnb_4bit_compute_dtype=torch.float16,
+             bnb_4bit_use_double_quant=True,
+         )
+     return None
+
+ def load_and_prepare_model(
+     model_name: str,
+     adapter_path: str | None = None,
+     lora_r: int = 48,
+     lora_alpha: int = 96,
+     lora_dropout: float = 0.15
+ ):
+     """Load CodeLlama model with optimized LoRA configuration"""
+     device_info = get_device_info()
+     print(f"\nLoading model: {model_name}")
+
+     # Tokenizer
+     tokenizer_source = adapter_path if adapter_path and os.path.isdir(adapter_path) else model_name
+     tokenizer = AutoTokenizer.from_pretrained(tokenizer_source)
+     if tokenizer.pad_token is None:
+         tokenizer.pad_token = tokenizer.eos_token
+         tokenizer.pad_token_id = tokenizer.eos_token_id
+
+     # Quantization config
+     bnb_config = get_bitsandbytes_config()
+
+     # Model loading kwargs
+     model_kwargs = {
+         "trust_remote_code": True,
+     }
+
+     if bnb_config is not None:
+         print("Using 4-bit quantization (CUDA)")
+         model_kwargs["quantization_config"] = bnb_config
+         model_kwargs["device_map"] = "auto"
+     else:
+         model_kwargs["torch_dtype"] = device_info["dtype"]
+         model_kwargs["device_map"] = "auto"
+
+     # Load base model
+     base_model = AutoModelForCausalLM.from_pretrained(model_name, **model_kwargs)
+
+     # Prepare for k-bit training
+     if bnb_config is not None:
+         base_model = prepare_model_for_kbit_training(base_model)
+
+     # LoRA configuration (optimized for CodeLlama)
+     lora_config = LoraConfig(
+         r=lora_r,
+         lora_alpha=lora_alpha,
+         target_modules=["q_proj", "v_proj", "k_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
+         lora_dropout=lora_dropout,
+         bias="none",
+         task_type=TaskType.CAUSAL_LM,
+     )
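+     # Note: PEFT scales the LoRA update by lora_alpha / r, so the defaults above
+     # (alpha=96, r=48) apply a scaling factor of 2.0 to the low-rank update.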
+
+     # Load or create LoRA adapter
+     if adapter_path and os.path.isdir(adapter_path):
+         print(f"📂 Loading existing LoRA adapter from: {adapter_path}")
+         print(" (Incremental fine-tuning mode - continuing from existing model)")
+         model = PeftModel.from_pretrained(base_model, adapter_path, is_trainable=True)
+     else:
+         print("🆕 Creating new LoRA adapter (Fresh training mode)")
+         model = get_peft_model(base_model, lora_config)
+
+     # Enable gradient checkpointing
+     model.gradient_checkpointing_enable()
+
+     trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
+     total_params = sum(p.numel() for p in model.parameters())
+     trainable_ratio = (trainable_params / total_params) * 100
+
+     print(f"\nModel loaded successfully!")
+     print(f" - Device: {device_info['device']}")
+     print(f" - Trainable parameters: {trainable_params:,}")
+     print(f" - Total parameters: {total_params:,}")
+     print(f" - Trainable ratio: {trainable_ratio:.2f}%")
+
+     return model, tokenizer, device_info
+
+ def tokenize_function(examples, tokenizer, max_length=1536):
+     """Tokenize function for dataset"""
+     # Ensure pad_token is set
+     if tokenizer.pad_token is None:
+         tokenizer.pad_token = tokenizer.eos_token
+         tokenizer.pad_token_id = tokenizer.eos_token_id
+
+     # Combine instruction and response
+     # For CodeLlama chat format: instruction already ends with [/INST]
+     # So we just append: instruction + response + EOS
+     texts = []
+     for instruction, response in zip(examples["instruction"], examples["response"]):
+         # Instruction already contains: <s>[INST]...[/INST]
+         # We append response + EOS
+         text = f"{instruction}{response}{tokenizer.eos_token}"
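+         # Illustrative (hypothetical) example of one combined training string:
+         #   <s>[INST] Write a function that adds two numbers. [/INST] def add(a, b): return a + b</s>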
+         texts.append(text)
+
+     # Tokenize with padding to max_length for consistent batch sizes
+     tokenized = tokenizer(
+         texts,
+         truncation=True,
+         max_length=max_length,
+         padding="max_length",
+         return_tensors=None,  # Return lists, not tensors
+     )
+
+     # Labels are same as input_ids for causal LM
+     labels = []
+     pad_token_id = tokenizer.pad_token_id if tokenizer.pad_token_id is not None else tokenizer.eos_token_id
+
+     # Set labels, masking padding tokens with -100 (ignored in loss)
+     for input_ids_seq in tokenized["input_ids"]:
+         label_seq = input_ids_seq.copy()
+         # Mask padding tokens
+         label_seq = [-100 if token_id == pad_token_id else token_id for token_id in label_seq]
+         labels.append(label_seq)
+
+     tokenized["labels"] = labels
+
+     return tokenized
+
+ def find_checkpoint(output_dir):
+     """Find the latest checkpoint in output directory"""
+     checkpoint_dir = Path(output_dir)
+     if not checkpoint_dir.exists():
+         return None
+
+     # Look for checkpoint directories
+     checkpoints = []
+     for item in checkpoint_dir.iterdir():
+         if item.is_dir() and item.name.startswith("checkpoint-"):
+             try:
+                 step_num = int(item.name.split("-")[1])
+                 trainer_state = item / "trainer_state.json"
+                 if trainer_state.exists():
+                     checkpoints.append((step_num, str(item)))
+             except (ValueError, IndexError):
+                 continue
+
+     if checkpoints:
+         # Sort by step number and return latest
+         checkpoints.sort(key=lambda x: x[0], reverse=True)
+         return checkpoints[0][1]
+
+     return None
+
+ def load_training_data(file_path):
+     """Load training data from JSONL file"""
+     print(f"Loading training data from {file_path}")
+
+     if not os.path.exists(file_path):
+         raise FileNotFoundError(f"Training data file not found: {file_path}")
+
+     data = []
+     with open(file_path, 'r', encoding='utf-8') as f:
+         for line in f:
+             line = line.strip()
+             if line:
+                 try:
+                     data.append(json.loads(line))
+                 except json.JSONDecodeError as e:
+                     print(f"⚠️ Warning: Skipping invalid JSON line: {e}")
+                     continue
+
+     return data
+
+ def main():
+     import argparse
+
+     parser = argparse.ArgumentParser(description="Fine-tune CodeLlama with optimized hyperparameters")
+     parser.add_argument("--base-model", required=True, help="Base model path or HuggingFace ID")
+     parser.add_argument("--adapter-path", default=None, help="Path to existing LoRA adapter (for incremental fine-tuning)")
+     parser.add_argument("--dataset", required=True, help="Path to training dataset JSONL")
+     parser.add_argument("--output-dir", required=True, help="Output directory for fine-tuned model")
+     parser.add_argument("--resume-from-checkpoint", default=None, help="Resume from specific checkpoint (or 'auto' to find latest)")
+     parser.add_argument("--fresh", action="store_true", help="Force fresh training (ignore existing checkpoints)")
+
+     # Hyperparameters (optimized for CodeLlama based on HYPERPARAMETER_ANALYSIS.md)
+     parser.add_argument("--max-length", type=int, default=1536, help="Max sequence length (default: 1536)")
+     parser.add_argument("--num-epochs", type=int, default=5, help="Number of epochs (default: 5)")
+     parser.add_argument("--batch-size", type=int, default=2, help="Batch size per device (default: 2)")
+     parser.add_argument("--gradient-accumulation", type=int, default=4, help="Gradient accumulation steps (default: 4)")
+     parser.add_argument("--learning-rate", type=float, default=2e-5, help="Learning rate (default: 2e-5)")
+     parser.add_argument("--lora-r", type=int, default=48, help="LoRA rank (default: 48)")
+     parser.add_argument("--lora-alpha", type=int, default=96, help="LoRA alpha (default: 96)")
+     parser.add_argument("--lora-dropout", type=float, default=0.15, help="LoRA dropout (default: 0.15)")
+     parser.add_argument("--warmup-ratio", type=float, default=0.1, help="Warmup ratio (default: 0.1)")
+     parser.add_argument("--eval-steps", type=int, default=25, help="Evaluation steps (default: 25)")
+     parser.add_argument("--save-steps", type=int, default=25, help="Save steps (default: 25)")
+     parser.add_argument("--early-stopping-patience", type=int, default=5, help="Early stopping patience (default: 5)")
+     parser.add_argument("--logging-steps", type=int, default=5, help="Logging steps (default: 5)")
+
+     args = parser.parse_args()
+
+     print("=" * 70)
+     print("🚀 CodeLlama Fine-Tuning with Optimized Hyperparameters")
+     print("=" * 70)
+     print(f"Base model: {args.base_model}")
+     print(f"Dataset: {args.dataset}")
+     print(f"Output dir: {args.output_dir}")
+     if args.adapter_path:
+         print(f"Adapter path: {args.adapter_path} (Incremental fine-tuning)")
+     print("=" * 70)
+
+     # Check for existing checkpoint
+     resume_checkpoint = None
+     if not args.fresh:
+         if args.resume_from_checkpoint == "auto":
+             resume_checkpoint = find_checkpoint(args.output_dir)
+             if resume_checkpoint:
+                 print(f"\n✅ Found existing checkpoint: {resume_checkpoint}")
+                 print(" Training will resume from this checkpoint")
+         elif args.resume_from_checkpoint:
+             resume_checkpoint = args.resume_from_checkpoint
+             if os.path.isdir(resume_checkpoint):
+                 print(f"\n📂 Resuming from specified checkpoint: {resume_checkpoint}")
+             else:
+                 print(f"\n⚠️ Warning: Checkpoint path does not exist: {resume_checkpoint}")
+                 resume_checkpoint = None
+     else:
+         print("\n🆕 Fresh training mode - starting from scratch")
+         # Clear any existing checkpoints if fresh mode
+         if os.path.exists(args.output_dir):
+             import shutil
+             checkpoint_dir = Path(args.output_dir)
+             for item in checkpoint_dir.iterdir():
+                 if item.is_dir() and item.name.startswith("checkpoint-"):
+                     print(f" Removing old checkpoint: {item.name}")
+                     shutil.rmtree(item)
+
+     # Load model and tokenizer
+     model, tokenizer, device_info = load_and_prepare_model(
+         args.base_model,
+         args.adapter_path,
+         lora_r=args.lora_r,
+         lora_alpha=args.lora_alpha,
+         lora_dropout=args.lora_dropout
+     )
+
+     # Check if using pre-split dataset (train.jsonl in split directory)
+     dataset_path = Path(args.dataset)
+     val_dataset_path = None
+     use_presplit = False
+
+     if dataset_path.name == "train.jsonl":
+         # Check if val.jsonl exists in same directory
+         val_path = dataset_path.parent / "val.jsonl"
+         if val_path.exists():
+             val_dataset_path = val_path
+             use_presplit = True
+             print(f"\n✅ Using pre-split dataset:")
+             print(f" Train: {dataset_path}")
+             print(f" Val: {val_dataset_path}")
+
+     # Load training data
+     training_data = load_training_data(args.dataset)
+
+     # Convert to dataset format
+     instructions = []
+     responses = []
+
+     for item in training_data:
+         if "instruction" in item and "response" in item:
+             instructions.append(item["instruction"])
+             responses.append(item["response"])
+         else:
+             print(f"⚠️ Warning: Skipping invalid sample (missing instruction/response)")
+
+     if not instructions:
+         raise ValueError("No valid training samples found in dataset")
+
+     print(f"\n✅ Loaded {len(instructions)} training samples")
+
+     # Create training dataset
+     train_dataset_dict = Dataset.from_dict({
+         "instruction": instructions,
+         "response": responses
+     })
+
+     # Tokenize training dataset
+     print("Tokenizing training dataset...")
+     tokenized_train = train_dataset_dict.map(
+         lambda x: tokenize_function(x, tokenizer, max_length=args.max_length),
+         batched=True,
+         remove_columns=train_dataset_dict.column_names
+     )
+
+     # Load validation dataset if pre-split, otherwise split from training data
+     if use_presplit and val_dataset_path:
+         print(f"\n✅ Loading validation dataset from: {val_dataset_path}")
+         val_data = load_training_data(str(val_dataset_path))
+         val_instructions = []
+         val_responses = []
+
+         for item in val_data:
+             if "instruction" in item and "response" in item:
+                 val_instructions.append(item["instruction"])
+                 val_responses.append(item["response"])
+
+         val_dataset_dict = Dataset.from_dict({
+             "instruction": val_instructions,
+             "response": val_responses
+         })
+
+         print("Tokenizing validation dataset...")
+         tokenized_val = val_dataset_dict.map(
+             lambda x: tokenize_function(x, tokenizer, max_length=args.max_length),
+             batched=True,
+             remove_columns=val_dataset_dict.column_names
+         )
+
+         train_dataset = tokenized_train
+         val_dataset = tokenized_val
+
+         print(f" - Training samples: {len(train_dataset)}")
+         print(f" - Validation samples: {len(val_dataset)}")
+     else:
+         # Split into train/validation (80/20)
+         print("\nSplitting dataset into train/validation (80/20)...")
+         train_val_split = tokenized_train.train_test_split(test_size=0.2, seed=42)
+         train_dataset = train_val_split["train"]
+         val_dataset = train_val_split["test"]
+
+         print(f" - Training samples: {len(train_dataset)}")
+         print(f" - Validation samples: {len(val_dataset)}")
+
+     # Calculate training steps
+     use_fp16 = device_info["device_type"] == "cuda"
+     effective_batch_size = args.batch_size * args.gradient_accumulation
+     steps_per_epoch = max(1, len(train_dataset) // effective_batch_size)
+     total_steps = steps_per_epoch * args.num_epochs
+     warmup_steps = max(int(total_steps * args.warmup_ratio), 10)
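+     # Example with the defaults: batch_size=2 and gradient_accumulation=4 give an
+     # effective batch of 8 sequences per optimizer step, so a hypothetical 400-sample
+     # dataset yields 50 steps per epoch.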
+
+     print(f"\n📊 Training Configuration:")
+     print(f" - Total training steps: {total_steps}")
+     print(f" - Steps per epoch: {steps_per_epoch}")
+     print(f" - Warmup steps: {warmup_steps} ({100*warmup_steps/total_steps:.1f}% of training)")
+
+     # Training arguments (optimized for CodeLlama)
+     training_args = TrainingArguments(
+         output_dir=args.output_dir,
+         num_train_epochs=args.num_epochs,
+         per_device_train_batch_size=args.batch_size,
+         gradient_accumulation_steps=args.gradient_accumulation,
+         warmup_steps=warmup_steps,
+         learning_rate=args.learning_rate,
+         weight_decay=0.01,
+         fp16=use_fp16,
+         logging_steps=args.logging_steps,
+         save_steps=args.save_steps,
+         eval_strategy="steps",
+         eval_steps=args.eval_steps,
+         save_total_limit=3,
+         load_best_model_at_end=True,
+         metric_for_best_model="eval_loss",
+         greater_is_better=False,
+         lr_scheduler_type="cosine",
+         max_grad_norm=1.0,
+         report_to="none",
+         push_to_hub=False,
+         dataloader_pin_memory=(device_info["device_type"] == "cuda"),
+         remove_unused_columns=False,
+         resume_from_checkpoint=resume_checkpoint,  # Resume support
+     )
+
+     print(f"\n⚙️ Hyperparameters (Optimized for CodeLlama):")
+     print(f" - Max length: {args.max_length}")
+     print(f" - Epochs: {args.num_epochs}")
+     print(f" - Batch size: {args.batch_size}")
+     print(f" - Gradient accumulation: {args.gradient_accumulation}")
+     print(f" - Learning rate: {args.learning_rate}")
+     print(f" - LoRA rank: {args.lora_r}")
+     print(f" - LoRA alpha: {args.lora_alpha}")
+     print(f" - LoRA dropout: {args.lora_dropout}")
+     print(f" - Device: {device_info['device']}")
+     print(f" - Mixed precision (fp16): {use_fp16}")
+     print("=" * 70)
+
+     # Data collator - since we pad during tokenization, collator mainly handles batching
+     # Ensure pad_token_id is set
+     if tokenizer.pad_token_id is None:
+         tokenizer.pad_token_id = tokenizer.eos_token_id
+         tokenizer.pad_token = tokenizer.eos_token
+
+     data_collator = DataCollatorForLanguageModeling(
+         tokenizer=tokenizer,
+         mlm=False,  # Causal LM, not masked LM
+     )
+
+     # Create trainer
+     trainer = Trainer(
+         model=model,
+         args=training_args,
+         train_dataset=train_dataset,
+         eval_dataset=val_dataset,
+         data_collator=data_collator,
+         callbacks=[EarlyStoppingCallback(early_stopping_patience=args.early_stopping_patience)],
+     )
+
+     # Train
+     print("\n🚀 Starting training...")
+     if resume_checkpoint:
+         print(f" Resuming from: {resume_checkpoint}")
+     print("=" * 70)
+
+     trainer.train(resume_from_checkpoint=resume_checkpoint)
+
+     # Save final model
+     print(f"\n💾 Saving fine-tuned model to {args.output_dir}")
+     trainer.save_model(args.output_dir)
+     tokenizer.save_pretrained(args.output_dir)
+     model.save_pretrained(args.output_dir)
+
+     # Save training config
+     config = {
+         "base_model": args.base_model,
+         "adapter_path": args.adapter_path if args.adapter_path else None,
+         "dataset": args.dataset,
+         "output_dir": args.output_dir,
+         "hyperparameters": {
+             "max_length": args.max_length,
+             "num_epochs": args.num_epochs,
+             "batch_size": args.batch_size,
+             "gradient_accumulation": args.gradient_accumulation,
+             "learning_rate": args.learning_rate,
+             "lora_r": args.lora_r,
+             "lora_alpha": args.lora_alpha,
+             "lora_dropout": args.lora_dropout,
+         },
+         "training_mode": "incremental" if args.adapter_path else "fresh",
+         "resumed_from_checkpoint": resume_checkpoint is not None
+     }
+
+     config_path = Path(args.output_dir) / "training_config.json"
+     with open(config_path, 'w') as f:
+         json.dump(config, f, indent=2)
+
+     print("\n✅ Fine-tuning complete!")
+     print(f"Model saved to: {args.output_dir}")
+     print(f"Config saved to: {config_path}")
+     print(f"\n💡 To continue training with new data (incremental fine-tuning):")
+     print(f" python finetune_codellama.py --base-model {args.base_model} \\")
+     print(f" --adapter-path {args.output_dir} \\")
+     print(f" --dataset <new_dataset.jsonl> \\")
+     print(f" --output-dir <new_output_dir>")
+     print(f"\n💡 To resume from checkpoint if training is interrupted:")
+     print(f" python finetune_codellama.py --base-model {args.base_model} \\")
+     print(f" --dataset {args.dataset} \\")
+     print(f" --output-dir {args.output_dir} \\")
+     print(f" --resume-from-checkpoint auto")
+
+ if __name__ == "__main__":
+     main()