import logging
import os
import random

import numpy as np
import torch
from datasets import load_dataset
from peft import LoraConfig
from transformers import AutoTokenizer, AutoModelForCausalLM, EarlyStoppingCallback, TrainerCallback
from transformers import BitsAndBytesConfig
from trl import SFTTrainer, SFTConfig

# ---------------------------------------------------------------------------
# Run configuration. Every value can be overridden through the environment.
# ---------------------------------------------------------------------------
# NOTE(review): the default BASE_MODEL has no "org/" prefix; presumably it is
# expected to be a local directory rather than a Hub repo id -- confirm.
BASE_MODEL = os.environ.get("BASE_MODEL", "DeepSeek-Coder-V2-Lite-Instruct")
OUTPUT_DIR = os.environ.get("OUTPUT_DIR", "outputs/zenith-lora")
DATA_PATH = os.environ.get("DATA_PATH", "data/zenith_combined.jsonl")
# Optional: when unset (None) a slice of the training data is held out instead.
VAL_PATH = os.environ.get("VAL_PATH")
MAX_STEPS = int(os.environ.get("STEPS", 300))
SEED = int(os.environ.get("SEED", 42))

# Create the output directory up front so later saves cannot fail on a
# missing path.
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Seed every random number generator the script touches (Python, NumPy,
# PyTorch CPU and, when present, all CUDA devices) with the same value.
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)

# Allow TF32 for CUDA matmuls and cuDNN kernels.
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.allow_tf32 = True

print(f"Loading tokenizer and model from: {BASE_MODEL}")
# trust_remote_code mirrors the model load further down, so a checkpoint that
# ships its own tokenizer code loads the same way as its model code.
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, trust_remote_code=True)
# Batching needs a padding token; fall back to EOS when the tokenizer defines
# none.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Compute dtype for the quantized layers: float16 by default, bfloat16 when
# the first CUDA device reports compute capability 8.x or newer.
compute_dtype = torch.float16
if torch.cuda.is_available():
    major, _ = torch.cuda.get_device_capability(0)
    if major >= 8:
        print("✅ Using bfloat16 for Ampere+ GPU")
        compute_dtype = torch.bfloat16

# 4-bit NF4 quantization with double quantization; computation runs in the
# dtype selected above.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=True,
)

print("⚙️ Loading model with 4-bit quantization...")
model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True,
)
# The KV cache is turned off for training; gradient checkpointing is enabled
# in the training arguments below.
model.config.use_cache = False

# Load the training data from a JSON / JSON-lines file.
data_files = [DATA_PATH]
print(f"Loading dataset: {data_files}")
raw_train = load_dataset("json", data_files=data_files, split="train")

# Validation data: an explicit file when VAL_PATH points at one, otherwise a
# seeded 5% hold-out from the training data.
if VAL_PATH and os.path.exists(VAL_PATH):
    print(f"Using external validation: {VAL_PATH}")
    raw_val = load_dataset("json", data_files=VAL_PATH, split="train")
else:
    if VAL_PATH:
        # Make the fallback visible instead of silently ignoring a bad path.
        print(f"WARNING: VAL_PATH {VAL_PATH!r} not found; holding out 5% of the training data instead")
    split = raw_train.train_test_split(test_size=0.05, seed=SEED)
    raw_train, raw_val = split["train"], split["test"]

MAX_SEQ_LEN = int(os.environ.get("MAX_SEQ_LEN", 2048))


| | def _valid(example): |
| | msgs = example.get("messages") |
| | if not isinstance(msgs, list) or not msgs: |
| | return False |
| | for m in msgs: |
| | if not isinstance(m, dict) or "role" not in m or "content" not in m: |
| | return False |
| | return True |
| |
|
| | def _to_text(example): |
| | try: |
| | text = tokenizer.apply_chat_template( |
| | example["messages"], tokenize=False, add_generation_prompt=False |
| | ) |
| | return {"text": text} |
| | except Exception: |
| | return {"text": ""} |
| |
|
# Keep only well-formed conversations, then replace every column with the
# single rendered "text" column.
train_ds = raw_train.filter(_valid)
val_ds = raw_val.filter(_valid)
train_ds = train_ds.map(_to_text, remove_columns=train_ds.column_names)
val_ds = val_ds.map(_to_text, remove_columns=val_ds.column_names)

# _to_text yields an empty string when rendering failed; drop those rows.
train_ds = train_ds.filter(lambda row: bool(row.get("text")))
val_ds = val_ds.filter(lambda row: bool(row.get("text")))

print(f"✅ Training samples: {len(train_ds)}, Validation: {len(val_ds)}")

# Fail fast with a clear message rather than starting a run with no data.
if len(train_ds) == 0:
    raise ValueError(
        f"No usable training samples found in {DATA_PATH!r} after filtering"
    )

# LoRA adapter hyper-parameters, each overridable through the environment.
# LORA_TARGET_MODULES is an optional comma-separated list of module names;
# when unset, target_modules stays None (the library default), exactly as
# before.
_lora_targets = [
    name.strip()
    for name in os.environ.get("LORA_TARGET_MODULES", "").split(",")
    if name.strip()
]
peft_config = LoraConfig(
    r=int(os.environ.get("LORA_R", 8)),
    lora_alpha=int(os.environ.get("LORA_ALPHA", 16)),
    lora_dropout=float(os.environ.get("LORA_DROPOUT", 0.1)),
    target_modules=_lora_targets or None,
    bias="none",
    task_type="CAUSAL_LM",
)


class EvalEveryCallback(TrainerCallback):
    """Request an evaluation pass every ``eval_steps`` optimizer steps."""

    def __init__(self, eval_steps=100):
        """Store the evaluation interval.

        Args:
            eval_steps: Positive number of global steps between evaluations.

        Raises:
            ValueError: If ``eval_steps`` is smaller than 1. Zero would
                otherwise surface as a ZeroDivisionError on the first step.
        """
        if eval_steps < 1:
            raise ValueError(f"eval_steps must be >= 1, got {eval_steps!r}")
        self.eval_steps = eval_steps

    def on_step_end(self, args, state, control, **kwargs):
        """Set ``control.should_evaluate`` on every ``eval_steps``-th step.

        Step 0 is skipped. The (possibly modified) ``control`` object is
        returned.
        """
        step = state.global_step
        if step > 0 and step % self.eval_steps == 0:
            control.should_evaluate = True
        return control


# Mixed precision: TrainingArguments rejects fp16 and bf16 both being True,
# so pick exactly one -- bf16 on compute capability >= 8, fp16 on any other
# CUDA device, neither on CPU.
_cuda_ok = torch.cuda.is_available()
_bf16_ok = _cuda_ok and torch.cuda.get_device_capability(0)[0] >= 8

training_args = SFTConfig(
    output_dir=OUTPUT_DIR,
    max_steps=MAX_STEPS,
    per_device_train_batch_size=int(os.environ.get("BATCH", 2)),
    gradient_accumulation_steps=int(os.environ.get("GRAD_ACC", 2)),
    learning_rate=float(os.environ.get("LR", 5e-5)),
    lr_scheduler_type=os.environ.get("LR_SCHED", "cosine"),
    warmup_ratio=float(os.environ.get("WARMUP_RATIO", 0.1)),
    weight_decay=float(os.environ.get("WEIGHT_DECAY", 0.01)),
    max_grad_norm=float(os.environ.get("MAX_GRAD_NORM", 1.0)),
    logging_steps=int(os.environ.get("LOG_STEPS", 10)),
    save_steps=int(os.environ.get("SAVE_STEPS", 50)),
    save_total_limit=int(os.environ.get("SAVE_LIMIT", 2)),
    fp16=_cuda_ok and not _bf16_ok,
    bf16=_bf16_ok,
    max_seq_length=MAX_SEQ_LEN,
    gradient_checkpointing=True,
    gradient_checkpointing_kwargs={"use_reentrant": False},
    dataloader_drop_last=True,
    report_to="none",
    seed=SEED,
    # EarlyStoppingCallback needs an evaluation strategy, a metric to track
    # and load_best_model_at_end. EVAL_STEPS is the same variable the
    # evaluation callback reads, so both fire on the same steps.
    # SAVE_STEPS must be a multiple of EVAL_STEPS for load_best_model_at_end.
    eval_strategy="steps",
    eval_steps=int(os.environ.get("EVAL_STEPS", 50)),
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
)

print(f"Starting Zenith fine-tuning for {MAX_STEPS} steps (~2h runtime)...")
# NOTE(review): `tokenizer=` and `dataset_text_field=` as SFTTrainer keyword
# arguments depend on the installed TRL version -- confirm against the pinned
# release before upgrading.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    peft_config=peft_config,
    args=training_args,
    dataset_text_field="text",
    callbacks=[
        # Stop once the tracked metric has not improved for this many
        # consecutive evaluations.
        EarlyStoppingCallback(
            early_stopping_patience=int(os.environ.get("EARLY_STOP_PATIENCE", 3))
        ),
        # Trigger an evaluation every EVAL_STEPS optimizer steps.
        EvalEveryCallback(eval_steps=int(os.environ.get("EVAL_STEPS", 50))),
    ],
)

trainer.train()

# Persist the trained adapter and the tokenizer side by side in OUTPUT_DIR.
print("💾 Saving LoRA adapter...")
trainer.model.save_pretrained(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)

print(f"✅ Zenith LoRA adapter saved to: {OUTPUT_DIR}")
# NOTE(review): the duration in this message is hard-coded, not measured.
print("🎯 Training complete under 2 hours.")