algorythmtechnologies
/

Zenith_Copilot

Safetensors

Model card Files and versions

xet

Community

algorythmtechnologies committed on Oct 23, 2025

Commit

f2d3b70

verified ·

1 Parent(s): 706b5ee

Update train.py

Browse files

Files changed (1) hide show

train.py +16 -34

train.py CHANGED Viewed

@@ -3,7 +3,7 @@ import random
 import numpy as np
 import torch
 from datasets import load_dataset
-from transformers import AutoTokenizer, AutoModelForCausalLM, EarlyStoppingCallback, TrainerCallback
 from trl import SFTTrainer, SFTConfig
 from peft import LoraConfig
 from transformers import BitsAndBytesConfig
@@ -13,7 +13,7 @@ BASE_MODEL = os.environ.get("BASE_MODEL", "DeepSeek-Coder-V2-Lite-Instruct")
 OUTPUT_DIR = os.environ.get("OUTPUT_DIR", "outputs/zenith-lora")
 DATA_PATH = os.environ.get("DATA_PATH", "data/zenith_combined.jsonl")
 VAL_PATH = os.environ.get("VAL_PATH")
-MAX_STEPS = int(os.environ.get("STEPS", 300))  # ~2 hr on A100
 SEED = int(os.environ.get("SEED", 42))
 os.makedirs(OUTPUT_DIR, exist_ok=True)
@@ -24,24 +24,20 @@ np.random.seed(SEED)
 torch.manual_seed(SEED)
 if torch.cuda.is_available():
     torch.cuda.manual_seed_all(SEED)
 torch.backends.cuda.matmul.allow_tf32 = True
 torch.backends.cudnn.allow_tf32 = True
 print(f"🚀 Loading tokenizer and model from: {BASE_MODEL}")
 tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
 if tokenizer.pad_token is None:
     tokenizer.pad_token = tokenizer.eos_token
-# ====== GPU PRECISION CONFIG ======
 compute_dtype = torch.float16
-if torch.cuda.is_available():
-    major, _ = torch.cuda.get_device_capability(0)
-    if major >= 8:
-        print("✅ Using bfloat16 for Ampere+ GPU")
-        compute_dtype = torch.bfloat16
-# ====== 4-BIT QUANTIZATION ======
 bnb_config = BitsAndBytesConfig(
     load_in_4bit=True,
     bnb_4bit_quant_type="nf4",
@@ -58,20 +54,16 @@ model = AutoModelForCausalLM.from_pretrained(
 )
 model.config.use_cache = False
-# ====== DATASET LOADING ======
 data_files = [DATA_PATH]
-print(f"📂 Loading dataset: {data_files}")
 raw_train = load_dataset("json", data_files=data_files, split="train")
 if VAL_PATH and os.path.exists(VAL_PATH):
-    print(f"📁 Using external validation: {VAL_PATH}")
     raw_val = load_dataset("json", data_files=VAL_PATH, split="train")
 else:
     split = raw_train.train_test_split(test_size=0.05, seed=SEED)
     raw_train, raw_val = split["train"], split["test"]
-MAX_SEQ_LEN = int(os.environ.get("MAX_SEQ_LEN", 2048))
 def _valid(example):
     msgs = example.get("messages")
     if not isinstance(msgs, list) or not msgs:
@@ -83,30 +75,27 @@ def _valid(example):
 def _to_text(example):
     try:
-        text = tokenizer.apply_chat_template(
-            example["messages"], tokenize=False, add_generation_prompt=False
-        )
         return {"text": text}
     except Exception:
         return {"text": ""}
-train_ds = raw_train.filter(_valid)
-val_ds = raw_val.filter(_valid)
-train_ds = train_ds.map(_to_text, remove_columns=train_ds.column_names)
-val_ds = val_ds.map(_to_text, remove_columns=val_ds.column_names)
 train_ds = train_ds.filter(lambda x: len(x.get("text", "")) > 0)
 val_ds = val_ds.filter(lambda x: len(x.get("text", "")) > 0)
 print(f"✅ Training samples: {len(train_ds)}, Validation: {len(val_ds)}")
-# ====== LORA CONFIG (gentle mode) ======
 peft_config = LoraConfig(
     r=int(os.environ.get("LORA_R", 8)),
     lora_alpha=int(os.environ.get("LORA_ALPHA", 16)),
     lora_dropout=float(os.environ.get("LORA_DROPOUT", 0.1)),
     bias="none",
     task_type="CAUSAL_LM",
 )
 # ====== EVAL CALLBACK ======
@@ -132,9 +121,8 @@ training_args = SFTConfig(
     logging_steps=int(os.environ.get("LOG_STEPS", 10)),
     save_steps=int(os.environ.get("SAVE_STEPS", 50)),
     save_total_limit=int(os.environ.get("SAVE_LIMIT", 2)),
-    fp16=torch.cuda.is_available(),
-    bf16=torch.cuda.is_available() and torch.cuda.get_device_capability(0)[0] >= 8,
-    max_seq_length=MAX_SEQ_LEN,
     gradient_checkpointing=True,
     gradient_checkpointing_kwargs={"use_reentrant": False},
     dataloader_drop_last=True,
@@ -143,19 +131,13 @@ training_args = SFTConfig(
 )
 # ====== TRAINER ======
-print(f"🏁 Starting Zenith fine-tuning for {MAX_STEPS} steps (~2h runtime)...")
 trainer = SFTTrainer(
     model=model,
-    tokenizer=tokenizer,
     train_dataset=train_ds,
     eval_dataset=val_ds,
     peft_config=peft_config,
     args=training_args,
-    dataset_text_field="text",
-    callbacks=[
-        EarlyStoppingCallback(early_stopping_patience=int(os.environ.get("EARLY_STOP_PATIENCE", 3))),
-        EvalEveryCallback(eval_steps=int(os.environ.get("EVAL_STEPS", 50)))
-    ],
 )
 trainer.train()
@@ -165,4 +147,4 @@ trainer.model.save_pretrained(OUTPUT_DIR)
 tokenizer.save_pretrained(OUTPUT_DIR)
 print(f"✅ Zenith LoRA adapter saved to: {OUTPUT_DIR}")
-print("🎯 Training complete under 2 hours.")

 import numpy as np
 import torch
 from datasets import load_dataset
+from transformers import AutoTokenizer, AutoModelForCausalLM, TrainerCallback, EarlyStoppingCallback
 from trl import SFTTrainer, SFTConfig
 from peft import LoraConfig
 from transformers import BitsAndBytesConfig
 OUTPUT_DIR = os.environ.get("OUTPUT_DIR", "outputs/zenith-lora")
 DATA_PATH = os.environ.get("DATA_PATH", "data/zenith_combined.jsonl")
 VAL_PATH = os.environ.get("VAL_PATH")
+MAX_STEPS = int(os.environ.get("STEPS", 300))
 SEED = int(os.environ.get("SEED", 42))
 os.makedirs(OUTPUT_DIR, exist_ok=True)
 torch.manual_seed(SEED)
 if torch.cuda.is_available():
     torch.cuda.manual_seed_all(SEED)
 torch.backends.cuda.matmul.allow_tf32 = True
 torch.backends.cudnn.allow_tf32 = True
+# ====== TOKENIZER & MODEL ======
 print(f"🚀 Loading tokenizer and model from: {BASE_MODEL}")
 tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
 if tokenizer.pad_token is None:
     tokenizer.pad_token = tokenizer.eos_token
 compute_dtype = torch.float16
+if torch.cuda.is_available() and torch.cuda.get_device_capability(0)[0] >= 8:
+    compute_dtype = torch.bfloat16
+    print("✅ Ampere+ GPU detected — will prefer bf16 where supported.")
 bnb_config = BitsAndBytesConfig(
     load_in_4bit=True,
     bnb_4bit_quant_type="nf4",
 )
 model.config.use_cache = False
+# ====== DATASET ======
 data_files = [DATA_PATH]
 raw_train = load_dataset("json", data_files=data_files, split="train")
 if VAL_PATH and os.path.exists(VAL_PATH):
     raw_val = load_dataset("json", data_files=VAL_PATH, split="train")
 else:
     split = raw_train.train_test_split(test_size=0.05, seed=SEED)
     raw_train, raw_val = split["train"], split["test"]
 def _valid(example):
     msgs = example.get("messages")
     if not isinstance(msgs, list) or not msgs:
 def _to_text(example):
     try:
+        text = tokenizer.apply_chat_template(example["messages"], tokenize=False, add_generation_prompt=False)
         return {"text": text}
     except Exception:
         return {"text": ""}
+train_ds = raw_train.filter(_valid).map(_to_text, remove_columns=raw_train.column_names)
+val_ds = raw_val.filter(_valid).map(_to_text, remove_columns=raw_val.column_names)
 train_ds = train_ds.filter(lambda x: len(x.get("text", "")) > 0)
 val_ds = val_ds.filter(lambda x: len(x.get("text", "")) > 0)
 print(f"✅ Training samples: {len(train_ds)}, Validation: {len(val_ds)}")
+# ====== LORA CONFIG ======
 peft_config = LoraConfig(
     r=int(os.environ.get("LORA_R", 8)),
     lora_alpha=int(os.environ.get("LORA_ALPHA", 16)),
     lora_dropout=float(os.environ.get("LORA_DROPOUT", 0.1)),
     bias="none",
     task_type="CAUSAL_LM",
+    target_modules=["q_proj", "v_proj"],  # Required for LoRA injection
 )
 # ====== EVAL CALLBACK ======
     logging_steps=int(os.environ.get("LOG_STEPS", 10)),
     save_steps=int(os.environ.get("SAVE_STEPS", 50)),
     save_total_limit=int(os.environ.get("SAVE_LIMIT", 2)),
+    fp16=torch.cuda.is_available() and compute_dtype==torch.float16,
+    bf16=torch.cuda.is_available() and compute_dtype==torch.bfloat16,
     gradient_checkpointing=True,
     gradient_checkpointing_kwargs={"use_reentrant": False},
     dataloader_drop_last=True,
 )
 # ====== TRAINER ======
+print(f"🏁 Starting Zenith fine-tuning for {MAX_STEPS} steps (~2h config)...")
 trainer = SFTTrainer(
     model=model,
     train_dataset=train_ds,
     eval_dataset=val_ds,
     peft_config=peft_config,
     args=training_args,
 )
 trainer.train()
 tokenizer.save_pretrained(OUTPUT_DIR)
 print(f"✅ Zenith LoRA adapter saved to: {OUTPUT_DIR}")
+print("🎯 Training complete under ~2 hours.")