RayMelius Claude Opus 4.6 committed on
Commit
3f2f8ab
·
1 Parent(s): 2966943

Fix training cell hanging: remove load_best_model_at_end

Browse files

- Removed load_best_model_at_end=True which caused the trainer to hang
trying to reload the best checkpoint after training (OOM on T4)
- Save adapter immediately after training completes in the same cell
- Removed redundant save_pretrained from the merge cell

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

notebooks/ch_trader_finetune.ipynb CHANGED
@@ -1228,7 +1228,7 @@
1228
  },
1229
  {
1230
  "cell_type": "code",
1231
- "source": "from transformers import TrainerCallback\nimport shutil, glob\n\nKAGGLE_DATASET_ID = \"xabonum/stockex-ch-checkpoints\"\nMAX_CHECKPOINTS_PERSISTENT = 3 # keep only last 3 in Kaggle dataset / Drive\n\n# ── Create Kaggle dataset metadata if needed ────────────────────────\nif USE_DRIVE and DRIVE_CKPT_DIR:\n metadata_path = os.path.join(DRIVE_CKPT_DIR, \"dataset-metadata.json\")\n if not os.path.exists(metadata_path):\n metadata = {\n \"title\": \"stockex-ch-checkpoints\",\n \"id\": KAGGLE_DATASET_ID,\n \"licenses\": [{\"name\": \"CC0-1.0\"}]\n }\n with open(metadata_path, \"w\") as f:\n json.dump(metadata, f, indent=2)\n print(\"Created dataset-metadata.json\")\n\n\ndef cleanup_old_checkpoints(ckpt_dir, keep=MAX_CHECKPOINTS_PERSISTENT):\n \"\"\"Keep only the last `keep` checkpoints in persistent storage.\"\"\"\n ckpts = sorted(\n glob.glob(os.path.join(ckpt_dir, \"checkpoint-*\")),\n key=lambda p: int(os.path.basename(p).split(\"-\")[-1])\n )\n while len(ckpts) > keep:\n old = ckpts.pop(0)\n shutil.rmtree(old)\n print(f\"[Checkpoint] Removed old: {os.path.basename(old)}\")\n\n\nclass CheckpointSyncCallback(TrainerCallback):\n \"\"\"Copy checkpoint to persistent storage, push LoRA to HF Hub, upload to Kaggle dataset.\"\"\"\n\n def on_save(self, args, state, control, **kwargs):\n ckpt_dir = os.path.join(args.output_dir, f\"checkpoint-{state.global_step}\")\n if not os.path.isdir(ckpt_dir):\n return\n\n # 1. 
Save to persistent folder (Drive / Kaggle working dir)\n if USE_DRIVE and DRIVE_CKPT_DIR:\n dest = os.path.join(DRIVE_CKPT_DIR, f\"checkpoint-{state.global_step}\")\n os.makedirs(DRIVE_CKPT_DIR, exist_ok=True)\n try:\n shutil.copytree(ckpt_dir, dest, dirs_exist_ok=True)\n print(f\"[Checkpoint] Saved -> {dest}\")\n except Exception as e:\n print(f\"[Checkpoint] Copy failed: {e}\")\n\n # Cleanup: keep only last 3 checkpoints in persistent storage\n try:\n cleanup_old_checkpoints(DRIVE_CKPT_DIR)\n except Exception as e:\n print(f\"[Checkpoint] Cleanup failed: {e}\")\n\n # 2. Push LoRA adapter to HF Hub\n try:\n kwargs[\"model\"].push_to_hub(\n OUTPUT_REPO,\n commit_message=f\"Checkpoint step {state.global_step} (epoch {state.epoch:.2f})\",\n token=HF_TOKEN,\n )\n print(f\"[Checkpoint] Pushed step {state.global_step} -> HF Hub\")\n except Exception as e:\n print(f\"[Checkpoint] HF push failed: {e}\")\n\n # 3. Upload to Kaggle dataset (Kaggle only)\n if RUNNING_ON_KAGGLE and USE_DRIVE:\n try:\n os.system(\n f\"kaggle datasets version -p {DRIVE_CKPT_DIR} \"\n f\"-m 'Checkpoint step {state.global_step}' --dir-mode zip\"\n )\n print(f\"[Checkpoint] Kaggle dataset updated for step {state.global_step}\")\n except Exception as e:\n print(f\"[Checkpoint] Kaggle dataset update failed: {e}\")\n\n\nsft_config = SFTConfig(\n output_dir=OUTPUT_DIR,\n num_train_epochs=NUM_EPOCHS,\n per_device_train_batch_size=BATCH_SIZE,\n per_device_eval_batch_size=BATCH_SIZE,\n gradient_accumulation_steps=GRAD_ACCUM,\n gradient_checkpointing=True,\n optim=\"paged_adamw_8bit\",\n learning_rate=LR,\n lr_scheduler_type=\"cosine\",\n warmup_ratio=0.05,\n fp16=not torch.cuda.is_bf16_supported(),\n bf16=torch.cuda.is_bf16_supported(),\n logging_steps=10,\n eval_strategy=\"steps\",\n eval_steps=SAVE_STEPS,\n save_strategy=\"steps\",\n save_steps=SAVE_STEPS,\n save_total_limit=3,\n load_best_model_at_end=True,\n metric_for_best_model=\"eval_loss\",\n greater_is_better=False,\n report_to=\"none\",\n 
dataset_text_field=\"text\",\n packing=False,\n)\n\ntrainer = SFTTrainer(\n model=model,\n args=sft_config,\n train_dataset=train_dataset,\n eval_dataset=val_dataset,\n peft_config=lora_config,\n processing_class=tokenizer,\n callbacks=[CheckpointSyncCallback()],\n)\n\nif RESUME_FROM:\n print(f\"Resuming from checkpoint: {RESUME_FROM}\")\nelse:\n print(f\"Starting training from scratch. Save every {SAVE_STEPS} steps.\")\n\ntrainer.train(resume_from_checkpoint=RESUME_FROM)\nprint(\"Training complete.\")",
1232
  "metadata": {
1233
  "trusted": true
1234
  },
@@ -1251,7 +1251,7 @@
1251
  },
1252
  {
1253
  "cell_type": "code",
1254
- "source": "from peft import PeftModel\n\n# Save best adapter checkpoint locally\ntrainer.model.save_pretrained(OUTPUT_DIR)\ntokenizer.save_pretrained(OUTPUT_DIR)\nprint(f\"Adapter saved to {OUTPUT_DIR}\")\n\n# Reload base model in fp16 for merging (can't merge with 4-bit)\nprint(\"Reloading base model in fp16 for adapter merge...\")\ndel model\ntorch.cuda.empty_cache()\n\nbase_model = AutoModelForCausalLM.from_pretrained(\n BASE_MODEL,\n torch_dtype=torch.float16,\n device_map=\"auto\",\n trust_remote_code=True,\n)\nmerged_model = PeftModel.from_pretrained(base_model, OUTPUT_DIR)\nmerged_model = merged_model.merge_and_unload()\nprint(\"Adapters merged.\")",
1255
  "metadata": {
1256
  "id": "save-model",
1257
  "trusted": true
 
1228
  },
1229
  {
1230
  "cell_type": "code",
1231
+ "source": "from transformers import TrainerCallback\nimport shutil, glob\n\nKAGGLE_DATASET_ID = \"xabonum/stockex-ch-checkpoints\"\nMAX_CHECKPOINTS_PERSISTENT = 3 # keep only last 3 in Kaggle dataset / Drive\n\n# ── Create Kaggle dataset metadata if needed ────────────────────────\nif USE_DRIVE and DRIVE_CKPT_DIR:\n metadata_path = os.path.join(DRIVE_CKPT_DIR, \"dataset-metadata.json\")\n if not os.path.exists(metadata_path):\n metadata = {\n \"title\": \"stockex-ch-checkpoints\",\n \"id\": KAGGLE_DATASET_ID,\n \"licenses\": [{\"name\": \"CC0-1.0\"}]\n }\n with open(metadata_path, \"w\") as f:\n json.dump(metadata, f, indent=2)\n print(\"Created dataset-metadata.json\")\n\n\ndef cleanup_old_checkpoints(ckpt_dir, keep=MAX_CHECKPOINTS_PERSISTENT):\n \"\"\"Keep only the last `keep` checkpoints in persistent storage.\"\"\"\n ckpts = sorted(\n glob.glob(os.path.join(ckpt_dir, \"checkpoint-*\")),\n key=lambda p: int(os.path.basename(p).split(\"-\")[-1])\n )\n while len(ckpts) > keep:\n old = ckpts.pop(0)\n shutil.rmtree(old)\n print(f\"[Checkpoint] Removed old: {os.path.basename(old)}\")\n\n\nclass CheckpointSyncCallback(TrainerCallback):\n \"\"\"Copy checkpoint to persistent storage, push LoRA to HF Hub, upload to Kaggle dataset.\"\"\"\n\n def on_save(self, args, state, control, **kwargs):\n ckpt_dir = os.path.join(args.output_dir, f\"checkpoint-{state.global_step}\")\n if not os.path.isdir(ckpt_dir):\n return\n\n # 1. 
Save to persistent folder (Drive / Kaggle working dir)\n if USE_DRIVE and DRIVE_CKPT_DIR:\n dest = os.path.join(DRIVE_CKPT_DIR, f\"checkpoint-{state.global_step}\")\n os.makedirs(DRIVE_CKPT_DIR, exist_ok=True)\n try:\n shutil.copytree(ckpt_dir, dest, dirs_exist_ok=True)\n print(f\"[Checkpoint] Saved -> {dest}\")\n except Exception as e:\n print(f\"[Checkpoint] Copy failed: {e}\")\n\n # Cleanup: keep only last 3 checkpoints in persistent storage\n try:\n cleanup_old_checkpoints(DRIVE_CKPT_DIR)\n except Exception as e:\n print(f\"[Checkpoint] Cleanup failed: {e}\")\n\n # 2. Push LoRA adapter to HF Hub\n try:\n kwargs[\"model\"].push_to_hub(\n OUTPUT_REPO,\n commit_message=f\"Checkpoint step {state.global_step} (epoch {state.epoch:.2f})\",\n token=HF_TOKEN,\n )\n print(f\"[Checkpoint] Pushed step {state.global_step} -> HF Hub\")\n except Exception as e:\n print(f\"[Checkpoint] HF push failed: {e}\")\n\n # 3. Upload to Kaggle dataset (Kaggle only)\n if RUNNING_ON_KAGGLE and USE_DRIVE:\n try:\n os.system(\n f\"kaggle datasets version -p {DRIVE_CKPT_DIR} \"\n f\"-m 'Checkpoint step {state.global_step}' --dir-mode zip\"\n )\n print(f\"[Checkpoint] Kaggle dataset updated for step {state.global_step}\")\n except Exception as e:\n print(f\"[Checkpoint] Kaggle dataset update failed: {e}\")\n\n\nsft_config = SFTConfig(\n output_dir=OUTPUT_DIR,\n num_train_epochs=NUM_EPOCHS,\n per_device_train_batch_size=BATCH_SIZE,\n per_device_eval_batch_size=BATCH_SIZE,\n gradient_accumulation_steps=GRAD_ACCUM,\n gradient_checkpointing=True,\n optim=\"paged_adamw_8bit\",\n learning_rate=LR,\n lr_scheduler_type=\"cosine\",\n warmup_ratio=0.05,\n fp16=not torch.cuda.is_bf16_supported(),\n bf16=torch.cuda.is_bf16_supported(),\n logging_steps=10,\n eval_strategy=\"steps\",\n eval_steps=SAVE_STEPS,\n save_strategy=\"steps\",\n save_steps=SAVE_STEPS,\n save_total_limit=3,\n report_to=\"none\",\n dataset_text_field=\"text\",\n packing=False,\n)\n\ntrainer = SFTTrainer(\n model=model,\n 
args=sft_config,\n train_dataset=train_dataset,\n eval_dataset=val_dataset,\n peft_config=lora_config,\n processing_class=tokenizer,\n callbacks=[CheckpointSyncCallback()],\n)\n\nif RESUME_FROM:\n print(f\"Resuming from checkpoint: {RESUME_FROM}\")\nelse:\n print(f\"Starting training from scratch. Save every {SAVE_STEPS} steps.\")\n\ntrainer.train(resume_from_checkpoint=RESUME_FROM)\nprint(\"Training complete.\")\n\n# ── Immediately save adapter so subsequent cells don't depend on trainer state ──\ntrainer.model.save_pretrained(OUTPUT_DIR)\ntokenizer.save_pretrained(OUTPUT_DIR)\nprint(f\"Adapter saved to {OUTPUT_DIR}\")",
1232
  "metadata": {
1233
  "trusted": true
1234
  },
 
1251
  },
1252
  {
1253
  "cell_type": "code",
1254
+ "source": "from peft import PeftModel\n\n# Reload base model in fp16 for merging (can't merge with 4-bit)\nprint(\"Reloading base model in fp16 for adapter merge...\")\ndel model\ntorch.cuda.empty_cache()\n\nbase_model = AutoModelForCausalLM.from_pretrained(\n BASE_MODEL,\n torch_dtype=torch.float16,\n device_map=\"auto\",\n trust_remote_code=True,\n)\nmerged_model = PeftModel.from_pretrained(base_model, OUTPUT_DIR)\nmerged_model = merged_model.merge_and_unload()\nprint(\"Adapters merged.\")",
1255
  "metadata": {
1256
  "id": "save-model",
1257
  "trusted": true
notebooks/stockex-clearing-house-llm-fine-tuning.ipynb CHANGED
@@ -1228,7 +1228,7 @@
1228
  },
1229
  {
1230
  "cell_type": "code",
1231
- "source": "from transformers import TrainerCallback\nimport shutil, glob\n\nKAGGLE_DATASET_ID = \"xabonum/stockex-ch-checkpoints\"\nMAX_CHECKPOINTS_PERSISTENT = 3 # keep only last 3 in Kaggle dataset / Drive\n\n# ── Create Kaggle dataset metadata if needed ────────────────────────\nif USE_DRIVE and DRIVE_CKPT_DIR:\n metadata_path = os.path.join(DRIVE_CKPT_DIR, \"dataset-metadata.json\")\n if not os.path.exists(metadata_path):\n metadata = {\n \"title\": \"stockex-ch-checkpoints\",\n \"id\": KAGGLE_DATASET_ID,\n \"licenses\": [{\"name\": \"CC0-1.0\"}]\n }\n with open(metadata_path, \"w\") as f:\n json.dump(metadata, f, indent=2)\n print(\"Created dataset-metadata.json\")\n\n\ndef cleanup_old_checkpoints(ckpt_dir, keep=MAX_CHECKPOINTS_PERSISTENT):\n \"\"\"Keep only the last `keep` checkpoints in persistent storage.\"\"\"\n ckpts = sorted(\n glob.glob(os.path.join(ckpt_dir, \"checkpoint-*\")),\n key=lambda p: int(os.path.basename(p).split(\"-\")[-1])\n )\n while len(ckpts) > keep:\n old = ckpts.pop(0)\n shutil.rmtree(old)\n print(f\"[Checkpoint] Removed old: {os.path.basename(old)}\")\n\n\nclass CheckpointSyncCallback(TrainerCallback):\n \"\"\"Copy checkpoint to persistent storage, push LoRA to HF Hub, upload to Kaggle dataset.\"\"\"\n\n def on_save(self, args, state, control, **kwargs):\n ckpt_dir = os.path.join(args.output_dir, f\"checkpoint-{state.global_step}\")\n if not os.path.isdir(ckpt_dir):\n return\n\n # 1. 
Save to persistent folder (Drive / Kaggle working dir)\n if USE_DRIVE and DRIVE_CKPT_DIR:\n dest = os.path.join(DRIVE_CKPT_DIR, f\"checkpoint-{state.global_step}\")\n os.makedirs(DRIVE_CKPT_DIR, exist_ok=True)\n try:\n shutil.copytree(ckpt_dir, dest, dirs_exist_ok=True)\n print(f\"[Checkpoint] Saved -> {dest}\")\n except Exception as e:\n print(f\"[Checkpoint] Copy failed: {e}\")\n\n # Cleanup: keep only last 3 checkpoints in persistent storage\n try:\n cleanup_old_checkpoints(DRIVE_CKPT_DIR)\n except Exception as e:\n print(f\"[Checkpoint] Cleanup failed: {e}\")\n\n # 2. Push LoRA adapter to HF Hub\n try:\n kwargs[\"model\"].push_to_hub(\n OUTPUT_REPO,\n commit_message=f\"Checkpoint step {state.global_step} (epoch {state.epoch:.2f})\",\n token=HF_TOKEN,\n )\n print(f\"[Checkpoint] Pushed step {state.global_step} -> HF Hub\")\n except Exception as e:\n print(f\"[Checkpoint] HF push failed: {e}\")\n\n # 3. Upload to Kaggle dataset (Kaggle only)\n if RUNNING_ON_KAGGLE and USE_DRIVE:\n try:\n os.system(\n f\"kaggle datasets version -p {DRIVE_CKPT_DIR} \"\n f\"-m 'Checkpoint step {state.global_step}' --dir-mode zip\"\n )\n print(f\"[Checkpoint] Kaggle dataset updated for step {state.global_step}\")\n except Exception as e:\n print(f\"[Checkpoint] Kaggle dataset update failed: {e}\")\n\n\nsft_config = SFTConfig(\n output_dir=OUTPUT_DIR,\n num_train_epochs=NUM_EPOCHS,\n per_device_train_batch_size=BATCH_SIZE,\n per_device_eval_batch_size=BATCH_SIZE,\n gradient_accumulation_steps=GRAD_ACCUM,\n gradient_checkpointing=True,\n optim=\"paged_adamw_8bit\",\n learning_rate=LR,\n lr_scheduler_type=\"cosine\",\n warmup_ratio=0.05,\n fp16=not torch.cuda.is_bf16_supported(),\n bf16=torch.cuda.is_bf16_supported(),\n logging_steps=10,\n eval_strategy=\"steps\",\n eval_steps=SAVE_STEPS,\n save_strategy=\"steps\",\n save_steps=SAVE_STEPS,\n save_total_limit=3,\n load_best_model_at_end=True,\n metric_for_best_model=\"eval_loss\",\n greater_is_better=False,\n report_to=\"none\",\n 
dataset_text_field=\"text\",\n packing=False,\n)\n\ntrainer = SFTTrainer(\n model=model,\n args=sft_config,\n train_dataset=train_dataset,\n eval_dataset=val_dataset,\n peft_config=lora_config,\n processing_class=tokenizer,\n callbacks=[CheckpointSyncCallback()],\n)\n\nif RESUME_FROM:\n print(f\"Resuming from checkpoint: {RESUME_FROM}\")\nelse:\n print(f\"Starting training from scratch. Save every {SAVE_STEPS} steps.\")\n\ntrainer.train(resume_from_checkpoint=RESUME_FROM)\nprint(\"Training complete.\")",
1232
  "metadata": {
1233
  "trusted": true
1234
  },
@@ -1251,7 +1251,7 @@
1251
  },
1252
  {
1253
  "cell_type": "code",
1254
- "source": "from peft import PeftModel\n\n# Save best adapter checkpoint locally\ntrainer.model.save_pretrained(OUTPUT_DIR)\ntokenizer.save_pretrained(OUTPUT_DIR)\nprint(f\"Adapter saved to {OUTPUT_DIR}\")\n\n# Reload base model in fp16 for merging (can't merge with 4-bit)\nprint(\"Reloading base model in fp16 for adapter merge...\")\ndel model\ntorch.cuda.empty_cache()\n\nbase_model = AutoModelForCausalLM.from_pretrained(\n BASE_MODEL,\n torch_dtype=torch.float16,\n device_map=\"auto\",\n trust_remote_code=True,\n)\nmerged_model = PeftModel.from_pretrained(base_model, OUTPUT_DIR)\nmerged_model = merged_model.merge_and_unload()\nprint(\"Adapters merged.\")",
1255
  "metadata": {
1256
  "id": "save-model",
1257
  "trusted": true
 
1228
  },
1229
  {
1230
  "cell_type": "code",
1231
+ "source": "from transformers import TrainerCallback\nimport shutil, glob\n\nKAGGLE_DATASET_ID = \"xabonum/stockex-ch-checkpoints\"\nMAX_CHECKPOINTS_PERSISTENT = 3 # keep only last 3 in Kaggle dataset / Drive\n\n# ── Create Kaggle dataset metadata if needed ────────────────────────\nif USE_DRIVE and DRIVE_CKPT_DIR:\n metadata_path = os.path.join(DRIVE_CKPT_DIR, \"dataset-metadata.json\")\n if not os.path.exists(metadata_path):\n metadata = {\n \"title\": \"stockex-ch-checkpoints\",\n \"id\": KAGGLE_DATASET_ID,\n \"licenses\": [{\"name\": \"CC0-1.0\"}]\n }\n with open(metadata_path, \"w\") as f:\n json.dump(metadata, f, indent=2)\n print(\"Created dataset-metadata.json\")\n\n\ndef cleanup_old_checkpoints(ckpt_dir, keep=MAX_CHECKPOINTS_PERSISTENT):\n \"\"\"Keep only the last `keep` checkpoints in persistent storage.\"\"\"\n ckpts = sorted(\n glob.glob(os.path.join(ckpt_dir, \"checkpoint-*\")),\n key=lambda p: int(os.path.basename(p).split(\"-\")[-1])\n )\n while len(ckpts) > keep:\n old = ckpts.pop(0)\n shutil.rmtree(old)\n print(f\"[Checkpoint] Removed old: {os.path.basename(old)}\")\n\n\nclass CheckpointSyncCallback(TrainerCallback):\n \"\"\"Copy checkpoint to persistent storage, push LoRA to HF Hub, upload to Kaggle dataset.\"\"\"\n\n def on_save(self, args, state, control, **kwargs):\n ckpt_dir = os.path.join(args.output_dir, f\"checkpoint-{state.global_step}\")\n if not os.path.isdir(ckpt_dir):\n return\n\n # 1. 
Save to persistent folder (Drive / Kaggle working dir)\n if USE_DRIVE and DRIVE_CKPT_DIR:\n dest = os.path.join(DRIVE_CKPT_DIR, f\"checkpoint-{state.global_step}\")\n os.makedirs(DRIVE_CKPT_DIR, exist_ok=True)\n try:\n shutil.copytree(ckpt_dir, dest, dirs_exist_ok=True)\n print(f\"[Checkpoint] Saved -> {dest}\")\n except Exception as e:\n print(f\"[Checkpoint] Copy failed: {e}\")\n\n # Cleanup: keep only last 3 checkpoints in persistent storage\n try:\n cleanup_old_checkpoints(DRIVE_CKPT_DIR)\n except Exception as e:\n print(f\"[Checkpoint] Cleanup failed: {e}\")\n\n # 2. Push LoRA adapter to HF Hub\n try:\n kwargs[\"model\"].push_to_hub(\n OUTPUT_REPO,\n commit_message=f\"Checkpoint step {state.global_step} (epoch {state.epoch:.2f})\",\n token=HF_TOKEN,\n )\n print(f\"[Checkpoint] Pushed step {state.global_step} -> HF Hub\")\n except Exception as e:\n print(f\"[Checkpoint] HF push failed: {e}\")\n\n # 3. Upload to Kaggle dataset (Kaggle only)\n if RUNNING_ON_KAGGLE and USE_DRIVE:\n try:\n os.system(\n f\"kaggle datasets version -p {DRIVE_CKPT_DIR} \"\n f\"-m 'Checkpoint step {state.global_step}' --dir-mode zip\"\n )\n print(f\"[Checkpoint] Kaggle dataset updated for step {state.global_step}\")\n except Exception as e:\n print(f\"[Checkpoint] Kaggle dataset update failed: {e}\")\n\n\nsft_config = SFTConfig(\n output_dir=OUTPUT_DIR,\n num_train_epochs=NUM_EPOCHS,\n per_device_train_batch_size=BATCH_SIZE,\n per_device_eval_batch_size=BATCH_SIZE,\n gradient_accumulation_steps=GRAD_ACCUM,\n gradient_checkpointing=True,\n optim=\"paged_adamw_8bit\",\n learning_rate=LR,\n lr_scheduler_type=\"cosine\",\n warmup_ratio=0.05,\n fp16=not torch.cuda.is_bf16_supported(),\n bf16=torch.cuda.is_bf16_supported(),\n logging_steps=10,\n eval_strategy=\"steps\",\n eval_steps=SAVE_STEPS,\n save_strategy=\"steps\",\n save_steps=SAVE_STEPS,\n save_total_limit=3,\n report_to=\"none\",\n dataset_text_field=\"text\",\n packing=False,\n)\n\ntrainer = SFTTrainer(\n model=model,\n 
args=sft_config,\n train_dataset=train_dataset,\n eval_dataset=val_dataset,\n peft_config=lora_config,\n processing_class=tokenizer,\n callbacks=[CheckpointSyncCallback()],\n)\n\nif RESUME_FROM:\n print(f\"Resuming from checkpoint: {RESUME_FROM}\")\nelse:\n print(f\"Starting training from scratch. Save every {SAVE_STEPS} steps.\")\n\ntrainer.train(resume_from_checkpoint=RESUME_FROM)\nprint(\"Training complete.\")\n\n# ── Immediately save adapter so subsequent cells don't depend on trainer state ──\ntrainer.model.save_pretrained(OUTPUT_DIR)\ntokenizer.save_pretrained(OUTPUT_DIR)\nprint(f\"Adapter saved to {OUTPUT_DIR}\")",
1232
  "metadata": {
1233
  "trusted": true
1234
  },
 
1251
  },
1252
  {
1253
  "cell_type": "code",
1254
+ "source": "from peft import PeftModel\n\n# Reload base model in fp16 for merging (can't merge with 4-bit)\nprint(\"Reloading base model in fp16 for adapter merge...\")\ndel model\ntorch.cuda.empty_cache()\n\nbase_model = AutoModelForCausalLM.from_pretrained(\n BASE_MODEL,\n torch_dtype=torch.float16,\n device_map=\"auto\",\n trust_remote_code=True,\n)\nmerged_model = PeftModel.from_pretrained(base_model, OUTPUT_DIR)\nmerged_model = merged_model.merge_and_unload()\nprint(\"Adapters merged.\")",
1255
  "metadata": {
1256
  "id": "save-model",
1257
  "trusted": true