RayMelius Claude Opus 4.6 committed on
Commit
a673bd5
·
1 Parent(s): ff6464a

Print resume summary: steps done/remaining, epoch progress, % complete

Browse files
Files changed (1) hide show
  1. notebooks/ch_trader_finetune.ipynb +1 -1
notebooks/ch_trader_finetune.ipynb CHANGED
@@ -85,7 +85,7 @@
85
  {
86
  "cell_type": "code",
87
  "id": "egq0dp9csuo",
88
- "source": "import shutil\nfrom transformers.trainer_utils import get_last_checkpoint\n\n# ── Mount Google Drive for persistent checkpoint storage ──────────────────────\ntry:\n from google.colab import drive\n drive.mount(\"/content/drive\", force_remount=False)\n DRIVE_CKPT_DIR = f\"/content/drive/MyDrive/stockex-ch-checkpoints\"\n USE_DRIVE = True\n print(f\"Google Drive mounted. Checkpoints → {DRIVE_CKPT_DIR}\")\nexcept Exception:\n DRIVE_CKPT_DIR = None\n USE_DRIVE = False\n print(\"Google Drive not available — checkpoints saved locally only.\")\n\n# OUTPUT_DIR is the local working copy; DRIVE_CKPT_DIR is the persistent store\nos.makedirs(OUTPUT_DIR, exist_ok=True)\nif USE_DRIVE:\n os.makedirs(DRIVE_CKPT_DIR, exist_ok=True)\n # Restore latest checkpoint from Drive to local dir (if any)\n drive_ckpt = get_last_checkpoint(DRIVE_CKPT_DIR)\n if drive_ckpt:\n local_ckpt_name = os.path.basename(drive_ckpt)\n local_ckpt_path = os.path.join(OUTPUT_DIR, local_ckpt_name)\n if not os.path.exists(local_ckpt_path):\n print(f\"Restoring checkpoint from Drive: {drive_ckpt}\")\n shutil.copytree(drive_ckpt, local_ckpt_path)\n else:\n print(f\"Checkpoint already in local dir: {local_ckpt_path}\")\n\n# ── Detect latest local checkpoint ────────────────────────────────────────────\nRESUME_FROM = get_last_checkpoint(OUTPUT_DIR)\nif RESUME_FROM:\n print(f\"✓ Will resume training from: {RESUME_FROM}\")\nelse:\n print(\"No checkpoint found — starting fresh.\")\n\nSAVE_STEPS = 50 # save every N steps (also pushed to Drive + HF Hub)",
89
  "metadata": {},
90
  "execution_count": null,
91
  "outputs": []
 
85
  {
86
  "cell_type": "code",
87
  "id": "egq0dp9csuo",
88
+ "source": "import shutil, math\nfrom transformers.trainer_utils import get_last_checkpoint\n\n# ── Mount Google Drive for persistent checkpoint storage ──────────────────────\ntry:\n from google.colab import drive\n drive.mount(\"/content/drive\", force_remount=False)\n DRIVE_CKPT_DIR = \"/content/drive/MyDrive/stockex-ch-checkpoints\"\n USE_DRIVE = True\n print(f\"Google Drive mounted. Checkpoints β†’ {DRIVE_CKPT_DIR}\")\nexcept Exception:\n DRIVE_CKPT_DIR = None\n USE_DRIVE = False\n print(\"Google Drive not available β€” checkpoints saved locally only.\")\n\nos.makedirs(OUTPUT_DIR, exist_ok=True)\nif USE_DRIVE:\n os.makedirs(DRIVE_CKPT_DIR, exist_ok=True)\n drive_ckpt = get_last_checkpoint(DRIVE_CKPT_DIR)\n if drive_ckpt:\n local_ckpt_name = os.path.basename(drive_ckpt)\n local_ckpt_path = os.path.join(OUTPUT_DIR, local_ckpt_name)\n if not os.path.exists(local_ckpt_path):\n print(f\"Restoring checkpoint from Drive: {drive_ckpt}\")\n shutil.copytree(drive_ckpt, local_ckpt_path)\n\n# ── Detect latest local checkpoint ────────────────────────────────────────────\nRESUME_FROM = get_last_checkpoint(OUTPUT_DIR)\n\nSAVE_STEPS = 50\n\n# ── Resume summary ─────────────────────────────────────────────────────────────\nif RESUME_FROM:\n # Parse completed steps from checkpoint folder name (e.g. 
\"checkpoint-350\")\n completed_steps = int(os.path.basename(RESUME_FROM).split(\"-\")[-1])\n\n # Total steps = ceil(train_size / (batch * grad_accum)) * epochs\n train_size = int(DATASET_SIZE * 0.9)\n steps_per_epoch = math.ceil(train_size / (BATCH_SIZE * GRAD_ACCUM))\n total_steps = steps_per_epoch * NUM_EPOCHS\n remaining = max(0, total_steps - completed_steps)\n pct_done = 100 * completed_steps / total_steps\n epoch_done = completed_steps / steps_per_epoch\n\n print(\"\\n\" + \"=\" * 55)\n print(\" RESUMING FROM CHECKPOINT\")\n print(\"=\" * 55)\n print(f\" Checkpoint : {os.path.basename(RESUME_FROM)}\")\n print(f\" Steps done : {completed_steps:,} / {total_steps:,} ({pct_done:.1f}%)\")\n print(f\" Steps left : {remaining:,}\")\n print(f\" Epoch : {epoch_done:.2f} / {NUM_EPOCHS}\")\n print(f\" Epochs left : {NUM_EPOCHS - epoch_done:.2f}\")\n print(f\" Steps/epoch : {steps_per_epoch:,}\")\n print(f\" Save every : {SAVE_STEPS} steps\")\n print(\"=\" * 55 + \"\\n\")\nelse:\n train_size = int(DATASET_SIZE * 0.9)\n steps_per_epoch = math.ceil(train_size / (BATCH_SIZE * GRAD_ACCUM))\n total_steps = steps_per_epoch * NUM_EPOCHS\n print(\"\\n\" + \"=\" * 55)\n print(\" STARTING FRESH\")\n print(\"=\" * 55)\n print(f\" Total steps : {total_steps:,}\")\n print(f\" Steps/epoch : {steps_per_epoch:,}\")\n print(f\" Epochs : {NUM_EPOCHS}\")\n print(f\" Save every : {SAVE_STEPS} steps\")\n print(\"=\" * 55 + \"\\n\")",
89
  "metadata": {},
90
  "execution_count": null,
91
  "outputs": []