Print resume summary: steps done/remaining, epoch progress, % complete
Browse files
notebooks/ch_trader_finetune.ipynb
CHANGED
|
@@ -85,7 +85,7 @@
|
|
| 85 |
{
|
| 86 |
"cell_type": "code",
|
| 87 |
"id": "egq0dp9csuo",
|
| 88 |
-
"source": "import shutil\nfrom transformers.trainer_utils import get_last_checkpoint\n\n# ββ Mount Google Drive for persistent checkpoint storage ββββββββββββββββββββββ\ntry:\n from google.colab import drive\n drive.mount(\"/content/drive\", force_remount=False)\n DRIVE_CKPT_DIR =
|
| 89 |
"metadata": {},
|
| 90 |
"execution_count": null,
|
| 91 |
"outputs": []
|
|
|
|
| 85 |
{
|
| 86 |
"cell_type": "code",
|
| 87 |
"id": "egq0dp9csuo",
|
| 88 |
+
import math
import os
import shutil

from transformers.trainer_utils import get_last_checkpoint

# Checkpoint bootstrap cell.
#   1. Try to mount Google Drive so checkpoints survive a runtime reset.
#   2. If Drive holds a checkpoint that is missing locally, copy it into OUTPUT_DIR.
#   3. Detect the newest local checkpoint (RESUME_FROM) and print a progress summary.
# Reads OUTPUT_DIR, DATASET_SIZE, BATCH_SIZE, GRAD_ACCUM and NUM_EPOCHS, which must
# already be defined by an earlier cell.

# ── Mount Google Drive for persistent checkpoint storage ─────────────────────
# Deliberately best-effort: any failure here (e.g. google.colab not importable,
# or the mount itself failing) falls back to local-only checkpoints instead of
# aborting the notebook.
try:
    from google.colab import drive
    drive.mount("/content/drive", force_remount=False)
    DRIVE_CKPT_DIR = "/content/drive/MyDrive/stockex-ch-checkpoints"
    USE_DRIVE = True
    print(f"Google Drive mounted. Checkpoints → {DRIVE_CKPT_DIR}")
except Exception:
    DRIVE_CKPT_DIR = None
    USE_DRIVE = False
    print("Google Drive not available — checkpoints saved locally only.")

os.makedirs(OUTPUT_DIR, exist_ok=True)
if USE_DRIVE:
    os.makedirs(DRIVE_CKPT_DIR, exist_ok=True)
    drive_ckpt = get_last_checkpoint(DRIVE_CKPT_DIR)
    if drive_ckpt:
        local_ckpt_name = os.path.basename(drive_ckpt)
        local_ckpt_path = os.path.join(OUTPUT_DIR, local_ckpt_name)
        if not os.path.exists(local_ckpt_path):
            print(f"Restoring checkpoint from Drive: {drive_ckpt}")
            # Copy into a scratch folder and rename once the copy has finished, so
            # an interrupted restore never leaves a half-written "checkpoint-N"
            # directory that a later run would mistake for a valid checkpoint.
            partial_path = local_ckpt_path + ".partial"
            shutil.rmtree(partial_path, ignore_errors=True)  # stale leftover, if any
            shutil.copytree(drive_ckpt, partial_path)
            os.rename(partial_path, local_ckpt_path)

# ── Detect latest local checkpoint ───────────────────────────────────────────
RESUME_FROM = get_last_checkpoint(OUTPUT_DIR)

SAVE_STEPS = 50

# ── Estimated schedule (shared by both summaries below) ──────────────────────
# Total steps = ceil(train_size / (batch * grad_accum)) * epochs
# NOTE(review): 0.9 presumably mirrors the train split used when the dataset is
# built, and the trainer's own step count may differ slightly — confirm.
train_size = int(DATASET_SIZE * 0.9)
steps_per_epoch = math.ceil(train_size / (BATCH_SIZE * GRAD_ACCUM))
total_steps = steps_per_epoch * NUM_EPOCHS

# ── Resume summary ───────────────────────────────────────────────────────────
if RESUME_FROM:
    # Parse completed steps from checkpoint folder name (e.g. "checkpoint-350")
    completed_steps = int(os.path.basename(RESUME_FROM).split("-")[-1])

    remaining = max(0, total_steps - completed_steps)
    # Guard the ratios: an empty train split or NUM_EPOCHS == 0 makes the
    # denominators zero, and the summary should still print rather than crash.
    pct_done = 100 * completed_steps / total_steps if total_steps else 0.0
    epoch_done = completed_steps / steps_per_epoch if steps_per_epoch else 0.0
    # Clamp like `remaining`, so a checkpoint past the estimate never shows a
    # negative number of epochs left.
    epochs_left = max(0.0, NUM_EPOCHS - epoch_done)

    print("\n" + "=" * 55)
    print(" RESUMING FROM CHECKPOINT")
    print("=" * 55)
    print(f" Checkpoint : {os.path.basename(RESUME_FROM)}")
    print(f" Steps done : {completed_steps:,} / {total_steps:,} ({pct_done:.1f}%)")
    print(f" Steps left : {remaining:,}")
    print(f" Epoch : {epoch_done:.2f} / {NUM_EPOCHS}")
    print(f" Epochs left : {epochs_left:.2f}")
    print(f" Steps/epoch : {steps_per_epoch:,}")
    print(f" Save every : {SAVE_STEPS} steps")
    print("=" * 55 + "\n")
else:
    print("\n" + "=" * 55)
    print(" STARTING FRESH")
    print("=" * 55)
    print(f" Total steps : {total_steps:,}")
    print(f" Steps/epoch : {steps_per_epoch:,}")
    print(f" Epochs : {NUM_EPOCHS}")
    print(f" Save every : {SAVE_STEPS} steps")
    print("=" * 55 + "\n")
|
| 89 |
"metadata": {},
|
| 90 |
"execution_count": null,
|
| 91 |
"outputs": []
|