chipling
/

opium-mdlm

chipling committed 9 days ago

Commit

942c98a

verified ·

1 Parent(s): d5df505

Upload main.ipynb with huggingface_hub

Files changed (1) hide show

main.ipynb CHANGED Viewed

@@ -148,7 +148,7 @@
     "\n",
     "    # Training\n",
     "    seq_len: int = 256\n",
-    "    batch_size: int = 16           # T4 16GB \u2014 small batch, more accum\n",
     "    grad_accum_steps: int = 2      # Effective batch = 128\n",
     "    learning_rate: float = 3e-4\n",
     "    weight_decay: float = 0.01\n",
@@ -642,7 +642,7 @@
     "model_unwrapped = model\n",
     "if torch.cuda.device_count() > 1:\n",
     "    print(f\"\\nUsing {torch.cuda.device_count()} GPUs with DataParallel!\")\n",
-    "    model_dp = nn.DataParallel(model)\n",
     "else:\n",
     "    model_dp = model\n",
     "\n",
@@ -856,7 +856,6 @@
    "cell_type": "markdown",
    "id": "resume_md",
    "metadata": {},
-   "outputs": [],
    "source": [
     "## Resume from HuggingFace Checkpoint\n",
     "\n",
@@ -866,6 +865,7 @@
   },
   {
    "cell_type": "code",
    "id": "resume_code",
    "metadata": {},
    "outputs": [],
@@ -1957,16 +1957,6 @@
     "print(f'Total tokens processed: {tokens_processed:,}')\n"
    ]
   },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "efed12b1",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "from google.colab import files; files.download('checkpoint_small.pt')\n"
-   ]
-  },
   {
    "cell_type": "code",
    "execution_count": null,

     "\n",
     "    # Training\n",
     "    seq_len: int = 256\n",
+    "    batch_size: int = 32           # T4 16GB \u2014 small batch, more accum\n",
     "    grad_accum_steps: int = 2      # Effective batch = 128\n",
     "    learning_rate: float = 3e-4\n",
     "    weight_decay: float = 0.01\n",
     "model_unwrapped = model\n",
     "if torch.cuda.device_count() > 1:\n",
     "    print(f\"\\nUsing {torch.cuda.device_count()} GPUs with DataParallel!\")\n",
+    "    model_dp = nn.DataParallel(model, device_ids=[0, 1], output_device=0)\n",
     "else:\n",
     "    model_dp = model\n",
     "\n",
    "cell_type": "markdown",
    "id": "resume_md",
    "metadata": {},
    "source": [
     "## Resume from HuggingFace Checkpoint\n",
     "\n",
   },
   {
    "cell_type": "code",
+   "execution_count": null,
    "id": "resume_code",
    "metadata": {},
    "outputs": [],
     "print(f'Total tokens processed: {tokens_processed:,}')\n"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,