Upload main.ipynb with huggingface_hub
Browse files- main.ipynb +3 -13
main.ipynb
CHANGED
|
@@ -148,7 +148,7 @@
|
|
| 148 |
"\n",
|
| 149 |
" # Training\n",
|
| 150 |
" seq_len: int = 256\n",
|
| 151 |
-
" batch_size: int =
|
| 152 |
" grad_accum_steps: int = 2 # Effective batch = 128\n",
|
| 153 |
" learning_rate: float = 3e-4\n",
|
| 154 |
" weight_decay: float = 0.01\n",
|
|
@@ -642,7 +642,7 @@
|
|
| 642 |
"model_unwrapped = model\n",
|
| 643 |
"if torch.cuda.device_count() > 1:\n",
|
| 644 |
" print(f\"\\nUsing {torch.cuda.device_count()} GPUs with DataParallel!\")\n",
|
| 645 |
-
" model_dp = nn.DataParallel(model)\n",
|
| 646 |
"else:\n",
|
| 647 |
" model_dp = model\n",
|
| 648 |
"\n",
|
|
@@ -856,7 +856,6 @@
|
|
| 856 |
"cell_type": "markdown",
|
| 857 |
"id": "resume_md",
|
| 858 |
"metadata": {},
|
| 859 |
-
"outputs": [],
|
| 860 |
"source": [
|
| 861 |
"## Resume from HuggingFace Checkpoint\n",
|
| 862 |
"\n",
|
|
@@ -866,6 +865,7 @@
|
|
| 866 |
},
|
| 867 |
{
|
| 868 |
"cell_type": "code",
|
|
|
|
| 869 |
"id": "resume_code",
|
| 870 |
"metadata": {},
|
| 871 |
"outputs": [],
|
|
@@ -1957,16 +1957,6 @@
|
|
| 1957 |
"print(f'Total tokens processed: {tokens_processed:,}')\n"
|
| 1958 |
]
|
| 1959 |
},
|
| 1960 |
-
{
|
| 1961 |
-
"cell_type": "code",
|
| 1962 |
-
"execution_count": null,
|
| 1963 |
-
"id": "efed12b1",
|
| 1964 |
-
"metadata": {},
|
| 1965 |
-
"outputs": [],
|
| 1966 |
-
"source": [
|
| 1967 |
-
"from google.colab import files; files.download('checkpoint_small.pt')\n"
|
| 1968 |
-
]
|
| 1969 |
-
},
|
| 1970 |
{
|
| 1971 |
"cell_type": "code",
|
| 1972 |
"execution_count": null,
|
|
|
|
| 148 |
"\n",
|
| 149 |
" # Training\n",
|
| 150 |
" seq_len: int = 256\n",
|
| 151 |
+
" batch_size: int = 32 # T4 16GB \u2014 small batch, more accum\n",
|
| 152 |
" grad_accum_steps: int = 2 # Effective batch = 64\n",
|
| 153 |
" learning_rate: float = 3e-4\n",
|
| 154 |
" weight_decay: float = 0.01\n",
|
|
|
|
| 642 |
"model_unwrapped = model\n",
|
| 643 |
"if torch.cuda.device_count() > 1:\n",
|
| 644 |
" print(f\"\\nUsing {torch.cuda.device_count()} GPUs with DataParallel!\")\n",
|
| 645 |
+
" model_dp = nn.DataParallel(model, device_ids=[0, 1], output_device=0)\n",
|
| 646 |
"else:\n",
|
| 647 |
" model_dp = model\n",
|
| 648 |
"\n",
|
|
|
|
| 856 |
"cell_type": "markdown",
|
| 857 |
"id": "resume_md",
|
| 858 |
"metadata": {},
|
|
|
|
| 859 |
"source": [
|
| 860 |
"## Resume from HuggingFace Checkpoint\n",
|
| 861 |
"\n",
|
|
|
|
| 865 |
},
|
| 866 |
{
|
| 867 |
"cell_type": "code",
|
| 868 |
+
"execution_count": null,
|
| 869 |
"id": "resume_code",
|
| 870 |
"metadata": {},
|
| 871 |
"outputs": [],
|
|
|
|
| 1957 |
"print(f'Total tokens processed: {tokens_processed:,}')\n"
|
| 1958 |
]
|
| 1959 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1960 |
{
|
| 1961 |
"cell_type": "code",
|
| 1962 |
"execution_count": null,
|