0523-0022

Browse files

Files changed (4) hide show

ddpm-butterflies-128/logs/training_example/events.out.tfevents.1716433515.atl1-1-03-003-17-0.pace.gatech.edu.184517.0 +3 -0
ddpm-butterflies-128/logs/training_example/events.out.tfevents.1716433608.atl1-1-03-003-17-0.pace.gatech.edu.186921.0 +3 -0
ddpm-butterflies-128/logs/training_example/events.out.tfevents.1716433967.atl1-1-03-003-17-0.pace.gatech.edu.186921.1 +3 -0
diffusion.ipynb +78 -101

ddpm-butterflies-128/logs/training_example/events.out.tfevents.1716433515.atl1-1-03-003-17-0.pace.gatech.edu.184517.0 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:17b83cfdb678438b3674bcb2f33bfdd0cfd1e4db880f88a14890f078174fd92d
+size 10622

ddpm-butterflies-128/logs/training_example/events.out.tfevents.1716433608.atl1-1-03-003-17-0.pace.gatech.edu.186921.0 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:448aa0d8971ba332e9550522fda26b8d541d60cdf6d88ad7a3a541488c221e04
+size 1818

ddpm-butterflies-128/logs/training_example/events.out.tfevents.1716433967.atl1-1-03-003-17-0.pace.gatech.edu.186921.1 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:53d62ea38646e54fa4ac8a309f2e2b5238ee541f20894781cce475a8b4dee901
+size 47323

diffusion.ipynb CHANGED Viewed

@@ -22,7 +22,8 @@
     "- 融合cond, guide_w, drop_out這些參數\n",
     "- 生成的21cm圖像該暗的地方不夠暗，似乎換成MNIST的數字圖像就沒問題\n",
     "- 我用diffusion模型生成MNIST的數字時發現，儘管生成的數據的範圍也存在負數數值，如-0.1,但畫出來的圖像卻是理想的黑色。數據的分佈與21cm的結果的分佈沒多大差別，我現在打算把代碼退回到21cm的情形\n",
-    "- 我統一了ddpm21cm這個module，能統一實現訓練和生成樣本，但目前有個bug， sample時總是會cuda out of memory，然而單獨resume model並sample就不會。"
    ]
   },
   {
@@ -30,41 +31,6 @@
    "execution_count": 1,
    "metadata": {},
    "outputs": [],
-   "source": [
-    "# import multiprocessing as mp\n",
-    "# mp.set_start_method('spawn', force=True)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 1,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "application/vnd.jupyter.widget-view+json": {
-       "model_id": "c2516f72a37e425e80638265a633c6cf",
-       "version_major": 2,
-       "version_minor": 0
-      },
-      "text/plain": [
-       "VBox(children=(HTML(value='<center> <img\\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    }
-   ],
-   "source": [
-    "from huggingface_hub import notebook_login\n",
-    "notebook_login()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 2,
-   "metadata": {},
-   "outputs": [],
    "source": [
     "from dataclasses import dataclass\n",
     "import h5py\n",
@@ -94,7 +60,9 @@
     "from huggingface_hub import create_repo, upload_folder\n",
     "\n",
     "from load_h5 import Dataset4h5\n",
-    "from context_unet import ContextUnet"
    ]
   },
   {
@@ -119,7 +87,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -226,7 +194,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -257,7 +225,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -323,7 +291,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 6,
    "metadata": {},
    "outputs": [
     {
@@ -534,47 +502,23 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 7,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# ddpm21cm.train()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 8,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# accelerator = Accelerator()\n",
-    "# print(accelerator.process_index)\n",
-    "# print(accelerator.is_local_main_process)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 9,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# ddpm21cm.sample(\"./outputs/model_state_09.pth\")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 10,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# train_loop(config, nn_model, ddpm, optimizer, dataloader, lr_scheduler)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 12,
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
@@ -584,7 +528,8 @@
       "51200 images can be loaded\n",
       "field.shape = (64, 64, 514)\n",
       "params keys = [b'ION_Tvir_MIN', b'HII_EFF_FACTOR']\n",
-      "loading 40 images randomly\n"
      ]
     },
     {
@@ -598,16 +543,15 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "images loaded: (40, 1, 64, 512)\n",
       "params loaded: (40, 2)\n",
-      "images rescaled to [-1.0, 1.0893712043762207]\n",
-      "params rescaled to [0.0, 0.9982320250627095]\n"
      ]
     },
     {
      "data": {
       "application/vnd.jupyter.widget-view+json": {
-       "model_id": "23109fb8b689459b8aa9abc2a79a12c6",
        "version_major": 2,
        "version_minor": 0
       },
@@ -621,7 +565,7 @@
     {
      "data": {
       "application/vnd.jupyter.widget-view+json": {
-       "model_id": "f6ce395ecf1c4d0fb246be2ca50bcb93",
        "version_major": 2,
        "version_minor": 0
       },
@@ -635,7 +579,7 @@
     {
      "data": {
       "application/vnd.jupyter.widget-view+json": {
-       "model_id": "890d3e8151ba4b8da4b4965b394e0cfa",
        "version_major": 2,
        "version_minor": 0
       },
@@ -649,7 +593,7 @@
     {
      "data": {
       "application/vnd.jupyter.widget-view+json": {
-       "model_id": "7fcb8456340d43f792bdd763b879b928",
        "version_major": 2,
        "version_minor": 0
       },
@@ -663,7 +607,7 @@
     {
      "data": {
       "application/vnd.jupyter.widget-view+json": {
-       "model_id": "e6c559c766664a25bace866425de4213",
        "version_major": 2,
        "version_minor": 0
       },
@@ -677,7 +621,7 @@
     {
      "data": {
       "application/vnd.jupyter.widget-view+json": {
-       "model_id": "1ede5e1a69da4178979edf5573c37836",
        "version_major": 2,
        "version_minor": 0
       },
@@ -691,7 +635,7 @@
     {
      "data": {
       "application/vnd.jupyter.widget-view+json": {
-       "model_id": "268a679be0054f72b5ae819ac164d31a",
        "version_major": 2,
        "version_minor": 0
       },
@@ -705,7 +649,7 @@
     {
      "data": {
       "application/vnd.jupyter.widget-view+json": {
-       "model_id": "a3e3bcd371f94556bc22b7674ca9a36d",
        "version_major": 2,
        "version_minor": 0
       },
@@ -719,7 +663,7 @@
     {
      "data": {
       "application/vnd.jupyter.widget-view+json": {
-       "model_id": "b4df815762bc44d3848e4ebc6064127b",
        "version_major": 2,
        "version_minor": 0
       },
@@ -733,7 +677,7 @@
     {
      "data": {
       "application/vnd.jupyter.widget-view+json": {
-       "model_id": "b08a2dad7d5243daad336da4d779d55a",
        "version_major": 2,
        "version_minor": 0
       },
@@ -748,6 +692,7 @@
    "source": [
     "if __name__ == \"__main__\":\n",
     "    # args = (config, nn_model, ddpm, optimizer, dataloader, lr_scheduler)\n",
     "    notebook_launcher(ddpm21cm.train, num_processes=1)"
    ]
   },
@@ -947,13 +892,13 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
    "metadata": {},
    "outputs": [
     {
      "data": {
       "application/vnd.jupyter.widget-view+json": {
-       "model_id": "167158bd7d494ee9a80d29d8f92a8a36",
        "version_major": 2,
        "version_minor": 0
       },
@@ -963,6 +908,38 @@
      },
      "metadata": {},
      "output_type": "display_data"
     }
    ],
    "source": [
@@ -1056,7 +1033,7 @@
     "    ),\n",
     ")\n",
     "\n",
-    "noise_scheduler = DDPMScheduler(num_timesteps=1000)\n",
     "\n",
     "optimizer = torch.optim.AdamW(model.parameters(), lr=config.learning_rate)\n",
     "lr_scheduler = get_cosine_schedule_with_warmup(\n",
@@ -1116,7 +1093,7 @@
     "            bs = clean_images.shape[0]\n",
     "\n",
     "            timesteps = torch.randint(\n",
-    "                0, noise_scheduler.config.num_timesteps, (bs,), device=clean_images.device, dtype=torch.int64\n",
     "            )\n",
     "\n",
     "            noisy_images = noise_scheduler.add_noise(clean_images, noise, timesteps)\n",
@@ -1160,7 +1137,7 @@
     "# args = (config, model, noise_scheduler, optimizer, dataloader, lr_scheduler)\n",
     "\n",
     "device_count = torch.cuda.device_count()\n",
-    "notebook_launcher(train_loop, (config, model, noise_scheduler, optimizer, dataloader, lr_scheduler), num_processes=1)"
    ]
   },
   {
@@ -1563,7 +1540,7 @@
     "    ),\n",
     ")\n",
     "\n",
-    "noise_scheduler = DDPMScheduler(num_timesteps=1000)\n",
     "\n",
     "optimizer = torch.optim.AdamW(model.parameters(), lr=config.learning_rate)\n",
     "lr_scheduler = get_cosine_schedule_with_warmup(\n",
@@ -1623,7 +1600,7 @@
     "            bs = clean_images.shape[0]\n",
     "\n",
     "            timesteps = torch.randint(\n",
-    "                0, noise_scheduler.config.num_timesteps, (bs,), device=clean_images.device, dtype=torch.int64\n",
     "            )\n",
     "\n",
     "            noisy_images = noise_scheduler.add_noise(clean_images, noise, timesteps)\n",

     "- 融合cond, guide_w, drop_out這些參數\n",
     "- 生成的21cm圖像該暗的地方不夠暗，似乎換成MNIST的數字圖像就沒問題\n",
     "- 我用diffusion模型生成MNIST的數字時發現，儘管生成的數據的範圍也存在負數數值，如-0.1,但畫出來的圖像卻是理想的黑色。數據的分佈與21cm的結果的分佈沒多大差別，我現在打算把代碼退回到21cm的情形\n",
+    "- 我統一了ddpm21cm這個module，能統一實現訓練和生成樣本，但目前有個bug， sample時總是會cuda out of memory，然而單獨resume model並sample就不會。\n",
+    "- 解決了，問題出在我忘了寫with torch.no_grad():"
    ]
   },
   {
    "execution_count": 1,
    "metadata": {},
    "outputs": [],
    "source": [
     "from dataclasses import dataclass\n",
     "import h5py\n",
     "from huggingface_hub import create_repo, upload_folder\n",
     "\n",
     "from load_h5 import Dataset4h5\n",
+    "from context_unet import ContextUnet\n",
+    "\n",
+    "from huggingface_hub import notebook_login"
    ]
   },
   {
   },
   {
    "cell_type": "code",
+   "execution_count": 2,
    "metadata": {},
    "outputs": [],
    "source": [
   },
   {
    "cell_type": "code",
+   "execution_count": 3,
    "metadata": {},
    "outputs": [],
    "source": [
   },
   {
    "cell_type": "code",
+   "execution_count": 4,
    "metadata": {},
    "outputs": [],
    "source": [
   },
   {
    "cell_type": "code",
+   "execution_count": 5,
    "metadata": {},
    "outputs": [
     {
   },
   {
    "cell_type": "code",
+   "execution_count": 6,
    "metadata": {},
    "outputs": [
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "d488e670a37b408399687972aa7fef8a",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "VBox(children=(HTML(value='<center> <img\\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
     {
      "name": "stdout",
      "output_type": "stream",
       "51200 images can be loaded\n",
       "field.shape = (64, 64, 514)\n",
       "params keys = [b'ION_Tvir_MIN', b'HII_EFF_FACTOR']\n",
+      "loading 40 images randomly\n",
+      "images loaded: (40, 1, 64, 512)\n"
      ]
     },
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
       "params loaded: (40, 2)\n",
+      "images rescaled to [-1.0, 1.1514630317687988]\n",
+      "params rescaled to [0.0, 0.9958124549229699]\n"
      ]
     },
     {
      "data": {
       "application/vnd.jupyter.widget-view+json": {
+       "model_id": "09696eb258f94136b32dec3290034ce2",
        "version_major": 2,
        "version_minor": 0
       },
     {
      "data": {
       "application/vnd.jupyter.widget-view+json": {
+       "model_id": "77a8a7ad49d84bdcbfc522069118b222",
        "version_major": 2,
        "version_minor": 0
       },
     {
      "data": {
       "application/vnd.jupyter.widget-view+json": {
+       "model_id": "28a00d5f8fdc472eae5b252888410421",
        "version_major": 2,
        "version_minor": 0
       },
     {
      "data": {
       "application/vnd.jupyter.widget-view+json": {
+       "model_id": "f145f20dab944419a730632a9d5081fc",
        "version_major": 2,
        "version_minor": 0
       },
     {
      "data": {
       "application/vnd.jupyter.widget-view+json": {
+       "model_id": "de801afbf6aa428b9d21bf6a76a28ee8",
        "version_major": 2,
        "version_minor": 0
       },
     {
      "data": {
       "application/vnd.jupyter.widget-view+json": {
+       "model_id": "1e2c7ec7c4df438c93f21b420e93aaf3",
        "version_major": 2,
        "version_minor": 0
       },
     {
      "data": {
       "application/vnd.jupyter.widget-view+json": {
+       "model_id": "e963fcaf27ae401e926053f9f9c146f4",
        "version_major": 2,
        "version_minor": 0
       },
     {
      "data": {
       "application/vnd.jupyter.widget-view+json": {
+       "model_id": "d6fb1a76c93b45e59fa7d253ef344bda",
        "version_major": 2,
        "version_minor": 0
       },
     {
      "data": {
       "application/vnd.jupyter.widget-view+json": {
+       "model_id": "f0d88179dc7d42a793a820f4d1a08da3",
        "version_major": 2,
        "version_minor": 0
       },
     {
      "data": {
       "application/vnd.jupyter.widget-view+json": {
+       "model_id": "c8caffa8edc6457e97c2d315252a5adf",
        "version_major": 2,
        "version_minor": 0
       },
    "source": [
     "if __name__ == \"__main__\":\n",
     "    # args = (config, nn_model, ddpm, optimizer, dataloader, lr_scheduler)\n",
+    "    notebook_login()\n",
     "    notebook_launcher(ddpm21cm.train, num_processes=1)"
    ]
   },
   },
   {
    "cell_type": "code",
+   "execution_count": 1,
    "metadata": {},
    "outputs": [
     {
      "data": {
       "application/vnd.jupyter.widget-view+json": {
+       "model_id": "f840b4efaf0a413394f033e5ffa3d2f4",
        "version_major": 2,
        "version_minor": 0
       },
      },
      "metadata": {},
      "output_type": "display_data"
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Repo card metadata block was not found. Setting CardData to empty.\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Launching training on 2 GPUs.\n"
+     ]
+    },
+    {
+     "ename": "RuntimeError",
+     "evalue": "CUDA has been initialized before the `notebook_launcher` could create a forked subprocess. This likely stems from an outside import causing issues once the `notebook_launcher()` is called. Please review your imports and test them when running the `notebook_launcher()` to identify which one is problematic and causing CUDA to be initialized.",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+      "\u001b[0;31mProcessRaisedException\u001b[0m                    Traceback (most recent call last)",
+      "File \u001b[0;32m~/.conda/envs/diffusers/lib/python3.9/site-packages/accelerate/launchers.py:200\u001b[0m, in \u001b[0;36mnotebook_launcher\u001b[0;34m(function, args, num_processes, mixed_precision, use_port, master_addr, node_rank, num_nodes)\u001b[0m\n\u001b[1;32m    199\u001b[0m \u001b[39mtry\u001b[39;00m:\n\u001b[0;32m--> 200\u001b[0m     start_processes(launcher, args\u001b[39m=\u001b[39;49margs, nprocs\u001b[39m=\u001b[39;49mnum_processes, start_method\u001b[39m=\u001b[39;49m\u001b[39m\"\u001b[39;49m\u001b[39mfork\u001b[39;49m\u001b[39m\"\u001b[39;49m)\n\u001b[1;32m    201\u001b[0m \u001b[39mexcept\u001b[39;00m ProcessRaisedException \u001b[39mas\u001b[39;00m e:\n",
+      "File \u001b[0;32m/usr/local/pace-apps/manual/packages/pytorch/1.12.0/lib/python3.9/site-packages/torch/multiprocessing/spawn.py:198\u001b[0m, in \u001b[0;36mstart_processes\u001b[0;34m(fn, args, nprocs, join, daemon, start_method)\u001b[0m\n\u001b[1;32m    197\u001b[0m \u001b[39m# Loop on join until it returns True or raises an exception.\u001b[39;00m\n\u001b[0;32m--> 198\u001b[0m \u001b[39mwhile\u001b[39;00m \u001b[39mnot\u001b[39;00m context\u001b[39m.\u001b[39;49mjoin():\n\u001b[1;32m    199\u001b[0m     \u001b[39mpass\u001b[39;00m\n",
+      "File \u001b[0;32m/usr/local/pace-apps/manual/packages/pytorch/1.12.0/lib/python3.9/site-packages/torch/multiprocessing/spawn.py:160\u001b[0m, in \u001b[0;36mProcessContext.join\u001b[0;34m(self, timeout)\u001b[0m\n\u001b[1;32m    159\u001b[0m msg \u001b[39m+\u001b[39m\u001b[39m=\u001b[39m original_trace\n\u001b[0;32m--> 160\u001b[0m \u001b[39mraise\u001b[39;00m ProcessRaisedException(msg, error_index, failed_process\u001b[39m.\u001b[39mpid)\n",
+      "\u001b[0;31mProcessRaisedException\u001b[0m: \n\n-- Process 1 terminated with the following error:\nTraceback (most recent call last):\n  File \"/usr/local/pace-apps/manual/packages/pytorch/1.12.0/lib/python3.9/site-packages/torch/multiprocessing/spawn.py\", line 69, in _wrap\n    fn(i, *args)\n  File \"/storage/home/hcoda1/3/bxia34/.conda/envs/diffusers/lib/python3.9/site-packages/accelerate/utils/launch.py\", line 608, in __call__\n    self.launcher(*args)\n  File \"/scratch/166867/ipykernel_204345/1749266112.py\", line 117, in train_loop\n    accelerator = Accelerator(\n  File \"/storage/home/hcoda1/3/bxia34/.conda/envs/diffusers/lib/python3.9/site-packages/accelerate/accelerator.py\", line 371, in __init__\n    self.state = AcceleratorState(\n  File \"/storage/home/hcoda1/3/bxia34/.conda/envs/diffusers/lib/python3.9/site-packages/accelerate/state.py\", line 777, in __init__\n    PartialState(cpu, **kwargs)\n  File \"/storage/home/hcoda1/3/bxia34/.conda/envs/diffusers/lib/python3.9/site-packages/accelerate/state.py\", line 240, in __init__\n    torch.cuda.set_device(self.device)\n  File \"/usr/local/pace-apps/manual/packages/pytorch/1.12.0/lib/python3.9/site-packages/torch/cuda/__init__.py\", line 314, in set_device\n    torch._C._cuda_setDevice(device)\n  File \"/usr/local/pace-apps/manual/packages/pytorch/1.12.0/lib/python3.9/site-packages/torch/cuda/__init__.py\", line 207, in _lazy_init\n    raise RuntimeError(\nRuntimeError: Cannot re-initialize CUDA in forked subprocess. To use CUDA with multiprocessing, you must use the 'spawn' start method\n",
+      "\nThe above exception was the direct cause of the following exception:\n",
+      "\u001b[0;31mRuntimeError\u001b[0m                              Traceback (most recent call last)",
+      "Cell \u001b[0;32mIn[1], line 195\u001b[0m\n\u001b[1;32m    192\u001b[0m \u001b[39m# args = (config, model, noise_scheduler, optimizer, dataloader, lr_scheduler)\u001b[39;00m\n\u001b[1;32m    194\u001b[0m device_count \u001b[39m=\u001b[39m torch\u001b[39m.\u001b[39mcuda\u001b[39m.\u001b[39mdevice_count()\n\u001b[0;32m--> 195\u001b[0m notebook_launcher(train_loop, (config, model, noise_scheduler, optimizer, dataloader, lr_scheduler), num_processes\u001b[39m=\u001b[39;49m\u001b[39m2\u001b[39;49m)\n",
+      "File \u001b[0;32m~/.conda/envs/diffusers/lib/python3.9/site-packages/accelerate/launchers.py:203\u001b[0m, in \u001b[0;36mnotebook_launcher\u001b[0;34m(function, args, num_processes, mixed_precision, use_port, master_addr, node_rank, num_nodes)\u001b[0m\n\u001b[1;32m    201\u001b[0m \u001b[39mexcept\u001b[39;00m ProcessRaisedException \u001b[39mas\u001b[39;00m e:\n\u001b[1;32m    202\u001b[0m     \u001b[39mif\u001b[39;00m \u001b[39m\"\u001b[39m\u001b[39mCannot re-initialize CUDA in forked subprocess\u001b[39m\u001b[39m\"\u001b[39m \u001b[39min\u001b[39;00m e\u001b[39m.\u001b[39margs[\u001b[39m0\u001b[39m]:\n\u001b[0;32m--> 203\u001b[0m         \u001b[39mraise\u001b[39;00m \u001b[39mRuntimeError\u001b[39;00m(\n\u001b[1;32m    204\u001b[0m             \u001b[39m\"\u001b[39m\u001b[39mCUDA has been initialized before the `notebook_launcher` could create a forked subprocess. \u001b[39m\u001b[39m\"\u001b[39m\n\u001b[1;32m    205\u001b[0m             \u001b[39m\"\u001b[39m\u001b[39mThis likely stems from an outside import causing issues once the `notebook_launcher()` is called. \u001b[39m\u001b[39m\"\u001b[39m\n\u001b[1;32m    206\u001b[0m             \u001b[39m\"\u001b[39m\u001b[39mPlease review your imports and test them when running the `notebook_launcher()` to identify \u001b[39m\u001b[39m\"\u001b[39m\n\u001b[1;32m    207\u001b[0m             \u001b[39m\"\u001b[39m\u001b[39mwhich one is problematic and causing CUDA to be initialized.\u001b[39m\u001b[39m\"\u001b[39m\n\u001b[1;32m    208\u001b[0m         ) \u001b[39mfrom\u001b[39;00m \u001b[39me\u001b[39;00m\n\u001b[1;32m    209\u001b[0m     \u001b[39melse\u001b[39;00m:\n\u001b[1;32m    210\u001b[0m         \u001b[39mraise\u001b[39;00m \u001b[39mRuntimeError\u001b[39;00m(\u001b[39mf\u001b[39m\u001b[39m\"\u001b[39m\u001b[39mAn issue was found when launching the training: \u001b[39m\u001b[39m{\u001b[39;00me\u001b[39m}\u001b[39;00m\u001b[39m\"\u001b[39m) \u001b[39mfrom\u001b[39;00m \u001b[39me\u001b[39;00m\n",
+      "\u001b[0;31mRuntimeError\u001b[0m: CUDA has been initialized before the `notebook_launcher` could create a forked subprocess. This likely stems from an outside import causing issues once the `notebook_launcher()` is called. Please review your imports and test them when running the `notebook_launcher()` to identify which one is problematic and causing CUDA to be initialized."
+     ]
     }
    ],
    "source": [
     "    ),\n",
     ")\n",
     "\n",
+    "noise_scheduler = DDPMScheduler(num_train_timesteps=1000)\n",
     "\n",
     "optimizer = torch.optim.AdamW(model.parameters(), lr=config.learning_rate)\n",
     "lr_scheduler = get_cosine_schedule_with_warmup(\n",
     "            bs = clean_images.shape[0]\n",
     "\n",
     "            timesteps = torch.randint(\n",
+    "                0, noise_scheduler.config.num_train_timesteps, (bs,), device=clean_images.device, dtype=torch.int64\n",
     "            )\n",
     "\n",
     "            noisy_images = noise_scheduler.add_noise(clean_images, noise, timesteps)\n",
     "# args = (config, model, noise_scheduler, optimizer, dataloader, lr_scheduler)\n",
     "\n",
     "device_count = torch.cuda.device_count()\n",
+    "notebook_launcher(train_loop, (config, model, noise_scheduler, optimizer, dataloader, lr_scheduler), num_processes=2)"
    ]
   },
   {
     "    ),\n",
     ")\n",
     "\n",
+    "noise_scheduler = DDPMScheduler(num_train_timesteps=1000)\n",
     "\n",
     "optimizer = torch.optim.AdamW(model.parameters(), lr=config.learning_rate)\n",
     "lr_scheduler = get_cosine_schedule_with_warmup(\n",
     "            bs = clean_images.shape[0]\n",
     "\n",
     "            timesteps = torch.randint(\n",
+    "                0, noise_scheduler.config.num_train_timesteps, (bs,), device=clean_images.device, dtype=torch.int64\n",
     "            )\n",
     "\n",
     "            noisy_images = noise_scheduler.add_noise(clean_images, noise, timesteps)\n",