Xsmos commited on
Commit
4ef91db
·
verified ·
1 Parent(s): 146a195
ddpm-butterflies-128/logs/training_example/events.out.tfevents.1716433515.atl1-1-03-003-17-0.pace.gatech.edu.184517.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:17b83cfdb678438b3674bcb2f33bfdd0cfd1e4db880f88a14890f078174fd92d
3
+ size 10622
ddpm-butterflies-128/logs/training_example/events.out.tfevents.1716433608.atl1-1-03-003-17-0.pace.gatech.edu.186921.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:448aa0d8971ba332e9550522fda26b8d541d60cdf6d88ad7a3a541488c221e04
3
+ size 1818
ddpm-butterflies-128/logs/training_example/events.out.tfevents.1716433967.atl1-1-03-003-17-0.pace.gatech.edu.186921.1 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:53d62ea38646e54fa4ac8a309f2e2b5238ee541f20894781cce475a8b4dee901
3
+ size 47323
diffusion.ipynb CHANGED
@@ -22,7 +22,8 @@
22
  "- 融合cond, guide_w, drop_out這些參數\n",
23
  "- 生成的21cm圖像該暗的地方不夠暗,似乎換成MNIST的數字圖像就沒問題\n",
24
  "- 我用diffusion模型生成MNIST的數字時發現,儘管生成的數據的範圍也存在負數數值,如-0.1,但畫出來的圖像卻是理想的黑色。數據的分佈與21cm的結果的分佈沒多大差別,我現在打算把代碼退回到21cm的情形\n",
25
- "- 我統一了ddpm21cm這個module,能統一實現訓練和生成樣本,但目前有個bug, sample時總是會cuda out of memory,然而單獨resume model並sample就不會。"
 
26
  ]
27
  },
28
  {
@@ -30,41 +31,6 @@
30
  "execution_count": 1,
31
  "metadata": {},
32
  "outputs": [],
33
- "source": [
34
- "# import multiprocessing as mp\n",
35
- "# mp.set_start_method('spawn', force=True)"
36
- ]
37
- },
38
- {
39
- "cell_type": "code",
40
- "execution_count": 1,
41
- "metadata": {},
42
- "outputs": [
43
- {
44
- "data": {
45
- "application/vnd.jupyter.widget-view+json": {
46
- "model_id": "c2516f72a37e425e80638265a633c6cf",
47
- "version_major": 2,
48
- "version_minor": 0
49
- },
50
- "text/plain": [
51
- "VBox(children=(HTML(value='<center> <img\\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…"
52
- ]
53
- },
54
- "metadata": {},
55
- "output_type": "display_data"
56
- }
57
- ],
58
- "source": [
59
- "from huggingface_hub import notebook_login\n",
60
- "notebook_login()"
61
- ]
62
- },
63
- {
64
- "cell_type": "code",
65
- "execution_count": 2,
66
- "metadata": {},
67
- "outputs": [],
68
  "source": [
69
  "from dataclasses import dataclass\n",
70
  "import h5py\n",
@@ -94,7 +60,9 @@
94
  "from huggingface_hub import create_repo, upload_folder\n",
95
  "\n",
96
  "from load_h5 import Dataset4h5\n",
97
- "from context_unet import ContextUnet"
 
 
98
  ]
99
  },
100
  {
@@ -119,7 +87,7 @@
119
  },
120
  {
121
  "cell_type": "code",
122
- "execution_count": 3,
123
  "metadata": {},
124
  "outputs": [],
125
  "source": [
@@ -226,7 +194,7 @@
226
  },
227
  {
228
  "cell_type": "code",
229
- "execution_count": 4,
230
  "metadata": {},
231
  "outputs": [],
232
  "source": [
@@ -257,7 +225,7 @@
257
  },
258
  {
259
  "cell_type": "code",
260
- "execution_count": 5,
261
  "metadata": {},
262
  "outputs": [],
263
  "source": [
@@ -323,7 +291,7 @@
323
  },
324
  {
325
  "cell_type": "code",
326
- "execution_count": 6,
327
  "metadata": {},
328
  "outputs": [
329
  {
@@ -534,47 +502,23 @@
534
  },
535
  {
536
  "cell_type": "code",
537
- "execution_count": 7,
538
- "metadata": {},
539
- "outputs": [],
540
- "source": [
541
- "# ddpm21cm.train()"
542
- ]
543
- },
544
- {
545
- "cell_type": "code",
546
- "execution_count": 8,
547
- "metadata": {},
548
- "outputs": [],
549
- "source": [
550
- "# accelerator = Accelerator()\n",
551
- "# print(accelerator.process_index)\n",
552
- "# print(accelerator.is_local_main_process)"
553
- ]
554
- },
555
- {
556
- "cell_type": "code",
557
- "execution_count": 9,
558
- "metadata": {},
559
- "outputs": [],
560
- "source": [
561
- "# ddpm21cm.sample(\"./outputs/model_state_09.pth\")"
562
- ]
563
- },
564
- {
565
- "cell_type": "code",
566
- "execution_count": 10,
567
- "metadata": {},
568
- "outputs": [],
569
- "source": [
570
- "# train_loop(config, nn_model, ddpm, optimizer, dataloader, lr_scheduler)"
571
- ]
572
- },
573
- {
574
- "cell_type": "code",
575
- "execution_count": 12,
576
  "metadata": {},
577
  "outputs": [
 
 
 
 
 
 
 
 
 
 
 
 
 
 
578
  {
579
  "name": "stdout",
580
  "output_type": "stream",
@@ -584,7 +528,8 @@
584
  "51200 images can be loaded\n",
585
  "field.shape = (64, 64, 514)\n",
586
  "params keys = [b'ION_Tvir_MIN', b'HII_EFF_FACTOR']\n",
587
- "loading 40 images randomly\n"
 
588
  ]
589
  },
590
  {
@@ -598,16 +543,15 @@
598
  "name": "stdout",
599
  "output_type": "stream",
600
  "text": [
601
- "images loaded: (40, 1, 64, 512)\n",
602
  "params loaded: (40, 2)\n",
603
- "images rescaled to [-1.0, 1.0893712043762207]\n",
604
- "params rescaled to [0.0, 0.9982320250627095]\n"
605
  ]
606
  },
607
  {
608
  "data": {
609
  "application/vnd.jupyter.widget-view+json": {
610
- "model_id": "23109fb8b689459b8aa9abc2a79a12c6",
611
  "version_major": 2,
612
  "version_minor": 0
613
  },
@@ -621,7 +565,7 @@
621
  {
622
  "data": {
623
  "application/vnd.jupyter.widget-view+json": {
624
- "model_id": "f6ce395ecf1c4d0fb246be2ca50bcb93",
625
  "version_major": 2,
626
  "version_minor": 0
627
  },
@@ -635,7 +579,7 @@
635
  {
636
  "data": {
637
  "application/vnd.jupyter.widget-view+json": {
638
- "model_id": "890d3e8151ba4b8da4b4965b394e0cfa",
639
  "version_major": 2,
640
  "version_minor": 0
641
  },
@@ -649,7 +593,7 @@
649
  {
650
  "data": {
651
  "application/vnd.jupyter.widget-view+json": {
652
- "model_id": "7fcb8456340d43f792bdd763b879b928",
653
  "version_major": 2,
654
  "version_minor": 0
655
  },
@@ -663,7 +607,7 @@
663
  {
664
  "data": {
665
  "application/vnd.jupyter.widget-view+json": {
666
- "model_id": "e6c559c766664a25bace866425de4213",
667
  "version_major": 2,
668
  "version_minor": 0
669
  },
@@ -677,7 +621,7 @@
677
  {
678
  "data": {
679
  "application/vnd.jupyter.widget-view+json": {
680
- "model_id": "1ede5e1a69da4178979edf5573c37836",
681
  "version_major": 2,
682
  "version_minor": 0
683
  },
@@ -691,7 +635,7 @@
691
  {
692
  "data": {
693
  "application/vnd.jupyter.widget-view+json": {
694
- "model_id": "268a679be0054f72b5ae819ac164d31a",
695
  "version_major": 2,
696
  "version_minor": 0
697
  },
@@ -705,7 +649,7 @@
705
  {
706
  "data": {
707
  "application/vnd.jupyter.widget-view+json": {
708
- "model_id": "a3e3bcd371f94556bc22b7674ca9a36d",
709
  "version_major": 2,
710
  "version_minor": 0
711
  },
@@ -719,7 +663,7 @@
719
  {
720
  "data": {
721
  "application/vnd.jupyter.widget-view+json": {
722
- "model_id": "b4df815762bc44d3848e4ebc6064127b",
723
  "version_major": 2,
724
  "version_minor": 0
725
  },
@@ -733,7 +677,7 @@
733
  {
734
  "data": {
735
  "application/vnd.jupyter.widget-view+json": {
736
- "model_id": "b08a2dad7d5243daad336da4d779d55a",
737
  "version_major": 2,
738
  "version_minor": 0
739
  },
@@ -748,6 +692,7 @@
748
  "source": [
749
  "if __name__ == \"__main__\":\n",
750
  " # args = (config, nn_model, ddpm, optimizer, dataloader, lr_scheduler)\n",
 
751
  " notebook_launcher(ddpm21cm.train, num_processes=1)"
752
  ]
753
  },
@@ -947,13 +892,13 @@
947
  },
948
  {
949
  "cell_type": "code",
950
- "execution_count": null,
951
  "metadata": {},
952
  "outputs": [
953
  {
954
  "data": {
955
  "application/vnd.jupyter.widget-view+json": {
956
- "model_id": "167158bd7d494ee9a80d29d8f92a8a36",
957
  "version_major": 2,
958
  "version_minor": 0
959
  },
@@ -963,6 +908,38 @@
963
  },
964
  "metadata": {},
965
  "output_type": "display_data"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
966
  }
967
  ],
968
  "source": [
@@ -1056,7 +1033,7 @@
1056
  " ),\n",
1057
  ")\n",
1058
  "\n",
1059
- "noise_scheduler = DDPMScheduler(num_timesteps=1000)\n",
1060
  "\n",
1061
  "optimizer = torch.optim.AdamW(model.parameters(), lr=config.learning_rate)\n",
1062
  "lr_scheduler = get_cosine_schedule_with_warmup(\n",
@@ -1116,7 +1093,7 @@
1116
  " bs = clean_images.shape[0]\n",
1117
  "\n",
1118
  " timesteps = torch.randint(\n",
1119
- " 0, noise_scheduler.config.num_timesteps, (bs,), device=clean_images.device, dtype=torch.int64\n",
1120
  " )\n",
1121
  "\n",
1122
  " noisy_images = noise_scheduler.add_noise(clean_images, noise, timesteps)\n",
@@ -1160,7 +1137,7 @@
1160
  "# args = (config, model, noise_scheduler, optimizer, dataloader, lr_scheduler)\n",
1161
  "\n",
1162
  "device_count = torch.cuda.device_count()\n",
1163
- "notebook_launcher(train_loop, (config, model, noise_scheduler, optimizer, dataloader, lr_scheduler), num_processes=1)"
1164
  ]
1165
  },
1166
  {
@@ -1563,7 +1540,7 @@
1563
  " ),\n",
1564
  ")\n",
1565
  "\n",
1566
- "noise_scheduler = DDPMScheduler(num_timesteps=1000)\n",
1567
  "\n",
1568
  "optimizer = torch.optim.AdamW(model.parameters(), lr=config.learning_rate)\n",
1569
  "lr_scheduler = get_cosine_schedule_with_warmup(\n",
@@ -1623,7 +1600,7 @@
1623
  " bs = clean_images.shape[0]\n",
1624
  "\n",
1625
  " timesteps = torch.randint(\n",
1626
- " 0, noise_scheduler.config.num_timesteps, (bs,), device=clean_images.device, dtype=torch.int64\n",
1627
  " )\n",
1628
  "\n",
1629
  " noisy_images = noise_scheduler.add_noise(clean_images, noise, timesteps)\n",
 
22
  "- 融合cond, guide_w, drop_out這些參數\n",
23
  "- 生成的21cm圖像該暗的地方不夠暗,似乎換成MNIST的數字圖像就沒問題\n",
24
  "- 我用diffusion模型生成MNIST的數字時發現,儘管生成的數據的範圍也存在負數數值,如-0.1,但畫出來的圖像卻是理想的黑色。數據的分佈與21cm的結果的分佈沒多大差別,我現在打算把代碼退回到21cm的情形\n",
25
+ "- 我統一了ddpm21cm這個module,能統一實現訓練和生成樣本,但目前有個bug, sample時總是會cuda out of memory,然而單獨resume model並sample就不會。\n",
26
+ "- 解決了,問題出在我忘了寫with torch.no_grad():"
27
  ]
28
  },
29
  {
 
31
  "execution_count": 1,
32
  "metadata": {},
33
  "outputs": [],
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
34
  "source": [
35
  "from dataclasses import dataclass\n",
36
  "import h5py\n",
 
60
  "from huggingface_hub import create_repo, upload_folder\n",
61
  "\n",
62
  "from load_h5 import Dataset4h5\n",
63
+ "from context_unet import ContextUnet\n",
64
+ "\n",
65
+ "from huggingface_hub import notebook_login"
66
  ]
67
  },
68
  {
 
87
  },
88
  {
89
  "cell_type": "code",
90
+ "execution_count": 2,
91
  "metadata": {},
92
  "outputs": [],
93
  "source": [
 
194
  },
195
  {
196
  "cell_type": "code",
197
+ "execution_count": 3,
198
  "metadata": {},
199
  "outputs": [],
200
  "source": [
 
225
  },
226
  {
227
  "cell_type": "code",
228
+ "execution_count": 4,
229
  "metadata": {},
230
  "outputs": [],
231
  "source": [
 
291
  },
292
  {
293
  "cell_type": "code",
294
+ "execution_count": 5,
295
  "metadata": {},
296
  "outputs": [
297
  {
 
502
  },
503
  {
504
  "cell_type": "code",
505
+ "execution_count": 6,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
506
  "metadata": {},
507
  "outputs": [
508
+ {
509
+ "data": {
510
+ "application/vnd.jupyter.widget-view+json": {
511
+ "model_id": "d488e670a37b408399687972aa7fef8a",
512
+ "version_major": 2,
513
+ "version_minor": 0
514
+ },
515
+ "text/plain": [
516
+ "VBox(children=(HTML(value='<center> <img\\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…"
517
+ ]
518
+ },
519
+ "metadata": {},
520
+ "output_type": "display_data"
521
+ },
522
  {
523
  "name": "stdout",
524
  "output_type": "stream",
 
528
  "51200 images can be loaded\n",
529
  "field.shape = (64, 64, 514)\n",
530
  "params keys = [b'ION_Tvir_MIN', b'HII_EFF_FACTOR']\n",
531
+ "loading 40 images randomly\n",
532
+ "images loaded: (40, 1, 64, 512)\n"
533
  ]
534
  },
535
  {
 
543
  "name": "stdout",
544
  "output_type": "stream",
545
  "text": [
 
546
  "params loaded: (40, 2)\n",
547
+ "images rescaled to [-1.0, 1.1514630317687988]\n",
548
+ "params rescaled to [0.0, 0.9958124549229699]\n"
549
  ]
550
  },
551
  {
552
  "data": {
553
  "application/vnd.jupyter.widget-view+json": {
554
+ "model_id": "09696eb258f94136b32dec3290034ce2",
555
  "version_major": 2,
556
  "version_minor": 0
557
  },
 
565
  {
566
  "data": {
567
  "application/vnd.jupyter.widget-view+json": {
568
+ "model_id": "77a8a7ad49d84bdcbfc522069118b222",
569
  "version_major": 2,
570
  "version_minor": 0
571
  },
 
579
  {
580
  "data": {
581
  "application/vnd.jupyter.widget-view+json": {
582
+ "model_id": "28a00d5f8fdc472eae5b252888410421",
583
  "version_major": 2,
584
  "version_minor": 0
585
  },
 
593
  {
594
  "data": {
595
  "application/vnd.jupyter.widget-view+json": {
596
+ "model_id": "f145f20dab944419a730632a9d5081fc",
597
  "version_major": 2,
598
  "version_minor": 0
599
  },
 
607
  {
608
  "data": {
609
  "application/vnd.jupyter.widget-view+json": {
610
+ "model_id": "de801afbf6aa428b9d21bf6a76a28ee8",
611
  "version_major": 2,
612
  "version_minor": 0
613
  },
 
621
  {
622
  "data": {
623
  "application/vnd.jupyter.widget-view+json": {
624
+ "model_id": "1e2c7ec7c4df438c93f21b420e93aaf3",
625
  "version_major": 2,
626
  "version_minor": 0
627
  },
 
635
  {
636
  "data": {
637
  "application/vnd.jupyter.widget-view+json": {
638
+ "model_id": "e963fcaf27ae401e926053f9f9c146f4",
639
  "version_major": 2,
640
  "version_minor": 0
641
  },
 
649
  {
650
  "data": {
651
  "application/vnd.jupyter.widget-view+json": {
652
+ "model_id": "d6fb1a76c93b45e59fa7d253ef344bda",
653
  "version_major": 2,
654
  "version_minor": 0
655
  },
 
663
  {
664
  "data": {
665
  "application/vnd.jupyter.widget-view+json": {
666
+ "model_id": "f0d88179dc7d42a793a820f4d1a08da3",
667
  "version_major": 2,
668
  "version_minor": 0
669
  },
 
677
  {
678
  "data": {
679
  "application/vnd.jupyter.widget-view+json": {
680
+ "model_id": "c8caffa8edc6457e97c2d315252a5adf",
681
  "version_major": 2,
682
  "version_minor": 0
683
  },
 
692
  "source": [
693
  "if __name__ == \"__main__\":\n",
694
  " # args = (config, nn_model, ddpm, optimizer, dataloader, lr_scheduler)\n",
695
+ " notebook_login()\n",
696
  " notebook_launcher(ddpm21cm.train, num_processes=1)"
697
  ]
698
  },
 
892
  },
893
  {
894
  "cell_type": "code",
895
+ "execution_count": 1,
896
  "metadata": {},
897
  "outputs": [
898
  {
899
  "data": {
900
  "application/vnd.jupyter.widget-view+json": {
901
+ "model_id": "f840b4efaf0a413394f033e5ffa3d2f4",
902
  "version_major": 2,
903
  "version_minor": 0
904
  },
 
908
  },
909
  "metadata": {},
910
  "output_type": "display_data"
911
+ },
912
+ {
913
+ "name": "stderr",
914
+ "output_type": "stream",
915
+ "text": [
916
+ "Repo card metadata block was not found. Setting CardData to empty.\n"
917
+ ]
918
+ },
919
+ {
920
+ "name": "stdout",
921
+ "output_type": "stream",
922
+ "text": [
923
+ "Launching training on 2 GPUs.\n"
924
+ ]
925
+ },
926
+ {
927
+ "ename": "RuntimeError",
928
+ "evalue": "CUDA has been initialized before the `notebook_launcher` could create a forked subprocess. This likely stems from an outside import causing issues once the `notebook_launcher()` is called. Please review your imports and test them when running the `notebook_launcher()` to identify which one is problematic and causing CUDA to be initialized.",
929
+ "output_type": "error",
930
+ "traceback": [
931
+ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
932
+ "\u001b[0;31mProcessRaisedException\u001b[0m Traceback (most recent call last)",
933
+ "File \u001b[0;32m~/.conda/envs/diffusers/lib/python3.9/site-packages/accelerate/launchers.py:200\u001b[0m, in \u001b[0;36mnotebook_launcher\u001b[0;34m(function, args, num_processes, mixed_precision, use_port, master_addr, node_rank, num_nodes)\u001b[0m\n\u001b[1;32m 199\u001b[0m \u001b[39mtry\u001b[39;00m:\n\u001b[0;32m--> 200\u001b[0m start_processes(launcher, args\u001b[39m=\u001b[39;49margs, nprocs\u001b[39m=\u001b[39;49mnum_processes, start_method\u001b[39m=\u001b[39;49m\u001b[39m\"\u001b[39;49m\u001b[39mfork\u001b[39;49m\u001b[39m\"\u001b[39;49m)\n\u001b[1;32m 201\u001b[0m \u001b[39mexcept\u001b[39;00m ProcessRaisedException \u001b[39mas\u001b[39;00m e:\n",
934
+ "File \u001b[0;32m/usr/local/pace-apps/manual/packages/pytorch/1.12.0/lib/python3.9/site-packages/torch/multiprocessing/spawn.py:198\u001b[0m, in \u001b[0;36mstart_processes\u001b[0;34m(fn, args, nprocs, join, daemon, start_method)\u001b[0m\n\u001b[1;32m 197\u001b[0m \u001b[39m# Loop on join until it returns True or raises an exception.\u001b[39;00m\n\u001b[0;32m--> 198\u001b[0m \u001b[39mwhile\u001b[39;00m \u001b[39mnot\u001b[39;00m context\u001b[39m.\u001b[39;49mjoin():\n\u001b[1;32m 199\u001b[0m \u001b[39mpass\u001b[39;00m\n",
935
+ "File \u001b[0;32m/usr/local/pace-apps/manual/packages/pytorch/1.12.0/lib/python3.9/site-packages/torch/multiprocessing/spawn.py:160\u001b[0m, in \u001b[0;36mProcessContext.join\u001b[0;34m(self, timeout)\u001b[0m\n\u001b[1;32m 159\u001b[0m msg \u001b[39m+\u001b[39m\u001b[39m=\u001b[39m original_trace\n\u001b[0;32m--> 160\u001b[0m \u001b[39mraise\u001b[39;00m ProcessRaisedException(msg, error_index, failed_process\u001b[39m.\u001b[39mpid)\n",
936
+ "\u001b[0;31mProcessRaisedException\u001b[0m: \n\n-- Process 1 terminated with the following error:\nTraceback (most recent call last):\n File \"/usr/local/pace-apps/manual/packages/pytorch/1.12.0/lib/python3.9/site-packages/torch/multiprocessing/spawn.py\", line 69, in _wrap\n fn(i, *args)\n File \"/storage/home/hcoda1/3/bxia34/.conda/envs/diffusers/lib/python3.9/site-packages/accelerate/utils/launch.py\", line 608, in __call__\n self.launcher(*args)\n File \"/scratch/166867/ipykernel_204345/1749266112.py\", line 117, in train_loop\n accelerator = Accelerator(\n File \"/storage/home/hcoda1/3/bxia34/.conda/envs/diffusers/lib/python3.9/site-packages/accelerate/accelerator.py\", line 371, in __init__\n self.state = AcceleratorState(\n File \"/storage/home/hcoda1/3/bxia34/.conda/envs/diffusers/lib/python3.9/site-packages/accelerate/state.py\", line 777, in __init__\n PartialState(cpu, **kwargs)\n File \"/storage/home/hcoda1/3/bxia34/.conda/envs/diffusers/lib/python3.9/site-packages/accelerate/state.py\", line 240, in __init__\n torch.cuda.set_device(self.device)\n File \"/usr/local/pace-apps/manual/packages/pytorch/1.12.0/lib/python3.9/site-packages/torch/cuda/__init__.py\", line 314, in set_device\n torch._C._cuda_setDevice(device)\n File \"/usr/local/pace-apps/manual/packages/pytorch/1.12.0/lib/python3.9/site-packages/torch/cuda/__init__.py\", line 207, in _lazy_init\n raise RuntimeError(\nRuntimeError: Cannot re-initialize CUDA in forked subprocess. To use CUDA with multiprocessing, you must use the 'spawn' start method\n",
937
+ "\nThe above exception was the direct cause of the following exception:\n",
938
+ "\u001b[0;31mRuntimeError\u001b[0m Traceback (most recent call last)",
939
+ "Cell \u001b[0;32mIn[1], line 195\u001b[0m\n\u001b[1;32m 192\u001b[0m \u001b[39m# args = (config, model, noise_scheduler, optimizer, dataloader, lr_scheduler)\u001b[39;00m\n\u001b[1;32m 194\u001b[0m device_count \u001b[39m=\u001b[39m torch\u001b[39m.\u001b[39mcuda\u001b[39m.\u001b[39mdevice_count()\n\u001b[0;32m--> 195\u001b[0m notebook_launcher(train_loop, (config, model, noise_scheduler, optimizer, dataloader, lr_scheduler), num_processes\u001b[39m=\u001b[39;49m\u001b[39m2\u001b[39;49m)\n",
940
+ "File \u001b[0;32m~/.conda/envs/diffusers/lib/python3.9/site-packages/accelerate/launchers.py:203\u001b[0m, in \u001b[0;36mnotebook_launcher\u001b[0;34m(function, args, num_processes, mixed_precision, use_port, master_addr, node_rank, num_nodes)\u001b[0m\n\u001b[1;32m 201\u001b[0m \u001b[39mexcept\u001b[39;00m ProcessRaisedException \u001b[39mas\u001b[39;00m e:\n\u001b[1;32m 202\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39m\"\u001b[39m\u001b[39mCannot re-initialize CUDA in forked subprocess\u001b[39m\u001b[39m\"\u001b[39m \u001b[39min\u001b[39;00m e\u001b[39m.\u001b[39margs[\u001b[39m0\u001b[39m]:\n\u001b[0;32m--> 203\u001b[0m \u001b[39mraise\u001b[39;00m \u001b[39mRuntimeError\u001b[39;00m(\n\u001b[1;32m 204\u001b[0m \u001b[39m\"\u001b[39m\u001b[39mCUDA has been initialized before the `notebook_launcher` could create a forked subprocess. \u001b[39m\u001b[39m\"\u001b[39m\n\u001b[1;32m 205\u001b[0m \u001b[39m\"\u001b[39m\u001b[39mThis likely stems from an outside import causing issues once the `notebook_launcher()` is called. \u001b[39m\u001b[39m\"\u001b[39m\n\u001b[1;32m 206\u001b[0m \u001b[39m\"\u001b[39m\u001b[39mPlease review your imports and test them when running the `notebook_launcher()` to identify \u001b[39m\u001b[39m\"\u001b[39m\n\u001b[1;32m 207\u001b[0m \u001b[39m\"\u001b[39m\u001b[39mwhich one is problematic and causing CUDA to be initialized.\u001b[39m\u001b[39m\"\u001b[39m\n\u001b[1;32m 208\u001b[0m ) \u001b[39mfrom\u001b[39;00m \u001b[39me\u001b[39;00m\n\u001b[1;32m 209\u001b[0m \u001b[39melse\u001b[39;00m:\n\u001b[1;32m 210\u001b[0m \u001b[39mraise\u001b[39;00m \u001b[39mRuntimeError\u001b[39;00m(\u001b[39mf\u001b[39m\u001b[39m\"\u001b[39m\u001b[39mAn issue was found when launching the training: \u001b[39m\u001b[39m{\u001b[39;00me\u001b[39m}\u001b[39;00m\u001b[39m\"\u001b[39m) \u001b[39mfrom\u001b[39;00m \u001b[39me\u001b[39;00m\n",
941
+ "\u001b[0;31mRuntimeError\u001b[0m: CUDA has been initialized before the `notebook_launcher` could create a forked subprocess. This likely stems from an outside import causing issues once the `notebook_launcher()` is called. Please review your imports and test them when running the `notebook_launcher()` to identify which one is problematic and causing CUDA to be initialized."
942
+ ]
943
  }
944
  ],
945
  "source": [
 
1033
  " ),\n",
1034
  ")\n",
1035
  "\n",
1036
+ "noise_scheduler = DDPMScheduler(num_train_timesteps=1000)\n",
1037
  "\n",
1038
  "optimizer = torch.optim.AdamW(model.parameters(), lr=config.learning_rate)\n",
1039
  "lr_scheduler = get_cosine_schedule_with_warmup(\n",
 
1093
  " bs = clean_images.shape[0]\n",
1094
  "\n",
1095
  " timesteps = torch.randint(\n",
1096
+ " 0, noise_scheduler.config.num_train_timesteps, (bs,), device=clean_images.device, dtype=torch.int64\n",
1097
  " )\n",
1098
  "\n",
1099
  " noisy_images = noise_scheduler.add_noise(clean_images, noise, timesteps)\n",
 
1137
  "# args = (config, model, noise_scheduler, optimizer, dataloader, lr_scheduler)\n",
1138
  "\n",
1139
  "device_count = torch.cuda.device_count()\n",
1140
+ "notebook_launcher(train_loop, (config, model, noise_scheduler, optimizer, dataloader, lr_scheduler), num_processes=2)"
1141
  ]
1142
  },
1143
  {
 
1540
  " ),\n",
1541
  ")\n",
1542
  "\n",
1543
+ "noise_scheduler = DDPMScheduler(num_train_timesteps=1000)\n",
1544
  "\n",
1545
  "optimizer = torch.optim.AdamW(model.parameters(), lr=config.learning_rate)\n",
1546
  "lr_scheduler = get_cosine_schedule_with_warmup(\n",
 
1600
  " bs = clean_images.shape[0]\n",
1601
  "\n",
1602
  " timesteps = torch.randint(\n",
1603
+ " 0, noise_scheduler.config.num_train_timesteps, (bs,), device=clean_images.device, dtype=torch.int64\n",
1604
  " )\n",
1605
  "\n",
1606
  " noisy_images = noise_scheduler.add_noise(clean_images, noise, timesteps)\n",