0523-0022
Browse files- ddpm-butterflies-128/logs/training_example/events.out.tfevents.1716433515.atl1-1-03-003-17-0.pace.gatech.edu.184517.0 +3 -0
- ddpm-butterflies-128/logs/training_example/events.out.tfevents.1716433608.atl1-1-03-003-17-0.pace.gatech.edu.186921.0 +3 -0
- ddpm-butterflies-128/logs/training_example/events.out.tfevents.1716433967.atl1-1-03-003-17-0.pace.gatech.edu.186921.1 +3 -0
- diffusion.ipynb +78 -101
ddpm-butterflies-128/logs/training_example/events.out.tfevents.1716433515.atl1-1-03-003-17-0.pace.gatech.edu.184517.0
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:17b83cfdb678438b3674bcb2f33bfdd0cfd1e4db880f88a14890f078174fd92d
|
| 3 |
+
size 10622
|
ddpm-butterflies-128/logs/training_example/events.out.tfevents.1716433608.atl1-1-03-003-17-0.pace.gatech.edu.186921.0
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:448aa0d8971ba332e9550522fda26b8d541d60cdf6d88ad7a3a541488c221e04
|
| 3 |
+
size 1818
|
ddpm-butterflies-128/logs/training_example/events.out.tfevents.1716433967.atl1-1-03-003-17-0.pace.gatech.edu.186921.1
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:53d62ea38646e54fa4ac8a309f2e2b5238ee541f20894781cce475a8b4dee901
|
| 3 |
+
size 47323
|
diffusion.ipynb
CHANGED
|
@@ -22,7 +22,8 @@
|
|
| 22 |
"- 融合cond, guide_w, drop_out這些參數\n",
|
| 23 |
"- 生成的21cm圖像該暗的地方不夠暗,似乎換成MNIST的數字圖像就沒問題\n",
|
| 24 |
"- 我用diffusion模型生成MNIST的數字時發現,儘管生成的數據的範圍也存在負數數值,如-0.1,但畫出來的圖像卻是理想的黑色。數據的分佈與21cm的結果的分佈沒多大差別,我現在打算把代碼退回到21cm的情形\n",
|
| 25 |
-
"- 我統一了ddpm21cm這個module,能統一實現訓練和生成樣本,但目前有個bug, sample時總是會cuda out of memory,然而單獨resume model並sample
|
|
|
|
| 26 |
]
|
| 27 |
},
|
| 28 |
{
|
|
@@ -30,41 +31,6 @@
|
|
| 30 |
"execution_count": 1,
|
| 31 |
"metadata": {},
|
| 32 |
"outputs": [],
|
| 33 |
-
"source": [
|
| 34 |
-
"# import multiprocessing as mp\n",
|
| 35 |
-
"# mp.set_start_method('spawn', force=True)"
|
| 36 |
-
]
|
| 37 |
-
},
|
| 38 |
-
{
|
| 39 |
-
"cell_type": "code",
|
| 40 |
-
"execution_count": 1,
|
| 41 |
-
"metadata": {},
|
| 42 |
-
"outputs": [
|
| 43 |
-
{
|
| 44 |
-
"data": {
|
| 45 |
-
"application/vnd.jupyter.widget-view+json": {
|
| 46 |
-
"model_id": "c2516f72a37e425e80638265a633c6cf",
|
| 47 |
-
"version_major": 2,
|
| 48 |
-
"version_minor": 0
|
| 49 |
-
},
|
| 50 |
-
"text/plain": [
|
| 51 |
-
"VBox(children=(HTML(value='<center> <img\\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…"
|
| 52 |
-
]
|
| 53 |
-
},
|
| 54 |
-
"metadata": {},
|
| 55 |
-
"output_type": "display_data"
|
| 56 |
-
}
|
| 57 |
-
],
|
| 58 |
-
"source": [
|
| 59 |
-
"from huggingface_hub import notebook_login\n",
|
| 60 |
-
"notebook_login()"
|
| 61 |
-
]
|
| 62 |
-
},
|
| 63 |
-
{
|
| 64 |
-
"cell_type": "code",
|
| 65 |
-
"execution_count": 2,
|
| 66 |
-
"metadata": {},
|
| 67 |
-
"outputs": [],
|
| 68 |
"source": [
|
| 69 |
"from dataclasses import dataclass\n",
|
| 70 |
"import h5py\n",
|
|
@@ -94,7 +60,9 @@
|
|
| 94 |
"from huggingface_hub import create_repo, upload_folder\n",
|
| 95 |
"\n",
|
| 96 |
"from load_h5 import Dataset4h5\n",
|
| 97 |
-
"from context_unet import ContextUnet"
|
|
|
|
|
|
|
| 98 |
]
|
| 99 |
},
|
| 100 |
{
|
|
@@ -119,7 +87,7 @@
|
|
| 119 |
},
|
| 120 |
{
|
| 121 |
"cell_type": "code",
|
| 122 |
-
"execution_count":
|
| 123 |
"metadata": {},
|
| 124 |
"outputs": [],
|
| 125 |
"source": [
|
|
@@ -226,7 +194,7 @@
|
|
| 226 |
},
|
| 227 |
{
|
| 228 |
"cell_type": "code",
|
| 229 |
-
"execution_count":
|
| 230 |
"metadata": {},
|
| 231 |
"outputs": [],
|
| 232 |
"source": [
|
|
@@ -257,7 +225,7 @@
|
|
| 257 |
},
|
| 258 |
{
|
| 259 |
"cell_type": "code",
|
| 260 |
-
"execution_count":
|
| 261 |
"metadata": {},
|
| 262 |
"outputs": [],
|
| 263 |
"source": [
|
|
@@ -323,7 +291,7 @@
|
|
| 323 |
},
|
| 324 |
{
|
| 325 |
"cell_type": "code",
|
| 326 |
-
"execution_count":
|
| 327 |
"metadata": {},
|
| 328 |
"outputs": [
|
| 329 |
{
|
|
@@ -534,47 +502,23 @@
|
|
| 534 |
},
|
| 535 |
{
|
| 536 |
"cell_type": "code",
|
| 537 |
-
"execution_count":
|
| 538 |
-
"metadata": {},
|
| 539 |
-
"outputs": [],
|
| 540 |
-
"source": [
|
| 541 |
-
"# ddpm21cm.train()"
|
| 542 |
-
]
|
| 543 |
-
},
|
| 544 |
-
{
|
| 545 |
-
"cell_type": "code",
|
| 546 |
-
"execution_count": 8,
|
| 547 |
-
"metadata": {},
|
| 548 |
-
"outputs": [],
|
| 549 |
-
"source": [
|
| 550 |
-
"# accelerator = Accelerator()\n",
|
| 551 |
-
"# print(accelerator.process_index)\n",
|
| 552 |
-
"# print(accelerator.is_local_main_process)"
|
| 553 |
-
]
|
| 554 |
-
},
|
| 555 |
-
{
|
| 556 |
-
"cell_type": "code",
|
| 557 |
-
"execution_count": 9,
|
| 558 |
-
"metadata": {},
|
| 559 |
-
"outputs": [],
|
| 560 |
-
"source": [
|
| 561 |
-
"# ddpm21cm.sample(\"./outputs/model_state_09.pth\")"
|
| 562 |
-
]
|
| 563 |
-
},
|
| 564 |
-
{
|
| 565 |
-
"cell_type": "code",
|
| 566 |
-
"execution_count": 10,
|
| 567 |
-
"metadata": {},
|
| 568 |
-
"outputs": [],
|
| 569 |
-
"source": [
|
| 570 |
-
"# train_loop(config, nn_model, ddpm, optimizer, dataloader, lr_scheduler)"
|
| 571 |
-
]
|
| 572 |
-
},
|
| 573 |
-
{
|
| 574 |
-
"cell_type": "code",
|
| 575 |
-
"execution_count": 12,
|
| 576 |
"metadata": {},
|
| 577 |
"outputs": [
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 578 |
{
|
| 579 |
"name": "stdout",
|
| 580 |
"output_type": "stream",
|
|
@@ -584,7 +528,8 @@
|
|
| 584 |
"51200 images can be loaded\n",
|
| 585 |
"field.shape = (64, 64, 514)\n",
|
| 586 |
"params keys = [b'ION_Tvir_MIN', b'HII_EFF_FACTOR']\n",
|
| 587 |
-
"loading 40 images randomly\n"
|
|
|
|
| 588 |
]
|
| 589 |
},
|
| 590 |
{
|
|
@@ -598,16 +543,15 @@
|
|
| 598 |
"name": "stdout",
|
| 599 |
"output_type": "stream",
|
| 600 |
"text": [
|
| 601 |
-
"images loaded: (40, 1, 64, 512)\n",
|
| 602 |
"params loaded: (40, 2)\n",
|
| 603 |
-
"images rescaled to [-1.0, 1.
|
| 604 |
-
"params rescaled to [0.0, 0.
|
| 605 |
]
|
| 606 |
},
|
| 607 |
{
|
| 608 |
"data": {
|
| 609 |
"application/vnd.jupyter.widget-view+json": {
|
| 610 |
-
"model_id": "
|
| 611 |
"version_major": 2,
|
| 612 |
"version_minor": 0
|
| 613 |
},
|
|
@@ -621,7 +565,7 @@
|
|
| 621 |
{
|
| 622 |
"data": {
|
| 623 |
"application/vnd.jupyter.widget-view+json": {
|
| 624 |
-
"model_id": "
|
| 625 |
"version_major": 2,
|
| 626 |
"version_minor": 0
|
| 627 |
},
|
|
@@ -635,7 +579,7 @@
|
|
| 635 |
{
|
| 636 |
"data": {
|
| 637 |
"application/vnd.jupyter.widget-view+json": {
|
| 638 |
-
"model_id": "
|
| 639 |
"version_major": 2,
|
| 640 |
"version_minor": 0
|
| 641 |
},
|
|
@@ -649,7 +593,7 @@
|
|
| 649 |
{
|
| 650 |
"data": {
|
| 651 |
"application/vnd.jupyter.widget-view+json": {
|
| 652 |
-
"model_id": "
|
| 653 |
"version_major": 2,
|
| 654 |
"version_minor": 0
|
| 655 |
},
|
|
@@ -663,7 +607,7 @@
|
|
| 663 |
{
|
| 664 |
"data": {
|
| 665 |
"application/vnd.jupyter.widget-view+json": {
|
| 666 |
-
"model_id": "
|
| 667 |
"version_major": 2,
|
| 668 |
"version_minor": 0
|
| 669 |
},
|
|
@@ -677,7 +621,7 @@
|
|
| 677 |
{
|
| 678 |
"data": {
|
| 679 |
"application/vnd.jupyter.widget-view+json": {
|
| 680 |
-
"model_id": "
|
| 681 |
"version_major": 2,
|
| 682 |
"version_minor": 0
|
| 683 |
},
|
|
@@ -691,7 +635,7 @@
|
|
| 691 |
{
|
| 692 |
"data": {
|
| 693 |
"application/vnd.jupyter.widget-view+json": {
|
| 694 |
-
"model_id": "
|
| 695 |
"version_major": 2,
|
| 696 |
"version_minor": 0
|
| 697 |
},
|
|
@@ -705,7 +649,7 @@
|
|
| 705 |
{
|
| 706 |
"data": {
|
| 707 |
"application/vnd.jupyter.widget-view+json": {
|
| 708 |
-
"model_id": "
|
| 709 |
"version_major": 2,
|
| 710 |
"version_minor": 0
|
| 711 |
},
|
|
@@ -719,7 +663,7 @@
|
|
| 719 |
{
|
| 720 |
"data": {
|
| 721 |
"application/vnd.jupyter.widget-view+json": {
|
| 722 |
-
"model_id": "
|
| 723 |
"version_major": 2,
|
| 724 |
"version_minor": 0
|
| 725 |
},
|
|
@@ -733,7 +677,7 @@
|
|
| 733 |
{
|
| 734 |
"data": {
|
| 735 |
"application/vnd.jupyter.widget-view+json": {
|
| 736 |
-
"model_id": "
|
| 737 |
"version_major": 2,
|
| 738 |
"version_minor": 0
|
| 739 |
},
|
|
@@ -748,6 +692,7 @@
|
|
| 748 |
"source": [
|
| 749 |
"if __name__ == \"__main__\":\n",
|
| 750 |
" # args = (config, nn_model, ddpm, optimizer, dataloader, lr_scheduler)\n",
|
|
|
|
| 751 |
" notebook_launcher(ddpm21cm.train, num_processes=1)"
|
| 752 |
]
|
| 753 |
},
|
|
@@ -947,13 +892,13 @@
|
|
| 947 |
},
|
| 948 |
{
|
| 949 |
"cell_type": "code",
|
| 950 |
-
"execution_count":
|
| 951 |
"metadata": {},
|
| 952 |
"outputs": [
|
| 953 |
{
|
| 954 |
"data": {
|
| 955 |
"application/vnd.jupyter.widget-view+json": {
|
| 956 |
-
"model_id": "
|
| 957 |
"version_major": 2,
|
| 958 |
"version_minor": 0
|
| 959 |
},
|
|
@@ -963,6 +908,38 @@
|
|
| 963 |
},
|
| 964 |
"metadata": {},
|
| 965 |
"output_type": "display_data"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 966 |
}
|
| 967 |
],
|
| 968 |
"source": [
|
|
@@ -1056,7 +1033,7 @@
|
|
| 1056 |
" ),\n",
|
| 1057 |
")\n",
|
| 1058 |
"\n",
|
| 1059 |
-
"noise_scheduler = DDPMScheduler(
|
| 1060 |
"\n",
|
| 1061 |
"optimizer = torch.optim.AdamW(model.parameters(), lr=config.learning_rate)\n",
|
| 1062 |
"lr_scheduler = get_cosine_schedule_with_warmup(\n",
|
|
@@ -1116,7 +1093,7 @@
|
|
| 1116 |
" bs = clean_images.shape[0]\n",
|
| 1117 |
"\n",
|
| 1118 |
" timesteps = torch.randint(\n",
|
| 1119 |
-
" 0, noise_scheduler.config.
|
| 1120 |
" )\n",
|
| 1121 |
"\n",
|
| 1122 |
" noisy_images = noise_scheduler.add_noise(clean_images, noise, timesteps)\n",
|
|
@@ -1160,7 +1137,7 @@
|
|
| 1160 |
"# args = (config, model, noise_scheduler, optimizer, dataloader, lr_scheduler)\n",
|
| 1161 |
"\n",
|
| 1162 |
"device_count = torch.cuda.device_count()\n",
|
| 1163 |
-
"notebook_launcher(train_loop, (config, model, noise_scheduler, optimizer, dataloader, lr_scheduler), num_processes=
|
| 1164 |
]
|
| 1165 |
},
|
| 1166 |
{
|
|
@@ -1563,7 +1540,7 @@
|
|
| 1563 |
" ),\n",
|
| 1564 |
")\n",
|
| 1565 |
"\n",
|
| 1566 |
-
"noise_scheduler = DDPMScheduler(
|
| 1567 |
"\n",
|
| 1568 |
"optimizer = torch.optim.AdamW(model.parameters(), lr=config.learning_rate)\n",
|
| 1569 |
"lr_scheduler = get_cosine_schedule_with_warmup(\n",
|
|
@@ -1623,7 +1600,7 @@
|
|
| 1623 |
" bs = clean_images.shape[0]\n",
|
| 1624 |
"\n",
|
| 1625 |
" timesteps = torch.randint(\n",
|
| 1626 |
-
" 0, noise_scheduler.config.
|
| 1627 |
" )\n",
|
| 1628 |
"\n",
|
| 1629 |
" noisy_images = noise_scheduler.add_noise(clean_images, noise, timesteps)\n",
|
|
|
|
| 22 |
"- 融合cond, guide_w, drop_out這些參數\n",
|
| 23 |
"- 生成的21cm圖像該暗的地方不夠暗,似乎換成MNIST的數字圖像就沒問題\n",
|
| 24 |
"- 我用diffusion模型生成MNIST的數字時發現,儘管生成的數據的範圍也存在負數數值,如-0.1,但畫出來的圖像卻是理想的黑色。數據的分佈與21cm的結果的分佈沒多大差別,我現在打算把代碼退回到21cm的情形\n",
|
| 25 |
+
"- 我統一了ddpm21cm這個module,能統一實現訓練和生成樣本,但目前有個bug, sample時總是會cuda out of memory,然而單獨resume model並sample就不會。\n",
|
| 26 |
+
"- 解決了,問題出在我忘了寫with torch.no_grad():"
|
| 27 |
]
|
| 28 |
},
|
| 29 |
{
|
|
|
|
| 31 |
"execution_count": 1,
|
| 32 |
"metadata": {},
|
| 33 |
"outputs": [],
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 34 |
"source": [
|
| 35 |
"from dataclasses import dataclass\n",
|
| 36 |
"import h5py\n",
|
|
|
|
| 60 |
"from huggingface_hub import create_repo, upload_folder\n",
|
| 61 |
"\n",
|
| 62 |
"from load_h5 import Dataset4h5\n",
|
| 63 |
+
"from context_unet import ContextUnet\n",
|
| 64 |
+
"\n",
|
| 65 |
+
"from huggingface_hub import notebook_login"
|
| 66 |
]
|
| 67 |
},
|
| 68 |
{
|
|
|
|
| 87 |
},
|
| 88 |
{
|
| 89 |
"cell_type": "code",
|
| 90 |
+
"execution_count": 2,
|
| 91 |
"metadata": {},
|
| 92 |
"outputs": [],
|
| 93 |
"source": [
|
|
|
|
| 194 |
},
|
| 195 |
{
|
| 196 |
"cell_type": "code",
|
| 197 |
+
"execution_count": 3,
|
| 198 |
"metadata": {},
|
| 199 |
"outputs": [],
|
| 200 |
"source": [
|
|
|
|
| 225 |
},
|
| 226 |
{
|
| 227 |
"cell_type": "code",
|
| 228 |
+
"execution_count": 4,
|
| 229 |
"metadata": {},
|
| 230 |
"outputs": [],
|
| 231 |
"source": [
|
|
|
|
| 291 |
},
|
| 292 |
{
|
| 293 |
"cell_type": "code",
|
| 294 |
+
"execution_count": 5,
|
| 295 |
"metadata": {},
|
| 296 |
"outputs": [
|
| 297 |
{
|
|
|
|
| 502 |
},
|
| 503 |
{
|
| 504 |
"cell_type": "code",
|
| 505 |
+
"execution_count": 6,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 506 |
"metadata": {},
|
| 507 |
"outputs": [
|
| 508 |
+
{
|
| 509 |
+
"data": {
|
| 510 |
+
"application/vnd.jupyter.widget-view+json": {
|
| 511 |
+
"model_id": "d488e670a37b408399687972aa7fef8a",
|
| 512 |
+
"version_major": 2,
|
| 513 |
+
"version_minor": 0
|
| 514 |
+
},
|
| 515 |
+
"text/plain": [
|
| 516 |
+
"VBox(children=(HTML(value='<center> <img\\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…"
|
| 517 |
+
]
|
| 518 |
+
},
|
| 519 |
+
"metadata": {},
|
| 520 |
+
"output_type": "display_data"
|
| 521 |
+
},
|
| 522 |
{
|
| 523 |
"name": "stdout",
|
| 524 |
"output_type": "stream",
|
|
|
|
| 528 |
"51200 images can be loaded\n",
|
| 529 |
"field.shape = (64, 64, 514)\n",
|
| 530 |
"params keys = [b'ION_Tvir_MIN', b'HII_EFF_FACTOR']\n",
|
| 531 |
+
"loading 40 images randomly\n",
|
| 532 |
+
"images loaded: (40, 1, 64, 512)\n"
|
| 533 |
]
|
| 534 |
},
|
| 535 |
{
|
|
|
|
| 543 |
"name": "stdout",
|
| 544 |
"output_type": "stream",
|
| 545 |
"text": [
|
|
|
|
| 546 |
"params loaded: (40, 2)\n",
|
| 547 |
+
"images rescaled to [-1.0, 1.1514630317687988]\n",
|
| 548 |
+
"params rescaled to [0.0, 0.9958124549229699]\n"
|
| 549 |
]
|
| 550 |
},
|
| 551 |
{
|
| 552 |
"data": {
|
| 553 |
"application/vnd.jupyter.widget-view+json": {
|
| 554 |
+
"model_id": "09696eb258f94136b32dec3290034ce2",
|
| 555 |
"version_major": 2,
|
| 556 |
"version_minor": 0
|
| 557 |
},
|
|
|
|
| 565 |
{
|
| 566 |
"data": {
|
| 567 |
"application/vnd.jupyter.widget-view+json": {
|
| 568 |
+
"model_id": "77a8a7ad49d84bdcbfc522069118b222",
|
| 569 |
"version_major": 2,
|
| 570 |
"version_minor": 0
|
| 571 |
},
|
|
|
|
| 579 |
{
|
| 580 |
"data": {
|
| 581 |
"application/vnd.jupyter.widget-view+json": {
|
| 582 |
+
"model_id": "28a00d5f8fdc472eae5b252888410421",
|
| 583 |
"version_major": 2,
|
| 584 |
"version_minor": 0
|
| 585 |
},
|
|
|
|
| 593 |
{
|
| 594 |
"data": {
|
| 595 |
"application/vnd.jupyter.widget-view+json": {
|
| 596 |
+
"model_id": "f145f20dab944419a730632a9d5081fc",
|
| 597 |
"version_major": 2,
|
| 598 |
"version_minor": 0
|
| 599 |
},
|
|
|
|
| 607 |
{
|
| 608 |
"data": {
|
| 609 |
"application/vnd.jupyter.widget-view+json": {
|
| 610 |
+
"model_id": "de801afbf6aa428b9d21bf6a76a28ee8",
|
| 611 |
"version_major": 2,
|
| 612 |
"version_minor": 0
|
| 613 |
},
|
|
|
|
| 621 |
{
|
| 622 |
"data": {
|
| 623 |
"application/vnd.jupyter.widget-view+json": {
|
| 624 |
+
"model_id": "1e2c7ec7c4df438c93f21b420e93aaf3",
|
| 625 |
"version_major": 2,
|
| 626 |
"version_minor": 0
|
| 627 |
},
|
|
|
|
| 635 |
{
|
| 636 |
"data": {
|
| 637 |
"application/vnd.jupyter.widget-view+json": {
|
| 638 |
+
"model_id": "e963fcaf27ae401e926053f9f9c146f4",
|
| 639 |
"version_major": 2,
|
| 640 |
"version_minor": 0
|
| 641 |
},
|
|
|
|
| 649 |
{
|
| 650 |
"data": {
|
| 651 |
"application/vnd.jupyter.widget-view+json": {
|
| 652 |
+
"model_id": "d6fb1a76c93b45e59fa7d253ef344bda",
|
| 653 |
"version_major": 2,
|
| 654 |
"version_minor": 0
|
| 655 |
},
|
|
|
|
| 663 |
{
|
| 664 |
"data": {
|
| 665 |
"application/vnd.jupyter.widget-view+json": {
|
| 666 |
+
"model_id": "f0d88179dc7d42a793a820f4d1a08da3",
|
| 667 |
"version_major": 2,
|
| 668 |
"version_minor": 0
|
| 669 |
},
|
|
|
|
| 677 |
{
|
| 678 |
"data": {
|
| 679 |
"application/vnd.jupyter.widget-view+json": {
|
| 680 |
+
"model_id": "c8caffa8edc6457e97c2d315252a5adf",
|
| 681 |
"version_major": 2,
|
| 682 |
"version_minor": 0
|
| 683 |
},
|
|
|
|
| 692 |
"source": [
|
| 693 |
"if __name__ == \"__main__\":\n",
|
| 694 |
" # args = (config, nn_model, ddpm, optimizer, dataloader, lr_scheduler)\n",
|
| 695 |
+
" notebook_login()\n",
|
| 696 |
" notebook_launcher(ddpm21cm.train, num_processes=1)"
|
| 697 |
]
|
| 698 |
},
|
|
|
|
| 892 |
},
|
| 893 |
{
|
| 894 |
"cell_type": "code",
|
| 895 |
+
"execution_count": 1,
|
| 896 |
"metadata": {},
|
| 897 |
"outputs": [
|
| 898 |
{
|
| 899 |
"data": {
|
| 900 |
"application/vnd.jupyter.widget-view+json": {
|
| 901 |
+
"model_id": "f840b4efaf0a413394f033e5ffa3d2f4",
|
| 902 |
"version_major": 2,
|
| 903 |
"version_minor": 0
|
| 904 |
},
|
|
|
|
| 908 |
},
|
| 909 |
"metadata": {},
|
| 910 |
"output_type": "display_data"
|
| 911 |
+
},
|
| 912 |
+
{
|
| 913 |
+
"name": "stderr",
|
| 914 |
+
"output_type": "stream",
|
| 915 |
+
"text": [
|
| 916 |
+
"Repo card metadata block was not found. Setting CardData to empty.\n"
|
| 917 |
+
]
|
| 918 |
+
},
|
| 919 |
+
{
|
| 920 |
+
"name": "stdout",
|
| 921 |
+
"output_type": "stream",
|
| 922 |
+
"text": [
|
| 923 |
+
"Launching training on 2 GPUs.\n"
|
| 924 |
+
]
|
| 925 |
+
},
|
| 926 |
+
{
|
| 927 |
+
"ename": "RuntimeError",
|
| 928 |
+
"evalue": "CUDA has been initialized before the `notebook_launcher` could create a forked subprocess. This likely stems from an outside import causing issues once the `notebook_launcher()` is called. Please review your imports and test them when running the `notebook_launcher()` to identify which one is problematic and causing CUDA to be initialized.",
|
| 929 |
+
"output_type": "error",
|
| 930 |
+
"traceback": [
|
| 931 |
+
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
|
| 932 |
+
"\u001b[0;31mProcessRaisedException\u001b[0m Traceback (most recent call last)",
|
| 933 |
+
"File \u001b[0;32m~/.conda/envs/diffusers/lib/python3.9/site-packages/accelerate/launchers.py:200\u001b[0m, in \u001b[0;36mnotebook_launcher\u001b[0;34m(function, args, num_processes, mixed_precision, use_port, master_addr, node_rank, num_nodes)\u001b[0m\n\u001b[1;32m 199\u001b[0m \u001b[39mtry\u001b[39;00m:\n\u001b[0;32m--> 200\u001b[0m start_processes(launcher, args\u001b[39m=\u001b[39;49margs, nprocs\u001b[39m=\u001b[39;49mnum_processes, start_method\u001b[39m=\u001b[39;49m\u001b[39m\"\u001b[39;49m\u001b[39mfork\u001b[39;49m\u001b[39m\"\u001b[39;49m)\n\u001b[1;32m 201\u001b[0m \u001b[39mexcept\u001b[39;00m ProcessRaisedException \u001b[39mas\u001b[39;00m e:\n",
|
| 934 |
+
"File \u001b[0;32m/usr/local/pace-apps/manual/packages/pytorch/1.12.0/lib/python3.9/site-packages/torch/multiprocessing/spawn.py:198\u001b[0m, in \u001b[0;36mstart_processes\u001b[0;34m(fn, args, nprocs, join, daemon, start_method)\u001b[0m\n\u001b[1;32m 197\u001b[0m \u001b[39m# Loop on join until it returns True or raises an exception.\u001b[39;00m\n\u001b[0;32m--> 198\u001b[0m \u001b[39mwhile\u001b[39;00m \u001b[39mnot\u001b[39;00m context\u001b[39m.\u001b[39;49mjoin():\n\u001b[1;32m 199\u001b[0m \u001b[39mpass\u001b[39;00m\n",
|
| 935 |
+
"File \u001b[0;32m/usr/local/pace-apps/manual/packages/pytorch/1.12.0/lib/python3.9/site-packages/torch/multiprocessing/spawn.py:160\u001b[0m, in \u001b[0;36mProcessContext.join\u001b[0;34m(self, timeout)\u001b[0m\n\u001b[1;32m 159\u001b[0m msg \u001b[39m+\u001b[39m\u001b[39m=\u001b[39m original_trace\n\u001b[0;32m--> 160\u001b[0m \u001b[39mraise\u001b[39;00m ProcessRaisedException(msg, error_index, failed_process\u001b[39m.\u001b[39mpid)\n",
|
| 936 |
+
"\u001b[0;31mProcessRaisedException\u001b[0m: \n\n-- Process 1 terminated with the following error:\nTraceback (most recent call last):\n File \"/usr/local/pace-apps/manual/packages/pytorch/1.12.0/lib/python3.9/site-packages/torch/multiprocessing/spawn.py\", line 69, in _wrap\n fn(i, *args)\n File \"/storage/home/hcoda1/3/bxia34/.conda/envs/diffusers/lib/python3.9/site-packages/accelerate/utils/launch.py\", line 608, in __call__\n self.launcher(*args)\n File \"/scratch/166867/ipykernel_204345/1749266112.py\", line 117, in train_loop\n accelerator = Accelerator(\n File \"/storage/home/hcoda1/3/bxia34/.conda/envs/diffusers/lib/python3.9/site-packages/accelerate/accelerator.py\", line 371, in __init__\n self.state = AcceleratorState(\n File \"/storage/home/hcoda1/3/bxia34/.conda/envs/diffusers/lib/python3.9/site-packages/accelerate/state.py\", line 777, in __init__\n PartialState(cpu, **kwargs)\n File \"/storage/home/hcoda1/3/bxia34/.conda/envs/diffusers/lib/python3.9/site-packages/accelerate/state.py\", line 240, in __init__\n torch.cuda.set_device(self.device)\n File \"/usr/local/pace-apps/manual/packages/pytorch/1.12.0/lib/python3.9/site-packages/torch/cuda/__init__.py\", line 314, in set_device\n torch._C._cuda_setDevice(device)\n File \"/usr/local/pace-apps/manual/packages/pytorch/1.12.0/lib/python3.9/site-packages/torch/cuda/__init__.py\", line 207, in _lazy_init\n raise RuntimeError(\nRuntimeError: Cannot re-initialize CUDA in forked subprocess. To use CUDA with multiprocessing, you must use the 'spawn' start method\n",
|
| 937 |
+
"\nThe above exception was the direct cause of the following exception:\n",
|
| 938 |
+
"\u001b[0;31mRuntimeError\u001b[0m Traceback (most recent call last)",
|
| 939 |
+
"Cell \u001b[0;32mIn[1], line 195\u001b[0m\n\u001b[1;32m 192\u001b[0m \u001b[39m# args = (config, model, noise_scheduler, optimizer, dataloader, lr_scheduler)\u001b[39;00m\n\u001b[1;32m 194\u001b[0m device_count \u001b[39m=\u001b[39m torch\u001b[39m.\u001b[39mcuda\u001b[39m.\u001b[39mdevice_count()\n\u001b[0;32m--> 195\u001b[0m notebook_launcher(train_loop, (config, model, noise_scheduler, optimizer, dataloader, lr_scheduler), num_processes\u001b[39m=\u001b[39;49m\u001b[39m2\u001b[39;49m)\n",
|
| 940 |
+
"File \u001b[0;32m~/.conda/envs/diffusers/lib/python3.9/site-packages/accelerate/launchers.py:203\u001b[0m, in \u001b[0;36mnotebook_launcher\u001b[0;34m(function, args, num_processes, mixed_precision, use_port, master_addr, node_rank, num_nodes)\u001b[0m\n\u001b[1;32m 201\u001b[0m \u001b[39mexcept\u001b[39;00m ProcessRaisedException \u001b[39mas\u001b[39;00m e:\n\u001b[1;32m 202\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39m\"\u001b[39m\u001b[39mCannot re-initialize CUDA in forked subprocess\u001b[39m\u001b[39m\"\u001b[39m \u001b[39min\u001b[39;00m e\u001b[39m.\u001b[39margs[\u001b[39m0\u001b[39m]:\n\u001b[0;32m--> 203\u001b[0m \u001b[39mraise\u001b[39;00m \u001b[39mRuntimeError\u001b[39;00m(\n\u001b[1;32m 204\u001b[0m \u001b[39m\"\u001b[39m\u001b[39mCUDA has been initialized before the `notebook_launcher` could create a forked subprocess. \u001b[39m\u001b[39m\"\u001b[39m\n\u001b[1;32m 205\u001b[0m \u001b[39m\"\u001b[39m\u001b[39mThis likely stems from an outside import causing issues once the `notebook_launcher()` is called. \u001b[39m\u001b[39m\"\u001b[39m\n\u001b[1;32m 206\u001b[0m \u001b[39m\"\u001b[39m\u001b[39mPlease review your imports and test them when running the `notebook_launcher()` to identify \u001b[39m\u001b[39m\"\u001b[39m\n\u001b[1;32m 207\u001b[0m \u001b[39m\"\u001b[39m\u001b[39mwhich one is problematic and causing CUDA to be initialized.\u001b[39m\u001b[39m\"\u001b[39m\n\u001b[1;32m 208\u001b[0m ) \u001b[39mfrom\u001b[39;00m \u001b[39me\u001b[39;00m\n\u001b[1;32m 209\u001b[0m \u001b[39melse\u001b[39;00m:\n\u001b[1;32m 210\u001b[0m \u001b[39mraise\u001b[39;00m \u001b[39mRuntimeError\u001b[39;00m(\u001b[39mf\u001b[39m\u001b[39m\"\u001b[39m\u001b[39mAn issue was found when launching the training: \u001b[39m\u001b[39m{\u001b[39;00me\u001b[39m}\u001b[39;00m\u001b[39m\"\u001b[39m) \u001b[39mfrom\u001b[39;00m \u001b[39me\u001b[39;00m\n",
|
| 941 |
+
"\u001b[0;31mRuntimeError\u001b[0m: CUDA has been initialized before the `notebook_launcher` could create a forked subprocess. This likely stems from an outside import causing issues once the `notebook_launcher()` is called. Please review your imports and test them when running the `notebook_launcher()` to identify which one is problematic and causing CUDA to be initialized."
|
| 942 |
+
]
|
| 943 |
}
|
| 944 |
],
|
| 945 |
"source": [
|
|
|
|
| 1033 |
" ),\n",
|
| 1034 |
")\n",
|
| 1035 |
"\n",
|
| 1036 |
+
"noise_scheduler = DDPMScheduler(num_train_timesteps=1000)\n",
|
| 1037 |
"\n",
|
| 1038 |
"optimizer = torch.optim.AdamW(model.parameters(), lr=config.learning_rate)\n",
|
| 1039 |
"lr_scheduler = get_cosine_schedule_with_warmup(\n",
|
|
|
|
| 1093 |
" bs = clean_images.shape[0]\n",
|
| 1094 |
"\n",
|
| 1095 |
" timesteps = torch.randint(\n",
|
| 1096 |
+
" 0, noise_scheduler.config.num_train_timesteps, (bs,), device=clean_images.device, dtype=torch.int64\n",
|
| 1097 |
" )\n",
|
| 1098 |
"\n",
|
| 1099 |
" noisy_images = noise_scheduler.add_noise(clean_images, noise, timesteps)\n",
|
|
|
|
| 1137 |
"# args = (config, model, noise_scheduler, optimizer, dataloader, lr_scheduler)\n",
|
| 1138 |
"\n",
|
| 1139 |
"device_count = torch.cuda.device_count()\n",
|
| 1140 |
+
"notebook_launcher(train_loop, (config, model, noise_scheduler, optimizer, dataloader, lr_scheduler), num_processes=2)"
|
| 1141 |
]
|
| 1142 |
},
|
| 1143 |
{
|
|
|
|
| 1540 |
" ),\n",
|
| 1541 |
")\n",
|
| 1542 |
"\n",
|
| 1543 |
+
"noise_scheduler = DDPMScheduler(num_train_timesteps=1000)\n",
|
| 1544 |
"\n",
|
| 1545 |
"optimizer = torch.optim.AdamW(model.parameters(), lr=config.learning_rate)\n",
|
| 1546 |
"lr_scheduler = get_cosine_schedule_with_warmup(\n",
|
|
|
|
| 1600 |
" bs = clean_images.shape[0]\n",
|
| 1601 |
"\n",
|
| 1602 |
" timesteps = torch.randint(\n",
|
| 1603 |
+
" 0, noise_scheduler.config.num_train_timesteps, (bs,), device=clean_images.device, dtype=torch.int64\n",
|
| 1604 |
" )\n",
|
| 1605 |
"\n",
|
| 1606 |
" noisy_images = noise_scheduler.add_noise(clean_images, noise, timesteps)\n",
|