Commit ea3ddc8 (unverified), committed by shaw
Parents: c599c74 a0c4308

Merge pull request #2 from ashawkey/main
Files changed (6):

1. main_nerf.py → main.py (+12 -6)
2. nerf/renderer.py (+9 -3)
3. nerf/sd.py (+5 -3)
4. readme.md (+21 -10)
5. requirements.txt (+5 -2)
6. scripts/run.sh (+3 -3)
main_nerf.py → main.py RENAMED
```diff
@@ -16,6 +16,8 @@ if __name__ == '__main__':
     parser.add_argument('-O', action='store_true', help="equals --fp16 --cuda_ray --dir_text")
     parser.add_argument('-O2', action='store_true', help="equals --fp16 --dir_text")
     parser.add_argument('--test', action='store_true', help="test mode")
+    parser.add_argument('--save_mesh', action='store_true', help="export an obj mesh with texture")
+    parser.add_argument('--eval_interval', type=int, default=10, help="evaluate on the valid set every interval epochs")
     parser.add_argument('--workspace', type=str, default='workspace')
     parser.add_argument('--guidance', type=str, default='stable-diffusion', help='choose from [stable-diffusion, clip]')
     parser.add_argument('--seed', type=int, default=0)
@@ -26,7 +28,7 @@ if __name__ == '__main__':
     parser.add_argument('--ckpt', type=str, default='latest')
     parser.add_argument('--cuda_ray', action='store_true', help="use CUDA raymarching instead of pytorch")
     parser.add_argument('--max_steps', type=int, default=1024, help="max num steps sampled per ray (only valid when using --cuda_ray)")
-    parser.add_argument('--num_steps', type=int, default=256, help="num steps sampled per ray (only valid when not using --cuda_ray)")
+    parser.add_argument('--num_steps', type=int, default=128, help="num steps sampled per ray (only valid when not using --cuda_ray)")
     parser.add_argument('--upsample_steps', type=int, default=0, help="num steps up-sampled per ray (only valid when not using --cuda_ray)")
     parser.add_argument('--update_extra_interval', type=int, default=16, help="iter interval to update extra status (only valid when using --cuda_ray)")
     parser.add_argument('--max_ray_batch', type=int, default=4096, help="batch size of rays at inference to avoid OOM (only valid when not using --cuda_ray)")
@@ -37,7 +39,7 @@ if __name__ == '__main__':
     # network backbone
     parser.add_argument('--fp16', action='store_true', help="use amp mixed precision training")
     parser.add_argument('--backbone', type=str, default='grid', help="nerf backbone, choose from [grid, tcnn, vanilla]")
-    # rendering resolution in training
+    # rendering resolution in training, decrease this if CUDA OOM.
     parser.add_argument('--w', type=int, default=128, help="render width for NeRF in training")
     parser.add_argument('--h', type=int, default=128, help="render height for NeRF in training")
 
@@ -68,8 +70,8 @@ if __name__ == '__main__':
 
     if opt.O:
         opt.fp16 = True
-        opt.cuda_ray = True
         opt.dir_text = True
+        opt.cuda_ray = True
     elif opt.O2:
         opt.fp16 = True
         opt.dir_text = True
@@ -105,7 +107,9 @@ if __name__ == '__main__':
         else:
             test_loader = NeRFDataset(opt, device=device, type='test', H=opt.H, W=opt.W, size=100).dataloader()
             trainer.test(test_loader)
-            # trainer.save_mesh(resolution=256)
+
+            if opt.save_mesh:
+                trainer.save_mesh(resolution=256)
 
     else:
 
@@ -126,7 +130,7 @@ if __name__ == '__main__':
         # decay to 0.01 * init_lr at last iter step
         scheduler = lambda optimizer: optim.lr_scheduler.LambdaLR(optimizer, lambda iter: 0.01 ** min(iter / opt.iters, 1))
 
-        trainer = Trainer('ngp', opt, model, guidance, device=device, workspace=opt.workspace, optimizer=optimizer, ema_decay=0.95, fp16=opt.fp16, lr_scheduler=scheduler, use_checkpoint=opt.ckpt, eval_interval=1)
+        trainer = Trainer('ngp', opt, model, guidance, device=device, workspace=opt.workspace, optimizer=optimizer, ema_decay=0.95, fp16=opt.fp16, lr_scheduler=scheduler, use_checkpoint=opt.ckpt, eval_interval=opt.eval_interval)
 
         if opt.gui:
             trainer.train_loader = train_loader # attach dataloader to trainer
@@ -143,4 +147,6 @@ if __name__ == '__main__':
             # also test
             test_loader = NeRFDataset(opt, device=device, type='test', H=opt.H, W=opt.W, size=100).dataloader()
             trainer.test(test_loader)
-            trainer.save_mesh(resolution=256)
+
+            if opt.save_mesh:
+                trainer.save_mesh(resolution=256)
```
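For context on the hunk above: the `scheduler` lambda decays the LR multiplier from 1.0 to 0.01 over `opt.iters` steps, so the last step runs at 1% of the initial learning rate. A minimal runnable sketch (the `init_lr` and `iters` values here are hypothetical, not taken from this commit):

```python
import torch
from torch import optim

init_lr, iters = 1e-3, 10_000  # hypothetical values; main.py reads these from argparse
params = [torch.nn.Parameter(torch.zeros(1))]
optimizer = optim.Adam(params, lr=init_lr)
# multiplier 0.01 ** (it / iters) decays smoothly from 1.0 down to 0.01
scheduler = optim.lr_scheduler.LambdaLR(optimizer, lambda it: 0.01 ** min(it / iters, 1))

for _ in range(iters):
    optimizer.step()
    scheduler.step()

print(optimizer.param_groups[0]['lr'])  # ~1e-5, i.e. 0.01 * init_lr
```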
nerf/renderer.py CHANGED
```diff
@@ -168,7 +168,7 @@ class NeRFRenderer(nn.Module):
         from sklearn.neighbors import NearestNeighbors
         from scipy.ndimage import binary_dilation, binary_erosion
 
-        glctx = dr.RasterizeGLContext()
+        glctx = dr.RasterizeCudaContext()
 
         atlas = xatlas.Atlas()
         atlas.add_mesh(v_np, f_np)
@@ -271,7 +271,7 @@ class NeRFRenderer(nn.Module):
 
         print(f'[INFO] writing obj mesh to {obj_file}')
         with open(obj_file, "w") as fp:
-            fp.write(f'mtllib {name}.mtl \n')
+            fp.write(f'mtllib {name}mesh.mtl \n')
 
         print(f'[INFO] writing vertices {v_np.shape}')
         for v in v_np:
@@ -320,6 +320,12 @@ class NeRFRenderer(nn.Module):
         nears.unsqueeze_(-1)
         fars.unsqueeze_(-1)
 
+        # random sample light_d if not provided
+        if light_d is None:
+            # gaussian noise around the ray origin, so the light always face the view dir (avoid dark face)
+            light_d = (rays_o[0] + torch.randn(3, device=device, dtype=torch.float))
+            light_d = safe_normalize(light_d)
+
         #print(f'nears = {nears.min().item()} ~ {nears.max().item()}, fars = {fars.min().item()} ~ {fars.max().item()}')
 
         z_vals = torch.linspace(0.0, 1.0, num_steps, device=device).unsqueeze(0) # [1, T]
@@ -451,7 +457,7 @@ class NeRFRenderer(nn.Module):
         # random sample light_d if not provided
         if light_d is None:
             # gaussian noise around the ray origin, so the light always face the view dir (avoid dark face)
-            light_d = - (rays_o[0] + torch.randn(3, device=device, dtype=torch.float))
+            light_d = (rays_o[0] + torch.randn(3, device=device, dtype=torch.float))
             light_d = safe_normalize(light_d)
 
         results = {}
```
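A note on the `light_d` hunks above: dropping the leading minus sign flips the sampled light direction to the camera's side of the scene, and the same sampling now also runs in the non-CUDA rendering path. A standalone sketch (the `safe_normalize` body is an assumed reimplementation of the repo's helper, and `rays_o` is a placeholder):

```python
import torch

def safe_normalize(x, eps=1e-20):
    # assumed implementation of the repo's helper: divide by a clamped norm
    return x / torch.sqrt(torch.clamp(torch.sum(x * x, -1, keepdim=True), min=eps))

rays_o = torch.randn(4096, 3)  # placeholder ray origins; the real ones come from the camera pose
# jitter the first ray origin with gaussian noise so the light roughly faces
# the view direction (avoids a dark face), then normalize to a unit direction
light_d = safe_normalize(rays_o[0] + torch.randn(3, dtype=torch.float))
```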
nerf/sd.py CHANGED
```diff
@@ -41,6 +41,7 @@ class StableDiffusion(nn.Module):
 
         # 4. Create a scheduler for inference
         self.scheduler = PNDMScheduler(beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear", num_train_timesteps=self.num_train_timesteps)
+        self.alphas = self.scheduler.alphas_cumprod.to(self.device) # for convenience
 
         print(f'[INFO] loaded stable diffusion!')
 
@@ -93,8 +94,9 @@ class StableDiffusion(nn.Module):
         noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
         noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
 
-        # w(t), one_minus_alpha_prod, i.e., sigma^2
-        w = (1 - self.scheduler.alphas_cumprod[t]).to(self.device)
+        # w(t), alpha_t * sigma_t^2
+        # w = (1 - self.alphas[t])
+        w = self.alphas[t] ** 0.5 * (1 - self.alphas[t])
         grad = w * (noise_pred - noise)
 
         # clip grad for stable training?
@@ -105,7 +107,7 @@ class StableDiffusion(nn.Module):
         latents.backward(gradient=grad, retain_graph=True)
         # torch.cuda.synchronize(); print(f'[TIME] guiding: backward {time.time() - _t:.4f}s')
 
-        return 0 # fake loss value
+        return 0 # dummy loss value
 
     def produce_latents(self, text_embeddings, height=512, width=512, num_inference_steps=50, guidance_scale=7.5, latents=None):
```
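To make the weighting change above concrete: `train_step` injects the score-distillation gradient `w(t) * (noise_pred - noise)` directly via `latents.backward(gradient=...)` instead of computing a differentiable loss, which is why it can return a dummy value. A toy sketch with placeholder tensors (the `alphas` values and shapes are illustrative, not the real scheduler outputs):

```python
import torch

alphas = torch.linspace(0.9991, 0.0047, 1000)  # stand-in for scheduler.alphas_cumprod
t = torch.randint(0, 1000, (1,))

latents = torch.randn(1, 4, 64, 64, requires_grad=True)  # encoded NeRF render (toy)
noise = torch.randn_like(latents)
noise_pred = torch.randn_like(latents)  # placeholder for the UNet's CFG-combined output

w = alphas[t] ** 0.5 * (1 - alphas[t])  # new weighting: sqrt(alpha_t) * sigma_t^2
grad = w * (noise_pred - noise)
latents.backward(gradient=grad)  # writes grad into latents.grad without differentiating the UNet
```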
readme.md CHANGED
````diff
@@ -4,6 +4,8 @@ A pytorch implementation of the text-to-3D model **Dreamfusion**, powered by the
 
 The original paper's project page: [_DreamFusion: Text-to-3D using 2D Diffusion_](https://dreamfusion3d.github.io/).
 
+Colab notebook for usage: [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1MXT3yfOFvO0ooKEfiUUvTKwUkrrlCHpF?usp=sharing)
+
 Examples generated from text prompt `a high quality photo of a pineapple` viewed with the GUI in real time:
 
 https://user-images.githubusercontent.com/25863658/194241493-f3e68f78-aefe-479e-a4a8-001424a61b37.mp4
@@ -37,14 +39,15 @@ cd stable-dreamfusion
 ```bash
 pip install -r requirements.txt
 
+# (optional) install nvdiffrast for exporting textured mesh (--save_mesh)
+pip install git+https://github.com/NVlabs/nvdiffrast/
+
 # (optional) install the tcnn backbone if using --tcnn
 pip install git+https://github.com/NVlabs/tiny-cuda-nn/#subdirectory=bindings/torch
 
 # (optional) install CLIP guidance for the dreamfield setting
 pip install git+https://github.com/openai/CLIP.git
 
-# (optional) install nvdiffrast for exporting textured mesh
-pip install git+https://github.com/NVlabs/nvdiffrast/
 ```
 
 ### Build extension (optional)
@@ -68,19 +71,20 @@ First time running will take some time to compile the CUDA extensions.
 
 ```bash
 ### stable-dreamfusion setting
-# train with text prompt
+## train with text prompt
 # `-O` equals `--cuda_ray --fp16 --dir_text`
-python main_nerf.py --text "a hamburger" --workspace trial -O
+python main.py --text "a hamburger" --workspace trial -O
 
+## after the training is finished:
 # test (exporting 360 video, and an obj mesh with png texture)
-python main_nerf.py --text "a hamburger" --workspace trial -O --test
+python main.py --workspace trial -O --test
 
 # test with a GUI (free view control!)
-python main_nerf.py --text "a hamburger" --workspace trial -O --test --gui
+python main.py --workspace trial -O --test --gui
 
 ### dreamfields (CLIP) setting
-python main_nerf.py --text "a hamburger" --workspace trial_clip -O --guidance clip
-python main_nerf.py --text "a hamburger" --workspace trial_clip -O --test --gui --guidance clip
+python main.py --text "a hamburger" --workspace trial_clip -O --guidance clip
+python main.py --text "a hamburger" --workspace trial_clip -O --test --gui --guidance clip
 ```
 
 # Code organization & Advanced tips
@@ -104,13 +108,20 @@ latents.backward(gradient=grad, retain_graph=True)
 * Other regularizations are in `./nerf/utils.py > Trainer > train_step`.
 * The generation seems quite sensitive to regularizations on weights_sum (alphas for each ray). The original opacity loss tends to make NeRF disappear (zero density everywhere), so we use an entropy loss to replace it for now (encourages alpha to be either 0 or 1).
 * NeRF Rendering core function: `./nerf/renderer.py > NeRFRenderer > run_cuda`.
+    * the occupancy grid based training acceleration (instant-ngp like, enabled by `--cuda_ray`) may harm the generation progress, since once a grid cell is marked as empty, rays won't pass it later...
+    * Not using `--cuda_ray` also works now:
+    ```bash
+    # `-O2` equals `--fp16 --dir_text`
+    python main.py --text "a hamburger" --workspace trial -O2 # faster training, but slower rendering
+    ```
+    Training is faster if only sample 128 points uniformly per ray (5h --> 2.5h).
+    More testing is needed...
 * Shading & normal evaluation: `./nerf/network*.py > NeRFNetwork > forward`. Current implementation harms training and is disabled.
-    * use `--albedo_iters 1000` to enable random shading mode after 1000 steps from albedo, lambertian ,and textureless
+    * use `--albedo_iters 1000` to enable random shading mode after 1000 steps from albedo, lambertian, and textureless.
     * light direction: current implementation use a plane light source, instead of a point light source...
 * View-dependent prompting: `./nerf/provider.py > get_view_direction`.
     * ues `--angle_overhead, --angle_front` to set the border. How to better divide front/back/side regions?
 * Network backbone (`./nerf/network*.py`) can be chosen by the `--backbone` option, but `tcnn` and `vanilla` are not well tested.
-    * the occupancy grid based training acceleration (instant-ngp like) may harm the generation progress, since once a grid cell is marked as empty, rays won't pass it later.
 * Spatial density bias (gaussian density blob): `./nerf/network*.py > NeRFNetwork > gaussian`.
 
 # Acknowledgement
````
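Tying the readme note back to the first file: with `--cuda_ray` off, each ray is sampled uniformly in depth, and this commit lowers the `--num_steps` default from 256 to 128 (the claimed 5h to 2.5h speedup). A sketch of that uniform sampling; the `z_vals` linspace matches the renderer diff, while the `nears`/`fars` values and the mapping into each ray's interval are illustrative assumptions:

```python
import torch

num_steps = 128                  # new --num_steps default in this commit
N = 4096                         # hypothetical number of rays
nears = torch.full((N, 1), 0.1)  # placeholder bounds; really from ray-AABB intersection
fars = torch.full((N, 1), 3.0)

z_vals = torch.linspace(0.0, 1.0, num_steps).unsqueeze(0)  # [1, T], as in the renderer
z_vals = nears + (fars - nears) * z_vals                   # [N, T] sample depths per ray
```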
requirements.txt CHANGED
```diff
@@ -10,9 +10,12 @@ tqdm
 matplotlib
 PyMCubes
 rich
-pysdf
 dearpygui
 scipy
+huggingface_hub
 diffusers
 transformers
-xatlas
+xatlas
+scikit-learn
+imageio
+imageio-ffmpeg
```
scripts/run.sh CHANGED
```diff
@@ -1,5 +1,5 @@
 #! /bin/bash
 
-CUDA_VISIBLE_DEVICES=1 python main_nerf.py -O --text "a DSLR photo of cthulhu" --workspace trial_cthulhu
-CUDA_VISIBLE_DEVICES=1 python main_nerf.py -O --text "a DSLR photo of a squirrel" --workspace trial_squirrel
-CUDA_VISIBLE_DEVICES=1 python main_nerf.py -O --text "a DSLR photo of a cat lying on its side batting at a ball of yarn" --workspace trial_cat_lying
+CUDA_VISIBLE_DEVICES=1 python main.py -O --text "a DSLR photo of cthulhu" --workspace trial_cthulhu
+CUDA_VISIBLE_DEVICES=1 python main.py -O --text "a DSLR photo of a squirrel" --workspace trial_squirrel
+CUDA_VISIBLE_DEVICES=1 python main.py -O --text "a DSLR photo of a cat lying on its side batting at a ball of yarn" --workspace trial_cat_lying
```