choephix committed on
Commit
0ce1f9f
·
1 Parent(s): 2aa7f7d

Update Python formatting

Browse files
Files changed (2) hide show
  1. app.py +210 -101
  2. trellis2/pipelines/trellis2_image_to_3d.py +237 -137
app.py CHANGED
@@ -3,11 +3,14 @@ from gradio_client import Client, handle_file
3
  import spaces
4
 
5
  import os
6
- os.environ["OPENCV_IO_ENABLE_OPENEXR"] = '1'
 
7
  os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
8
  os.environ["ATTN_BACKEND"] = "flash_attn_3"
9
- os.environ["FLEX_GEMM_AUTOTUNE_CACHE_PATH"] = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'autotune_cache.json')
10
- os.environ["FLEX_GEMM_AUTOTUNER_VERBOSE"] = '1'
 
 
11
  from datetime import datetime
12
  import shutil
13
  import cv2
@@ -26,14 +29,30 @@ import o_voxel
26
 
27
 
28
  MAX_SEED = np.iinfo(np.int32).max
29
- TMP_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'tmp')
30
  MODES = [
31
  {"name": "Normal", "icon": "assets/app/normal.png", "render_key": "normal"},
32
  {"name": "Clay render", "icon": "assets/app/clay.png", "render_key": "clay"},
33
- {"name": "Base color", "icon": "assets/app/basecolor.png", "render_key": "base_color"},
34
- {"name": "HDRI forest", "icon": "assets/app/hdri_forest.png", "render_key": "shaded_forest"},
35
- {"name": "HDRI sunset", "icon": "assets/app/hdri_sunset.png", "render_key": "shaded_sunset"},
36
- {"name": "HDRI courtyard", "icon": "assets/app/hdri_courtyard.png", "render_key": "shaded_courtyard"},
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
37
  ]
38
  STEPS = 8
39
  DEFAULT_MODE = 3
@@ -307,16 +326,16 @@ def image_to_base64(image):
307
  def start_session(req: gr.Request):
308
  user_dir = os.path.join(TMP_DIR, str(req.session_hash))
309
  os.makedirs(user_dir, exist_ok=True)
310
-
311
-
312
  def end_session(req: gr.Request):
313
  user_dir = os.path.join(TMP_DIR, str(req.session_hash))
314
  shutil.rmtree(user_dir)
315
-
316
 
317
  def remove_background(input: Image.Image) -> Image.Image:
318
- with tempfile.NamedTemporaryFile(suffix='.png') as f:
319
- input = input.convert('RGB')
320
  input.save(f.name)
321
  output = rmbg_client.predict(handle_file(f.name), api_name="/image")[0][0]
322
  output = Image.open(output)
@@ -329,14 +348,17 @@ def preprocess_image(input: Image.Image) -> Image.Image:
329
  """
330
  # if has alpha channel, use it directly; otherwise, remove background
331
  has_alpha = False
332
- if input.mode == 'RGBA':
333
  alpha = np.array(input)[:, :, 3]
334
  if not np.all(alpha == 255):
335
  has_alpha = True
336
  max_size = max(input.size)
337
  scale = min(1, 1024 / max_size)
338
  if scale < 1:
339
- input = input.resize((int(input.width * scale), int(input.height * scale)), Image.Resampling.LANCZOS)
 
 
 
340
  if has_alpha:
341
  output = input
342
  else:
@@ -344,11 +366,21 @@ def preprocess_image(input: Image.Image) -> Image.Image:
344
  output_np = np.array(output)
345
  alpha = output_np[:, :, 3]
346
  bbox = np.argwhere(alpha > 0.8 * 255)
347
- bbox = np.min(bbox[:, 1]), np.min(bbox[:, 0]), np.max(bbox[:, 1]), np.max(bbox[:, 0])
 
 
 
 
 
348
  center = (bbox[0] + bbox[2]) / 2, (bbox[1] + bbox[3]) / 2
349
  size = max(bbox[2] - bbox[0], bbox[3] - bbox[1])
350
  size = int(size * 1)
351
- bbox = center[0] - size // 2, center[1] - size // 2, center[0] + size // 2, center[1] + size // 2
 
 
 
 
 
352
  output = output.crop(bbox) # type: ignore
353
  output = np.array(output).astype(np.float32) / 255
354
  output = output[:, :, :3] * output[:, :, 3:4]
@@ -359,20 +391,20 @@ def preprocess_image(input: Image.Image) -> Image.Image:
359
  def pack_state(latents: Tuple[SparseTensor, SparseTensor, int]) -> dict:
360
  shape_slat, tex_slat, res = latents
361
  return {
362
- 'shape_slat_feats': shape_slat.feats.cpu().numpy(),
363
- 'tex_slat_feats': tex_slat.feats.cpu().numpy(),
364
- 'coords': shape_slat.coords.cpu().numpy(),
365
- 'res': res,
366
  }
367
-
368
-
369
  def unpack_state(state: dict) -> Tuple[SparseTensor, SparseTensor, int]:
370
  shape_slat = SparseTensor(
371
- feats=torch.from_numpy(state['shape_slat_feats']).cuda(),
372
- coords=torch.from_numpy(state['coords']).cuda(),
373
  )
374
- tex_slat = shape_slat.replace(torch.from_numpy(state['tex_slat_feats']).cuda())
375
- return shape_slat, tex_slat, state['res']
376
 
377
 
378
  def get_seed(randomize_seed: bool, seed: int) -> int:
@@ -433,11 +465,13 @@ def image_to_3d(
433
  return_latent=True,
434
  )
435
  mesh = outputs[0]
436
- mesh.simplify(16777216) # nvdiffrast limit
437
- images = render_utils.render_snapshot(mesh, resolution=1024, r=2, fov=36, nviews=STEPS, envmap=envmap)
 
 
438
  state = pack_state(latents)
439
  torch.cuda.empty_cache()
440
-
441
  # --- HTML Construction ---
442
  # The Stack of 48 Images
443
  images_html = ""
@@ -445,14 +479,16 @@ def image_to_3d(
445
  for s_idx in range(STEPS):
446
  # ID Naming Convention: view-m{mode}-s{step}
447
  unique_id = f"view-m{m_idx}-s{s_idx}"
448
-
449
  # Logic: Only Mode 0, Step 0 is visible initially
450
- is_visible = (m_idx == DEFAULT_MODE and s_idx == DEFAULT_STEP)
451
  vis_class = "visible" if is_visible else ""
452
-
453
  # Image Source
454
- img_base64 = image_to_base64(Image.fromarray(images[mode['render_key']][s_idx]))
455
-
 
 
456
  # Render the Tag
457
  images_html += f"""
458
  <img id="{unique_id}"
@@ -460,19 +496,19 @@ def image_to_3d(
460
  src="{img_base64}"
461
  loading="eager">
462
  """
463
-
464
  # Button Row HTML
465
  btns_html = ""
466
- for idx, mode in enumerate(MODES):
467
  active_class = "active" if idx == DEFAULT_MODE else ""
468
  # Note: onclick calls the JS function defined in Head
469
  btns_html += f"""
470
- <img src="{mode['icon_base64']}"
471
  class="mode-btn {active_class}"
472
  onclick="selectMode({idx})"
473
- title="{mode['name']}">
474
  """
475
-
476
  # Assemble the full component
477
  full_html = f"""
478
  <div class="previewer-container">
@@ -500,7 +536,7 @@ def image_to_3d(
500
  </div>
501
  </div>
502
  """
503
-
504
  return state, full_html
505
 
506
 
@@ -545,7 +581,7 @@ def extract_glb(
545
  now = datetime.now()
546
  timestamp = now.strftime("%Y-%m-%dT%H%M%S") + f".{now.microsecond // 1000:03d}"
547
  os.makedirs(user_dir, exist_ok=True)
548
- glb_path = os.path.join(user_dir, f'sample_{timestamp}.glb')
549
  glb.export(glb_path, extension_webp=True)
550
  torch.cuda.empty_cache()
551
  return glb_path, glb_path
@@ -557,53 +593,102 @@ with gr.Blocks(delete_cache=(600, 600)) as demo:
557
  * Upload an image (preferably with an alpha-masked foreground object) and click Generate to create a 3D asset.
558
  * Click Extract GLB to export and download the generated GLB file if you're satisfied with the result. Otherwise, try another time.
559
  """)
560
-
561
  with gr.Row():
562
  with gr.Column(scale=1, min_width=360):
563
- image_prompt = gr.Image(label="Image Prompt", format="png", image_mode="RGBA", type="pil", height=400)
564
-
565
- resolution = gr.Radio(["512", "1024", "1536"], label="Resolution", value="512")
 
 
 
 
 
 
 
 
566
  seed = gr.Slider(0, MAX_SEED, label="Seed", value=0, step=1)
567
  randomize_seed = gr.Checkbox(label="Randomize Seed", value=False)
568
- decimation_target = gr.Slider(20000, 500000, label="Decimation Target", value=20000, step=10000)
569
- texture_size = gr.Slider(1024, 4096, label="Texture Size", value=1024, step=1024)
570
-
 
 
 
 
571
  generate_btn = gr.Button("Generate")
572
-
573
- with gr.Accordion(label="Advanced Settings", open=False):
574
  gr.Markdown("Stage 1: Sparse Structure Generation")
575
  with gr.Row():
576
- ss_guidance_strength = gr.Slider(1.0, 10.0, label="Guidance Strength", value=7.5, step=0.1)
577
- ss_guidance_rescale = gr.Slider(0.0, 1.0, label="Guidance Rescale", value=0.7, step=0.01)
578
- ss_sampling_steps = gr.Slider(1, 50, label="Sampling Steps", value=8, step=1)
579
- ss_rescale_t = gr.Slider(1.0, 6.0, label="Rescale T", value=5.0, step=0.1)
 
 
 
 
 
 
 
 
580
  gr.Markdown("Stage 2: Shape Generation")
581
  with gr.Row():
582
- shape_slat_guidance_strength = gr.Slider(1.0, 10.0, label="Guidance Strength", value=7.5, step=0.1)
583
- shape_slat_guidance_rescale = gr.Slider(0.0, 1.0, label="Guidance Rescale", value=0.5, step=0.01)
584
- shape_slat_sampling_steps = gr.Slider(1, 50, label="Sampling Steps", value=8, step=1)
585
- shape_slat_rescale_t = gr.Slider(1.0, 6.0, label="Rescale T", value=3.0, step=0.1)
 
 
 
 
 
 
 
 
586
  gr.Markdown("Stage 3: Material Generation")
587
  with gr.Row():
588
- tex_slat_guidance_strength = gr.Slider(1.0, 10.0, label="Guidance Strength", value=1.0, step=0.1)
589
- tex_slat_guidance_rescale = gr.Slider(0.0, 1.0, label="Guidance Rescale", value=0.0, step=0.01)
590
- tex_slat_sampling_steps = gr.Slider(1, 50, label="Sampling Steps", value=8, step=1)
591
- tex_slat_rescale_t = gr.Slider(1.0, 6.0, label="Rescale T", value=3.0, step=0.1)
 
 
 
 
 
 
 
 
592
 
593
  with gr.Column(scale=10):
594
  with gr.Walkthrough(selected=0) as walkthrough:
595
  with gr.Step("Preview", id=0):
596
- preview_output = gr.HTML(empty_html, label="3D Asset Preview", show_label=True, container=True)
 
 
 
 
 
597
  extract_btn = gr.Button("Extract GLB")
598
  with gr.Step("Extract", id=1):
599
- glb_output = gr.Model3D(label="Extracted GLB", height=724, show_label=True, display_mode="solid", clear_color=(0.25, 0.25, 0.25, 1.0))
 
 
 
 
 
 
600
  download_btn = gr.DownloadButton(label="Download GLB")
601
- gr.Markdown("*We are actively working on improving the speed of GLB extraction. Currently, it may take half a minute or more and face count is limited.*")
602
-
 
 
603
  with gr.Column(scale=1, min_width=172):
604
  examples = gr.Examples(
605
  examples=[
606
- f'assets/example_image/{image}'
607
  for image in os.listdir("assets/example_image")
608
  ],
609
  inputs=[image_prompt],
@@ -612,14 +697,13 @@ with gr.Blocks(delete_cache=(600, 600)) as demo:
612
  run_on_click=True,
613
  examples_per_page=18,
614
  )
615
-
616
  output_buf = gr.State()
617
-
618
 
619
  # Handlers
620
  demo.load(start_session)
621
  demo.unload(end_session)
622
-
623
  image_prompt.upload(
624
  preprocess_image,
625
  inputs=[image_prompt],
@@ -630,27 +714,34 @@ with gr.Blocks(delete_cache=(600, 600)) as demo:
630
  get_seed,
631
  inputs=[randomize_seed, seed],
632
  outputs=[seed],
633
- ).then(
634
- lambda: gr.Walkthrough(selected=0), outputs=walkthrough
635
- ).then(
636
  image_to_3d,
637
  inputs=[
638
- image_prompt, seed, resolution,
639
- ss_guidance_strength, ss_guidance_rescale, ss_sampling_steps, ss_rescale_t,
640
- shape_slat_guidance_strength, shape_slat_guidance_rescale, shape_slat_sampling_steps, shape_slat_rescale_t,
641
- tex_slat_guidance_strength, tex_slat_guidance_rescale, tex_slat_sampling_steps, tex_slat_rescale_t,
 
 
 
 
 
 
 
 
 
 
 
642
  ],
643
  outputs=[output_buf, preview_output],
644
  )
645
-
646
- extract_btn.click(
647
- lambda: gr.Walkthrough(selected=1), outputs=walkthrough
648
- ).then(
649
  extract_glb,
650
  inputs=[output_buf, decimation_target, texture_size],
651
  outputs=[glb_output, download_btn],
652
  )
653
-
654
 
655
  # Launch the Gradio app
656
  if __name__ == "__main__":
@@ -659,29 +750,47 @@ if __name__ == "__main__":
659
  # Construct ui components
660
  btn_img_base64_strs = {}
661
  for i in range(len(MODES)):
662
- icon = Image.open(MODES[i]['icon'])
663
- MODES[i]['icon_base64'] = image_to_base64(icon)
664
 
665
  rmbg_client = Client("briaai/BRIA-RMBG-2.0")
666
- pipeline = Trellis2ImageTo3DPipeline.from_pretrained('microsoft/TRELLIS.2-4B')
667
  pipeline.rembg_model = None
668
  pipeline.low_vram = False
669
  pipeline.cuda()
670
-
671
  envmap = {
672
- 'forest': EnvMap(torch.tensor(
673
- cv2.cvtColor(cv2.imread('assets/hdri/forest.exr', cv2.IMREAD_UNCHANGED), cv2.COLOR_BGR2RGB),
674
- dtype=torch.float32, device='cuda'
675
- )),
676
- 'sunset': EnvMap(torch.tensor(
677
- cv2.cvtColor(cv2.imread('assets/hdri/sunset.exr', cv2.IMREAD_UNCHANGED), cv2.COLOR_BGR2RGB),
678
- dtype=torch.float32, device='cuda'
679
- )),
680
- 'courtyard': EnvMap(torch.tensor(
681
- cv2.cvtColor(cv2.imread('assets/hdri/courtyard.exr', cv2.IMREAD_UNCHANGED), cv2.COLOR_BGR2RGB),
682
- dtype=torch.float32, device='cuda'
683
- )),
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
684
  }
685
-
686
- #demo.launch(css=css, head=head)
687
  demo.launch(server_name="0.0.0.0", server_port=7860, css=css, head=head)
 
3
  import spaces
4
 
5
  import os
6
+
7
+ os.environ["OPENCV_IO_ENABLE_OPENEXR"] = "1"
8
  os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
9
  os.environ["ATTN_BACKEND"] = "flash_attn_3"
10
+ os.environ["FLEX_GEMM_AUTOTUNE_CACHE_PATH"] = os.path.join(
11
+ os.path.dirname(os.path.abspath(__file__)), "autotune_cache.json"
12
+ )
13
+ os.environ["FLEX_GEMM_AUTOTUNER_VERBOSE"] = "1"
14
  from datetime import datetime
15
  import shutil
16
  import cv2
 
29
 
30
 
31
  MAX_SEED = np.iinfo(np.int32).max
32
+ TMP_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), "tmp")
33
  MODES = [
34
  {"name": "Normal", "icon": "assets/app/normal.png", "render_key": "normal"},
35
  {"name": "Clay render", "icon": "assets/app/clay.png", "render_key": "clay"},
36
+ {
37
+ "name": "Base color",
38
+ "icon": "assets/app/basecolor.png",
39
+ "render_key": "base_color",
40
+ },
41
+ {
42
+ "name": "HDRI forest",
43
+ "icon": "assets/app/hdri_forest.png",
44
+ "render_key": "shaded_forest",
45
+ },
46
+ {
47
+ "name": "HDRI sunset",
48
+ "icon": "assets/app/hdri_sunset.png",
49
+ "render_key": "shaded_sunset",
50
+ },
51
+ {
52
+ "name": "HDRI courtyard",
53
+ "icon": "assets/app/hdri_courtyard.png",
54
+ "render_key": "shaded_courtyard",
55
+ },
56
  ]
57
  STEPS = 8
58
  DEFAULT_MODE = 3
 
326
def start_session(req: gr.Request):
    """Create a per-session scratch directory keyed by the Gradio session hash.

    Called on ``demo.load``; the directory is removed again by ``end_session``.
    """
    session_dir = os.path.join(TMP_DIR, str(req.session_hash))
    os.makedirs(session_dir, exist_ok=True)
329
+
330
+
331
def end_session(req: gr.Request):
    """Remove the per-session scratch directory when the Gradio session ends.

    Args:
        req: The Gradio request carrying the session hash used by start_session.
    """
    user_dir = os.path.join(TMP_DIR, str(req.session_hash))
    # ignore_errors guards against the directory never having been created
    # (start_session failed / nothing generated) or already having been
    # removed by the delete_cache sweep — a bare rmtree would raise here.
    shutil.rmtree(user_dir, ignore_errors=True)
334
+
335
 
336
  def remove_background(input: Image.Image) -> Image.Image:
337
+ with tempfile.NamedTemporaryFile(suffix=".png") as f:
338
+ input = input.convert("RGB")
339
  input.save(f.name)
340
  output = rmbg_client.predict(handle_file(f.name), api_name="/image")[0][0]
341
  output = Image.open(output)
 
348
  """
349
  # if has alpha channel, use it directly; otherwise, remove background
350
  has_alpha = False
351
+ if input.mode == "RGBA":
352
  alpha = np.array(input)[:, :, 3]
353
  if not np.all(alpha == 255):
354
  has_alpha = True
355
  max_size = max(input.size)
356
  scale = min(1, 1024 / max_size)
357
  if scale < 1:
358
+ input = input.resize(
359
+ (int(input.width * scale), int(input.height * scale)),
360
+ Image.Resampling.LANCZOS,
361
+ )
362
  if has_alpha:
363
  output = input
364
  else:
 
366
  output_np = np.array(output)
367
  alpha = output_np[:, :, 3]
368
  bbox = np.argwhere(alpha > 0.8 * 255)
369
+ bbox = (
370
+ np.min(bbox[:, 1]),
371
+ np.min(bbox[:, 0]),
372
+ np.max(bbox[:, 1]),
373
+ np.max(bbox[:, 0]),
374
+ )
375
  center = (bbox[0] + bbox[2]) / 2, (bbox[1] + bbox[3]) / 2
376
  size = max(bbox[2] - bbox[0], bbox[3] - bbox[1])
377
  size = int(size * 1)
378
+ bbox = (
379
+ center[0] - size // 2,
380
+ center[1] - size // 2,
381
+ center[0] + size // 2,
382
+ center[1] + size // 2,
383
+ )
384
  output = output.crop(bbox) # type: ignore
385
  output = np.array(output).astype(np.float32) / 255
386
  output = output[:, :, :3] * output[:, :, 3:4]
 
391
def pack_state(latents: Tuple[SparseTensor, SparseTensor, int]) -> dict:
    """Serialize generated latents into a CPU/numpy-only dict for gr.State.

    Args:
        latents: Tuple of (shape latent, texture latent, grid resolution).

    Returns:
        dict holding numpy feature/coordinate arrays plus the resolution,
        safe to keep in Gradio session state off the GPU.
    """
    shape_slat, tex_slat, res = latents
    state = {}
    state["shape_slat_feats"] = shape_slat.feats.cpu().numpy()
    state["tex_slat_feats"] = tex_slat.feats.cpu().numpy()
    # shape and texture latents share one coordinate set, so only one is stored
    state["coords"] = shape_slat.coords.cpu().numpy()
    state["res"] = res
    return state
399
+
400
+
401
def unpack_state(state: dict) -> Tuple[SparseTensor, SparseTensor, int]:
    """Rebuild GPU-resident latents from a dict produced by pack_state.

    Args:
        state: dict with numpy arrays under "shape_slat_feats",
            "tex_slat_feats", "coords" and an int under "res".

    Returns:
        Tuple of (shape latent, texture latent, grid resolution).
    """
    shape_feats = torch.from_numpy(state["shape_slat_feats"]).cuda()
    coords = torch.from_numpy(state["coords"]).cuda()
    shape_slat = SparseTensor(feats=shape_feats, coords=coords)
    # texture latent reuses the shape latent's coordinates via replace()
    tex_feats = torch.from_numpy(state["tex_slat_feats"]).cuda()
    tex_slat = shape_slat.replace(tex_feats)
    return shape_slat, tex_slat, state["res"]
408
 
409
 
410
  def get_seed(randomize_seed: bool, seed: int) -> int:
 
465
  return_latent=True,
466
  )
467
  mesh = outputs[0]
468
+ mesh.simplify(16777216) # nvdiffrast limit
469
+ images = render_utils.render_snapshot(
470
+ mesh, resolution=1024, r=2, fov=36, nviews=STEPS, envmap=envmap
471
+ )
472
  state = pack_state(latents)
473
  torch.cuda.empty_cache()
474
+
475
  # --- HTML Construction ---
476
  # The Stack of 48 Images
477
  images_html = ""
 
479
  for s_idx in range(STEPS):
480
  # ID Naming Convention: view-m{mode}-s{step}
481
  unique_id = f"view-m{m_idx}-s{s_idx}"
482
+
483
  # Logic: Only Mode 0, Step 0 is visible initially
484
+ is_visible = m_idx == DEFAULT_MODE and s_idx == DEFAULT_STEP
485
  vis_class = "visible" if is_visible else ""
486
+
487
  # Image Source
488
+ img_base64 = image_to_base64(
489
+ Image.fromarray(images[mode["render_key"]][s_idx])
490
+ )
491
+
492
  # Render the Tag
493
  images_html += f"""
494
  <img id="{unique_id}"
 
496
  src="{img_base64}"
497
  loading="eager">
498
  """
499
+
500
  # Button Row HTML
501
  btns_html = ""
502
+ for idx, mode in enumerate(MODES):
503
  active_class = "active" if idx == DEFAULT_MODE else ""
504
  # Note: onclick calls the JS function defined in Head
505
  btns_html += f"""
506
+ <img src="{mode["icon_base64"]}"
507
  class="mode-btn {active_class}"
508
  onclick="selectMode({idx})"
509
+ title="{mode["name"]}">
510
  """
511
+
512
  # Assemble the full component
513
  full_html = f"""
514
  <div class="previewer-container">
 
536
  </div>
537
  </div>
538
  """
539
+
540
  return state, full_html
541
 
542
 
 
581
  now = datetime.now()
582
  timestamp = now.strftime("%Y-%m-%dT%H%M%S") + f".{now.microsecond // 1000:03d}"
583
  os.makedirs(user_dir, exist_ok=True)
584
+ glb_path = os.path.join(user_dir, f"sample_{timestamp}.glb")
585
  glb.export(glb_path, extension_webp=True)
586
  torch.cuda.empty_cache()
587
  return glb_path, glb_path
 
593
  * Upload an image (preferably with an alpha-masked foreground object) and click Generate to create a 3D asset.
594
  * Click Extract GLB to export and download the generated GLB file if you're satisfied with the result. Otherwise, try another time.
595
  """)
596
+
597
  with gr.Row():
598
  with gr.Column(scale=1, min_width=360):
599
+ image_prompt = gr.Image(
600
+ label="Image Prompt",
601
+ format="png",
602
+ image_mode="RGBA",
603
+ type="pil",
604
+ height=400,
605
+ )
606
+
607
+ resolution = gr.Radio(
608
+ ["512", "1024", "1536"], label="Resolution", value="512"
609
+ )
610
  seed = gr.Slider(0, MAX_SEED, label="Seed", value=0, step=1)
611
  randomize_seed = gr.Checkbox(label="Randomize Seed", value=False)
612
+ decimation_target = gr.Slider(
613
+ 20000, 500000, label="Decimation Target", value=20000, step=10000
614
+ )
615
+ texture_size = gr.Slider(
616
+ 1024, 4096, label="Texture Size", value=1024, step=1024
617
+ )
618
+
619
  generate_btn = gr.Button("Generate")
620
+
621
+ with gr.Accordion(label="Advanced Settings", open=False):
622
  gr.Markdown("Stage 1: Sparse Structure Generation")
623
  with gr.Row():
624
+ ss_guidance_strength = gr.Slider(
625
+ 1.0, 10.0, label="Guidance Strength", value=7.5, step=0.1
626
+ )
627
+ ss_guidance_rescale = gr.Slider(
628
+ 0.0, 1.0, label="Guidance Rescale", value=0.7, step=0.01
629
+ )
630
+ ss_sampling_steps = gr.Slider(
631
+ 1, 50, label="Sampling Steps", value=8, step=1
632
+ )
633
+ ss_rescale_t = gr.Slider(
634
+ 1.0, 6.0, label="Rescale T", value=5.0, step=0.1
635
+ )
636
  gr.Markdown("Stage 2: Shape Generation")
637
  with gr.Row():
638
+ shape_slat_guidance_strength = gr.Slider(
639
+ 1.0, 10.0, label="Guidance Strength", value=7.5, step=0.1
640
+ )
641
+ shape_slat_guidance_rescale = gr.Slider(
642
+ 0.0, 1.0, label="Guidance Rescale", value=0.5, step=0.01
643
+ )
644
+ shape_slat_sampling_steps = gr.Slider(
645
+ 1, 50, label="Sampling Steps", value=8, step=1
646
+ )
647
+ shape_slat_rescale_t = gr.Slider(
648
+ 1.0, 6.0, label="Rescale T", value=3.0, step=0.1
649
+ )
650
  gr.Markdown("Stage 3: Material Generation")
651
  with gr.Row():
652
+ tex_slat_guidance_strength = gr.Slider(
653
+ 1.0, 10.0, label="Guidance Strength", value=1.0, step=0.1
654
+ )
655
+ tex_slat_guidance_rescale = gr.Slider(
656
+ 0.0, 1.0, label="Guidance Rescale", value=0.0, step=0.01
657
+ )
658
+ tex_slat_sampling_steps = gr.Slider(
659
+ 1, 50, label="Sampling Steps", value=8, step=1
660
+ )
661
+ tex_slat_rescale_t = gr.Slider(
662
+ 1.0, 6.0, label="Rescale T", value=3.0, step=0.1
663
+ )
664
 
665
  with gr.Column(scale=10):
666
  with gr.Walkthrough(selected=0) as walkthrough:
667
  with gr.Step("Preview", id=0):
668
+ preview_output = gr.HTML(
669
+ empty_html,
670
+ label="3D Asset Preview",
671
+ show_label=True,
672
+ container=True,
673
+ )
674
  extract_btn = gr.Button("Extract GLB")
675
  with gr.Step("Extract", id=1):
676
+ glb_output = gr.Model3D(
677
+ label="Extracted GLB",
678
+ height=724,
679
+ show_label=True,
680
+ display_mode="solid",
681
+ clear_color=(0.25, 0.25, 0.25, 1.0),
682
+ )
683
  download_btn = gr.DownloadButton(label="Download GLB")
684
+ gr.Markdown(
685
+ "*We are actively working on improving the speed of GLB extraction. Currently, it may take half a minute or more and face count is limited.*"
686
+ )
687
+
688
  with gr.Column(scale=1, min_width=172):
689
  examples = gr.Examples(
690
  examples=[
691
+ f"assets/example_image/{image}"
692
  for image in os.listdir("assets/example_image")
693
  ],
694
  inputs=[image_prompt],
 
697
  run_on_click=True,
698
  examples_per_page=18,
699
  )
700
+
701
  output_buf = gr.State()
 
702
 
703
  # Handlers
704
  demo.load(start_session)
705
  demo.unload(end_session)
706
+
707
  image_prompt.upload(
708
  preprocess_image,
709
  inputs=[image_prompt],
 
714
  get_seed,
715
  inputs=[randomize_seed, seed],
716
  outputs=[seed],
717
+ ).then(lambda: gr.Walkthrough(selected=0), outputs=walkthrough).then(
 
 
718
  image_to_3d,
719
  inputs=[
720
+ image_prompt,
721
+ seed,
722
+ resolution,
723
+ ss_guidance_strength,
724
+ ss_guidance_rescale,
725
+ ss_sampling_steps,
726
+ ss_rescale_t,
727
+ shape_slat_guidance_strength,
728
+ shape_slat_guidance_rescale,
729
+ shape_slat_sampling_steps,
730
+ shape_slat_rescale_t,
731
+ tex_slat_guidance_strength,
732
+ tex_slat_guidance_rescale,
733
+ tex_slat_sampling_steps,
734
+ tex_slat_rescale_t,
735
  ],
736
  outputs=[output_buf, preview_output],
737
  )
738
+
739
+ extract_btn.click(lambda: gr.Walkthrough(selected=1), outputs=walkthrough).then(
 
 
740
  extract_glb,
741
  inputs=[output_buf, decimation_target, texture_size],
742
  outputs=[glb_output, download_btn],
743
  )
744
+
745
 
746
  # Launch the Gradio app
747
  if __name__ == "__main__":
 
750
  # Construct ui components
751
  btn_img_base64_strs = {}
752
  for i in range(len(MODES)):
753
+ icon = Image.open(MODES[i]["icon"])
754
+ MODES[i]["icon_base64"] = image_to_base64(icon)
755
 
756
  rmbg_client = Client("briaai/BRIA-RMBG-2.0")
757
+ pipeline = Trellis2ImageTo3DPipeline.from_pretrained("microsoft/TRELLIS.2-4B")
758
  pipeline.rembg_model = None
759
  pipeline.low_vram = False
760
  pipeline.cuda()
761
+
762
  envmap = {
763
+ "forest": EnvMap(
764
+ torch.tensor(
765
+ cv2.cvtColor(
766
+ cv2.imread("assets/hdri/forest.exr", cv2.IMREAD_UNCHANGED),
767
+ cv2.COLOR_BGR2RGB,
768
+ ),
769
+ dtype=torch.float32,
770
+ device="cuda",
771
+ )
772
+ ),
773
+ "sunset": EnvMap(
774
+ torch.tensor(
775
+ cv2.cvtColor(
776
+ cv2.imread("assets/hdri/sunset.exr", cv2.IMREAD_UNCHANGED),
777
+ cv2.COLOR_BGR2RGB,
778
+ ),
779
+ dtype=torch.float32,
780
+ device="cuda",
781
+ )
782
+ ),
783
+ "courtyard": EnvMap(
784
+ torch.tensor(
785
+ cv2.cvtColor(
786
+ cv2.imread("assets/hdri/courtyard.exr", cv2.IMREAD_UNCHANGED),
787
+ cv2.COLOR_BGR2RGB,
788
+ ),
789
+ dtype=torch.float32,
790
+ device="cuda",
791
+ )
792
+ ),
793
  }
794
+
795
+ # demo.launch(css=css, head=head)
796
  demo.launch(server_name="0.0.0.0", server_port=7860, css=css, head=head)
trellis2/pipelines/trellis2_image_to_3d.py CHANGED
@@ -28,6 +28,7 @@ class Trellis2ImageTo3DPipeline(Pipeline):
28
  rembg_model (Callable): The model for removing background.
29
  low_vram (bool): Whether to use low-VRAM mode.
30
  """
 
31
  def __init__(
32
  self,
33
  models: dict[str, nn.Module] = None,
@@ -42,7 +43,7 @@ class Trellis2ImageTo3DPipeline(Pipeline):
42
  image_cond_model: Callable = None,
43
  rembg_model: Callable = None,
44
  low_vram: bool = True,
45
- default_pipeline_type: str = '1024_cascade',
46
  ):
47
  if models is None:
48
  return
@@ -60,12 +61,12 @@ class Trellis2ImageTo3DPipeline(Pipeline):
60
  self.low_vram = low_vram
61
  self.default_pipeline_type = default_pipeline_type
62
  self.pbr_attr_layout = {
63
- 'base_color': slice(0, 3),
64
- 'metallic': slice(3, 4),
65
- 'roughness': slice(4, 5),
66
- 'alpha': slice(5, 6),
67
  }
68
- self._device = 'cpu'
69
 
70
  @staticmethod
71
  def from_pretrained(path: str) -> "Trellis2ImageTo3DPipeline":
@@ -75,35 +76,51 @@ class Trellis2ImageTo3DPipeline(Pipeline):
75
  Args:
76
  path (str): The path to the model. Can be either local path or a Hugging Face repository.
77
  """
78
- pipeline = super(Trellis2ImageTo3DPipeline, Trellis2ImageTo3DPipeline).from_pretrained(path)
 
 
79
  new_pipeline = Trellis2ImageTo3DPipeline()
80
  new_pipeline.__dict__ = pipeline.__dict__
81
  args = pipeline._pretrained_args
82
 
83
- new_pipeline.sparse_structure_sampler = getattr(samplers, args['sparse_structure_sampler']['name'])(**args['sparse_structure_sampler']['args'])
84
- new_pipeline.sparse_structure_sampler_params = args['sparse_structure_sampler']['params']
 
 
 
 
 
 
 
 
 
85
 
86
- new_pipeline.shape_slat_sampler = getattr(samplers, args['shape_slat_sampler']['name'])(**args['shape_slat_sampler']['args'])
87
- new_pipeline.shape_slat_sampler_params = args['shape_slat_sampler']['params']
 
 
88
 
89
- new_pipeline.tex_slat_sampler = getattr(samplers, args['tex_slat_sampler']['name'])(**args['tex_slat_sampler']['args'])
90
- new_pipeline.tex_slat_sampler_params = args['tex_slat_sampler']['params']
91
 
92
- new_pipeline.shape_slat_normalization = args['shape_slat_normalization']
93
- new_pipeline.tex_slat_normalization = args['tex_slat_normalization']
 
 
 
 
94
 
95
- new_pipeline.image_cond_model = getattr(image_feature_extractor, args['image_cond_model']['name'])(**args['image_cond_model']['args'])
96
- new_pipeline.rembg_model = getattr(rembg, args['rembg_model']['name'])(**args['rembg_model']['args'])
97
-
98
- new_pipeline.low_vram = args.get('low_vram', True)
99
- new_pipeline.default_pipeline_type = args.get('default_pipeline_type', '1024_cascade')
100
  new_pipeline.pbr_attr_layout = {
101
- 'base_color': slice(0, 3),
102
- 'metallic': slice(3, 4),
103
- 'roughness': slice(4, 5),
104
- 'alpha': slice(5, 6),
105
  }
106
- new_pipeline._device = 'cpu'
107
 
108
  return new_pipeline
109
 
@@ -121,18 +138,21 @@ class Trellis2ImageTo3DPipeline(Pipeline):
121
  """
122
  # if has alpha channel, use it directly; otherwise, remove background
123
  has_alpha = False
124
- if input.mode == 'RGBA':
125
  alpha = np.array(input)[:, :, 3]
126
  if not np.all(alpha == 255):
127
  has_alpha = True
128
  max_size = max(input.size)
129
  scale = min(1, 1024 / max_size)
130
  if scale < 1:
131
- input = input.resize((int(input.width * scale), int(input.height * scale)), Image.Resampling.LANCZOS)
 
 
 
132
  if has_alpha:
133
  output = input
134
  else:
135
- input = input.convert('RGB')
136
  if self.low_vram:
137
  self.rembg_model.to(self.device)
138
  output = self.rembg_model(input)
@@ -141,18 +161,33 @@ class Trellis2ImageTo3DPipeline(Pipeline):
141
  output_np = np.array(output)
142
  alpha = output_np[:, :, 3]
143
  bbox = np.argwhere(alpha > 0.8 * 255)
144
- bbox = np.min(bbox[:, 1]), np.min(bbox[:, 0]), np.max(bbox[:, 1]), np.max(bbox[:, 0])
 
 
 
 
 
145
  center = (bbox[0] + bbox[2]) / 2, (bbox[1] + bbox[3]) / 2
146
  size = max(bbox[2] - bbox[0], bbox[3] - bbox[1])
147
  size = int(size * 1)
148
- bbox = center[0] - size // 2, center[1] - size // 2, center[0] + size // 2, center[1] + size // 2
 
 
 
 
 
149
  output = output.crop(bbox) # type: ignore
150
  output = np.array(output).astype(np.float32) / 255
151
  output = output[:, :, :3] * output[:, :, 3:4]
152
  output = Image.fromarray((output * 255).astype(np.uint8))
153
  return output
154
-
155
- def get_cond(self, image: Union[torch.Tensor, list[Image.Image]], resolution: int, include_neg_cond: bool = True) -> dict:
 
 
 
 
 
156
  """
157
  Get the conditioning information for the model.
158
 
@@ -169,11 +204,11 @@ class Trellis2ImageTo3DPipeline(Pipeline):
169
  if self.low_vram:
170
  self.image_cond_model.cpu()
171
  if not include_neg_cond:
172
- return {'cond': cond}
173
  neg_cond = torch.zeros_like(cond)
174
  return {
175
- 'cond': cond,
176
- 'neg_cond': neg_cond,
177
  }
178
 
179
  def sample_sparse_structure(
@@ -185,7 +220,7 @@ class Trellis2ImageTo3DPipeline(Pipeline):
185
  ) -> torch.Tensor:
186
  """
187
  Sample sparse structures with the given conditioning.
188
-
189
  Args:
190
  cond (dict): The conditioning information.
191
  resolution (int): The resolution of the sparse structure.
@@ -193,7 +228,7 @@ class Trellis2ImageTo3DPipeline(Pipeline):
193
  sampler_params (dict): Additional parameters for the sampler.
194
  """
195
  # Sample sparse structure latent
196
- flow_model = self.models['sparse_structure_flow_model']
197
  reso = flow_model.resolution
198
  in_channels = flow_model.in_channels
199
  noise = torch.randn(num_samples, in_channels, reso, reso, reso).to(self.device)
@@ -210,17 +245,19 @@ class Trellis2ImageTo3DPipeline(Pipeline):
210
  ).samples
211
  if self.low_vram:
212
  flow_model.cpu()
213
-
214
  # Decode sparse structure latent
215
- decoder = self.models['sparse_structure_decoder']
216
  if self.low_vram:
217
  decoder.to(self.device)
218
- decoded = decoder(z_s)>0
219
  if self.low_vram:
220
  decoder.cpu()
221
  if resolution != decoded.shape[2]:
222
  ratio = decoded.shape[2] // resolution
223
- decoded = torch.nn.functional.max_pool3d(decoded.float(), ratio, ratio, 0) > 0.5
 
 
224
  coords = torch.argwhere(decoded)[:, [0, 2, 3, 4]].int()
225
 
226
  return coords
@@ -234,7 +271,7 @@ class Trellis2ImageTo3DPipeline(Pipeline):
234
  ) -> SparseTensor:
235
  """
236
  Sample structured latent with the given conditioning.
237
-
238
  Args:
239
  cond (dict): The conditioning information.
240
  coords (torch.Tensor): The coordinates of the sparse structure.
@@ -259,12 +296,12 @@ class Trellis2ImageTo3DPipeline(Pipeline):
259
  if self.low_vram:
260
  flow_model.cpu()
261
 
262
- std = torch.tensor(self.shape_slat_normalization['std'])[None].to(slat.device)
263
- mean = torch.tensor(self.shape_slat_normalization['mean'])[None].to(slat.device)
264
  slat = slat * std + mean
265
-
266
  return slat
267
-
268
  def sample_shape_slat_cascade(
269
  self,
270
  lr_cond: dict,
@@ -279,7 +316,7 @@ class Trellis2ImageTo3DPipeline(Pipeline):
279
  ) -> SparseTensor:
280
  """
281
  Sample structured latent with the given conditioning.
282
-
283
  Args:
284
  cond (dict): The conditioning information.
285
  coords (torch.Tensor): The coordinates of the sparse structure.
@@ -287,7 +324,9 @@ class Trellis2ImageTo3DPipeline(Pipeline):
287
  """
288
  # LR
289
  noise = SparseTensor(
290
- feats=torch.randn(coords.shape[0], flow_model_lr.in_channels).to(self.device),
 
 
291
  coords=coords,
292
  )
293
  sampler_params = {**self.shape_slat_sampler_params, **sampler_params}
@@ -303,32 +342,39 @@ class Trellis2ImageTo3DPipeline(Pipeline):
303
  ).samples
304
  if self.low_vram:
305
  flow_model_lr.cpu()
306
- std = torch.tensor(self.shape_slat_normalization['std'])[None].to(slat.device)
307
- mean = torch.tensor(self.shape_slat_normalization['mean'])[None].to(slat.device)
308
  slat = slat * std + mean
309
-
310
  # Upsample
311
  if self.low_vram:
312
- self.models['shape_slat_decoder'].to(self.device)
313
- self.models['shape_slat_decoder'].low_vram = True
314
- hr_coords = self.models['shape_slat_decoder'].upsample(slat, upsample_times=4)
315
  if self.low_vram:
316
- self.models['shape_slat_decoder'].cpu()
317
- self.models['shape_slat_decoder'].low_vram = False
318
  hr_resolution = resolution
319
  while True:
320
- quant_coords = torch.cat([
321
- hr_coords[:, :1],
322
- ((hr_coords[:, 1:] + 0.5) / lr_resolution * (hr_resolution // 16)).int(),
323
- ], dim=1)
 
 
 
 
 
324
  coords = quant_coords.unique(dim=0)
325
  num_tokens = coords.shape[0]
326
  if num_tokens < max_num_tokens or hr_resolution == 1024:
327
  if hr_resolution != resolution:
328
- print(f"Due to the limited number of tokens, the resolution is reduced to {hr_resolution}.")
 
 
329
  break
330
  hr_resolution -= 128
331
-
332
  # Sample structured latent
333
  noise = SparseTensor(
334
  feats=torch.randn(coords.shape[0], flow_model.in_channels).to(self.device),
@@ -348,10 +394,10 @@ class Trellis2ImageTo3DPipeline(Pipeline):
348
  if self.low_vram:
349
  flow_model.cpu()
350
 
351
- std = torch.tensor(self.shape_slat_normalization['std'])[None].to(slat.device)
352
- mean = torch.tensor(self.shape_slat_normalization['mean'])[None].to(slat.device)
353
  slat = slat * std + mean
354
-
355
  return slat, hr_resolution
356
 
357
  def decode_shape_slat(
@@ -370,16 +416,16 @@ class Trellis2ImageTo3DPipeline(Pipeline):
370
  List[Mesh]: The decoded meshes.
371
  List[SparseTensor]: The decoded substructures.
372
  """
373
- self.models['shape_slat_decoder'].set_resolution(resolution)
374
  if self.low_vram:
375
- self.models['shape_slat_decoder'].to(self.device)
376
- self.models['shape_slat_decoder'].low_vram = True
377
- ret = self.models['shape_slat_decoder'](slat, return_subs=True)
378
  if self.low_vram:
379
- self.models['shape_slat_decoder'].cpu()
380
- self.models['shape_slat_decoder'].low_vram = False
381
  return ret
382
-
383
  def sample_tex_slat(
384
  self,
385
  cond: dict,
@@ -389,19 +435,31 @@ class Trellis2ImageTo3DPipeline(Pipeline):
389
  ) -> SparseTensor:
390
  """
391
  Sample structured latent with the given conditioning.
392
-
393
  Args:
394
  cond (dict): The conditioning information.
395
  shape_slat (SparseTensor): The structured latent for shape
396
  sampler_params (dict): Additional parameters for the sampler.
397
  """
398
  # Sample structured latent
399
- std = torch.tensor(self.shape_slat_normalization['std'])[None].to(shape_slat.device)
400
- mean = torch.tensor(self.shape_slat_normalization['mean'])[None].to(shape_slat.device)
 
 
 
 
401
  shape_slat = (shape_slat - mean) / std
402
 
403
- in_channels = flow_model.in_channels if isinstance(flow_model, nn.Module) else flow_model[0].in_channels
404
- noise = shape_slat.replace(feats=torch.randn(shape_slat.coords.shape[0], in_channels - shape_slat.feats.shape[1]).to(self.device))
 
 
 
 
 
 
 
 
405
  sampler_params = {**self.tex_slat_sampler_params, **sampler_params}
406
  if self.low_vram:
407
  flow_model.to(self.device)
@@ -417,10 +475,10 @@ class Trellis2ImageTo3DPipeline(Pipeline):
417
  if self.low_vram:
418
  flow_model.cpu()
419
 
420
- std = torch.tensor(self.tex_slat_normalization['std'])[None].to(slat.device)
421
- mean = torch.tensor(self.tex_slat_normalization['mean'])[None].to(slat.device)
422
  slat = slat * std + mean
423
-
424
  return slat
425
 
426
  def decode_tex_slat(
@@ -439,12 +497,12 @@ class Trellis2ImageTo3DPipeline(Pipeline):
439
  List[SparseTensor]: The decoded texture voxels
440
  """
441
  if self.low_vram:
442
- self.models['tex_slat_decoder'].to(self.device)
443
- ret = self.models['tex_slat_decoder'](slat, guide_subs=subs) * 0.5 + 0.5
444
  if self.low_vram:
445
- self.models['tex_slat_decoder'].cpu()
446
  return ret
447
-
448
  @torch.no_grad()
449
  def decode_latent(
450
  self,
@@ -467,17 +525,18 @@ class Trellis2ImageTo3DPipeline(Pipeline):
467
  m.fill_holes()
468
  out_mesh.append(
469
  MeshWithVoxel(
470
- m.vertices, m.faces,
471
- origin = [-0.5, -0.5, -0.5],
472
- voxel_size = 1 / resolution,
473
- coords = v.coords[:, 1:],
474
- attrs = v.feats,
475
- voxel_shape = torch.Size([*v.shape, *v.spatial_shape]),
476
- layout=self.pbr_attr_layout
 
477
  )
478
  )
479
  return out_mesh
480
-
481
  @torch.no_grad()
482
  def run(
483
  self,
@@ -509,76 +568,117 @@ class Trellis2ImageTo3DPipeline(Pipeline):
509
  """
510
  # Check pipeline type
511
  pipeline_type = pipeline_type or self.default_pipeline_type
512
- if pipeline_type == '512':
513
- assert 'shape_slat_flow_model_512' in self.models, "No 512 resolution shape SLat flow model found."
514
- assert 'tex_slat_flow_model_512' in self.models, "No 512 resolution texture SLat flow model found."
515
- elif pipeline_type == '1024':
516
- assert 'shape_slat_flow_model_1024' in self.models, "No 1024 resolution shape SLat flow model found."
517
- assert 'tex_slat_flow_model_1024' in self.models, "No 1024 resolution texture SLat flow model found."
518
- elif pipeline_type == '1024_cascade':
519
- assert 'shape_slat_flow_model_512' in self.models, "No 512 resolution shape SLat flow model found."
520
- assert 'shape_slat_flow_model_1024' in self.models, "No 1024 resolution shape SLat flow model found."
521
- assert 'tex_slat_flow_model_1024' in self.models, "No 1024 resolution texture SLat flow model found."
522
- elif pipeline_type == '1536_cascade':
523
- assert 'shape_slat_flow_model_512' in self.models, "No 512 resolution shape SLat flow model found."
524
- assert 'shape_slat_flow_model_1024' in self.models, "No 1024 resolution shape SLat flow model found."
525
- assert 'tex_slat_flow_model_1024' in self.models, "No 1024 resolution texture SLat flow model found."
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
526
  else:
527
  raise ValueError(f"Invalid pipeline type: {pipeline_type}")
528
-
529
  if preprocess_image:
530
  image = self.preprocess_image(image)
531
  torch.manual_seed(seed)
532
  cond_512 = self.get_cond([image], 512)
533
- cond_1024 = self.get_cond([image], 1024) if pipeline_type != '512' else None
534
- ss_res = {'512': 32, '1024': 64, '1024_cascade': 32, '1536_cascade': 32}[pipeline_type]
 
 
535
  coords = self.sample_sparse_structure(
536
- cond_512, ss_res,
537
- num_samples, sparse_structure_sampler_params
538
  )
539
- if pipeline_type == '512':
540
  shape_slat = self.sample_shape_slat(
541
- cond_512, self.models['shape_slat_flow_model_512'],
542
- coords, shape_slat_sampler_params
 
 
543
  )
544
  tex_slat = self.sample_tex_slat(
545
- cond_512, self.models['tex_slat_flow_model_512'],
546
- shape_slat, tex_slat_sampler_params
 
 
547
  )
548
  res = 512
549
- elif pipeline_type == '1024':
550
  shape_slat = self.sample_shape_slat(
551
- cond_1024, self.models['shape_slat_flow_model_1024'],
552
- coords, shape_slat_sampler_params
 
 
553
  )
554
  tex_slat = self.sample_tex_slat(
555
- cond_1024, self.models['tex_slat_flow_model_1024'],
556
- shape_slat, tex_slat_sampler_params
 
 
557
  )
558
  res = 1024
559
- elif pipeline_type == '1024_cascade':
560
  shape_slat, res = self.sample_shape_slat_cascade(
561
- cond_512, cond_1024,
562
- self.models['shape_slat_flow_model_512'], self.models['shape_slat_flow_model_1024'],
563
- 512, 1024,
564
- coords, shape_slat_sampler_params,
565
- max_num_tokens
 
 
 
 
566
  )
567
  tex_slat = self.sample_tex_slat(
568
- cond_1024, self.models['tex_slat_flow_model_1024'],
569
- shape_slat, tex_slat_sampler_params
 
 
570
  )
571
- elif pipeline_type == '1536_cascade':
572
  shape_slat, res = self.sample_shape_slat_cascade(
573
- cond_512, cond_1024,
574
- self.models['shape_slat_flow_model_512'], self.models['shape_slat_flow_model_1024'],
575
- 512, 1536,
576
- coords, shape_slat_sampler_params,
577
- max_num_tokens
 
 
 
 
578
  )
579
  tex_slat = self.sample_tex_slat(
580
- cond_1024, self.models['tex_slat_flow_model_1024'],
581
- shape_slat, tex_slat_sampler_params
 
 
582
  )
583
  torch.cuda.empty_cache()
584
  out_mesh = self.decode_latent(shape_slat, tex_slat, res)
 
28
  rembg_model (Callable): The model for removing background.
29
  low_vram (bool): Whether to use low-VRAM mode.
30
  """
31
+
32
  def __init__(
33
  self,
34
  models: dict[str, nn.Module] = None,
 
43
  image_cond_model: Callable = None,
44
  rembg_model: Callable = None,
45
  low_vram: bool = True,
46
+ default_pipeline_type: str = "1024_cascade",
47
  ):
48
  if models is None:
49
  return
 
61
  self.low_vram = low_vram
62
  self.default_pipeline_type = default_pipeline_type
63
  self.pbr_attr_layout = {
64
+ "base_color": slice(0, 3),
65
+ "metallic": slice(3, 4),
66
+ "roughness": slice(4, 5),
67
+ "alpha": slice(5, 6),
68
  }
69
+ self._device = "cpu"
70
 
71
  @staticmethod
72
  def from_pretrained(path: str) -> "Trellis2ImageTo3DPipeline":
 
76
  Args:
77
  path (str): The path to the model. Can be either local path or a Hugging Face repository.
78
  """
79
+ pipeline = super(
80
+ Trellis2ImageTo3DPipeline, Trellis2ImageTo3DPipeline
81
+ ).from_pretrained(path)
82
  new_pipeline = Trellis2ImageTo3DPipeline()
83
  new_pipeline.__dict__ = pipeline.__dict__
84
  args = pipeline._pretrained_args
85
 
86
+ new_pipeline.sparse_structure_sampler = getattr(
87
+ samplers, args["sparse_structure_sampler"]["name"]
88
+ )(**args["sparse_structure_sampler"]["args"])
89
+ new_pipeline.sparse_structure_sampler_params = args["sparse_structure_sampler"][
90
+ "params"
91
+ ]
92
+
93
+ new_pipeline.shape_slat_sampler = getattr(
94
+ samplers, args["shape_slat_sampler"]["name"]
95
+ )(**args["shape_slat_sampler"]["args"])
96
+ new_pipeline.shape_slat_sampler_params = args["shape_slat_sampler"]["params"]
97
 
98
+ new_pipeline.tex_slat_sampler = getattr(
99
+ samplers, args["tex_slat_sampler"]["name"]
100
+ )(**args["tex_slat_sampler"]["args"])
101
+ new_pipeline.tex_slat_sampler_params = args["tex_slat_sampler"]["params"]
102
 
103
+ new_pipeline.shape_slat_normalization = args["shape_slat_normalization"]
104
+ new_pipeline.tex_slat_normalization = args["tex_slat_normalization"]
105
 
106
+ new_pipeline.image_cond_model = getattr(
107
+ image_feature_extractor, args["image_cond_model"]["name"]
108
+ )(**args["image_cond_model"]["args"])
109
+ new_pipeline.rembg_model = getattr(rembg, args["rembg_model"]["name"])(
110
+ **args["rembg_model"]["args"]
111
+ )
112
 
113
+ new_pipeline.low_vram = args.get("low_vram", True)
114
+ new_pipeline.default_pipeline_type = args.get(
115
+ "default_pipeline_type", "1024_cascade"
116
+ )
 
117
  new_pipeline.pbr_attr_layout = {
118
+ "base_color": slice(0, 3),
119
+ "metallic": slice(3, 4),
120
+ "roughness": slice(4, 5),
121
+ "alpha": slice(5, 6),
122
  }
123
+ new_pipeline._device = "cpu"
124
 
125
  return new_pipeline
126
 
 
138
  """
139
  # if has alpha channel, use it directly; otherwise, remove background
140
  has_alpha = False
141
+ if input.mode == "RGBA":
142
  alpha = np.array(input)[:, :, 3]
143
  if not np.all(alpha == 255):
144
  has_alpha = True
145
  max_size = max(input.size)
146
  scale = min(1, 1024 / max_size)
147
  if scale < 1:
148
+ input = input.resize(
149
+ (int(input.width * scale), int(input.height * scale)),
150
+ Image.Resampling.LANCZOS,
151
+ )
152
  if has_alpha:
153
  output = input
154
  else:
155
+ input = input.convert("RGB")
156
  if self.low_vram:
157
  self.rembg_model.to(self.device)
158
  output = self.rembg_model(input)
 
161
  output_np = np.array(output)
162
  alpha = output_np[:, :, 3]
163
  bbox = np.argwhere(alpha > 0.8 * 255)
164
+ bbox = (
165
+ np.min(bbox[:, 1]),
166
+ np.min(bbox[:, 0]),
167
+ np.max(bbox[:, 1]),
168
+ np.max(bbox[:, 0]),
169
+ )
170
  center = (bbox[0] + bbox[2]) / 2, (bbox[1] + bbox[3]) / 2
171
  size = max(bbox[2] - bbox[0], bbox[3] - bbox[1])
172
  size = int(size * 1)
173
+ bbox = (
174
+ center[0] - size // 2,
175
+ center[1] - size // 2,
176
+ center[0] + size // 2,
177
+ center[1] + size // 2,
178
+ )
179
  output = output.crop(bbox) # type: ignore
180
  output = np.array(output).astype(np.float32) / 255
181
  output = output[:, :, :3] * output[:, :, 3:4]
182
  output = Image.fromarray((output * 255).astype(np.uint8))
183
  return output
184
+
185
+ def get_cond(
186
+ self,
187
+ image: Union[torch.Tensor, list[Image.Image]],
188
+ resolution: int,
189
+ include_neg_cond: bool = True,
190
+ ) -> dict:
191
  """
192
  Get the conditioning information for the model.
193
 
 
204
  if self.low_vram:
205
  self.image_cond_model.cpu()
206
  if not include_neg_cond:
207
+ return {"cond": cond}
208
  neg_cond = torch.zeros_like(cond)
209
  return {
210
+ "cond": cond,
211
+ "neg_cond": neg_cond,
212
  }
213
 
214
  def sample_sparse_structure(
 
220
  ) -> torch.Tensor:
221
  """
222
  Sample sparse structures with the given conditioning.
223
+
224
  Args:
225
  cond (dict): The conditioning information.
226
  resolution (int): The resolution of the sparse structure.
 
228
  sampler_params (dict): Additional parameters for the sampler.
229
  """
230
  # Sample sparse structure latent
231
+ flow_model = self.models["sparse_structure_flow_model"]
232
  reso = flow_model.resolution
233
  in_channels = flow_model.in_channels
234
  noise = torch.randn(num_samples, in_channels, reso, reso, reso).to(self.device)
 
245
  ).samples
246
  if self.low_vram:
247
  flow_model.cpu()
248
+
249
  # Decode sparse structure latent
250
+ decoder = self.models["sparse_structure_decoder"]
251
  if self.low_vram:
252
  decoder.to(self.device)
253
+ decoded = decoder(z_s) > 0
254
  if self.low_vram:
255
  decoder.cpu()
256
  if resolution != decoded.shape[2]:
257
  ratio = decoded.shape[2] // resolution
258
+ decoded = (
259
+ torch.nn.functional.max_pool3d(decoded.float(), ratio, ratio, 0) > 0.5
260
+ )
261
  coords = torch.argwhere(decoded)[:, [0, 2, 3, 4]].int()
262
 
263
  return coords
 
271
  ) -> SparseTensor:
272
  """
273
  Sample structured latent with the given conditioning.
274
+
275
  Args:
276
  cond (dict): The conditioning information.
277
  coords (torch.Tensor): The coordinates of the sparse structure.
 
296
  if self.low_vram:
297
  flow_model.cpu()
298
 
299
+ std = torch.tensor(self.shape_slat_normalization["std"])[None].to(slat.device)
300
+ mean = torch.tensor(self.shape_slat_normalization["mean"])[None].to(slat.device)
301
  slat = slat * std + mean
302
+
303
  return slat
304
+
305
  def sample_shape_slat_cascade(
306
  self,
307
  lr_cond: dict,
 
316
  ) -> SparseTensor:
317
  """
318
  Sample structured latent with the given conditioning.
319
+
320
  Args:
321
  cond (dict): The conditioning information.
322
  coords (torch.Tensor): The coordinates of the sparse structure.
 
324
  """
325
  # LR
326
  noise = SparseTensor(
327
+ feats=torch.randn(coords.shape[0], flow_model_lr.in_channels).to(
328
+ self.device
329
+ ),
330
  coords=coords,
331
  )
332
  sampler_params = {**self.shape_slat_sampler_params, **sampler_params}
 
342
  ).samples
343
  if self.low_vram:
344
  flow_model_lr.cpu()
345
+ std = torch.tensor(self.shape_slat_normalization["std"])[None].to(slat.device)
346
+ mean = torch.tensor(self.shape_slat_normalization["mean"])[None].to(slat.device)
347
  slat = slat * std + mean
348
+
349
  # Upsample
350
  if self.low_vram:
351
+ self.models["shape_slat_decoder"].to(self.device)
352
+ self.models["shape_slat_decoder"].low_vram = True
353
+ hr_coords = self.models["shape_slat_decoder"].upsample(slat, upsample_times=4)
354
  if self.low_vram:
355
+ self.models["shape_slat_decoder"].cpu()
356
+ self.models["shape_slat_decoder"].low_vram = False
357
  hr_resolution = resolution
358
  while True:
359
+ quant_coords = torch.cat(
360
+ [
361
+ hr_coords[:, :1],
362
+ (
363
+ (hr_coords[:, 1:] + 0.5) / lr_resolution * (hr_resolution // 16)
364
+ ).int(),
365
+ ],
366
+ dim=1,
367
+ )
368
  coords = quant_coords.unique(dim=0)
369
  num_tokens = coords.shape[0]
370
  if num_tokens < max_num_tokens or hr_resolution == 1024:
371
  if hr_resolution != resolution:
372
+ print(
373
+ f"Due to the limited number of tokens, the resolution is reduced to {hr_resolution}."
374
+ )
375
  break
376
  hr_resolution -= 128
377
+
378
  # Sample structured latent
379
  noise = SparseTensor(
380
  feats=torch.randn(coords.shape[0], flow_model.in_channels).to(self.device),
 
394
  if self.low_vram:
395
  flow_model.cpu()
396
 
397
+ std = torch.tensor(self.shape_slat_normalization["std"])[None].to(slat.device)
398
+ mean = torch.tensor(self.shape_slat_normalization["mean"])[None].to(slat.device)
399
  slat = slat * std + mean
400
+
401
  return slat, hr_resolution
402
 
403
  def decode_shape_slat(
 
416
  List[Mesh]: The decoded meshes.
417
  List[SparseTensor]: The decoded substructures.
418
  """
419
+ self.models["shape_slat_decoder"].set_resolution(resolution)
420
  if self.low_vram:
421
+ self.models["shape_slat_decoder"].to(self.device)
422
+ self.models["shape_slat_decoder"].low_vram = True
423
+ ret = self.models["shape_slat_decoder"](slat, return_subs=True)
424
  if self.low_vram:
425
+ self.models["shape_slat_decoder"].cpu()
426
+ self.models["shape_slat_decoder"].low_vram = False
427
  return ret
428
+
429
  def sample_tex_slat(
430
  self,
431
  cond: dict,
 
435
  ) -> SparseTensor:
436
  """
437
  Sample structured latent with the given conditioning.
438
+
439
  Args:
440
  cond (dict): The conditioning information.
441
  shape_slat (SparseTensor): The structured latent for shape
442
  sampler_params (dict): Additional parameters for the sampler.
443
  """
444
  # Sample structured latent
445
+ std = torch.tensor(self.shape_slat_normalization["std"])[None].to(
446
+ shape_slat.device
447
+ )
448
+ mean = torch.tensor(self.shape_slat_normalization["mean"])[None].to(
449
+ shape_slat.device
450
+ )
451
  shape_slat = (shape_slat - mean) / std
452
 
453
+ in_channels = (
454
+ flow_model.in_channels
455
+ if isinstance(flow_model, nn.Module)
456
+ else flow_model[0].in_channels
457
+ )
458
+ noise = shape_slat.replace(
459
+ feats=torch.randn(
460
+ shape_slat.coords.shape[0], in_channels - shape_slat.feats.shape[1]
461
+ ).to(self.device)
462
+ )
463
  sampler_params = {**self.tex_slat_sampler_params, **sampler_params}
464
  if self.low_vram:
465
  flow_model.to(self.device)
 
475
  if self.low_vram:
476
  flow_model.cpu()
477
 
478
+ std = torch.tensor(self.tex_slat_normalization["std"])[None].to(slat.device)
479
+ mean = torch.tensor(self.tex_slat_normalization["mean"])[None].to(slat.device)
480
  slat = slat * std + mean
481
+
482
  return slat
483
 
484
  def decode_tex_slat(
 
497
  List[SparseTensor]: The decoded texture voxels
498
  """
499
  if self.low_vram:
500
+ self.models["tex_slat_decoder"].to(self.device)
501
+ ret = self.models["tex_slat_decoder"](slat, guide_subs=subs) * 0.5 + 0.5
502
  if self.low_vram:
503
+ self.models["tex_slat_decoder"].cpu()
504
  return ret
505
+
506
  @torch.no_grad()
507
  def decode_latent(
508
  self,
 
525
  m.fill_holes()
526
  out_mesh.append(
527
  MeshWithVoxel(
528
+ m.vertices,
529
+ m.faces,
530
+ origin=[-0.5, -0.5, -0.5],
531
+ voxel_size=1 / resolution,
532
+ coords=v.coords[:, 1:],
533
+ attrs=v.feats,
534
+ voxel_shape=torch.Size([*v.shape, *v.spatial_shape]),
535
+ layout=self.pbr_attr_layout,
536
  )
537
  )
538
  return out_mesh
539
+
540
  @torch.no_grad()
541
  def run(
542
  self,
 
568
  """
569
  # Check pipeline type
570
  pipeline_type = pipeline_type or self.default_pipeline_type
571
+ if pipeline_type == "512":
572
+ assert "shape_slat_flow_model_512" in self.models, (
573
+ "No 512 resolution shape SLat flow model found."
574
+ )
575
+ assert "tex_slat_flow_model_512" in self.models, (
576
+ "No 512 resolution texture SLat flow model found."
577
+ )
578
+ elif pipeline_type == "1024":
579
+ assert "shape_slat_flow_model_1024" in self.models, (
580
+ "No 1024 resolution shape SLat flow model found."
581
+ )
582
+ assert "tex_slat_flow_model_1024" in self.models, (
583
+ "No 1024 resolution texture SLat flow model found."
584
+ )
585
+ elif pipeline_type == "1024_cascade":
586
+ assert "shape_slat_flow_model_512" in self.models, (
587
+ "No 512 resolution shape SLat flow model found."
588
+ )
589
+ assert "shape_slat_flow_model_1024" in self.models, (
590
+ "No 1024 resolution shape SLat flow model found."
591
+ )
592
+ assert "tex_slat_flow_model_1024" in self.models, (
593
+ "No 1024 resolution texture SLat flow model found."
594
+ )
595
+ elif pipeline_type == "1536_cascade":
596
+ assert "shape_slat_flow_model_512" in self.models, (
597
+ "No 512 resolution shape SLat flow model found."
598
+ )
599
+ assert "shape_slat_flow_model_1024" in self.models, (
600
+ "No 1024 resolution shape SLat flow model found."
601
+ )
602
+ assert "tex_slat_flow_model_1024" in self.models, (
603
+ "No 1024 resolution texture SLat flow model found."
604
+ )
605
  else:
606
  raise ValueError(f"Invalid pipeline type: {pipeline_type}")
607
+
608
  if preprocess_image:
609
  image = self.preprocess_image(image)
610
  torch.manual_seed(seed)
611
  cond_512 = self.get_cond([image], 512)
612
+ cond_1024 = self.get_cond([image], 1024) if pipeline_type != "512" else None
613
+ ss_res = {"512": 32, "1024": 64, "1024_cascade": 32, "1536_cascade": 32}[
614
+ pipeline_type
615
+ ]
616
  coords = self.sample_sparse_structure(
617
+ cond_512, ss_res, num_samples, sparse_structure_sampler_params
 
618
  )
619
+ if pipeline_type == "512":
620
  shape_slat = self.sample_shape_slat(
621
+ cond_512,
622
+ self.models["shape_slat_flow_model_512"],
623
+ coords,
624
+ shape_slat_sampler_params,
625
  )
626
  tex_slat = self.sample_tex_slat(
627
+ cond_512,
628
+ self.models["tex_slat_flow_model_512"],
629
+ shape_slat,
630
+ tex_slat_sampler_params,
631
  )
632
  res = 512
633
+ elif pipeline_type == "1024":
634
  shape_slat = self.sample_shape_slat(
635
+ cond_1024,
636
+ self.models["shape_slat_flow_model_1024"],
637
+ coords,
638
+ shape_slat_sampler_params,
639
  )
640
  tex_slat = self.sample_tex_slat(
641
+ cond_1024,
642
+ self.models["tex_slat_flow_model_1024"],
643
+ shape_slat,
644
+ tex_slat_sampler_params,
645
  )
646
  res = 1024
647
+ elif pipeline_type == "1024_cascade":
648
  shape_slat, res = self.sample_shape_slat_cascade(
649
+ cond_512,
650
+ cond_1024,
651
+ self.models["shape_slat_flow_model_512"],
652
+ self.models["shape_slat_flow_model_1024"],
653
+ 512,
654
+ 1024,
655
+ coords,
656
+ shape_slat_sampler_params,
657
+ max_num_tokens,
658
  )
659
  tex_slat = self.sample_tex_slat(
660
+ cond_1024,
661
+ self.models["tex_slat_flow_model_1024"],
662
+ shape_slat,
663
+ tex_slat_sampler_params,
664
  )
665
+ elif pipeline_type == "1536_cascade":
666
  shape_slat, res = self.sample_shape_slat_cascade(
667
+ cond_512,
668
+ cond_1024,
669
+ self.models["shape_slat_flow_model_512"],
670
+ self.models["shape_slat_flow_model_1024"],
671
+ 512,
672
+ 1536,
673
+ coords,
674
+ shape_slat_sampler_params,
675
+ max_num_tokens,
676
  )
677
  tex_slat = self.sample_tex_slat(
678
+ cond_1024,
679
+ self.models["tex_slat_flow_model_1024"],
680
+ shape_slat,
681
+ tex_slat_sampler_params,
682
  )
683
  torch.cuda.empty_cache()
684
  out_mesh = self.decode_latent(shape_slat, tex_slat, res)