xinjie.wang committed
Commit ddc47cd · Parent: a8ea627
app.py CHANGED
@@ -17,7 +17,9 @@
 
 import os
 
-os.environ["GRADIO_APP"] = "imageto3d"
+# GRADIO_APP == "imageto3d_sam3d", sam3d object model, by default.
+# GRADIO_APP == "imageto3d", TRELLIS model.
+os.environ["GRADIO_APP"] = "imageto3d_sam3d"
 from glob import glob
 
 import gradio as gr
@@ -30,13 +32,24 @@ from common import (
     extract_3d_representations_v3,
     extract_urdf,
     get_seed,
-    image_to_3d,
     preprocess_image_fn,
     preprocess_sam_image_fn,
     select_point,
     start_session,
 )
 
+app_name = os.getenv("GRADIO_APP")
+if app_name == "imageto3d_sam3d":
+    from common import image_to_3d_sam3d as image_to_3d
+
+    enable_pre_resize = False
+    sample_step = 25
+elif app_name == "imageto3d":
+    from common import image_to_3d
+
+    enable_pre_resize = True
+    sample_step = 12
+
 with gr.Blocks(delete_cache=(43200, 43200), theme=custom_theme) as demo:
     gr.HTML(image_css, visible=False)
     # gr.HTML(lighting_css, visible=False)
@@ -67,7 +80,7 @@ with gr.Blocks(delete_cache=(43200, 43200), theme=custom_theme) as demo:
         )
 
         with gr.Row():
-            with gr.Column(scale=2):
+            with gr.Column(scale=3):
                 with gr.Tabs() as input_tabs:
                     with gr.Tab(
                         label="Image(auto seg)", id=0
@@ -163,7 +176,11 @@ with gr.Blocks(delete_cache=(43200, 43200), theme=custom_theme) as demo:
                             step=0.1,
                         )
                         ss_sampling_steps = gr.Slider(
-                            1, 50, label="Sampling Steps", value=12, step=1
+                            1,
+                            50,
+                            label="Sampling Steps",
+                            value=sample_step,
+                            step=1,
                         )
                     gr.Markdown("Visual Appearance Generation")
                     with gr.Row():
@@ -175,7 +192,11 @@ with gr.Blocks(delete_cache=(43200, 43200), theme=custom_theme) as demo:
                             step=0.1,
                         )
                         slat_sampling_steps = gr.Slider(
-                            1, 50, label="Sampling Steps", value=12, step=1
+                            1,
+                            50,
+                            label="Sampling Steps",
+                            value=sample_step,
+                            step=1,
                         )
 
                 generate_btn = gr.Button(
@@ -242,7 +263,7 @@ with gr.Blocks(delete_cache=(43200, 43200), theme=custom_theme) as demo:
                    has quality inspection, open with an editor to view details.
                    """
                 )
-
+                enable_pre_resize = gr.State(enable_pre_resize)
                 with gr.Row() as single_image_example:
                     examples = gr.Examples(
                         label="Image Gallery",
@@ -252,7 +273,7 @@ with gr.Blocks(delete_cache=(43200, 43200), theme=custom_theme) as demo:
                                glob("assets/example_image/*")
                            )
                        ],
-                        inputs=[image_prompt, rmbg_tag],
+                        inputs=[image_prompt, rmbg_tag, enable_pre_resize],
                        fn=preprocess_image_fn,
                        outputs=[image_prompt, raw_image_cache],
                        run_on_click=True,
@@ -274,16 +295,16 @@ with gr.Blocks(delete_cache=(43200, 43200), theme=custom_theme) as demo:
                        run_on_click=True,
                        examples_per_page=10,
                    )
-        with gr.Column(scale=1):
+        with gr.Column(scale=2):
            gr.Markdown("<br>")
            video_output = gr.Video(
                label="Generated 3D Asset",
                autoplay=True,
                loop=True,
-                height=300,
+                height=400,
            )
            model_output_gs = gr.Model3D(
-                label="Gaussian Representation", height=300, interactive=False
+                label="Gaussian Representation", height=350, interactive=False
            )
            aligned_gs = gr.Textbox(visible=False)
            gr.Markdown(
@@ -292,9 +313,9 @@ with gr.Blocks(delete_cache=(43200, 43200), theme=custom_theme) as demo:
            with gr.Row():
                model_output_mesh = gr.Model3D(
                    label="Mesh Representation",
-                    height=300,
+                    height=350,
                    interactive=False,
-                    clear_color=[0.8, 0.8, 0.8, 1],
+                    clear_color=[0, 0, 0, 1],
                    elem_id="lighter_mesh",
                )
 
@@ -320,7 +341,7 @@ with gr.Blocks(delete_cache=(43200, 43200), theme=custom_theme) as demo:
 
    image_prompt.upload(
        preprocess_image_fn,
-        inputs=[image_prompt, rmbg_tag],
+        inputs=[image_prompt, rmbg_tag, enable_pre_resize],
        outputs=[image_prompt, raw_image_cache],
    )
    image_prompt.change(
@@ -437,11 +458,11 @@ with gr.Blocks(delete_cache=(43200, 43200), theme=custom_theme) as demo:
        inputs=[
            image_prompt,
            seed,
-            ss_guidance_strength,
            ss_sampling_steps,
-            slat_guidance_strength,
            slat_sampling_steps,
            raw_image_cache,
+            ss_guidance_strength,
+            slat_guidance_strength,
            image_seg_sam,
            is_samimage,
        ],
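
Note on the backend switch: common.py reads GRADIO_APP at import time to decide which pipeline to build, so the variable must be set before common is imported. A minimal launch sketch (names as in this commit; the TRELLIS branch shown for contrast):

    import os

    # Must run before `import common`, which inspects GRADIO_APP at import time.
    os.environ["GRADIO_APP"] = "imageto3d"  # TRELLIS; this commit defaults to "imageto3d_sam3d"

    from common import image_to_3d  # the sam3d branch would alias image_to_3d_sam3d instead
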
app_style.py CHANGED
@@ -1,10 +1,26 @@
+# Project EmbodiedGen
+#
+# Copyright (c) 2025 Horizon Robotics. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+# implied. See the License for the specific language governing
+# permissions and limitations under the License.
+
 from gradio.themes import Soft
 from gradio.themes.utils.colors import gray, neutral, slate, stone, teal, zinc
 
 lighting_css = """
 <style>
 #lighter_mesh canvas {
-    filter: brightness(2.0) !important;
+    filter: brightness(2.3) !important;
 }
 </style>
 """
common.py CHANGED
@@ -151,6 +151,21 @@ if os.getenv("GRADIO_APP") == "imageto3d":
        os.path.dirname(os.path.abspath(__file__)), "sessions/imageto3d"
    )
    os.makedirs(TMP_DIR, exist_ok=True)
+elif os.getenv("GRADIO_APP") == "imageto3d_sam3d":
+    from embodied_gen.models.sam3d import Sam3dInference
+
+    RBG_REMOVER = RembgRemover()
+    RBG14_REMOVER = BMGG14Remover()
+    SAM_PREDICTOR = SAMPredictor(model_type="vit_h", device="cpu")
+    PIPELINE = Sam3dInference()
+    SEG_CHECKER = ImageSegChecker(GPT_CLIENT)
+    GEO_CHECKER = MeshGeoChecker(GPT_CLIENT)
+    AESTHETIC_CHECKER = ImageAestheticChecker()
+    CHECKERS = [GEO_CHECKER, SEG_CHECKER, AESTHETIC_CHECKER]
+    TMP_DIR = os.path.join(
+        os.path.dirname(os.path.abspath(__file__)), "sessions/imageto3d"
+    )
+    os.makedirs(TMP_DIR, exist_ok=True)
 elif os.getenv("GRADIO_APP") == "textto3d":
    RBG_REMOVER = RembgRemover()
    RBG14_REMOVER = BMGG14Remover()
@@ -169,6 +184,23 @@ elif os.getenv("GRADIO_APP") == "textto3d":
        os.path.dirname(os.path.abspath(__file__)), "sessions/textto3d"
    )
    os.makedirs(TMP_DIR, exist_ok=True)
+elif os.getenv("GRADIO_APP") == "textto3d_sam3d":
+    from embodied_gen.models.sam3d import Sam3dInference
+
+    RBG_REMOVER = RembgRemover()
+    RBG14_REMOVER = BMGG14Remover()
+    PIPELINE = Sam3dInference()
+    text_model_dir = "weights/Kolors"
+    PIPELINE_IMG_IP = build_text2img_ip_pipeline(text_model_dir, ref_scale=0.3)
+    PIPELINE_IMG = build_text2img_pipeline(text_model_dir)
+    SEG_CHECKER = ImageSegChecker(GPT_CLIENT)
+    GEO_CHECKER = MeshGeoChecker(GPT_CLIENT)
+    AESTHETIC_CHECKER = ImageAestheticChecker()
+    CHECKERS = [GEO_CHECKER, SEG_CHECKER, AESTHETIC_CHECKER]
+    TMP_DIR = os.path.join(
+        os.path.dirname(os.path.abspath(__file__)), "sessions/textto3d"
+    )
+    os.makedirs(TMP_DIR, exist_ok=True)
 elif os.getenv("GRADIO_APP") == "texture_edit":
    DELIGHT = DelightingModel()
    IMAGESR_MODEL = ImageRealESRGAN(outscale=4)
@@ -201,18 +233,22 @@ def end_session(req: gr.Request) -> None:
 
 @spaces.GPU
 def preprocess_image_fn(
-    image: str | np.ndarray | Image.Image, rmbg_tag: str = "rembg"
+    image: str | np.ndarray | Image.Image,
+    rmbg_tag: str = "rembg",
+    preprocess: bool = True,
 ) -> tuple[Image.Image, Image.Image]:
    if isinstance(image, str):
        image = Image.open(image)
    elif isinstance(image, np.ndarray):
        image = Image.fromarray(image)
 
-    image_cache = resize_pil(image.copy(), 1024)
+    image_cache = image.copy()  # resize_pil(image.copy(), 1024)
 
    bg_remover = RBG_REMOVER if rmbg_tag == "rembg" else RBG14_REMOVER
    image = bg_remover(image)
-    image = trellis_preprocess(image)
+
+    if preprocess:
+        image = trellis_preprocess(image)
 
    return image, image_cache
 
@@ -349,11 +385,11 @@ def select_point(
 def image_to_3d(
    image: Image.Image,
    seed: int,
-    ss_guidance_strength: float,
    ss_sampling_steps: int,
-    slat_guidance_strength: float,
    slat_sampling_steps: int,
    raw_image_cache: Image.Image,
+    ss_guidance_strength: float,
+    slat_guidance_strength: float,
    sam_image: Image.Image = None,
    is_sam_image: bool = False,
    req: gr.Request = None,
@@ -392,8 +428,56 @@ def image_to_3d(
 
    gs_model = outputs["gaussian"][0]
    mesh_model = outputs["mesh"][0]
-    color_images = render_video(gs_model)["color"]
-    normal_images = render_video(mesh_model)["normal"]
+    color_images = render_video(gs_model, r=1.85)["color"]
+    normal_images = render_video(mesh_model, r=1.85)["normal"]
+
+    video_path = os.path.join(output_root, "gs_mesh.mp4")
+    merge_images_video(color_images, normal_images, video_path)
+    state = pack_state(gs_model, mesh_model)
+
+    gc.collect()
+    torch.cuda.empty_cache()
+
+    return state, video_path
+
+
+@spaces.GPU
+def image_to_3d_sam3d(
+    image: Image.Image,
+    seed: int,
+    ss_sampling_steps: int,
+    slat_sampling_steps: int,
+    raw_image_cache: Image.Image,
+    ss_guidance_strength: float = None,
+    slat_guidance_strength: float = None,
+    sam_image: Image.Image = None,
+    is_sam_image: bool = False,
+    req: gr.Request = None,
+) -> tuple[dict, str]:
+    if is_sam_image:
+        seg_image = filter_image_small_connected_components(sam_image)
+        seg_image = Image.fromarray(seg_image, mode="RGBA")
+    else:
+        seg_image = image
+
+    if isinstance(seg_image, np.ndarray):
+        seg_image = Image.fromarray(seg_image)
+
+    output_root = os.path.join(TMP_DIR, str(req.session_hash))
+    os.makedirs(output_root, exist_ok=True)
+    seg_image.save(f"{output_root}/seg_image.png")
+    raw_image_cache.save(f"{output_root}/raw_image.png")
+    outputs = PIPELINE.run(
+        seg_image,
+        seed=seed,
+        stage1_inference_steps=ss_sampling_steps,
+        stage2_inference_steps=slat_sampling_steps,
+    )
+
+    gs_model = outputs["gaussian"][0]
+    mesh_model = outputs["mesh"][0]
+    color_images = render_video(gs_model, r=1.85)["color"]
+    normal_images = render_video(mesh_model, r=1.85)["normal"]
 
    video_path = os.path.join(output_root, "gs_mesh.mp4")
    merge_images_video(color_images, normal_images, video_path)
@@ -688,6 +772,7 @@ def text2image_fn(
    image_wh: int | tuple[int, int] = [1024, 1024],
    rmbg_tag: str = "rembg",
    seed: int = None,
+    enable_pre_resize: bool = True,
    n_sample: int = 3,
    req: gr.Request = None,
 ):
@@ -715,7 +800,9 @@ def text2image_fn(
 
    for idx in range(len(images)):
        image = images[idx]
-        images[idx], _ = preprocess_image_fn(image, rmbg_tag)
+        images[idx], _ = preprocess_image_fn(
+            image, rmbg_tag, enable_pre_resize
+        )
 
    save_paths = []
    for idx, image in enumerate(images):
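
image_to_3d_sam3d mirrors image_to_3d's positional signature, with the guidance strengths (unused by the SAM-3D path) moved behind raw_image_cache, so app.py can bind either function to the same Gradio inputs list. A hedged sketch of the new preprocess flag on preprocess_image_fn (the image path is illustrative):

    from common import preprocess_image_fn

    # preprocess=True keeps the TRELLIS behaviour (trellis_preprocess after
    # background removal); the sam3d app passes False so the segmented image
    # reaches the pipeline unresized and uncropped.
    image, raw_cache = preprocess_image_fn(
        "assets/example_image/sample.png",  # illustrative path
        rmbg_tag="rembg",
        preprocess=False,
    )
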
embodied_gen/data/backproject_v3.py CHANGED
@@ -14,10 +14,10 @@
 # implied. See the License for the specific language governing
 # permissions and limitations under the License.
 
-import os
 import argparse
 import logging
 import math
+import os
 from typing import Literal, Union
 
 import cv2
@@ -353,8 +353,8 @@ def parse_args():
    parser.add_argument(
        "--distance",
        type=float,
-        default=5,
-        help="Camera distance (default: 5)",
+        default=4.5,
+        help="Camera distance (default: 4.5)",
    )
    parser.add_argument(
        "--resolution_hw",
@@ -400,8 +400,8 @@ def parse_args():
    parser.add_argument(
        "--mesh_sipmlify_ratio",
        type=float,
-        default=0.9,
-        help="Mesh simplification ratio (default: 0.9)",
+        default=0.85,
+        help="Mesh simplification ratio (default: 0.85)",
    )
    parser.add_argument(
        "--delight", action="store_true", help="Use delighting model."
@@ -500,7 +500,7 @@ def entrypoint(
    faces = mesh.faces.astype(np.int32)
    vertices = vertices.astype(np.float32)
 
-    if not args.skip_fix_mesh and len(faces) > 10 * args.n_max_faces:
+    if not args.skip_fix_mesh:
        mesh_fixer = MeshFixer(vertices, faces, args.device)
        vertices, faces = mesh_fixer(
            filter_ratio=args.mesh_sipmlify_ratio,
@@ -512,7 +512,7 @@ def entrypoint(
    if len(faces) > args.n_max_faces:
        mesh_fixer = MeshFixer(vertices, faces, args.device)
        vertices, faces = mesh_fixer(
-            filter_ratio=max(0.05, args.mesh_sipmlify_ratio - 0.2),
+            filter_ratio=max(0.1, args.mesh_sipmlify_ratio - 0.1),
            max_hole_size=0.04,
            resolution=1024,
            num_views=1000,
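
The second MeshFixer pass is now gentler: with the new default --mesh_sipmlify_ratio of 0.85, the fallback simplification keeps more faces than before. A quick check of the arithmetic:

    # Fallback filter_ratio, old defaults vs. new defaults (pure arithmetic):
    old_ratio = max(0.05, 0.9 - 0.2)   # 0.70
    new_ratio = max(0.1, 0.85 - 0.1)   # 0.75 -> less aggressive decimation
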
embodied_gen/data/utils.py CHANGED
@@ -15,10 +15,13 @@
 # permissions and limitations under the License.
 
 
+import logging
 import math
 import os
-import random
+import time
 import zipfile
+from contextlib import contextmanager
+from dataclasses import dataclass, field
 from shutil import rmtree
 from typing import List, Tuple, Union
 
@@ -28,20 +31,9 @@ import numpy as np
 import nvdiffrast.torch as dr
 import torch
 import torch.nn.functional as F
-from PIL import Image, ImageEnhance
-
-try:
-    from kolors.models.modeling_chatglm import ChatGLMModel
-    from kolors.models.tokenization_chatglm import ChatGLMTokenizer
-except ImportError:
-    ChatGLMTokenizer = None
-    ChatGLMModel = None
-import logging
-from dataclasses import dataclass, field
-
 import trimesh
 from kaolin.render.camera import Camera
-from torch import nn
+from PIL import Image, ImageEnhance
 
 logger = logging.getLogger(__name__)
 
@@ -50,10 +42,8 @@ __all__ = [
    "DiffrastRender",
    "save_images",
    "render_pbr",
-    "prelabel_text_feature",
    "calc_vertex_normals",
    "normalize_vertices_array",
-    "load_mesh_to_unit_cube",
    "as_list",
    "CameraSetting",
    "import_kaolin_mesh",
@@ -67,6 +57,7 @@ __all__ = [
    "trellis_preprocess",
    "delete_dir",
    "kaolin_to_opencv_view",
+    "model_device_ctx",
 ]
 
 
@@ -520,114 +511,6 @@ def render_pbr(
    return image, albedo, diffuse, normal
 
 
-def _move_to_target_device(data, device: str):
-    if isinstance(data, dict):
-        for key, value in data.items():
-            data[key] = _move_to_target_device(value, device)
-    elif isinstance(data, torch.Tensor):
-        return data.to(device)
-
-    return data
-
-
-def _encode_prompt(
-    prompt_batch,
-    text_encoders,
-    tokenizers,
-    proportion_empty_prompts=0,
-    is_train=True,
-):
-    prompt_embeds_list = []
-
-    captions = []
-    for caption in prompt_batch:
-        if random.random() < proportion_empty_prompts:
-            captions.append("")
-        elif isinstance(caption, str):
-            captions.append(caption)
-        elif isinstance(caption, (list, np.ndarray)):
-            captions.append(random.choice(caption) if is_train else caption[0])
-
-    with torch.no_grad():
-        for tokenizer, text_encoder in zip(tokenizers, text_encoders):
-            text_inputs = tokenizer(
-                captions,
-                padding="max_length",
-                max_length=256,
-                truncation=True,
-                return_tensors="pt",
-            ).to(text_encoder.device)
-
-            output = text_encoder(
-                input_ids=text_inputs.input_ids,
-                attention_mask=text_inputs.attention_mask,
-                position_ids=text_inputs.position_ids,
-                output_hidden_states=True,
-            )
-
-            # We are only interested in the pooled output of the text encoder.
-            prompt_embeds = output.hidden_states[-2].permute(1, 0, 2).clone()
-            pooled_prompt_embeds = output.hidden_states[-1][-1, :, :].clone()
-            bs_embed, seq_len, _ = prompt_embeds.shape
-            prompt_embeds = prompt_embeds.view(bs_embed, seq_len, -1)
-            prompt_embeds_list.append(prompt_embeds)
-
-    prompt_embeds = torch.concat(prompt_embeds_list, dim=-1)
-    pooled_prompt_embeds = pooled_prompt_embeds.view(bs_embed, -1)
-
-    return prompt_embeds, pooled_prompt_embeds
-
-
-def load_llm_models(pretrained_model_name_or_path: str, device: str):
-    tokenizer = ChatGLMTokenizer.from_pretrained(
-        pretrained_model_name_or_path,
-        subfolder="text_encoder",
-    )
-    text_encoder = ChatGLMModel.from_pretrained(
-        pretrained_model_name_or_path,
-        subfolder="text_encoder",
-    ).to(device)
-
-    text_encoders = [
-        text_encoder,
-    ]
-    tokenizers = [
-        tokenizer,
-    ]
-
-    logger.info(f"Load model from {pretrained_model_name_or_path} done.")
-
-    return tokenizers, text_encoders
-
-
-def prelabel_text_feature(
-    prompt_batch: List[str],
-    output_dir: str,
-    tokenizers: nn.Module,
-    text_encoders: nn.Module,
-) -> List[str]:
-    os.makedirs(output_dir, exist_ok=True)
-
-    # prompt_batch ["text..."]
-    prompt_embeds, pooled_prompt_embeds = _encode_prompt(
-        prompt_batch, text_encoders, tokenizers
-    )
-
-    prompt_embeds = _move_to_target_device(prompt_embeds, device="cpu")
-    pooled_prompt_embeds = _move_to_target_device(
-        pooled_prompt_embeds, device="cpu"
-    )
-
-    data_dict = dict(
-        prompt_embeds=prompt_embeds, pooled_prompt_embeds=pooled_prompt_embeds
-    )
-
-    save_path = os.path.join(output_dir, "text_feat.pth")
-    torch.save(data_dict, save_path)
-
-    return save_path
-
-
 def _calc_face_normals(
    vertices: torch.Tensor,  # V,3 first vertex may be unreferenced
    faces: torch.Tensor,  # F,3 long, first face may be all zero
@@ -683,25 +566,6 @@ def normalize_vertices_array(
    return vertices, scale, center
 
 
-def load_mesh_to_unit_cube(
-    mesh_file: str,
-    mesh_scale: float = 1.0,
-) -> tuple[trimesh.Trimesh, float, list[float]]:
-    if not os.path.exists(mesh_file):
-        raise FileNotFoundError(f"mesh_file path {mesh_file} not exists.")
-
-    mesh = trimesh.load(mesh_file)
-    if isinstance(mesh, trimesh.Scene):
-        mesh = trimesh.utils.concatenate(mesh)
-
-    vertices, scale, center = normalize_vertices_array(
-        mesh.vertices, mesh_scale
-    )
-    mesh.vertices = vertices
-
-    return mesh, scale, center
-
-
 def as_list(obj):
    if isinstance(obj, (list, tuple)):
        return obj
@@ -998,8 +862,9 @@ def gamma_shs(shs: torch.Tensor, gamma: float) -> torch.Tensor:
 
 
 def resize_pil(image: Image.Image, max_size: int = 1024) -> Image.Image:
-    max_size = max(image.size)
-    scale = min(1, 1024 / max_size)
+    current_max_dim = max(image.size)
+    scale = min(1, max_size / current_max_dim)
+
    if scale < 1:
        new_size = (int(image.width * scale), int(image.height * scale))
        image = image.resize(new_size, Image.Resampling.LANCZOS)
@@ -1068,3 +933,34 @@ def delete_dir(folder_path: str, keep_subs: list[str] = None) -> None:
            rmtree(item_path)
        else:
            os.remove(item_path)
+
+
+@contextmanager
+def model_device_ctx(
+    *models,
+    src_device: str = "cpu",
+    dst_device: str = "cuda",
+    verbose: bool = False,
+):
+    start = time.perf_counter()
+    for m in models:
+        if m is None:
+            continue
+        m.to(dst_device)
+    to_cuda_time = time.perf_counter() - start
+
+    try:
+        yield
+    finally:
+        start = time.perf_counter()
+        for m in models:
+            if m is None:
+                continue
+            m.to(src_device)
+        to_cpu_time = time.perf_counter() - start
+
+        if verbose:
+            model_names = [m.__class__.__name__ for m in models]
+            logger.debug(
+                f"[model_device_ctx] {model_names} to cuda: {to_cuda_time:.1f}s, to cpu: {to_cpu_time:.1f}s"
+            )
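
Usage sketch for the new model_device_ctx helper: the listed modules are moved to dst_device for the duration of the block and restored to src_device afterwards, even if the body raises. The stand-in modules below are illustrative:

    import torch

    from embodied_gen.data.utils import model_device_ctx

    encoder = torch.nn.Linear(8, 8)  # stand-ins for the pipeline's
    decoder = torch.nn.Linear(8, 8)  # generator/decoder submodules

    with model_device_ctx(encoder, decoder, src_device="cpu", dst_device="cuda"):
        x = torch.randn(1, 8, device="cuda")
        y = decoder(encoder(x))  # both modules live on the GPU here
    # ...and are back on the CPU here, releasing VRAM between requests.
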
embodied_gen/models/sam3d.py ADDED
@@ -0,0 +1,145 @@
+# Project EmbodiedGen
+#
+# Copyright (c) 2025 Horizon Robotics. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+# implied. See the License for the specific language governing
+# permissions and limitations under the License.
+
+from embodied_gen.utils.monkey_patches import monkey_patch_sam3d
+
+monkey_patch_sam3d()
+import os
+import sys
+from typing import Optional, Union
+
+import numpy as np
+from hydra.utils import instantiate
+from modelscope import snapshot_download
+from omegaconf import OmegaConf
+from PIL import Image
+
+current_file_path = os.path.abspath(__file__)
+current_dir = os.path.dirname(current_file_path)
+sys.path.append(os.path.join(current_dir, "../.."))
+from thirdparty.sam3d.sam3d_objects.pipeline.inference_pipeline_pointmap import (
+    InferencePipelinePointMap,
+)
+
+__all__ = ["Sam3dInference"]
+
+
+def load_image(path: str) -> np.ndarray:
+    image = Image.open(path)
+    image = np.array(image)
+    image = image.astype(np.uint8)
+    return image
+
+
+def load_mask(path: str) -> np.ndarray:
+    mask = load_image(path)
+    mask = mask > 0
+    if mask.ndim == 3:
+        mask = mask[..., -1]
+    return mask
+
+
+class Sam3dInference:
+    def __init__(
+        self, local_dir: str = "weights/sam-3d-objects", compile: bool = False
+    ) -> None:
+        if not os.path.exists(local_dir):
+            snapshot_download("facebook/sam-3d-objects", local_dir=local_dir)
+        config_file = os.path.join(local_dir, "checkpoints/pipeline.yaml")
+        config = OmegaConf.load(config_file)
+        config.rendering_engine = "nvdiffrast"
+        config.compile_model = compile
+        config.workspace_dir = os.path.dirname(config_file)
+        # Generate 4 gs in each pixel.
+        config["slat_decoder_gs_config_path"] = config.pop(
+            "slat_decoder_gs_4_config_path", "slat_decoder_gs_4.yaml"
+        )
+        config["slat_decoder_gs_ckpt_path"] = config.pop(
+            "slat_decoder_gs_4_ckpt_path", "slat_decoder_gs_4.ckpt"
+        )
+        self.pipeline: InferencePipelinePointMap = instantiate(config)
+
+    def merge_mask_to_rgba(
+        self, image: np.ndarray, mask: np.ndarray
+    ) -> np.ndarray:
+        mask = mask.astype(np.uint8) * 255
+        mask = mask[..., None]
+        rgba_image = np.concatenate([image[..., :3], mask], axis=-1)
+
+        return rgba_image
+
+    def run(
+        self,
+        image: np.ndarray | Image.Image,
+        mask: np.ndarray = None,
+        seed: int = None,
+        pointmap: np.ndarray = None,
+        use_stage1_distillation: bool = False,
+        use_stage2_distillation: bool = False,
+        stage1_inference_steps: int = 25,
+        stage2_inference_steps: int = 25,
+    ) -> dict:
+        if isinstance(image, Image.Image):
+            image = np.array(image)
+        return self.pipeline.run(
+            image,
+            mask,
+            seed,
+            stage1_only=False,
+            with_mesh_postprocess=False,
+            with_texture_baking=False,
+            with_layout_postprocess=False,
+            use_vertex_color=True,
+            use_stage1_distillation=use_stage1_distillation,
+            use_stage2_distillation=use_stage2_distillation,
+            stage1_inference_steps=stage1_inference_steps,
+            stage2_inference_steps=stage2_inference_steps,
+            pointmap=pointmap,
+        )
+
+
+if __name__ == "__main__":
+    pipeline = Sam3dInference()
+
+    # load image
+    image = load_image(
+        "/home/users/xinjie.wang/xinjie/sam-3d-objects/notebook/images/shutterstock_stylish_kidsroom_1640806567/image.png"
+    )
+    mask = load_mask(
+        "/home/users/xinjie.wang/xinjie/sam-3d-objects/notebook/images/shutterstock_stylish_kidsroom_1640806567/13.png"
+    )
+
+    import torch
+
+    if torch.cuda.is_available():
+        torch.cuda.reset_peak_memory_stats()
+        torch.cuda.empty_cache()
+
+    from time import time
+
+    start = time()
+
+    output = pipeline(image, mask, seed=42)
+    print(f"Running cost: {round(time()-start, 1)}")
+
+    if torch.cuda.is_available():
+        max_memory = torch.cuda.max_memory_allocated() / (1024**3)
+        print(f"(Max VRAM): {max_memory:.2f} GB")
+
+    print(f"End: {torch.cuda.memory_allocated() / 1024**2:.2f} MB")
+
+    output["gs"].save_ply(f"outputs/splat.ply")
+    print("Your reconstruction has been saved to outputs/splat.ply")
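
Note that the __main__ block above invokes the pipeline object directly (pipeline(image, mask, seed=42)), which assumes InferencePipelinePointMap provides a __call__; the entry point this commit actually wires up is Sam3dInference.run(), as used by image_to_3d_sam3d in common.py. A hedged usage sketch with illustrative paths:

    from embodied_gen.models.sam3d import Sam3dInference, load_image, load_mask

    pipeline = Sam3dInference()
    image = load_image("example/image.png")  # RGB uint8 array, illustrative path
    mask = load_mask("example/mask.png")     # boolean object mask, illustrative path

    outputs = pipeline.run(
        image,
        mask,
        seed=42,
        stage1_inference_steps=25,
        stage2_inference_steps=25,
    )
    gs_model = outputs["gaussian"][0]  # as consumed by image_to_3d_sam3d
    mesh_model = outputs["mesh"][0]
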
embodied_gen/utils/monkey_patches.py CHANGED
@@ -25,6 +25,12 @@ from omegaconf import OmegaConf
 from PIL import Image
 from torchvision import transforms
 
+__all__ = [
+    "monkey_patch_pano2room",
+    "monkey_patch_maniskill",
+    "monkey_patch_sam3d",
+]
+
 
 def monkey_patch_pano2room():
    current_file_path = os.path.abspath(__file__)
@@ -216,3 +222,378 @@ def monkey_patch_maniskill():
    ManiSkillScene.get_human_render_camera_images = (
        get_human_render_camera_images
    )
+
+
+def monkey_patch_sam3d():
+    from typing import Optional, Union
+
+    from embodied_gen.data.utils import model_device_ctx
+    from embodied_gen.utils.log import logger
+
+    os.environ["LIDRA_SKIP_INIT"] = "true"
+
+    current_file_path = os.path.abspath(__file__)
+    current_dir = os.path.dirname(current_file_path)
+    sam3d_root = os.path.abspath(
+        os.path.join(current_dir, "../../thirdparty/sam3d")
+    )
+    if sam3d_root not in sys.path:
+        sys.path.insert(0, sam3d_root)
+
+    print(f"[MonkeyPatch] Added to sys.path: {sam3d_root}")
+
+    def patch_pointmap_infer_pipeline():
+        from copy import deepcopy
+
+        try:
+            from sam3d_objects.pipeline.inference_pipeline_pointmap import (
+                InferencePipelinePointMap,
+            )
+        except ImportError:
+            logger.error(
+                "[MonkeyPatch]: Could not import sam3d_objects directly. Check paths."
+            )
+            return
+
+        def patch_run(
+            self,
+            image: Union[None, Image.Image, np.ndarray],
+            mask: Union[None, Image.Image, np.ndarray] = None,
+            seed: Optional[int] = None,
+            stage1_only=False,
+            with_mesh_postprocess=True,
+            with_texture_baking=True,
+            with_layout_postprocess=True,
+            use_vertex_color=False,
+            stage1_inference_steps=None,
+            stage2_inference_steps=None,
+            use_stage1_distillation=False,
+            use_stage2_distillation=False,
+            pointmap=None,
+            decode_formats=None,
+            estimate_plane=False,
+        ) -> dict:
+            image = self.merge_image_and_mask(image, mask)
+            with self.device:
+                pointmap_dict = self.compute_pointmap(image, pointmap)
+                pointmap = pointmap_dict["pointmap"]
+                pts = type(self)._down_sample_img(pointmap)
+                pts_colors = type(self)._down_sample_img(
+                    pointmap_dict["pts_color"]
+                )
+
+                if estimate_plane:
+                    return self.estimate_plane(pointmap_dict, image)
+
+                ss_input_dict = self.preprocess_image(
+                    image, self.ss_preprocessor, pointmap=pointmap
+                )
+
+                slat_input_dict = self.preprocess_image(
+                    image, self.slat_preprocessor
+                )
+                if seed is not None:
+                    torch.manual_seed(seed)
+
+                with model_device_ctx(
+                    self.models["ss_generator"],
+                    self.models["ss_decoder"],
+                    self.condition_embedders["ss_condition_embedder"],
+                ):
+                    ss_return_dict = self.sample_sparse_structure(
+                        ss_input_dict,
+                        inference_steps=stage1_inference_steps,
+                        use_distillation=use_stage1_distillation,
+                    )
+
+                # We could probably use the decoder from the models themselves
+                pointmap_scale = ss_input_dict.get("pointmap_scale", None)
+                pointmap_shift = ss_input_dict.get("pointmap_shift", None)
+                ss_return_dict.update(
+                    self.pose_decoder(
+                        ss_return_dict,
+                        scene_scale=pointmap_scale,
+                        scene_shift=pointmap_shift,
+                    )
+                )
+
+                logger.info(
+                    f"Rescaling scale by {ss_return_dict['downsample_factor']} after downsampling"
+                )
+                ss_return_dict["scale"] = (
+                    ss_return_dict["scale"]
+                    * ss_return_dict["downsample_factor"]
+                )
+
+                if stage1_only:
+                    logger.info("Finished!")
+                    ss_return_dict["voxel"] = (
+                        ss_return_dict["coords"][:, 1:] / 64 - 0.5
+                    )
+                    return {
+                        **ss_return_dict,
+                        "pointmap": pts.cpu().permute((1, 2, 0)),  # HxWx3
+                        "pointmap_colors": pts_colors.cpu().permute(
+                            (1, 2, 0)
+                        ),  # HxWx3
+                    }
+                    # return ss_return_dict
+
+                coords = ss_return_dict["coords"]
+                with model_device_ctx(
+                    self.models["slat_generator"],
+                    self.condition_embedders["slat_condition_embedder"],
+                ):
+                    slat = self.sample_slat(
+                        slat_input_dict,
+                        coords,
+                        inference_steps=stage2_inference_steps,
+                        use_distillation=use_stage2_distillation,
+                    )
+
+                with model_device_ctx(
+                    self.models["slat_decoder_mesh"],
+                    self.models["slat_decoder_gs"],
+                    self.models["slat_decoder_gs_4"],
+                ):
+                    outputs = self.decode_slat(
+                        slat,
+                        (
+                            self.decode_formats
+                            if decode_formats is None
+                            else decode_formats
+                        ),
+                    )
+
+                outputs = self.postprocess_slat_output(
+                    outputs,
+                    with_mesh_postprocess,
+                    with_texture_baking,
+                    use_vertex_color,
+                )
+                glb = outputs.get("glb", None)
+
+                try:
+                    if (
+                        with_layout_postprocess
+                        and self.layout_post_optimization_method is not None
+                    ):
+                        assert (
+                            glb is not None
+                        ), "require mesh to run postprocessing"
+                        logger.info(
+                            "Running layout post optimization method..."
+                        )
+                        postprocessed_pose = self.run_post_optimization(
+                            deepcopy(glb),
+                            pointmap_dict["intrinsics"],
+                            ss_return_dict,
+                            ss_input_dict,
+                        )
+                        ss_return_dict.update(postprocessed_pose)
+                except Exception as e:
+                    logger.error(
+                        f"Error during layout post optimization: {e}",
+                        exc_info=True,
+                    )
+
+                # glb.export("sample.glb")
+                logger.info("Finished!")
+
+                return {
+                    **ss_return_dict,
+                    **outputs,
+                    "pointmap": pts.cpu().permute((1, 2, 0)),  # HxWx3
+                    "pointmap_colors": pts_colors.cpu().permute(
+                        (1, 2, 0)
+                    ),  # HxWx3
+                }
+
+        InferencePipelinePointMap.run = patch_run
+
+    def patch_infer_init():
+        import torch
+
+        try:
+            from sam3d_objects.pipeline import preprocess_utils
+            from sam3d_objects.pipeline.inference_pipeline_pointmap import (
+                InferencePipeline,
+            )
+            from sam3d_objects.pipeline.inference_utils import (
+                SLAT_MEAN,
+                SLAT_STD,
+            )
+        except ImportError:
+            print(
+                "[MonkeyPatch] Error: Could not import sam3d_objects directly for infer pipeline."
+            )
+            return
+
+        def patch_init(
+            self,
+            ss_generator_config_path,
+            ss_generator_ckpt_path,
+            slat_generator_config_path,
+            slat_generator_ckpt_path,
+            ss_decoder_config_path,
+            ss_decoder_ckpt_path,
+            slat_decoder_gs_config_path,
+            slat_decoder_gs_ckpt_path,
+            slat_decoder_mesh_config_path,
+            slat_decoder_mesh_ckpt_path,
+            slat_decoder_gs_4_config_path=None,
+            slat_decoder_gs_4_ckpt_path=None,
+            ss_encoder_config_path=None,
+            ss_encoder_ckpt_path=None,
+            decode_formats=["gaussian", "mesh"],
+            dtype="bfloat16",
+            pad_size=1.0,
+            version="v0",
+            device="cuda",
+            ss_preprocessor=preprocess_utils.get_default_preprocessor(),
+            slat_preprocessor=preprocess_utils.get_default_preprocessor(),
+            ss_condition_input_mapping=["image"],
+            slat_condition_input_mapping=["image"],
+            pose_decoder_name="default",
+            workspace_dir="",
+            downsample_ss_dist=0,  # the distance we use to downsample
+            ss_inference_steps=25,
+            ss_rescale_t=3,
+            ss_cfg_strength=7,
+            ss_cfg_interval=[0, 500],
+            ss_cfg_strength_pm=0.0,
+            slat_inference_steps=25,
+            slat_rescale_t=3,
+            slat_cfg_strength=5,
+            slat_cfg_interval=[0, 500],
+            rendering_engine: str = "nvdiffrast",  # nvdiffrast OR pytorch3d,
+            shape_model_dtype=None,
+            compile_model=False,
+            slat_mean=SLAT_MEAN,
+            slat_std=SLAT_STD,
+        ):
+            self.rendering_engine = rendering_engine
+            self.device = torch.device(device)
+            self.compile_model = compile_model
+            logger.info(f"self.device: {self.device}")
+            logger.info(
+                f"CUDA_VISIBLE_DEVICES: {os.environ.get('CUDA_VISIBLE_DEVICES', None)}"
+            )
+            logger.info(f"Actually using GPU: {torch.cuda.current_device()}")
+            with self.device:
+                self.decode_formats = decode_formats
+                self.pad_size = pad_size
+                self.version = version
+                self.ss_condition_input_mapping = ss_condition_input_mapping
+                self.slat_condition_input_mapping = (
+                    slat_condition_input_mapping
+                )
+                self.workspace_dir = workspace_dir
+                self.downsample_ss_dist = downsample_ss_dist
+                self.ss_inference_steps = ss_inference_steps
+                self.ss_rescale_t = ss_rescale_t
+                self.ss_cfg_strength = ss_cfg_strength
+                self.ss_cfg_interval = ss_cfg_interval
+                self.ss_cfg_strength_pm = ss_cfg_strength_pm
+                self.slat_inference_steps = slat_inference_steps
+                self.slat_rescale_t = slat_rescale_t
+                self.slat_cfg_strength = slat_cfg_strength
+                self.slat_cfg_interval = slat_cfg_interval
+
+                self.dtype = self._get_dtype(dtype)
+                if shape_model_dtype is None:
+                    self.shape_model_dtype = self.dtype
+                else:
+                    self.shape_model_dtype = self._get_dtype(shape_model_dtype)
+
+                # Setup preprocessors
+                self.pose_decoder = self.init_pose_decoder(
+                    ss_generator_config_path, pose_decoder_name
+                )
+                self.ss_preprocessor = self.init_ss_preprocessor(
+                    ss_preprocessor, ss_generator_config_path
+                )
+                self.slat_preprocessor = slat_preprocessor
+
+                logger.info("Loading model weights...")
+                raw_device = self.device
+                self.device = torch.device("cpu")
+                ss_generator = self.init_ss_generator(
+                    ss_generator_config_path, ss_generator_ckpt_path
+                )
+                slat_generator = self.init_slat_generator(
+                    slat_generator_config_path, slat_generator_ckpt_path
+                )
+                ss_decoder = self.init_ss_decoder(
+                    ss_decoder_config_path, ss_decoder_ckpt_path
+                )
+                ss_encoder = self.init_ss_encoder(
+                    ss_encoder_config_path, ss_encoder_ckpt_path
+                )
+                slat_decoder_gs = self.init_slat_decoder_gs(
+                    slat_decoder_gs_config_path, slat_decoder_gs_ckpt_path
+                )
+                slat_decoder_gs_4 = self.init_slat_decoder_gs(
+                    slat_decoder_gs_4_config_path, slat_decoder_gs_4_ckpt_path
+                )
+                slat_decoder_mesh = self.init_slat_decoder_mesh(
+                    slat_decoder_mesh_config_path, slat_decoder_mesh_ckpt_path
+                )
+
+                # Load conditioner embedder so that we only load it once
+                ss_condition_embedder = self.init_ss_condition_embedder(
+                    ss_generator_config_path, ss_generator_ckpt_path
+                )
+                slat_condition_embedder = self.init_slat_condition_embedder(
+                    slat_generator_config_path, slat_generator_ckpt_path
+                )
+                self.device = raw_device
+
+                self.condition_embedders = {
+                    "ss_condition_embedder": ss_condition_embedder,
+                    "slat_condition_embedder": slat_condition_embedder,
+                }
+
+                # override generator and condition embedder setting
+                self.override_ss_generator_cfg_config(
+                    ss_generator,
+                    cfg_strength=ss_cfg_strength,
+                    inference_steps=ss_inference_steps,
+                    rescale_t=ss_rescale_t,
+                    cfg_interval=ss_cfg_interval,
+                    cfg_strength_pm=ss_cfg_strength_pm,
+                )
+                self.override_slat_generator_cfg_config(
+                    slat_generator,
+                    cfg_strength=slat_cfg_strength,
+                    inference_steps=slat_inference_steps,
+                    rescale_t=slat_rescale_t,
+                    cfg_interval=slat_cfg_interval,
+                )
+
+                self.models = torch.nn.ModuleDict(
+                    {
+                        "ss_generator": ss_generator,
+                        "slat_generator": slat_generator,
+                        "ss_encoder": ss_encoder,
+                        "ss_decoder": ss_decoder,
+                        "slat_decoder_gs": slat_decoder_gs,
+                        "slat_decoder_gs_4": slat_decoder_gs_4,
+                        "slat_decoder_mesh": slat_decoder_mesh,
+                    }
+                )
+                logger.info("Loading model weights completed!")
+
+                if self.compile_model:
+                    logger.info("Compiling model...")
+                    self._compile()
+                    logger.info("Model compilation completed!")
+            self.slat_mean = torch.tensor(slat_mean)
+            self.slat_std = torch.tensor(slat_std)
+
+        InferencePipeline.__init__ = patch_init
+
+    patch_pointmap_infer_pipeline()
+    patch_infer_init()
+
+    return
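
monkey_patch_sam3d swaps InferencePipelinePointMap.run and InferencePipeline.__init__ at import time, so the upstream sam3d code loads weights on the CPU and only moves submodules to the GPU inside model_device_ctx blocks. The underlying pattern, reduced to a self-contained sketch:

    # Minimal monkey-patching sketch: define a replacement with the same
    # signature and assign it onto the class before any instance is created.
    class Greeter:
        def greet(self) -> str:
            return "hello"

    def patched_greet(self) -> str:
        return "hello, patched"

    Greeter.greet = patched_greet  # every later call goes through the patch
    assert Greeter().greet() == "hello, patched"
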
embodied_gen/utils/trender.py CHANGED
@@ -16,6 +16,7 @@
 
 import os
 import sys
+from collections import defaultdict
 
 import numpy as np
 import spaces
@@ -25,10 +26,8 @@ from tqdm import tqdm
 current_file_path = os.path.abspath(__file__)
 current_dir = os.path.dirname(current_file_path)
 sys.path.append(os.path.join(current_dir, "../.."))
-from thirdparty.TRELLIS.trellis.renderers.mesh_renderer import MeshRenderer
-from thirdparty.TRELLIS.trellis.representations import MeshExtractResult
+from thirdparty.TRELLIS.trellis.renderers import GaussianRenderer, MeshRenderer
 from thirdparty.TRELLIS.trellis.utils.render_utils import (
-    render_frames,
    yaw_pitch_r_fov_to_extrinsics_intrinsics,
 )
 
@@ -38,7 +37,7 @@ __all__ = [
 
 
 @spaces.GPU
-def render_mesh(sample, extrinsics, intrinsics, options={}, **kwargs):
+def render_mesh_frames(sample, extrinsics, intrinsics, options={}, **kwargs):
    renderer = MeshRenderer()
    renderer.rendering_options.resolution = options.get("resolution", 512)
    renderer.rendering_options.near = options.get("near", 1)
@@ -60,6 +59,57 @@ def render_mesh(sample, extrinsics, intrinsics, options={}, **kwargs):
    return rets
 
 
+@spaces.GPU
+def render_gs_frames(
+    sample,
+    extrinsics,
+    intrinsics,
+    options=None,
+    colors_overwrite=None,
+    verbose=True,
+    **kwargs,
+):
+    def to_img(tensor):
+        return np.clip(
+            tensor.detach().cpu().numpy().transpose(1, 2, 0) * 255, 0, 255
+        ).astype(np.uint8)
+
+    def to_numpy(tensor):
+        return tensor.detach().cpu().numpy()
+
+    renderer = GaussianRenderer()
+    renderer.pipe.kernel_size = kwargs.get("kernel_size", 0.1)
+    renderer.pipe.use_mip_gaussian = True
+
+    defaults = {
+        "resolution": 512,
+        "near": 0.8,
+        "far": 1.6,
+        "bg_color": (0, 0, 0),
+        "ssaa": 1,
+    }
+    final_options = {**defaults, **(options or {})}
+
+    for k, v in final_options.items():
+        if hasattr(renderer.rendering_options, k):
+            setattr(renderer.rendering_options, k, v)
+
+    outputs = defaultdict(list)
+    iterator = zip(extrinsics, intrinsics)
+    if verbose:
+        iterator = tqdm(iterator, total=len(extrinsics), desc="Rendering")
+
+    for extr, intr in iterator:
+        res = renderer.render(
+            sample, extr, intr, colors_overwrite=colors_overwrite
+        )
+        outputs["color"].append(to_img(res["color"]))
+        depth = res.get("percent_depth") or res.get("depth")
+        outputs["depth"].append(to_numpy(depth) if depth is not None else None)
+
+    return dict(outputs)
+
+
 @spaces.GPU
 def render_video(
    sample,
@@ -77,7 +127,9 @@ def render_video(
        yaws, pitch, r, fov
    )
    render_fn = (
-        render_mesh if isinstance(sample, MeshExtractResult) else render_frames
+        render_mesh_frames
+        if sample.__class__.__name__ == "MeshExtractResult"
+        else render_gs_frames
    )
    result = render_fn(
        sample,
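
render_video now dispatches on the class name instead of an isinstance check, which avoids importing MeshExtractResult at module load (at the cost of matching any class with that name). The dispatch, isolated as a sketch that assumes the two renderers defined above:

    def pick_renderer(sample):
        # Name-based dispatch: no import of MeshExtractResult required.
        if sample.__class__.__name__ == "MeshExtractResult":
            return render_mesh_frames  # rasterize the extracted mesh
        return render_gs_frames        # splat the Gaussian representation
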
requirements.txt CHANGED
@@ -52,4 +52,11 @@ pyquaternion
 shapely
 sapien==3.0.0b1
 typing_extensions==4.14.1
-coacd
+ninja
+packaging
+lightning
+astor
+optree
+loguru
+seaborn
+hydra-core
thirdparty/TRELLIS/trellis/utils/postprocessing_utils.py CHANGED
@@ -440,7 +440,7 @@ def to_glb(
    vertices, faces, uvs = parametrize_mesh(vertices, faces)
 
    # bake texture
-    observations, extrinsics, intrinsics = render_multiview(app_rep, resolution=1024, nviews=200)
+    observations, extrinsics, intrinsics = render_multiview(app_rep, resolution=1024, nviews=100)
    masks = [np.any(observation > 0, axis=-1) for observation in observations]
    extrinsics = [extrinsics[i].cpu().numpy() for i in range(len(extrinsics))]
    intrinsics = [intrinsics[i].cpu().numpy() for i in range(len(intrinsics))]