ReconViaGen

Paused

App Files Files Community

notenoughram committed on Dec 20, 2025

Commit

752eebf

verified ·

1 Parent(s): 2618acd

Update app.py

Browse files

Files changed (1) hide show

app.py +200 -114

app.py CHANGED Viewed

@@ -1,33 +1,20 @@
 import os
-import sys
-import subprocess
-import gc
 import shutil
-from typing import *
-# [AUTO-INSTALL] accelerate 라이브러리
-try:
-    import accelerate
-except ImportError:
-    subprocess.check_call([sys.executable, "-m", "pip", "install", "accelerate"])
-# [중요] OOM 방지를 위한 메모리 파편화 설정
-os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
 os.environ['SPCONV_ALGO'] = 'native'
 import torch
-import torch.nn as nn
 import numpy as np
 import imageio
 from easydict import EasyDict as edict
 from PIL import Image
-import gradio as gr
-from gradio_litmodel3d import LitModel3D
 from trellis.pipelines import TrellisVGGTTo3DPipeline
 from trellis.representations import Gaussian, MeshExtractResult
 from trellis.utils import render_utils, postprocessing_utils
-from accelerate import dispatch_model, infer_auto_device_map
 MAX_SEED = np.iinfo(np.int32).max
 TMP_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'tmp')
@@ -37,18 +24,42 @@ def start_session(req: gr.Request):
     user_dir = os.path.join(TMP_DIR, str(req.session_hash))
     os.makedirs(user_dir, exist_ok=True)
 def end_session(req: gr.Request):
     user_dir = os.path.join(TMP_DIR, str(req.session_hash))
-    if os.path.exists(user_dir):
-        shutil.rmtree(user_dir)
-    gc.collect()
-    torch.cuda.empty_cache()
 def preprocess_image(image: Image.Image) -> Image.Image:
     processed_image = pipeline.preprocess_image(image)
     return processed_image
 def preprocess_videos(video: str) -> List[Tuple[Image.Image, str]]:
     vid = imageio.get_reader(video, 'ffmpeg')
     fps = vid.get_meta_data()['fps']
     images = []
@@ -63,10 +74,23 @@ def preprocess_videos(video: str) -> List[Tuple[Image.Image, str]]:
     return processed_images
 def preprocess_images(images: List[Tuple[Image.Image, str]]) -> List[Image.Image]:
     images = [image[0] for image in images]
     processed_images = [pipeline.preprocess_image(image) for image in images]
     return processed_images
 def pack_state(gs: Gaussian, mesh: MeshExtractResult) -> dict:
     return {
         'gaussian': {
@@ -82,9 +106,9 @@ def pack_state(gs: Gaussian, mesh: MeshExtractResult) -> dict:
             'faces': mesh.faces.cpu().numpy(),
         },
     }
 def unpack_state(state: dict) -> Tuple[Gaussian, edict, str]:
-    device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
     gs = Gaussian(
         aabb=state['gaussian']['aabb'],
         sh_degree=state['gaussian']['sh_degree'],
@@ -93,21 +117,37 @@ def unpack_state(state: dict) -> Tuple[Gaussian, edict, str]:
         opacity_bias=state['gaussian']['opacity_bias'],
         scaling_activation=state['gaussian']['scaling_activation'],
     )
-    gs._xyz = torch.tensor(state['gaussian']['_xyz'], device=device)
-    gs._features_dc = torch.tensor(state['gaussian']['_features_dc'], device=device)
-    gs._scaling = torch.tensor(state['gaussian']['_scaling'], device=device)
-    gs._rotation = torch.tensor(state['gaussian']['_rotation'], device=device)
-    gs._opacity = torch.tensor(state['gaussian']['_opacity'], device=device)
     mesh = edict(
-        vertices=torch.tensor(state['mesh']['vertices'], device=device),
-        faces=torch.tensor(state['mesh']['faces'], device=device),
     )
     return gs, mesh
 def get_seed(randomize_seed: bool, seed: int) -> int:
     return np.random.randint(0, MAX_SEED) if randomize_seed else seed
 def generate_and_extract_glb(
     multiimages: List[Tuple[Image.Image, str]],
     seed: int,
@@ -120,35 +160,54 @@ def generate_and_extract_glb(
     texture_size: int,
     req: gr.Request,
 ) -> Tuple[dict, str, str, str]:
-    gc.collect()
-    torch.cuda.empty_cache()
     user_dir = os.path.join(TMP_DIR, str(req.session_hash))
     image_files = [image[0] for image in multiimages]
-    try:
-        # [중요] 추론 시 그래디언트 계산 끔 (메모리 절약)
-        with torch.no_grad():
-            outputs, _, _ = pipeline.run(
-                image=image_files,
-                seed=seed,
-                formats=["gaussian", "mesh"],
-                preprocess_image=False,
-                sparse_structure_sampler_params={
-                    "steps": ss_sampling_steps,
-                    "cfg_strength": ss_guidance_strength,
-                },
-                slat_sampler_params={
-                    "steps": slat_sampling_steps,
-                    "cfg_strength": slat_guidance_strength,
-                },
-                mode=multiimage_algo,
-            )
-    except Exception as e:
-        torch.cuda.empty_cache()
-        # 구체적인 에러 메시지 반환
-        raise RuntimeError(f"Generation Failed: {str(e)}")
     video = render_utils.render_video(outputs['gaussian'][0], num_frames=120)['color']
     video_geo = render_utils.render_video(outputs['mesh'][0], num_frames=120)['normal']
@@ -156,32 +215,44 @@ def generate_and_extract_glb(
     video_path = os.path.join(user_dir, 'sample.mp4')
     imageio.mimsave(video_path, video, fps=15)
     gs = outputs['gaussian'][0]
     mesh = outputs['mesh'][0]
     glb = postprocessing_utils.to_glb(gs, mesh, simplify=mesh_simplify, texture_size=texture_size, verbose=False)
     glb_path = os.path.join(user_dir, 'sample.glb')
     glb.export(glb_path)
     state = pack_state(gs, mesh)
-    del outputs, gs, mesh, glb
-    gc.collect()
     torch.cuda.empty_cache()
     return state, video_path, glb_path, glb_path
 def extract_gaussian(state: dict, req: gr.Request) -> Tuple[str, str]:
     user_dir = os.path.join(TMP_DIR, str(req.session_hash))
     gs, _ = unpack_state(state)
     gaussian_path = os.path.join(user_dir, 'sample.ply')
     gs.save_ply(gaussian_path)
-    del gs
     torch.cuda.empty_cache()
     return gaussian_path, gaussian_path
 def prepare_multi_example() -> List[Image.Image]:
-    if not os.path.exists("assets/example_multi_image"):
-        return []
     multi_case = list(set([i.split('_')[0] for i in os.listdir("assets/example_multi_image")]))
     images = []
     for case in multi_case:
@@ -196,7 +267,21 @@ def prepare_multi_example() -> List[Image.Image]:
             images.append(Image.fromarray(np.concatenate(_images, axis=1)))
     return images
 def split_image(image: Image.Image) -> List[Image.Image]:
     image = np.array(image)
     alpha = image[..., 3]
     alpha = np.any(alpha>0, axis=0)
@@ -219,7 +304,22 @@ demo = gr.Blocks(
     """
 )
 with demo:
-    gr.Markdown("# 💻 ReconViaGen (GPU 0 Freed)")
     with gr.Row():
         with gr.Column():
@@ -228,6 +328,9 @@ with demo:
                     input_video = gr.Video(label="Upload Video", interactive=True, height=300)
                     image_prompt = gr.Image(label="Image Prompt", format="png", visible=False, image_mode="RGBA", type="pil", height=300)
                     multiimage_prompt = gr.Gallery(label="Image Prompt", format="png", type="pil", height=300, columns=3)
             with gr.Accordion(label="Generation Settings", open=False):
                 seed = gr.Slider(0, MAX_SEED, label="Seed", value=0, step=1)
@@ -248,6 +351,9 @@ with demo:
             generate_btn = gr.Button("Generate & Extract GLB", variant="primary")
             extract_gs_btn = gr.Button("Extract Gaussian", interactive=False)
         with gr.Column():
             video_output = gr.Video(label="Generated 3D Asset", autoplay=True, loop=True, height=300)
@@ -259,6 +365,7 @@ with demo:
     output_buf = gr.State()
     with gr.Row() as multiimage_example:
         examples_multi = gr.Examples(
             examples=prepare_multi_example(),
@@ -273,12 +380,25 @@ with demo:
     demo.load(start_session)
     demo.unload(end_session)
-    input_video.upload(preprocess_videos, inputs=[input_video], outputs=[multiimage_prompt])
-    input_video.clear(lambda: tuple([None, None]), outputs=[input_video, multiimage_prompt])
-    multiimage_prompt.upload(preprocess_images, inputs=[multiimage_prompt], outputs=[multiimage_prompt])
     generate_btn.click(
-        get_seed, inputs=[randomize_seed, seed], outputs=[seed]
     ).then(
         generate_and_extract_glb,
         inputs=[multiimage_prompt, seed, ss_guidance_strength, ss_sampling_steps, slat_guidance_strength, slat_sampling_steps, multiimage_algo, mesh_simplify, texture_size],
@@ -293,59 +413,25 @@ with demo:
         outputs=[extract_gs_btn, download_glb, download_gs],
     )
-    extract_gs_btn.click(extract_gaussian, inputs=[output_buf], outputs=[model_output, download_gs]).then(
-        lambda: gr.Button(interactive=True), outputs=[download_gs]
     )
     model_output.clear(
         lambda: tuple([gr.Button(interactive=False), gr.Button(interactive=False)]),
         outputs=[download_glb, download_gs],
     )
-# Launch Script
 if __name__ == "__main__":
-    print("🚀 Initializing Pipeline...")
-    # 1. Pipeline 로드
     pipeline = TrellisVGGTTo3DPipeline.from_pretrained("esther11/trellis-vggt-v0-2")
-    # 2. 모든 모델을 일단 CUDA:0에 올려서 기본 설정(device mismatch 방지)을 완료함
     pipeline.cuda()
-    pipeline._device = torch.device("cuda:0") # 내부 device 속성 고정
-    gpu_count = torch.cuda.device_count()
-    print(f"⚡ Detected {gpu_count} GPUs.")
-    if gpu_count > 1:
-        print("⚡ Multi-GPU Mode: Offloading VGGT from GPU 0.")
-        # [핵심 로직] GPU 0을 비우기 위한 전략
-        # VGGT 모델을 잠시 CPU로 내립니다.
-        pipeline.VGGT_model.cpu()
-        print("   - Calculating Device Map (Banning GPU 0 for VGGT)...")
-        # max_memory 설정:
-        # GPU 0: "10MiB" (사실상 VGGT 모델 적재 금지)
-        # GPU 1~N: "20GiB" (여유롭게 할당)
-        max_mem = {0: "10MiB"}
-        for i in range(1, gpu_count):
-            max_mem[i] = "20GiB"
-        # 이 설정으로 맵을 짜면 accelerate는 GPU 0을 건너뛰고 GPU 1부터 모델을 채웁니다.
-        device_map = infer_auto_device_map(
-            pipeline.VGGT_model,
-            max_memory=max_mem,
-            no_split_module_classes=["Block", "ResnetBlock"]
-        )
-        # 맵 적용하여 분산 로드
-        pipeline.VGGT_model = dispatch_model(pipeline.VGGT_model, device_map=device_map)
-        print("✅ VGGT Model successfully pushed to GPU 1+.")
-        print("   - GPU 0: Birefnet (Preprocessing) + Controller")
-        print("   - GPU 1+: VGGT (Inference)")
-    else:
-        print("⚠️ Warning: Only 1 GPU detected. Expect OOM if VRAM < 24GB.")
     demo.launch()

+import gradio as gr
+from gradio_litmodel3d import LitModel3D
 import os
 import shutil
 os.environ['SPCONV_ALGO'] = 'native'
+from typing import *
 import torch
 import numpy as np
 import imageio
 from easydict import EasyDict as edict
 from PIL import Image
 from trellis.pipelines import TrellisVGGTTo3DPipeline
 from trellis.representations import Gaussian, MeshExtractResult
 from trellis.utils import render_utils, postprocessing_utils
 MAX_SEED = np.iinfo(np.int32).max
 TMP_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'tmp')
     user_dir = os.path.join(TMP_DIR, str(req.session_hash))
     os.makedirs(user_dir, exist_ok=True)
 def end_session(req: gr.Request):
     """
     Remove the per-session scratch directory when a session ends.
     The directory path is TMP_DIR joined with the request's session hash.
     A directory that does not exist is treated as already cleaned up.
     Args:
         req (gr.Request): Gradio request object whose session_hash names the directory
     """
     user_dir = os.path.join(TMP_DIR, str(req.session_hash))
     try:
         shutil.rmtree(user_dir)
     except FileNotFoundError:
         # Nothing to delete: the directory was never created or is already
         # gone. Other failures (e.g. permissions) still propagate.
         pass
 def preprocess_image(image: Image.Image) -> Image.Image:
     """
     Run the pipeline's single-image preprocessing on one input image.
     This is a thin wrapper: the image is handed to the module-level
     ``pipeline`` object and the result of its ``preprocess_image`` method
     is returned unchanged.
     Args:
         image (Image.Image): The image to preprocess
     Returns:
         Image.Image: Whatever ``pipeline.preprocess_image`` returns for the image
     """
     return pipeline.preprocess_image(image)
 def preprocess_videos(video: str) -> List[Tuple[Image.Image, str]]:
+    """
+    Preprocess the input video for multi-image 3D generation.
+    This function is called when a user uploads a video.
+    It extracts frames from the video and processes each frame to prepare them
+    for the multi-image 3D generation pipeline.
+    Args:
+        video (str): The path to the input video file
+    Returns:
+        List[Tuple[Image.Image, str]]: The list of preprocessed images ready for 3D generation
+    """
     vid = imageio.get_reader(video, 'ffmpeg')
     fps = vid.get_meta_data()['fps']
     images = []
     return processed_images
 def preprocess_images(images: List[Tuple[Image.Image, str]]) -> List[Image.Image]:
     """
     Preprocess every image of a gallery upload.
     Each gallery entry is indexed at position 0 to obtain its image, and the
     images are then passed one by one, in order, through
     ``pipeline.preprocess_image``.
     Args:
         images (List[Tuple[Image.Image, str]]): Gallery entries; element 0 of each entry is the image
     Returns:
         List[Image.Image]: One preprocessed image per gallery entry, in input order
     """
     # Extract all images first, then preprocess, so the two passes stay separate.
     pictures = [entry[0] for entry in images]
     return list(map(pipeline.preprocess_image, pictures))
 def pack_state(gs: Gaussian, mesh: MeshExtractResult) -> dict:
     return {
         'gaussian': {
             'faces': mesh.faces.cpu().numpy(),
         },
     }
 def unpack_state(state: dict) -> Tuple[Gaussian, edict, str]:
     gs = Gaussian(
         aabb=state['gaussian']['aabb'],
         sh_degree=state['gaussian']['sh_degree'],
         opacity_bias=state['gaussian']['opacity_bias'],
         scaling_activation=state['gaussian']['scaling_activation'],
     )
+    gs._xyz = torch.tensor(state['gaussian']['_xyz'], device='cuda')
+    gs._features_dc = torch.tensor(state['gaussian']['_features_dc'], device='cuda')
+    gs._scaling = torch.tensor(state['gaussian']['_scaling'], device='cuda')
+    gs._rotation = torch.tensor(state['gaussian']['_rotation'], device='cuda')
+    gs._opacity = torch.tensor(state['gaussian']['_opacity'], device='cuda')
     mesh = edict(
+        vertices=torch.tensor(state['mesh']['vertices'], device='cuda'),
+        faces=torch.tensor(state['mesh']['faces'], device='cuda'),
     )
     return gs, mesh
 def get_seed(randomize_seed: bool, seed: int) -> int:
     """
     Pick the seed for a generation run.
     Args:
         randomize_seed (bool): When truthy, ignore ``seed`` and draw a random one
         seed (int): The seed to use when no random draw is requested
     Returns:
         int: ``seed`` unchanged, or a value drawn with ``np.random.randint(0, MAX_SEED)``
     """
     if not randomize_seed:
         return seed
     return np.random.randint(0, MAX_SEED)
 def generate_and_extract_glb(
     multiimages: List[Tuple[Image.Image, str]],
     seed: int,
     texture_size: int,
     req: gr.Request,
 ) -> Tuple[dict, str, str, str]:
+    """
+    Convert an image to a 3D model and extract GLB file.
+    Args:
+        image (Image.Image): The input image.
+        multiimages (List[Tuple[Image.Image, str]]): The input images in multi-image mode.
+        is_multiimage (bool): Whether is in multi-image mode.
+        seed (int): The random seed.
+        ss_guidance_strength (float): The guidance strength for sparse structure generation.
+        ss_sampling_steps (int): The number of sampling steps for sparse structure generation.
+        slat_guidance_strength (float): The guidance strength for structured latent generation.
+        slat_sampling_steps (int): The number of sampling steps for structured latent generation.
+        multiimage_algo (Literal["multidiffusion", "stochastic"]): The algorithm for multi-image generation.
+        mesh_simplify (float): The mesh simplification factor.
+        texture_size (int): The texture resolution.
+    Returns:
+        dict: The information of the generated 3D model.
+        str: The path to the video of the 3D model.
+        str: The path to the extracted GLB file.
+        str: The path to the extracted GLB file (for download).
+    """
     user_dir = os.path.join(TMP_DIR, str(req.session_hash))
     image_files = [image[0] for image in multiimages]
+    # Generate 3D model
+    outputs, _, _ = pipeline.run(
+        image=image_files,
+        seed=seed,
+        formats=["gaussian", "mesh"],
+        preprocess_image=False,
+        sparse_structure_sampler_params={
+            "steps": ss_sampling_steps,
+            "cfg_strength": ss_guidance_strength,
+        },
+        slat_sampler_params={
+            "steps": slat_sampling_steps,
+            "cfg_strength": slat_guidance_strength,
+        },
+        mode=multiimage_algo,
+    )
+    # Render video
+    # import uuid
+    # output_id = str(uuid.uuid4())
+    # os.makedirs(f"{TMP_DIR}/{output_id}", exist_ok=True)
+    # video_path = f"{TMP_DIR}/{output_id}/preview.mp4"
+    # glb_path = f"{TMP_DIR}/{output_id}/mesh.glb"
     video = render_utils.render_video(outputs['gaussian'][0], num_frames=120)['color']
     video_geo = render_utils.render_video(outputs['mesh'][0], num_frames=120)['normal']
     video_path = os.path.join(user_dir, 'sample.mp4')
     imageio.mimsave(video_path, video, fps=15)
+    # Extract GLB
     gs = outputs['gaussian'][0]
     mesh = outputs['mesh'][0]
     glb = postprocessing_utils.to_glb(gs, mesh, simplify=mesh_simplify, texture_size=texture_size, verbose=False)
     glb_path = os.path.join(user_dir, 'sample.glb')
     glb.export(glb_path)
+    # Pack state for optional Gaussian extraction
     state = pack_state(gs, mesh)
     torch.cuda.empty_cache()
     return state, video_path, glb_path, glb_path
 def extract_gaussian(state: dict, req: gr.Request) -> Tuple[str, str]:
     """
     Write the Gaussian part of a packed model state to a .ply file.
     The Gaussian object is rebuilt from ``state`` with ``unpack_state`` and saved
     as ``sample.ply`` in the session directory (TMP_DIR joined with the request's
     session hash). ``torch.cuda.empty_cache()`` is called before returning.
     Args:
         state (dict): Packed model state, in the form ``unpack_state`` accepts
         req (gr.Request): Gradio request object whose session_hash selects the session directory
     Returns:
         Tuple[str, str]: The path of the written .ply file, returned twice
     """
     session_dir = os.path.join(TMP_DIR, str(req.session_hash))
     ply_path = os.path.join(session_dir, 'sample.ply')
     # Only the Gaussian is needed here; the mesh half of the state is discarded.
     gaussians, _ = unpack_state(state)
     gaussians.save_ply(ply_path)
     torch.cuda.empty_cache()
     return ply_path, ply_path
 def prepare_multi_example() -> List[Image.Image]:
     multi_case = list(set([i.split('_')[0] for i in os.listdir("assets/example_multi_image")]))
     images = []
     for case in multi_case:
             images.append(Image.fromarray(np.concatenate(_images, axis=1)))
     return images
 def split_image(image: Image.Image) -> List[Image.Image]:
+    """
+    Split a multi-view image into separate view images.
+    This function is called when users select multi-image examples that contain
+    multiple views in a single concatenated image. It automatically splits them
+    based on alpha channel boundaries and preprocesses each view.
+    Args:
+        image (Image.Image): A concatenated image containing multiple views
+    Returns:
+        List[Image.Image]: List of individual preprocessed view images
+    """
     image = np.array(image)
     alpha = image[..., 3]
     alpha = np.any(alpha>0, axis=0)
     """
 )
 with demo:
+    gr.Markdown("""
+    # 💻 ReconViaGen
+    <p align="center">
+    <a title="Github" href="https://github.com/GAP-LAB-CUHK-SZ/ReconViaGen" target="_blank" rel="noopener noreferrer" style="display: inline-block;">
+        <img src="https://img.shields.io/github/stars/GAP-LAB-CUHK-SZ/ReconViaGen?label=GitHub%20%E2%98%85&logo=github&color=C8C" alt="badge-github-stars">
+    </a>
+    <a title="Website" href="https://jiahao620.github.io/reconviagen/" target="_blank" rel="noopener noreferrer" style="display: inline-block;">
+        <img src="https://www.obukhov.ai/img/badges/badge-website.svg">
+    </a>
+    <a title="arXiv" href="https://jiahao620.github.io/reconviagen/" target="_blank" rel="noopener noreferrer" style="display: inline-block;">
+        <img src="https://www.obukhov.ai/img/badges/badge-pdf.svg">
+    </a>
+    </p>
+    ✨This demo is partial. We will release the whole model later. Stay tuned!✨
+    """)
     with gr.Row():
         with gr.Column():
                     input_video = gr.Video(label="Upload Video", interactive=True, height=300)
                     image_prompt = gr.Image(label="Image Prompt", format="png", visible=False, image_mode="RGBA", type="pil", height=300)
                     multiimage_prompt = gr.Gallery(label="Image Prompt", format="png", type="pil", height=300, columns=3)
+                    gr.Markdown("""
+                        Input different views of the object in separate images.
+                                """)
             with gr.Accordion(label="Generation Settings", open=False):
                 seed = gr.Slider(0, MAX_SEED, label="Seed", value=0, step=1)
             generate_btn = gr.Button("Generate & Extract GLB", variant="primary")
             extract_gs_btn = gr.Button("Extract Gaussian", interactive=False)
+            gr.Markdown("""
+                        *NOTE: Gaussian file can be very large (~50MB), it will take a while to display and download.*
+                        """)
         with gr.Column():
             video_output = gr.Video(label="Generated 3D Asset", autoplay=True, loop=True, height=300)
     output_buf = gr.State()
+    # Example images at the bottom of the page
     with gr.Row() as multiimage_example:
         examples_multi = gr.Examples(
             examples=prepare_multi_example(),
     demo.load(start_session)
     demo.unload(end_session)
+    input_video.upload(
+        preprocess_videos,
+        inputs=[input_video],
+        outputs=[multiimage_prompt],
+    )
+    input_video.clear(
+        lambda: tuple([None, None]),
+        outputs=[input_video, multiimage_prompt],
+    )
+    multiimage_prompt.upload(
+        preprocess_images,
+        inputs=[multiimage_prompt],
+        outputs=[multiimage_prompt],
+    )
     generate_btn.click(
+        get_seed,
+        inputs=[randomize_seed, seed],
+        outputs=[seed],
     ).then(
         generate_and_extract_glb,
         inputs=[multiimage_prompt, seed, ss_guidance_strength, ss_sampling_steps, slat_guidance_strength, slat_sampling_steps, multiimage_algo, mesh_simplify, texture_size],
         outputs=[extract_gs_btn, download_glb, download_gs],
     )
+    extract_gs_btn.click(
+        extract_gaussian,
+        inputs=[output_buf],
+        outputs=[model_output, download_gs],
+    ).then(
+        lambda: gr.Button(interactive=True),
+        outputs=[download_gs],
     )
     model_output.clear(
         lambda: tuple([gr.Button(interactive=False), gr.Button(interactive=False)]),
         outputs=[download_glb, download_gs],
     )
+# Launch the Gradio app
 if __name__ == "__main__":
     # `pipeline` is bound at module scope here, so the handler functions that
     # reference it (e.g. preprocess_image) resolve it as a global at call time.
     # It only exists when this file is executed as a script.
     pipeline = TrellisVGGTTo3DPipeline.from_pretrained("esther11/trellis-vggt-v0-2")
     pipeline.cuda()
     # NOTE(review): the two sub-models are moved to CUDA explicitly in addition
     # to pipeline.cuda() - presumably that call does not cover them; confirm.
+    pipeline.VGGT_model.cuda()
+    pipeline.birefnet_model.cuda()
     # Start serving the Gradio Blocks app held in `demo`.
     demo.launch()