import gradio as gr
from gradio_client import Client, handle_file
import os
# Conditional `spaces` import for Hugging Face / RunPod / local support.
# On RunPod or local machines we mock `spaces` to avoid ZeroGPU quota checks and crashes.
if os.environ.get('SPACE_ID') and not os.environ.get('RUNPOD_POD_ID'):
import spaces
else:
# Mock spaces if not on HF (RunPod/Local)
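    # With the mock, @spaces.GPU(duration=...) becomes a no-op decorator that returns the function unchanged.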
class spaces:
@staticmethod
def GPU(duration=120):
def decorator(func):
return func
return decorator
os.environ["OPENCV_IO_ENABLE_OPENEXR"] = '1'
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
os.environ["ATTN_BACKEND"] = "flash_attn_3"
os.environ["FLEX_GEMM_AUTOTUNE_CACHE_PATH"] = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'autotune_cache.json')
os.environ["FLEX_GEMM_AUTOTUNER_VERBOSE"] = '1'
from datetime import datetime
import shutil
from typing import Tuple
import torch
import numpy as np
from PIL import Image
import tempfile
import imageio
import cv2
from trellis2.modules.sparse import SparseTensor
from trellis2.pipelines import Trellis2ImageTo3DPipeline
from trellis2.renderers import EnvMap
from trellis2.utils import render_utils
import o_voxel
MAX_SEED = np.iinfo(np.int32).max
TMP_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'tmp')
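# Each Gradio session gets its own scratch directory under TMP_DIR; it is created
# on page load (start_session) and removed when the session ends (end_session).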
def start_session(req: gr.Request):
user_dir = os.path.join(TMP_DIR, str(req.session_hash))
os.makedirs(user_dir, exist_ok=True)
def end_session(req: gr.Request):
user_dir = os.path.join(TMP_DIR, str(req.session_hash))
if os.path.exists(user_dir):
shutil.rmtree(user_dir)
# BiRefNet Global Instance (Lazy Loaded)
from utils_birefnet import BiRefNet
birefnet = None
@spaces.GPU(duration=20)
def remove_background(input: Image.Image) -> Image.Image:
global birefnet
if birefnet is None:
print("[LAZY LOAD] Initializing BiRefNet...", flush=True)
birefnet = BiRefNet()
birefnet.cuda()
# Use local BiRefNet
return birefnet(input)
def preprocess_image(input: Image.Image) -> Image.Image:
"""
Preprocess the input image.
"""
# if has alpha channel, use it directly; otherwise, remove background
has_alpha = False
if input.mode == 'RGBA':
alpha = np.array(input)[:, :, 3]
if not np.all(alpha == 255):
has_alpha = True
max_size = max(input.size)
scale = min(1, 1024 / max_size)
if scale < 1:
input = input.resize((int(input.width * scale), int(input.height * scale)), Image.Resampling.LANCZOS)
if has_alpha:
output = input
else:
output = remove_background(input)
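    # Crop to a square around the non-transparent region, then composite the
    # RGB channels onto black using the alpha mask.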
output_np = np.array(output)
alpha = output_np[:, :, 3]
bbox = np.argwhere(alpha > 0.8 * 255)
bbox = np.min(bbox[:, 1]), np.min(bbox[:, 0]), np.max(bbox[:, 1]), np.max(bbox[:, 0])
center = (bbox[0] + bbox[2]) / 2, (bbox[1] + bbox[3]) / 2
size = max(bbox[2] - bbox[0], bbox[3] - bbox[1])
    size = int(size * 1)  # crop expansion factor (1 = tight square crop)
bbox = center[0] - size // 2, center[1] - size // 2, center[0] + size // 2, center[1] + size // 2
output = output.crop(bbox) # type: ignore
output = np.array(output).astype(np.float32) / 255
output = output[:, :, :3] * output[:, :, 3:4]
output = Image.fromarray((output * 255).astype(np.uint8))
return output
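# The generated latents are converted to CPU numpy arrays so they can be held in
# gr.State between the generation step and the later GLB-extraction step.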
def pack_state(latents: Tuple[SparseTensor, SparseTensor, int]) -> dict:
shape_slat, tex_slat, res = latents
return {
'shape_slat_feats': shape_slat.feats.cpu().numpy(),
'tex_slat_feats': tex_slat.feats.cpu().numpy(),
'coords': shape_slat.coords.cpu().numpy(),
'res': res,
}
def unpack_state(state: dict) -> Tuple[SparseTensor, SparseTensor, int]:
shape_slat = SparseTensor(
feats=torch.from_numpy(state['shape_slat_feats']).cuda(),
coords=torch.from_numpy(state['coords']).cuda(),
)
tex_slat = shape_slat.replace(torch.from_numpy(state['tex_slat_feats']).cuda())
return shape_slat, tex_slat, state['res']
def get_seed(randomize_seed: bool, seed: int) -> int:
"""
    Return a random seed if `randomize_seed` is set; otherwise return the provided seed.
"""
return np.random.randint(0, MAX_SEED) if randomize_seed else seed
import threading
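# Global lock that serializes pipeline runs so concurrent Gradio requests do not
# share the GPU (used on RunPod, where ZeroGPU scheduling is unavailable).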
GPU_LOCK = threading.Lock()
@spaces.GPU(duration=120)
def image_to_3d(
image: Image.Image,
seed: int,
resolution: str,
ss_guidance_strength: float,
ss_guidance_rescale: float,
ss_sampling_steps: int,
ss_rescale_t: float,
shape_slat_guidance_strength: float,
shape_slat_guidance_rescale: float,
shape_slat_sampling_steps: int,
shape_slat_rescale_t: float,
tex_slat_guidance_strength: float,
tex_slat_guidance_rescale: float,
tex_slat_sampling_steps: int,
tex_slat_rescale_t: float,
decimation_target: int,
texture_size: int,
req: gr.Request,
progress=gr.Progress(track_tqdm=True),
) -> Tuple[dict, str]:
"""
Convert an image to a 3D model.
Returns:
dict: The state of the generated 3D model.
str: The path to the video of the 3D model.
"""
    # Free cached GPU memory and prepare the per-session output directory
torch.cuda.empty_cache()
user_dir = os.path.join(TMP_DIR, str(req.session_hash))
os.makedirs(user_dir, exist_ok=True)
# Use Global Lock to prevent concurrent runs on RunPod
with GPU_LOCK:
try:
# --- Sampling ---
outputs, latents = pipeline.run(
image,
seed=seed,
preprocess_image=False,
sparse_structure_sampler_params={
"steps": ss_sampling_steps,
"guidance_strength": ss_guidance_strength,
"guidance_rescale": ss_guidance_rescale,
"rescale_t": ss_rescale_t,
},
shape_slat_sampler_params={
"steps": shape_slat_sampling_steps,
"guidance_strength": shape_slat_guidance_strength,
"guidance_rescale": shape_slat_guidance_rescale,
"rescale_t": shape_slat_rescale_t,
},
tex_slat_sampler_params={
"steps": tex_slat_sampling_steps,
"guidance_strength": tex_slat_guidance_strength,
"guidance_rescale": tex_slat_guidance_rescale,
"rescale_t": tex_slat_rescale_t,
},
pipeline_type={
"512": "512",
"1024": "1024_cascade",
"1536": "1536_cascade",
}[resolution],
return_latent=True,
)
mesh = outputs[0]
mesh.simplify(16777216) # nvdiffrast limit
# --- Render Video Preview (8 seconds at 15fps = 120 frames) ---
render_result = render_utils.render_video(mesh, num_frames=120, resolution=720, envmap=envmap, verbose=False)
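            # Prefer the forest-HDRI-lit render when the renderer returns per-envmap
            # passes; fall back to the generic shaded pass otherwise.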
video = render_result.get('shaded_forest', render_result.get('shaded'))
download_video = os.path.join(user_dir, 'sample.mp4')
imageio.mimsave(download_video, video, fps=15, pixelformat='yuv420p')
state = pack_state(latents)
return state, download_video
except Exception as e:
print(f"Error during generation: {e}")
raise e
finally:
            # Cleanup: drop references to the large tensors and meshes so their VRAM
            # can be reclaimed. `del locals()[var]` does not remove function locals,
            # so rebind the names to None instead; assignment never raises NameError,
            # even if a crash happened before a variable was created.
            latents = mesh = outputs = render_result = video = None
            import gc
            gc.collect()
            torch.cuda.empty_cache()
@spaces.GPU(duration=120)
def extract_glb(
state: dict,
decimation_target: int,
texture_size: int,
req: gr.Request,
progress=gr.Progress(track_tqdm=True),
) -> Tuple[str, str]:
"""
Extract a GLB file from the 3D model.
Args:
state (dict): The state of the generated 3D model.
decimation_target (int): The target face count for decimation.
texture_size (int): The texture resolution.
Returns:
        str: The path to the extracted GLB file (shown in the 3D viewer).
        str: The same path, wired to the download button.
"""
user_dir = os.path.join(TMP_DIR, str(req.session_hash))
with GPU_LOCK:
try:
shape_slat, tex_slat, res = unpack_state(state)
mesh = pipeline.decode_latent(shape_slat, tex_slat, res)[0]
            # The latents could be freed immediately after decoding, but the
            # finally block below handles cleanup on every code path.
mesh.simplify(16777216)
glb = o_voxel.postprocess.to_glb(
vertices=mesh.vertices,
faces=mesh.faces,
attr_volume=mesh.attrs,
coords=mesh.coords,
attr_layout=pipeline.pbr_attr_layout,
grid_size=res,
aabb=[[-0.5, -0.5, -0.5], [0.5, 0.5, 0.5]],
decimation_target=decimation_target,
texture_size=texture_size,
remesh=True,
remesh_band=1,
remesh_project=0,
use_tqdm=True,
)
now = datetime.now()
timestamp = now.strftime("%Y-%m-%dT%H%M%S") + f".{now.microsecond // 1000:03d}"
os.makedirs(user_dir, exist_ok=True)
glb_path = os.path.join(user_dir, f'sample_{timestamp}.glb')
glb.export(glb_path, extension_webp=True)
return glb_path, glb_path
except Exception as e:
print(f"Error during extraction: {e}")
raise e
finally:
            # Cleanup: rebind names to None so the GPU tensors can be reclaimed
            # (`del locals()[var]` has no effect on function locals).
            shape_slat = tex_slat = mesh = glb = None
            import gc
            gc.collect()
            torch.cuda.empty_cache()
with gr.Blocks(delete_cache=(600, 600)) as demo:
gr.Markdown("""
## Image to 3D Asset with [TRELLIS.2](https://microsoft.github.io/TRELLIS.2)
* Upload an image (preferably with an alpha-masked foreground object) and click Generate to create a 3D asset.
    * If you're satisfied with the result, click Extract GLB to export and download the GLB file. Otherwise, try again with a different seed.
""")
with gr.Row():
with gr.Column():
image_prompt = gr.Image(label="Image Prompt", format="png", image_mode="RGBA", type="pil", height=300)
resolution = gr.Radio(["512", "1024", "1536"], label="Resolution", value="1024")
seed = gr.Slider(0, MAX_SEED, label="Seed", value=0, step=1)
randomize_seed = gr.Checkbox(label="Randomize Seed", value=True)
generate_btn = gr.Button("Generate")
with gr.Accordion(label="Advanced Settings", open=False):
gr.Markdown("Stage 1: Sparse Structure Generation")
with gr.Row():
ss_guidance_strength = gr.Slider(1.0, 10.0, label="Guidance Strength", value=7.5, step=0.1)
ss_guidance_rescale = gr.Slider(0.0, 1.0, label="Guidance Rescale", value=0.7, step=0.01)
ss_sampling_steps = gr.Slider(1, 50, label="Sampling Steps", value=12, step=1)
ss_rescale_t = gr.Slider(1.0, 6.0, label="Rescale T", value=5.0, step=0.1)
gr.Markdown("Stage 2: Shape Generation")
with gr.Row():
shape_slat_guidance_strength = gr.Slider(1.0, 10.0, label="Guidance Strength", value=7.5, step=0.1)
shape_slat_guidance_rescale = gr.Slider(0.0, 1.0, label="Guidance Rescale", value=0.5, step=0.01)
shape_slat_sampling_steps = gr.Slider(1, 50, label="Sampling Steps", value=12, step=1)
shape_slat_rescale_t = gr.Slider(1.0, 6.0, label="Rescale T", value=3.0, step=0.1)
gr.Markdown("Stage 3: Material Generation")
with gr.Row():
tex_slat_guidance_strength = gr.Slider(1.0, 10.0, label="Guidance Strength", value=8.5, step=0.1)
tex_slat_guidance_rescale = gr.Slider(0.0, 1.0, label="Guidance Rescale", value=0.0, step=0.01)
tex_slat_sampling_steps = gr.Slider(1, 50, label="Sampling Steps", value=12, step=1)
tex_slat_rescale_t = gr.Slider(1.0, 6.0, label="Rescale T", value=3.0, step=0.1)
with gr.Accordion(label="GLB Extraction Settings", open=False):
decimation_target = gr.Slider(100000, 500000, label="Decimation Target", value=300000, step=10000)
texture_size = gr.Slider(1024, 4096, label="Texture Size", value=2048, step=1024)
extract_glb_btn = gr.Button("Extract GLB", interactive=False)
with gr.Column():
video_output = gr.Video(label="Generated 3D Asset", autoplay=True, loop=True, height=300)
model_output = gr.Model3D(label="Extracted GLB", height=300, display_mode="solid", clear_color=(0.25, 0.25, 0.25, 1.0))
download_glb = gr.DownloadButton(label="Download GLB", interactive=False)
with gr.Row():
examples = gr.Examples(
examples=[
f'assets/example_image/{image}'
for image in os.listdir("assets/example_image")
],
inputs=[image_prompt],
fn=preprocess_image,
outputs=[image_prompt],
run_on_click=True,
examples_per_page=18,
)
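    # Holds the packed latents (see pack_state) between generation and GLB extraction.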
output_buf = gr.State()
# Handlers
demo.load(start_session)
demo.unload(end_session)
image_prompt.upload(
preprocess_image,
inputs=[image_prompt],
outputs=[image_prompt],
)
generate_btn.click(
get_seed,
inputs=[randomize_seed, seed],
outputs=[seed],
).then(
image_to_3d,
inputs=[
image_prompt, seed, resolution,
ss_guidance_strength, ss_guidance_rescale, ss_sampling_steps, ss_rescale_t,
shape_slat_guidance_strength, shape_slat_guidance_rescale, shape_slat_sampling_steps, shape_slat_rescale_t,
tex_slat_guidance_strength, tex_slat_guidance_rescale, tex_slat_sampling_steps, tex_slat_rescale_t,
decimation_target, texture_size,
],
outputs=[output_buf, video_output],
).then(
lambda: gr.update(interactive=True),
outputs=[extract_glb_btn],
)
video_output.clear(
lambda: gr.update(interactive=False),
outputs=[extract_glb_btn],
)
extract_glb_btn.click(
extract_glb,
inputs=[output_buf, decimation_target, texture_size],
outputs=[model_output, download_glb],
).then(
lambda: gr.update(interactive=True),
outputs=[download_glb],
)
model_output.clear(
lambda: gr.update(interactive=False),
outputs=[download_glb],
)
# Launch the Gradio app
if __name__ == "__main__":
os.makedirs(TMP_DIR, exist_ok=True)
pipeline = Trellis2ImageTo3DPipeline.from_pretrained('microsoft/TRELLIS.2-4B')
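    # Background removal is handled locally by BiRefNet (see remove_background above),
    # so the pipeline's built-in rembg model is disabled and low-VRAM mode is enabled.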
pipeline.rembg_model = None
pipeline.low_vram = True
pipeline.cuda()
    # Load HDRI environment maps used to light the shaded video previews
envmap = {
'forest': EnvMap(torch.tensor(
cv2.cvtColor(cv2.imread('assets/hdri/forest.exr', cv2.IMREAD_UNCHANGED), cv2.COLOR_BGR2RGB),
dtype=torch.float32, device='cuda'
)),
'sunset': EnvMap(torch.tensor(
cv2.cvtColor(cv2.imread('assets/hdri/sunset.exr', cv2.IMREAD_UNCHANGED), cv2.COLOR_BGR2RGB),
dtype=torch.float32, device='cuda'
)),
'courtyard': EnvMap(torch.tensor(
cv2.cvtColor(cv2.imread('assets/hdri/courtyard.exr', cv2.IMREAD_UNCHANGED), cv2.COLOR_BGR2RGB),
dtype=torch.float32, device='cuda'
)),
}
demo.launch()