import gradio as gr
from gradio_client import Client, handle_file
import os
# Conditional `spaces` import for Hugging Face / RunPod / local support.
# On RunPod or local machines we mock `spaces` to avoid ZeroGPU quota checks and crashes.
if os.environ.get('SPACE_ID') and not os.environ.get('RUNPOD_POD_ID'):
import spaces
else:
# Mock spaces if not on HF (RunPod/Local)
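    # With the mock, @spaces.GPU(duration=...) becomes a no-op decorator that returns the function unchanged.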
class spaces:
@staticmethod
def GPU(duration=120):
def decorator(func):
return func
return decorator
os.environ["OPENCV_IO_ENABLE_OPENEXR"] = '1'
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
os.environ["ATTN_BACKEND"] = "flash_attn_3"
os.environ["FLEX_GEMM_AUTOTUNE_CACHE_PATH"] = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'autotune_cache.json')
os.environ["FLEX_GEMM_AUTOTUNER_VERBOSE"] = '1'
from datetime import datetime
import shutil
from typing import Tuple
import torch
import numpy as np
from PIL import Image
import tempfile
import imageio
import cv2
from trellis2.modules.sparse import SparseTensor
from trellis2.pipelines import Trellis2ImageTo3DPipeline
from trellis2.renderers import EnvMap
from trellis2.utils import render_utils
import o_voxel
MAX_SEED = np.iinfo(np.int32).max
TMP_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'tmp')
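# Each Gradio session gets its own scratch directory under TMP_DIR; it is created
# on page load (start_session) and removed when the session ends (end_session).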
def start_session(req: gr.Request):
user_dir = os.path.join(TMP_DIR, str(req.session_hash))
os.makedirs(user_dir, exist_ok=True)
def end_session(req: gr.Request):
user_dir = os.path.join(TMP_DIR, str(req.session_hash))
if os.path.exists(user_dir):
shutil.rmtree(user_dir)
# BiRefNet Global Instance (Lazy Loaded)
from utils_birefnet import BiRefNet
birefnet = None
@spaces.GPU(duration=20)
def remove_background(input: Image.Image) -> Image.Image:
global birefnet
if birefnet is None:
print("[LAZY LOAD] Initializing BiRefNet...", flush=True)
birefnet = BiRefNet()
birefnet.cuda()
# Use local BiRefNet
return birefnet(input)
def preprocess_image(input: Image.Image) -> Image.Image:
"""
Preprocess the input image.
"""
# if has alpha channel, use it directly; otherwise, remove background
has_alpha = False
if input.mode == 'RGBA':
alpha = np.array(input)[:, :, 3]
if not np.all(alpha == 255):
has_alpha = True
max_size = max(input.size)
scale = min(1, 1024 / max_size)
if scale < 1:
input = input.resize((int(input.width * scale), int(input.height * scale)), Image.Resampling.LANCZOS)
if has_alpha:
output = input
else:
output = remove_background(input)
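    # Crop to a square around the non-transparent region, then composite the
    # RGB channels onto black using the alpha mask.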
output_np = np.array(output)
alpha = output_np[:, :, 3]
bbox = np.argwhere(alpha > 0.8 * 255)
bbox = np.min(bbox[:, 1]), np.min(bbox[:, 0]), np.max(bbox[:, 1]), np.max(bbox[:, 0])
center = (bbox[0] + bbox[2]) / 2, (bbox[1] + bbox[3]) / 2
size = max(bbox[2] - bbox[0], bbox[3] - bbox[1])
    size = int(size * 1)  # crop expansion factor (1 = tight square crop)
bbox = center[0] - size // 2, center[1] - size // 2, center[0] + size // 2, center[1] + size // 2
output = output.crop(bbox) # type: ignore
output = np.array(output).astype(np.float32) / 255
output = output[:, :, :3] * output[:, :, 3:4]
output = Image.fromarray((output * 255).astype(np.uint8))
return output
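# The generated latents are converted to CPU numpy arrays so they can be held in
# gr.State between the generation step and the later GLB-extraction step.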
def pack_state(latents: Tuple[SparseTensor, SparseTensor, int]) -> dict:
shape_slat, tex_slat, res = latents
return {
'shape_slat_feats': shape_slat.feats.cpu().numpy(),
'tex_slat_feats': tex_slat.feats.cpu().numpy(),
'coords': shape_slat.coords.cpu().numpy(),
'res': res,
}
def unpack_state(state: dict) -> Tuple[SparseTensor, SparseTensor, int]:
shape_slat = SparseTensor(
feats=torch.from_numpy(state['shape_slat_feats']).cuda(),
coords=torch.from_numpy(state['coords']).cuda(),
)
tex_slat = shape_slat.replace(torch.from_numpy(state['tex_slat_feats']).cuda())
return shape_slat, tex_slat, state['res']
def get_seed(randomize_seed: bool, seed: int) -> int:
"""
    Return a random seed if `randomize_seed` is set; otherwise return the provided seed.
"""
return np.random.randint(0, MAX_SEED) if randomize_seed else seed
import threading
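# Global lock that serializes pipeline runs so concurrent Gradio requests do not
# share the GPU (used on RunPod, where ZeroGPU scheduling is unavailable).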
GPU_LOCK = threading.Lock()
@spaces.GPU(duration=120)
def image_to_3d(
image: Image.Image,
seed: int,
resolution: str,
ss_guidance_strength: float,
ss_guidance_rescale: float,
ss_sampling_steps: int,
ss_rescale_t: float,
shape_slat_guidance_strength: float,
shape_slat_guidance_rescale: float,
shape_slat_sampling_steps: int,
shape_slat_rescale_t: float,
tex_slat_guidance_strength: float,
tex_slat_guidance_rescale: float,
tex_slat_sampling_steps: int,
tex_slat_rescale_t: float,
decimation_target: int,
texture_size: int,
req: gr.Request,
progress=gr.Progress(track_tqdm=True),
) -> Tuple[dict, str]:
"""
Convert an image to a 3D model.
Returns:
dict: The state of the generated 3D model.
str: The path to the video of the 3D model.
"""
    # Free cached GPU memory and prepare the per-session output directory
torch.cuda.empty_cache()
user_dir = os.path.join(TMP_DIR, str(req.session_hash))
os.makedirs(user_dir, exist_ok=True)
# Use Global Lock to prevent concurrent runs on RunPod
with GPU_LOCK:
try:
# --- Sampling ---
outputs, latents = pipeline.run(
image,
seed=seed,
preprocess_image=False,
sparse_structure_sampler_params={
"steps": ss_sampling_steps,
"guidance_strength": ss_guidance_strength,
"guidance_rescale": ss_guidance_rescale,
"rescale_t": ss_rescale_t,
},
shape_slat_sampler_params={
"steps": shape_slat_sampling_steps,
"guidance_strength": shape_slat_guidance_strength,
"guidance_rescale": shape_slat_guidance_rescale,
"rescale_t": shape_slat_rescale_t,
},
tex_slat_sampler_params={
"steps": tex_slat_sampling_steps,
"guidance_strength": tex_slat_guidance_strength,
"guidance_rescale": tex_slat_guidance_rescale,
"rescale_t": tex_slat_rescale_t,
},
pipeline_type={
"512": "512",
"1024": "1024_cascade",
"1536": "1536_cascade",
}[resolution],
return_latent=True,
)
mesh = outputs[0]
mesh.simplify(16777216) # nvdiffrast limit
# --- Render Video Preview (8 seconds at 15fps = 120 frames) ---
render_result = render_utils.render_video(mesh, num_frames=120, resolution=720, envmap=envmap, verbose=False)
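            # Prefer the forest-HDRI-lit render when the renderer returns per-envmap
            # passes; fall back to the generic shaded pass otherwise.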
video = render_result.get('shaded_forest', render_result.get('shaded'))
download_video = os.path.join(user_dir, 'sample.mp4')
imageio.mimsave(download_video, video, fps=15, pixelformat='yuv420p')
state = pack_state(latents)
return state, download_video
except Exception as e:
print(f"Error during generation: {e}")
raise e
finally:
            # Cleanup: drop references to the large tensors and meshes so their VRAM
            # can be reclaimed. `del locals()[var]` does not remove function locals,
            # so rebind the names to None instead; assignment never raises NameError,
            # even if a crash happened before a variable was created.
            latents = mesh = outputs = render_result = video = None
            import gc
            gc.collect()
            torch.cuda.empty_cache()
@spaces.GPU(duration=120)
def extract_glb(
state: dict,
decimation_target: int,
texture_size: int,
req: gr.Request,
progress=gr.Progress(track_tqdm=True),
) -> Tuple[str, str]:
"""
Extract a GLB file from the 3D model.
Args:
state (dict): The state of the generated 3D model.
decimation_target (int): The target face count for decimation.
texture_size (int): The texture resolution.
Returns:
        str: The path to the extracted GLB file (shown in the 3D viewer).
        str: The same path, wired to the download button.
"""
user_dir = os.path.join(TMP_DIR, str(req.session_hash))
with GPU_LOCK:
try:
shape_slat, tex_slat, res = unpack_state(state)
mesh = pipeline.decode_latent(shape_slat, tex_slat, res)[0]
            # The latents could be freed immediately after decoding, but the
            # finally block below handles cleanup on every code path.
mesh.simplify(16777216)
glb = o_voxel.postprocess.to_glb(
vertices=mesh.vertices,
faces=mesh.faces,
attr_volume=mesh.attrs,
coords=mesh.coords,
attr_layout=pipeline.pbr_attr_layout,
grid_size=res,
aabb=[[-0.5, -0.5, -0.5], [0.5, 0.5, 0.5]],
decimation_target=decimation_target,
texture_size=texture_size,
remesh=True,
remesh_band=1,
remesh_project=0,
use_tqdm=True,
)
now = datetime.now()
timestamp = now.strftime("%Y-%m-%dT%H%M%S") + f".{now.microsecond // 1000:03d}"
os.makedirs(user_dir, exist_ok=True)
glb_path = os.path.join(user_dir, f'sample_{timestamp}.glb')
glb.export(glb_path, extension_webp=True)
return glb_path, glb_path
except Exception as e:
print(f"Error during extraction: {e}")
raise e
finally:
            # Cleanup: rebind names to None so the GPU tensors can be reclaimed
            # (`del locals()[var]` has no effect on function locals).
            shape_slat = tex_slat = mesh = glb = None
            import gc
            gc.collect()
            torch.cuda.empty_cache()
with gr.Blocks(delete_cache=(600, 600)) as demo:
gr.Markdown("""
## Image to 3D Asset with [TRELLIS.2](https://microsoft.github.io/TRELLIS.2)
* Upload an image (preferably with an alpha-masked foreground object) and click Generate to create a 3D asset.
    * If you're satisfied with the result, click Extract GLB to export and download the GLB file. Otherwise, try again with a different seed.
""")
with gr.Row():
with gr.Column():
image_prompt = gr.Image(label="Image Prompt", format="png", image_mode="RGBA", type="pil", height=300)
resolution = gr.Radio(["512", "1024", "1536"], label="Resolution", value="1024")
seed = gr.Slider(0, MAX_SEED, label="Seed", value=0, step=1)
randomize_seed = gr.Checkbox(label="Randomize Seed", value=True)
generate_btn = gr.Button("Generate")
with gr.Accordion(label="Advanced Settings", open=False):
gr.Markdown("Stage 1: Sparse Structure Generation")
with gr.Row():
ss_guidance_strength = gr.Slider(1.0, 10.0, label="Guidance Strength", value=7.5, step=0.1)
ss_guidance_rescale = gr.Slider(0.0, 1.0, label="Guidance Rescale", value=0.7, step=0.01)
ss_sampling_steps = gr.Slider(1, 50, label="Sampling Steps", value=12, step=1)
ss_rescale_t = gr.Slider(1.0, 6.0, label="Rescale T", value=5.0, step=0.1)
gr.Markdown("Stage 2: Shape Generation")
with gr.Row():
shape_slat_guidance_strength = gr.Slider(1.0, 10.0, label="Guidance Strength", value=7.5, step=0.1)
shape_slat_guidance_rescale = gr.Slider(0.0, 1.0, label="Guidance Rescale", value=0.5, step=0.01)
shape_slat_sampling_steps = gr.Slider(1, 50, label="Sampling Steps", value=12, step=1)
shape_slat_rescale_t = gr.Slider(1.0, 6.0, label="Rescale T", value=3.0, step=0.1)
gr.Markdown("Stage 3: Material Generation")
with gr.Row():
tex_slat_guidance_strength = gr.Slider(1.0, 10.0, label="Guidance Strength", value=8.5, step=0.1)
tex_slat_guidance_rescale = gr.Slider(0.0, 1.0, label="Guidance Rescale", value=0.0, step=0.01)
tex_slat_sampling_steps = gr.Slider(1, 50, label="Sampling Steps", value=12, step=1)
tex_slat_rescale_t = gr.Slider(1.0, 6.0, label="Rescale T", value=3.0, step=0.1)
with gr.Accordion(label="GLB Extraction Settings", open=False):
decimation_target = gr.Slider(100000, 500000, label="Decimation Target", value=300000, step=10000)
texture_size = gr.Slider(1024, 4096, label="Texture Size", value=2048, step=1024)
extract_glb_btn = gr.Button("Extract GLB", interactive=False)
with gr.Column():
video_output = gr.Video(label="Generated 3D Asset", autoplay=True, loop=True, height=300)
model_output = gr.Model3D(label="Extracted GLB", height=300, display_mode="solid", clear_color=(0.25, 0.25, 0.25, 1.0))
download_glb = gr.DownloadButton(label="Download GLB", interactive=False)
with gr.Row():
examples = gr.Examples(
examples=[
f'assets/example_image/{image}'
for image in os.listdir("assets/example_image")
],
inputs=[image_prompt],
fn=preprocess_image,
outputs=[image_prompt],
run_on_click=True,
examples_per_page=18,
)
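    # Holds the packed latents (see pack_state) between generation and GLB extraction.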
output_buf = gr.State()
# Handlers
demo.load(start_session)
demo.unload(end_session)
image_prompt.upload(
preprocess_image,
inputs=[image_prompt],
outputs=[image_prompt],
)
generate_btn.click(
get_seed,
inputs=[randomize_seed, seed],
outputs=[seed],
).then(
image_to_3d,
inputs=[
image_prompt, seed, resolution,
ss_guidance_strength, ss_guidance_rescale, ss_sampling_steps, ss_rescale_t,
shape_slat_guidance_strength, shape_slat_guidance_rescale, shape_slat_sampling_steps, shape_slat_rescale_t,
tex_slat_guidance_strength, tex_slat_guidance_rescale, tex_slat_sampling_steps, tex_slat_rescale_t,
decimation_target, texture_size,
],
outputs=[output_buf, video_output],
).then(
lambda: gr.update(interactive=True),
outputs=[extract_glb_btn],
)
video_output.clear(
lambda: gr.update(interactive=False),
outputs=[extract_glb_btn],
)
extract_glb_btn.click(
extract_glb,
inputs=[output_buf, decimation_target, texture_size],
outputs=[model_output, download_glb],
).then(
lambda: gr.update(interactive=True),
outputs=[download_glb],
)
model_output.clear(
lambda: gr.update(interactive=False),
outputs=[download_glb],
)
# Launch the Gradio app
if __name__ == "__main__":
os.makedirs(TMP_DIR, exist_ok=True)
pipeline = Trellis2ImageTo3DPipeline.from_pretrained('microsoft/TRELLIS.2-4B')
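    # Background removal is handled locally by BiRefNet (see remove_background above),
    # so the pipeline's built-in rembg model is disabled and low-VRAM mode is enabled.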
pipeline.rembg_model = None
pipeline.low_vram = True
pipeline.cuda()
    # Load HDRI environment maps used to light the shaded video previews
envmap = {
'forest': EnvMap(torch.tensor(
cv2.cvtColor(cv2.imread('assets/hdri/forest.exr', cv2.IMREAD_UNCHANGED), cv2.COLOR_BGR2RGB),
dtype=torch.float32, device='cuda'
)),
'sunset': EnvMap(torch.tensor(
cv2.cvtColor(cv2.imread('assets/hdri/sunset.exr', cv2.IMREAD_UNCHANGED), cv2.COLOR_BGR2RGB),
dtype=torch.float32, device='cuda'
)),
'courtyard': EnvMap(torch.tensor(
cv2.cvtColor(cv2.imread('assets/hdri/courtyard.exr', cv2.IMREAD_UNCHANGED), cv2.COLOR_BGR2RGB),
dtype=torch.float32, device='cuda'
)),
}
demo.launch()