ReconViaGen-v0.5

Running on Zero

App Files Files Community

ReconViaGen-v0.5 / app.py

Stable-X

Update app.py

fce5394 verified 9 days ago

raw

history blame contribute delete

26.1 kB

	"""
	app.py – ReconViaGen v0.5 HuggingFace Space (ZeroGPU)
	=====================================================
	Stage 1 (SS) – ReconViaGen VGGT-based sparse structure
	Stage 2 (Shape) – TRELLIS.2 shape_slat (DINOv3-conditioned)
	Stage 3 (Texture)– TRELLIS.2 tex_slat (DINOv3-conditioned, PBR)
	"""

	import sys, os

	# ── Path setup (must precede any trellis2 import) ────────────────────────────
	_HERE = os.path.dirname(os.path.abspath(__file__))
	_TRELLIS2 = os.path.join(_HERE, 'wheels', 'TRELLIS.2')
	if _TRELLIS2 not in sys.path:
	sys.path.insert(0, _TRELLIS2)

	# ── Environment variables (must be set BEFORE module imports) ────────────────
	os.environ['SPCONV_ALGO'] = 'native'
	os.environ['OPENCV_IO_ENABLE_OPENEXR'] = '1'
	os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'
	os.environ['ATTN_BACKEND'] = 'flash_attn_3'
	os.environ['FLEX_GEMM_AUTOTUNE_CACHE_PATH'] = os.path.join(_HERE, 'autotune_cache.json')
	os.environ['FLEX_GEMM_AUTOTUNER_VERBOSE'] = '1'

	# ── Imports ───────────────────────────────────────────────────────────────────
	import spaces
	import gradio as gr
	from datetime import datetime
	import shutil
	import cv2
	import base64, io
	from typing import *

	import torch
	import numpy as np
	import imageio
	from PIL import Image
	import gc

	from trellis2.modules.sparse import SparseTensor
	from trellis2.utils import render_utils
	from trellis2.renderers import EnvMap
	import o_voxel

	from trellis.pipelines import TrellisVGGTTo3DPipeline
	from trellis2.pipelines import Trellis2ImageTo3DPipeline
	from trellis.pipelines.trellis_hybrid_pipeline import TrellisHybridPipeline


	# ── Constants ─────────────────────────────────────────────────────────────────
	MAX_SEED = np.iinfo(np.int32).max
	TMP_DIR = os.path.join(_HERE, 'tmp')
	os.makedirs(TMP_DIR, exist_ok=True)
	LOW_VRAM = False

	MODES = [
	{"name": "Shaded", "render_key": "shaded"},
	{"name": "Normal", "render_key": "normal"},
	{"name": "Base color", "render_key": "base_color"},
	{"name": "Metallic", "render_key": "metallic"},
	{"name": "Roughness", "render_key": "roughness"},
	]
	STEPS = 8
	DEFAULT_MODE = 0
	DEFAULT_STEP = 3


	# ── CSS / JS ──────────────────────────────────────────────────────────────────
	css = """
	.badge-row {
	text-align: center !important;
	}
	.badge-row p {
	display: inline-flex !important;
	gap: 8px;
	justify-content: center;
	align-items: center;
	}
	.badge-row a {
	display: inline-block !important;
	}
	.badge-row img {
	display: inline-block !important;
	}
	.previewer-container {
	position: relative;
	width: 100%;
	height: 520px;
	display: flex;
	flex-direction: column;
	align-items: center;
	justify-content: center;
	font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, sans-serif;
	}
	.previewer-container .display-row {
	width: 100%; flex-grow: 1; display: flex;
	justify-content: center; align-items: center; min-height: 360px;
	}
	.previewer-container .previewer-main-image {
	max-width: 100%; max-height: 100%; object-fit: contain; display: none;
	}
	.previewer-container .previewer-main-image.visible { display: block; }
	.previewer-container .mode-row {
	width: 100%; display: flex; gap: 10px;
	justify-content: center; margin-bottom: 10px; flex-wrap: wrap;
	}
	.previewer-container .mode-btn {
	padding: 4px 12px; border-radius: 14px; border: 2px solid #ddd;
	cursor: pointer; font-size: 13px; background: none;
	opacity: 0.55; transition: all 0.2s;
	}
	.previewer-container .mode-btn:hover { opacity: 0.9; }
	.previewer-container .mode-btn.active {
	opacity: 1; border-color: var(--color-accent);
	color: var(--color-accent); font-weight: 600;
	}
	.previewer-container .slider-row {
	width: 100%; display: flex; align-items: center; padding: 0 12px;
	}
	.previewer-container input[type=range] {
	-webkit-appearance: none; width: 100%;
	background: transparent;
	}
	.previewer-container input[type=range]::-webkit-slider-runnable-track {
	height: 6px; background: #ddd; border-radius: 3px;
	}
	.previewer-container input[type=range]::-webkit-slider-thumb {
	height: 18px; width: 18px; border-radius: 50%;
	background: var(--color-accent); -webkit-appearance: none;
	margin-top: -6px; box-shadow: 0 2px 4px rgba(0,0,0,.2);
	}
	"""

	head = """
	<script>
	function refreshView(mode, step) {
	const allImgs = document.querySelectorAll('.previewer-main-image');
	for (let img of allImgs) {
	if (img.classList.contains('visible')) {
	const [_, m, s] = img.id.split('-');
	if (mode === -1) mode = parseInt(m.slice(1));
	if (step === -1) step = parseInt(s.slice(1));
	break;
	}
	}
	allImgs.forEach(img => img.classList.remove('visible'));
	const tgt = document.getElementById('view-m' + mode + '-s' + step);
	if (tgt) tgt.classList.add('visible');
	document.querySelectorAll('.mode-btn').forEach((btn, idx) => {
	btn.classList.toggle('active', idx === mode);
	});
	}
	function selectMode(mode) { refreshView(mode, -1); }
	function onSliderChange(val) { refreshView(-1, parseInt(val)); }
	</script>
	"""

	empty_html = """
	<div class="previewer-container">
	<svg style="opacity:.4;height:60px;color:var(--body-text-color)"
	xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24" fill="none"
	stroke="currentColor" stroke-width="1.2">
	<rect x="3" y="3" width="18" height="18" rx="2"/>
	<circle cx="8.5" cy="8.5" r="1.5"/>
	<polyline points="21 15 16 10 5 21"/>
	</svg>
	</div>
	"""


	# ── Helpers ───────────────────────────────────────────────────────────────────

	def image_to_base64(image: Image.Image) -> str:
	buf = io.BytesIO()
	image.convert("RGB").save(buf, format="jpeg", quality=85)
	return "data:image/jpeg;base64," + base64.b64encode(buf.getvalue()).decode()


	def pack_state(latents: Tuple[SparseTensor, SparseTensor, int]) -> dict:
	shape_slat, tex_slat, res = latents
	return {
	'shape_slat_feats': shape_slat.feats.cpu().numpy(),
	'tex_slat_feats': tex_slat.feats.cpu().numpy(),
	'coords': shape_slat.coords.cpu().numpy(),
	'res': res,
	}


	def unpack_state(state: dict) -> Tuple[SparseTensor, SparseTensor, int]:
	shape_slat = SparseTensor(
	feats=torch.from_numpy(state['shape_slat_feats']).cuda(),
	coords=torch.from_numpy(state['coords']).cuda(),
	)
	tex_slat = shape_slat.replace(torch.from_numpy(state['tex_slat_feats']).cuda())
	return shape_slat, tex_slat, state['res']


	def get_seed(randomize_seed: bool, seed: int) -> int:
	return np.random.randint(0, MAX_SEED) if randomize_seed else seed


	# ── Session management ────────────────────────────────────────────────────────

	def start_session(req: gr.Request):
	os.makedirs(os.path.join(TMP_DIR, str(req.session_hash)), exist_ok=True)

	def end_session(req: gr.Request):
	user_dir = os.path.join(TMP_DIR, str(req.session_hash))
	if os.path.exists(user_dir):
	shutil.rmtree(user_dir)


	# ── Preprocessing ─────────────────────────────────────────────────────────────

	def preprocess_images(images: List[Tuple[Image.Image, str]]) -> List[Image.Image]:
	return [pipeline.preprocess_image(img[0]) for img in images]

	def preprocess_videos(video: str, num_per_second: float) -> List[Image.Image]:
	vid = imageio.get_reader(video, 'ffmpeg')
	fps = vid.get_meta_data()['fps']
	frames = []
	for i, frame in enumerate(vid):
	if i % max(int(fps/num_per_second), 1) == 0:
	img = Image.fromarray(frame)
	W, H = img.size
	img = img.resize((int(W / H * 512), 512))
	frames.append(img)
	vid.close()
	return frames


	# ── 3D generation ─────────────────────────────────────────────────────────────

	@spaces.GPU(duration=120)
	def image_to_3d(
	image_gallery,
	multi_image_strategy: str,
	seed: int,
	pipeline_type: str,
	ss_source: str,
	# SS params
	ss_guidance_strength: float,
	ss_guidance_rescale: float,
	ss_sampling_steps: int,
	ss_rescale_t: float,
	# SLat params
	slat_guidance_strength: float,
	slat_guidance_rescale: float,
	slat_sampling_steps: int,
	slat_rescale_t: float,
	# Shape SLat params
	shape_slat_guidance_strength: float,
	shape_slat_guidance_rescale: float,
	shape_slat_sampling_steps: int,
	shape_slat_rescale_t: float,
	# Tex SLat params
	tex_slat_guidance_strength: float,
	tex_slat_guidance_rescale: float,
	tex_slat_sampling_steps: int,
	tex_slat_rescale_t: float,
	req: gr.Request,
	progress=gr.Progress(track_tqdm=True),
	):
	# Collect images
	if not image_gallery:
	raise gr.Error("Please upload at least one image.")
	images = []
	for item in image_gallery:
	img = item[0] if isinstance(item, (tuple, list)) else item
	if isinstance(img, str):
	img = Image.open(img)
	if img.mode != 'RGBA':
	img = img.convert('RGBA')
	images.append(img)

	ss_params = {
	"steps": ss_sampling_steps,
	"cfg_strength": ss_guidance_strength,
	"cfg_interval": [0.6, 1.0],
	"guidance_rescale": ss_guidance_rescale,
	"rescale_t": ss_rescale_t,
	}
	slat_params = {
	"steps": slat_sampling_steps,
	"cfg_strength": slat_guidance_strength,
	"cfg_interval": [0.6, 1.0],
	"guidance_rescale": slat_guidance_rescale,
	"rescale_t": slat_rescale_t,
	}
	shape_slat_params = {
	"steps": shape_slat_sampling_steps,
	"guidance_strength": shape_slat_guidance_strength,
	"guidance_rescale": shape_slat_guidance_rescale,
	"rescale_t": shape_slat_rescale_t,
	}
	tex_slat_params = {
	"steps": tex_slat_sampling_steps,
	"guidance_strength": tex_slat_guidance_strength,
	"guidance_rescale": tex_slat_guidance_rescale,
	"rescale_t": tex_slat_rescale_t,
	}

	if len(images) == 1:
	out_mesh_list, latents = pipeline.run(
	images, seed=seed,
	ss_sampler_params=ss_params,
	slat_sampler_params=slat_params,
	shape_slat_sampler_params=shape_slat_params,
	tex_slat_sampler_params=tex_slat_params,
	pipeline_type=pipeline_type,
	preprocess_image=True,
	return_latent=True,
	ss_source=ss_source,
	)
	else:
	out_mesh_list, latents = pipeline.run_multi_image(
	images, strategy=multi_image_strategy, seed=seed,
	ss_sampler_params=ss_params,
	slat_sampler_params=slat_params,
	shape_slat_sampler_params=shape_slat_params,
	tex_slat_sampler_params=tex_slat_params,
	pipeline_type=pipeline_type,
	preprocess_image=True,
	return_latent=True,
	ss_source=ss_source,
	)

	mesh = out_mesh_list[0]
	mesh.simplify(16777216)

	render_views = render_utils.render_snapshot(
	mesh, resolution=1024, r=2, fov=36, nviews=STEPS, envmap=envmap
	)
	state = pack_state(latents)
	torch.cuda.empty_cache()

	# ── Build previewer HTML ──────────────────────────────────────────────────
	images_html = ""
	for m_idx, mode in enumerate(MODES):
	for s_idx in range(STEPS):
	uid = f"view-m{m_idx}-s{s_idx}"
	is_vis = (m_idx == DEFAULT_MODE and s_idx == DEFAULT_STEP)
	vis_class = "visible" if is_vis else ""
	render_key = mode['render_key']
	frames = render_views.get(render_key)
	if frames is None:
	continue
	img_b64 = image_to_base64(Image.fromarray(frames[s_idx]))
	images_html += f"""
	<img id="{uid}" class="previewer-main-image {vis_class}"
	src="{img_b64}" loading="eager">
	"""

	btns_html = ""
	for idx, mode in enumerate(MODES):
	active = "active" if idx == DEFAULT_MODE else ""
	btns_html += f"""
	<button class="mode-btn {active}" onclick="selectMode({idx})">{mode['name']}</button>
	"""

	full_html = f"""
	<div class="previewer-container">
	<div class="display-row">{images_html}</div>
	<div class="mode-row">{btns_html}</div>
	<div class="slider-row">
	<input type="range" min="0" max="{STEPS-1}" value="{DEFAULT_STEP}" step="1"
	oninput="onSliderChange(this.value)">
	</div>
	</div>
	"""
	return state, full_html


	# ── GLB extraction ────────────────────────────────────────────────────────────

	@spaces.GPU(duration=120)
	def extract_glb(
	state: dict,
	decimation_target: int,
	texture_size: int,
	req: gr.Request,
	progress=gr.Progress(track_tqdm=True),
	) -> Tuple[str, str]:
	if state is None:
	raise gr.Error("Please generate a 3D model first.")
	user_dir = os.path.join(TMP_DIR, str(req.session_hash))
	os.makedirs(user_dir, exist_ok=True)

	shape_slat, tex_slat, res = unpack_state(state)
	mesh = pipeline.trellis2_pipeline.decode_latent(shape_slat, tex_slat, res)[0]

	glb = o_voxel.postprocess.to_glb(
	vertices=mesh.vertices,
	faces=mesh.faces,
	attr_volume=mesh.attrs,
	coords=mesh.coords,
	attr_layout=pipeline.pbr_attr_layout,
	grid_size=res,
	aabb=[[-0.5, -0.5, -0.5], [0.5, 0.5, 0.5]],
	decimation_target=decimation_target,
	texture_size=texture_size,
	remesh=True,
	remesh_band=1,
	remesh_project=0,
	use_tqdm=True,
	)

	now = datetime.now()
	timestamp = now.strftime("%Y-%m-%dT%H%M%S") + f".{now.microsecond // 1000:03d}"
	glb_path = os.path.join(user_dir, f'sample_{timestamp}.glb')
	glb.export(glb_path, extension_webp=True)
	torch.cuda.empty_cache()
	return glb_path, glb_path


	# ── Multi-view examples ───────────────────────────────────────────────────────

	def prepare_multi_example() -> list:
	example_dir = os.path.join(_HERE, "assets", "example_multi_image")
	if not os.path.exists(example_dir):
	return []
	cases = sorted(set(i.split('_')[0] for i in os.listdir(example_dir)))
	result = []
	for case in cases:
	paths = []
	for i in range(1, 9):
	p = os.path.join(example_dir, f'{case}_{i}.png')
	if os.path.exists(p):
	paths.append(p)
	if paths:
	result.append([paths])
	return result


	# ── Gradio UI ─────────────────────────────────────────────────────────────────

	with gr.Blocks(
	title="ReconViaGen v0.5",
	delete_cache=(600, 600),
	) as demo:

	gr.Markdown("<center><h1>ReconViaGen-v0.5</h1></center>")
	gr.Markdown(
	"[![GitHub stars](https://img.shields.io/github/stars/GAP-LAB-CUHK-SZ/ReconViaGen?label=GitHub%20%E2%98%85&logo=github&color=C8C)](https://github.com/GAP-LAB-CUHK-SZ/ReconViaGen/tree/v0.5) "
	"[![Project Page](https://www.obukhov.ai/img/badges/badge-website.svg)](https://jiahao620.github.io/reconviagen/) "
	"[![Paper](https://www.obukhov.ai/img/badges/badge-pdf.svg)](https://arxiv.org/abs/2510.23306)",
	elem_classes=["badge-row"],
	)
	gr.Markdown("""
	Stage 1 - Sparse Structure: ReconViaGen (VGGT multi-view aware)
	Stage 2 - Shape SLat: TRELLIS.2 (DINOv3-conditioned)
	Stage 3 - Texture SLat: TRELLIS.2 (DINOv3-conditioned, PBR output)

	> Note: For deployment and runtime efficiency on Hugging Face Spaces, the number of denoising steps has been reduced compared to the full pipeline. This may result in slightly lower visual quality. For best results, consider running locally with more steps.
	""")

	with gr.Row():
	# ── Left panel ────────────────────────────────────────────────────────
	with gr.Column(scale=1, min_width=380):

	input_video = gr.Video(label="Upload Video", interactive=True, height=220)
	image_prompt = gr.Gallery(
	label="Image Prompts (upload one or more views)",
	columns=3, rows=2, height=250, interactive=True,
	type="pil", file_types=["image"],
	)

	with gr.Accordion("Pipeline Settings", open=False):
	multi_image_strategy = gr.Radio(
	choices=["average_right", "weighted_average", "sequential", "average", "adaptive_guidance_weight", "fixed_guidance_rescale"],
	value="adaptive_guidance_weight",
	label="Multi-image fusion strategy",
	)
	pipeline_type = gr.Radio(
	choices=["512", "1024", "1024_cascade", "1536_cascade"],
	value="1024_cascade",
	label="Output Resolution",
	)
	ss_source = gr.Radio(
	choices=["direct", "mesh", "mvtrellis2"],
	value="mesh",
	label="Stage 1 Coords Source",
	)

	seed = gr.Slider(0, MAX_SEED, label="Seed", value=0, step=1)
	randomize_seed = gr.Checkbox(label="Randomize Seed", value=False)
	decimation_target = gr.Slider(100000, 1000000, label="Decimation Target",
	value=500000, step=10000)
	texture_size = gr.Slider(1024, 4096, label="Texture Size",
	value=2048, step=1024)
	num_per_second = gr.Slider(0.1, 30, label="Frames Per Second",
	value=1, step=0.1)

	generate_btn = gr.Button("Generate", variant="primary")

	with gr.Accordion("Advanced Settings", open=False):
	gr.Markdown("Stage 1 - Sparse Structure (ReconViaGen)")
	with gr.Row():
	ss_guidance_strength = gr.Slider(0.0, 10.0, label="Guidance Strength",
	value=7.5, step=0.1)
	ss_guidance_rescale = gr.Slider(0.0, 1.0, label="Guidance Rescale",
	value=0.7, step=0.01)
	ss_sampling_steps = gr.Slider(1, 50, label="Sampling Steps",
	value=12, step=1)
	ss_rescale_t = gr.Slider(1.0, 6.0, label="Rescale T",
	value=5.0, step=0.1)

	gr.Markdown("Stage 2 - Structured Latent (ReconViaGen)")
	with gr.Row():
	slat_guidance_strength = gr.Slider(0.0, 10.0, label="Guidance Strength",
	value=7.5, step=0.1)
	slat_guidance_rescale = gr.Slider(0.0, 1.0, label="Guidance Rescale",
	value=0.5, step=0.01)
	slat_sampling_steps = gr.Slider(1, 50, label="Sampling Steps",
	value=12, step=1)
	slat_rescale_t = gr.Slider(1.0, 6.0, label="Rescale T",
	value=3.0, step=0.1)

	gr.Markdown("Stage 3 - Shape SLat (TRELLIS.2)")
	with gr.Row():
	shape_slat_guidance_strength = gr.Slider(1.0, 10.0,
	label="Guidance Strength", value=7.5, step=0.1)
	shape_slat_guidance_rescale = gr.Slider(0.0, 1.0,
	label="Guidance Rescale", value=0.5, step=0.01)
	shape_slat_sampling_steps = gr.Slider(1, 50,
	label="Sampling Steps", value=8, step=1)
	shape_slat_rescale_t = gr.Slider(1.0, 6.0,
	label="Rescale T", value=3.0, step=0.1)

	gr.Markdown("Stage 4 - Texture SLat (TRELLIS.2)")
	with gr.Row():
	tex_slat_guidance_strength = gr.Slider(1.0, 10.0,
	label="Guidance Strength", value=1.0, step=0.1)
	tex_slat_guidance_rescale = gr.Slider(0.0, 1.0,
	label="Guidance Rescale", value=0.0, step=0.01)
	tex_slat_sampling_steps = gr.Slider(1, 50,
	label="Sampling Steps", value=8, step=1)
	tex_slat_rescale_t = gr.Slider(1.0, 6.0,
	label="Rescale T", value=3.0, step=0.1)

	# ── Right panel ───────────────────────────────────────────────────────
	with gr.Column(scale=10):
	preview_output = gr.HTML(empty_html, label="3D Preview", show_label=True)
	extract_btn = gr.Button("Extract GLB")
	glb_output = gr.Model3D(label="Extracted GLB", height=480,
	display_mode="solid",
	clear_color=(0.25, 0.25, 0.25, 1.0))
	download_btn = gr.DownloadButton(label="Download GLB", interactive=False)

	output_buf = gr.State()

	# Example row
	with gr.Row():
	examples_multi = gr.Examples(
	examples=prepare_multi_example(),
	inputs=[image_prompt],
	examples_per_page=8,
	)

	# ── Event handlers ────────────────────────────────────────────────────────
	demo.load(start_session)
	demo.unload(end_session)

	input_video.upload(preprocess_videos, inputs=[input_video, num_per_second], outputs=[image_prompt])
	input_video.clear(lambda: (None, None), outputs=[input_video, image_prompt])

	generate_btn.click(
	get_seed, inputs=[randomize_seed, seed], outputs=[seed],
	).then(
	image_to_3d,
	inputs=[
	image_prompt, multi_image_strategy, seed, pipeline_type, ss_source,
	ss_guidance_strength, ss_guidance_rescale, ss_sampling_steps, ss_rescale_t,
	slat_guidance_strength, slat_guidance_rescale, slat_sampling_steps, slat_rescale_t,
	shape_slat_guidance_strength, shape_slat_guidance_rescale,
	shape_slat_sampling_steps, shape_slat_rescale_t,
	tex_slat_guidance_strength, tex_slat_guidance_rescale,
	tex_slat_sampling_steps, tex_slat_rescale_t,
	],
	outputs=[output_buf, preview_output],
	)

	extract_btn.click(
	extract_glb,
	inputs=[output_buf, decimation_target, texture_size],
	outputs=[glb_output, download_btn],
	).then(
	lambda: gr.update(interactive=True), outputs=[download_btn]
	)


	# ── Entry point ───────────────────────────────────────────────────────────────
	if __name__ == "__main__":
	# Load ReconViaGen pipeline (SS stage)
	print("[1/2] Loading ReconViaGen pipeline (SS stage) ...")
	vggt_pipeline = TrellisVGGTTo3DPipeline.from_pretrained("Stable-X/trellis-vggt-v0-2")
	vggt_pipeline.cuda()
	vggt_pipeline.VGGT_model.cuda()
	vggt_pipeline.birefnet_model.cuda()
	del vggt_pipeline.models['slat_decoder_gs']

	if LOW_VRAM:
	vggt_pipeline.VGGT_model.cpu()
	for model in vggt_pipeline.models.values():
	model.cpu()
	gc.collect()
	torch.cuda.empty_cache()

	# Load TRELLIS.2 pipeline (shape/tex slat + decode)
	print("[2/2] Loading TRELLIS.2 pipeline (shape/tex slat) ...")
	trellis2_pipeline = Trellis2ImageTo3DPipeline.from_pretrained("microsoft/TRELLIS.2-4B")
	trellis2_pipeline.cuda()
	if LOW_VRAM:
	trellis2_pipeline.low_vram = True
	gc.collect()
	torch.cuda.empty_cache()

	# Combine into hybrid pipeline
	pipeline = TrellisHybridPipeline(vggt_pipeline, trellis2_pipeline, low_vram=LOW_VRAM)

	# Load HDRI environment maps for PBR rendering
	_HDRI_DIR = os.path.join(_HERE, 'assets', 'hdri')
	envmap = EnvMap(torch.tensor(
	cv2.cvtColor(cv2.imread(os.path.join(_HDRI_DIR, 'courtyard.exr'), cv2.IMREAD_UNCHANGED), cv2.COLOR_BGR2RGB),
	dtype=torch.float32, device='cuda'
	))

	demo.launch(css=css, head=head)