jiggle-physics / reconstruction.py
Justin Wood
Initial backend
c401d3e
import numpy as np
import torch
import trimesh
import io
from PIL import Image
# Process-wide cache for the (model, processor) pair; filled on first use.
_triposr_cache = None


def get_triposr():
    """Return the cached ``(model, processor)`` pair, loading it on first call.

    The pretrained weights are fetched from ``"stabilityai/TripoSR"`` and the
    model is put in eval mode. Later calls return the very same tuple without
    reloading anything.

    Returns:
        A ``(model, processor)`` tuple.
    """
    global _triposr_cache

    # Fast path: the pair was already loaded by a previous call.
    if _triposr_cache is not None:
        return _triposr_cache

    # Imported lazily so that importing this module does not require
    # transformers unless reconstruction is actually requested.
    # NOTE(review): confirm the installed transformers release really exports
    # these TripoSR classes -- could not be verified from this file.
    from transformers import TripoSRForImageTo3D, TripoSRImageProcessor

    processor = TripoSRImageProcessor.from_pretrained("stabilityai/TripoSR")
    model = TripoSRForImageTo3D.from_pretrained("stabilityai/TripoSR")
    model.eval()

    _triposr_cache = (model, processor)
    return _triposr_cache
def reconstruct_region(image: Image.Image, mask: list[list[bool]], bbox: list[int]) -> bytes:
    """Reconstruct the masked region of ``image`` as a 3D mesh and return GLB bytes.

    The bounding box is padded by 20% on every side, clamped to the image,
    cropped and resized to 512x512. The mask is cropped and resized the same
    way and written into the alpha channel, and the resulting RGBA image is
    passed through the processor and model returned by ``get_triposr()``.

    Args:
        image: Source image.
        mask: Per-pixel mask indexed as ``mask[row][col]``. It must have the
            same height and width as ``image``; any truthy entry is treated
            as "inside the region".
        bbox: ``[x, y, w, h]`` of the region in image pixels.

    Returns:
        The reconstructed mesh serialized as GLB.

    Raises:
        ValueError: If the mask shape does not match the image size, or the
            bounding box is empty or lies completely outside the image.
    """
    img_w, img_h = image.size

    # Read the mask as boolean so that 0/1 and 0/255 style masks both work;
    # a uint8 mask multiplied by 255 would wrap around for values above 1.
    mask_arr = np.array(mask, dtype=bool)
    if mask_arr.shape != (img_h, img_w):
        raise ValueError(
            f"mask shape {mask_arr.shape} does not match image size "
            f"(height, width) = {(img_h, img_w)}"
        )

    x, y, w, h = bbox
    if w <= 0 or h <= 0:
        raise ValueError(f"bbox must have positive width and height, got {bbox!r}")

    # Crop to bounding box with 20% padding, clamped to the image bounds
    pad_x = int(w * 0.20)
    pad_y = int(h * 0.20)
    x0 = max(0, x - pad_x)
    y0 = max(0, y - pad_y)
    x1 = min(img_w, x + w + pad_x)
    y1 = min(img_h, y + h + pad_y)
    if x1 <= x0 or y1 <= y0:
        # Fail early with a clear message instead of a cryptic resize error.
        raise ValueError(f"bbox {bbox!r} does not overlap the {img_w}x{img_h} image")

    cropped = image.crop((x0, y0, x1, y1)).resize((512, 512), Image.LANCZOS)

    # Apply mask as alpha channel so TripoSR focuses on the region.
    # NEAREST keeps the resized alpha strictly 0 or 255.
    alpha = mask_arr[y0:y1, x0:x1].astype(np.uint8) * 255
    alpha_resized = np.array(Image.fromarray(alpha).resize((512, 512), Image.NEAREST))

    rgba = np.array(cropped.convert("RGBA"))
    rgba[:, :, 3] = alpha_resized
    input_img = Image.fromarray(rgba)

    # Inputs are validated above, so the (potentially slow) model load and
    # device transfer only happen for requests that can actually be served.
    model, processor = get_triposr()
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model = model.to(device)

    inputs = processor(images=input_img, return_tensors="pt").to(device)
    with torch.no_grad():
        outputs = model(**inputs)

    # Export as GLB via trimesh
    mesh_data = outputs.mesh
    if hasattr(mesh_data, "export"):
        # The model output already exposes an export method; use it directly.
        return mesh_data.export(file_type="glb")

    # Fallback: build trimesh from vertices/faces tensors
    verts = mesh_data.verts_list()[0].cpu().numpy()
    faces = mesh_data.faces_list()[0].cpu().numpy()
    mesh = trimesh.Trimesh(vertices=verts, faces=faces, process=False)
    buf = io.BytesIO()
    mesh.export(buf, file_type="glb")
    return buf.getvalue()
def _mesh_to_glb(mesh) -> bytes:
    """Serialize a trimesh mesh to GLB and return the raw bytes."""
    buf = io.BytesIO()
    mesh.export(buf, file_type="glb")
    return buf.getvalue()


def _flat_quad_glb() -> bytes:
    """Return a unit quad in the z=0 plane as GLB (placeholder mesh)."""
    verts = np.array([[0, 0, 0], [1, 0, 0], [1, 1, 0], [0, 1, 0]], dtype=np.float32)
    faces = np.array([[0, 1, 2], [0, 2, 3]])
    return _mesh_to_glb(trimesh.Trimesh(vertices=verts, faces=faces))


def _masked_grid_faces(mask_arr: np.ndarray, ys: np.ndarray, xs: np.ndarray) -> np.ndarray:
    """Triangulate every 2x2 pixel cell whose four corners are all masked.

    ``ys``/``xs`` are the row/column indices of the masked pixels in vertex
    order. Returns an ``(n_faces, 3)`` int32 array of vertex indices.
    """
    # Map each masked pixel to its vertex index; -1 marks "no vertex".
    index = np.full(mask_arr.shape, -1, dtype=np.int64)
    index[ys, xs] = np.arange(len(xs))

    # A cell is usable only when all four of its corner pixels are masked, so
    # no triangle can ever bridge a hole or a concavity of the mask.
    full = mask_arr[:-1, :-1] & mask_arr[:-1, 1:] & mask_arr[1:, :-1] & mask_arr[1:, 1:]
    tl = index[:-1, :-1][full]
    tr = index[:-1, 1:][full]
    bl = index[1:, :-1][full]
    br = index[1:, 1:][full]

    # Image rows grow downwards while mesh Y grows upwards, so these corner
    # orders are counter-clockwise in mesh space (normals point towards +Z).
    lower = np.stack([bl, br, tr], axis=1)
    upper = np.stack([bl, tr, tl], axis=1)
    return np.concatenate([lower, upper], axis=0).astype(np.int32)


def depth_to_mesh(depth: list[list[float]], mask: list[list[bool]], image: Image.Image) -> bytes:
    """Lift a depth map into a textured 3D mesh restricted to the masked region.

    Fallback when TripoSR isn't available. Every masked pixel becomes a
    vertex: X/Y are the pixel position normalized to [-0.5, 0.5] (Y pointing
    up), Z is the depth normalized over the whole map and scaled to
    [0, 0.5]. Neighbouring masked pixels are connected on the pixel grid, so
    faces never span unmasked areas. UVs map each vertex to its pixel in
    ``image``.

    Args:
        depth: 2D depth map indexed as ``depth[row][col]``.
        mask: 2D mask with the same shape as ``depth``; truthy means "inside".
        image: Image used as the mesh texture.

    Returns:
        The mesh serialized as GLB. If the mask is empty, or so sparse that
        no 2x2 block of pixels is fully masked, a flat unit quad is returned
        instead.

    Raises:
        ValueError: If ``depth`` is not 2D or its shape differs from ``mask``.
    """
    depth_arr = np.array(depth, dtype=np.float32)
    mask_arr = np.array(mask, dtype=bool)
    if depth_arr.ndim != 2 or mask_arr.shape != depth_arr.shape:
        raise ValueError(
            "depth and mask must be 2D arrays of the same shape, got "
            f"{depth_arr.shape} and {mask_arr.shape}"
        )
    H, W = depth_arr.shape

    # Build vertices only for masked pixels
    ys, xs = np.nonzero(mask_arr)
    if xs.size == 0:
        # Empty mask — return a flat quad
        return _flat_quad_glb()

    # Triangulate on the pixel grid. A Delaunay triangulation of the masked
    # points would fill their whole convex hull (bridging holes and
    # concavities) and fails outright for collinear or fewer than 3 points.
    faces = _masked_grid_faces(mask_arr, ys, xs)
    if len(faces) == 0:
        # Mask too thin/sparse to form any surface — same best-effort
        # placeholder as for an empty mask.
        return _flat_quad_glb()

    # Normalize depth to [0, 1] then scale to reasonable Z range
    dmin, dmax = depth_arr.min(), depth_arr.max()
    if dmax > dmin:
        depth_norm = (depth_arr - dmin) / (dmax - dmin)
    else:
        # Constant depth map: avoid dividing by zero, keep the mesh flat.
        depth_norm = np.zeros_like(depth_arr)
    depth_scaled = depth_norm * 0.5  # 0.5 units of Z range

    # Normalize to [-0.5, 0.5] XY space
    x_norm = (xs / W) - 0.5
    y_norm = 0.5 - (ys / H)
    z_vals = depth_scaled[ys, xs]
    vertices = np.stack([x_norm, y_norm, z_vals], axis=1).astype(np.float32)

    # UV = source pixel position (V flipped because image rows grow downwards)
    uvs = np.stack([xs / W, 1.0 - ys / H], axis=1).astype(np.float32)

    # Build mesh with texture
    texture = trimesh.visual.texture.TextureVisuals(
        uv=uvs,
        image=image.convert("RGB"),
    )
    # process=False keeps vertex order intact so UVs stay aligned with vertices.
    mesh = trimesh.Trimesh(vertices=vertices, faces=faces, visual=texture, process=False)
    return _mesh_to_glb(mesh)