SingleImage3d

Sleeping

App Files Files Community

gagndeep committed on Dec 19, 2025

Commit

7e390f4

1 Parent(s): 671c57a

updates

Browse files

Files changed (2) hide show

app.py +37 -18
model_utils.py +240 -20

app.py CHANGED Viewed

@@ -1,8 +1,8 @@
 """
 SHARP Gradio Demo
-- Standard Native Layout
-- Fixed: Added @spaces.GPU for ZeroGPU compatibility (Fixes 'dummy' output)
-- Fixed: Download Button visibility logic
 """
 from __future__ import annotations
@@ -17,7 +17,6 @@ import gradio as gr
 try:
     import spaces
 except ImportError:
-    # Fallback for local testing if spaces is not installed
     class spaces:
         @staticmethod
         def GPU(func):
@@ -84,7 +83,10 @@ def get_example_files() -> list[list[str]]:
             examples.append([str(img)])
     return examples
-# --- 2. Apply @spaces.GPU Decorator ---
 @spaces.GPU(duration=120)
 def run_sharp(
     image_path: str | None,
@@ -102,20 +104,26 @@ def run_sharp(
     if not image_path:
         raise gr.Error("Please upload an image first.")
-    # Validate inputs
     out_long_side_val = None if int(output_long_side) <= 0 else int(output_long_side)
-    # Convert trajectory string to Enum safely
-    traj_key = trajectory_type.upper()
-    if hasattr(TrajectoryType, traj_key):
-        traj_enum = TrajectoryType[traj_key]
-    else:
-        traj_enum = trajectory_type
     try:
         progress(0.1, desc="Initializing SHARP model on GPU...")
-        # Call the backend model
         video_path, ply_path = predict_and_maybe_render_gpu(
             image_path,
             trajectory_type=traj_enum,
@@ -125,12 +133,14 @@ def run_sharp(
             render_video=bool(render_video),
         )
-        # Prepare outputs
         status_msg = f"### ✅ Success\nGenerated: `{ply_path.name}`"
         video_result = str(video_path) if video_path else None
         if video_path:
             status_msg += f"\nVideo: `{video_path.name}`"
         # Explicitly update the Download Button
         download_btn_update = gr.DownloadButton(
@@ -162,6 +172,7 @@ def build_demo() -> gr.Blocks:
     with gr.Blocks(theme=theme, head=SEO_HEAD, title="SHARP 3D Generator") as demo:
         with gr.Row():
             with gr.Column(scale=1):
                 gr.Markdown("# SHARP: Single-Image 3D Generator\nConvert any static image into a 3D Gaussian Splat scene instantly.")
@@ -178,7 +189,7 @@ def build_demo() -> gr.Blocks:
                     interactive=True
                 )
-                # Configs
                 with gr.Group():
                     with gr.Row():
                         trajectory = gr.Dropdown(
@@ -189,7 +200,15 @@ def build_demo() -> gr.Blocks:
                         )
                         output_res = gr.Dropdown(
                             label="Output Resolution",
-                            choices=[("Original", 0), ("512px", 512), ("1024px", 1024)],
                             value=0,
                             scale=1
                         )
@@ -223,7 +242,7 @@ def build_demo() -> gr.Blocks:
                 with gr.Group():
                     status_md = gr.Markdown("Ready to generate.")
-                    # Button starts hidden
                     ply_download = gr.DownloadButton(
                         label="Download .PLY File",
                         variant="secondary",

 """
 SHARP Gradio Demo
+- Standard Native Layout (Clean Two-Column)
+- Logic: Matches original Apple implementation (Robust Enum & Resolution handling)
+- System: ZeroGPU compatible
 """
 from __future__ import annotations
 try:
     import spaces
 except ImportError:
     class spaces:
         @staticmethod
         def GPU(func):
             examples.append([str(img)])
     return examples
+# -----------------------------------------------------------------------------
+# Main Inference Logic
+# -----------------------------------------------------------------------------
 @spaces.GPU(duration=120)
 def run_sharp(
     image_path: str | None,
     if not image_path:
         raise gr.Error("Please upload an image first.")
+    # 1. Logic: Robust Enum Conversion
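+    # The model likely expects the Enum object, not the string.
+    # NOTE(review): model_utils.py in this same commit declares TrajectoryType as a
+    # typing.Literal alias, not an Enum. Subscripting a Literal alias raises TypeError,
+    # which the `except KeyError` handlers below do not catch - confirm which
+    # TrajectoryType app.py actually imports before relying on this lookup.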
+    try:
+        # Try exact match (e.g. "swipe" -> TrajectoryType.swipe)
+        traj_enum = TrajectoryType[trajectory_type]
+    except KeyError:
+        try:
+            # Try upper case (e.g. "swipe" -> TrajectoryType.SWIPE)
+            traj_enum = TrajectoryType[trajectory_type.upper()]
+        except KeyError:
+            # Fallback: pass the string itself
+            traj_enum = trajectory_type
+    # 2. Logic: Handle Resolution
     out_long_side_val = None if int(output_long_side) <= 0 else int(output_long_side)
     try:
         progress(0.1, desc="Initializing SHARP model on GPU...")
+        # 3. Call Backend
         video_path, ply_path = predict_and_maybe_render_gpu(
             image_path,
             trajectory_type=traj_enum,
             render_video=bool(render_video),
         )
+        # 4. Prepare Outputs
         status_msg = f"### ✅ Success\nGenerated: `{ply_path.name}`"
         video_result = str(video_path) if video_path else None
         if video_path:
             status_msg += f"\nVideo: `{video_path.name}`"
+        else:
+            status_msg += "\n(Video rendering disabled or failed)"
         # Explicitly update the Download Button
         download_btn_update = gr.DownloadButton(
     with gr.Blocks(theme=theme, head=SEO_HEAD, title="SHARP 3D Generator") as demo:
+        # --- Header ---
         with gr.Row():
             with gr.Column(scale=1):
                 gr.Markdown("# SHARP: Single-Image 3D Generator\nConvert any static image into a 3D Gaussian Splat scene instantly.")
                     interactive=True
                 )
+                # Configs (Updated with Full Options from Original File)
                 with gr.Group():
                     with gr.Row():
                         trajectory = gr.Dropdown(
                         )
                         output_res = gr.Dropdown(
                             label="Output Resolution",
+                            # Full list from the original logic
+                            choices=[
+                                ("Match input", 0),
+                                ("512", 512),
+                                ("768", 768),
+                                ("1024", 1024),
+                                ("1280", 1280),
+                                ("1536", 1536),
+                            ],
                             value=0,
                             scale=1
                         )
                 with gr.Group():
                     status_md = gr.Markdown("Ready to generate.")
+                    # Button starts hidden, becomes visible on success
                     ply_download = gr.DownloadButton(
                         label="Download .PLY File",
                         variant="secondary",

model_utils.py CHANGED Viewed

@@ -4,6 +4,7 @@ Design goals:
 - Reuse SHARP's own predict/render pipeline (no subprocess calls).
 - Be robust on Hugging Face Spaces + ZeroGPU.
 - Cache model weights and predictor construction across requests.
 Public API (used by the Gradio app):
 - TrajectoryType
@@ -12,6 +13,7 @@ Public API (used by the Gradio app):
 from __future__ import annotations
 import os
 import threading
 import time
@@ -42,7 +44,24 @@ from sharp.utils import camera, io
 from sharp.utils.gaussians import Gaussians3D, SceneMetaData, save_ply
 from sharp.utils.gsplat import GSplatRenderer
-TrajectoryType = Literal["swipe", "shake", "rotate", "rotate_forward"]
 # -----------------------------------------------------------------------------
 # Helpers
@@ -82,6 +101,189 @@ def _select_device(preference: str = "auto") -> torch.device:
     return torch.device("cpu")
 # -----------------------------------------------------------------------------
 # Prediction outputs
 # -----------------------------------------------------------------------------
@@ -406,10 +608,13 @@ class ModelWrapper:
         if fps < 1:
             raise ValueError("fps must be >= 1")
-        # Keep aligned with upstream CLI pipeline where possible.
-        if output_long_side is None and int(fps) == 30:
             params = camera.TrajectoryParams(
-                type=trajectory_type,
                 num_steps=int(num_frames),
                 num_repeats=1,
             )
@@ -428,7 +633,7 @@ class ModelWrapper:
                 pass
             return output_path
-        # Adapted pipeline for custom output resolution / FPS.
         src_w, src_h = metadata.resolution_px
         src_f = float(metadata.focal_length_px)
@@ -441,15 +646,37 @@ class ModelWrapper:
             out_h = _make_even(max(2, int(round(src_h * scale))))
             out_f = src_f * scale
-        traj_params = camera.TrajectoryParams(
-            type=trajectory_type,
-            num_steps=int(num_frames),
-            num_repeats=1,
-        )
         device = torch.device("cuda")
         gaussians_cuda = gaussians.to(device)
         intrinsics = torch.tensor(
             [
                 [out_f, 0.0, (out_w - 1) / 2.0, 0.0],
@@ -465,14 +692,7 @@ class ModelWrapper:
             gaussians_cuda,
             intrinsics,
             resolution_px=(out_w, out_h),
-            lookat_mode=traj_params.lookat_mode,
-        )
-        trajectory = camera.create_eye_trajectory(
-            gaussians_cuda,
-            traj_params,
-            resolution_px=(out_w, out_h),
-            f_px=out_f,
         )
         renderer = GSplatRenderer(color_space=metadata.color_space)
@@ -609,4 +829,4 @@ def predict_and_maybe_render(
 if spaces is not None:
     predict_and_maybe_render_gpu = spaces.GPU(duration=180)(predict_and_maybe_render)
 else:  # pragma: no cover
-    predict_and_maybe_render_gpu = predict_and_maybe_render

 - Reuse SHARP's own predict/render pipeline (no subprocess calls).
 - Be robust on Hugging Face Spaces + ZeroGPU.
 - Cache model weights and predictor construction across requests.
+- Support extended camera trajectories beyond the defaults.
 Public API (used by the Gradio app):
 - TrajectoryType
 from __future__ import annotations
+import math
 import os
 import threading
 import time
 from sharp.utils.gaussians import Gaussians3D, SceneMetaData, save_ply
 from sharp.utils.gsplat import GSplatRenderer
+# Extended list of supported trajectories (22 types)
+# This is a typing.Literal alias of plain strings; the names below are the only
+# values the type admits.
+TrajectoryType = Literal[
+    # Standard SHARP defaults
+    "swipe", "shake", "rotate", "rotate_forward",
+    # Extended Rotations
+    "rotate_reverse", "rotate_up", "rotate_down",
+    # Zooms & Dollies
+    "zoom_in", "zoom_out", "dolly_in", "dolly_out",
+    # Pans (planar movement)
+    "pan_left", "pan_right", "pan_up", "pan_down",
+    # Complex Paths
+    "spiral_in", "spiral_out", "figure_eight", "loop", "heart",
+    "bounce", "ken_burns"
+]
+# The four trajectory names passed straight to camera.TrajectoryParams by the
+# video renderer. Membership in this set selects SHARP's built-in generator;
+# any other name is routed to _generate_custom_trajectory.
+STANDARD_TRAJECTORIES: Final[set[str]] = {
+    "swipe", "shake", "rotate", "rotate_forward"
+}
 # -----------------------------------------------------------------------------
 # Helpers
     return torch.device("cpu")
+# -----------------------------------------------------------------------------
+# Custom Trajectory Generation
+# -----------------------------------------------------------------------------
+def _generate_custom_trajectory(
+    gaussians: torch.Tensor,
+    resolution: tuple[int, int],
+    focal_length: float,
+    traj_type: str,
+    num_frames: int
+) -> list[torch.Tensor]:
+    """Build the camera eye positions for a non-standard trajectory type.
+
+    One step of SHARP's built-in 'rotate' trajectory is sampled to obtain a
+    baseline camera position; its norm, height (y) and azimuth in the x/z plane
+    are then used to parameterise each custom path over t in [0, 1].
+
+    Args:
+        gaussians: Scene passed through to camera.create_eye_trajectory; its
+            ``.device`` is used for the returned tensors.
+        resolution: Output (width, height) in pixels, forwarded as resolution_px.
+        focal_length: Focal length in pixels, forwarded as f_px.
+        traj_type: Name of the custom trajectory to generate.
+        num_frames: Number of positions to produce.
+
+    Returns:
+        A list of ``num_frames`` float32 tensors ``[x, y, z]``. If ``traj_type``
+        is not one of the custom names handled here, the sampled baseline
+        positions are returned instead. ``num_frames <= 0`` yields an empty list.
+    """
+    num_frames = int(num_frames)
+    # 1. Baseline: sample one step of the standard 'rotate' path to see where
+    # SHARP puts the camera.
+    base_params = camera.TrajectoryParams(type="rotate", num_steps=1)
+    # Materialise exactly once. The fallback branch reuses these positions, and
+    # calling list() a second time would return nothing if create_eye_trajectory
+    # hands back a one-shot iterator.
+    base_positions = list(
+        camera.create_eye_trajectory(
+            gaussians, base_params, resolution_px=resolution, f_px=focal_length
+        )
+    )
+    start_pos = base_positions[0].cpu()  # assumes a 3-element (x, y, z) tensor - TODO confirm
+    # Pull the components out as plain floats so every branch below does pure
+    # Python arithmetic instead of mixing 0-dim tensors with floats.
+    start_x = float(start_pos[0])
+    start_y = float(start_pos[1])
+    start_z = float(start_pos[2])
+    # NOTE(review): radius is the full 3-D norm but is used as the x/z orbit
+    # radius, so orbit paths start exactly at start_pos only when start_y is 0.
+    radius = float(torch.norm(start_pos))
+    base_y = start_y
+    # Starting azimuth in the x/z plane.
+    base_theta = math.atan2(start_z, start_x)
+    positions = []
+    # Normalised time t runs 0..1 inclusive. Guard the denominator so a single
+    # frame yields t == 0 instead of raising ZeroDivisionError.
+    last_index = max(num_frames - 1, 1)
+    for i in range(num_frames):
+        t = i / last_index
+        if traj_type == "rotate_reverse":
+            # Full orbit in the opposite direction.
+            theta = base_theta - (2 * math.pi * t)
+            x = radius * math.cos(theta)
+            z = radius * math.sin(theta)
+            y = base_y
+        elif traj_type == "rotate_up":
+            # Quarter-turn orbit while arcing upwards and back.
+            theta = base_theta + (0.5 * math.pi * t)
+            x = radius * math.cos(theta)
+            z = radius * math.sin(theta)
+            y = base_y + (radius * 0.8 * math.sin(math.pi * t))
+        elif traj_type == "rotate_down":
+            # Quarter-turn orbit while arcing downwards and back.
+            theta = base_theta + (0.5 * math.pi * t)
+            y = base_y - (radius * 0.5 * math.sin(math.pi * t))
+            x = radius * math.cos(theta)
+            z = radius * math.sin(theta)
+        elif traj_type in ["zoom_in", "dolly_in"]:
+            # Move from radius to radius * 0.4 along the starting azimuth.
+            cur_r = radius * (1.0 - 0.6 * t)
+            x = cur_r * math.cos(base_theta)
+            z = cur_r * math.sin(base_theta)
+            y = base_y
+        elif traj_type in ["zoom_out", "dolly_out"]:
+            # Move from radius * 0.5 to radius * 1.2 along the starting azimuth.
+            cur_r = (radius * 0.5) + (radius * 0.7 * t)
+            x = cur_r * math.cos(base_theta)
+            z = cur_r * math.sin(base_theta)
+            y = base_y
+        elif traj_type == "pan_left":
+            # Linear slide along world X (approximation of "perpendicular to view").
+            offset = (t - 0.5) * 2.0 * (radius * 0.5)
+            x = start_x + offset
+            y = start_y
+            z = start_z
+        elif traj_type == "pan_right":
+            offset = (0.5 - t) * 2.0 * (radius * 0.5)
+            x = start_x + offset
+            y = start_y
+            z = start_z
+        elif traj_type == "pan_up":
+            # Assumes +Y is up in the scene - TODO confirm against SHARP's convention.
+            offset = (t - 0.5) * (radius * 0.8)
+            x = start_x
+            y = base_y + offset
+            z = start_z
+        elif traj_type == "pan_down":
+            offset = (t - 0.5) * (radius * 0.8)
+            x = start_x
+            y = base_y - offset
+            z = start_z
+        elif traj_type == "spiral_in":
+            # Full orbit while closing in to radius * 0.4, with a slight vertical wobble.
+            theta = base_theta + (2 * math.pi * t)
+            cur_r = radius * (1.0 - 0.6 * t)
+            x = cur_r * math.cos(theta)
+            z = cur_r * math.sin(theta)
+            y = base_y + (0.2 * radius * math.sin(4 * math.pi * t))
+        elif traj_type == "spiral_out":
+            # Full orbit while backing away from radius * 0.4 to radius * 1.2.
+            theta = base_theta + (2 * math.pi * t)
+            cur_r = (radius * 0.4) + (radius * 0.8 * t)
+            x = cur_r * math.cos(theta)
+            z = cur_r * math.sin(theta)
+            y = base_y
+        elif traj_type == "figure_eight":
+            # Lissajous-style path: azimuth at 1x frequency, height at 2x.
+            phase = 2 * math.pi * t
+            theta = base_theta + (0.5 * math.sin(phase))
+            y = base_y + (radius * 0.3 * math.sin(2 * phase))
+            x = radius * math.cos(theta)
+            z = radius * math.sin(theta)
+        elif traj_type == "loop":
+            # Ellipse in the X/Y plane around the start position.
+            angle = 2 * math.pi * t
+            x = start_x + (0.2 * radius * math.cos(angle))
+            y = base_y + (0.5 * radius * math.sin(angle))
+            z = start_z
+        elif traj_type == "heart":
+            # Parametric heart curve in the X/Y plane, scaled by radius * 0.02.
+            angle = 2 * math.pi * t
+            h_x = 16 * math.sin(angle)**3
+            h_y = 13 * math.cos(angle) - 5*math.cos(2*angle) - 2*math.cos(3*angle) - math.cos(4*angle)
+            scale = radius * 0.02
+            x = start_x + (h_x * scale)
+            y = base_y + (h_y * scale)
+            z = start_z
+        elif traj_type == "bounce":
+            # Vertical bounce whose amplitude decays linearly to zero at t == 1.
+            amp = abs(math.cos(3 * math.pi * t)) * (1 - t)
+            y = base_y + (radius * 0.5 * amp)
+            x = start_x
+            z = start_z
+        elif traj_type == "ken_burns":
+            # Diagonal pan combined with a slow 30% zoom-in.
+            cur_r = radius * (1.0 - (0.3 * t))
+            x = (cur_r * math.cos(base_theta)) + ((t - 0.5) * (radius * 0.3))
+            y = base_y + ((t - 0.5) * (radius * 0.2))
+            z = cur_r * math.sin(base_theta)
+        else:
+            # Unknown name: hand back the sampled baseline positions unchanged.
+            return base_positions
+        positions.append(
+            torch.tensor([x, y, z], dtype=torch.float32, device=gaussians.device)
+        )
+    return positions
 # -----------------------------------------------------------------------------
 # Prediction outputs
 # -----------------------------------------------------------------------------
         if fps < 1:
             raise ValueError("fps must be >= 1")
+        # FAST PATH: Standard SHARP trajectories + Default Resolution
+        # We only use the optimized CLI shortcut if it's a standard type AND default res.
+        is_standard_traj = trajectory_type in STANDARD_TRAJECTORIES
+        if output_long_side is None and int(fps) == 30 and is_standard_traj:
             params = camera.TrajectoryParams(
+                type=trajectory_type, # type: ignore
                 num_steps=int(num_frames),
                 num_repeats=1,
             )
                 pass
             return output_path
+        # CUSTOM PATH: Manual loop (Handles Custom Res, FPS, or Custom Trajectories)
         src_w, src_h = metadata.resolution_px
         src_f = float(metadata.focal_length_px)
             out_h = _make_even(max(2, int(round(src_h * scale))))
             out_f = src_f * scale
         device = torch.device("cuda")
         gaussians_cuda = gaussians.to(device)
+        # 1. Generate Camera Trajectory
+        if is_standard_traj:
+            # Use SHARP's built-in generator
+            traj_params = camera.TrajectoryParams(
+                type=trajectory_type, # type: ignore
+                num_steps=int(num_frames),
+                num_repeats=1,
+            )
+            trajectory = camera.create_eye_trajectory(
+                gaussians_cuda,
+                traj_params,
+                resolution_px=(out_w, out_h),
+                f_px=out_f,
+            )
+            lookat_mode = traj_params.lookat_mode
+        else:
+            # Use our custom generator
+            trajectory = _generate_custom_trajectory(
+                gaussians_cuda,
+                resolution=(out_w, out_h),
+                focal_length=out_f,
+                traj_type=trajectory_type,
+                num_frames=num_frames
+            )
+            # Custom trajectories always look at origin (0,0,0) for now
+            lookat_mode = "scene" # Assuming SHARP 'scene' mode implies look-at-center
+        # 2. Setup Camera Model
         intrinsics = torch.tensor(
             [
                 [out_f, 0.0, (out_w - 1) / 2.0, 0.0],
             gaussians_cuda,
             intrinsics,
             resolution_px=(out_w, out_h),
+            lookat_mode=lookat_mode,
         )
         renderer = GSplatRenderer(color_space=metadata.color_space)
 if spaces is not None:
     predict_and_maybe_render_gpu = spaces.GPU(duration=180)(predict_and_maybe_render)
 else:  # pragma: no cover
+    predict_and_maybe_render_gpu = predict_and_maybe_render