Spaces:

gagndeep
/

Apple-Sharp-Image-to-3D-View-Synthesis

Running on Zero

App Files Files Community

gagndeep committed on Dec 19, 2025

Commit

e719a67

1 Parent(s): 7e390f4

updates

Browse files

Files changed (2) hide show

app.py +18 -37
model_utils.py +20 -240

app.py CHANGED Viewed

@@ -1,8 +1,8 @@
 """
 SHARP Gradio Demo
-- Standard Native Layout (Clean Two-Column)
-- Logic: Matches original Apple implementation (Robust Enum & Resolution handling)
-- System: ZeroGPU compatible
 """
 from __future__ import annotations
@@ -17,6 +17,7 @@ import gradio as gr
 try:
     import spaces
 except ImportError:
     class spaces:
         @staticmethod
         def GPU(func):
@@ -83,10 +84,7 @@ def get_example_files() -> list[list[str]]:
             examples.append([str(img)])
     return examples
-# -----------------------------------------------------------------------------
-# Main Inference Logic
-# -----------------------------------------------------------------------------
 @spaces.GPU(duration=120)
 def run_sharp(
     image_path: str | None,
@@ -104,26 +102,20 @@ def run_sharp(
     if not image_path:
         raise gr.Error("Please upload an image first.")
-    # 1. Logic: Robust Enum Conversion
-    # The model likely expects the Enum object, not the string.
-    try:
-        # Try exact match (e.g. "swipe" -> TrajectoryType.swipe)
-        traj_enum = TrajectoryType[trajectory_type]
-    except KeyError:
-        try:
-            # Try upper case (e.g. "swipe" -> TrajectoryType.SWIPE)
-            traj_enum = TrajectoryType[trajectory_type.upper()]
-        except KeyError:
-            # Fallback: pass the string itself
-            traj_enum = trajectory_type
-    # 2. Logic: Handle Resolution
     out_long_side_val = None if int(output_long_side) <= 0 else int(output_long_side)
     try:
         progress(0.1, desc="Initializing SHARP model on GPU...")
-        # 3. Call Backend
         video_path, ply_path = predict_and_maybe_render_gpu(
             image_path,
             trajectory_type=traj_enum,
@@ -133,14 +125,12 @@ def run_sharp(
             render_video=bool(render_video),
         )
-        # 4. Prepare Outputs
         status_msg = f"### ✅ Success\nGenerated: `{ply_path.name}`"
         video_result = str(video_path) if video_path else None
         if video_path:
             status_msg += f"\nVideo: `{video_path.name}`"
-        else:
-            status_msg += "\n(Video rendering disabled or failed)"
         # Explicitly update the Download Button
         download_btn_update = gr.DownloadButton(
@@ -172,7 +162,6 @@ def build_demo() -> gr.Blocks:
     with gr.Blocks(theme=theme, head=SEO_HEAD, title="SHARP 3D Generator") as demo:
-        # --- Header ---
         with gr.Row():
             with gr.Column(scale=1):
                 gr.Markdown("# SHARP: Single-Image 3D Generator\nConvert any static image into a 3D Gaussian Splat scene instantly.")
@@ -189,7 +178,7 @@ def build_demo() -> gr.Blocks:
                     interactive=True
                 )
-                # Configs (Updated with Full Options from Original File)
                 with gr.Group():
                     with gr.Row():
                         trajectory = gr.Dropdown(
@@ -200,15 +189,7 @@ def build_demo() -> gr.Blocks:
                         )
                         output_res = gr.Dropdown(
                             label="Output Resolution",
-                            # Full list from the original logic
-                            choices=[
-                                ("Match input", 0),
-                                ("512", 512),
-                                ("768", 768),
-                                ("1024", 1024),
-                                ("1280", 1280),
-                                ("1536", 1536),
-                            ],
                             value=0,
                             scale=1
                         )
@@ -242,7 +223,7 @@ def build_demo() -> gr.Blocks:
                 with gr.Group():
                     status_md = gr.Markdown("Ready to generate.")
-                    # Button starts hidden, becomes visible on success
                     ply_download = gr.DownloadButton(
                         label="Download .PLY File",
                         variant="secondary",

 """
 SHARP Gradio Demo
+- Standard Native Layout
+- Fixed: Added @spaces.GPU for ZeroGPU compatibility (Fixes 'dummy' output)
+- Fixed: Download Button visibility logic
 """
 from __future__ import annotations
 try:
     import spaces
 except ImportError:
+    # Fallback for local testing if spaces is not installed
     class spaces:
         @staticmethod
         def GPU(func):
             examples.append([str(img)])
     return examples
+# --- 2. Apply @spaces.GPU Decorator ---
 @spaces.GPU(duration=120)
 def run_sharp(
     image_path: str | None,
     if not image_path:
         raise gr.Error("Please upload an image first.")
+    # Validate inputs
     out_long_side_val = None if int(output_long_side) <= 0 else int(output_long_side)
+    # Convert trajectory string to Enum safely
+    traj_key = trajectory_type.upper()
+    if hasattr(TrajectoryType, traj_key):
+        traj_enum = TrajectoryType[traj_key]
+    else:
+        traj_enum = trajectory_type
     try:
         progress(0.1, desc="Initializing SHARP model on GPU...")
+        # Call the backend model
         video_path, ply_path = predict_and_maybe_render_gpu(
             image_path,
             trajectory_type=traj_enum,
             render_video=bool(render_video),
         )
+        # Prepare outputs
         status_msg = f"### ✅ Success\nGenerated: `{ply_path.name}`"
         video_result = str(video_path) if video_path else None
         if video_path:
             status_msg += f"\nVideo: `{video_path.name}`"
         # Explicitly update the Download Button
         download_btn_update = gr.DownloadButton(
     with gr.Blocks(theme=theme, head=SEO_HEAD, title="SHARP 3D Generator") as demo:
         with gr.Row():
             with gr.Column(scale=1):
                 gr.Markdown("# SHARP: Single-Image 3D Generator\nConvert any static image into a 3D Gaussian Splat scene instantly.")
                     interactive=True
                 )
+                # Configs
                 with gr.Group():
                     with gr.Row():
                         trajectory = gr.Dropdown(
                         )
                         output_res = gr.Dropdown(
                             label="Output Resolution",
+                            choices=[("Original", 0), ("512px", 512), ("1024px", 1024)],
                             value=0,
                             scale=1
                         )
                 with gr.Group():
                     status_md = gr.Markdown("Ready to generate.")
+                    # Button starts hidden
                     ply_download = gr.DownloadButton(
                         label="Download .PLY File",
                         variant="secondary",

model_utils.py CHANGED Viewed

@@ -4,7 +4,6 @@ Design goals:
 - Reuse SHARP's own predict/render pipeline (no subprocess calls).
 - Be robust on Hugging Face Spaces + ZeroGPU.
 - Cache model weights and predictor construction across requests.
-- Support extended camera trajectories beyond the defaults.
 Public API (used by the Gradio app):
 - TrajectoryType
@@ -13,7 +12,6 @@ Public API (used by the Gradio app):
 from __future__ import annotations
-import math
 import os
 import threading
 import time
@@ -44,24 +42,7 @@ from sharp.utils import camera, io
 from sharp.utils.gaussians import Gaussians3D, SceneMetaData, save_ply
 from sharp.utils.gsplat import GSplatRenderer
-# Extended list of supported trajectories (22 types)
-TrajectoryType = Literal[
-    # Standard SHARP defaults
-    "swipe", "shake", "rotate", "rotate_forward",
-    # Extended Rotations
-    "rotate_reverse", "rotate_up", "rotate_down",
-    # Zooms & Dollies
-    "zoom_in", "zoom_out", "dolly_in", "dolly_out",
-    # Pans (planar movement)
-    "pan_left", "pan_right", "pan_up", "pan_down",
-    # Complex Paths
-    "spiral_in", "spiral_out", "figure_eight", "loop", "heart",
-    "bounce", "ken_burns"
-]
-STANDARD_TRAJECTORIES: Final[set[str]] = {
-    "swipe", "shake", "rotate", "rotate_forward"
-}
 # -----------------------------------------------------------------------------
 # Helpers
@@ -101,189 +82,6 @@ def _select_device(preference: str = "auto") -> torch.device:
     return torch.device("cpu")
-# -----------------------------------------------------------------------------
-# Custom Trajectory Generation
-# -----------------------------------------------------------------------------
-def _generate_custom_trajectory(
-    gaussians: torch.Tensor,
-    resolution: tuple[int, int],
-    focal_length: float,
-    traj_type: str,
-    num_frames: int
-) -> list[torch.Tensor]:
-    """
-    Generates a list of camera eye positions (tensors) for custom paths.
-    Uses the standard 'rotate' path to establish a baseline radius/elevation.
-    """
-    # 1. Get baseline Radius (R) and Elevation (Y) from the standard generator
-    # We generate just 1 step of the standard 'rotate' to see where SHARP puts the camera.
-    base_params = camera.TrajectoryParams(type="rotate", num_steps=1)
-    base_traj = camera.create_eye_trajectory(
-        gaussians, base_params, resolution_px=resolution, f_px=focal_length
-    )
-    start_pos = list(base_traj)[0].cpu() # [3] tensor (x, y, z)
-    # Calculate spherical coordinates from start_pos
-    # Assuming LookAt(0,0,0), radius is norm, elevation is y.
-    radius = float(torch.norm(start_pos))
-    base_y = float(start_pos[1])
-    # Starting azimuth (theta). Usually start_pos is roughly [0, 0, radius] or [radius, 0, 0]
-    # We'll compute it to be safe.
-    base_theta = math.atan2(start_pos[2], start_pos[0])
-    positions = []
-    # Time steps 0..1
-    t_vals = [i / (num_frames - 1) for i in range(num_frames)]
-    for t in t_vals:
-        x, y, z = 0.0, 0.0, 0.0
-        # --- Logic for 20+ movements ---
-        if traj_type == "rotate_reverse":
-            # Orbit opposite direction
-            theta = base_theta - (2 * math.pi * t)
-            x = radius * math.cos(theta)
-            z = radius * math.sin(theta)
-            y = base_y
-        elif traj_type == "rotate_up":
-            # Orbit over the top (vertical orbit)
-            phi = (math.pi / 4) * math.sin(2 * math.pi * t)
-            # Modulate Y significantly
-            theta = base_theta + (0.5 * math.pi * t) # Slow rotate
-            curr_r = radius
-            x = curr_r * math.cos(theta)
-            z = curr_r * math.sin(theta)
-            y = base_y + (radius * 0.8 * math.sin(math.pi * t)) # Arc up
-        elif traj_type == "rotate_down":
-            theta = base_theta + (0.5 * math.pi * t)
-            y = base_y - (radius * 0.5 * math.sin(math.pi * t))
-            x = radius * math.cos(theta)
-            z = radius * math.sin(theta)
-        elif traj_type in ["zoom_in", "dolly_in"]:
-            # Move from Radius to Radius*0.4
-            cur_r = radius * (1.0 - 0.6 * t)
-            x = cur_r * math.cos(base_theta)
-            z = cur_r * math.sin(base_theta)
-            y = base_y
-        elif traj_type in ["zoom_out", "dolly_out"]:
-            # Move from Radius*0.5 to Radius*1.2
-            cur_r = (radius * 0.5) + (radius * 0.7 * t)
-            x = cur_r * math.cos(base_theta)
-            z = cur_r * math.sin(base_theta)
-            y = base_y
-        elif traj_type == "pan_left":
-            # Linear slide perpendicular to view vector
-            # Approx: move X relative to view
-            offset = (t - 0.5) * 2.0 * (radius * 0.5)
-            x = start_pos[0] + offset
-            y = start_pos[1]
-            z = start_pos[2] # Simple approximation
-        elif traj_type == "pan_right":
-            offset = (0.5 - t) * 2.0 * (radius * 0.5)
-            x = start_pos[0] + offset
-            y = start_pos[1]
-            z = start_pos[2]
-        elif traj_type == "pan_up":
-            offset = (t - 0.5) * (radius * 0.8)
-            x = start_pos[0]
-            y = base_y - offset # In 3D, Y usually up, but check coord sys. usually Y is down in some CV.
-            # Assuming Y is Up for scene.
-            y = base_y + offset
-            z = start_pos[2]
-        elif traj_type == "pan_down":
-            offset = (t - 0.5) * (radius * 0.8)
-            x = start_pos[0]
-            y = base_y - offset
-            z = start_pos[2]
-        elif traj_type == "spiral_in":
-            # Rotate while getting closer
-            theta = base_theta + (2 * math.pi * t)
-            cur_r = radius * (1.0 - 0.6 * t)
-            x = cur_r * math.cos(theta)
-            z = cur_r * math.sin(theta)
-            y = base_y + (0.2 * radius * math.sin(4 * math.pi * t)) # Slight wobble
-        elif traj_type == "spiral_out":
-            theta = base_theta + (2 * math.pi * t)
-            cur_r = (radius * 0.4) + (radius * 0.8 * t)
-            x = cur_r * math.cos(theta)
-            z = cur_r * math.sin(theta)
-            y = base_y
-        elif traj_type == "figure_eight":
-            # Lemniscate on sphere surface
-            scale = 2 * math.pi * t
-            # Lissajous-ish
-            theta = base_theta + (0.5 * math.sin(scale))
-            phi_offset = 0.3 * math.sin(2 * scale)
-            y = base_y + (radius * phi_offset)
-            x = radius * math.cos(theta)
-            z = radius * math.sin(theta)
-        elif traj_type == "loop":
-            # Vertical circle
-            angle = 2 * math.pi * t
-            y_off = 0.5 * radius * math.sin(angle)
-            x_off = 0.2 * radius * math.cos(angle)
-            x = start_pos[0] + x_off
-            y = base_y + y_off
-            z = start_pos[2]
-        elif traj_type == "heart":
-            # Heart shape in XY plane projection
-            angle = 2 * math.pi * t
-            # Heart formula
-            h_x = 16 * math.sin(angle)**3
-            h_y = 13 * math.cos(angle) - 5*math.cos(2*angle) - 2*math.cos(3*angle) - math.cos(4*angle)
-            # Scale down
-            scale = radius * 0.02
-            x = start_pos[0] + (h_x * scale)
-            y = base_y + (h_y * scale)
-            z = start_pos[2]
-        elif traj_type == "bounce":
-            # Decay bounce
-            freq = 3 * math.pi
-            amp = abs(math.cos(freq * t)) * (1-t)
-            y = base_y + (radius * 0.5 * amp)
-            x = start_pos[0]
-            z = start_pos[2]
-        elif traj_type == "ken_burns":
-            # Pan diagonal + slow zoom
-            zoom_fac = 1.0 - (0.3 * t) # Zoom in 30%
-            pan_x = (t - 0.5) * (radius * 0.3)
-            pan_y = (t - 0.5) * (radius * 0.2)
-            cur_r = radius * zoom_fac
-            x = (cur_r * math.cos(base_theta)) + pan_x
-            y = base_y + pan_y
-            z = (cur_r * math.sin(base_theta))
-        else:
-            # Fallback for anything else (or minor variations)
-            return list(base_traj) # Should be caught by caller, but safe fallback
-        # Construct tensor
-        pos_tensor = torch.tensor([x, y, z], dtype=torch.float32, device=gaussians.device)
-        positions.append(pos_tensor)
-    return positions
 # -----------------------------------------------------------------------------
 # Prediction outputs
 # -----------------------------------------------------------------------------
@@ -608,13 +406,10 @@ class ModelWrapper:
         if fps < 1:
             raise ValueError("fps must be >= 1")
-        # FAST PATH: Standard SHARP trajectories + Default Resolution
-        # We only use the optimized CLI shortcut if it's a standard type AND default res.
-        is_standard_traj = trajectory_type in STANDARD_TRAJECTORIES
-        if output_long_side is None and int(fps) == 30 and is_standard_traj:
             params = camera.TrajectoryParams(
-                type=trajectory_type, # type: ignore
                 num_steps=int(num_frames),
                 num_repeats=1,
             )
@@ -633,7 +428,7 @@ class ModelWrapper:
                 pass
             return output_path
-        # CUSTOM PATH: Manual loop (Handles Custom Res, FPS, or Custom Trajectories)
         src_w, src_h = metadata.resolution_px
         src_f = float(metadata.focal_length_px)
@@ -646,37 +441,15 @@ class ModelWrapper:
             out_h = _make_even(max(2, int(round(src_h * scale))))
             out_f = src_f * scale
         device = torch.device("cuda")
         gaussians_cuda = gaussians.to(device)
-        # 1. Generate Camera Trajectory
-        if is_standard_traj:
-            # Use SHARP's built-in generator
-            traj_params = camera.TrajectoryParams(
-                type=trajectory_type, # type: ignore
-                num_steps=int(num_frames),
-                num_repeats=1,
-            )
-            trajectory = camera.create_eye_trajectory(
-                gaussians_cuda,
-                traj_params,
-                resolution_px=(out_w, out_h),
-                f_px=out_f,
-            )
-            lookat_mode = traj_params.lookat_mode
-        else:
-            # Use our custom generator
-            trajectory = _generate_custom_trajectory(
-                gaussians_cuda,
-                resolution=(out_w, out_h),
-                focal_length=out_f,
-                traj_type=trajectory_type,
-                num_frames=num_frames
-            )
-            # Custom trajectories always look at origin (0,0,0) for now
-            lookat_mode = "scene" # Assuming SHARP 'scene' mode implies look-at-center
-        # 2. Setup Camera Model
         intrinsics = torch.tensor(
             [
                 [out_f, 0.0, (out_w - 1) / 2.0, 0.0],
@@ -692,7 +465,14 @@ class ModelWrapper:
             gaussians_cuda,
             intrinsics,
             resolution_px=(out_w, out_h),
-            lookat_mode=lookat_mode,
         )
         renderer = GSplatRenderer(color_space=metadata.color_space)
@@ -829,4 +609,4 @@ def predict_and_maybe_render(
 if spaces is not None:
     predict_and_maybe_render_gpu = spaces.GPU(duration=180)(predict_and_maybe_render)
 else:  # pragma: no cover
-    predict_and_maybe_render_gpu = predict_and_maybe_render

 - Reuse SHARP's own predict/render pipeline (no subprocess calls).
 - Be robust on Hugging Face Spaces + ZeroGPU.
 - Cache model weights and predictor construction across requests.
 Public API (used by the Gradio app):
 - TrajectoryType
 from __future__ import annotations
 import os
 import threading
 import time
 from sharp.utils.gaussians import Gaussians3D, SceneMetaData, save_ply
 from sharp.utils.gsplat import GSplatRenderer
+TrajectoryType = Literal["swipe", "shake", "rotate", "rotate_forward"]
 # -----------------------------------------------------------------------------
 # Helpers
     return torch.device("cpu")
 # -----------------------------------------------------------------------------
 # Prediction outputs
 # -----------------------------------------------------------------------------
         if fps < 1:
             raise ValueError("fps must be >= 1")
+        # Keep aligned with upstream CLI pipeline where possible.
+        if output_long_side is None and int(fps) == 30:
             params = camera.TrajectoryParams(
+                type=trajectory_type,
                 num_steps=int(num_frames),
                 num_repeats=1,
             )
                 pass
             return output_path
+        # Adapted pipeline for custom output resolution / FPS.
         src_w, src_h = metadata.resolution_px
         src_f = float(metadata.focal_length_px)
             out_h = _make_even(max(2, int(round(src_h * scale))))
             out_f = src_f * scale
+        traj_params = camera.TrajectoryParams(
+            type=trajectory_type,
+            num_steps=int(num_frames),
+            num_repeats=1,
+        )
         device = torch.device("cuda")
         gaussians_cuda = gaussians.to(device)
         intrinsics = torch.tensor(
             [
                 [out_f, 0.0, (out_w - 1) / 2.0, 0.0],
             gaussians_cuda,
             intrinsics,
             resolution_px=(out_w, out_h),
+            lookat_mode=traj_params.lookat_mode,
+        )
+        trajectory = camera.create_eye_trajectory(
+            gaussians_cuda,
+            traj_params,
+            resolution_px=(out_w, out_h),
+            f_px=out_f,
         )
         renderer = GSplatRenderer(color_space=metadata.color_space)
 if spaces is not None:
     predict_and_maybe_render_gpu = spaces.GPU(duration=180)(predict_and_maybe_render)
 else:  # pragma: no cover
+    predict_and_maybe_render_gpu = predict_and_maybe_render