# SAM3d / app.py
# bhatanerohan's picture
# Update app.py
# c4891c1 verified
"""
Text-to-3D Pipeline with Editing: Gemini + SAM-3D
MCP Server + Gradio UI for MCP Hackathon
"""
import os
import io
import json
import tempfile
import logging
from datetime import datetime
import hashlib
import shutil
import gradio as gr
from google import genai
from google.genai import types
from PIL import Image
import modal
# ============================================================
# LOGGING SETUP
# ============================================================
# Setup logging for console output (visible in HF Spaces logs)
# Console logging: a StreamHandler is attached so records show up in the
# HF Spaces log output.
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(message)s",
    level=logging.INFO,
    handlers=[logging.StreamHandler()],
)
logger = logging.getLogger(__name__)

# Root folder for per-request log sessions; created at import time.
# (Use HF Datasets if the logs must be stored permanently.)
LOGS_DIR = "generation_logs"
os.makedirs(LOGS_DIR, exist_ok=True)
def save_generation_log(prompt, images_dict, metadata=None, request_info=None):
    """Persist one request's metadata and images under ``LOGS_DIR``.

    A session folder named ``<timestamp>_<8-hex-id>`` is created, where the id
    is the first 8 hex digits of the MD5 of prompt + timestamp. A
    ``metadata.json`` file is written into it, and every existing image path
    in ``images_dict`` is copied in as ``<name>.png``.

    Args:
        prompt: Prompt text recorded in the metadata.
        images_dict: Mapping of label -> image file path. Entries whose path
            is empty or does not exist are skipped.
        metadata: Optional extra dict stored under ``"metadata"``.
        request_info: Optional dict; its ``"ip"`` and ``"user_agent"`` values
            are recorded (``None`` when not supplied).

    Returns:
        Path of the session directory that was written.
    """
    stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    digest = hashlib.md5(f"{prompt}{stamp}".encode()).hexdigest()
    short_id = digest[:8]
    out_dir = os.path.join(LOGS_DIR, f"{stamp}_{short_id}")
    os.makedirs(out_dir, exist_ok=True)

    # Metadata, including who asked (when the caller passed request info).
    requester = request_info if request_info else {}
    record = {
        "timestamp": stamp,
        "session_id": short_id,
        "prompt": prompt,
        "client_ip": requester.get("ip"),
        "user_agent": requester.get("user_agent"),
        "metadata": metadata or {},
    }
    with open(os.path.join(out_dir, "metadata.json"), 'w') as fh:
        json.dump(record, fh, indent=2)

    # Copy each image that actually exists on disk.
    for label, src in images_dict.items():
        if not src or not os.path.exists(src):
            continue
        shutil.copy(src, os.path.join(out_dir, f"{label}.png"))

    logger.info(f"โœ“ Saved logs to: {out_dir}")
    return out_dir
# ============================================================
# Initialize Gemini client
# ============================================================
# Module-level Gemini client; stays None until init_gemini() succeeds.
client = None


def init_gemini():
    """Create the module-level Gemini client from ``GEMINI_API_KEY``.

    Returns:
        True when the environment variable is set and the client was created,
        False (after logging an error) when the key is missing.
    """
    global client
    api_key = os.environ.get("GEMINI_API_KEY")
    if not api_key:
        logger.error("โœ— GEMINI_API_KEY not found")
        return False
    # Pass the key explicitly instead of writing it back to os.environ (a
    # no-op) and relying on the SDK's implicit environment lookup.
    client = genai.Client(api_key=api_key)
    logger.info("โœ“ Gemini client initialized")
    return True
def image_to_bytes(image):
    """Encode *image* as PNG and return the resulting bytes.

    ``image`` must provide a PIL-style ``save(fp, format=...)`` method.
    """
    with io.BytesIO() as png_stream:
        image.save(png_stream, format='PNG')
        encoded = png_stream.getvalue()
    return encoded
def run_sam3d(image, mask):
    """Run SAM-3D reconstruction on Modal for an image/mask pair.

    The image is converted to RGB, both inputs are PNG-encoded, and the
    ``reconstruct`` method of the ``SAM3DModel`` class in the Modal app
    ``sam3d-objects-inference`` is invoked remotely.

    Args:
        image: PIL image of the object.
        mask: PIL image used as the mask.

    Returns:
        ``(ply_bytes, glb_bytes)`` as returned by the remote call; the GLB
        part may be empty/None.
    """
    logger.info("โ†’ Sending to SAM-3D on Modal...")
    rgb_png = image_to_bytes(image.convert("RGB"))
    mask_png = image_to_bytes(mask)

    remote_cls = modal.Cls.from_name("sam3d-objects-inference", "SAM3DModel")
    ply_bytes, glb_bytes = remote_cls().reconstruct.remote(rgb_png, mask_png)

    glb_size = len(glb_bytes) if glb_bytes else 0
    logger.info(f"โœ“ SAM-3D complete - PLY: {len(ply_bytes)} bytes, GLB: {glb_size} bytes")
    return ply_bytes, glb_bytes
# ============================================================
# MCP TOOLS - These functions are exposed as MCP tools
# ============================================================
def generate_3d_model(prompt: str) -> str:
    """
    Generate a 3D model from a text description.

    Steps: ask Gemini for an image of the prompt, ask Gemini to remove the
    background, convert the result to a grayscale mask, send image + mask to
    SAM-3D, then write every artifact to a fresh temporary directory and copy
    the images into the generation log.

    Args:
        prompt: Text description of the object to generate (e.g., "a red sports car", "a wooden chair")

    Returns:
        JSON string. On success it contains ``"success": true`` and the paths
        to the generated files; on failure it is ``{"error": "<message>"}``.
        Exceptions are logged and reported in the JSON, never raised.
    """
    logger.info("=== MCP TOOL: generate_3d_model ===")
    logger.info(f"Prompt: {prompt}")
    if not client and not init_gemini():
        return json.dumps({"error": "GEMINI_API_KEY not configured"})
    try:
        # STEP 1: Generate image
        initial_prompt = f"{prompt}, three-quarter front view angle, natural daylight, soft shadows showing depth and contours, clean simple background, full object visible, photorealistic"
        logger.info("โ†’ Generating initial image...")
        response_gen = client.models.generate_content(
            model="gemini-2.5-flash-image",
            contents=[initial_prompt],
        )
        initial_image = None
        # `parts` is None when the model returned no content; treat that as
        # "no image" instead of crashing with a TypeError.
        for part in response_gen.parts or []:
            if part.inline_data:
                initial_image = Image.open(io.BytesIO(part.inline_data.data))
                break
        if initial_image is None:
            logger.error("โœ— Image generation failed")
            return json.dumps({"error": "Image generation failed"})
        logger.info(f"โœ“ Initial image generated: {initial_image.size}")

        # STEP 2: Remove background
        logger.info("โ†’ Removing background...")
        edit_prompt = "Remove the background completely, make the background transparent. Preserve the object's shadow for realism."
        image_part = types.Part.from_bytes(
            data=image_to_bytes(initial_image),
            mime_type="image/png"
        )
        response_edit = client.models.generate_content(
            model="gemini-3-pro-image-preview",
            contents=[edit_prompt, image_part],
        )
        final_image = None
        for part in response_edit.parts or []:
            if part.inline_data:
                final_image = Image.open(io.BytesIO(part.inline_data.data))
                break
        if final_image is None:
            logger.error("โœ— Background removal failed")
            return json.dumps({"error": "Background removal failed"})
        logger.info("โœ“ Background removed")

        # STEP 3: Create grayscale mask
        # NOTE(review): this is a luminance conversion, not the alpha channel
        # - confirm that is what the SAM-3D endpoint expects.
        gray = final_image.convert("L")

        # STEP 4: Run SAM-3D
        ply_bytes, glb_bytes = run_sam3d(final_image, gray)

        # Save all outputs. The temp directory is intentionally left in place
        # because the returned JSON points at files inside it.
        temp_dir = tempfile.mkdtemp()
        original_path = os.path.join(temp_dir, "original.png")
        nobg_path = os.path.join(temp_dir, "transparent.png")
        mask_path = os.path.join(temp_dir, "mask.png")
        ply_path = os.path.join(temp_dir, "model.ply")
        initial_image.save(original_path)
        final_image.save(nobg_path)
        gray.save(mask_path)
        with open(ply_path, 'wb') as f:
            f.write(ply_bytes)
        glb_path = None
        if glb_bytes:
            glb_path = os.path.join(temp_dir, "model.glb")
            with open(glb_path, 'wb') as f:
                f.write(glb_bytes)

        # Save logs
        images_dict = {
            "original": original_path,
            "transparent": nobg_path,
            "mask": mask_path
        }
        metadata = {
            "type": "generation",
            "has_glb": glb_path is not None,
            "ply_size_bytes": len(ply_bytes),
            "glb_size_bytes": len(glb_bytes) if glb_bytes else 0
        }
        log_dir = save_generation_log(prompt, images_dict, metadata)
        logger.info("โœ“ Generation complete!")
        return json.dumps({
            "success": True,
            "prompt": prompt,
            "original_image": original_path,
            "transparent_image": nobg_path,
            "mask_image": mask_path,
            "ply_model": ply_path,
            "glb_model": glb_path,
            "log_directory": log_dir,
            "message": f"Successfully generated 3D model for: {prompt}"
        })
    except Exception as e:
        # Tool boundary: report any failure as JSON rather than raising.
        logger.error(f"โœ— Error: {e}", exc_info=True)
        return json.dumps({"error": str(e)})
def edit_3d_model(edit_prompt: str, transparent_image_path: str) -> str:
    """
    Edit an existing 3D model by modifying its transparent image and regenerating.

    The image at ``transparent_image_path`` is sent to Gemini together with
    the edit instruction; the edited image is converted to a grayscale mask
    and both are passed to SAM-3D. Outputs go to a fresh temporary directory
    and the images are copied into the generation log.

    Args:
        edit_prompt: Description of the edit to apply (e.g., "remove the wings", "change color to blue")
        transparent_image_path: Path to the transparent PNG image from a previous generation

    Returns:
        JSON string. On success it contains ``"success": true`` and the paths
        to the new edited files; on failure it is ``{"error": "<message>"}``.
        Exceptions are logged and reported in the JSON, never raised.
    """
    logger.info("=== MCP TOOL: edit_3d_model ===")
    logger.info(f"Edit: {edit_prompt}")
    if not client and not init_gemini():
        return json.dumps({"error": "GEMINI_API_KEY not configured"})
    try:
        # NOTE(review): the path comes straight from the tool caller and is
        # opened as-is; consider restricting it to known output directories.
        # Copy inside the context manager so the file handle is closed
        # instead of staying open for the lifetime of the lazy image object.
        with Image.open(transparent_image_path) as opened:
            current_image = opened.copy()
        logger.info(f"โ†’ Loaded image: {current_image.size}")

        image_part = types.Part.from_bytes(
            data=image_to_bytes(current_image),
            mime_type="image/png"
        )
        full_edit_prompt = f"{edit_prompt}. Keep the background transparent. Maintain image quality and lighting."
        logger.info("โ†’ Applying edit...")
        response_edit = client.models.generate_content(
            model="gemini-3-pro-image-preview",
            contents=[full_edit_prompt, image_part],
        )
        edited_image = None
        # `parts` is None when the model returned no content; treat that as
        # "no image" instead of crashing with a TypeError.
        for part in response_edit.parts or []:
            if part.inline_data:
                edited_image = Image.open(io.BytesIO(part.inline_data.data))
                break
        if edited_image is None:
            logger.error("โœ— Edit failed")
            return json.dumps({"error": "Edit failed"})
        logger.info("โœ“ Edit applied")

        gray = edited_image.convert("L")
        ply_bytes, glb_bytes = run_sam3d(edited_image, gray)

        # The temp directory is left in place because the returned JSON
        # points at files inside it.
        temp_dir = tempfile.mkdtemp()
        nobg_path = os.path.join(temp_dir, "edited.png")
        mask_path = os.path.join(temp_dir, "mask.png")
        ply_path = os.path.join(temp_dir, "model.ply")
        edited_image.save(nobg_path)
        gray.save(mask_path)
        with open(ply_path, 'wb') as f:
            f.write(ply_bytes)
        glb_path = None
        if glb_bytes:
            glb_path = os.path.join(temp_dir, "model.glb")
            with open(glb_path, 'wb') as f:
                f.write(glb_bytes)

        # Save logs
        images_dict = {
            "edited": nobg_path,
            "mask": mask_path
        }
        metadata = {
            "type": "edit",
            "has_glb": glb_path is not None
        }
        log_dir = save_generation_log(edit_prompt, images_dict, metadata)
        logger.info("โœ“ Edit complete!")
        return json.dumps({
            "success": True,
            "edit_prompt": edit_prompt,
            "transparent_image": nobg_path,
            "mask_image": mask_path,
            "ply_model": ply_path,
            "glb_model": glb_path,
            "log_directory": log_dir,
            "message": f"Successfully applied edit: {edit_prompt}"
        })
    except Exception as e:
        # Tool boundary: report any failure as JSON rather than raising.
        logger.error(f"โœ— Error: {e}", exc_info=True)
        return json.dumps({"error": str(e)})
# ============================================================
# GRADIO UI FUNCTIONS
# ============================================================
def generate_3d_ui(prompt, request: gr.Request, progress=gr.Progress()):
    """UI wrapper around the generation pipeline with progress updates.

    Args:
        prompt: Text description of the object to generate.
        request: Gradio request, used to record the caller's IP address and
            user agent. May be None, in which case both are "unknown".
        progress: Gradio progress tracker.

    Returns:
        An 8-tuple matching the outputs wired to the Generate button:
        (original image path, background-removed image path, mask path,
        model path for the viewer - GLB when produced, otherwise PLY,
        GLB path or None, PLY path, background-removed PIL image,
        edit counter value ``1``).

    Raises:
        gr.Error: if the API key is missing or any pipeline stage fails.
    """
    # Get client info. Check `request` before touching it: every access is
    # guarded so a missing request degrades to "unknown" instead of raising.
    client_ip = "unknown"
    user_agent = "unknown"
    if request is not None:
        # First entry of X-Forwarded-For, falling back to the socket peer.
        client_ip = request.headers.get("x-forwarded-for", "").split(",")[0].strip()
        if not client_ip:
            client_ip = request.client.host if request.client else "unknown"
        user_agent = request.headers.get("user-agent", "unknown")
    logger.info("=== NEW GENERATION REQUEST ===")
    logger.info(f"IP: {client_ip}")
    logger.info(f"Prompt: {prompt}")
    if not client and not init_gemini():
        raise gr.Error("GEMINI_API_KEY not set in Space secrets")

    progress(0.1, desc="Generating image...")
    initial_prompt = f"{prompt}, three-quarter front view angle, natural daylight, soft shadows showing depth and contours, clean simple background, full object visible, photorealistic"
    initial_image = None
    try:
        response_gen = client.models.generate_content(
            model="gemini-2.5-flash-image",
            contents=[initial_prompt],
        )
        # `parts` is None when the model returned no content.
        for part in response_gen.parts or []:
            if part.inline_data:
                initial_image = Image.open(io.BytesIO(part.inline_data.data))
                break
    except Exception as e:
        logger.error(f"โœ— Image generation failed: {e}")
        raise gr.Error(f"Image generation failed: {e}") from e
    # Checked outside the try so this gr.Error is not caught and re-wrapped
    # into a duplicated "failed: failed" message.
    if initial_image is None:
        logger.error("โœ— Image generation failed")
        raise gr.Error("Image generation failed")
    logger.info(f"โœ“ Image generated: {initial_image.size}")

    progress(0.3, desc="Removing background...")
    final_image = None
    try:
        image_part = types.Part.from_bytes(
            data=image_to_bytes(initial_image),
            mime_type="image/png"
        )
        response_edit = client.models.generate_content(
            model="gemini-3-pro-image-preview",
            contents=["Remove the background completely, make the background transparent. Preserve the object's shadow for realism.", image_part],
        )
        for part in response_edit.parts or []:
            if part.inline_data:
                final_image = Image.open(io.BytesIO(part.inline_data.data))
                break
    except Exception as e:
        logger.error(f"โœ— Background removal failed: {e}")
        raise gr.Error(f"Background removal failed: {e}") from e
    if final_image is None:
        logger.error("โœ— Background removal failed")
        raise gr.Error("Background removal failed")
    logger.info("โœ“ Background removed")

    progress(0.4, desc="Creating mask...")
    gray = final_image.convert("L")

    progress(0.5, desc="Running SAM-3D (1-2 min, first run may take longer)...")
    try:
        ply_bytes, glb_bytes = run_sam3d(final_image, gray)
    except Exception as e:
        logger.error(f"โœ— SAM-3D failed: {e}")
        raise gr.Error(f"SAM-3D failed: {e}") from e

    progress(0.9, desc="Saving outputs...")
    # The temp directory is left in place because the UI serves files from it.
    temp_dir = tempfile.mkdtemp()
    original_path = os.path.join(temp_dir, "original.png")
    nobg_path = os.path.join(temp_dir, "no_background.png")
    mask_path = os.path.join(temp_dir, "mask.png")
    ply_path = os.path.join(temp_dir, "model.ply")
    initial_image.save(original_path)
    final_image.save(nobg_path)
    gray.save(mask_path)
    with open(ply_path, 'wb') as f:
        f.write(ply_bytes)
    glb_path = None
    if glb_bytes:
        glb_path = os.path.join(temp_dir, "model.glb")
        with open(glb_path, 'wb') as f:
            f.write(glb_bytes)

    # Save logs WITH request info
    images_dict = {
        "original": original_path,
        "transparent": nobg_path,
        "mask": mask_path
    }
    metadata = {
        "type": "ui_generation",
        "has_glb": glb_path is not None,
        "ply_size_bytes": len(ply_bytes),
        "glb_size_bytes": len(glb_bytes) if glb_bytes else 0
    }
    request_info = {
        "ip": client_ip,
        "user_agent": user_agent
    }
    save_generation_log(prompt, images_dict, metadata, request_info)
    progress(1.0, desc="Done!")
    logger.info("โœ“ Generation complete!")
    return (
        original_path,
        nobg_path,
        mask_path,
        glb_path if glb_path else ply_path,
        glb_path,
        ply_path,
        final_image,
        # NOTE(review): the edit counter becomes 1 right after a fresh
        # generation - confirm this is intended rather than 0.
        1,
    )
def edit_3d_ui(edit_prompt, current_image, edit_count, request: gr.Request, progress=gr.Progress()):
    """UI wrapper for editing the current image and rebuilding the 3D model.

    Args:
        edit_prompt: Description of the edit to apply.
        current_image: PIL image held in the UI state from the previous
            generation or edit; None when nothing has been generated yet.
        edit_count: Counter value currently held in the UI state.
        request: Gradio request, used to record the caller's IP address and
            user agent. May be None, in which case both are "unknown".
        progress: Gradio progress tracker.

    Returns:
        A 7-tuple matching the outputs wired to the Apply Edit button:
        (edited image path, mask path, model path for the viewer - GLB when
        produced, otherwise PLY, GLB path or None, PLY path, edited PIL
        image, incremented edit counter).

    Raises:
        gr.Error: if there is no image to edit, the API key is missing, or
            any pipeline stage fails.
    """
    # Get client info. Check `request` before touching it: every access is
    # guarded so a missing request degrades to "unknown" instead of raising.
    client_ip = "unknown"
    user_agent = "unknown"
    if request is not None:
        # First entry of X-Forwarded-For, falling back to the socket peer.
        client_ip = request.headers.get("x-forwarded-for", "").split(",")[0].strip()
        if not client_ip:
            client_ip = request.client.host if request.client else "unknown"
        user_agent = request.headers.get("user-agent", "unknown")
    logger.info(f"=== EDIT REQUEST #{edit_count + 1} ===")
    logger.info(f"IP: {client_ip}")
    logger.info(f"Edit: {edit_prompt}")
    if current_image is None:
        raise gr.Error("No image to edit. Generate a 3D model first!")
    if not client and not init_gemini():
        raise gr.Error("GEMINI_API_KEY not set")

    progress(0.1, desc=f"Applying edit: {edit_prompt}...")
    edited_image = None
    try:
        image_part = types.Part.from_bytes(
            data=image_to_bytes(current_image),
            mime_type="image/png"
        )
        full_edit_prompt = f"{edit_prompt}. Keep the background transparent. Maintain image quality and lighting."
        response_edit = client.models.generate_content(
            model="gemini-3-pro-image-preview",
            contents=[full_edit_prompt, image_part],
        )
        # `parts` is None when the model returned no content.
        for part in response_edit.parts or []:
            if part.inline_data:
                edited_image = Image.open(io.BytesIO(part.inline_data.data))
                break
    except Exception as e:
        logger.error(f"โœ— Edit failed: {e}")
        raise gr.Error(f"Edit failed: {e}") from e
    # Checked outside the try so this gr.Error is not caught and re-wrapped
    # into a duplicated "Edit failed: Edit failed" message.
    if edited_image is None:
        logger.error("โœ— Edit failed")
        raise gr.Error("Edit failed")
    logger.info("โœ“ Edit applied")

    progress(0.3, desc="Creating new mask...")
    gray = edited_image.convert("L")

    progress(0.4, desc="Running SAM-3D (1-2 min)...")
    try:
        ply_bytes, glb_bytes = run_sam3d(edited_image, gray)
    except Exception as e:
        logger.error(f"โœ— SAM-3D failed: {e}")
        raise gr.Error(f"SAM-3D failed: {e}") from e

    progress(0.9, desc="Saving outputs...")
    # The temp directory is left in place because the UI serves files from it.
    temp_dir = tempfile.mkdtemp()
    nobg_path = os.path.join(temp_dir, "edited.png")
    mask_path = os.path.join(temp_dir, "mask.png")
    ply_path = os.path.join(temp_dir, "model.ply")
    edited_image.save(nobg_path)
    gray.save(mask_path)
    with open(ply_path, 'wb') as f:
        f.write(ply_bytes)
    glb_path = None
    if glb_bytes:
        glb_path = os.path.join(temp_dir, "model.glb")
        with open(glb_path, 'wb') as f:
            f.write(glb_bytes)
    new_edit_count = edit_count + 1

    # Save logs WITH request info
    images_dict = {
        "edited": nobg_path,
        "mask": mask_path
    }
    metadata = {
        "type": "ui_edit",
        "edit_number": new_edit_count,
        "has_glb": glb_path is not None
    }
    request_info = {
        "ip": client_ip,
        "user_agent": user_agent
    }
    save_generation_log(edit_prompt, images_dict, metadata, request_info)
    progress(1.0, desc=f"Edit #{new_edit_count} complete!")
    logger.info(f"โœ“ Edit #{new_edit_count} complete!")
    return (
        nobg_path,
        mask_path,
        glb_path if glb_path else ply_path,
        glb_path,
        ply_path,
        edited_image,
        new_edit_count,
    )
# ============================================================
# MCP TOOL INTERFACES
# ============================================================
# Form around generate_3d_model: one prompt textbox in, the JSON result
# string out. Registered under the API name "generate_3d".
generate_tool = gr.Interface(
    title="Generate 3D Model",
    description="Generate a 3D model from a text description",
    api_name="generate_3d",
    fn=generate_3d_model,
    inputs=gr.Textbox(label="Prompt", placeholder="A red sports car"),
    outputs=gr.Textbox(label="Result (JSON)"),
)

# Form around edit_3d_model: edit instruction plus the path of a previously
# generated transparent PNG in, the JSON result string out. Registered under
# the API name "edit_3d".
edit_tool = gr.Interface(
    title="Edit 3D Model",
    description="Edit an existing 3D model",
    api_name="edit_3d",
    fn=edit_3d_model,
    inputs=[
        gr.Textbox(label="Edit Prompt", placeholder="Remove the wings"),
        gr.Textbox(label="Transparent Image Path", placeholder="/path/to/transparent.png"),
    ],
    outputs=gr.Textbox(label="Result (JSON)"),
)
# ============================================================
# MAIN UI
# ============================================================
# Interactive page: prompt -> generate, then iterative edits on the result.
# NOTE(review): the source had lost its indentation; the nesting of rows and
# columns below is reconstructed - confirm against the deployed layout.
with gr.Blocks() as main_ui:
    # State: the latest background-removed PIL image (input for the next
    # edit) and the counter shown under the edit box.
    current_image_state = gr.State(None)
    edit_count_state = gr.State(0)
    gr.Markdown("""
# ๐ŸŽจ Text to 3D Model (MCP Server)
### Powered by Gemini + SAM-3D Objects
**This app is also an MCP Server!** Claude Desktop, Cursor, and other MCP clients can use the `generate_3d` and `edit_3d` tools.
โฑ๏ธ *Generation takes 1-2 minutes. First run may take longer as the model warms up.*
๐Ÿ“ *Note: IP addresses are logged for analytics.*
""")
    # --- Section 1: initial generation ---
    gr.Markdown("## 1๏ธโƒฃ Generate Initial 3D Model")
    with gr.Row():
        with gr.Column(scale=2):
            prompt_input = gr.Textbox(label="Text Prompt", placeholder="A plane with eagle wings", lines=2)
        with gr.Column(scale=1):
            generate_btn = gr.Button("๐Ÿš€ Generate", variant="primary", size="lg")
    gr.Examples(
        examples=["A plane with eagle wings", "A wooden chair", "A red sports car", "A ceramic coffee mug", "A robot dog"],
        inputs=prompt_input
    )
    # --- Section 2: edits applied to the current image ---
    gr.Markdown("## 2๏ธโƒฃ Edit Your Model")
    with gr.Row():
        with gr.Column(scale=2):
            edit_input = gr.Textbox(label="Edit Prompt", placeholder="Remove the wings", lines=2)
        with gr.Column(scale=1):
            edit_btn = gr.Button("โœ๏ธ Apply Edit", variant="secondary", size="lg")
    # Text refreshed by update_counter after every generate/edit.
    edit_counter = gr.Markdown("*No edits yet*")
    gr.Examples(
        examples=["Remove the wings", "Change color to blue", "Add racing stripes", "Make it larger", "Add wheels"],
        inputs=edit_input
    )
    # --- Outputs: intermediate images, 3D viewer, downloads ---
    gr.Markdown("## ๐Ÿ“ธ Images")
    with gr.Row():
        original_output = gr.Image(label="1. Original", type="filepath")
        nobg_output = gr.Image(label="2. Transparent", type="filepath")
        mask_output = gr.Image(label="3. Mask", type="filepath")
    gr.Markdown("## ๐ŸŽฎ 3D Model")
    model_output = gr.Model3D(label="Interactive 3D Model (drag to rotate)", clear_color=[0.1, 0.1, 0.1, 1.0])
    gr.Markdown("## ๐Ÿ“ฅ Downloads")
    with gr.Row():
        glb_download = gr.File(label="GLB (mesh)")
        ply_download = gr.File(label="PLY (splat)")
    gr.Markdown("""
---
## ๐Ÿ”Œ MCP Server Info
This app exposes two MCP tools: `generate_3d` and `edit_3d`
**Connect via:** `https://YOUR-SPACE.hf.space/gradio_api/mcp/sse`
---
**Built for [MCP 1st Birthday Hackathon](https://huggingface.co/MCP-1st-Birthday)** ๐ŸŽ‚
""")
    def update_counter(count):
        # Markdown for the edit counter: placeholder text at 0, count otherwise.
        return "*No edits yet*" if count == 0 else f"**Edits applied: {count}**"
    # Generate: fills all eight outputs (order must match generate_3d_ui's
    # return tuple), then refreshes the counter text.
    generate_btn.click(
        fn=generate_3d_ui,
        inputs=[prompt_input],
        outputs=[original_output, nobg_output, mask_output, model_output, glb_download, ply_download, current_image_state, edit_count_state]
    ).then(fn=update_counter, inputs=[edit_count_state], outputs=[edit_counter])
    # Edit: same as above minus the "Original" image (order must match
    # edit_3d_ui's return tuple).
    edit_btn.click(
        fn=edit_3d_ui,
        inputs=[edit_input, current_image_state, edit_count_state],
        outputs=[nobg_output, mask_output, model_output, glb_download, ply_download, current_image_state, edit_count_state]
    ).then(fn=update_counter, inputs=[edit_count_state], outputs=[edit_counter])
# ============================================================
# COMBINE UI + MCP TOOLS
# ============================================================
# One tab for the interactive page and one per tool form.
demo = gr.TabbedInterface(
    title="Text to 3D | MCP Server",
    tab_names=["๐ŸŽจ Interactive UI", "๐Ÿ”ง Generate Tool", "โœ๏ธ Edit Tool"],
    interface_list=[main_ui, generate_tool, edit_tool],
)

if __name__ == "__main__":
    logger.info("=== Starting Text-to-3D MCP Server ===")
    # mcp_server=True serves the app as an MCP server as well as a web UI.
    demo.launch(mcp_server=True)