Spaces:

bhatanerohan
/

SAM3d

Running

App Files Files Community

SAM3d / app.py

bhatanerohan

Update app.py

c4891c1 verified 18 days ago

raw

history blame contribute delete

24 kB

	"""
	Text-to-3D Pipeline with Editing: Gemini + SAM-3D
	MCP Server + Gradio UI for MCP Hackathon
	"""

	import os
	import io
	import json
	import tempfile
	import logging
	from datetime import datetime
	import hashlib
	import shutil
	import gradio as gr
	from google import genai
	from google.genai import types
	from PIL import Image
	import modal

	# ============================================================
	# LOGGING SETUP
	# ============================================================

	# Setup logging for console output (visible in HF Spaces logs)
	# Root logging goes to a stream handler so messages show up in the
	# hosting platform's console log.
	logging.basicConfig(
	    handlers=[logging.StreamHandler()],
	    format='%(asctime)s - %(levelname)s - %(message)s',
	    level=logging.INFO,
	)
	logger = logging.getLogger(__name__)

	# Directory (relative to the working directory) that receives one
	# sub-directory per logged generation; created on import if absent.
	LOGS_DIR = "generation_logs"
	os.makedirs(LOGS_DIR, exist_ok=True)

	def save_generation_log(prompt, images_dict, metadata=None, request_info=None):
	    """Persist one generation/edit run under LOGS_DIR and return its directory.

	    A per-run directory named ``<timestamp>_<session_id>`` is created. It
	    receives ``metadata.json`` plus a ``<name>.png`` copy of every image in
	    *images_dict* whose source path exists on disk.

	    Args:
	        prompt: The prompt (or edit prompt) that triggered the run.
	        images_dict: Mapping of logical name -> source image path. Entries
	            whose path is falsy or missing on disk are skipped; ``None`` is
	            treated as an empty mapping.
	        metadata: Optional JSON-serialisable dict stored under "metadata".
	        request_info: Optional dict whose "ip" and "user_agent" entries are
	            recorded as "client_ip" and "user_agent".

	    Returns:
	        Path of the directory the log was written to.
	    """
	    now = datetime.now()
	    timestamp = now.strftime("%Y%m%d_%H%M%S")
	    # Hash the microsecond-resolution time: with only second resolution two
	    # runs of the same prompt within one second got the same directory and
	    # silently overwrote each other's files.
	    digest_input = f"{prompt}{now.strftime('%Y%m%d_%H%M%S%f')}"
	    session_id = hashlib.md5(digest_input.encode()).hexdigest()[:8]

	    session_dir = os.path.join(LOGS_DIR, f"{timestamp}_{session_id}")
	    os.makedirs(session_dir, exist_ok=True)

	    # Save metadata WITH request info
	    request_info = request_info or {}
	    log_data = {
	        "timestamp": timestamp,
	        "session_id": session_id,
	        "prompt": prompt,
	        "client_ip": request_info.get("ip"),
	        "user_agent": request_info.get("user_agent"),
	        "metadata": metadata or {},
	    }

	    log_file = os.path.join(session_dir, "metadata.json")
	    with open(log_file, 'w', encoding="utf-8") as f:
	        json.dump(log_data, f, indent=2)

	    # Save images (best effort: absent files are skipped, not an error)
	    for name, image_path in (images_dict or {}).items():
	        if image_path and os.path.exists(image_path):
	            shutil.copy(image_path, os.path.join(session_dir, f"{name}.png"))

	    logger.info(f"✓ Saved logs to: {session_dir}")
	    return session_dir

	# ============================================================
	# Initialize Gemini client
	# ============================================================

	# Shared Gemini client; stays None until init_gemini() succeeds.
	client = None


	def init_gemini():
	    """Create the module-level Gemini client from GEMINI_API_KEY.

	    Returns:
	        True when the key was found and ``client`` was (re)created,
	        False when the environment variable is missing or empty.
	    """
	    global client
	    api_key = os.environ.get("GEMINI_API_KEY")
	    if not api_key:
	        logger.error("✗ GEMINI_API_KEY not found")
	        return False

	    # Hand the key to the SDK explicitly; the previous code wrote the value
	    # back into os.environ under the same name, which changed nothing.
	    client = genai.Client(api_key=api_key)
	    logger.info("✓ Gemini client initialized")
	    return True

	def image_to_bytes(image):
	    """Encode *image* as PNG and return the resulting bytes.

	    *image* only needs a ``save(file_obj, format=...)`` method; the callers
	    in this file pass PIL images.
	    """
	    with io.BytesIO() as png_stream:
	        image.save(png_stream, format='PNG')
	        return png_stream.getvalue()

	def run_sam3d(image, mask):
	    """Run SAM-3D reconstruction on Modal for an image and its mask.

	    The image is converted to RGB, both inputs are PNG-encoded, and the
	    ``reconstruct`` method of the deployed ``SAM3DModel`` class (Modal app
	    ``sam3d-objects-inference``) is invoked remotely.

	    Returns:
	        ``(ply_bytes, glb_bytes)`` exactly as returned by the remote call;
	        the GLB part is treated as optional (may be falsy).
	    """
	    logger.info("→ Sending to SAM-3D on Modal...")
	    rgb_payload = image_to_bytes(image.convert("RGB"))
	    mask_payload = image_to_bytes(mask)

	    # Look up the deployed class by name and instantiate a remote handle.
	    remote_model = modal.Cls.from_name("sam3d-objects-inference", "SAM3DModel")()
	    ply_bytes, glb_bytes = remote_model.reconstruct.remote(rgb_payload, mask_payload)

	    glb_size = len(glb_bytes) if glb_bytes else 0
	    logger.info(f"✓ SAM-3D complete - PLY: {len(ply_bytes)} bytes, GLB: {glb_size} bytes")
	    return ply_bytes, glb_bytes


	# ============================================================
	# MCP TOOLS - These functions are exposed as MCP tools
	# ============================================================

	def generate_3d_model(prompt: str) -> str:
	    """
	    Generate a 3D model from a text description.

	    Args:
	        prompt: Text description of the object to generate (e.g., "a red sports car", "a wooden chair")

	    Returns:
	        JSON string with paths to generated files
	    """
	    # The docstring wording above is kept as-is because this function is
	    # registered as a tool (see the gr.Interface that wraps it).
	    #
	    # Pipeline: render an image with Gemini -> ask Gemini to strip the
	    # background -> grayscale copy as mask -> SAM-3D -> write artifacts to a
	    # fresh temp directory -> log the run. Every failure is reported as a
	    # JSON object with a single "error" key instead of raising.
	    logger.info("=== MCP TOOL: generate_3d_model ===")
	    logger.info(f"Prompt: {prompt}")

	    if not client and not init_gemini():
	        return json.dumps({"error": "GEMINI_API_KEY not configured"})

	    try:
	        # STEP 1: Generate image
	        render_prompt = f"{prompt}, three-quarter front view angle, natural daylight, soft shadows showing depth and contours, clean simple background, full object visible, photorealistic"
	        logger.info("→ Generating initial image...")

	        render_response = client.models.generate_content(
	            model="gemini-2.5-flash-image",
	            contents=[render_prompt],
	        )

	        # First response part that carries inline data is the image.
	        initial_image = next(
	            (
	                Image.open(io.BytesIO(part.inline_data.data))
	                for part in render_response.parts
	                if part.inline_data
	            ),
	            None,
	        )
	        if initial_image is None:
	            logger.error("✗ Image generation failed")
	            return json.dumps({"error": "Image generation failed"})

	        logger.info(f"✓ Initial image generated: {initial_image.size}")

	        # STEP 2: Remove background
	        logger.info("→ Removing background...")
	        background_prompt = "Remove the background completely, make the background transparent. Preserve the object's shadow for realism."
	        source_part = types.Part.from_bytes(
	            data=image_to_bytes(initial_image),
	            mime_type="image/png",
	        )

	        cutout_response = client.models.generate_content(
	            model="gemini-3-pro-image-preview",
	            contents=[background_prompt, source_part],
	        )

	        final_image = next(
	            (
	                Image.open(io.BytesIO(part.inline_data.data))
	                for part in cutout_response.parts
	                if part.inline_data
	            ),
	            None,
	        )
	        if final_image is None:
	            logger.error("✗ Background removal failed")
	            return json.dumps({"error": "Background removal failed"})

	        logger.info("✓ Background removed")

	        # STEP 3: Create grayscale mask
	        gray = final_image.convert("L")

	        # STEP 4: Run SAM-3D
	        ply_bytes, glb_bytes = run_sam3d(final_image, gray)

	        # Save all outputs into a new temp directory (not cleaned up here;
	        # the returned paths point into it).
	        out_dir = tempfile.mkdtemp()
	        original_path = os.path.join(out_dir, "original.png")
	        nobg_path = os.path.join(out_dir, "transparent.png")
	        mask_path = os.path.join(out_dir, "mask.png")
	        ply_path = os.path.join(out_dir, "model.ply")

	        initial_image.save(original_path)
	        final_image.save(nobg_path)
	        gray.save(mask_path)

	        with open(ply_path, 'wb') as ply_file:
	            ply_file.write(ply_bytes)

	        # The GLB mesh is optional; its path stays None when absent.
	        glb_path = os.path.join(out_dir, "model.glb") if glb_bytes else None
	        if glb_path is not None:
	            with open(glb_path, 'wb') as glb_file:
	                glb_file.write(glb_bytes)

	        # Save logs
	        log_dir = save_generation_log(
	            prompt,
	            {
	                "original": original_path,
	                "transparent": nobg_path,
	                "mask": mask_path,
	            },
	            {
	                "type": "generation",
	                "has_glb": glb_path is not None,
	                "ply_size_bytes": len(ply_bytes),
	                "glb_size_bytes": len(glb_bytes) if glb_bytes else 0,
	            },
	        )

	        logger.info("✓ Generation complete!")

	        result = {
	            "success": True,
	            "prompt": prompt,
	            "original_image": original_path,
	            "transparent_image": nobg_path,
	            "mask_image": mask_path,
	            "ply_model": ply_path,
	            "glb_model": glb_path,
	            "log_directory": log_dir,
	            "message": f"Successfully generated 3D model for: {prompt}",
	        }
	        return json.dumps(result)

	    except Exception as e:
	        # Tool boundary: log with traceback and report the error as JSON.
	        logger.error(f"✗ Error: {e}", exc_info=True)
	        return json.dumps({"error": str(e)})


	def edit_3d_model(edit_prompt: str, transparent_image_path: str) -> str:
	    """
	    Edit an existing 3D model by modifying its transparent image and regenerating.

	    Args:
	        edit_prompt: Description of the edit to apply (e.g., "remove the wings", "change color to blue")
	        transparent_image_path: Path to the transparent PNG image from a previous generation

	    Returns:
	        JSON string with paths to the new edited files
	    """
	    # The docstring wording above is kept as-is because this function is
	    # registered as a tool (see the gr.Interface that wraps it).
	    #
	    # Flow: load the image -> ask Gemini to apply the edit -> grayscale copy
	    # as mask -> SAM-3D -> write artifacts to a fresh temp directory -> log.
	    # Every failure is reported as a JSON object with a single "error" key.
	    logger.info("=== MCP TOOL: edit_3d_model ===")
	    logger.info(f"Edit: {edit_prompt}")

	    if not client and not init_gemini():
	        return json.dumps({"error": "GEMINI_API_KEY not configured"})

	    try:
	        # NOTE(review): transparent_image_path comes straight from the tool
	        # caller and is opened as-is — confirm whether it should be limited
	        # to directories this app created.
	        current_image = Image.open(transparent_image_path)
	        logger.info(f"→ Loaded image: {current_image.size}")

	        source_part = types.Part.from_bytes(
	            data=image_to_bytes(current_image),
	            mime_type="image/png",
	        )

	        full_edit_prompt = f"{edit_prompt}. Keep the background transparent. Maintain image quality and lighting."
	        logger.info("→ Applying edit...")

	        edit_response = client.models.generate_content(
	            model="gemini-3-pro-image-preview",
	            contents=[full_edit_prompt, source_part],
	        )

	        # First response part that carries inline data is the edited image.
	        edited_image = next(
	            (
	                Image.open(io.BytesIO(part.inline_data.data))
	                for part in edit_response.parts
	                if part.inline_data
	            ),
	            None,
	        )
	        if edited_image is None:
	            logger.error("✗ Edit failed")
	            return json.dumps({"error": "Edit failed"})

	        logger.info("✓ Edit applied")

	        gray = edited_image.convert("L")
	        ply_bytes, glb_bytes = run_sam3d(edited_image, gray)

	        out_dir = tempfile.mkdtemp()
	        nobg_path = os.path.join(out_dir, "edited.png")
	        mask_path = os.path.join(out_dir, "mask.png")
	        ply_path = os.path.join(out_dir, "model.ply")

	        edited_image.save(nobg_path)
	        gray.save(mask_path)

	        with open(ply_path, 'wb') as ply_file:
	            ply_file.write(ply_bytes)

	        # The GLB mesh is optional; its path stays None when absent.
	        glb_path = os.path.join(out_dir, "model.glb") if glb_bytes else None
	        if glb_path is not None:
	            with open(glb_path, 'wb') as glb_file:
	                glb_file.write(glb_bytes)

	        # Save logs
	        log_dir = save_generation_log(
	            edit_prompt,
	            {
	                "edited": nobg_path,
	                "mask": mask_path,
	            },
	            {
	                "type": "edit",
	                "has_glb": glb_path is not None,
	            },
	        )

	        logger.info("✓ Edit complete!")

	        result = {
	            "success": True,
	            "edit_prompt": edit_prompt,
	            "transparent_image": nobg_path,
	            "mask_image": mask_path,
	            "ply_model": ply_path,
	            "glb_model": glb_path,
	            "log_directory": log_dir,
	            "message": f"Successfully applied edit: {edit_prompt}",
	        }
	        return json.dumps(result)

	    except Exception as e:
	        # Tool boundary: log with traceback and report the error as JSON.
	        logger.error(f"✗ Error: {e}", exc_info=True)
	        return json.dumps({"error": str(e)})


	# ============================================================
	# GRADIO UI FUNCTIONS
	# ============================================================

	def generate_3d_ui(prompt, request: gr.Request, progress=gr.Progress()):
	    """UI wrapper around the text-to-3D pipeline with progress updates.

	    Args:
	        prompt: Text description entered by the user.
	        request: Request object injected by Gradio; may be None, in which
	            case client IP and user agent are recorded as "unknown".
	        progress: Gradio progress tracker used to report pipeline stages.

	    Returns:
	        An 8-tuple matching the outputs wired to the Generate button:
	        (original image path, background-removed image path, mask path,
	        path shown in the 3D viewer (GLB if present, otherwise PLY),
	        GLB path or None, PLY path, background-removed image object kept
	        as session state, edit counter reset to 1).

	    Raises:
	        gr.Error: when the API key is missing or any pipeline stage fails.
	    """
	    # Get client info. Every access is guarded: the old code read
	    # request.headers before its own `if request` checks, and did not
	    # allow for request.client being None.
	    headers = request.headers if request is not None else {}
	    client_ip = headers.get("x-forwarded-for", "").split(",")[0].strip()
	    if not client_ip:
	        peer = request.client if request is not None else None
	        client_ip = peer.host if peer else "unknown"
	    user_agent = headers.get("user-agent", "unknown")

	    logger.info("=== NEW GENERATION REQUEST ===")
	    logger.info(f"IP: {client_ip}")
	    logger.info(f"Prompt: {prompt}")

	    if not client and not init_gemini():
	        raise gr.Error("GEMINI_API_KEY not set in Space secrets")

	    progress(0.1, desc="Generating image...")

	    initial_prompt = f"{prompt}, three-quarter front view angle, natural daylight, soft shadows showing depth and contours, clean simple background, full object visible, photorealistic"

	    try:
	        response_gen = client.models.generate_content(
	            model="gemini-2.5-flash-image",
	            contents=[initial_prompt],
	        )

	        # First response part that carries inline data is the image.
	        initial_image = next(
	            (
	                Image.open(io.BytesIO(part.inline_data.data))
	                for part in response_gen.parts
	                if part.inline_data
	            ),
	            None,
	        )
	        if initial_image is None:
	            logger.error("✗ Image generation failed")
	            raise gr.Error("Image generation failed")

	        logger.info(f"✓ Image generated: {initial_image.size}")

	    except gr.Error:
	        # Already user-facing; re-wrapping it duplicated the message.
	        raise
	    except Exception as e:
	        logger.error(f"✗ Image generation failed: {e}")
	        raise gr.Error(f"Image generation failed: {e}") from e

	    progress(0.3, desc="Removing background...")

	    try:
	        image_part = types.Part.from_bytes(
	            data=image_to_bytes(initial_image),
	            mime_type="image/png",
	        )

	        response_edit = client.models.generate_content(
	            model="gemini-3-pro-image-preview",
	            contents=["Remove the background completely, make the background transparent. Preserve the object's shadow for realism.", image_part],
	        )

	        final_image = next(
	            (
	                Image.open(io.BytesIO(part.inline_data.data))
	                for part in response_edit.parts
	                if part.inline_data
	            ),
	            None,
	        )
	        if final_image is None:
	            logger.error("✗ Background removal failed")
	            raise gr.Error("Background removal failed")

	        logger.info("✓ Background removed")

	    except gr.Error:
	        raise
	    except Exception as e:
	        logger.error(f"✗ Background removal failed: {e}")
	        raise gr.Error(f"Background removal failed: {e}") from e

	    progress(0.4, desc="Creating mask...")
	    gray = final_image.convert("L")

	    progress(0.5, desc="Running SAM-3D (1-2 min, first run may take longer)...")

	    try:
	        ply_bytes, glb_bytes = run_sam3d(final_image, gray)
	    except Exception as e:
	        logger.error(f"✗ SAM-3D failed: {e}")
	        raise gr.Error(f"SAM-3D failed: {e}") from e

	    progress(0.9, desc="Saving outputs...")

	    # Artifacts go to a fresh temp directory; the returned paths point
	    # into it, so it is not removed here.
	    temp_dir = tempfile.mkdtemp()

	    original_path = os.path.join(temp_dir, "original.png")
	    nobg_path = os.path.join(temp_dir, "no_background.png")
	    mask_path = os.path.join(temp_dir, "mask.png")
	    ply_path = os.path.join(temp_dir, "model.ply")

	    initial_image.save(original_path)
	    final_image.save(nobg_path)
	    gray.save(mask_path)

	    with open(ply_path, 'wb') as f:
	        f.write(ply_bytes)

	    # The GLB mesh is optional; its path stays None when absent.
	    glb_path = None
	    if glb_bytes:
	        glb_path = os.path.join(temp_dir, "model.glb")
	        with open(glb_path, 'wb') as f:
	            f.write(glb_bytes)

	    # Save logs WITH request info
	    images_dict = {
	        "original": original_path,
	        "transparent": nobg_path,
	        "mask": mask_path,
	    }
	    metadata = {
	        "type": "ui_generation",
	        "has_glb": glb_path is not None,
	        "ply_size_bytes": len(ply_bytes),
	        "glb_size_bytes": len(glb_bytes) if glb_bytes else 0,
	    }
	    request_info = {
	        "ip": client_ip,
	        "user_agent": user_agent,
	    }
	    save_generation_log(prompt, images_dict, metadata, request_info)

	    progress(1.0, desc="Done!")
	    logger.info("✓ Generation complete!")

	    return (
	        original_path,
	        nobg_path,
	        mask_path,
	        glb_path if glb_path else ply_path,  # viewer prefers the mesh
	        glb_path,
	        ply_path,
	        final_image,
	        1,
	    )


	def edit_3d_ui(edit_prompt, current_image, edit_count, request: gr.Request, progress=gr.Progress()):
	    """UI wrapper that applies an edit to the current image and rebuilds the model.

	    Args:
	        edit_prompt: Description of the edit entered by the user.
	        current_image: Image object held in session state by the previous
	            generation or edit; None when nothing has been generated yet.
	        edit_count: Counter value held in session state before this edit.
	        request: Request object injected by Gradio; may be None, in which
	            case client IP and user agent are recorded as "unknown".
	        progress: Gradio progress tracker used to report pipeline stages.

	    Returns:
	        A 7-tuple matching the outputs wired to the Edit button:
	        (edited image path, mask path, path shown in the 3D viewer (GLB if
	        present, otherwise PLY), GLB path or None, PLY path, edited image
	        object kept as session state, incremented edit counter).

	    Raises:
	        gr.Error: when there is no image to edit, the API key is missing,
	            or any pipeline stage fails.
	    """
	    # Get client info. Every access is guarded: the old code read
	    # request.headers before its own `if request` checks, and did not
	    # allow for request.client being None.
	    headers = request.headers if request is not None else {}
	    client_ip = headers.get("x-forwarded-for", "").split(",")[0].strip()
	    if not client_ip:
	        peer = request.client if request is not None else None
	        client_ip = peer.host if peer else "unknown"
	    user_agent = headers.get("user-agent", "unknown")

	    logger.info(f"=== EDIT REQUEST #{edit_count + 1} ===")
	    logger.info(f"IP: {client_ip}")
	    logger.info(f"Edit: {edit_prompt}")

	    if current_image is None:
	        raise gr.Error("No image to edit. Generate a 3D model first!")

	    if not client and not init_gemini():
	        raise gr.Error("GEMINI_API_KEY not set")

	    progress(0.1, desc=f"Applying edit: {edit_prompt}...")

	    try:
	        image_part = types.Part.from_bytes(
	            data=image_to_bytes(current_image),
	            mime_type="image/png",
	        )

	        full_edit_prompt = f"{edit_prompt}. Keep the background transparent. Maintain image quality and lighting."

	        response_edit = client.models.generate_content(
	            model="gemini-3-pro-image-preview",
	            contents=[full_edit_prompt, image_part],
	        )

	        # First response part that carries inline data is the edited image.
	        edited_image = next(
	            (
	                Image.open(io.BytesIO(part.inline_data.data))
	                for part in response_edit.parts
	                if part.inline_data
	            ),
	            None,
	        )
	        if edited_image is None:
	            logger.error("✗ Edit failed")
	            raise gr.Error("Edit failed")

	        logger.info("✓ Edit applied")

	    except gr.Error:
	        # Already user-facing; re-wrapping it duplicated the message.
	        raise
	    except Exception as e:
	        logger.error(f"✗ Edit failed: {e}")
	        raise gr.Error(f"Edit failed: {e}") from e

	    progress(0.3, desc="Creating new mask...")
	    gray = edited_image.convert("L")

	    progress(0.4, desc="Running SAM-3D (1-2 min)...")

	    try:
	        ply_bytes, glb_bytes = run_sam3d(edited_image, gray)
	    except Exception as e:
	        logger.error(f"✗ SAM-3D failed: {e}")
	        raise gr.Error(f"SAM-3D failed: {e}") from e

	    progress(0.9, desc="Saving outputs...")

	    # Artifacts go to a fresh temp directory; the returned paths point
	    # into it, so it is not removed here.
	    temp_dir = tempfile.mkdtemp()

	    nobg_path = os.path.join(temp_dir, "edited.png")
	    mask_path = os.path.join(temp_dir, "mask.png")
	    ply_path = os.path.join(temp_dir, "model.ply")

	    edited_image.save(nobg_path)
	    gray.save(mask_path)

	    with open(ply_path, 'wb') as f:
	        f.write(ply_bytes)

	    # The GLB mesh is optional; its path stays None when absent.
	    glb_path = None
	    if glb_bytes:
	        glb_path = os.path.join(temp_dir, "model.glb")
	        with open(glb_path, 'wb') as f:
	            f.write(glb_bytes)

	    new_edit_count = edit_count + 1

	    # Save logs WITH request info
	    images_dict = {
	        "edited": nobg_path,
	        "mask": mask_path,
	    }
	    metadata = {
	        "type": "ui_edit",
	        "edit_number": new_edit_count,
	        "has_glb": glb_path is not None,
	    }
	    request_info = {
	        "ip": client_ip,
	        "user_agent": user_agent,
	    }
	    save_generation_log(edit_prompt, images_dict, metadata, request_info)

	    progress(1.0, desc=f"Edit #{new_edit_count} complete!")
	    logger.info(f"✓ Edit #{new_edit_count} complete!")

	    return (
	        nobg_path,
	        mask_path,
	        glb_path if glb_path else ply_path,  # viewer prefers the mesh
	        glb_path,
	        ply_path,
	        edited_image,
	        new_edit_count,
	    )


	# ============================================================
	# MCP TOOL INTERFACES
	# ============================================================

	# Single-textbox interface around generate_3d_model: prompt in, JSON
	# result string out, published under the API name "generate_3d".
	generate_tool = gr.Interface(
	    title="Generate 3D Model",
	    description="Generate a 3D model from a text description",
	    api_name="generate_3d",
	    fn=generate_3d_model,
	    inputs=gr.Textbox(label="Prompt", placeholder="A red sports car"),
	    outputs=gr.Textbox(label="Result (JSON)"),
	)

	# Two-textbox interface around edit_3d_model: edit prompt plus the path of
	# a previously generated transparent image in, JSON result string out,
	# published under the API name "edit_3d".
	edit_tool = gr.Interface(
	    title="Edit 3D Model",
	    description="Edit an existing 3D model",
	    api_name="edit_3d",
	    fn=edit_3d_model,
	    inputs=[
	        gr.Textbox(label="Edit Prompt", placeholder="Remove the wings"),
	        gr.Textbox(label="Transparent Image Path", placeholder="/path/to/transparent.png"),
	    ],
	    outputs=gr.Textbox(label="Result (JSON)"),
	)


	# ============================================================
	# MAIN UI
	# ============================================================

	# Interactive page: prompt/edit inputs, image previews, 3D viewer and
	# download slots, plus the button wiring at the end of the block.
	with gr.Blocks() as main_ui:

	# Per-session state: the latest background-removed image (fed back into
	# edit_3d_ui as its input) and the number of edits applied so far.
	current_image_state = gr.State(None)
	edit_count_state = gr.State(0)

	gr.Markdown("""
	# 🎨 Text to 3D Model (MCP Server)
	### Powered by Gemini + SAM-3D Objects

	This app is also an MCP Server! Claude Desktop, Cursor, and other MCP clients can use the `generate_3d` and `edit_3d` tools.

	⏱️ Generation takes 1-2 minutes. First run may take longer as the model warms up.

	📍 Note: IP addresses are logged for analytics.
	""")

	# --- Section 1: initial generation (prompt box + Generate button) ---
	gr.Markdown("## 1️⃣ Generate Initial 3D Model")

	with gr.Row():
	with gr.Column(scale=2):
	prompt_input = gr.Textbox(label="Text Prompt", placeholder="A plane with eagle wings", lines=2)
	with gr.Column(scale=1):
	generate_btn = gr.Button("🚀 Generate", variant="primary", size="lg")

	# Clickable sample prompts that fill prompt_input.
	gr.Examples(
	examples=["A plane with eagle wings", "A wooden chair", "A red sports car", "A ceramic coffee mug", "A robot dog"],
	inputs=prompt_input
	)

	# --- Section 2: editing (edit prompt box, Apply button, edit counter) ---
	gr.Markdown("## 2️⃣ Edit Your Model")

	with gr.Row():
	with gr.Column(scale=2):
	edit_input = gr.Textbox(label="Edit Prompt", placeholder="Remove the wings", lines=2)
	with gr.Column(scale=1):
	edit_btn = gr.Button("✏️ Apply Edit", variant="secondary", size="lg")
	# Text label refreshed by update_counter after every generate/edit run.
	edit_counter = gr.Markdown("No edits yet")

	# Clickable sample edits that fill edit_input.
	gr.Examples(
	examples=["Remove the wings", "Change color to blue", "Add racing stripes", "Make it larger", "Add wheels"],
	inputs=edit_input
	)

	# --- Outputs: the image previews receive file paths from the handlers ---
	gr.Markdown("## 📸 Images")
	with gr.Row():
	original_output = gr.Image(label="1. Original", type="filepath")
	nobg_output = gr.Image(label="2. Transparent", type="filepath")
	mask_output = gr.Image(label="3. Mask", type="filepath")

	# Viewer is fed the GLB path when one exists, otherwise the PLY path
	# (see the return tuples of generate_3d_ui / edit_3d_ui).
	gr.Markdown("## 🎮 3D Model")
	model_output = gr.Model3D(label="Interactive 3D Model (drag to rotate)", clear_color=[0.1, 0.1, 0.1, 1.0])

	gr.Markdown("## 📥 Downloads")
	with gr.Row():
	glb_download = gr.File(label="GLB (mesh)")
	ply_download = gr.File(label="PLY (splat)")

	gr.Markdown("""
	---
	## 🔌 MCP Server Info

	This app exposes two MCP tools: `generate_3d` and `edit_3d`

	Connect via: `https://YOUR-SPACE.hf.space/gradio_api/mcp/sse`

	---
	Built for [MCP 1st Birthday Hackathon](https://huggingface.co/MCP-1st-Birthday) 🎂
	""")

	# Turns the edit count into the label text shown in edit_counter.
	def update_counter(count):
	return "No edits yet" if count == 0 else f"Edits applied: {count}"

	# Generate: fills all eight outputs (generate_3d_ui returns 1 for the
	# edit count), then refreshes the counter label.
	generate_btn.click(
	fn=generate_3d_ui,
	inputs=[prompt_input],
	outputs=[original_output, nobg_output, mask_output, model_output, glb_download, ply_download, current_image_state, edit_count_state]
	).then(fn=update_counter, inputs=[edit_count_state], outputs=[edit_counter])

	# Edit: same outputs except the original-image preview, which is left
	# untouched; the stored image and count are replaced by the new values.
	edit_btn.click(
	fn=edit_3d_ui,
	inputs=[edit_input, current_image_state, edit_count_state],
	outputs=[nobg_output, mask_output, model_output, glb_download, ply_download, current_image_state, edit_count_state]
	).then(fn=update_counter, inputs=[edit_count_state], outputs=[edit_counter])


	# ============================================================
	# COMBINE UI + MCP TOOLS
	# ============================================================

	# Top-level app: the interactive page plus the two tool interfaces, one
	# tab each (tab_names is positional with interface_list).
	demo = gr.TabbedInterface(
	    interface_list=[main_ui, generate_tool, edit_tool],
	    tab_names=["🎨 Interactive UI", "🔧 Generate Tool", "✏️ Edit Tool"],
	    # Plain "|": the previous "\|" is not a valid string escape, so it
	    # triggered an invalid-escape warning and put a literal backslash in
	    # the title.
	    title="Text to 3D | MCP Server",
	)

	if __name__ == "__main__":
	    logger.info("=== Starting Text-to-3D MCP Server ===")
	    # mcp_server=True is passed so the app is also served as an MCP server.
	    demo.launch(mcp_server=True)