"""
Text-to-3D Pipeline with Editing: Gemini + SAM-3D
MCP Server + Gradio UI for MCP Hackathon
"""

import os
import io
import json
import tempfile
import logging
from datetime import datetime
import hashlib
import shutil

import gradio as gr
from google import genai
from google.genai import types
from PIL import Image
import modal

# ============================================================
# LOGGING SETUP
# ============================================================

# Setup logging for console output (visible in HF Spaces logs)
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.StreamHandler()  # This goes to HF Spaces logs
    ]
)
logger = logging.getLogger(__name__)

# Create persistent logs directory (use HF Datasets for permanent storage)
LOGS_DIR = "generation_logs"
os.makedirs(LOGS_DIR, exist_ok=True)


def save_generation_log(prompt, images_dict, metadata=None, request_info=None):
    """Save generation logs with images and user info.

    Creates ``LOGS_DIR/<timestamp>_<session_id>/`` containing a
    ``metadata.json`` file plus a ``<name>.png`` copy of every image that
    exists on disk.

    Args:
        prompt: The text prompt (or edit prompt) that triggered the run.
        images_dict: Mapping of logical image name to a file path. Entries
            whose path is falsy or does not exist are skipped. ``None`` is
            treated as an empty mapping.
        metadata: Optional dict stored under the ``"metadata"`` key.
        request_info: Optional dict; its ``"ip"`` and ``"user_agent"``
            values are stored as ``client_ip`` and ``user_agent``.

    Returns:
        Path of the session directory that was written.
    """
    now = datetime.now()
    timestamp = now.strftime("%Y%m%d_%H%M%S")
    # Hash the full-resolution time (not the second-resolution timestamp) so
    # the same prompt submitted twice within one second does not map to the
    # same directory and silently overwrite the earlier log.
    digest_source = f"{prompt}{now.isoformat()}"
    session_id = hashlib.md5(digest_source.encode()).hexdigest()[:8]

    session_dir = os.path.join(LOGS_DIR, f"{timestamp}_{session_id}")
    os.makedirs(session_dir, exist_ok=True)

    # Save metadata WITH request info
    request_info = request_info or {}
    log_data = {
        "timestamp": timestamp,
        "session_id": session_id,
        "prompt": prompt,
        "client_ip": request_info.get("ip"),
        "user_agent": request_info.get("user_agent"),
        "metadata": metadata or {},
    }

    log_file = os.path.join(session_dir, "metadata.json")
    with open(log_file, 'w', encoding="utf-8") as f:
        json.dump(log_data, f, indent=2)

    # Save images
    for name, image_path in (images_dict or {}).items():
        if image_path and os.path.exists(image_path):
            dest = os.path.join(session_dir, f"{name}.png")
            shutil.copy(image_path, dest)

    logger.info(f"✓ Saved logs to: {session_dir}")
    return session_dir


# ============================================================
# Initialize Gemini client
# ============================================================

client = None

# Model names and prompt fragments shared by the MCP tools and the UI handlers.
_IMAGE_MODEL = "gemini-2.5-flash-image"
_EDIT_MODEL = "gemini-3-pro-image-preview"
_GENERATION_SUFFIX = (
    ", three-quarter front view angle, natural daylight, soft shadows showing "
    "depth and contours, clean simple background, full object visible, photorealistic"
)
_REMOVE_BG_PROMPT = (
    "Remove the background completely, make the background transparent. "
    "Preserve the object's shadow for realism."
)
_EDIT_SUFFIX = ". Keep the background transparent. Maintain image quality and lighting."


def init_gemini():
    """Create the module-level Gemini client if GEMINI_API_KEY is set.

    Returns:
        True when the client was created, False when the key is missing.
    """
    global client
    if not os.environ.get("GEMINI_API_KEY"):
        logger.error("✗ GEMINI_API_KEY not found")
        return False
    # genai.Client() picks the key up from the environment.
    client = genai.Client()
    logger.info("✓ Gemini client initialized")
    return True


def _ensure_client():
    """Return True when a Gemini client is available, creating it on first use."""
    return bool(client) or init_gemini()


def image_to_bytes(image):
    """Convert PIL Image to PNG bytes"""
    buffer = io.BytesIO()
    image.save(buffer, format='PNG')
    return buffer.getvalue()


def run_sam3d(image, mask):
    """Send image and mask to SAM-3D on Modal.

    Args:
        image: PIL image of the object; converted to RGB before upload.
        mask: PIL image used as the object mask.

    Returns:
        Tuple ``(ply_bytes, glb_bytes)`` as returned by the remote
        ``reconstruct`` call; ``glb_bytes`` may be falsy.
    """
    logger.info("→ Sending to SAM-3D on Modal...")

    img_bytes = image_to_bytes(image.convert("RGB"))
    mask_bytes = image_to_bytes(mask)

    SAM3DModel = modal.Cls.from_name("sam3d-objects-inference", "SAM3DModel")
    model = SAM3DModel()
    ply_bytes, glb_bytes = model.reconstruct.remote(img_bytes, mask_bytes)

    logger.info(f"✓ SAM-3D complete - PLY: {len(ply_bytes)} bytes, GLB: {len(glb_bytes) if glb_bytes else 0} bytes")
    return ply_bytes, glb_bytes


def _first_inline_image(response):
    """Return the first inline image of a Gemini response as a PIL image, or None."""
    # `parts` may be None when the model returned no content (e.g. a blocked
    # prompt); treat that the same as "no image in the response".
    for part in response.parts or []:
        if part.inline_data:
            return Image.open(io.BytesIO(part.inline_data.data))
    return None


def _generate_initial_image(prompt):
    """Ask Gemini for a photorealistic image of `prompt`; returns a PIL image or None."""
    response = client.models.generate_content(
        model=_IMAGE_MODEL,
        contents=[f"{prompt}{_GENERATION_SUFFIX}"],
    )
    return _first_inline_image(response)


def _edit_image(image, instruction):
    """Apply a text `instruction` to `image` with Gemini; returns a PIL image or None."""
    image_part = types.Part.from_bytes(
        data=image_to_bytes(image),
        mime_type="image/png"
    )
    response = client.models.generate_content(
        model=_EDIT_MODEL,
        contents=[instruction, image_part],
    )
    return _first_inline_image(response)


def _write_outputs(images, ply_bytes, glb_bytes):
    """Write images and model files into a fresh temp directory.

    Args:
        images: Mapping of file name (e.g. "mask.png") to PIL image.
        ply_bytes: Contents of "model.ply".
        glb_bytes: Contents of "model.glb"; skipped when falsy.

    Returns:
        Tuple ``(image_paths, ply_path, glb_path)`` where ``image_paths`` maps
        each file name to its full path and ``glb_path`` is None when no GLB
        was written.
    """
    temp_dir = tempfile.mkdtemp()

    image_paths = {}
    for filename, image in images.items():
        path = os.path.join(temp_dir, filename)
        image.save(path)
        image_paths[filename] = path

    ply_path = os.path.join(temp_dir, "model.ply")
    with open(ply_path, 'wb') as f:
        f.write(ply_bytes)

    glb_path = None
    if glb_bytes:
        glb_path = os.path.join(temp_dir, "model.glb")
        with open(glb_path, 'wb') as f:
            f.write(glb_bytes)

    return image_paths, ply_path, glb_path


def _client_info(request):
    """Return ``(client_ip, user_agent)`` for a Gradio request.

    Falls back to "unknown" when the request (or its client address) is
    missing, instead of dereferencing it first and checking afterwards.
    """
    if request is None:
        return "unknown", "unknown"

    # Behind a proxy the first x-forwarded-for entry is the original client.
    client_ip = request.headers.get("x-forwarded-for", "").split(",")[0].strip()
    if not client_ip:
        peer = getattr(request, "client", None)
        client_ip = peer.host if peer is not None else "unknown"

    user_agent = request.headers.get("user-agent", "unknown")
    return client_ip, user_agent


# ============================================================
# MCP TOOLS - These functions are exposed as MCP tools
# ============================================================

def generate_3d_model(prompt: str) -> str:
    """
    Generate a 3D model from a text description.

    Args:
        prompt: Text description of the object to generate (e.g., "a red sports car", "a wooden chair")

    Returns:
        JSON string with paths to generated files
    """
    logger.info("=== MCP TOOL: generate_3d_model ===")
    logger.info(f"Prompt: {prompt}")

    if not _ensure_client():
        return json.dumps({"error": "GEMINI_API_KEY not configured"})

    try:
        # STEP 1: Generate image
        logger.info("→ Generating initial image...")
        initial_image = _generate_initial_image(prompt)
        if initial_image is None:
            logger.error("✗ Image generation failed")
            return json.dumps({"error": "Image generation failed"})
        logger.info(f"✓ Initial image generated: {initial_image.size}")

        # STEP 2: Remove background
        logger.info("→ Removing background...")
        final_image = _edit_image(initial_image, _REMOVE_BG_PROMPT)
        if final_image is None:
            logger.error("✗ Background removal failed")
            return json.dumps({"error": "Background removal failed"})
        logger.info("✓ Background removed")

        # STEP 3: Create grayscale mask
        gray = final_image.convert("L")

        # STEP 4: Run SAM-3D
        ply_bytes, glb_bytes = run_sam3d(final_image, gray)

        # Save all outputs
        paths, ply_path, glb_path = _write_outputs(
            {
                "original.png": initial_image,
                "transparent.png": final_image,
                "mask.png": gray,
            },
            ply_bytes,
            glb_bytes,
        )
        original_path = paths["original.png"]
        nobg_path = paths["transparent.png"]
        mask_path = paths["mask.png"]

        # Save logs
        images_dict = {
            "original": original_path,
            "transparent": nobg_path,
            "mask": mask_path
        }
        metadata = {
            "type": "generation",
            "has_glb": glb_path is not None,
            "ply_size_bytes": len(ply_bytes),
            "glb_size_bytes": len(glb_bytes) if glb_bytes else 0
        }
        log_dir = save_generation_log(prompt, images_dict, metadata)

        logger.info("✓ Generation complete!")

        return json.dumps({
            "success": True,
            "prompt": prompt,
            "original_image": original_path,
            "transparent_image": nobg_path,
            "mask_image": mask_path,
            "ply_model": ply_path,
            "glb_model": glb_path,
            "log_directory": log_dir,
            "message": f"Successfully generated 3D model for: {prompt}"
        })

    except Exception as e:
        # Tool boundary: report every failure to the MCP client as JSON.
        logger.error(f"✗ Error: {e}", exc_info=True)
        return json.dumps({"error": str(e)})


def edit_3d_model(edit_prompt: str, transparent_image_path: str) -> str:
    """
    Edit an existing 3D model by modifying its transparent image and regenerating.

    Args:
        edit_prompt: Description of the edit to apply (e.g., "remove the wings", "change color to blue")
        transparent_image_path: Path to the transparent PNG image from a previous generation

    Returns:
        JSON string with paths to the new edited files
    """
    logger.info("=== MCP TOOL: edit_3d_model ===")
    logger.info(f"Edit: {edit_prompt}")

    if not _ensure_client():
        return json.dumps({"error": "GEMINI_API_KEY not configured"})

    try:
        # NOTE(review): the path comes straight from the MCP client and is
        # opened as-is; consider restricting it to previously generated files.
        current_image = Image.open(transparent_image_path)
        logger.info(f"→ Loaded image: {current_image.size}")

        logger.info("→ Applying edit...")
        edited_image = _edit_image(current_image, f"{edit_prompt}{_EDIT_SUFFIX}")
        if edited_image is None:
            logger.error("✗ Edit failed")
            return json.dumps({"error": "Edit failed"})
        logger.info("✓ Edit applied")

        gray = edited_image.convert("L")
        ply_bytes, glb_bytes = run_sam3d(edited_image, gray)

        paths, ply_path, glb_path = _write_outputs(
            {"edited.png": edited_image, "mask.png": gray},
            ply_bytes,
            glb_bytes,
        )
        nobg_path = paths["edited.png"]
        mask_path = paths["mask.png"]

        # Save logs
        images_dict = {
            "edited": nobg_path,
            "mask": mask_path
        }
        metadata = {
            "type": "edit",
            "has_glb": glb_path is not None
        }
        log_dir = save_generation_log(edit_prompt, images_dict, metadata)

        logger.info("✓ Edit complete!")

        return json.dumps({
            "success": True,
            "edit_prompt": edit_prompt,
            "transparent_image": nobg_path,
            "mask_image": mask_path,
            "ply_model": ply_path,
            "glb_model": glb_path,
            "log_directory": log_dir,
            "message": f"Successfully applied edit: {edit_prompt}"
        })

    except Exception as e:
        # Tool boundary: report every failure to the MCP client as JSON.
        logger.error(f"✗ Error: {e}", exc_info=True)
        return json.dumps({"error": str(e)})


# ============================================================
# GRADIO UI FUNCTIONS
# ============================================================

def generate_3d_ui(prompt, request: gr.Request, progress=gr.Progress()):
    """UI wrapper with progress updates.

    Runs the same pipeline as `generate_3d_model` but reports progress to
    Gradio and raises `gr.Error` on failure instead of returning JSON.

    Returns:
        Tuple of (original image path, transparent image path, mask path,
        model path for the 3D viewer, GLB path or None, PLY path,
        transparent PIL image for the edit state, edit counter reset value 1).
    """
    # Get client info
    client_ip, user_agent = _client_info(request)

    logger.info("=== NEW GENERATION REQUEST ===")
    logger.info(f"IP: {client_ip}")
    logger.info(f"Prompt: {prompt}")

    if not _ensure_client():
        raise gr.Error("GEMINI_API_KEY not set in Space secrets")

    progress(0.1, desc="Generating image...")
    try:
        initial_image = _generate_initial_image(prompt)
    except Exception as e:
        logger.error(f"✗ Image generation failed: {e}")
        raise gr.Error(f"Image generation failed: {e}") from e
    # Checked outside the try so this error is not caught and re-wrapped into
    # "Image generation failed: Image generation failed".
    if initial_image is None:
        logger.error("✗ Image generation failed: no image in response")
        raise gr.Error("Image generation failed")
    logger.info(f"✓ Image generated: {initial_image.size}")

    progress(0.3, desc="Removing background...")
    try:
        final_image = _edit_image(initial_image, _REMOVE_BG_PROMPT)
    except Exception as e:
        logger.error(f"✗ Background removal failed: {e}")
        raise gr.Error(f"Background removal failed: {e}") from e
    if final_image is None:
        logger.error("✗ Background removal failed: no image in response")
        raise gr.Error("Background removal failed")
    logger.info("✓ Background removed")

    progress(0.4, desc="Creating mask...")
    gray = final_image.convert("L")

    progress(0.5, desc="Running SAM-3D (1-2 min, first run may take longer)...")
    try:
        ply_bytes, glb_bytes = run_sam3d(final_image, gray)
    except Exception as e:
        logger.error(f"✗ SAM-3D failed: {e}")
        raise gr.Error(f"SAM-3D failed: {e}") from e

    progress(0.9, desc="Saving outputs...")
    paths, ply_path, glb_path = _write_outputs(
        {
            "original.png": initial_image,
            "no_background.png": final_image,
            "mask.png": gray,
        },
        ply_bytes,
        glb_bytes,
    )
    original_path = paths["original.png"]
    nobg_path = paths["no_background.png"]
    mask_path = paths["mask.png"]

    # Save logs WITH request info
    images_dict = {
        "original": original_path,
        "transparent": nobg_path,
        "mask": mask_path
    }
    metadata = {
        "type": "ui_generation",
        "has_glb": glb_path is not None,
        "ply_size_bytes": len(ply_bytes),
        "glb_size_bytes": len(glb_bytes) if glb_bytes else 0
    }
    request_info = {
        "ip": client_ip,
        "user_agent": user_agent
    }
    save_generation_log(prompt, images_dict, metadata, request_info)

    progress(1.0, desc="Done!")
    logger.info("✓ Generation complete!")

    return (
        original_path,
        nobg_path,
        mask_path,
        glb_path or ply_path,  # viewer prefers the GLB mesh when available
        glb_path,
        ply_path,
        final_image,
        1,
    )


def edit_3d_ui(edit_prompt, current_image, edit_count, request: gr.Request, progress=gr.Progress()):
    """UI wrapper for editing.

    Applies `edit_prompt` to `current_image`, rebuilds the 3D model and
    raises `gr.Error` on failure.

    Returns:
        Tuple of (edited image path, mask path, model path for the 3D viewer,
        GLB path or None, PLY path, edited PIL image for the edit state,
        incremented edit count).
    """
    # Get client info
    client_ip, user_agent = _client_info(request)

    logger.info(f"=== EDIT REQUEST #{edit_count + 1} ===")
    logger.info(f"IP: {client_ip}")
    logger.info(f"Edit: {edit_prompt}")

    if current_image is None:
        raise gr.Error("No image to edit. Generate a 3D model first!")

    if not _ensure_client():
        raise gr.Error("GEMINI_API_KEY not set")

    progress(0.1, desc=f"Applying edit: {edit_prompt}...")
    try:
        edited_image = _edit_image(current_image, f"{edit_prompt}{_EDIT_SUFFIX}")
    except Exception as e:
        logger.error(f"✗ Edit failed: {e}")
        raise gr.Error(f"Edit failed: {e}") from e
    # Checked outside the try so this error is not re-wrapped by the handler.
    if edited_image is None:
        logger.error("✗ Edit failed: no image in response")
        raise gr.Error("Edit failed")
    logger.info("✓ Edit applied")

    progress(0.3, desc="Creating new mask...")
    gray = edited_image.convert("L")

    progress(0.4, desc="Running SAM-3D (1-2 min)...")
    try:
        ply_bytes, glb_bytes = run_sam3d(edited_image, gray)
    except Exception as e:
        logger.error(f"✗ SAM-3D failed: {e}")
        raise gr.Error(f"SAM-3D failed: {e}") from e

    progress(0.9, desc="Saving outputs...")
    paths, ply_path, glb_path = _write_outputs(
        {"edited.png": edited_image, "mask.png": gray},
        ply_bytes,
        glb_bytes,
    )
    nobg_path = paths["edited.png"]
    mask_path = paths["mask.png"]

    new_edit_count = edit_count + 1

    # Save logs WITH request info
    images_dict = {
        "edited": nobg_path,
        "mask": mask_path
    }
    metadata = {
        "type": "ui_edit",
        "edit_number": new_edit_count,
        "has_glb": glb_path is not None
    }
    request_info = {
        "ip": client_ip,
        "user_agent": user_agent
    }
    save_generation_log(edit_prompt, images_dict, metadata, request_info)

    progress(1.0, desc=f"Edit #{new_edit_count} complete!")
    logger.info(f"✓ Edit #{new_edit_count} complete!")

    return (
        nobg_path,
        mask_path,
        glb_path or ply_path,  # viewer prefers the GLB mesh when available
        glb_path,
        ply_path,
        edited_image,
        new_edit_count,
    )


# ============================================================
# MCP TOOL INTERFACES
# ============================================================

generate_tool = gr.Interface(
    fn=generate_3d_model,
    inputs=gr.Textbox(label="Prompt", placeholder="A red sports car"),
    outputs=gr.Textbox(label="Result (JSON)"),
    api_name="generate_3d",
    title="Generate 3D Model",
    description="Generate a 3D model from a text description"
)

edit_tool = gr.Interface(
    fn=edit_3d_model,
    inputs=[
        gr.Textbox(label="Edit Prompt", placeholder="Remove the wings"),
        gr.Textbox(label="Transparent Image Path", placeholder="/path/to/transparent.png")
    ],
    outputs=gr.Textbox(label="Result (JSON)"),
    api_name="edit_3d",
    title="Edit 3D Model",
    description="Edit an existing 3D model"
)

# ============================================================
# MAIN UI
# ============================================================

with gr.Blocks() as main_ui:
    # Per-session state: latest transparent image and number of edits applied.
    current_image_state = gr.State(None)
    edit_count_state = gr.State(0)

    gr.Markdown("""
    # 🎨 Text to 3D Model (MCP Server)
    ### Powered by Gemini + SAM-3D Objects

    **This app is also an MCP Server!** Claude Desktop, Cursor, and other MCP clients can use the `generate_3d` and `edit_3d` tools.

    ⏱️ *Generation takes 1-2 minutes. First run may take longer as the model warms up.*

    📍 *Note: IP addresses are logged for analytics.*
    """)

    gr.Markdown("## 1️⃣ Generate Initial 3D Model")
    with gr.Row():
        with gr.Column(scale=2):
            prompt_input = gr.Textbox(label="Text Prompt", placeholder="A plane with eagle wings", lines=2)
        with gr.Column(scale=1):
            generate_btn = gr.Button("🚀 Generate", variant="primary", size="lg")

    gr.Examples(
        examples=["A plane with eagle wings", "A wooden chair", "A red sports car", "A ceramic coffee mug", "A robot dog"],
        inputs=prompt_input
    )

    gr.Markdown("## 2️⃣ Edit Your Model")
    with gr.Row():
        with gr.Column(scale=2):
            edit_input = gr.Textbox(label="Edit Prompt", placeholder="Remove the wings", lines=2)
        with gr.Column(scale=1):
            edit_btn = gr.Button("✏️ Apply Edit", variant="secondary", size="lg")

    edit_counter = gr.Markdown("*No edits yet*")

    gr.Examples(
        examples=["Remove the wings", "Change color to blue", "Add racing stripes", "Make it larger", "Add wheels"],
        inputs=edit_input
    )

    gr.Markdown("## 📸 Images")
    with gr.Row():
        original_output = gr.Image(label="1. Original", type="filepath")
        nobg_output = gr.Image(label="2. Transparent", type="filepath")
        mask_output = gr.Image(label="3. Mask", type="filepath")

    gr.Markdown("## 🎮 3D Model")
    model_output = gr.Model3D(label="Interactive 3D Model (drag to rotate)", clear_color=[0.1, 0.1, 0.1, 1.0])

    gr.Markdown("## 📥 Downloads")
    with gr.Row():
        glb_download = gr.File(label="GLB (mesh)")
        ply_download = gr.File(label="PLY (splat)")

    gr.Markdown("""
    ---
    ## 🔌 MCP Server Info

    This app exposes two MCP tools: `generate_3d` and `edit_3d`

    **Connect via:** `https://YOUR-SPACE.hf.space/gradio_api/mcp/sse`

    ---
    **Built for [MCP 1st Birthday Hackathon](https://huggingface.co/MCP-1st-Birthday)** 🎂
    """)

    def update_counter(count):
        """Render the edit counter label for the given number of edits."""
        return "*No edits yet*" if count == 0 else f"**Edits applied: {count}**"

    generate_btn.click(
        fn=generate_3d_ui,
        inputs=[prompt_input],
        outputs=[original_output, nobg_output, mask_output, model_output, glb_download, ply_download, current_image_state, edit_count_state]
    ).then(fn=update_counter, inputs=[edit_count_state], outputs=[edit_counter])

    edit_btn.click(
        fn=edit_3d_ui,
        inputs=[edit_input, current_image_state, edit_count_state],
        outputs=[nobg_output, mask_output, model_output, glb_download, ply_download, current_image_state, edit_count_state]
    ).then(fn=update_counter, inputs=[edit_count_state], outputs=[edit_counter])

# ============================================================
# COMBINE UI + MCP TOOLS
# ============================================================

demo = gr.TabbedInterface(
    interface_list=[main_ui, generate_tool, edit_tool],
    tab_names=["🎨 Interactive UI", "🔧 Generate Tool", "✏️ Edit Tool"],
    title="Text to 3D | MCP Server"
)

if __name__ == "__main__":
    logger.info("=== Starting Text-to-3D MCP Server ===")
    demo.launch(mcp_server=True)