import base64 import json import os import random import time from io import BytesIO from typing import Optional, Tuple import gradio as gr import numpy as np import requests import spaces import torch from PIL import Image from fibo_edit_pipeline import BriaFiboEditPipeline from utils import AngleInstruction # --- Configuration --- device = "cuda" if torch.cuda.is_available() else "cpu" # Run locally or on HuggingFace Spaces RUN_LOCAL = False # Model paths BASE_CHECKPOINT = "briaai/FIBO-Edit" # HuggingFace model ID LORA_CHECKPOINT = "briaai/fibo_edit_multi_angle_full_0121_full_1k" # HuggingFace LoRA model ID # BRIA API configuration BRIA_API_URL = "https://engine.prod.bria-api.com/v2/structured_prompt/generate/pro" BRIA_API_TOKEN = os.environ.get("BRIA_API_TOKEN") if not BRIA_API_TOKEN: raise ValueError( "BRIA_API_TOKEN environment variable is not set. " "Please add it as a HuggingFace Space secret." ) # Generation defaults DEFAULT_NUM_INFERENCE_STEPS = 50 DEFAULT_GUIDANCE_SCALE = 3.5 DEFAULT_SEED = 100050 MAX_SEED = np.iinfo(np.int32).max print("🚀 Starting Fibo Edit Multi-Angle LoRA Gradio App") print(f"Device: {device}") print(f"Base checkpoint: {BASE_CHECKPOINT}") print(f"LoRA checkpoint: {LORA_CHECKPOINT}") # --- Helper Functions --- def load_pipeline_fiboedit( checkpoint: str, lora_checkpoint: Optional[str] = None, lora_scale: Optional[float] = None, fuse_lora: bool = True, ): """ Load the Fibo Edit pipeline using BriaFiboEditPipeline with optional LoRA weights. 
Args: checkpoint: HuggingFace model ID for base model lora_checkpoint: Optional HuggingFace model ID for LoRA weights lora_scale: Scale for LoRA weights when fusing (default None = 1.0) fuse_lora: Whether to fuse LoRA into base weights (default True) Returns: Loaded BriaFiboEditPipeline """ print(f"Loading BriaFiboEditPipeline from {checkpoint}") if lora_checkpoint: print(f" with LoRA from {lora_checkpoint}") # Load pipeline from HuggingFace print("Loading pipeline...") pipe = BriaFiboEditPipeline.from_pretrained( checkpoint, torch_dtype=torch.bfloat16, ) pipe.to("cuda") print(f" Pipeline loaded from {checkpoint}") # Load LoRA weights if provided (PEFT format) if lora_checkpoint: print(f"Loading PEFT LoRA from {lora_checkpoint}...") from peft import PeftModel print(" Loading PEFT adapter onto transformer...") pipe.transformer = PeftModel.from_pretrained( pipe.transformer, lora_checkpoint, ) print(" PEFT adapter loaded successfully") if fuse_lora: print(" Merging LoRA into base weights...") if hasattr(pipe.transformer, "merge_and_unload"): pipe.transformer = pipe.transformer.merge_and_unload() print(" LoRA merged and unloaded") else: print(" [WARN] transformer.merge_and_unload() not available") print("✅ Pipeline loaded successfully!") return pipe def generate_structured_caption( image: Image.Image, prompt: str, seed: int = 1 ) -> Optional[dict]: """Generate structured caption using BRIA API.""" buffered = BytesIO() image.save(buffered, format="PNG") image_bytes = base64.b64encode(buffered.getvalue()).decode("utf-8") payload = { "seed": seed, "sync": True, "images": [image_bytes], "prompt": prompt, } headers = { "Content-Type": "application/json", "api_token": BRIA_API_TOKEN, } max_retries = 3 for attempt in range(max_retries): try: response = requests.post( BRIA_API_URL, json=payload, headers=headers, timeout=60 ) response.raise_for_status() data = response.json() structured_prompt_str = data["result"]["structured_prompt"] return json.loads(structured_prompt_str) 
except Exception as e: if attempt == max_retries - 1: print(f"Failed to generate structured caption: {e}") return None time.sleep(3) return None # --- Model Loading --- print("Loading Fibo Edit pipeline...") try: pipe = load_pipeline_fiboedit( checkpoint=BASE_CHECKPOINT, lora_checkpoint=LORA_CHECKPOINT, lora_scale=None, fuse_lora=True, ) if torch.cuda.is_available(): mem_allocated = torch.cuda.memory_allocated(0) / 1024**3 print(f" GPU memory allocated: {mem_allocated:.2f} GB") except Exception as e: print(f"❌ Error loading pipeline: {e}") import traceback traceback.print_exc() raise def build_camera_prompt( rotate_deg: float = 0.0, zoom: float = 0.0, vertical_tilt: float = 0.0 ) -> str: """Build a natural language camera instruction from parameters.""" # Create AngleInstruction from camera parameters angle_instruction = AngleInstruction.from_camera_params( rotation=rotate_deg, tilt=vertical_tilt, zoom=zoom ) # Generate natural language description view_map = { "back view": "view from the opposite side", "back-left quarter view": "rotate 135 degrees left", "back-right quarter view": "rotate 135 degrees right", "front view": "keep the front view", "front-left quarter view": "rotate 45 degrees left", "front-right quarter view": "rotate 45 degrees right", "left side view": "rotate 90 degrees left", "right side view": "rotate 90 degrees right", } shot_map = { "elevated shot": "with an elevated viewing angle", "eye-level shot": "with an eye-level viewing angle", "high-angle shot": "with a high-angle viewing angle", "low-angle shot": "with a low-angle viewing angle", } zoom_map = { "close-up": "and make it a close-up shot", "medium shot": "", # Omit medium shot "wide shot": "and make it a wide shot", } view_text = view_map[angle_instruction.view.value] shot_text = shot_map[angle_instruction.shot.value] zoom_text = zoom_map[angle_instruction.zoom.value] # Construct the natural language prompt starting with "Change the viewing angle" parts = [view_text, shot_text] if 
zoom_text: # Only add zoom if not empty (medium shot is omitted) parts.append(zoom_text) natural_prompt = "Change the viewing angle: " + ", ".join(parts) return natural_prompt, angle_instruction def fetch_structured_caption( image: Optional[Image.Image] = None, rotate_deg: float = 0.0, zoom: float = 0.0, vertical_tilt: float = 0.0, seed: int = 0, randomize_seed: bool = True, prev_output: Optional[Image.Image] = None, ) -> Tuple[int, str, dict, Image.Image]: """Fetch structured caption from BRIA API.""" # Build natural language prompt and angle instruction natural_prompt, angle_instruction = build_camera_prompt( rotate_deg, zoom, vertical_tilt ) print(f"Natural Language Prompt: {natural_prompt}") print(f"Angle Instruction: {str(angle_instruction)}") if randomize_seed: seed = random.randint(0, MAX_SEED) # Get input image if image is not None: if isinstance(image, Image.Image): input_image = image.convert("RGB") elif hasattr(image, "name"): input_image = Image.open(image.name).convert("RGB") else: input_image = image elif prev_output: input_image = prev_output.convert("RGB") else: raise gr.Error("Please upload an image first.") # Generate structured caption using BRIA API print("Generating structured caption from BRIA API...") structured_caption = generate_structured_caption( input_image, natural_prompt, seed=seed ) if structured_caption is None: raise gr.Error("Failed to generate structured caption from BRIA API") # Replace edit_instruction with angle instruction string structured_caption["edit_instruction"] = str(angle_instruction) print( f"Structured caption received: {json.dumps(structured_caption, ensure_ascii=False)}" ) return seed, natural_prompt, structured_caption, input_image @spaces.GPU(duration=240) def generate_image_from_caption( input_image: Image.Image, structured_caption: dict, seed: int, guidance_scale: float = 3.5, num_inference_steps: int = 50, ) -> Image.Image: """Generate image using Fibo Edit pipeline with structured caption.""" 
    # Serialize the structured caption to a JSON string prompt for the pipeline.
    structured_prompt = json.dumps(structured_caption, ensure_ascii=False)
    print("Generating image with structured prompt...")
    # Seeded generator so the result is reproducible for a given seed.
    generator = torch.Generator(device=device).manual_seed(seed)
    result = pipe(
        image=input_image,
        prompt=structured_prompt,
        guidance_scale=guidance_scale,
        num_inference_steps=num_inference_steps,
        generator=generator,
        num_images_per_prompt=1,
    ).images[0]
    return result


# --- 3D Camera Control Component ---
# Using gr.HTML directly with templates (Gradio 6 style)
# NOTE(review): the HTML markup of this template appears truncated in this
# file — only the control labels remain. Confirm against the original source.
CAMERA_3D_HTML_TEMPLATE = """
Rotation (↔)
Vertical Tilt (↕)
Distance/Zoom
"""

# Client-side three.js scene: draggable rotation / tilt / zoom handles around
# a target image plane. Reads and writes `props.value` and fires 'change'
# once a drag is released (values snapped to the discrete steps below).
CAMERA_3D_JS = """
(() => {
    const wrapper = element.querySelector('#camera-control-wrapper');
    const promptOverlay = element.querySelector('#prompt-overlay');
    const initScene = () => {
        if (typeof THREE === 'undefined') { setTimeout(initScene, 100); return; }
        const scene = new THREE.Scene();
        scene.background = new THREE.Color(0x1a1a1a);
        const camera = new THREE.PerspectiveCamera(50, wrapper.clientWidth / wrapper.clientHeight, 0.1, 1000);
        camera.position.set(4, 3, 4);
        camera.lookAt(0, 0.75, 0);
        const renderer = new THREE.WebGLRenderer({ antialias: true });
        renderer.setSize(wrapper.clientWidth, wrapper.clientHeight);
        renderer.setPixelRatio(Math.min(window.devicePixelRatio, 2));
        wrapper.insertBefore(renderer.domElement, wrapper.firstChild);
        scene.add(new THREE.AmbientLight(0xffffff, 0.6));
        const dirLight = new THREE.DirectionalLight(0xffffff, 0.6);
        dirLight.position.set(5, 10, 5);
        scene.add(dirLight);
        scene.add(new THREE.GridHelper(6, 12, 0x333333, 0x222222));
        const CENTER = new THREE.Vector3(0, 0.75, 0);
        const BASE_DISTANCE = 2.0;
        const ROTATION_RADIUS = 2.2;
        const TILT_RADIUS = 1.6;
        let rotateDeg = props.value?.rotate_deg || 0;
        let zoom = props.value?.zoom || 5.0;
        let verticalTilt = props.value?.vertical_tilt || 0;
        const rotateSteps = [-180, -135, -90, -45, 0, 45, 90, 135, 180];
        const zoomSteps = [0, 5, 10];
        const tiltSteps = [-1, -0.5, 0, 0.5, 1];
        function snapToNearest(value, steps) {
            return steps.reduce((prev, curr) => Math.abs(curr - value) < Math.abs(prev - value) ? curr : prev);
        }
        function createPlaceholderTexture() {
            const canvas = document.createElement('canvas');
            canvas.width = 256;
            canvas.height = 256;
            const ctx = canvas.getContext('2d');
            ctx.fillStyle = '#3a3a4a';
            ctx.fillRect(0, 0, 256, 256);
            ctx.fillStyle = '#ffcc99';
            ctx.beginPath();
            ctx.arc(128, 128, 80, 0, Math.PI * 2);
            ctx.fill();
            ctx.fillStyle = '#333';
            ctx.beginPath();
            ctx.arc(100, 110, 10, 0, Math.PI * 2);
            ctx.arc(156, 110, 10, 0, Math.PI * 2);
            ctx.fill();
            ctx.strokeStyle = '#333';
            ctx.lineWidth = 3;
            ctx.beginPath();
            ctx.arc(128, 130, 35, 0.2, Math.PI - 0.2);
            ctx.stroke();
            return new THREE.CanvasTexture(canvas);
        }
        let currentTexture = createPlaceholderTexture();
        const planeMaterial = new THREE.MeshBasicMaterial({ map: currentTexture, side: THREE.DoubleSide });
        let targetPlane = new THREE.Mesh(new THREE.PlaneGeometry(1.2, 1.2), planeMaterial);
        targetPlane.position.copy(CENTER);
        scene.add(targetPlane);
        function updateTextureFromUrl(url) {
            if (!url) {
                planeMaterial.map = createPlaceholderTexture();
                planeMaterial.needsUpdate = true;
                scene.remove(targetPlane);
                targetPlane = new THREE.Mesh(new THREE.PlaneGeometry(1.2, 1.2), planeMaterial);
                targetPlane.position.copy(CENTER);
                scene.add(targetPlane);
                return;
            }
            const loader = new THREE.TextureLoader();
            loader.crossOrigin = 'anonymous';
            loader.load(url, (texture) => {
                texture.minFilter = THREE.LinearFilter;
                texture.magFilter = THREE.LinearFilter;
                planeMaterial.map = texture;
                planeMaterial.needsUpdate = true;
                const img = texture.image;
                if (img && img.width && img.height) {
                    const aspect = img.width / img.height;
                    const maxSize = 1.4;
                    let planeWidth, planeHeight;
                    if (aspect > 1) { planeWidth = maxSize; planeHeight = maxSize / aspect; }
                    else { planeHeight = maxSize; planeWidth = maxSize * aspect; }
                    scene.remove(targetPlane);
                    targetPlane = new THREE.Mesh(new THREE.PlaneGeometry(planeWidth, planeHeight), planeMaterial);
                    targetPlane.position.copy(CENTER);
                    scene.add(targetPlane);
                }
            });
        }
        if (props.imageUrl) {
            updateTextureFromUrl(props.imageUrl);
        }
        const cameraGroup = new THREE.Group();
        const bodyMat = new THREE.MeshStandardMaterial({ color: 0x6699cc, metalness: 0.5, roughness: 0.3 });
        const body = new THREE.Mesh(new THREE.BoxGeometry(0.28, 0.2, 0.35), bodyMat);
        cameraGroup.add(body);
        const lens = new THREE.Mesh(
            new THREE.CylinderGeometry(0.08, 0.1, 0.16, 16),
            new THREE.MeshStandardMaterial({ color: 0x6699cc, metalness: 0.5, roughness: 0.3 })
        );
        lens.rotation.x = Math.PI / 2;
        lens.position.z = 0.24;
        cameraGroup.add(lens);
        scene.add(cameraGroup);
        const rotationArcPoints = [];
        for (let i = 0; i <= 64; i++) {
            const angle = THREE.MathUtils.degToRad((360 * i / 64));
            rotationArcPoints.push(new THREE.Vector3(ROTATION_RADIUS * Math.sin(angle), 0.05, ROTATION_RADIUS * Math.cos(angle)));
        }
        const rotationCurve = new THREE.CatmullRomCurve3(rotationArcPoints);
        const rotationArc = new THREE.Mesh(
            new THREE.TubeGeometry(rotationCurve, 64, 0.035, 8, true),
            new THREE.MeshStandardMaterial({ color: 0x00ff88, emissive: 0x00ff88, emissiveIntensity: 0.3 })
        );
        scene.add(rotationArc);
        const rotationHandle = new THREE.Mesh(
            new THREE.SphereGeometry(0.16, 16, 16),
            new THREE.MeshStandardMaterial({ color: 0x00ff88, emissive: 0x00ff88, emissiveIntensity: 0.5 })
        );
        rotationHandle.userData.type = 'rotation';
        scene.add(rotationHandle);
        const tiltArcPoints = [];
        for (let i = 0; i <= 32; i++) {
            const angle = THREE.MathUtils.degToRad(-45 + (90 * i / 32));
            tiltArcPoints.push(new THREE.Vector3(-0.7, TILT_RADIUS * Math.sin(angle) + CENTER.y, TILT_RADIUS * Math.cos(angle)));
        }
        const tiltCurve = new THREE.CatmullRomCurve3(tiltArcPoints);
        const tiltArc = new THREE.Mesh(
            new THREE.TubeGeometry(tiltCurve, 32, 0.035, 8, false),
            new THREE.MeshStandardMaterial({ color: 0xff69b4, emissive: 0xff69b4, emissiveIntensity: 0.3 })
        );
        scene.add(tiltArc);
        const tiltHandle = new THREE.Mesh(
            new THREE.SphereGeometry(0.16, 16, 16),
            new THREE.MeshStandardMaterial({ color: 0xff69b4, emissive: 0xff69b4, emissiveIntensity: 0.5 })
        );
        tiltHandle.userData.type = 'tilt';
        scene.add(tiltHandle);
        const distanceLineGeo = new THREE.BufferGeometry();
        const distanceLine = new THREE.Line(distanceLineGeo, new THREE.LineBasicMaterial({ color: 0xffa500 }));
        scene.add(distanceLine);
        const distanceHandle = new THREE.Mesh(
            new THREE.SphereGeometry(0.16, 16, 16),
            new THREE.MeshStandardMaterial({ color: 0xffa500, emissive: 0xffa500, emissiveIntensity: 0.5 })
        );
        distanceHandle.userData.type = 'distance';
        scene.add(distanceHandle);
        function buildPromptText(rot, zoomVal, tilt) {
            const parts = [];
            if (rot !== 0) {
                const dir = rot > 0 ? 'right' : 'left';
                parts.push('Rotate ' + Math.abs(rot) + '° ' + dir);
            }
            if (zoomVal >= 6.66) parts.push('Close-up');
            else if (zoomVal >= 3.33) parts.push('Medium shot');
            else parts.push('Wide angle');
            if (tilt >= 0.66) parts.push("High angle");
            else if (tilt >= 0.33) parts.push("Elevated");
            else if (tilt <= -0.33) parts.push("Low angle");
            else parts.push("Eye level");
            return parts.length > 0 ? parts.join(' • ') : 'No camera movement';
        }
        function updatePositions() {
            const rotRad = THREE.MathUtils.degToRad(rotateDeg);
            // Map zoom 0-10 to distance: zoom 0 = far (3.0), zoom 10 = close (1.0)
            const distance = 3.0 - (zoom / 10) * 2.0;
            const tiltAngle = verticalTilt * 35;
            const tiltRad = THREE.MathUtils.degToRad(tiltAngle);
            const camX = distance * Math.sin(rotRad) * Math.cos(tiltRad);
            const camY = distance * Math.sin(tiltRad) + CENTER.y;
            const camZ = distance * Math.cos(rotRad) * Math.cos(tiltRad);
            cameraGroup.position.set(camX, camY, camZ);
            cameraGroup.lookAt(CENTER);
            rotationHandle.position.set(ROTATION_RADIUS * Math.sin(rotRad), 0.05, ROTATION_RADIUS * Math.cos(rotRad));
            const tiltHandleAngle = THREE.MathUtils.degToRad(tiltAngle);
            tiltHandle.position.set(-0.7, TILT_RADIUS * Math.sin(tiltHandleAngle) + CENTER.y, TILT_RADIUS * Math.cos(tiltHandleAngle));
            const handleDist = distance - 0.4;
            distanceHandle.position.set(
                handleDist * Math.sin(rotRad) * Math.cos(tiltRad),
                handleDist * Math.sin(tiltRad) + CENTER.y,
                handleDist * Math.cos(rotRad) * Math.cos(tiltRad)
            );
            distanceLineGeo.setFromPoints([cameraGroup.position.clone(), CENTER.clone()]);
            promptOverlay.textContent = buildPromptText(rotateDeg, zoom, verticalTilt);
        }
        function updatePropsAndTrigger() {
            const rotSnap = snapToNearest(rotateDeg, rotateSteps);
            const zoomSnap = snapToNearest(zoom, zoomSteps);
            const tiltSnap = snapToNearest(verticalTilt, tiltSteps);
            props.value = { rotate_deg: rotSnap, zoom: zoomSnap, vertical_tilt: tiltSnap };
            trigger('change', props.value);
        }
        const raycaster = new THREE.Raycaster();
        const mouse = new THREE.Vector2();
        let isDragging = false;
        let dragTarget = null;
        let dragStartMouse = new THREE.Vector2();
        let dragStartZoom = 0;
        const intersection = new THREE.Vector3();
        const canvas = renderer.domElement;
        canvas.addEventListener('mousedown', (e) => {
            const rect = canvas.getBoundingClientRect();
            mouse.x = ((e.clientX - rect.left) / rect.width) * 2 - 1;
            mouse.y = -((e.clientY - rect.top) / rect.height) * 2 + 1;
            raycaster.setFromCamera(mouse, camera);
            const intersects = raycaster.intersectObjects([rotationHandle, tiltHandle, distanceHandle]);
            if (intersects.length > 0) {
                isDragging = true;
                dragTarget = intersects[0].object;
                dragTarget.material.emissiveIntensity = 1.0;
                dragTarget.scale.setScalar(1.3);
                dragStartMouse.copy(mouse);
                dragStartZoom = zoom;
                canvas.style.cursor = 'grabbing';
            }
        });
        canvas.addEventListener('mousemove', (e) => {
            const rect = canvas.getBoundingClientRect();
            mouse.x = ((e.clientX - rect.left) / rect.width) * 2 - 1;
            mouse.y = -((e.clientY - rect.top) / rect.height) * 2 + 1;
            if (isDragging && dragTarget) {
                raycaster.setFromCamera(mouse, camera);
                if (dragTarget.userData.type === 'rotation') {
                    const plane = new THREE.Plane(new THREE.Vector3(0, 1, 0), -0.05);
                    if (raycaster.ray.intersectPlane(plane, intersection)) {
                        let angle = THREE.MathUtils.radToDeg(Math.atan2(intersection.x, intersection.z));
                        rotateDeg = THREE.MathUtils.clamp(angle, -180, 180);
                    }
                }
                else if (dragTarget.userData.type === 'tilt') {
                    const plane = new THREE.Plane(new THREE.Vector3(1, 0, 0), 0.7);
                    if (raycaster.ray.intersectPlane(plane, intersection)) {
                        const relY = intersection.y - CENTER.y;
                        const relZ = intersection.z;
                        const angle = THREE.MathUtils.radToDeg(Math.atan2(relY, relZ));
                        verticalTilt = THREE.MathUtils.clamp(angle / 35, -1, 1);
                    }
                } else if (dragTarget.userData.type === 'distance') {
                    const deltaY = mouse.y - dragStartMouse.y;
                    zoom = THREE.MathUtils.clamp(dragStartZoom + deltaY * 20, 0, 10);
                }
                updatePositions();
            } else {
                raycaster.setFromCamera(mouse, camera);
                const intersects = raycaster.intersectObjects([rotationHandle, tiltHandle, distanceHandle]);
                [rotationHandle, tiltHandle, distanceHandle].forEach(h => { h.material.emissiveIntensity = 0.5; h.scale.setScalar(1); });
                if (intersects.length > 0) {
                    intersects[0].object.material.emissiveIntensity = 0.8;
                    intersects[0].object.scale.setScalar(1.1);
                    canvas.style.cursor = 'grab';
                } else {
                    canvas.style.cursor = 'default';
                }
            }
        });
        const onMouseUp = () => {
            if (dragTarget) {
                dragTarget.material.emissiveIntensity = 0.5;
                dragTarget.scale.setScalar(1);
                const targetRot = snapToNearest(rotateDeg, rotateSteps);
                const targetZoom = snapToNearest(zoom, zoomSteps);
                const targetTilt = snapToNearest(verticalTilt, tiltSteps);
                const startRot = rotateDeg, startZoom = zoom, startTilt = verticalTilt;
                const startTime = Date.now();
                function animateSnap() {
                    const t = Math.min((Date.now() - startTime) / 200, 1);
                    const ease = 1 - Math.pow(1 - t, 3);
                    rotateDeg = startRot + (targetRot - startRot) * ease;
                    zoom = startZoom + (targetZoom - startZoom) * ease;
                    verticalTilt = startTilt + (targetTilt - startTilt) * ease;
                    updatePositions();
                    if (t < 1) requestAnimationFrame(animateSnap);
                    else updatePropsAndTrigger();
                }
                animateSnap();
            }
            isDragging = false;
            dragTarget = null;
            canvas.style.cursor = 'default';
        };
        canvas.addEventListener('mouseup', onMouseUp);
        canvas.addEventListener('mouseleave', onMouseUp);
        canvas.addEventListener('touchstart', (e) => {
            e.preventDefault();
            const touch = e.touches[0];
            const rect = canvas.getBoundingClientRect();
            mouse.x = ((touch.clientX - rect.left) / rect.width) * 2 - 1;
            mouse.y = -((touch.clientY - rect.top) / rect.height) * 2 + 1;
            raycaster.setFromCamera(mouse, camera);
            const intersects = raycaster.intersectObjects([rotationHandle, tiltHandle, distanceHandle]);
            if (intersects.length > 0) {
                isDragging = true;
                dragTarget = intersects[0].object;
                dragTarget.material.emissiveIntensity = 1.0;
                dragTarget.scale.setScalar(1.3);
                dragStartMouse.copy(mouse);
                dragStartZoom = zoom;
            }
        }, { passive: false });
        canvas.addEventListener('touchmove', (e) => {
            e.preventDefault();
            const touch = e.touches[0];
            const rect = canvas.getBoundingClientRect();
            mouse.x = ((touch.clientX - rect.left) / rect.width) * 2 - 1;
            mouse.y = -((touch.clientY - rect.top) / rect.height) * 2 + 1;
            if (isDragging && dragTarget) {
                raycaster.setFromCamera(mouse, camera);
                if (dragTarget.userData.type === 'rotation') {
                    const plane = new THREE.Plane(new THREE.Vector3(0, 1, 0), -0.05);
                    if (raycaster.ray.intersectPlane(plane, intersection)) {
                        let angle = THREE.MathUtils.radToDeg(Math.atan2(intersection.x, intersection.z));
                        rotateDeg = THREE.MathUtils.clamp(angle, -180, 180);
                    }
                } else if (dragTarget.userData.type === 'tilt') {
                    const plane = new THREE.Plane(new THREE.Vector3(1, 0, 0), 0.7);
                    if (raycaster.ray.intersectPlane(plane, intersection)) {
                        const relY = intersection.y - CENTER.y;
                        const relZ = intersection.z;
                        const angle = THREE.MathUtils.radToDeg(Math.atan2(relY, relZ));
                        verticalTilt = THREE.MathUtils.clamp(angle / 35, -1, 1);
                    }
                } else if (dragTarget.userData.type === 'distance') {
                    const deltaY = mouse.y - dragStartMouse.y;
                    zoom = THREE.MathUtils.clamp(dragStartZoom + deltaY * 20, 0, 10);
                }
                updatePositions();
            }
        }, { passive: false });
        canvas.addEventListener('touchend', (e) => { e.preventDefault(); onMouseUp(); }, { passive: false });
        canvas.addEventListener('touchcancel', (e) =>
            { e.preventDefault(); onMouseUp(); }, { passive: false });
        updatePositions();
        function render() {
            requestAnimationFrame(render);
            renderer.render(scene, camera);
        }
        render();
        new ResizeObserver(() => {
            camera.aspect = wrapper.clientWidth / wrapper.clientHeight;
            camera.updateProjectionMatrix();
            renderer.setSize(wrapper.clientWidth, wrapper.clientHeight);
        }).observe(wrapper);
        wrapper._updateTexture = updateTextureFromUrl;
        let lastImageUrl = props.imageUrl;
        let lastValue = JSON.stringify(props.value);
        setInterval(() => {
            if (props.imageUrl !== lastImageUrl) {
                lastImageUrl = props.imageUrl;
                updateTextureFromUrl(props.imageUrl);
            }
            const currentValue = JSON.stringify(props.value);
            if (currentValue !== lastValue) {
                lastValue = currentValue;
                if (props.value && typeof props.value === 'object') {
                    rotateDeg = props.value.rotate_deg ?? rotateDeg;
                    zoom = props.value.zoom ?? zoom;
                    verticalTilt = props.value.vertical_tilt ?? verticalTilt;
                    updatePositions();
                }
            }
        }, 100);
    };
    initScene();
})();
"""


def create_camera_3d_component(value=None, imageUrl=None, **kwargs):
    """Create a 3D camera control component using gr.HTML."""
    # Default camera state mirrors the JS-side fallbacks (front view,
    # medium zoom, no tilt).
    if value is None:
        value = {"rotate_deg": 0, "zoom": 5.0, "vertical_tilt": 0}
    return gr.HTML(
        value=value,
        html_template=CAMERA_3D_HTML_TEMPLATE,
        js_on_load=CAMERA_3D_JS,
        imageUrl=imageUrl,
        **kwargs,
    )


# --- UI ---
# Citrus theme: CSS custom-property overrides for the Gradio app
# (light palette in :root, dark palette in :root.dark).
css = """
:root {
    --name: citrus;
    --primary-50: #fffbeb; --primary-100: #fef3c7; --primary-200: #fde68a;
    --primary-300: #fcd34d; --primary-400: #fbbf24; --primary-500: #f59e0b;
    --primary-600: #d97706; --primary-700: #b45309; --primary-800: #92400e;
    --primary-900: #78350f; --primary-950: #6c370f;
    --secondary-50: #fffbeb; --secondary-100: #fef3c7; --secondary-200: #fde68a;
    --secondary-300: #fcd34d; --secondary-400: #fbbf24; --secondary-500: #f59e0b;
    --secondary-600: #d97706; --secondary-700: #b45309; --secondary-800: #92400e;
    --secondary-900: #78350f; --secondary-950: #6c370f;
    --neutral-50: #fafaf9; --neutral-100: #f5f5f4; --neutral-200: #e7e5e4;
    --neutral-300: #d6d3d1; --neutral-400: #a8a29e; --neutral-500: #78716c;
    --neutral-600: #57534e; --neutral-700: #44403c; --neutral-800: #292524;
    --neutral-900: #1c1917; --neutral-950: #0f0e0d;
    --spacing-xxs: 2px; --spacing-xs: 4px; --spacing-sm: 6px; --spacing-md: 8px;
    --spacing-lg: 10px; --spacing-xl: 14px; --spacing-xxl: 28px;
    --radius-xxs: 1px; --radius-xs: 2px; --radius-sm: 4px; --radius-md: 6px;
    --radius-lg: 8px; --radius-xl: 12px; --radius-xxl: 22px;
    --text-xxs: 9px; --text-xs: 10px; --text-sm: 12px; --text-md: 14px;
    --text-lg: 16px; --text-xl: 22px; --text-xxl: 26px;
    --font: 'Ubuntu', ui-sans-serif, system-ui, sans-serif;
    --font-mono: 'Roboto Mono', ui-monospace, Consolas, monospace;
    --body-background-fill: var(--background-fill-primary);
    --body-text-color: var(--neutral-800);
    --body-text-size: var(--text-md);
    --body-text-weight: 400;
    --embed-radius: var(--radius-sm);
    --color-accent: var(--primary-500);
    --color-accent-soft: var(--primary-50);
    --background-fill-primary: var(--neutral-50);
    --background-fill-secondary: var(--neutral-50);
    --border-color-accent: var(--primary-300);
    --border-color-primary: var(--neutral-200);
    --link-text-color: var(--secondary-600);
    --link-text-color-active: var(--secondary-600);
    --link-text-color-hover: var(--secondary-700);
    --link-text-color-visited: var(--secondary-500);
    --body-text-color-subdued: var(--neutral-400);
    --accordion-text-color: var(--body-text-color);
    --table-text-color: var(--body-text-color);
    --shadow-drop: rgba(0,0,0,0.05) 0px 1px 2px 0px;
    --shadow-drop-lg: 0 1px 3px 0 rgb(0 0 0 / 0.1), 0 1px 2px -1px rgb(0 0 0 / 0.1);
    --shadow-inset: rgba(0,0,0,0.05) 0px 2px 4px 0px inset;
    --shadow-spread: 3px;
    --block-background-fill: var(--neutral-100);
    --block-border-color: var(--neutral-300);
    --block-border-width: 1px;
    --block-info-text-color: var(--body-text-color-subdued);
    --block-info-text-size: var(--text-sm);
    --block-info-text-weight: 400;
    --block-label-background-fill: var(--background-fill-primary);
    --block-label-border-color: var(--border-color-primary);
    --block-label-border-width: 1px;
    --block-label-shadow: none;
    --block-label-text-color: var(--neutral-500);
    --block-label-margin: 0;
    --block-label-padding: var(--spacing-sm) var(--spacing-lg);
    --block-label-radius: calc(var(--radius-sm) - 1px) 0 calc(var(--radius-sm) - 1px) 0;
    --block-label-right-radius: 0 calc(var(--radius-sm) - 1px) 0 calc(var(--radius-sm) - 1px);
    --block-label-text-size: var(--text-sm);
    --block-label-text-weight: 400;
    --block-padding: var(--spacing-xl) calc(var(--spacing-xl) + 2px);
    --block-radius: var(--radius-sm);
    --block-shadow: 0px 3px 0px 0px var(--neutral-300);
    --block-title-background-fill: none;
    --block-title-border-color: none;
    --block-title-border-width: 0px;
    --block-title-text-color: var(--neutral-500);
    --block-title-padding: 0;
    --block-title-radius: none;
    --block-title-text-size: var(--text-md);
    --block-title-text-weight: 400;
    --container-radius: var(--radius-sm);
    --form-gap-width: 0px;
    --layout-gap: var(--spacing-xxl);
    --panel-background-fill: var(--background-fill-secondary);
    --panel-border-color: var(--border-color-primary);
    --panel-border-width: 1px;
    --section-header-text-size: var(--text-md);
    --section-header-text-weight: 400;
    --border-color-accent-subdued: var(--border-color-accent);
    --code-background-fill: var(--neutral-100);
    --chatbot-text-size: var(--text-lg);
    --checkbox-background-color: var(--background-fill-primary);
    --checkbox-background-color-focus: var(--checkbox-background-color);
    --checkbox-background-color-hover: var(--checkbox-background-color);
    --checkbox-background-color-selected: var(--color-accent);
    --checkbox-border-color: var(--neutral-300);
    --checkbox-border-color-focus: var(--color-accent);
    --checkbox-border-color-hover: var(--neutral-300);
    --checkbox-border-color-selected: var(--color-accent);
    --checkbox-border-radius: var(--radius-sm);
    --checkbox-border-width: var(--input-border-width);
    --checkbox-label-background-fill: var(--neutral-200);
    --checkbox-label-background-fill-hover: var(--checkbox-label-background-fill);
    --checkbox-label-background-fill-selected: var(--primary-400);
    --checkbox-label-border-color: var(--border-color-primary);
    --checkbox-label-border-color-hover: var(--checkbox-label-border-color);
    --checkbox-label-border-color-selected: var(--primary-300);
    --checkbox-label-border-width: 2px;
    --checkbox-label-gap: var(--spacing-lg);
    --checkbox-label-padding: var(--spacing-md) calc(2 * var(--spacing-md));
    --checkbox-label-shadow: none;
    --checkbox-label-text-size: var(--text-md);
    --checkbox-label-text-weight: 400;
    --checkbox-check: url("data:image/svg+xml,%3csvg viewBox='0 0 16 16' fill='white' xmlns='http://www.w3.org/2000/svg'%3e%3cpath d='M12.207 4.793a1 1 0 010 1.414l-5 5a1 1 0 01-1.414 0l-2-2a1 1 0 011.414-1.414L6.5 9.086l4.293-4.293a1 1 0 011.414 0z'/%3e%3c/svg%3e");
    --radio-circle: url("data:image/svg+xml,%3csvg viewBox='0 0 16 16' fill='white' xmlns='http://www.w3.org/2000/svg'%3e%3ccircle cx='8' cy='8' r='3'/%3e%3c/svg%3e");
    --checkbox-shadow: none;
    --checkbox-label-text-color: var(--body-text-color);
    --checkbox-label-text-color-selected: var(--checkbox-label-text-color);
    --error-background-fill: #fef2f2;
    --error-border-color: #b91c1c;
    --error-border-width: 1px;
    --error-text-color: #b91c1c;
    --error-icon-color: #b91c1c;
    --input-background-fill: var(--neutral-50);
    --input-background-fill-focus: var(--primary-50);
    --input-background-fill-hover: var(--input-background-fill);
    --input-border-color: var(--border-color-primary);
    --input-border-color-focus: var(--secondary-300);
    --input-border-color-hover: var(--input-border-color);
    --input-border-width: 1px;
    --input-padding: var(--spacing-xl);
    --input-placeholder-color: var(--neutral-400);
    --input-radius: var(--radius-sm);
    --input-shadow: 0px -1px 0px 0px var(--neutral-300);
    --input-shadow-focus: 0px -1px 0px 0px var(--primary-300);
    --input-text-size: var(--text-md);
    --input-text-weight: 400;
    --loader-color: var(--color-accent);
    --prose-text-size: var(--text-md);
    --prose-text-weight: 400;
    --prose-header-text-weight: 600;
    --slider-color: var(--primary-400);
    --stat-background-fill: var(--primary-300);
    --table-border-color: var(--neutral-300);
    --table-even-background-fill: white;
    --table-odd-background-fill: var(--neutral-50);
    --table-radius: var(--radius-sm);
    --table-row-focus: var(--color-accent-soft);
    --button-border-width: 0px;
    --button-cancel-background-fill: #ef4444;
    --button-cancel-background-fill-hover: #dc2626;
    --button-cancel-border-color: var(--button-secondary-border-color);
    --button-cancel-border-color-hover: var(--button-secondary-border-color-hover);
    --button-cancel-text-color: white;
    --button-cancel-text-color-hover: white;
    --button-cancel-shadow: 0px 3px 0px 0px rgb(248 113 113);
    --button-cancel-shadow-hover: 0px 5px 0px 0px rgb(248 113 113);
    --button-cancel-shadow-active: 0px 2px 0px 0px rgb(248 113 113);
    --button-transform-hover: translateY(-2px);
    --button-transform-active: translateY(1px);
    --button-transition: all 0.1s;
    --button-large-padding: var(--spacing-lg) calc(2 * var(--spacing-lg));
    --button-large-radius: var(--radius-md);
    --button-large-text-size: var(--text-lg);
    --button-large-text-weight: 600;
    --button-primary-background-fill: var(--primary-500);
    --button-primary-background-fill-hover: var(--button-primary-background-fill);
    --button-primary-border-color: var(--primary-500);
    --button-primary-border-color-hover: var(--primary-500);
    --button-primary-text-color: var(--button-secondary-text-color);
    --button-primary-text-color-hover: var(--button-primary-text-color);
    --button-primary-shadow: 0px 3px 0px 0px var(--primary-400);
    --button-primary-shadow-hover: 0px 5px 0px 0px var(--primary-400);
    --button-primary-shadow-active: 0px 2px 0px 0px var(--primary-400);
    --button-secondary-background-fill: var(--primary-400);
    --button-secondary-background-fill-hover: var(--button-secondary-background-fill);
    --button-secondary-border-color: var(--neutral-200);
    --button-secondary-border-color-hover: var(--neutral-200);
    --button-secondary-text-color: black;
    --button-secondary-text-color-hover: var(--button-secondary-text-color);
    --button-secondary-shadow: 0px 3px 0px 0px var(--primary-300);
    --button-secondary-shadow-hover: 0px 5px 0px 0px var(--primary-300);
    --button-secondary-shadow-active: 0px 2px 0px 0px var(--primary-300);
    --button-small-padding: var(--spacing-sm) calc(1.5 * var(--spacing-sm));
    --button-small-radius: var(--radius-md);
    --button-small-text-size: var(--text-sm);
    --button-small-text-weight: 400;
    --button-medium-padding: var(--spacing-md) calc(2 * var(--spacing-md));
    --button-medium-radius: var(--radius-md);
    --button-medium-text-size: var(--text-md);
    --button-medium-text-weight: 600;
}
:root.dark, :root .dark {
    --body-background-fill: var(--background-fill-primary);
    --body-text-color: var(--neutral-100);
    --color-accent-soft: var(--neutral-700);
    --background-fill-primary: var(--neutral-950);
    --background-fill-secondary: var(--neutral-900);
    --border-color-accent: var(--neutral-600);
    --border-color-primary: var(--neutral-700);
    --link-text-color-active: var(--secondary-500);
    --link-text-color: var(--secondary-500);
    --link-text-color-hover: var(--secondary-400);
    --link-text-color-visited: var(--secondary-600);
    --body-text-color-subdued: var(--neutral-400);
    --accordion-text-color: var(--body-text-color);
    --table-text-color: var(--body-text-color);
    --shadow-spread: 1px;
    --block-background-fill: var(--neutral-800);
    --block-border-color: var(--border-color-primary);
    --block-info-text-color: var(--body-text-color-subdued);
    --block-label-background-fill: var(--background-fill-secondary);
    --block-label-border-color: var(--border-color-primary);
    --block-label-text-color: var(--neutral-200);
    --block-shadow: 0px 3px 0px 0px var(--neutral-700);
    --block-title-text-color: var(--neutral-200);
    --panel-background-fill: var(--background-fill-secondary);
    --panel-border-color: var(--border-color-primary);
    --border-color-accent-subdued: var(--border-color-accent);
    --code-background-fill: var(--neutral-800);
    --checkbox-background-color: var(--neutral-400);
    --checkbox-background-color-focus: var(--checkbox-background-color);
    --checkbox-background-color-hover: var(--checkbox-background-color);
    --checkbox-background-color-selected: var(--primary-600);
    --checkbox-border-color: var(--neutral-700);
    --checkbox-border-color-focus: var(--color-accent);
    --checkbox-border-color-hover: var(--neutral-600);
    --checkbox-border-color-selected: var(--color-accent);
    --checkbox-border-width: var(--input-border-width);
    --checkbox-label-background-fill: var(--neutral-700);
    --checkbox-label-background-fill-hover: var(--checkbox-label-background-fill);
    --checkbox-label-background-fill-selected: var(--primary-500);
    --checkbox-label-border-color: var(--border-color-primary);
    --checkbox-label-border-color-hover: var(--checkbox-label-border-color);
    --checkbox-label-border-color-selected: var(--primary-600);
    --checkbox-label-border-width: 2px;
    --checkbox-label-text-color: var(--body-text-color);
    --checkbox-label-text-color-selected: var(--button-primary-text-color);
    --error-background-fill: var(--background-fill-primary);
    --error-border-color: #ef4444;
    --error-text-color: #fef2f2;
    --error-icon-color: #ef4444;
    --input-background-fill: var(--neutral-900);
    --input-background-fill-focus: none;
    --input-background-fill-hover: var(--input-background-fill);
    --input-border-color: var(--border-color-primary);
    --input-border-color-focus: var(--neutral-700);
    --input-border-color-hover: var(--input-border-color);
    --input-placeholder-color: var(--neutral-500);
    --input-shadow: 0px -1px 0px 0px var(--neutral-700);
    --input-shadow-focus: 0px -1px 0px 0px var(--primary-600);
    --slider-color: var(--primary-500);
    --stat-background-fill: var(--primary-500);
    --table-border-color: var(--neutral-700);
    --table-even-background-fill: var(--neutral-950);
    --table-odd-background-fill: var(--neutral-900);
    --table-row-focus: var(--color-accent-soft);
    --button-cancel-background-fill: #b91c1c;
    --button-cancel-background-fill-hover: #991b1b;
    --button-cancel-border-color: var(--button-secondary-border-color);
    --button-cancel-border-color-hover: var(--button-secondary-border-color-hover);
    --button-cancel-text-color: white;
    --button-cancel-text-color-hover: white;
    --button-cancel-shadow: 0px 3px 0px 0px rgb(220 38 38);
    --button-cancel-shadow-hover: 0px 5px 0px 0px rgb(220 38 38);
    --button-cancel-shadow-active: 0px 2px 0px 0px rgb(220 38 38);
    --button-primary-background-fill: var(--primary-600);
    --button-primary-background-fill-hover: var(--button-primary-background-fill);
    --button-primary-border-color: var(--primary-600);
    --button-primary-border-color-hover: var(--primary-500);
    --button-primary-text-color: var(--button-secondary-text-color);
    --button-primary-text-color-hover: var(--button-primary-text-color);
    --button-primary-shadow: 0px 3px 0px 0px var(--primary-700);
    --button-primary-shadow-hover: 0px 5px 0px 0px var(--primary-700);
    --button-primary-shadow-active: 0px 2px 0px 0px var(--primary-700);
    --button-secondary-background-fill: var(--primary-500);
    --button-secondary-background-fill-hover: var(--button-secondary-background-fill);
    --button-secondary-border-color: var(--neutral-600);
    --button-secondary-border-color-hover: var(--neutral-500);
    --button-secondary-text-color: var(--neutral-900);
    --button-secondary-text-color-hover: var(--button-secondary-text-color);
    --button-secondary-shadow: 0px 3px 0px 0px var(--primary-600);
    --button-secondary-shadow-hover: 0px 5px 0px 0px var(--primary-600);
    --button-secondary-shadow-active: 0px 2px 0px 0px var(--primary-600);
}
#col-container { max-width: 1100px; margin: 0 auto; }
.dark .progress-text { color: white !important; }
#camera-3d-control { min-height: 400px; }
#examples { max-width: 1100px; margin: 0 auto; }
.fillable { max-width: 1250px !important; }
"""


def reset_all() -> list:
    """Reset all camera control knobs and flags to their default
    values.
    """
    return [0, 5.0, 0, True]  # rotate_deg, zoom, vertical_tilt, is_reset


def end_reset() -> bool:
    """Mark the end of a reset cycle (clears the hidden ``is_reset`` flag)."""
    return False


def update_dimensions_on_upload(image: Optional[Image.Image]) -> Tuple[int, int]:
    """Compute recommended (width, height) for the output resolution.

    Preserves the input's aspect ratio, caps the longer side at 1024, and
    rounds both sides down to a multiple of 8 (the width/height sliders in
    the UI also step by 8 — presumably a latent-stride requirement of the
    pipeline; confirm against the model).

    Args:
        image: Uploaded PIL image, or None when nothing is uploaded.

    Returns:
        (width, height) tuple; defaults to (1024, 1024) when image is None.
    """
    if image is None:
        return 1024, 1024
    original_width, original_height = image.size
    if original_width > original_height:
        # Landscape: pin width to 1024, scale height proportionally.
        new_width = 1024
        aspect_ratio = original_height / original_width
        new_height = int(new_width * aspect_ratio)
    else:
        # Portrait or square: pin height to 1024, scale width proportionally.
        new_height = 1024
        aspect_ratio = original_width / original_height
        new_width = int(new_height * aspect_ratio)
    # Snap both dimensions down to the nearest multiple of 8.
    new_width = (new_width // 8) * 8
    new_height = (new_height // 8) * 8
    return new_width, new_height


# --- Gradio UI ---
# Layout: left column = input image + 3D camera widget + buttons;
# right column = output image + slider controls + advanced settings.
with gr.Blocks() as demo:
    gr.Markdown("""
    ## 🎬 Fibo Edit — Camera Angle Control
    Fibo Edit with Multi-Angle LoRA for precise camera control ✨

    Control rotation, tilt, and zoom to generate images from any angle 🎥
    ### Fine-tuning data was created by [Lovis](https://huggingface.co/fal/Qwen-Image-Edit-2511-Multiple-Angles-LoRA) and UI by [Apolinario](https://huggingface.co/spaces/multimodalart/qwen-image-multiple-angles-3d-camera)
    """)
    with gr.Row():
        with gr.Column(scale=1):
            image = gr.Image(label="Input Image", type="pil", height=280)
            # Previous generation, kept hidden so it can be fed back as an input.
            prev_output = gr.Image(value=None, visible=False)
            # Hidden flag toggled during the reset cycle (set by reset_all,
            # cleared by end_reset); not read by any visible handler here.
            is_reset = gr.Checkbox(value=False, visible=False)
            # Hidden state to pass processed image between steps
            processed_image = gr.State(None)

            gr.Markdown("### 🎮 3D Camera Control")
            # Custom interactive 3D widget (defined elsewhere in this project);
            # its value mirrors the three sliders below.
            camera_3d = create_camera_3d_component(
                value={"rotate_deg": 0, "zoom": 5.0, "vertical_tilt": 0},
                elem_id="camera-3d-control",
            )
            with gr.Row():
                reset_btn = gr.Button("🔄 Reset", size="sm")
                run_btn = gr.Button("🚀 Generate", variant="primary", size="lg")

        with gr.Column(scale=1):
            result = gr.Image(label="Output Image", interactive=False, height=350)

            gr.Markdown("### 🎚️ Slider Controls")
            rotate_deg = gr.Slider(
                label="Horizontal Rotation (°)",
                minimum=-180,
                maximum=180,
                step=45,
                value=0,
                info="-180/180: back, -90: left, 0: front, 90: right",
            )
            zoom = gr.Slider(
                label="Zoom Level",
                minimum=0,
                maximum=10,
                step=1,
                value=5.0,
                info="0-3.33: wide, 3.33-6.66: medium, 6.66-10: close-up",
            )
            vertical_tilt = gr.Slider(
                label="Vertical Tilt",
                minimum=-1,
                maximum=1,
                step=0.5,
                value=0,
                info="-1: low-angle, 0: eye-level, 1: high-angle",
            )
            # Read-only preview of the camera prompt built from the sliders.
            prompt_preview = gr.Textbox(label="Generated Prompt", interactive=False)

            with gr.Accordion("📋 Structured Caption (BRIA API)", open=False):
                structured_json = gr.JSON(label="JSON Response", container=False)

            with gr.Accordion("⚙️ Advanced Settings", open=False):
                seed = gr.Slider(
                    label="Seed",
                    minimum=0,
                    maximum=MAX_SEED,
                    step=1,
                    value=DEFAULT_SEED,
                )
                randomize_seed = gr.Checkbox(label="Randomize Seed", value=True)
                guidance_scale = gr.Slider(
                    label="Guidance Scale",
                    minimum=1.0,
                    maximum=10.0,
                    step=0.1,
                    value=DEFAULT_GUIDANCE_SCALE,
                )
                num_inference_steps = gr.Slider(
                    label="Inference Steps",
                    minimum=1,
                    maximum=100,
                    step=1,
                    value=DEFAULT_NUM_INFERENCE_STEPS,
                )
                height = gr.Slider(
                    label="Height", minimum=256, maximum=2048, step=8, value=1024
                )
                width = gr.Slider(
                    label="Width", minimum=256, maximum=2048, step=8, value=1024
                )

    # --- Helper Functions ---
    def update_prompt_from_sliders(rotate, zoom_val, tilt):
        """Rebuild the camera prompt preview from the three slider values."""
        prompt, _ = build_camera_prompt(rotate, zoom_val, tilt)
        return prompt

    def sync_3d_to_sliders(camera_value):
        """Push the 3D widget's state onto the sliders and prompt preview.

        Returns no-op updates when the widget value is missing or malformed.
        """
        if camera_value and isinstance(camera_value, dict):
            rot = camera_value.get("rotate_deg", 0)
            zoom_val = camera_value.get("zoom", 5.0)
            tilt = camera_value.get("vertical_tilt", 0)
            prompt, _ = build_camera_prompt(rot, zoom_val, tilt)
            return rot, zoom_val, tilt, prompt
        return gr.update(), gr.update(), gr.update(), gr.update()

    def sync_sliders_to_3d(rotate, zoom_val, tilt):
        """Pack the slider values into the dict the 3D widget expects."""
        return {"rotate_deg": rotate, "zoom": zoom_val, "vertical_tilt": tilt}

    def update_3d_image(img):
        """Embed the uploaded image into the 3D widget as a base64 data URL."""
        if img is None:
            return gr.update(imageUrl=None)
        buffered = BytesIO()
        img.save(buffered, format="PNG")
        img_str = base64.b64encode(buffered.getvalue()).decode()
        data_url = f"data:image/png;base64,{img_str}"
        return gr.update(imageUrl=data_url)

    # --- Event Handlers ---
    # Slider -> Prompt preview
    for slider in [rotate_deg, zoom, vertical_tilt]:
        slider.change(
            fn=update_prompt_from_sliders,
            inputs=[rotate_deg, zoom, vertical_tilt],
            outputs=[prompt_preview],
        )

    # 3D control -> Sliders + Prompt (no auto-inference)
    camera_3d.change(
        fn=sync_3d_to_sliders,
        inputs=[camera_3d],
        outputs=[rotate_deg, zoom, vertical_tilt, prompt_preview],
    )

    # Sliders -> 3D control (no auto-inference); wired on .release so the
    # widget is only updated when the user lets go of the slider handle.
    for slider in [rotate_deg, zoom, vertical_tilt]:
        slider.release(
            fn=sync_sliders_to_3d,
            inputs=[rotate_deg, zoom, vertical_tilt],
            outputs=[camera_3d],
        )

    # Reset: restore slider defaults, pulse is_reset, then re-sync the 3D widget.
    reset_btn.click(
        fn=reset_all,
        inputs=None,
        outputs=[rotate_deg, zoom, vertical_tilt, is_reset],
        queue=False,
    ).then(fn=end_reset, inputs=None, outputs=[is_reset], queue=False).then(
        fn=sync_sliders_to_3d,
        inputs=[rotate_deg, zoom, vertical_tilt],
        outputs=[camera_3d],
    )

    # Generate button - Two-stage process
    # Stage 1: Fetch structured caption from BRIA API and display it immediately
    run_event = run_btn.click(
        fn=fetch_structured_caption,
        inputs=[
            image,
            rotate_deg,
            zoom,
            vertical_tilt,
            seed,
            randomize_seed,
            prev_output,
        ],
        outputs=[seed, prompt_preview, structured_json, processed_image],
    ).then(
        # Stage 2: Generate image with Fibo Edit pipeline
        fn=generate_image_from_caption,
        inputs=[
            processed_image,
            structured_json,
            seed,
            guidance_scale,
            num_inference_steps,
        ],
        outputs=[result],
    )

    # Image upload: recompute output size, reset camera state, then load the
    # new image into the 3D widget.
    image.upload(
        fn=update_dimensions_on_upload, inputs=[image], outputs=[width, height]
    ).then(
        fn=reset_all,
        inputs=None,
        outputs=[rotate_deg, zoom, vertical_tilt, is_reset],
        queue=False,
    ).then(fn=end_reset, inputs=None, outputs=[is_reset], queue=False).then(
        fn=update_3d_image, inputs=[image], outputs=[camera_3d]
    )

    # Clearing the input also clears the texture shown in the 3D widget.
    image.clear(fn=lambda: gr.update(imageUrl=None), outputs=[camera_3d])

    # After generation, stash the result as prev_output so the next run can
    # reference it.
    run_event.then(lambda img, *_: img, inputs=[result], outputs=[prev_output])

    # Examples - Commenting out for now since we need actual example images
    # Note: With the two-stage inference process, examples would need custom handling
    # to properly chain fetch_structured_caption -> generate_image_from_caption

    # Sync 3D component when sliders change (covers example loading)
    def sync_3d_on_slider_change(img, rot, zoom_val, tilt):
        """Mirror slider state into the 3D widget, re-attaching the image if set."""
        camera_value = {"rotate_deg": rot, "zoom": zoom_val, "vertical_tilt": tilt}
        if img is not None:
            buffered = BytesIO()
            img.save(buffered, format="PNG")
            img_str = base64.b64encode(buffered.getvalue()).decode()
            data_url = f"data:image/png;base64,{img_str}"
            return gr.update(value=camera_value, imageUrl=data_url)
        return gr.update(value=camera_value)

    # When any slider value changes (including from examples), sync the 3D component
    for slider in [rotate_deg, zoom, vertical_tilt]:
        slider.change(
            fn=sync_3d_on_slider_change,
            inputs=[image, rotate_deg, zoom, vertical_tilt],
            outputs=[camera_3d],
        )

    # API endpoints for the two-stage inference process
    gr.api(fetch_structured_caption, api_name="fetch_caption")
    gr.api(generate_image_from_caption, api_name="generate_image")


if __name__ == "__main__":
    head = ''
    if RUN_LOCAL:
        # Local development configuration
        demo.launch(
            mcp_server=True,
            head=head,
            footer_links=["api", "gradio", "settings"],
            server_name="0.0.0.0",
            server_port=8081,
            css=css,
        )
    else:
        # HuggingFace Spaces standard configuration
        # demo.launch(head=head, debug=True, show_error=True, css=css)
        demo.launch(head=head, css=css)