derek tingle committed
Commit 6062b47 · 1 Parent(s): 56bbe8e

Initial commit
Files changed (5)
  1. README.md +52 -13
  2. app.py +1043 -0
  3. fibo_edit_pipeline.py +953 -0
  4. requirements.txt +133 -0
  5. utils.py +113 -0
README.md CHANGED
@@ -1,13 +1,52 @@
- ---
- title: Fibo Edit Camera Angle
- emoji: 📈
- colorFrom: blue
- colorTo: pink
- sdk: gradio
- sdk_version: 6.4.0
- app_file: app.py
- pinned: false
- short_description: Camera Angle Control using Fibo Edit
- ---
-
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ # Fibo Edit — Camera Angle Control
+
+ Fibo Edit with Multi-Angle LoRA for precise camera control. Control rotation, tilt, and zoom to generate images from any angle.
+
+ ## Features
+
+ - 🎬 Interactive 3D camera control widget
+ - 🎨 Multi-angle image generation using the Fibo Edit model
+ - 📐 Precise control over rotation, tilt, and zoom
+ - 🤖 BRIA API integration for structured captions
+ - ⚡ GPU-accelerated inference with Spaces GPU
+
+ ## Setup
+
+ ### Required Secrets
+
+ This Space requires the following environment variable to be set as a **HuggingFace Space Secret**:
+
+ - `BRIA_API_TOKEN` - Your BRIA API token for structured caption generation
+
+ To add this secret:
+ 1. Go to your Space's Settings
+ 2. Navigate to "Repository secrets"
+ 3. Add a new secret named `BRIA_API_TOKEN` with your API token value
+
+ ### Hardware Requirements
+
+ This Space requires a GPU. Make sure to configure your Space to use a GPU instance.
+
+ ## Usage
+
+ 1. Upload an input image
+ 2. Use the 3D camera control or sliders to adjust:
+    - **Rotation**: -180° to +180° (0° = front, ±180° = back)
+    - **Vertical Tilt**: -1 (low angle) to +1 (high angle)
+    - **Zoom**: 0 (wide) to 10 (close-up)
+ 3. Click "Generate" to create the image from the new camera angle
+ 4. View the structured caption from the BRIA API in the accordion
+
+ ## Model Information
+
+ - **Base Model**: [briaai/FIBO-Edit](https://huggingface.co/briaai/FIBO-Edit)
+ - **LoRA**: [briaai/fibo_edit_multi_angle_full_0121_full_1k](https://huggingface.co/briaai/fibo_edit_multi_angle_full_0121_full_1k)
+ - **Text Encoder**: SmolLM3
+ - **Scheduler**: FlowMatchEulerDiscreteScheduler
+
+ ## Credits
+
+ Built with:
+ - [Gradio](https://gradio.app/)
+ - [Diffusers](https://huggingface.co/docs/diffusers)
+ - [BRIA AI](https://bria.ai/)
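
The rotation, tilt, and zoom controls described above end up as discrete camera phrases in the generation prompt. As a rough illustration of that mapping (the function name and bucket thresholds here are assumptions for the sketch, not the app's exact logic):

```python
# Hypothetical sketch: bucket continuous slider values into camera phrases.
# Thresholds are illustrative assumptions, not the app's exact boundaries.
def describe_camera(rotation: float, tilt: float, zoom: float) -> str:
    parts = []
    if rotation != 0:
        direction = "right" if rotation > 0 else "left"
        parts.append(f"rotate {abs(rotation):g} degrees {direction}")
    if tilt >= 0.5:
        parts.append("high angle")
    elif tilt <= -0.5:
        parts.append("low angle")
    else:
        parts.append("eye level")
    if zoom >= 7:
        parts.append("close-up")
    elif zoom <= 3:
        parts.append("wide shot")
    return ", ".join(parts)

print(describe_camera(90, 0, 8))  # rotate 90 degrees right, eye level, close-up
```

The actual discretization lives in `utils.AngleInstruction` in this commit.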
app.py ADDED
@@ -0,0 +1,1043 @@
+ import base64
+ import json
+ import os
+ import random
+ import time
+ from io import BytesIO
+ from typing import Optional, Tuple
+
+ import gradio as gr
+ import numpy as np
+ import requests
+ import spaces
+ import torch
+ from PIL import Image
+
+ from fibo_edit_pipeline import BriaFiboEditPipeline
+ from utils import AngleInstruction
+
+ # --- Configuration ---
+ device = "cuda" if torch.cuda.is_available() else "cpu"
+
+ # Run locally or on HuggingFace Spaces
+ RUN_LOCAL = True
+
+ # Model paths
+ BASE_CHECKPOINT = "briaai/FIBO-Edit"  # HuggingFace model ID
+ LORA_CHECKPOINT = "briaai/fibo_edit_multi_angle_full_0121_full_1k"  # HuggingFace LoRA model ID
+
+ # BRIA API configuration
+ BRIA_API_URL = "https://engine.prod.bria-api.com/v2/structured_prompt/generate/pro"
+ BRIA_API_TOKEN = os.environ.get("BRIA_API_TOKEN")
+
+ if not BRIA_API_TOKEN:
+     raise ValueError(
+         "BRIA_API_TOKEN environment variable is not set. "
+         "Please add it as a HuggingFace Space secret."
+     )
+
+ # Generation defaults
+ DEFAULT_NUM_INFERENCE_STEPS = 50
+ DEFAULT_GUIDANCE_SCALE = 3.5
+ DEFAULT_SEED = 100050
+
+ MAX_SEED = np.iinfo(np.int32).max
+
+ print("🚀 Starting Fibo Edit Multi-Angle LoRA Gradio App")
+ print(f"Device: {device}")
+ print(f"Base checkpoint: {BASE_CHECKPOINT}")
+ print(f"LoRA checkpoint: {LORA_CHECKPOINT}")
+
+
+ # --- Helper Functions ---
+ def load_pipeline_fiboedit(
+     checkpoint: str,
+     lora_checkpoint: Optional[str] = None,
+     lora_scale: Optional[float] = None,
+     fuse_lora: bool = True,
+ ):
+     """
+     Load the Fibo Edit pipeline using BriaFiboEditPipeline with optional LoRA weights.
+
+     Args:
+         checkpoint: HuggingFace model ID for the base model
+         lora_checkpoint: Optional HuggingFace model ID for LoRA weights
+         lora_scale: Scale for LoRA weights when fusing (default None = 1.0)
+         fuse_lora: Whether to fuse LoRA into the base weights (default True)
+
+     Returns:
+         Loaded BriaFiboEditPipeline
+     """
+     print(f"Loading BriaFiboEditPipeline from {checkpoint}")
+     if lora_checkpoint:
+         print(f"  with LoRA from {lora_checkpoint}")
+
+     # Load pipeline from HuggingFace
+     print("Loading pipeline...")
+     pipe = BriaFiboEditPipeline.from_pretrained(
+         checkpoint,
+         torch_dtype=torch.bfloat16,
+     )
+     pipe.to("cuda")
+     print(f"  Pipeline loaded from {checkpoint}")
+
+     # Load LoRA weights if provided (PEFT format)
+     if lora_checkpoint:
+         print(f"Loading PEFT LoRA from {lora_checkpoint}...")
+         from peft import PeftModel
+
+         print("  Loading PEFT adapter onto transformer...")
+         pipe.transformer = PeftModel.from_pretrained(pipe.transformer, lora_checkpoint)
+         print("  PEFT adapter loaded successfully")
+
+         if fuse_lora:
+             print("  Merging LoRA into base weights...")
+             if hasattr(pipe.transformer, "merge_and_unload"):
+                 pipe.transformer = pipe.transformer.merge_and_unload()
+                 print("  LoRA merged and unloaded")
+             else:
+                 print("  [WARN] transformer.merge_and_unload() not available")
+
+     print("✅ Pipeline loaded successfully!")
+     return pipe
+
+
+ def generate_structured_caption(
+     image: Image.Image, prompt: str, seed: int = 1
+ ) -> Optional[dict]:
+     """Generate a structured caption using the BRIA API."""
+     buffered = BytesIO()
+     image.save(buffered, format="PNG")
+     image_bytes = base64.b64encode(buffered.getvalue()).decode("utf-8")
+
+     payload = {
+         "seed": seed,
+         "sync": True,
+         "images": [image_bytes],
+         "prompt": prompt,
+     }
+
+     headers = {
+         "Content-Type": "application/json",
+         "api_token": BRIA_API_TOKEN,
+     }
+
+     max_retries = 3
+     for attempt in range(max_retries):
+         try:
+             response = requests.post(
+                 BRIA_API_URL, json=payload, headers=headers, timeout=60
+             )
+             response.raise_for_status()
+             data = response.json()
+             structured_prompt_str = data["result"]["structured_prompt"]
+             return json.loads(structured_prompt_str)
+         except Exception as e:
+             if attempt == max_retries - 1:
+                 print(f"Failed to generate structured caption: {e}")
+                 return None
+             time.sleep(3)
+
+     return None
+
+
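
`generate_structured_caption` retries the API call up to three times with a fixed 3-second sleep between attempts. The same pattern can be factored into a small generic helper; this is a hedged sketch (`with_retries` is not part of the repo):

```python
import time
from typing import Callable, Optional, TypeVar

T = TypeVar("T")

def with_retries(fn: Callable[[], T], max_retries: int = 3, delay: float = 3.0) -> Optional[T]:
    """Call fn(), retrying on any exception; return None after the last failure."""
    for attempt in range(max_retries):
        try:
            return fn()
        except Exception:
            if attempt == max_retries - 1:
                return None
            time.sleep(delay)
    return None

# A flaky function that fails twice, then succeeds on the third call.
calls = {"n": 0}
def flaky():
    calls["n"] += 1
    if calls["n"] < 3:
        raise RuntimeError("transient")
    return "ok"

print(with_retries(flaky, max_retries=3, delay=0))  # ok
```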
+ # --- Model Loading ---
+ print("Loading Fibo Edit pipeline...")
+
+ try:
+     pipe = load_pipeline_fiboedit(
+         checkpoint=BASE_CHECKPOINT,
+         lora_checkpoint=LORA_CHECKPOINT,
+         lora_scale=None,
+         fuse_lora=True,
+     )
+
+     if torch.cuda.is_available():
+         mem_allocated = torch.cuda.memory_allocated(0) / 1024**3
+         print(f"  GPU memory allocated: {mem_allocated:.2f} GB")
+
+ except Exception as e:
+     print(f"❌ Error loading pipeline: {e}")
+     import traceback
+
+     traceback.print_exc()
+     raise
+
+
+ def build_camera_prompt(
+     rotate_deg: float = 0.0, zoom: float = 0.0, vertical_tilt: float = 0.0
+ ) -> Tuple[str, AngleInstruction]:
+     """Build a natural-language camera instruction and AngleInstruction from parameters."""
+     # Create AngleInstruction from camera parameters
+     angle_instruction = AngleInstruction.from_camera_params(
+         rotation=rotate_deg, tilt=vertical_tilt, zoom=zoom
+     )
+
+     # Generate natural language description
+     view_map = {
+         "back view": "view from the opposite side",
+         "back-left quarter view": "rotate 135 degrees left",
+         "back-right quarter view": "rotate 135 degrees right",
+         "front view": "keep the front view",
+         "front-left quarter view": "rotate 45 degrees left",
+         "front-right quarter view": "rotate 45 degrees right",
+         "left side view": "rotate 90 degrees left",
+         "right side view": "rotate 90 degrees right",
+     }
+
+     shot_map = {
+         "elevated shot": "with an elevated viewing angle",
+         "eye-level shot": "with an eye-level viewing angle",
+         "high-angle shot": "with a high-angle viewing angle",
+         "low-angle shot": "with a low-angle viewing angle",
+     }
+
+     zoom_map = {
+         "close-up": "and make it a close-up shot",
+         "medium shot": "",  # Omit medium shot
+         "wide shot": "and make it a wide shot",
+     }
+
+     view_text = view_map[angle_instruction.view.value]
+     shot_text = shot_map[angle_instruction.shot.value]
+     zoom_text = zoom_map[angle_instruction.zoom.value]
+
+     # Construct the natural language prompt starting with "Change the viewing angle"
+     parts = [view_text, shot_text]
+     if zoom_text:  # Only add zoom if not empty (medium shot is omitted)
+         parts.append(zoom_text)
+     natural_prompt = "Change the viewing angle: " + ", ".join(parts)
+
+     return natural_prompt, angle_instruction
+
+
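
`AngleInstruction.from_camera_params` (defined in `utils.py`, not shown in this commit view) presumably discretizes the continuous slider values, and the 3D widget later in this file snaps to the steps `[-180, -135, ..., 180]`, `[0, 5, 10]`, and `[-1, -0.5, 0, 0.5, 1]`. A Python equivalent of that nearest-step snapping, offered as an illustrative sketch:

```python
def snap_to_nearest(value: float, steps: list) -> float:
    """Return the element of `steps` closest to `value` (mirrors the widget's snapToNearest)."""
    return min(steps, key=lambda s: abs(s - value))

ROTATE_STEPS = [-180, -135, -90, -45, 0, 45, 90, 135, 180]
print(snap_to_nearest(50, ROTATE_STEPS))    # 45
print(snap_to_nearest(-100, ROTATE_STEPS))  # -90
```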
+ def fetch_structured_caption(
+     image: Optional[Image.Image] = None,
+     rotate_deg: float = 0.0,
+     zoom: float = 0.0,
+     vertical_tilt: float = 0.0,
+     seed: int = 0,
+     randomize_seed: bool = True,
+     prev_output: Optional[Image.Image] = None,
+ ) -> Tuple[int, str, dict, Image.Image]:
+     """Fetch a structured caption from the BRIA API."""
+
+     # Build natural language prompt and angle instruction
+     natural_prompt, angle_instruction = build_camera_prompt(
+         rotate_deg, zoom, vertical_tilt
+     )
+     print(f"Natural Language Prompt: {natural_prompt}")
+     print(f"Angle Instruction: {str(angle_instruction)}")
+
+     if randomize_seed:
+         seed = random.randint(0, MAX_SEED)
+
+     # Get input image
+     if image is not None:
+         if isinstance(image, Image.Image):
+             input_image = image.convert("RGB")
+         elif hasattr(image, "name"):
+             input_image = Image.open(image.name).convert("RGB")
+         else:
+             input_image = image
+     elif prev_output:
+         input_image = prev_output.convert("RGB")
+     else:
+         raise gr.Error("Please upload an image first.")
+
+     # Generate structured caption using BRIA API
+     print("Generating structured caption from BRIA API...")
+     structured_caption = generate_structured_caption(
+         input_image, natural_prompt, seed=seed
+     )
+
+     if structured_caption is None:
+         raise gr.Error("Failed to generate structured caption from BRIA API")
+
+     # Replace edit_instruction with the angle instruction string
+     structured_caption["edit_instruction"] = str(angle_instruction)
+
+     print(
+         f"Structured caption received: {json.dumps(structured_caption, ensure_ascii=False)}"
+     )
+
+     return seed, natural_prompt, structured_caption, input_image
+
+
+ @spaces.GPU
+ def generate_image_from_caption(
+     input_image: Image.Image,
+     structured_caption: dict,
+     seed: int,
+     guidance_scale: float = 3.5,
+     num_inference_steps: int = 50,
+ ) -> Image.Image:
+     """Generate an image using the Fibo Edit pipeline with a structured caption."""
+
+     structured_prompt = json.dumps(structured_caption, ensure_ascii=False)
+     print("Generating image with structured prompt...")
+
+     generator = torch.Generator(device=device).manual_seed(seed)
+
+     result = pipe(
+         image=input_image,
+         prompt=structured_prompt,
+         guidance_scale=guidance_scale,
+         num_inference_steps=num_inference_steps,
+         generator=generator,
+         num_images_per_prompt=1,
+     ).images[0]
+
+     return result
+
+
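
Seeding the `torch.Generator` above is what makes generation reproducible: the same seed yields the same latent noise and hence the same output image. The same principle, demonstrated with the standard library's `random.Random` (a sketch of the idea, not the pipeline's actual noise sampling):

```python
import random

def sample_noise(seed: int, n: int = 4) -> list:
    """Deterministic pseudo-noise: the same seed always yields the same values."""
    rng = random.Random(seed)
    return [rng.random() for _ in range(n)]

a = sample_noise(100050)
b = sample_noise(100050)
c = sample_noise(100051)
print(a == b)  # True
print(a == c)  # False
```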
+ # --- 3D Camera Control Component ---
+ # Using gr.HTML directly with templates (Gradio 6 style)
+
+ CAMERA_3D_HTML_TEMPLATE = """
+ <div id="camera-control-wrapper" style="width: 100%; height: 400px; position: relative; background: #1a1a1a; border-radius: 12px; overflow: hidden;">
+   <div id="prompt-overlay" style="position: absolute; bottom: 10px; left: 50%; transform: translateX(-50%); background: rgba(0,0,0,0.8); padding: 8px 16px; border-radius: 8px; font-family: monospace; font-size: 11px; color: #00ff88; white-space: nowrap; z-index: 10; max-width: 90%; overflow: hidden; text-overflow: ellipsis;"></div>
+   <div id="control-legend" style="position: absolute; top: 10px; left: 10px; background: rgba(0,0,0,0.7); padding: 8px 12px; border-radius: 8px; font-family: system-ui; font-size: 11px; color: #fff; z-index: 10;">
+     <div style="margin-bottom: 4px;"><span style="color: #00ff88;">●</span> Rotation (↔)</div>
+     <div style="margin-bottom: 4px;"><span style="color: #ff69b4;">●</span> Vertical Tilt (↕)</div>
+     <div><span style="color: #ffa500;">●</span> Distance/Zoom</div>
+   </div>
+ </div>
+ """
+
308
+ CAMERA_3D_JS = """
309
+ (() => {
310
+ const wrapper = element.querySelector('#camera-control-wrapper');
311
+ const promptOverlay = element.querySelector('#prompt-overlay');
312
+
313
+ const initScene = () => {
314
+ if (typeof THREE === 'undefined') {
315
+ setTimeout(initScene, 100);
316
+ return;
317
+ }
318
+
319
+ const scene = new THREE.Scene();
320
+ scene.background = new THREE.Color(0x1a1a1a);
321
+
322
+ const camera = new THREE.PerspectiveCamera(50, wrapper.clientWidth / wrapper.clientHeight, 0.1, 1000);
323
+ camera.position.set(4, 3, 4);
324
+ camera.lookAt(0, 0.75, 0);
325
+
326
+ const renderer = new THREE.WebGLRenderer({ antialias: true });
327
+ renderer.setSize(wrapper.clientWidth, wrapper.clientHeight);
328
+ renderer.setPixelRatio(Math.min(window.devicePixelRatio, 2));
329
+ wrapper.insertBefore(renderer.domElement, wrapper.firstChild);
330
+
331
+ scene.add(new THREE.AmbientLight(0xffffff, 0.6));
332
+ const dirLight = new THREE.DirectionalLight(0xffffff, 0.6);
333
+ dirLight.position.set(5, 10, 5);
334
+ scene.add(dirLight);
335
+
336
+ scene.add(new THREE.GridHelper(6, 12, 0x333333, 0x222222));
337
+
338
+ const CENTER = new THREE.Vector3(0, 0.75, 0);
339
+ const BASE_DISTANCE = 2.0;
340
+ const ROTATION_RADIUS = 2.2;
341
+ const TILT_RADIUS = 1.6;
342
+
343
+ let rotateDeg = props.value?.rotate_deg || 0;
344
+ let zoom = props.value?.zoom || 5.0;
345
+ let verticalTilt = props.value?.vertical_tilt || 0;
346
+
347
+ const rotateSteps = [-180, -135, -90, -45, 0, 45, 90, 135, 180];
348
+ const zoomSteps = [0, 5, 10];
349
+ const tiltSteps = [-1, -0.5, 0, 0.5, 1];
350
+
351
+ function snapToNearest(value, steps) {
352
+ return steps.reduce((prev, curr) => Math.abs(curr - value) < Math.abs(prev - value) ? curr : prev);
353
+ }
354
+
355
+ function createPlaceholderTexture() {
356
+ const canvas = document.createElement('canvas');
357
+ canvas.width = 256;
358
+ canvas.height = 256;
359
+ const ctx = canvas.getContext('2d');
360
+ ctx.fillStyle = '#3a3a4a';
361
+ ctx.fillRect(0, 0, 256, 256);
362
+ ctx.fillStyle = '#ffcc99';
363
+ ctx.beginPath();
364
+ ctx.arc(128, 128, 80, 0, Math.PI * 2);
365
+ ctx.fill();
366
+ ctx.fillStyle = '#333';
367
+ ctx.beginPath();
368
+ ctx.arc(100, 110, 10, 0, Math.PI * 2);
369
+ ctx.arc(156, 110, 10, 0, Math.PI * 2);
370
+ ctx.fill();
371
+ ctx.strokeStyle = '#333';
372
+ ctx.lineWidth = 3;
373
+ ctx.beginPath();
374
+ ctx.arc(128, 130, 35, 0.2, Math.PI - 0.2);
375
+ ctx.stroke();
376
+ return new THREE.CanvasTexture(canvas);
377
+ }
378
+
379
+ let currentTexture = createPlaceholderTexture();
380
+ const planeMaterial = new THREE.MeshBasicMaterial({ map: currentTexture, side: THREE.DoubleSide });
381
+ let targetPlane = new THREE.Mesh(new THREE.PlaneGeometry(1.2, 1.2), planeMaterial);
382
+ targetPlane.position.copy(CENTER);
383
+ scene.add(targetPlane);
384
+
385
+ function updateTextureFromUrl(url) {
386
+ if (!url) {
387
+ planeMaterial.map = createPlaceholderTexture();
388
+ planeMaterial.needsUpdate = true;
389
+ scene.remove(targetPlane);
390
+ targetPlane = new THREE.Mesh(new THREE.PlaneGeometry(1.2, 1.2), planeMaterial);
391
+ targetPlane.position.copy(CENTER);
392
+ scene.add(targetPlane);
393
+ return;
394
+ }
395
+
396
+ const loader = new THREE.TextureLoader();
397
+ loader.crossOrigin = 'anonymous';
398
+ loader.load(url, (texture) => {
399
+ texture.minFilter = THREE.LinearFilter;
400
+ texture.magFilter = THREE.LinearFilter;
401
+ planeMaterial.map = texture;
402
+ planeMaterial.needsUpdate = true;
403
+
404
+ const img = texture.image;
405
+ if (img && img.width && img.height) {
406
+ const aspect = img.width / img.height;
407
+ const maxSize = 1.4;
408
+ let planeWidth, planeHeight;
409
+ if (aspect > 1) {
410
+ planeWidth = maxSize;
411
+ planeHeight = maxSize / aspect;
412
+ } else {
413
+ planeHeight = maxSize;
414
+ planeWidth = maxSize * aspect;
415
+ }
416
+ scene.remove(targetPlane);
417
+ targetPlane = new THREE.Mesh(new THREE.PlaneGeometry(planeWidth, planeHeight), planeMaterial);
418
+ targetPlane.position.copy(CENTER);
419
+ scene.add(targetPlane);
420
+ }
421
+ });
422
+ }
423
+
424
+ if (props.imageUrl) {
425
+ updateTextureFromUrl(props.imageUrl);
426
+ }
427
+
428
+ const cameraGroup = new THREE.Group();
429
+ const bodyMat = new THREE.MeshStandardMaterial({ color: 0x6699cc, metalness: 0.5, roughness: 0.3 });
430
+ const body = new THREE.Mesh(new THREE.BoxGeometry(0.28, 0.2, 0.35), bodyMat);
431
+ cameraGroup.add(body);
432
+ const lens = new THREE.Mesh(
433
+ new THREE.CylinderGeometry(0.08, 0.1, 0.16, 16),
434
+ new THREE.MeshStandardMaterial({ color: 0x6699cc, metalness: 0.5, roughness: 0.3 })
435
+ );
436
+ lens.rotation.x = Math.PI / 2;
437
+ lens.position.z = 0.24;
438
+ cameraGroup.add(lens);
439
+ scene.add(cameraGroup);
440
+
441
+ const rotationArcPoints = [];
442
+ for (let i = 0; i <= 64; i++) {
443
+ const angle = THREE.MathUtils.degToRad((360 * i / 64));
444
+ rotationArcPoints.push(new THREE.Vector3(ROTATION_RADIUS * Math.sin(angle), 0.05, ROTATION_RADIUS * Math.cos(angle)));
445
+ }
446
+ const rotationCurve = new THREE.CatmullRomCurve3(rotationArcPoints);
447
+ const rotationArc = new THREE.Mesh(
448
+ new THREE.TubeGeometry(rotationCurve, 64, 0.035, 8, true),
449
+ new THREE.MeshStandardMaterial({ color: 0x00ff88, emissive: 0x00ff88, emissiveIntensity: 0.3 })
450
+ );
451
+ scene.add(rotationArc);
452
+
453
+ const rotationHandle = new THREE.Mesh(
454
+ new THREE.SphereGeometry(0.16, 16, 16),
455
+ new THREE.MeshStandardMaterial({ color: 0x00ff88, emissive: 0x00ff88, emissiveIntensity: 0.5 })
456
+ );
457
+ rotationHandle.userData.type = 'rotation';
458
+ scene.add(rotationHandle);
459
+
460
+ const tiltArcPoints = [];
461
+ for (let i = 0; i <= 32; i++) {
462
+ const angle = THREE.MathUtils.degToRad(-45 + (90 * i / 32));
463
+ tiltArcPoints.push(new THREE.Vector3(-0.7, TILT_RADIUS * Math.sin(angle) + CENTER.y, TILT_RADIUS * Math.cos(angle)));
464
+ }
465
+ const tiltCurve = new THREE.CatmullRomCurve3(tiltArcPoints);
466
+ const tiltArc = new THREE.Mesh(
467
+ new THREE.TubeGeometry(tiltCurve, 32, 0.035, 8, false),
468
+ new THREE.MeshStandardMaterial({ color: 0xff69b4, emissive: 0xff69b4, emissiveIntensity: 0.3 })
469
+ );
470
+ scene.add(tiltArc);
471
+
472
+ const tiltHandle = new THREE.Mesh(
473
+ new THREE.SphereGeometry(0.16, 16, 16),
474
+ new THREE.MeshStandardMaterial({ color: 0xff69b4, emissive: 0xff69b4, emissiveIntensity: 0.5 })
475
+ );
476
+ tiltHandle.userData.type = 'tilt';
477
+ scene.add(tiltHandle);
478
+
479
+ const distanceLineGeo = new THREE.BufferGeometry();
480
+ const distanceLine = new THREE.Line(distanceLineGeo, new THREE.LineBasicMaterial({ color: 0xffa500 }));
481
+ scene.add(distanceLine);
482
+
483
+ const distanceHandle = new THREE.Mesh(
484
+ new THREE.SphereGeometry(0.16, 16, 16),
485
+ new THREE.MeshStandardMaterial({ color: 0xffa500, emissive: 0xffa500, emissiveIntensity: 0.5 })
486
+ );
487
+ distanceHandle.userData.type = 'distance';
488
+ scene.add(distanceHandle);
489
+
490
+ function buildPromptText(rot, zoomVal, tilt) {
491
+ const parts = [];
492
+ if (rot !== 0) {
493
+ const dir = rot > 0 ? 'right' : 'left';
494
+ parts.push('Rotate ' + Math.abs(rot) + '° ' + dir);
495
+ }
496
+ if (zoomVal >= 6.66) parts.push('Close-up');
497
+ else if (zoomVal >= 3.33) parts.push('Medium shot');
498
+ else parts.push('Wide angle');
499
+ if (tilt >= 0.66) parts.push("High angle");
500
+ else if (tilt >= 0.33) parts.push("Elevated");
501
+ else if (tilt <= -0.33) parts.push("Low angle");
502
+ else parts.push("Eye level");
503
+ return parts.length > 0 ? parts.join(' • ') : 'No camera movement';
504
+ }
505
+
506
+ function updatePositions() {
507
+ const rotRad = THREE.MathUtils.degToRad(rotateDeg);
508
+ // Map zoom 0-10 to distance: zoom 0 = far (3.0), zoom 10 = close (1.0)
509
+ const distance = 3.0 - (zoom / 10) * 2.0;
510
+ const tiltAngle = verticalTilt * 35;
511
+ const tiltRad = THREE.MathUtils.degToRad(tiltAngle);
512
+
513
+ const camX = distance * Math.sin(rotRad) * Math.cos(tiltRad);
514
+ const camY = distance * Math.sin(tiltRad) + CENTER.y;
515
+ const camZ = distance * Math.cos(rotRad) * Math.cos(tiltRad);
516
+
517
+ cameraGroup.position.set(camX, camY, camZ);
518
+ cameraGroup.lookAt(CENTER);
519
+
520
+ rotationHandle.position.set(ROTATION_RADIUS * Math.sin(rotRad), 0.05, ROTATION_RADIUS * Math.cos(rotRad));
521
+
522
+ const tiltHandleAngle = THREE.MathUtils.degToRad(tiltAngle);
523
+ tiltHandle.position.set(-0.7, TILT_RADIUS * Math.sin(tiltHandleAngle) + CENTER.y, TILT_RADIUS * Math.cos(tiltHandleAngle));
524
+
525
+ const handleDist = distance - 0.4;
526
+ distanceHandle.position.set(
527
+ handleDist * Math.sin(rotRad) * Math.cos(tiltRad),
528
+ handleDist * Math.sin(tiltRad) + CENTER.y,
529
+ handleDist * Math.cos(rotRad) * Math.cos(tiltRad)
530
+ );
531
+ distanceLineGeo.setFromPoints([cameraGroup.position.clone(), CENTER.clone()]);
532
+
533
+ promptOverlay.textContent = buildPromptText(rotateDeg, zoom, verticalTilt);
534
+ }
535
+
536
+ function updatePropsAndTrigger() {
537
+ const rotSnap = snapToNearest(rotateDeg, rotateSteps);
538
+ const zoomSnap = snapToNearest(zoom, zoomSteps);
539
+ const tiltSnap = snapToNearest(verticalTilt, tiltSteps);
540
+
541
+ props.value = { rotate_deg: rotSnap, zoom: zoomSnap, vertical_tilt: tiltSnap };
542
+ trigger('change', props.value);
543
+ }
544
+
545
+ const raycaster = new THREE.Raycaster();
546
+ const mouse = new THREE.Vector2();
547
+ let isDragging = false;
548
+ let dragTarget = null;
549
+ let dragStartMouse = new THREE.Vector2();
550
+ let dragStartZoom = 0;
551
+ const intersection = new THREE.Vector3();
552
+
553
+ const canvas = renderer.domElement;
554
+
555
+ canvas.addEventListener('mousedown', (e) => {
556
+ const rect = canvas.getBoundingClientRect();
557
+ mouse.x = ((e.clientX - rect.left) / rect.width) * 2 - 1;
558
+ mouse.y = -((e.clientY - rect.top) / rect.height) * 2 + 1;
559
+
560
+ raycaster.setFromCamera(mouse, camera);
561
+ const intersects = raycaster.intersectObjects([rotationHandle, tiltHandle, distanceHandle]);
562
+
563
+ if (intersects.length > 0) {
564
+ isDragging = true;
565
+ dragTarget = intersects[0].object;
566
+ dragTarget.material.emissiveIntensity = 1.0;
567
+ dragTarget.scale.setScalar(1.3);
568
+ dragStartMouse.copy(mouse);
569
+ dragStartZoom = zoom;
570
+ canvas.style.cursor = 'grabbing';
571
+ }
572
+ });
573
+
574
+ canvas.addEventListener('mousemove', (e) => {
575
+ const rect = canvas.getBoundingClientRect();
576
+ mouse.x = ((e.clientX - rect.left) / rect.width) * 2 - 1;
577
+ mouse.y = -((e.clientY - rect.top) / rect.height) * 2 + 1;
578
+
579
+ if (isDragging && dragTarget) {
580
+ raycaster.setFromCamera(mouse, camera);
581
+
582
+ if (dragTarget.userData.type === 'rotation') {
583
+ const plane = new THREE.Plane(new THREE.Vector3(0, 1, 0), -0.05);
584
+ if (raycaster.ray.intersectPlane(plane, intersection)) {
585
+ let angle = THREE.MathUtils.radToDeg(Math.atan2(intersection.x, intersection.z));
586
+ rotateDeg = THREE.MathUtils.clamp(angle, -180, 180);
587
+ }
588
+ } else if (dragTarget.userData.type === 'tilt') {
589
+ const plane = new THREE.Plane(new THREE.Vector3(1, 0, 0), 0.7);
590
+ if (raycaster.ray.intersectPlane(plane, intersection)) {
591
+ const relY = intersection.y - CENTER.y;
592
+ const relZ = intersection.z;
593
+ const angle = THREE.MathUtils.radToDeg(Math.atan2(relY, relZ));
594
+ verticalTilt = THREE.MathUtils.clamp(angle / 35, -1, 1);
595
+ }
596
+ } else if (dragTarget.userData.type === 'distance') {
597
+ const deltaY = mouse.y - dragStartMouse.y;
598
+ zoom = THREE.MathUtils.clamp(dragStartZoom + deltaY * 20, 0, 10);
599
+ }
600
+ updatePositions();
601
+ } else {
602
+ raycaster.setFromCamera(mouse, camera);
603
+ const intersects = raycaster.intersectObjects([rotationHandle, tiltHandle, distanceHandle]);
604
+ [rotationHandle, tiltHandle, distanceHandle].forEach(h => {
605
+ h.material.emissiveIntensity = 0.5;
606
+ h.scale.setScalar(1);
607
+ });
608
+ if (intersects.length > 0) {
609
+ intersects[0].object.material.emissiveIntensity = 0.8;
610
+ intersects[0].object.scale.setScalar(1.1);
611
+ canvas.style.cursor = 'grab';
612
+ } else {
613
+ canvas.style.cursor = 'default';
614
+ }
615
+ }
616
+ });
617
+
618
+ const onMouseUp = () => {
619
+ if (dragTarget) {
620
+ dragTarget.material.emissiveIntensity = 0.5;
621
+ dragTarget.scale.setScalar(1);
622
+
623
+ const targetRot = snapToNearest(rotateDeg, rotateSteps);
624
+ const targetZoom = snapToNearest(zoom, zoomSteps);
625
+ const targetTilt = snapToNearest(verticalTilt, tiltSteps);
626
+
627
+ const startRot = rotateDeg, startZoom = zoom, startTilt = verticalTilt;
628
+ const startTime = Date.now();
629
+
630
+ function animateSnap() {
631
+ const t = Math.min((Date.now() - startTime) / 200, 1);
632
+ const ease = 1 - Math.pow(1 - t, 3);
633
+
634
+ rotateDeg = startRot + (targetRot - startRot) * ease;
635
+ zoom = startZoom + (targetZoom - startZoom) * ease;
636
+ verticalTilt = startTilt + (targetTilt - startTilt) * ease;
637
+
638
+ updatePositions();
639
+ if (t < 1) requestAnimationFrame(animateSnap);
640
+ else updatePropsAndTrigger();
641
+ }
642
+ animateSnap();
643
+ }
644
+ isDragging = false;
645
+ dragTarget = null;
646
+ canvas.style.cursor = 'default';
647
+ };
648
+
649
+ canvas.addEventListener('mouseup', onMouseUp);
650
+ canvas.addEventListener('mouseleave', onMouseUp);
651
+
652
+ canvas.addEventListener('touchstart', (e) => {
653
+ e.preventDefault();
654
+ const touch = e.touches[0];
655
+ const rect = canvas.getBoundingClientRect();
656
+ mouse.x = ((touch.clientX - rect.left) / rect.width) * 2 - 1;
657
+ mouse.y = -((touch.clientY - rect.top) / rect.height) * 2 + 1;
658
+
659
+ raycaster.setFromCamera(mouse, camera);
660
+ const intersects = raycaster.intersectObjects([rotationHandle, tiltHandle, distanceHandle]);
661
+
662
+ if (intersects.length > 0) {
663
+ isDragging = true;
664
+ dragTarget = intersects[0].object;
665
+ dragTarget.material.emissiveIntensity = 1.0;
666
+ dragTarget.scale.setScalar(1.3);
667
+ dragStartMouse.copy(mouse);
668
+ dragStartZoom = zoom;
669
+ }
670
+ }, { passive: false });
671
+
672
+ canvas.addEventListener('touchmove', (e) => {
673
+ e.preventDefault();
674
+ const touch = e.touches[0];
675
+ const rect = canvas.getBoundingClientRect();
676
+ mouse.x = ((touch.clientX - rect.left) / rect.width) * 2 - 1;
677
+ mouse.y = -((touch.clientY - rect.top) / rect.height) * 2 + 1;
678
+
679
        if (isDragging && dragTarget) {
            raycaster.setFromCamera(mouse, camera);

            if (dragTarget.userData.type === 'rotation') {
                const plane = new THREE.Plane(new THREE.Vector3(0, 1, 0), -0.05);
                if (raycaster.ray.intersectPlane(plane, intersection)) {
                    let angle = THREE.MathUtils.radToDeg(Math.atan2(intersection.x, intersection.z));
                    rotateDeg = THREE.MathUtils.clamp(angle, -180, 180);
                }
            } else if (dragTarget.userData.type === 'tilt') {
                const plane = new THREE.Plane(new THREE.Vector3(1, 0, 0), 0.7);
                if (raycaster.ray.intersectPlane(plane, intersection)) {
                    const relY = intersection.y - CENTER.y;
                    const relZ = intersection.z;
                    const angle = THREE.MathUtils.radToDeg(Math.atan2(relY, relZ));
                    verticalTilt = THREE.MathUtils.clamp(angle / 35, -1, 1);
                }
            } else if (dragTarget.userData.type === 'distance') {
                const deltaY = mouse.y - dragStartMouse.y;
                zoom = THREE.MathUtils.clamp(dragStartZoom + deltaY * 20, 0, 10);
            }
            updatePositions();
        }
    }, { passive: false });

    canvas.addEventListener('touchend', (e) => { e.preventDefault(); onMouseUp(); }, { passive: false });
    canvas.addEventListener('touchcancel', (e) => { e.preventDefault(); onMouseUp(); }, { passive: false });

    updatePositions();

    function render() {
        requestAnimationFrame(render);
        renderer.render(scene, camera);
    }
    render();

    new ResizeObserver(() => {
        camera.aspect = wrapper.clientWidth / wrapper.clientHeight;
        camera.updateProjectionMatrix();
        renderer.setSize(wrapper.clientWidth, wrapper.clientHeight);
    }).observe(wrapper);

    wrapper._updateTexture = updateTextureFromUrl;

    let lastImageUrl = props.imageUrl;
    let lastValue = JSON.stringify(props.value);
    setInterval(() => {
        if (props.imageUrl !== lastImageUrl) {
            lastImageUrl = props.imageUrl;
            updateTextureFromUrl(props.imageUrl);
        }
        const currentValue = JSON.stringify(props.value);
        if (currentValue !== lastValue) {
            lastValue = currentValue;
            if (props.value && typeof props.value === 'object') {
                rotateDeg = props.value.rotate_deg ?? rotateDeg;
                zoom = props.value.zoom ?? zoom;
                verticalTilt = props.value.vertical_tilt ?? verticalTilt;
                updatePositions();
            }
        }
    }, 100);
};

initScene();
})();
"""

def create_camera_3d_component(value=None, imageUrl=None, **kwargs):
    """Create a 3D camera control component using gr.HTML."""
    if value is None:
        value = {"rotate_deg": 0, "zoom": 5.0, "vertical_tilt": 0}

    return gr.HTML(
        value=value,
        html_template=CAMERA_3D_HTML_TEMPLATE,
        js_on_load=CAMERA_3D_JS,
        imageUrl=imageUrl,
        **kwargs,
    )


# --- UI ---
css = """
#col-container { max-width: 1100px; margin: 0 auto; }
.dark .progress-text { color: white !important; }
#camera-3d-control { min-height: 400px; }
#examples { max-width: 1100px; margin: 0 auto; }
.fillable { max-width: 1250px !important; }
"""


def reset_all() -> list:
    """Reset all camera control knobs and flags to their default values."""
    return [0, 5.0, 0, True]  # rotate_deg, zoom, vertical_tilt, is_reset


def end_reset() -> bool:
    """Mark the end of a reset cycle."""
    return False


def update_dimensions_on_upload(image: Optional[Image.Image]) -> Tuple[int, int]:
    """Compute recommended (width, height) for the output resolution."""
    if image is None:
        return 1024, 1024

    original_width, original_height = image.size

    if original_width > original_height:
        new_width = 1024
        aspect_ratio = original_height / original_width
        new_height = int(new_width * aspect_ratio)
    else:
        new_height = 1024
        aspect_ratio = original_width / original_height
        new_width = int(new_height * aspect_ratio)

    # Snap both sides down to multiples of 8.
    new_width = (new_width // 8) * 8
    new_height = (new_height // 8) * 8

    return new_width, new_height

with gr.Blocks(css=css, theme=gr.themes.Citrus()) as demo:
    gr.Markdown("""
    ## 🎬 Fibo Edit — Camera Angle Control

    Fibo Edit with Multi-Angle LoRA for precise camera control ✨
    Control rotation, tilt, and zoom to generate images from any angle 🎥
    """)

    with gr.Row():
        with gr.Column(scale=1):
            image = gr.Image(label="Input Image", type="pil", height=280)
            prev_output = gr.Image(value=None, visible=False)
            is_reset = gr.Checkbox(value=False, visible=False)
            # Hidden state to pass the processed image between stages
            processed_image = gr.State(None)

            gr.Markdown("### 🎮 3D Camera Control")

            camera_3d = create_camera_3d_component(
                value={"rotate_deg": 0, "zoom": 5.0, "vertical_tilt": 0},
                elem_id="camera-3d-control",
            )

            with gr.Row():
                reset_btn = gr.Button("🔄 Reset", size="sm")
                run_btn = gr.Button("🚀 Generate", variant="primary", size="lg")

        with gr.Column(scale=1):
            result = gr.Image(label="Output Image", interactive=False, height=350)

            gr.Markdown("### 🎚️ Slider Controls")

            rotate_deg = gr.Slider(
                label="Horizontal Rotation (°)",
                minimum=-180,
                maximum=180,
                step=45,
                value=0,
                info="-180/180: back, -90: left, 0: front, 90: right",
            )
            zoom = gr.Slider(
                label="Zoom Level",
                minimum=0,
                maximum=10,
                step=1,
                value=5.0,
                info="0-3.33: wide, 3.33-6.66: medium, 6.66-10: close-up",
            )
            vertical_tilt = gr.Slider(
                label="Vertical Tilt",
                minimum=-1,
                maximum=1,
                step=0.5,
                value=0,
                info="-1: low-angle, 0: eye-level, 1: high-angle",
            )

            prompt_preview = gr.Textbox(label="Generated Prompt", interactive=False)

            with gr.Accordion("📋 Structured Caption (BRIA API)", open=False):
                structured_json = gr.JSON(label="JSON Response", container=False)

            with gr.Accordion("⚙️ Advanced Settings", open=False):
                seed = gr.Slider(
                    label="Seed",
                    minimum=0,
                    maximum=MAX_SEED,
                    step=1,
                    value=DEFAULT_SEED,
                )
                randomize_seed = gr.Checkbox(label="Randomize Seed", value=True)
                guidance_scale = gr.Slider(
                    label="Guidance Scale",
                    minimum=1.0,
                    maximum=10.0,
                    step=0.1,
                    value=DEFAULT_GUIDANCE_SCALE,
                )
                num_inference_steps = gr.Slider(
                    label="Inference Steps",
                    minimum=1,
                    maximum=100,
                    step=1,
                    value=DEFAULT_NUM_INFERENCE_STEPS,
                )
                height = gr.Slider(
                    label="Height", minimum=256, maximum=2048, step=8, value=1024
                )
                width = gr.Slider(
                    label="Width", minimum=256, maximum=2048, step=8, value=1024
                )

    # --- Helper Functions ---
    def update_prompt_from_sliders(rotate, zoom_val, tilt):
        prompt, _ = build_camera_prompt(rotate, zoom_val, tilt)
        return prompt

    def sync_3d_to_sliders(camera_value):
        if camera_value and isinstance(camera_value, dict):
            rot = camera_value.get("rotate_deg", 0)
            zoom_val = camera_value.get("zoom", 5.0)
            tilt = camera_value.get("vertical_tilt", 0)
            prompt, _ = build_camera_prompt(rot, zoom_val, tilt)
            return rot, zoom_val, tilt, prompt
        return gr.update(), gr.update(), gr.update(), gr.update()

    def sync_sliders_to_3d(rotate, zoom_val, tilt):
        return {"rotate_deg": rotate, "zoom": zoom_val, "vertical_tilt": tilt}

    def update_3d_image(img):
        if img is None:
            return gr.update(imageUrl=None)
        buffered = BytesIO()
        img.save(buffered, format="PNG")
        img_str = base64.b64encode(buffered.getvalue()).decode()
        data_url = f"data:image/png;base64,{img_str}"
        return gr.update(imageUrl=data_url)

    # --- Event Handlers ---

    # Slider -> Prompt preview
    for slider in [rotate_deg, zoom, vertical_tilt]:
        slider.change(
            fn=update_prompt_from_sliders,
            inputs=[rotate_deg, zoom, vertical_tilt],
            outputs=[prompt_preview],
        )

    # 3D control -> Sliders + Prompt (no auto-inference)
    camera_3d.change(
        fn=sync_3d_to_sliders,
        inputs=[camera_3d],
        outputs=[rotate_deg, zoom, vertical_tilt, prompt_preview],
    )

    # Sliders -> 3D control (no auto-inference)
    for slider in [rotate_deg, zoom, vertical_tilt]:
        slider.release(
            fn=sync_sliders_to_3d,
            inputs=[rotate_deg, zoom, vertical_tilt],
            outputs=[camera_3d],
        )

    # Reset
    reset_btn.click(
        fn=reset_all,
        inputs=None,
        outputs=[rotate_deg, zoom, vertical_tilt, is_reset],
        queue=False,
    ).then(fn=end_reset, inputs=None, outputs=[is_reset], queue=False).then(
        fn=sync_sliders_to_3d,
        inputs=[rotate_deg, zoom, vertical_tilt],
        outputs=[camera_3d],
    )

    # Generate button - two-stage process
    # Stage 1: fetch the structured caption from the BRIA API and display it immediately
    run_event = run_btn.click(
        fn=fetch_structured_caption,
        inputs=[
            image,
            rotate_deg,
            zoom,
            vertical_tilt,
            seed,
            randomize_seed,
            prev_output,
        ],
        outputs=[seed, prompt_preview, structured_json, processed_image],
    ).then(
        # Stage 2: generate the image with the Fibo Edit pipeline
        fn=generate_image_from_caption,
        inputs=[
            processed_image,
            structured_json,
            seed,
            guidance_scale,
            num_inference_steps,
        ],
        outputs=[result],
    )

    # Image upload
    image.upload(
        fn=update_dimensions_on_upload, inputs=[image], outputs=[width, height]
    ).then(
        fn=reset_all,
        inputs=None,
        outputs=[rotate_deg, zoom, vertical_tilt, is_reset],
        queue=False,
    ).then(fn=end_reset, inputs=None, outputs=[is_reset], queue=False).then(
        fn=update_3d_image, inputs=[image], outputs=[camera_3d]
    )

    image.clear(fn=lambda: gr.update(imageUrl=None), outputs=[camera_3d])

    run_event.then(lambda img: img, inputs=[result], outputs=[prev_output])

    # Examples are omitted for now since actual example images are needed.
    # Note: with the two-stage inference process, examples would need custom handling
    # to chain fetch_structured_caption -> generate_image_from_caption.

    # Sync the 3D component when sliders change (covers example loading)
    def sync_3d_on_slider_change(img, rot, zoom_val, tilt):
        camera_value = {"rotate_deg": rot, "zoom": zoom_val, "vertical_tilt": tilt}
        if img is not None:
            buffered = BytesIO()
            img.save(buffered, format="PNG")
            img_str = base64.b64encode(buffered.getvalue()).decode()
            data_url = f"data:image/png;base64,{img_str}"
            return gr.update(value=camera_value, imageUrl=data_url)
        return gr.update(value=camera_value)

    # When any slider value changes (including from examples), sync the 3D component
    for slider in [rotate_deg, zoom, vertical_tilt]:
        slider.change(
            fn=sync_3d_on_slider_change,
            inputs=[image, rotate_deg, zoom, vertical_tilt],
            outputs=[camera_3d],
        )

    # API endpoints for the two-stage inference process
    gr.api(fetch_structured_caption, api_name="fetch_caption")
    gr.api(generate_image_from_caption, api_name="generate_image")

if __name__ == "__main__":
    head = '<script src="https://cdnjs.cloudflare.com/ajax/libs/three.js/r128/three.min.js"></script>'

    if RUN_LOCAL:
        # Local development configuration
        demo.launch(
            mcp_server=True,
            head=head,
            footer_links=["api", "gradio", "settings"],
            server_name="0.0.0.0",
            server_port=8081,
        )
    else:
        # HuggingFace Spaces standard configuration
        demo.launch(head=head)
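As a quick sanity check of the resizing logic in `update_dimensions_on_upload`, here is a dependency-free sketch of the same arithmetic (the helper name `recommended_size` is ours, not part of the app): fit the longest side to 1024 px, then snap both sides down to multiples of 8, matching the step size of the Height/Width sliders.

```python
def recommended_size(original_width: int, original_height: int) -> tuple:
    """Mirror of update_dimensions_on_upload's math, without PIL."""
    if original_width > original_height:
        new_width = 1024
        new_height = int(new_width * original_height / original_width)
    else:
        new_height = 1024
        new_width = int(new_height * original_width / original_height)
    # Snap both sides down to multiples of 8 (latent-space alignment).
    return (new_width // 8) * 8, (new_height // 8) * 8

print(recommended_size(1920, 1080))  # -> (1024, 576)
```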
fibo_edit_pipeline.py
# Copyright (c) Bria.ai. All rights reserved.
#
# This file is licensed under the Creative Commons Attribution-NonCommercial 4.0 International Public License (CC-BY-NC-4.0).
# You may obtain a copy of the license at https://creativecommons.org/licenses/by-nc/4.0/
#
# You are free to share and adapt this material for non-commercial purposes provided you give appropriate credit,
# indicate if changes were made, and do not use the material for commercial purposes.
#
# See the license for further details.

from typing import Any, Callable, Dict, List, Optional, Union

import numpy as np
import PIL
import torch
from transformers import AutoTokenizer
from transformers.models.smollm3.modeling_smollm3 import SmolLM3ForCausalLM

from diffusers.image_processor import VaeImageProcessor
from diffusers.loaders import FluxLoraLoaderMixin
from diffusers.models.autoencoders.autoencoder_kl_wan import AutoencoderKLWan
from diffusers.models.transformers.transformer_bria_fibo import BriaFiboTransformer2DModel
from diffusers.pipelines.bria_fibo.pipeline_output import BriaFiboPipelineOutput
from diffusers.pipelines.flux.pipeline_flux import calculate_shift, retrieve_timesteps
from diffusers.pipelines.pipeline_utils import DiffusionPipeline
from diffusers.schedulers import FlowMatchEulerDiscreteScheduler, KarrasDiffusionSchedulers
from diffusers.utils import (
    USE_PEFT_BACKEND,
    is_torch_xla_available,
    logging,
    replace_example_docstring,
    scale_lora_layers,
    unscale_lora_layers,
)
from diffusers.utils.torch_utils import randn_tensor


if is_torch_xla_available():
    import torch_xla.core.xla_model as xm

    XLA_AVAILABLE = True
else:
    XLA_AVAILABLE = False


logger = logging.get_logger(__name__)  # pylint: disable=invalid-name

EXAMPLE_DOC_STRING = """
    Example:
        ```python
        import torch
        from diffusers import BriaFiboPipeline
        from diffusers.modular_pipelines import ModularPipeline

        torch.set_grad_enabled(False)
        vlm_pipe = ModularPipeline.from_pretrained("briaai/FIBO-VLM-prompt-to-JSON", trust_remote_code=True)

        pipe = BriaFiboPipeline.from_pretrained(
            "briaai/FIBO",
            trust_remote_code=True,
            torch_dtype=torch.bfloat16,
        )
        pipe.enable_model_cpu_offload()

        with torch.inference_mode():
            # 1. Create a prompt to generate an initial image
            output = vlm_pipe(prompt="a beautiful dog")
            json_prompt_generate = output.values["json_prompt"]

            # 2. Generate the image from the structured JSON prompt
            results_generate = pipe(prompt=json_prompt_generate, num_inference_steps=50, guidance_scale=5)
            results_generate.images[0].save("image_generate.png")
        ```
"""

PREFERRED_RESOLUTION = {
    256 * 256: [(208, 304), (224, 288), (256, 256), (288, 224), (304, 208), (320, 192), (336, 192)],
    512 * 512: [
        (416, 624),
        (432, 592),
        (464, 560),
        (512, 512),
        (544, 480),
        (576, 448),
        (592, 432),
        (608, 416),
        (624, 416),
        (640, 400),
        (672, 384),
        (704, 368),
    ],
    1024 * 1024: [
        (832, 1248),
        (880, 1184),
        (912, 1136),
        (1024, 1024),
        (1136, 912),
        (1184, 880),
        (1216, 848),
        (1248, 832),
        (1264, 816),
        (1296, 800),
        (1360, 768),
    ],
}

class BriaFiboEditPipeline(DiffusionPipeline, FluxLoraLoaderMixin):
    r"""
    Args:
        transformer (`BriaFiboTransformer2DModel`):
            The transformer model for 2D diffusion modeling.
        scheduler (`FlowMatchEulerDiscreteScheduler` or `KarrasDiffusionSchedulers`):
            Scheduler to be used with `transformer` to denoise the encoded latents.
        vae (`AutoencoderKLWan`):
            Variational Auto-Encoder for encoding and decoding images to and from latent representations.
        text_encoder (`SmolLM3ForCausalLM`):
            Text encoder for processing input prompts.
        tokenizer (`AutoTokenizer`):
            Tokenizer used for processing the input text prompts for the text_encoder.
    """

    model_cpu_offload_seq = "text_encoder->text_encoder_2->image_encoder->transformer->vae"
    _callback_tensor_inputs = ["latents", "prompt_embeds"]

    def __init__(
        self,
        transformer: BriaFiboTransformer2DModel,
        scheduler: Union[FlowMatchEulerDiscreteScheduler, KarrasDiffusionSchedulers],
        vae: AutoencoderKLWan,
        text_encoder: SmolLM3ForCausalLM,
        tokenizer: AutoTokenizer,
    ):
        self.register_modules(
            vae=vae,
            text_encoder=text_encoder,
            tokenizer=tokenizer,
            transformer=transformer,
            scheduler=scheduler,
        )

        self.vae_scale_factor = 16
        self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
        self.default_sample_size = 32

    def get_prompt_embeds(
        self,
        prompt: Union[str, List[str]],
        num_images_per_prompt: int = 1,
        max_sequence_length: int = 2048,
        device: Optional[torch.device] = None,
        dtype: Optional[torch.dtype] = None,
    ):
        device = device or self._execution_device
        dtype = dtype or self.text_encoder.dtype

        prompt = [prompt] if isinstance(prompt, str) else prompt
        if not prompt:
            raise ValueError("`prompt` must be a non-empty string or list of strings.")

        batch_size = len(prompt)
        bot_token_id = 128000

        text_encoder_device = device if device is not None else torch.device("cpu")
        if not isinstance(text_encoder_device, torch.device):
            text_encoder_device = torch.device(text_encoder_device)

        if all(p == "" for p in prompt):
            input_ids = torch.full((batch_size, 1), bot_token_id, dtype=torch.long, device=text_encoder_device)
            attention_mask = torch.ones_like(input_ids)
        else:
            tokenized = self.tokenizer(
                prompt,
                padding="longest",
                max_length=max_sequence_length,
                truncation=True,
                add_special_tokens=True,
                return_tensors="pt",
            )
            input_ids = tokenized.input_ids.to(text_encoder_device)
            attention_mask = tokenized.attention_mask.to(text_encoder_device)

            if any(p == "" for p in prompt):
                empty_rows = torch.tensor([p == "" for p in prompt], dtype=torch.bool, device=text_encoder_device)
                input_ids[empty_rows] = bot_token_id
                attention_mask[empty_rows] = 1

        encoder_outputs = self.text_encoder(
            input_ids,
            attention_mask=attention_mask,
            output_hidden_states=True,
        )
        hidden_states = encoder_outputs.hidden_states

        prompt_embeds = torch.cat([hidden_states[-1], hidden_states[-2]], dim=-1)
        prompt_embeds = prompt_embeds.to(device=device, dtype=dtype)

        prompt_embeds = prompt_embeds.repeat_interleave(num_images_per_prompt, dim=0)
        hidden_states = tuple(
            layer.repeat_interleave(num_images_per_prompt, dim=0).to(device=device) for layer in hidden_states
        )
        attention_mask = attention_mask.repeat_interleave(num_images_per_prompt, dim=0).to(device=device)

        return prompt_embeds, hidden_states, attention_mask

    @staticmethod
    def pad_embedding(prompt_embeds, max_tokens, attention_mask=None):
        # Pad embeddings to `max_tokens` while preserving the mask of real tokens.
        batch_size, seq_len, dim = prompt_embeds.shape

        if attention_mask is None:
            attention_mask = torch.ones((batch_size, seq_len), dtype=prompt_embeds.dtype, device=prompt_embeds.device)
        else:
            attention_mask = attention_mask.to(device=prompt_embeds.device, dtype=prompt_embeds.dtype)

        if max_tokens < seq_len:
            raise ValueError("`max_tokens` must be greater or equal to the current sequence length.")

        if max_tokens > seq_len:
            pad_length = max_tokens - seq_len
            padding = torch.zeros((batch_size, pad_length, dim), dtype=prompt_embeds.dtype, device=prompt_embeds.device)
            prompt_embeds = torch.cat([prompt_embeds, padding], dim=1)

            mask_padding = torch.zeros((batch_size, pad_length), dtype=prompt_embeds.dtype, device=prompt_embeds.device)
            attention_mask = torch.cat([attention_mask, mask_padding], dim=1)

        return prompt_embeds, attention_mask

    def encode_prompt(
        self,
        prompt: Union[str, List[str]],
        device: Optional[torch.device] = None,
        num_images_per_prompt: int = 1,
        guidance_scale: float = 5,
        negative_prompt: Optional[Union[str, List[str]]] = None,
        prompt_embeds: Optional[torch.FloatTensor] = None,
        negative_prompt_embeds: Optional[torch.FloatTensor] = None,
        max_sequence_length: int = 3000,
        lora_scale: Optional[float] = None,
    ):
        r"""
        Args:
            prompt (`str` or `List[str]`, *optional*):
                Prompt to be encoded.
            device (`torch.device`):
                Torch device.
            num_images_per_prompt (`int`):
                Number of images that should be generated per prompt.
            guidance_scale (`float`):
                Guidance scale for classifier-free guidance.
            negative_prompt (`str` or `List[str]`, *optional*):
                The prompt or prompts not to guide the image generation. If not defined, one has to pass
                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale`
                is less than `1`).
            prompt_embeds (`torch.FloatTensor`, *optional*):
                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If
                not provided, text embeddings will be generated from the `prompt` input argument.
            negative_prompt_embeds (`torch.FloatTensor`, *optional*):
                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
                weighting. If not provided, negative_prompt_embeds will be generated from the `negative_prompt` input
                argument.
        """
        device = device or self._execution_device

        # Set the LoRA scale so that the monkey-patched LoRA
        # function of the text encoder can correctly access it
        if lora_scale is not None and isinstance(self, FluxLoraLoaderMixin):
            self._lora_scale = lora_scale

            # Dynamically adjust the LoRA scale
            if self.text_encoder is not None and USE_PEFT_BACKEND:
                scale_lora_layers(self.text_encoder, lora_scale)

        prompt = [prompt] if isinstance(prompt, str) else prompt
        if prompt is not None:
            batch_size = len(prompt)
        else:
            batch_size = prompt_embeds.shape[0]

        prompt_attention_mask = None
        negative_prompt_attention_mask = None
        if prompt_embeds is None:
            prompt_embeds, prompt_layers, prompt_attention_mask = self.get_prompt_embeds(
                prompt=prompt,
                num_images_per_prompt=num_images_per_prompt,
                max_sequence_length=max_sequence_length,
                device=device,
            )
            prompt_embeds = prompt_embeds.to(dtype=self.transformer.dtype)
            prompt_layers = [tensor.to(dtype=self.transformer.dtype) for tensor in prompt_layers]

        if guidance_scale > 1:
            if isinstance(negative_prompt, list) and negative_prompt[0] is None:
                negative_prompt = ""
            negative_prompt = negative_prompt or ""
            negative_prompt = batch_size * [negative_prompt] if isinstance(negative_prompt, str) else negative_prompt
            if prompt is not None and type(prompt) is not type(negative_prompt):
                raise TypeError(
                    f"`negative_prompt` should be the same type as `prompt`, but got {type(negative_prompt)} !="
                    f" {type(prompt)}."
                )
            elif batch_size != len(negative_prompt):
                raise ValueError(
                    f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
                    f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
                    " the batch size of `prompt`."
                )

            negative_prompt_embeds, negative_prompt_layers, negative_prompt_attention_mask = self.get_prompt_embeds(
                prompt=negative_prompt,
                num_images_per_prompt=num_images_per_prompt,
                max_sequence_length=max_sequence_length,
                device=device,
            )
            negative_prompt_embeds = negative_prompt_embeds.to(dtype=self.transformer.dtype)
            negative_prompt_layers = [tensor.to(dtype=self.transformer.dtype) for tensor in negative_prompt_layers]

        if self.text_encoder is not None:
            if isinstance(self, FluxLoraLoaderMixin) and USE_PEFT_BACKEND:
                # Retrieve the original scale by scaling back the LoRA layers
                unscale_lora_layers(self.text_encoder, lora_scale)

        # Pad to the longest sequence
        if prompt_attention_mask is not None:
            prompt_attention_mask = prompt_attention_mask.to(device=prompt_embeds.device, dtype=prompt_embeds.dtype)

        if negative_prompt_embeds is not None:
            if negative_prompt_attention_mask is not None:
                negative_prompt_attention_mask = negative_prompt_attention_mask.to(
                    device=negative_prompt_embeds.device, dtype=negative_prompt_embeds.dtype
                )
            max_tokens = max(negative_prompt_embeds.shape[1], prompt_embeds.shape[1])

            prompt_embeds, prompt_attention_mask = self.pad_embedding(
                prompt_embeds, max_tokens, attention_mask=prompt_attention_mask
            )
            prompt_layers = [self.pad_embedding(layer, max_tokens)[0] for layer in prompt_layers]

            negative_prompt_embeds, negative_prompt_attention_mask = self.pad_embedding(
                negative_prompt_embeds, max_tokens, attention_mask=negative_prompt_attention_mask
            )
            negative_prompt_layers = [self.pad_embedding(layer, max_tokens)[0] for layer in negative_prompt_layers]
        else:
            max_tokens = prompt_embeds.shape[1]
            prompt_embeds, prompt_attention_mask = self.pad_embedding(
                prompt_embeds, max_tokens, attention_mask=prompt_attention_mask
            )
            negative_prompt_layers = None

        dtype = self.text_encoder.dtype
        text_ids = torch.zeros(prompt_embeds.shape[0], max_tokens, 3).to(device=device, dtype=dtype)

        return (
            prompt_embeds,
            negative_prompt_embeds,
            text_ids,
            prompt_attention_mask,
            negative_prompt_attention_mask,
            prompt_layers,
            negative_prompt_layers,
        )

    @property
    def guidance_scale(self):
        return self._guidance_scale

    # Here `guidance_scale` is defined analogously to the guidance weight `w` of equation (2)
    # of the Imagen paper: https://huggingface.co/papers/2205.11487. `guidance_scale = 1`
    # corresponds to doing no classifier-free guidance.

    @property
    def joint_attention_kwargs(self):
        return self._joint_attention_kwargs

    @property
    def num_timesteps(self):
        return self._num_timesteps

    @property
    def interrupt(self):
        return self._interrupt

    @staticmethod
    # Based on diffusers.pipelines.flux.pipeline_flux.FluxPipeline._unpack_latents
    def _unpack_latents(latents, height, width, vae_scale_factor):
        batch_size, num_patches, channels = latents.shape

        height = height // vae_scale_factor
        width = width // vae_scale_factor

        latents = latents.view(batch_size, height // 2, width // 2, channels // 4, 2, 2)
        latents = latents.permute(0, 3, 1, 4, 2, 5)

        latents = latents.reshape(batch_size, channels // (2 * 2), height, width)
        return latents

    @staticmethod
    # Copied from diffusers.pipelines.flux.pipeline_flux.FluxPipeline._prepare_latent_image_ids
    def _prepare_latent_image_ids(batch_size, height, width, device, dtype):
        latent_image_ids = torch.zeros(height, width, 3)
        latent_image_ids[..., 1] = latent_image_ids[..., 1] + torch.arange(height)[:, None]
        latent_image_ids[..., 2] = latent_image_ids[..., 2] + torch.arange(width)[None, :]

        latent_image_id_height, latent_image_id_width, latent_image_id_channels = latent_image_ids.shape

        latent_image_ids = latent_image_ids.reshape(
            latent_image_id_height * latent_image_id_width, latent_image_id_channels
        )

        return latent_image_ids.to(device=device, dtype=dtype)

    @staticmethod
    def _unpack_latents_no_patch(latents, height, width, vae_scale_factor):
        batch_size, num_patches, channels = latents.shape

        height = height // vae_scale_factor
        width = width // vae_scale_factor

        latents = latents.view(batch_size, height, width, channels)
        latents = latents.permute(0, 3, 1, 2)

        return latents

    @staticmethod
    def _pack_latents_no_patch(latents, batch_size, num_channels_latents, height, width):
        latents = latents.permute(0, 2, 3, 1)
        latents = latents.reshape(batch_size, height * width, num_channels_latents)
        return latents

    @staticmethod
    # Copied from diffusers.pipelines.flux.pipeline_flux.FluxPipeline._pack_latents
    def _pack_latents(latents, batch_size, num_channels_latents, height, width):
        latents = latents.view(batch_size, num_channels_latents, height // 2, 2, width // 2, 2)
        latents = latents.permute(0, 2, 4, 1, 3, 5)
        latents = latents.reshape(batch_size, (height // 2) * (width // 2), num_channels_latents * 4)

        return latents

    def prepare_latents(
        self,
        batch_size,
        num_channels_latents,
        height,
        width,
        dtype,
        device,
        generator,
        latents=None,
        do_patching=False,
    ):
        height = int(height) // self.vae_scale_factor
        width = int(width) // self.vae_scale_factor

        shape = (batch_size, num_channels_latents, height, width)

        if latents is not None:
            latent_image_ids = self._prepare_latent_image_ids(batch_size, height, width, device, dtype)
            return latents.to(device=device, dtype=dtype), latent_image_ids

        if isinstance(generator, list) and len(generator) != batch_size:
            raise ValueError(
                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
            )

        latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
        if do_patching:
            latents = self._pack_latents(latents, batch_size, num_channels_latents, height, width)
            latent_image_ids = self._prepare_latent_image_ids(batch_size, height // 2, width // 2, device, dtype)
        else:
            latents = self._pack_latents_no_patch(latents, batch_size, num_channels_latents, height, width)
            latent_image_ids = self._prepare_latent_image_ids(batch_size, height, width, device, dtype)

        return latents, latent_image_ids

    @staticmethod
    def _prepare_attention_mask(attention_mask):
        attention_matrix = torch.einsum("bi,bj->bij", attention_mask, attention_mask)

        # Convert to 0 = keep, -inf = ignore
        attention_matrix = torch.where(
            attention_matrix == 1, 0.0, -torch.inf
        )  # Apply -inf to ignored tokens to null their softmax score
        return attention_matrix

+ @torch.no_grad()
487
+ @replace_example_docstring(EXAMPLE_DOC_STRING)
488
+ def __call__(
489
+ self,
490
+ prompt: Union[str, List[str]] = None,
491
+ image: Optional[Union[PIL.Image.Image, torch.FloatTensor]] = None,
492
+ num_inference_steps: int = 30,
493
+ timesteps: List[int] = None,
494
+ guidance_scale: float = 5,
495
+ negative_prompt: Optional[Union[str, List[str]]] = None,
496
+ num_images_per_prompt: Optional[int] = 1,
497
+ generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
498
+ latents: Optional[torch.FloatTensor] = None,
499
+ prompt_embeds: Optional[torch.FloatTensor] = None,
500
+ negative_prompt_embeds: Optional[torch.FloatTensor] = None,
501
+ output_type: Optional[str] = "pil",
502
+ return_dict: bool = True,
503
+ joint_attention_kwargs: Optional[Dict[str, Any]] = None,
504
+ callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
505
+ callback_on_step_end_tensor_inputs: List[str] = ["latents"],
506
+ max_sequence_length: int = 3000,
507
+ do_patching=False,
508
+ _auto_resize: bool = True,
509
+ base_resolution: int = 1024,
510
+ ):
511
+ r"""
512
+ Function invoked when calling the pipeline for generation.
513
+
514
+ Args:
515
+ prompt (`str` or `List[str]`, *optional*):
516
+ The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`
517
+ instead.
518
+ image (`PIL.Image.Image` or `torch.FloatTensor`, *optional*):
519
+ The image to guide the image generation. If not defined, the pipeline will generate an image from scratch.
520
+ num_inference_steps (`int`, *optional*, defaults to 30):
521
+ The number of denoising steps. More denoising steps usually lead to a higher quality image at the
522
+ expense of slower inference.
523
+ timesteps (`List[int]`, *optional*):
524
+ Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument
525
+ in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is
526
+ passed will be used. Must be in descending order.
527
+ guidance_scale (`float`, *optional*, defaults to 5.0):
528
+ Guidance scale as defined in [Classifier-Free Diffusion
529
+ Guidance](https://huggingface.co/papers/2207.12598). `guidance_scale` is defined as `w` of equation 2.
530
+ of [Imagen Paper](https://huggingface.co/papers/2205.11487). Guidance scale is enabled by setting
531
+ `guidance_scale > 1`. A higher guidance scale encourages the model to generate images closely linked to
532
+ the text `prompt`, usually at the expense of lower image quality.
533
+ negative_prompt (`str` or `List[str]`, *optional*):
534
+ The prompt or prompts not to guide the image generation. If not defined, one has to pass
535
+ `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
536
+ less than `1`).
537
+ num_images_per_prompt (`int`, *optional*, defaults to 1):
538
+ The number of images to generate per prompt.
539
+ generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
540
+ One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
541
+ to make generation deterministic.
542
+ latents (`torch.FloatTensor`, *optional*):
543
+ Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
544
+ generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
545
+ tensor will be generated by sampling with the supplied random `generator`.
546
+ prompt_embeds (`torch.FloatTensor`, *optional*):
547
+ Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
548
+ provided, text embeddings will be generated from `prompt` input argument.
549
+ negative_prompt_embeds (`torch.FloatTensor`, *optional*):
550
+ Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
551
+ weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
552
+ argument.
553
+ output_type (`str`, *optional*, defaults to `"pil"`):
554
+ The output format of the generated image. Choose between
555
+ [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
556
+ return_dict (`bool`, *optional*, defaults to `True`):
557
+ Whether or not to return a [`~pipelines.stable_diffusion_xl.StableDiffusionXLPipelineOutput`] instead
558
+ of a plain tuple.
559
+ joint_attention_kwargs (`dict`, *optional*):
560
+ A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
561
+ `self.processor` in
562
+ [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
563
+ callback_on_step_end (`Callable`, *optional*):
564
+ A function that is called at the end of each denoising step during inference. The function is called
565
+ with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int,
566
+ callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by
567
+ `callback_on_step_end_tensor_inputs`.
568
+ callback_on_step_end_tensor_inputs (`List`, *optional*):
569
+ The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
570
+ will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
571
+ `._callback_tensor_inputs` attribute of your pipeline class.
572
+ max_sequence_length (`int`, *optional*, defaults to 3000): Maximum sequence length to use with the `prompt`.
573
+ do_patching (`bool`, *optional*, defaults to `False`): Whether to use patching.
574
+ Examples:
575
+ Returns:
576
+ [`~pipelines.flux.BriaFiboPipelineOutput`] or `tuple`: [`~pipelines.flux.BriaFiboPipelineOutput`] if
577
+ `return_dict` is True, otherwise a `tuple`. When returning a tuple, the first element is a list with the
578
+ generated images.
579
+ """
580
+
581
+ if image is not None and _auto_resize:
582
+ image_height, image_width = self.image_processor.get_default_height_width(image)
584
+ image_width, image_height = min(
585
+ PREFERRED_RESOLUTION[base_resolution * base_resolution],
586
+ key=lambda size: abs(size[0] / size[1] - image_width / image_height),
587
+ )
588
+ width, height = image_width, image_height
589
+
590
+ # 1. Check inputs. Raise error if not correct
591
+ self.check_inputs( # check flux
592
+ prompt=prompt,
593
+ prompt_embeds=prompt_embeds,
594
+ callback_on_step_end_tensor_inputs=callback_on_step_end_tensor_inputs,
595
+ max_sequence_length=max_sequence_length,
596
+ )
597
+
598
+ self._guidance_scale = guidance_scale
599
+ self._joint_attention_kwargs = joint_attention_kwargs
600
+ self._interrupt = False
601
+
602
+ # 2. Define call parameters
603
+ if prompt is not None and isinstance(prompt, str):
604
+ batch_size = 1
605
+ elif prompt is not None and isinstance(prompt, list):
606
+ batch_size = len(prompt)
607
+ else:
608
+ batch_size = prompt_embeds.shape[0]
609
+
610
+ device = self._execution_device
611
+
612
+ lora_scale = self.joint_attention_kwargs.get("scale", None) if self.joint_attention_kwargs is not None else None
613
+
614
+ (
615
+ prompt_embeds,
616
+ negative_prompt_embeds,
617
+ text_ids,
618
+ prompt_attention_mask,
619
+ negative_prompt_attention_mask,
620
+ prompt_layers,
621
+ negative_prompt_layers,
622
+ ) = self.encode_prompt(
623
+ prompt=prompt,
624
+ negative_prompt=negative_prompt,
625
+ guidance_scale=guidance_scale,
626
+ prompt_embeds=prompt_embeds,
627
+ negative_prompt_embeds=negative_prompt_embeds,
628
+ device=device,
629
+ max_sequence_length=max_sequence_length,
630
+ num_images_per_prompt=num_images_per_prompt,
631
+ lora_scale=lora_scale,
632
+ )
633
+ prompt_batch_size = prompt_embeds.shape[0]
634
+
635
+ if guidance_scale > 1:
636
+ prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds], dim=0)
637
+ prompt_layers = [
638
+ torch.cat([negative_prompt_layers[i], prompt_layers[i]], dim=0) for i in range(len(prompt_layers))
639
+ ]
640
+ prompt_attention_mask = torch.cat([negative_prompt_attention_mask, prompt_attention_mask], dim=0)
641
+
642
+ total_num_layers_transformer = len(self.transformer.transformer_blocks) + len(
643
+ self.transformer.single_transformer_blocks
644
+ )
645
+ if len(prompt_layers) >= total_num_layers_transformer:
646
+ # remove first layers
647
+ prompt_layers = prompt_layers[len(prompt_layers) - total_num_layers_transformer :]
648
+ else:
649
+ # duplicate last layer
650
+ prompt_layers = prompt_layers + [prompt_layers[-1]] * (total_num_layers_transformer - len(prompt_layers))
651
+
652
+ # Preprocess image
653
+ if image is not None and not (isinstance(image, torch.Tensor) and image.size(1) == self.latent_channels):
654
+ image = self.image_processor.resize(image, height, width)
655
+ image = self.image_processor.preprocess(image, height, width)
656
+
657
+ # 5. Prepare latent variables
658
+ num_channels_latents = self.transformer.config.in_channels
659
+ if do_patching:
660
+ num_channels_latents = int(num_channels_latents / 4)
661
+
662
+ latents, latent_image_ids = self.prepare_latents(
663
+ prompt_batch_size,
664
+ num_channels_latents,
665
+ height,
666
+ width,
667
+ prompt_embeds.dtype,
668
+ device,
669
+ generator,
670
+ latents,
671
+ do_patching,
672
+ )
673
+
674
+ if image is not None:
675
+ image_latents, image_ids = self.prepare_image_latents(
676
+ image=image,
677
+ batch_size=batch_size * num_images_per_prompt,
678
+ num_channels_latents=num_channels_latents,
679
+ height=height,
680
+ width=width,
681
+ dtype=prompt_embeds.dtype,
682
+ device=device,
683
+ generator=generator,
684
+ )
685
+ latent_image_ids = torch.cat([latent_image_ids, image_ids], dim=0) # dim 0 is sequence dimension
686
+ else:
687
+ image_latents = None
688
+
689
+ latent_attention_mask = torch.ones(
690
+ [latents.shape[0], latents.shape[1]], dtype=latents.dtype, device=latents.device
691
+ )
692
+ if guidance_scale > 1:
693
+ latent_attention_mask = latent_attention_mask.repeat(2, 1)
694
+
695
+ if image_latents is None:
696
+ attention_mask = torch.cat([prompt_attention_mask, latent_attention_mask], dim=1)
697
+ else:
698
+ image_latent_attention_mask = torch.ones(
699
+ [image_latents.shape[0], image_latents.shape[1]],
700
+ dtype=image_latents.dtype,
701
+ device=image_latents.device,
702
+ )
703
+ if guidance_scale > 1:
704
+ image_latent_attention_mask = image_latent_attention_mask.repeat(2, 1)
705
+ attention_mask = torch.cat(
706
+ [prompt_attention_mask, latent_attention_mask, image_latent_attention_mask], dim=1
707
+ )
708
+
709
+ attention_mask = self.create_attention_matrix(attention_mask) # batch, seq => batch, seq, seq
710
+ attention_mask = attention_mask.unsqueeze(dim=1).to(dtype=self.transformer.dtype) # for head broadcasting
711
+
712
+ if self._joint_attention_kwargs is None:
713
+ self._joint_attention_kwargs = {}
714
+ self._joint_attention_kwargs["attention_mask"] = attention_mask
715
+
716
+ # Adapt scheduler to dynamic shifting (resolution dependent)
717
+
718
+ if do_patching:
719
+ seq_len = (height // (self.vae_scale_factor * 2)) * (width // (self.vae_scale_factor * 2))
720
+ else:
721
+ seq_len = (height // self.vae_scale_factor) * (width // self.vae_scale_factor)
722
+
723
+ sigmas = np.linspace(1.0, 1 / num_inference_steps, num_inference_steps)
724
+
725
+ mu = calculate_shift(
726
+ seq_len,
727
+ self.scheduler.config.base_image_seq_len,
728
+ self.scheduler.config.max_image_seq_len,
729
+ self.scheduler.config.base_shift,
730
+ self.scheduler.config.max_shift,
731
+ )
732
+
733
+ # Init sigmas and timesteps according to shift size
734
+ # This changes the scheduler in-place according to the dynamic scheduling
735
+ timesteps, num_inference_steps = retrieve_timesteps(
736
+ self.scheduler,
737
+ num_inference_steps=num_inference_steps,
738
+ device=device,
739
+ timesteps=None,
740
+ sigmas=sigmas,
741
+ mu=mu,
742
+ )
743
+
744
+ num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0)
745
+ self._num_timesteps = len(timesteps)
746
+
747
+ # Support old different diffusers versions
748
+ if len(latent_image_ids.shape) == 3:
749
+ latent_image_ids = latent_image_ids[0]
750
+
751
+ if len(text_ids.shape) == 3:
752
+ text_ids = text_ids[0]
753
+
754
+ # 6. Denoising loop
755
+ with self.progress_bar(total=num_inference_steps) as progress_bar:
756
+ for i, t in enumerate(timesteps):
757
+ if self.interrupt:
758
+ continue
759
+
760
+ latent_model_input = latents
761
+
762
+ if image_latents is not None:
763
+ latent_model_input = torch.cat([latent_model_input, image_latents], dim=1)
764
+
765
+ # expand the latents if we are doing classifier free guidance
766
+ latent_model_input = torch.cat([latent_model_input] * 2) if guidance_scale > 1 else latent_model_input
767
+
768
+ # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
769
+ timestep = t.expand(latent_model_input.shape[0]).to(
770
+ device=latent_model_input.device, dtype=latent_model_input.dtype
771
+ )
772
+
773
+ # This predicts "v" for flow matching or eps for diffusion
774
+ noise_pred = self.transformer(
775
+ hidden_states=latent_model_input,
776
+ timestep=timestep,
777
+ encoder_hidden_states=prompt_embeds,
778
+ text_encoder_layers=prompt_layers,
779
+ joint_attention_kwargs=self.joint_attention_kwargs,
780
+ return_dict=False,
781
+ txt_ids=text_ids,
782
+ img_ids=latent_image_ids,
783
+ )[0]
784
+
785
+ # perform guidance
786
+ if guidance_scale > 1:
787
+ noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
788
+ noise_pred = noise_pred_uncond + self.guidance_scale * (noise_pred_text - noise_pred_uncond)
789
+
790
+ # compute the previous noisy sample x_t -> x_t-1
791
+ latents_dtype = latents.dtype
792
+ latents = self.scheduler.step(noise_pred[:, : latents.shape[1], ...], t, latents, return_dict=False)[0]
793
+
794
+ if latents.dtype != latents_dtype:
795
+ if torch.backends.mps.is_available():
796
+ # some platforms (e.g. Apple MPS) misbehave due to a pytorch bug: https://github.com/pytorch/pytorch/pull/99272
797
+ latents = latents.to(latents_dtype)
798
+
799
+ if callback_on_step_end is not None:
800
+ callback_kwargs = {}
801
+ for k in callback_on_step_end_tensor_inputs:
802
+ callback_kwargs[k] = locals()[k]
803
+ callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
804
+
805
+ latents = callback_outputs.pop("latents", latents)
806
+ prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
807
+ negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds)
808
+
809
+ # call the callback, if provided
810
+ if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
811
+ progress_bar.update()
812
+
813
+ if XLA_AVAILABLE:
814
+ xm.mark_step()
815
+
816
+ if output_type == "latent":
817
+ image = latents
818
+
819
+ else:
820
+ if do_patching:
821
+ latents = self._unpack_latents(latents, height, width, self.vae_scale_factor)
822
+ else:
823
+ latents = self._unpack_latents_no_patch(latents, height, width, self.vae_scale_factor)
824
+
825
+ latents = latents.unsqueeze(dim=2)
826
+ latents_device = latents[0].device
827
+ latents_dtype = latents[0].dtype
828
+ latents_mean = (
829
+ torch.tensor(self.vae.config.latents_mean)
830
+ .view(1, self.vae.config.z_dim, 1, 1, 1)
831
+ .to(latents_device, latents_dtype)
832
+ )
833
+ latents_std = 1.0 / torch.tensor(self.vae.config.latents_std).view(1, self.vae.config.z_dim, 1, 1, 1).to(
834
+ latents_device, latents_dtype
835
+ )
836
+ latents_scaled = [latent / latents_std + latents_mean for latent in latents]
837
+ latents_scaled = torch.cat(latents_scaled, dim=0)
838
+ image = []
839
+ for scaled_latent in latents_scaled:
840
+ curr_image = self.vae.decode(scaled_latent.unsqueeze(0), return_dict=False)[0]
841
+ curr_image = self.image_processor.postprocess(curr_image.squeeze(dim=2), output_type=output_type)
842
+ image.append(curr_image)
843
+ if len(image) == 1:
844
+ image = image[0]
845
+ else:
846
+ image = np.stack(image, axis=0)
847
+
848
+ # Offload all models
849
+ self.maybe_free_model_hooks()
850
+
851
+ if not return_dict:
852
+ return (image,)
853
+
854
+ return BriaFiboPipelineOutput(images=image)
855
+
856
+ def prepare_image_latents(
857
+ self,
858
+ image: torch.Tensor,
859
+ batch_size: int,
860
+ num_channels_latents: int,
861
+ height: int,
862
+ width: int,
863
+ dtype: torch.dtype,
864
+ device: torch.device,
865
+ generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
866
+ ):
867
+ image = image.to(device=device, dtype=dtype)
868
+
869
+ height = int(height) // self.vae_scale_factor
870
+ width = int(width) // self.vae_scale_factor
871
+
872
+ # scaling
873
+ latents_mean = (
874
+ torch.tensor(self.vae.config.latents_mean).view(1, self.vae.config.z_dim, 1, 1, 1).to(device, dtype)
875
+ )
876
+ latents_std = 1.0 / torch.tensor(self.vae.config.latents_std).view(1, self.vae.config.z_dim, 1, 1, 1).to(
877
+ device, dtype
878
+ )
879
+
880
+ image_latents_cthw = self.vae.encode(image.unsqueeze(2)).latent_dist.mean
881
+ latents_scaled = [(latent - latents_mean) * latents_std for latent in image_latents_cthw]
882
+ image_latents_cthw = torch.concat(latents_scaled, dim=0)
883
+ image_latents_bchw = image_latents_cthw[:, :, 0, :, :]
884
+
885
+ image_latent_height, image_latent_width = image_latents_bchw.shape[2:]
886
+ image_latents_bsd = self._pack_latents_no_patch(
887
+ latents=image_latents_bchw,
888
+ batch_size=batch_size,
889
+ num_channels_latents=num_channels_latents,
890
+ height=image_latent_height,
891
+ width=image_latent_width,
892
+ )
894
+ image_ids = self._prepare_latent_image_ids(
895
+ batch_size=batch_size, height=image_latent_height, width=image_latent_width, device=device, dtype=dtype
896
+ )
897
+ # image ids are the same as latent ids with the first dimension set to 1 instead of 0
898
+ image_ids[..., 0] = 1
899
+ return image_latents_bsd, image_ids
900
+
901
+ def check_inputs(
902
+ self,
903
+ prompt,
904
+ negative_prompt=None,
905
+ prompt_embeds=None,
906
+ negative_prompt_embeds=None,
907
+ callback_on_step_end_tensor_inputs=None,
908
+ max_sequence_length=None,
909
+ ):
910
+ if callback_on_step_end_tensor_inputs is not None and not all(
911
+ k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs
912
+ ):
913
+ raise ValueError(
914
+ f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}"
915
+ )
916
+
917
+ if prompt is not None and prompt_embeds is not None:
918
+ raise ValueError(
919
+ f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
920
+ " only forward one of the two."
921
+ )
922
+ elif prompt is None and prompt_embeds is None:
923
+ raise ValueError(
924
+ "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
925
+ )
926
+ elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
927
+ raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
928
+
929
+ if negative_prompt is not None and negative_prompt_embeds is not None:
930
+ raise ValueError(
931
+ f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
932
+ f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
933
+ )
934
+
935
+ if prompt_embeds is not None and negative_prompt_embeds is not None:
936
+ if prompt_embeds.shape != negative_prompt_embeds.shape:
937
+ raise ValueError(
938
+ "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
939
+ f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
940
+ f" {negative_prompt_embeds.shape}."
941
+ )
942
+
943
+ if max_sequence_length is not None and max_sequence_length > 3000:
944
+ raise ValueError(f"`max_sequence_length` cannot be greater than 3000 but is {max_sequence_length}")
945
+
946
+ def create_attention_matrix(self, attention_mask):
947
+ attention_matrix = torch.einsum("bi,bj->bij", attention_mask, attention_mask)
948
+
949
+ # convert to 0 - keep, -inf ignore
950
+ attention_matrix = torch.where(
951
+ attention_matrix == 1, 0.0, -torch.inf
952
+ ) # Apply -inf to ignored tokens for nulling softmax score
953
+ return attention_matrix
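One subtlety in `__call__` above is how per-layer text-encoder features are aligned to the transformer's depth: if the encoder produced more layers than the transformer has blocks, the earliest layers are dropped; if it produced fewer, the last layer is repeated as padding. A torch-free sketch of that truncate-or-duplicate rule (`align_layers` is an illustrative name, not part of the pipeline):

```python
def align_layers(prompt_layers, total_layers):
    # Too many encoder layers: keep only the last `total_layers`
    # (the deepest, most semantic features).
    if len(prompt_layers) >= total_layers:
        return prompt_layers[len(prompt_layers) - total_layers:]
    # Too few: pad by repeating the final layer.
    return prompt_layers + [prompt_layers[-1]] * (total_layers - len(prompt_layers))

align_layers(["l0", "l1", "l2"], 2)  # ['l1', 'l2']
align_layers(["l0", "l1"], 4)        # ['l0', 'l1', 'l1', 'l1']
```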
requirements.txt ADDED
@@ -0,0 +1,133 @@
1
+ accelerate==1.12.0
2
+ aiofiles==24.1.0
3
+ annotated-doc==0.0.4
4
+ annotated-types==0.7.0
5
+ anyio==4.12.1
6
+ asttokens==3.0.1
7
+ attrs==25.4.0
8
+ boto3==1.42.28
9
+ botocore==1.42.28
10
+ brotli==1.2.0
11
+ certifi==2026.1.4
12
+ cffi==2.0.0 ; platform_python_implementation != 'PyPy'
13
+ charset-normalizer==3.4.4
14
+ click==8.3.1
15
+ colorama==0.4.6 ; sys_platform == 'win32'
16
+ cryptography==46.0.3
17
+ cuda-bindings==12.9.4 ; platform_machine == 'x86_64' and sys_platform == 'linux'
18
+ cuda-pathfinder==1.3.3 ; platform_machine == 'x86_64' and sys_platform == 'linux'
19
+ decorator==5.2.1
20
+ diffusers @ git+https://github.com/huggingface/diffusers@956bdcc3ea4897eaeb6c828b8433bdcae71e9f0f
21
+ einops==0.8.2
22
+ exceptiongroup==1.3.1 ; python_full_version < '3.11'
23
+ executing==2.2.1
24
+ fal-client==0.12.0
25
+ fastapi==0.128.0
26
+ ffmpy==1.0.0
27
+ filelock==3.20.3
28
+ fsspec==2026.1.0
29
+ gradio==6.4.0
30
+ gradio-client==2.0.3
31
+ groovy==0.1.2
32
+ h11==0.16.0
33
+ hf-xet==1.2.0 ; platform_machine == 'AMD64' or platform_machine == 'aarch64' or platform_machine == 'amd64' or platform_machine == 'arm64' or platform_machine == 'x86_64'
34
+ httpcore==1.0.9
35
+ httpx==0.28.1
36
+ httpx-sse==0.4.3
37
+ huggingface-hub==1.3.4
38
+ idna==3.11
39
+ importlib-metadata==8.7.1
40
+ ipython==8.38.0 ; python_full_version < '3.11'
41
+ ipython==9.9.0 ; python_full_version >= '3.11'
42
+ ipython-pygments-lexers==1.1.1 ; python_full_version >= '3.11'
43
+ jedi==0.19.2
44
+ jinja2==3.1.6
45
+ jmespath==1.0.1
46
+ jsonschema==4.26.0
47
+ jsonschema-specifications==2025.9.1
48
+ markdown-it-py==4.0.0
49
+ markupsafe==3.0.3
50
+ matplotlib-inline==0.2.1
51
+ mcp==1.26.0
52
+ mdurl==0.1.2
53
+ mpmath==1.3.0
54
+ msgpack==1.1.2
55
+ networkx==3.4.2 ; python_full_version < '3.11'
56
+ networkx==3.6.1 ; python_full_version >= '3.11'
57
+ numpy==2.2.6 ; python_full_version < '3.11'
58
+ numpy==2.4.1 ; python_full_version >= '3.11'
59
+ nvidia-cublas-cu12==12.8.4.1 ; platform_machine == 'x86_64' and sys_platform == 'linux'
60
+ nvidia-cuda-cupti-cu12==12.8.90 ; platform_machine == 'x86_64' and sys_platform == 'linux'
61
+ nvidia-cuda-nvrtc-cu12==12.8.93 ; platform_machine == 'x86_64' and sys_platform == 'linux'
62
+ nvidia-cuda-runtime-cu12==12.8.90 ; platform_machine == 'x86_64' and sys_platform == 'linux'
63
+ nvidia-cudnn-cu12==9.10.2.21 ; platform_machine == 'x86_64' and sys_platform == 'linux'
64
+ nvidia-cufft-cu12==11.3.3.83 ; platform_machine == 'x86_64' and sys_platform == 'linux'
65
+ nvidia-cufile-cu12==1.13.1.3 ; platform_machine == 'x86_64' and sys_platform == 'linux'
66
+ nvidia-curand-cu12==10.3.9.90 ; platform_machine == 'x86_64' and sys_platform == 'linux'
67
+ nvidia-cusolver-cu12==11.7.3.90 ; platform_machine == 'x86_64' and sys_platform == 'linux'
68
+ nvidia-cusparse-cu12==12.5.8.93 ; platform_machine == 'x86_64' and sys_platform == 'linux'
69
+ nvidia-cusparselt-cu12==0.7.1 ; platform_machine == 'x86_64' and sys_platform == 'linux'
70
+ nvidia-nccl-cu12==2.27.5 ; platform_machine == 'x86_64' and sys_platform == 'linux'
71
+ nvidia-nvjitlink-cu12==12.8.93 ; platform_machine == 'x86_64' and sys_platform == 'linux'
72
+ nvidia-nvshmem-cu12==3.4.5 ; platform_machine == 'x86_64' and sys_platform == 'linux'
73
+ nvidia-nvtx-cu12==12.8.90 ; platform_machine == 'x86_64' and sys_platform == 'linux'
74
+ orjson==3.11.5
75
+ packaging==26.0
76
+ pandas==2.3.3
77
+ parso==0.8.5
78
+ peft==0.18.1
79
+ pexpect==4.9.0 ; sys_platform != 'emscripten' and sys_platform != 'win32'
80
+ pillow==12.1.0
81
+ prompt-toolkit==3.0.52
82
+ psutil==5.9.8
83
+ ptyprocess==0.7.0 ; sys_platform != 'emscripten' and sys_platform != 'win32'
84
+ pure-eval==0.2.3
85
+ pycparser==3.0 ; implementation_name != 'PyPy' and platform_python_implementation != 'PyPy'
86
+ pydantic==2.12.5
87
+ pydantic-core==2.41.5
88
+ pydantic-settings==2.12.0
89
+ pydub==0.25.1
90
+ pygments==2.19.2
91
+ pyjwt==2.10.1
92
+ python-dateutil==2.9.0.post0
93
+ python-dotenv==1.2.1
94
+ python-multipart==0.0.22
95
+ pytz==2025.2
96
+ pywin32==311 ; sys_platform == 'win32'
97
+ pyyaml==6.0.3
98
+ referencing==0.37.0
99
+ regex==2026.1.15
100
+ requests==2.32.5
101
+ rich==14.3.1
102
+ rpds-py==0.30.0
103
+ s3transfer==0.16.0
104
+ safehttpx==0.1.7
105
+ safetensors==0.7.0
106
+ semantic-version==2.10.0
107
+ setuptools==80.10.2 ; python_full_version >= '3.12'
108
+ shellingham==1.5.4
109
+ six==1.17.0
110
+ spaces==0.47.0
111
+ sse-starlette==3.2.0
112
+ stack-data==0.6.3
113
+ starlette==0.50.0
114
+ sympy==1.14.0
115
+ tokenizers==0.22.2
116
+ tomlkit==0.13.3
117
+ torch==2.10.0
118
+ torchvision==0.25.0
119
+ tqdm==4.67.1
120
+ traitlets==5.14.3
121
+ transformers==5.0.0
122
+ triton==3.6.0 ; platform_machine == 'x86_64' and sys_platform == 'linux'
123
+ typer==0.21.1
124
+ typer-slim==0.21.1
125
+ typing-extensions==4.15.0
126
+ typing-inspection==0.4.2
127
+ tzdata==2025.3
128
+ ujson==5.11.0
129
+ urllib3==2.6.3
130
+ uvicorn==0.40.0
131
+ wcwidth==0.2.14
132
+ websockets==16.0
133
+ zipp==3.23.0
utils.py ADDED
@@ -0,0 +1,113 @@
1
+ """Camera angle data structures for Fibo Edit."""
2
+
3
+ from dataclasses import dataclass
4
+ from enum import Enum
5
+
6
+
7
+ class View(Enum):
8
+ """Camera view angles"""
9
+ BACK_VIEW = "back view"
10
+ BACK_LEFT_QUARTER = "back-left quarter view"
11
+ BACK_RIGHT_QUARTER = "back-right quarter view"
12
+ FRONT_VIEW = "front view"
13
+ FRONT_LEFT_QUARTER = "front-left quarter view"
14
+ FRONT_RIGHT_QUARTER = "front-right quarter view"
15
+ LEFT_SIDE = "left side view"
16
+ RIGHT_SIDE = "right side view"
17
+
18
+
19
+ class Shot(Enum):
20
+ """
21
+ Camera shot angles (measured from horizontal/eye-level as 0 degrees)
22
+
23
+ - ELEVATED: 45-60 degrees above subject (moderately elevated)
24
+ - EYE_LEVEL: 0 degrees (horizontal with subject)
25
+ - HIGH_ANGLE: 60-90 degrees above subject (steep overhead, bird's eye)
26
+ - LOW_ANGLE: Below eye level (looking up at subject)
27
+ """
28
+ ELEVATED = "elevated shot"
29
+ EYE_LEVEL = "eye-level shot"
30
+ HIGH_ANGLE = "high-angle shot"
31
+ LOW_ANGLE = "low-angle shot"
32
+
33
+
34
+ class Zoom(Enum):
35
+ """Camera zoom levels"""
36
+ CLOSE_UP = "close-up"
37
+ MEDIUM = "medium shot"
38
+ WIDE = "wide shot"
39
+
40
+
41
+ @dataclass
42
+ class AngleInstruction:
43
+ view: View
44
+ shot: Shot
45
+ zoom: Zoom
46
+
47
+ def __str__(self):
48
+ return f"<sks> {self.view.value} {self.shot.value} {self.zoom.value}"
49
+
50
+ @classmethod
51
+ def from_camera_params(cls, rotation: float, tilt: float, zoom: float) -> "AngleInstruction":
52
+ """
53
+ Create an AngleInstruction from camera parameters.
54
+
55
+ Args:
56
+ rotation: Horizontal rotation in degrees (-180 to 180)
57
+ -180/180: back view, -90: left view, 0: front view, 90: right view
58
+ tilt: Vertical tilt (-1 to 1)
59
+ -1 to -0.33: low-angle shot
60
+ -0.33 to 0.33: eye-level shot
61
+ 0.33 to 0.66: elevated shot
62
+ 0.66 to 1: high-angle shot
63
+ zoom: Zoom level (0 to 10)
64
+ 0-3.33: wide shot
65
+ 3.33-6.66: medium shot
66
+ 6.66-10: close-up
67
+
68
+ Returns:
69
+ AngleInstruction instance
70
+ """
71
+ # Map rotation to View
72
+ # Normalize rotation to -180 to 180 range
73
+ rotation = rotation % 360
74
+ if rotation > 180:
75
+ rotation -= 360
76
+
77
+ # Determine view based on rotation
78
+ if -157.5 <= rotation < -112.5:
79
+ view = View.BACK_LEFT_QUARTER
80
+ elif -112.5 <= rotation < -67.5:
81
+ view = View.LEFT_SIDE
82
+ elif -67.5 <= rotation < -22.5:
83
+ view = View.FRONT_LEFT_QUARTER
84
+ elif -22.5 <= rotation < 22.5:
85
+ view = View.FRONT_VIEW
86
+ elif 22.5 <= rotation < 67.5:
87
+ view = View.FRONT_RIGHT_QUARTER
88
+ elif 67.5 <= rotation < 112.5:
89
+ view = View.RIGHT_SIDE
90
+ elif 112.5 <= rotation < 157.5:
91
+ view = View.BACK_RIGHT_QUARTER
92
+ else: # 157.5 to 180 or -180 to -157.5
93
+ view = View.BACK_VIEW
94
+
95
+ # Map tilt to Shot
96
+ if tilt < -0.33:
97
+ shot = Shot.LOW_ANGLE
98
+ elif tilt < 0.33:
99
+ shot = Shot.EYE_LEVEL
100
+ elif tilt < 0.66:
101
+ shot = Shot.ELEVATED
102
+ else:
103
+ shot = Shot.HIGH_ANGLE
104
+
105
+ # Map zoom to Zoom
106
+ if zoom < 3.33:
107
+ zoom_level = Zoom.WIDE
108
+ elif zoom < 6.66:
109
+ zoom_level = Zoom.MEDIUM
110
+ else:
111
+ zoom_level = Zoom.CLOSE_UP
112
+
113
+ return cls(view=view, shot=shot, zoom=zoom_level)
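The rotation-to-view mapping in `from_camera_params` bins the normalized angle into eight 45° sectors centered on the compass views. A self-contained sketch of just that binning step, mirroring the thresholds above (`view_from_rotation` is an illustrative standalone function, not part of `utils.py`):

```python
def view_from_rotation(rotation: float) -> str:
    # Normalize to (-180, 180], matching from_camera_params.
    rotation = rotation % 360
    if rotation > 180:
        rotation -= 360
    # Eight 45-degree sectors; anything past +/-157.5 is the back view.
    bins = [
        (-157.5, -112.5, "back-left quarter view"),
        (-112.5, -67.5, "left side view"),
        (-67.5, -22.5, "front-left quarter view"),
        (-22.5, 22.5, "front view"),
        (22.5, 67.5, "front-right quarter view"),
        (67.5, 112.5, "right side view"),
        (112.5, 157.5, "back-right quarter view"),
    ]
    for lo, hi, name in bins:
        if lo <= rotation < hi:
            return name
    return "back view"

view_from_rotation(0)    # 'front view'
view_from_rotation(225)  # normalizes to -135 -> 'back-left quarter view'
view_from_rotation(180)  # 'back view'
```

Together with the tilt and zoom thresholds, this yields prompt strings such as `<sks> front view eye-level shot medium shot` for `from_camera_params(0, 0, 5)`.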