Spaces:

Leema-Krishna
/

ObjectInsertion

Runtime error

File size: 3,484 Bytes

f3d0a26

# frame_editor.py

import numpy as np
from PIL import Image
import torch
import cv2

def load_qwen_image_edit(use_lightning=True, device="cuda"):
    """Build the Qwen-Image-Edit-2511 editing pipeline and move it to ``device``.

    Args:
        use_lightning: When true, the 4-step Lightning LoRA weights are loaded
            on top of the base pipeline and fused into it.
        device: Target passed to the pipeline's ``.to(...)`` call.

    Returns:
        The ``QwenImageEditPlusPipeline`` instance, loaded with bfloat16 weights
        and a ``FlowMatchEulerDiscreteScheduler`` taken from the same repo.
    """
    # Imported here rather than at module import time, so diffusers is only
    # required once a pipeline is actually requested.
    from diffusers import FlowMatchEulerDiscreteScheduler, QwenImageEditPlusPipeline

    base_repo = "Qwen/Qwen-Image-Edit-2511"

    flow_scheduler = FlowMatchEulerDiscreteScheduler.from_pretrained(
        base_repo, subfolder="scheduler"
    )
    pipeline = QwenImageEditPlusPipeline.from_pretrained(
        base_repo,
        scheduler=flow_scheduler,
        torch_dtype=torch.bfloat16,
    )
    pipeline = pipeline.to(device)

    if not use_lightning:
        return pipeline

    # Load the Lightning LoRA, then fuse it into the pipeline weights.
    pipeline.load_lora_weights(
        "lightx2v/Qwen-Image-Edit-2511-Lightning",
        weight_name="Qwen-Image-Edit-2511-Lightning-4steps-V1.0-bf16.safetensors",
    )
    pipeline.fuse_lora()
    return pipeline


def insert_object_qwen_edit(
    first_frame,           # np.ndarray [H, W, 3] uint8 RGB
    box,                   # [x1, y1, x2, y2]
    object_description,    # e.g. "a red sports car"
    pipe,
    context_pad=60,        # pixels of context around box — helps Qwen understand scene
    num_inference_steps=4,
    guidance_scale=1.0,
    seed=42,
):
    """
    Inserts object into ONLY the bounding box region.
    Background outside the box is pixel-identical to original.

    Strategy:
      1. Crop (box + padding) from original → gives Qwen scene context
      2. Edit the crop with Qwen-Image-Edit
      3. Extract only the box pixels from the edited crop
      4. Paste back onto original frame

    Args:
        first_frame: Frame to edit; it is not modified (a copy is returned).
        box: ``[x1, y1, x2, y2]`` in frame pixel coordinates. Values are
            truncated to int and clamped to the frame bounds.
        object_description: Text describing the object, embedded in the prompt.
        pipe: Callable pipeline; invoked with ``image``, ``prompt``,
            ``num_inference_steps``, ``true_cfg_scale``, ``negative_prompt``
            and ``generator``, and expected to return an object whose
            ``.images[0]`` is the edited image.
        context_pad: Extra pixels of scene kept around the box in the crop.
            Negative values are treated as 0.
        num_inference_steps: Forwarded to the pipeline.
        guidance_scale: Forwarded to the pipeline as ``true_cfg_scale``.
        seed: Seed for the ``torch.Generator`` handed to the pipeline.

    Returns:
        A copy of ``first_frame`` in which only the (clamped) box region has
        been replaced by the corresponding pixels of the edited crop.

    Raises:
        ValueError: If the box is empty, inverted, or lies entirely outside
            the frame.
    """
    H, W = first_frame.shape[:2]
    bx1, by1, bx2, by2 = [int(v) for v in box]

    # Clamp the box to the frame. Without this, a negative x1/y1 produces a
    # negative crop-relative offset below, and NumPy would interpret it as
    # "count from the end", pasting the wrong pixels or failing on a shape
    # mismatch.
    x1, y1 = max(0, bx1), max(0, by1)
    x2, y2 = min(W, bx2), min(H, by2)
    if x2 <= x1 or y2 <= y1:
        # Fail early with a clear message instead of handing a zero-size
        # crop to the pipeline.
        raise ValueError(
            f"box ({bx1}, {by1}, {bx2}, {by2}) has no area inside a "
            f"{W}x{H} frame"
        )

    # A negative pad would make the crop smaller than the box and bring back
    # the negative-offset problem described above.
    pad = max(0, int(context_pad))

    # --- Step 1: Crop with context padding ---
    cx1 = max(0, x1 - pad)
    cy1 = max(0, y1 - pad)
    cx2 = min(W, x2 + pad)
    cy2 = min(H, y2 + pad)

    crop = first_frame[cy1:cy2, cx1:cx2].copy()   # [cH, cW, 3]
    cH, cW = crop.shape[:2]

    # Box coordinates relative to crop (always >= 0 after clamping)
    lx1 = x1 - cx1
    ly1 = y1 - cy1
    lx2 = x2 - cx1
    ly2 = y2 - cy1

    # --- Step 2: Build focused edit instruction ---
    prompt = (
        f"Insert {object_description} in the region ({lx1},{ly1}) to ({lx2},{ly2}). "
        f"Keep everything outside that region exactly the same. "
        f"Match the scene lighting, shadows, and perspective."
    )

    # Seeded generator so repeated calls with the same seed use the same
    # random stream.
    generator = torch.Generator().manual_seed(seed)

    edited = pipe(
        image=[Image.fromarray(crop)],
        prompt=prompt,
        num_inference_steps=num_inference_steps,
        true_cfg_scale=guidance_scale,
        negative_prompt=" ",
        generator=generator,
    ).images[0]

    edited_np = np.array(edited)  # [cH', cW', 3]

    # Resize back if pipeline changed resolution, so crop-relative box
    # coordinates index the same pixels. cv2.resize takes (width, height).
    if edited_np.shape[:2] != (cH, cW):
        edited_np = cv2.resize(edited_np, (cW, cH), interpolation=cv2.INTER_LINEAR)

    # --- Step 3: Hard composite — only paste the box region back ---
    result = first_frame.copy()
    result[y1:y2, x1:x2] = edited_np[ly1:ly2, lx1:lx2]

    return result  # [H, W, 3] uint8 RGB — background unchanged



def segment_existing_object(
    first_frame: np.ndarray,
    box: list,
    sam2_predictor
) -> np.ndarray:
    """
    Ask a SAM2-style predictor for the mask of the object inside ``box``.

    The frame is registered with the predictor via ``set_image``, the box is
    passed as a single-row array prompt with ``multimask_output=False``, and
    the highest-scoring returned mask is selected.

    Returns: the selected mask cast to float32 (presumably [H, W] with
    values in {0, 1}; this depends on the predictor's output).
    """
    # The predictor is given the image before it is prompted.
    sam2_predictor.set_image(first_frame)

    # Wrap the box in an outer list so the prompt has a leading axis of 1.
    box_prompt = np.array([box])
    candidate_masks, candidate_scores, _unused = sam2_predictor.predict(
        box=box_prompt,
        multimask_output=False,
    )

    best_index = np.argmax(candidate_scores)
    best_mask = candidate_masks[best_index]
    return best_mask.astype(np.float32)