# frame_editor.py
import numpy as np
from PIL import Image
import torch
import cv2


def load_qwen_image_edit(use_lightning=True, device="cuda"):
    """Load the Qwen-Image-Edit pipeline, optionally fusing in the Lightning LoRA."""
    from diffusers import QwenImageEditPlusPipeline, FlowMatchEulerDiscreteScheduler

    scheduler = FlowMatchEulerDiscreteScheduler.from_pretrained(
        "Qwen/Qwen-Image-Edit-2511", subfolder="scheduler"
    )
    pipe = QwenImageEditPlusPipeline.from_pretrained(
        "Qwen/Qwen-Image-Edit-2511",
        scheduler=scheduler,
        torch_dtype=torch.bfloat16,
    ).to(device)
    if use_lightning:
        pipe.load_lora_weights(
            "lightx2v/Qwen-Image-Edit-2511-Lightning",
            weight_name="Qwen-Image-Edit-2511-Lightning-4steps-V1.0-bf16.safetensors",
        )
        pipe.fuse_lora()
    return pipe
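
# Usage sketch, assuming a CUDA GPU with enough memory for the bf16 weights.
# The Lightning LoRA is a 4-step distilled variant (per its filename), so it
# pairs with the num_inference_steps=4 / guidance_scale=1.0 defaults used below:
#
#   pipe = load_qwen_image_edit(use_lightning=True, device="cuda")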


def insert_object_qwen_edit(
    first_frame,            # np.ndarray [H, W, 3] uint8 RGB
    box,                    # [x1, y1, x2, y2]
    object_description,     # e.g. "a red sports car"
    pipe,
    context_pad=60,         # pixels of context around the box; helps Qwen understand the scene
    num_inference_steps=4,
    guidance_scale=1.0,
    seed=42,
):
    """
    Insert an object into ONLY the bounding-box region.
    Background outside the box stays pixel-identical to the original.

    Strategy:
      1. Crop (box + padding) from the original -> gives Qwen scene context
      2. Edit the crop with Qwen-Image-Edit
      3. Extract only the box pixels from the edited crop
      4. Paste them back onto the original frame
    """
    H, W = first_frame.shape[:2]
    x1, y1, x2, y2 = [int(v) for v in box]
    # Clamp the box to the frame so the paste in step 3 cannot shape-mismatch.
    x1, y1 = max(0, x1), max(0, y1)
    x2, y2 = min(W, x2), min(H, y2)

    # --- Step 1: Crop with context padding ---
    cx1 = max(0, x1 - context_pad)
    cy1 = max(0, y1 - context_pad)
    cx2 = min(W, x2 + context_pad)
    cy2 = min(H, y2 + context_pad)
    crop = first_frame[cy1:cy2, cx1:cx2].copy()  # [cH, cW, 3]
    cH, cW = crop.shape[:2]

    # Box coordinates relative to the crop
    lx1 = x1 - cx1
    ly1 = y1 - cy1
    lx2 = x2 - cx1
    ly2 = y2 - cy1

    # --- Step 2: Build a focused edit instruction ---
    prompt = (
        f"Insert {object_description} in the region ({lx1},{ly1}) to ({lx2},{ly2}). "
        f"Keep everything outside that region exactly the same. "
        f"Match the scene lighting, shadows, and perspective."
    )
    generator = torch.Generator().manual_seed(seed)
    edited = pipe(
        image=[Image.fromarray(crop)],
        prompt=prompt,
        num_inference_steps=num_inference_steps,
        true_cfg_scale=guidance_scale,
        negative_prompt=" ",  # placeholder; CFG is effectively off at true_cfg_scale=1.0
        generator=generator,
    ).images[0]
    edited_np = np.array(edited)  # [cH', cW', 3]

    # Resize back if the pipeline changed the resolution
    if edited_np.shape[:2] != (cH, cW):
        edited_np = cv2.resize(edited_np, (cW, cH), interpolation=cv2.INTER_LINEAR)

    # --- Step 3: Hard composite: paste only the box region back ---
    result = first_frame.copy()
    result[y1:y2, x1:x2] = edited_np[ly1:ly2, lx1:lx2]
    return result  # [H, W, 3] uint8 RGB; background unchanged
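
# Usage sketch; the file path and box below are placeholder values:
#
#   pipe = load_qwen_image_edit()
#   frame = np.array(Image.open("first_frame.png").convert("RGB"))
#   out = insert_object_qwen_edit(
#       frame,
#       box=[320, 180, 520, 380],
#       object_description="a red sports car",
#       pipe=pipe,
#   )
#   Image.fromarray(out).save("first_frame_edited.png")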


def segment_existing_object(
    first_frame: np.ndarray,
    box: list,
    sam2_predictor,
) -> np.ndarray:
    """
    Use SAM2 to get a precise mask of an existing object.

    Returns: [H, W] binary float32 mask
    """
    sam2_predictor.set_image(first_frame)
    input_box = np.array([box])
    masks, scores, _ = sam2_predictor.predict(
        box=input_box,
        multimask_output=False,
    )
    return masks[np.argmax(scores)].astype(np.float32)
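

if __name__ == "__main__":
    # Demo sketch assuming the `sam2` package is installed and exposes
    # SAM2ImagePredictor.from_pretrained; the frame path, box, and model id
    # are placeholder values.
    from sam2.sam2_image_predictor import SAM2ImagePredictor

    frame = np.array(Image.open("first_frame.png").convert("RGB"))
    box = [320, 180, 520, 380]

    pipe = load_qwen_image_edit(use_lightning=True, device="cuda")
    edited = insert_object_qwen_edit(frame, box, "a red sports car", pipe)
    Image.fromarray(edited).save("first_frame_edited.png")

    # Mask the freshly inserted object with SAM2 using the same box prompt.
    predictor = SAM2ImagePredictor.from_pretrained("facebook/sam2-hiera-large")
    mask = segment_existing_object(edited, box, predictor)  # [H, W] float32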