# frame_editor.py
import numpy as np
from PIL import Image
import torch
import cv2


def load_qwen_image_edit(use_lightning=True, device="cuda"):
    from diffusers import QwenImageEditPlusPipeline, FlowMatchEulerDiscreteScheduler

    scheduler = FlowMatchEulerDiscreteScheduler.from_pretrained(
        "Qwen/Qwen-Image-Edit-2511", subfolder="scheduler"
    )
    pipe = QwenImageEditPlusPipeline.from_pretrained(
        "Qwen/Qwen-Image-Edit-2511",
        scheduler=scheduler,
        torch_dtype=torch.bfloat16,
    ).to(device)
    if use_lightning:
        pipe.load_lora_weights(
            "lightx2v/Qwen-Image-Edit-2511-Lightning",
            weight_name="Qwen-Image-Edit-2511-Lightning-4steps-V1.0-bf16.safetensors"
        )
        pipe.fuse_lora()
    return pipe


def insert_object_qwen_edit(
    first_frame,            # np.ndarray [H, W, 3] uint8 RGB
    box,                    # [x1, y1, x2, y2]
    object_description,     # e.g. "a red sports car"
    pipe,
    context_pad=60,         # pixels of context around box; helps Qwen understand scene
    num_inference_steps=4,
    guidance_scale=1.0,
    seed=42,
):
    """
    Inserts object into ONLY the bounding box region.
    Background outside the box is pixel-identical to original.

    Strategy:
    1. Crop (box + padding) from original → gives Qwen scene context
    2. Edit the crop with Qwen-Image-Edit
    3. Extract only the box pixels from the edited crop
    4. Paste back onto original frame
    """
    H, W = first_frame.shape[:2]
    x1, y1, x2, y2 = [int(v) for v in box]

    # --- Step 1: Crop with context padding ---
    cx1 = max(0, x1 - context_pad)
    cy1 = max(0, y1 - context_pad)
    cx2 = min(W, x2 + context_pad)
    cy2 = min(H, y2 + context_pad)
    crop = first_frame[cy1:cy2, cx1:cx2].copy()  # [cH, cW, 3]
    cH, cW = crop.shape[:2]

    # Box coordinates relative to crop
    lx1 = x1 - cx1
    ly1 = y1 - cy1
    lx2 = x2 - cx1
    ly2 = y2 - cy1

    # --- Step 2: Build focused edit instruction ---
    prompt = (
        f"Insert {object_description} in the region ({lx1},{ly1}) to ({lx2},{ly2}). "
        f"Keep everything outside that region exactly the same. "
        f"Match the scene lighting, shadows, and perspective."
    )

    generator = torch.Generator().manual_seed(seed)
    edited = pipe(
        image=[Image.fromarray(crop)],
        prompt=prompt,
        num_inference_steps=num_inference_steps,
        true_cfg_scale=guidance_scale,
        negative_prompt=" ",
        generator=generator,
    ).images[0]
    edited_np = np.array(edited)  # [cH', cW', 3]

    # Resize back if pipeline changed resolution
    if edited_np.shape[:2] != (cH, cW):
        edited_np = cv2.resize(edited_np, (cW, cH), interpolation=cv2.INTER_LINEAR)

    # --- Step 3: Hard composite: only paste the box region back ---
    result = first_frame.copy()
    result[y1:y2, x1:x2] = edited_np[ly1:ly2, lx1:lx2]
    return result  # [H, W, 3] uint8 RGB; background unchanged


def segment_existing_object(
    first_frame: np.ndarray,
    box: list,
    sam2_predictor
) -> np.ndarray:
    """
    Use SAM2 to get a precise mask of an existing object.

    Returns:
        [H, W] binary float32 mask
    """
    sam2_predictor.set_image(first_frame)
    input_box = np.array([box])
    masks, scores, _ = sam2_predictor.predict(
        box=input_box,
        multimask_output=False
    )
    return masks[np.argmax(scores)].astype(np.float32)
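
# ---------------------------------------------------------------------------
# Optional helper: a minimal sketch, NOT part of the original module. The hard
# composite above can leave a visible seam where the edit meets the original
# pixels at the box boundary; one common mitigation is to re-blend the edited
# frame with a feathered box mask. `soft_composite` and `feather` are names
# chosen here for illustration. Outside the box, edited_frame equals the
# original, so the background-unchanged guarantee still holds.
# ---------------------------------------------------------------------------
def soft_composite(original, edited_frame, box, feather=15):
    """Alpha-blend edited_frame into original with a feathered box mask,
    softening the seam left by the hard paste. Both inputs are [H, W, 3]."""
    H, W = original.shape[:2]
    x1, y1, x2, y2 = [int(v) for v in box]
    alpha = np.zeros((H, W), dtype=np.float32)
    alpha[y1:y2, x1:x2] = 1.0
    k = 2 * feather + 1  # Gaussian kernel size must be odd
    alpha = cv2.GaussianBlur(alpha, (k, k), 0)[..., None]  # [H, W, 1] ramp
    blended = alpha * edited_frame.astype(np.float32) \
        + (1.0 - alpha) * original.astype(np.float32)
    return blended.astype(np.uint8)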
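
# ---------------------------------------------------------------------------
# Usage sketch (illustrative only): assumes a CUDA GPU with enough memory for
# the bf16 pipeline. The input path "first_frame.png" and the box coordinates
# are hypothetical placeholders. The SAM2 lines assume the `sam2` package.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    pipe = load_qwen_image_edit(use_lightning=True, device="cuda")

    frame = np.array(Image.open("first_frame.png").convert("RGB"))  # hypothetical path
    box = [200, 300, 520, 560]  # placeholder coordinates

    edited = insert_object_qwen_edit(
        first_frame=frame,
        box=box,
        object_description="a red sports car",
        pipe=pipe,
    )
    Image.fromarray(edited).save("first_frame_edited.png")

    # Optional seam softening with the sketch helper above:
    # smoothed = soft_composite(frame, edited, box, feather=15)

    # Optional: mask an existing object with SAM2 (assumes the `sam2` package):
    # from sam2.sam2_image_predictor import SAM2ImagePredictor
    # predictor = SAM2ImagePredictor.from_pretrained("facebook/sam2-hiera-large")
    # mask = segment_existing_object(frame, box, predictor)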