# Spaces: Running on Zero
| """ | |
| LongCat-Image-Edit Client | |
| ========================= | |
| Client for Meituan's LongCat-Image-Edit model. | |
| Supports instruction-following image editing with bilingual (Chinese-English) support. | |
| This is a SOTA open-source image editing model with excellent: | |
| - Global editing, local editing, text modification | |
| - Reference-guided editing | |
| - Consistency preservation (layout, texture, color tone, identity) | |
| - Multi-turn editing capabilities | |
| """ | |
| import logging | |
| import time | |
| from typing import Optional, List | |
| from PIL import Image | |
| import torch | |
| from .models import GenerationRequest, GenerationResult | |
| logger = logging.getLogger(__name__) | |
class LongCatEditClient:
    """
    Client for LongCat-Image-Edit model from Meituan.

    Features:
    - Instruction-following image editing
    - Bilingual support (Chinese-English)
    - Excellent consistency preservation (layout, texture, color tone, identity)
    - Multi-turn editing

    Requires ~18GB VRAM with CPU offload.
    """

    MODEL_ID = "meituan-longcat/LongCat-Image-Edit"

    # Aspect ratio label -> (width, height) in pixels.
    ASPECT_RATIOS = {
        "1:1": (1024, 1024),
        "16:9": (1344, 768),
        "9:16": (768, 1344),
        "21:9": (1536, 640),  # Cinematic ultra-wide
        "3:2": (1248, 832),
        "2:3": (832, 1248),
        "3:4": (896, 1152),
        "4:3": (1152, 896),
        "4:5": (896, 1120),
        "5:4": (1120, 896),
    }

    # Default generation settings.
    DEFAULT_STEPS = 50
    DEFAULT_GUIDANCE = 4.5

    def __init__(
        self,
        device: str = "cuda",
        dtype: torch.dtype = torch.bfloat16,
        enable_cpu_offload: bool = True,
    ):
        """
        Initialize LongCat-Image-Edit client.

        Args:
            device: Device to use (cuda or cpu)
            dtype: Data type for model weights (bfloat16 recommended)
            enable_cpu_offload: Enable CPU offload to save VRAM (~18GB required)
        """
        self.device = device
        self.dtype = dtype
        self.enable_cpu_offload = enable_cpu_offload
        self.pipe = None  # lazily created diffusers pipeline
        self._loaded = False
        logger.info(f"LongCatEditClient initialized (cpu_offload: {enable_cpu_offload})")

    def load_model(self) -> bool:
        """Load the model into memory. Idempotent; returns True on success."""
        if self._loaded:
            return True
        try:
            logger.info(f"Loading LongCat-Image-Edit from {self.MODEL_ID}...")
            start_time = time.time()

            # Imported lazily so this module can be imported without the heavy
            # pipeline dependency. Requires latest diffusers:
            #   pip install git+https://github.com/huggingface/diffusers
            from diffusers import LongCatImageEditPipeline

            self.pipe = LongCatImageEditPipeline.from_pretrained(
                self.MODEL_ID,
                torch_dtype=self.dtype,
            )

            # Apply memory optimization: either sequential CPU offload
            # (low VRAM) or keep the whole pipeline on-device (fast).
            if self.enable_cpu_offload:
                self.pipe.enable_model_cpu_offload()
                logger.info("CPU offload enabled (~18GB VRAM)")
            else:
                self.pipe.to(self.device, self.dtype)
                logger.info(f"Model moved to {self.device} (high VRAM mode)")

            load_time = time.time() - start_time
            logger.info(f"LongCat-Image-Edit loaded in {load_time:.1f}s")
            self._loaded = True
            return True
        except Exception as e:
            # Loading failure is reported to callers as False, not raised.
            logger.error(f"Failed to load LongCat-Image-Edit: {e}", exc_info=True)
            return False

    def unload_model(self):
        """Unload model from memory and release cached GPU memory."""
        if self.pipe is not None:
            del self.pipe
            self.pipe = None
        self._loaded = False
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
        logger.info("LongCat-Image-Edit unloaded")

    def generate(
        self,
        request: "GenerationRequest",
        num_inference_steps: Optional[int] = None,
        guidance_scale: Optional[float] = None,
    ) -> "GenerationResult":
        """
        Edit image using LongCat-Image-Edit.

        Args:
            request: GenerationRequest object with:
                - prompt: The editing instruction (e.g., "Change the background to a forest")
                - input_images: List with the source image to edit
                - aspect_ratio: Output aspect ratio
            num_inference_steps: Number of denoising steps (default: 50)
            guidance_scale: Classifier-free guidance scale (default: 4.5)

        Returns:
            GenerationResult object (error result on failure; never raises).
        """
        if not self._loaded:
            if not self.load_model():
                return GenerationResult.error_result("Failed to load LongCat-Image-Edit model")

        # Use defaults if not specified.
        if num_inference_steps is None:
            num_inference_steps = self.DEFAULT_STEPS
        if guidance_scale is None:
            guidance_scale = self.DEFAULT_GUIDANCE

        try:
            start_time = time.time()

            # Get input image: first non-None entry wins.
            if not request.has_input_images:
                return GenerationResult.error_result("LongCat-Image-Edit requires an input image to edit")
            input_image = next((img for img in request.input_images if img is not None), None)
            if input_image is None:
                return GenerationResult.error_result("No valid input image provided")

            # Get dimensions from aspect ratio and resize the source to match.
            width, height = self._get_dimensions(request.aspect_ratio)
            input_image = input_image.convert('RGB')
            input_image = input_image.resize((width, height), Image.Resampling.LANCZOS)

            logger.info(f"Editing with LongCat: steps={num_inference_steps}, guidance={guidance_scale}")
            logger.info(f"Edit instruction: {request.prompt[:100]}...")

            # Build generation kwargs.
            # NOTE(review): seed is fixed at 42 here — if GenerationRequest
            # carries a seed field, it is ignored; confirm intended.
            gen_kwargs = {
                "image": input_image,
                "prompt": request.prompt,
                "negative_prompt": request.negative_prompt or "",
                "guidance_scale": guidance_scale,
                "num_inference_steps": num_inference_steps,
                "num_images_per_prompt": 1,
                "generator": torch.Generator("cpu").manual_seed(42),
            }

            # Generate without tracking gradients.
            with torch.inference_mode():
                output = self.pipe(**gen_kwargs)
            image = output.images[0]

            generation_time = time.time() - start_time
            logger.info(f"Edited in {generation_time:.2f}s: {image.size}")
            return GenerationResult.success_result(
                image=image,
                message=f"Edited with LongCat-Image-Edit in {generation_time:.2f}s",
                generation_time=generation_time
            )
        except Exception as e:
            logger.error(f"LongCat-Image-Edit generation failed: {e}", exc_info=True)
            return GenerationResult.error_result(f"LongCat-Image-Edit error: {str(e)}")

    def edit_with_instruction(
        self,
        source_image: "Image.Image",
        instruction: str,
        negative_prompt: str = "",
        num_inference_steps: Optional[int] = None,
        guidance_scale: Optional[float] = None,
        seed: int = 42
    ) -> "GenerationResult":
        """
        Simplified method for instruction-based image editing.

        Unlike generate(), the source image is used at its own size
        (no aspect-ratio resize is applied).

        Args:
            source_image: The image to edit
            instruction: Natural language editing instruction
                Examples:
                - "Change the background to a sunset beach"
                - "Make the person wear a red dress"
                - "Add snow to the scene"
                - "Change the cat to a dog"
            negative_prompt: What to avoid in the output
            num_inference_steps: Denoising steps (default: 50)
            guidance_scale: CFG scale (default: 4.5)
            seed: Random seed for reproducibility

        Returns:
            GenerationResult with the edited image (error result on failure).
        """
        if not self._loaded:
            if not self.load_model():
                return GenerationResult.error_result("Failed to load LongCat-Image-Edit model")

        if num_inference_steps is None:
            num_inference_steps = self.DEFAULT_STEPS
        if guidance_scale is None:
            guidance_scale = self.DEFAULT_GUIDANCE

        try:
            start_time = time.time()

            # Ensure RGB: the pipeline does not accept alpha/palette modes.
            source_image = source_image.convert('RGB')

            logger.info(f"Editing image with instruction: {instruction[:100]}...")
            with torch.inference_mode():
                output = self.pipe(
                    image=source_image,
                    prompt=instruction,
                    negative_prompt=negative_prompt,
                    guidance_scale=guidance_scale,
                    num_inference_steps=num_inference_steps,
                    num_images_per_prompt=1,
                    generator=torch.Generator("cpu").manual_seed(seed),
                )
            image = output.images[0]

            generation_time = time.time() - start_time
            logger.info(f"Edit completed in {generation_time:.2f}s")
            return GenerationResult.success_result(
                image=image,
                message=f"Edited with instruction in {generation_time:.2f}s",
                generation_time=generation_time
            )
        except Exception as e:
            logger.error(f"Instruction-based edit failed: {e}", exc_info=True)
            return GenerationResult.error_result(f"Edit error: {str(e)}")

    def _get_dimensions(self, aspect_ratio: str) -> tuple:
        """Get pixel dimensions for aspect ratio (instance-side alias)."""
        return self.get_dimensions(aspect_ratio)

    def is_healthy(self) -> bool:
        """Check if model is loaded and ready."""
        return self._loaded and self.pipe is not None

    @classmethod
    def get_dimensions(cls, aspect_ratio: str) -> tuple:
        """Get pixel dimensions for aspect ratio.

        Accepts plain labels ("16:9") or labels with a trailing description
        ("16:9 (widescreen)") — only the part before the first space is used.
        Unknown ratios fall back to 1024x1024.
        """
        # FIX: original defined this with a `cls` parameter but no
        # @classmethod decorator, so any call misbound the first argument.
        ratio = aspect_ratio.split()[0] if " " in aspect_ratio else aspect_ratio
        return cls.ASPECT_RATIOS.get(ratio, (1024, 1024))