# CharacterForgePro / src / longcat_edit_client.py
# Hugging Face page metadata (scrape artifact, preserved as a comment):
#   uploaded by ghmk — "Deploy full Character Sheet Pro with HF auth" — commit da23dfe
"""
LongCat-Image-Edit Client
=========================
Client for Meituan's LongCat-Image-Edit model.
Supports instruction-following image editing with bilingual (Chinese-English) support.
This is a SOTA open-source image editing model with excellent:
- Global editing, local editing, text modification
- Reference-guided editing
- Consistency preservation (layout, texture, color tone, identity)
- Multi-turn editing capabilities
"""
import logging
import time
from typing import Optional, List
from PIL import Image
import torch
from .models import GenerationRequest, GenerationResult
logger = logging.getLogger(__name__)
class LongCatEditClient:
    """
    Client for the LongCat-Image-Edit model from Meituan.

    Supports instruction-following image editing with bilingual
    (Chinese-English) prompts:
      - Global editing, local editing, text modification
      - Reference-guided editing
      - Consistency preservation (layout, texture, color tone, identity)
      - Multi-turn editing

    Requires ~18GB VRAM with CPU offload enabled.
    """

    MODEL_ID = "meituan-longcat/LongCat-Image-Edit"

    # Aspect-ratio label -> (width, height) in pixels.
    ASPECT_RATIOS = {
        "1:1": (1024, 1024),
        "16:9": (1344, 768),
        "9:16": (768, 1344),
        "21:9": (1536, 640),  # Cinematic ultra-wide
        "3:2": (1248, 832),
        "2:3": (832, 1248),
        "3:4": (896, 1152),
        "4:3": (1152, 896),
        "4:5": (896, 1120),
        "5:4": (1120, 896),
    }

    # Default generation settings.
    DEFAULT_STEPS = 50
    DEFAULT_GUIDANCE = 4.5
    DEFAULT_SEED = 42

    def __init__(
        self,
        device: str = "cuda",
        dtype: torch.dtype = torch.bfloat16,
        enable_cpu_offload: bool = True,
    ):
        """
        Initialize the LongCat-Image-Edit client.

        Args:
            device: Device to use ("cuda" or "cpu").
            dtype: Data type for model weights (bfloat16 recommended).
            enable_cpu_offload: Enable CPU offload to save VRAM (~18GB required).
        """
        self.device = device
        self.dtype = dtype
        self.enable_cpu_offload = enable_cpu_offload
        self.pipe = None  # Lazily loaded diffusers pipeline.
        self._loaded = False
        logger.info(f"LongCatEditClient initialized (cpu_offload: {enable_cpu_offload})")

    def load_model(self) -> bool:
        """
        Load the model into memory (idempotent).

        Returns:
            True on success or if already loaded, False on failure.
        """
        if self._loaded:
            return True
        try:
            logger.info(f"Loading LongCat-Image-Edit from {self.MODEL_ID}...")
            start_time = time.time()
            # Imported lazily so the module can be used without diffusers installed.
            # Requires a recent diffusers build:
            #   pip install git+https://github.com/huggingface/diffusers
            from diffusers import LongCatImageEditPipeline

            self.pipe = LongCatImageEditPipeline.from_pretrained(
                self.MODEL_ID,
                torch_dtype=self.dtype,
            )
            # Apply memory optimization.
            if self.enable_cpu_offload:
                self.pipe.enable_model_cpu_offload()
                logger.info("CPU offload enabled (~18GB VRAM)")
            else:
                self.pipe.to(self.device, self.dtype)
                logger.info(f"Model moved to {self.device} (high VRAM mode)")
            load_time = time.time() - start_time
            logger.info(f"LongCat-Image-Edit loaded in {load_time:.1f}s")
            self._loaded = True
            return True
        except Exception as e:
            logger.error(f"Failed to load LongCat-Image-Edit: {e}", exc_info=True)
            return False

    def unload_model(self):
        """Unload the model and release GPU memory."""
        if self.pipe is not None:
            del self.pipe
            self.pipe = None
        self._loaded = False
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
        logger.info("LongCat-Image-Edit unloaded")

    def generate(
        self,
        request: GenerationRequest,
        num_inference_steps: Optional[int] = None,
        guidance_scale: Optional[float] = None,
        seed: int = DEFAULT_SEED,
    ) -> GenerationResult:
        """
        Edit an image using LongCat-Image-Edit.

        Args:
            request: GenerationRequest with:
                - prompt: the editing instruction
                  (e.g. "Change the background to a forest")
                - input_images: list containing the source image to edit
                - aspect_ratio: output aspect ratio label
                - negative_prompt: optional content to avoid
            num_inference_steps: Number of denoising steps (default: 50).
            guidance_scale: Classifier-free guidance scale (default: 4.5).
            seed: Random seed for reproducibility (default: 42; previously
                hard-coded, now exposed to match edit_with_instruction).

        Returns:
            GenerationResult with the edited image, or an error result.
        """
        if not self._loaded and not self.load_model():
            return GenerationResult.error_result("Failed to load LongCat-Image-Edit model")
        # Fall back to class defaults when not specified.
        if num_inference_steps is None:
            num_inference_steps = self.DEFAULT_STEPS
        if guidance_scale is None:
            guidance_scale = self.DEFAULT_GUIDANCE
        try:
            start_time = time.time()
            if not request.has_input_images:
                return GenerationResult.error_result("LongCat-Image-Edit requires an input image to edit")
            # Use the first non-None input image.
            input_image = next((img for img in request.input_images if img is not None), None)
            if input_image is None:
                return GenerationResult.error_result("No valid input image provided")
            # Resize the source image to the target dimensions.
            width, height = self._get_dimensions(request.aspect_ratio)
            input_image = input_image.convert('RGB')
            input_image = input_image.resize((width, height), Image.Resampling.LANCZOS)
            logger.info(f"Editing with LongCat: steps={num_inference_steps}, guidance={guidance_scale}")
            logger.info(f"Edit instruction: {request.prompt[:100]}...")
            gen_kwargs = {
                "image": input_image,
                "prompt": request.prompt,
                "negative_prompt": request.negative_prompt or "",
                "guidance_scale": guidance_scale,
                "num_inference_steps": num_inference_steps,
                "num_images_per_prompt": 1,
                # CPU generator keeps seeding reproducible regardless of device.
                "generator": torch.Generator("cpu").manual_seed(seed),
            }
            with torch.inference_mode():
                output = self.pipe(**gen_kwargs)
                image = output.images[0]
            generation_time = time.time() - start_time
            logger.info(f"Edited in {generation_time:.2f}s: {image.size}")
            return GenerationResult.success_result(
                image=image,
                message=f"Edited with LongCat-Image-Edit in {generation_time:.2f}s",
                generation_time=generation_time
            )
        except Exception as e:
            logger.error(f"LongCat-Image-Edit generation failed: {e}", exc_info=True)
            return GenerationResult.error_result(f"LongCat-Image-Edit error: {str(e)}")

    def edit_with_instruction(
        self,
        source_image: Image.Image,
        instruction: str,
        negative_prompt: str = "",
        num_inference_steps: Optional[int] = None,
        guidance_scale: Optional[float] = None,
        seed: int = DEFAULT_SEED,
    ) -> GenerationResult:
        """
        Simplified method for instruction-based image editing.

        Args:
            source_image: The image to edit.
            instruction: Natural language editing instruction.
                Examples:
                  - "Change the background to a sunset beach"
                  - "Make the person wear a red dress"
                  - "Add snow to the scene"
                  - "Change the cat to a dog"
            negative_prompt: What to avoid in the output.
            num_inference_steps: Denoising steps (default: 50).
            guidance_scale: CFG scale (default: 4.5).
            seed: Random seed for reproducibility.

        Returns:
            GenerationResult with the edited image.
        """
        if not self._loaded and not self.load_model():
            return GenerationResult.error_result("Failed to load LongCat-Image-Edit model")
        if num_inference_steps is None:
            num_inference_steps = self.DEFAULT_STEPS
        if guidance_scale is None:
            guidance_scale = self.DEFAULT_GUIDANCE
        try:
            start_time = time.time()
            # Ensure RGB input for the pipeline.
            source_image = source_image.convert('RGB')
            logger.info(f"Editing image with instruction: {instruction[:100]}...")
            with torch.inference_mode():
                output = self.pipe(
                    image=source_image,
                    prompt=instruction,
                    negative_prompt=negative_prompt,
                    guidance_scale=guidance_scale,
                    num_inference_steps=num_inference_steps,
                    num_images_per_prompt=1,
                    generator=torch.Generator("cpu").manual_seed(seed),
                )
                image = output.images[0]
            generation_time = time.time() - start_time
            logger.info(f"Edit completed in {generation_time:.2f}s")
            return GenerationResult.success_result(
                image=image,
                message=f"Edited with instruction in {generation_time:.2f}s",
                generation_time=generation_time
            )
        except Exception as e:
            logger.error(f"Instruction-based edit failed: {e}", exc_info=True)
            return GenerationResult.error_result(f"Edit error: {str(e)}")

    def _get_dimensions(self, aspect_ratio: str) -> tuple:
        """Instance convenience wrapper — delegates to get_dimensions()."""
        return self.get_dimensions(aspect_ratio)

    def is_healthy(self) -> bool:
        """Check if the model is loaded and ready."""
        return self._loaded and self.pipe is not None

    @classmethod
    def get_dimensions(cls, aspect_ratio: str) -> tuple:
        """
        Get pixel dimensions for an aspect-ratio label.

        Accepts labels with a trailing description (e.g. "16:9 Widescreen");
        only the first whitespace-separated token is used. Unknown ratios
        fall back to 1024x1024.
        """
        ratio = aspect_ratio.split()[0] if " " in aspect_ratio else aspect_ratio
        return cls.ASPECT_RATIOS.get(ratio, (1024, 1024))