# Spaces: Running on Zero
| """ | |
| LongCat-Image-Edit Client | |
| ========================= | |
| Client for Meituan's LongCat-Image-Edit model. | |
| Supports instruction-following image editing with bilingual (Chinese-English) support. | |
| This is a SOTA open-source image editing model with excellent: | |
| - Global editing, local editing, text modification | |
| - Reference-guided editing | |
| - Consistency preservation (layout, texture, color tone, identity) | |
| - Multi-turn editing capabilities | |
| """ | |
| import logging | |
| import time | |
| from typing import Optional, List | |
| from PIL import Image | |
| import torch | |
| from .models import GenerationRequest, GenerationResult | |
| logger = logging.getLogger(__name__) | |
class LongCatEditClient:
    """
    Client for LongCat-Image-Edit model from Meituan.

    Features:
    - Instruction-following image editing
    - Bilingual support (Chinese-English)
    - Excellent consistency preservation (layout, texture, color tone, identity)
    - Multi-turn editing

    Requires ~18GB VRAM with CPU offload.
    """

    MODEL_ID = "meituan-longcat/LongCat-Image-Edit"

    # Aspect ratio label -> (width, height) in pixels.
    ASPECT_RATIOS = {
        "1:1": (1024, 1024),
        "16:9": (1344, 768),
        "9:16": (768, 1344),
        "21:9": (1536, 640),  # Cinematic ultra-wide
        "3:2": (1248, 832),
        "2:3": (832, 1248),
        "3:4": (896, 1152),
        "4:3": (1152, 896),
        "4:5": (896, 1120),
        "5:4": (1120, 896),
    }

    # Default generation settings.
    DEFAULT_STEPS = 50
    DEFAULT_GUIDANCE = 4.5

    def __init__(
        self,
        device: str = "cuda",
        dtype: torch.dtype = torch.bfloat16,
        enable_cpu_offload: bool = True,
    ):
        """
        Initialize LongCat-Image-Edit client.

        Args:
            device: Device to use (cuda or cpu)
            dtype: Data type for model weights (bfloat16 recommended)
            enable_cpu_offload: Enable CPU offload to save VRAM (~18GB required)
        """
        self.device = device
        self.dtype = dtype
        self.enable_cpu_offload = enable_cpu_offload
        self.pipe = None  # lazily created diffusers pipeline
        self._loaded = False
        logger.info(f"LongCatEditClient initialized (cpu_offload: {enable_cpu_offload})")

    def load_model(self) -> bool:
        """Load the model into memory. Idempotent; returns True on success."""
        if self._loaded:
            return True
        try:
            logger.info(f"Loading LongCat-Image-Edit from {self.MODEL_ID}...")
            start_time = time.time()

            # Imported lazily so this module can be imported without the heavy
            # pipeline dependency. Requires latest diffusers:
            #   pip install git+https://github.com/huggingface/diffusers
            from diffusers import LongCatImageEditPipeline

            self.pipe = LongCatImageEditPipeline.from_pretrained(
                self.MODEL_ID,
                torch_dtype=self.dtype,
            )

            # Apply memory optimization: either sequential CPU offload
            # (low VRAM) or keep the whole pipeline on-device (fast).
            if self.enable_cpu_offload:
                self.pipe.enable_model_cpu_offload()
                logger.info("CPU offload enabled (~18GB VRAM)")
            else:
                self.pipe.to(self.device, self.dtype)
                logger.info(f"Model moved to {self.device} (high VRAM mode)")

            load_time = time.time() - start_time
            logger.info(f"LongCat-Image-Edit loaded in {load_time:.1f}s")
            self._loaded = True
            return True
        except Exception as e:
            # Loading failure is reported to callers as False, not raised.
            logger.error(f"Failed to load LongCat-Image-Edit: {e}", exc_info=True)
            return False

    def unload_model(self):
        """Unload model from memory and release cached GPU memory."""
        if self.pipe is not None:
            del self.pipe
            self.pipe = None
        self._loaded = False
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
        logger.info("LongCat-Image-Edit unloaded")

    def generate(
        self,
        request: "GenerationRequest",
        num_inference_steps: Optional[int] = None,
        guidance_scale: Optional[float] = None,
    ) -> "GenerationResult":
        """
        Edit image using LongCat-Image-Edit.

        Args:
            request: GenerationRequest object with:
                - prompt: The editing instruction (e.g., "Change the background to a forest")
                - input_images: List with the source image to edit
                - aspect_ratio: Output aspect ratio
            num_inference_steps: Number of denoising steps (default: 50)
            guidance_scale: Classifier-free guidance scale (default: 4.5)

        Returns:
            GenerationResult object (error result on failure; never raises).
        """
        if not self._loaded:
            if not self.load_model():
                return GenerationResult.error_result("Failed to load LongCat-Image-Edit model")

        # Use defaults if not specified.
        if num_inference_steps is None:
            num_inference_steps = self.DEFAULT_STEPS
        if guidance_scale is None:
            guidance_scale = self.DEFAULT_GUIDANCE

        try:
            start_time = time.time()

            # Get input image: first non-None entry wins.
            if not request.has_input_images:
                return GenerationResult.error_result("LongCat-Image-Edit requires an input image to edit")
            input_image = next((img for img in request.input_images if img is not None), None)
            if input_image is None:
                return GenerationResult.error_result("No valid input image provided")

            # Get dimensions from aspect ratio and resize the source to match.
            width, height = self._get_dimensions(request.aspect_ratio)
            input_image = input_image.convert('RGB')
            input_image = input_image.resize((width, height), Image.Resampling.LANCZOS)

            logger.info(f"Editing with LongCat: steps={num_inference_steps}, guidance={guidance_scale}")
            logger.info(f"Edit instruction: {request.prompt[:100]}...")

            # Build generation kwargs.
            # NOTE(review): seed is fixed at 42 here — if GenerationRequest
            # carries a seed field, it is ignored; confirm intended.
            gen_kwargs = {
                "image": input_image,
                "prompt": request.prompt,
                "negative_prompt": request.negative_prompt or "",
                "guidance_scale": guidance_scale,
                "num_inference_steps": num_inference_steps,
                "num_images_per_prompt": 1,
                "generator": torch.Generator("cpu").manual_seed(42),
            }

            # Generate without tracking gradients.
            with torch.inference_mode():
                output = self.pipe(**gen_kwargs)
            image = output.images[0]

            generation_time = time.time() - start_time
            logger.info(f"Edited in {generation_time:.2f}s: {image.size}")
            return GenerationResult.success_result(
                image=image,
                message=f"Edited with LongCat-Image-Edit in {generation_time:.2f}s",
                generation_time=generation_time
            )
        except Exception as e:
            logger.error(f"LongCat-Image-Edit generation failed: {e}", exc_info=True)
            return GenerationResult.error_result(f"LongCat-Image-Edit error: {str(e)}")

    def edit_with_instruction(
        self,
        source_image: "Image.Image",
        instruction: str,
        negative_prompt: str = "",
        num_inference_steps: Optional[int] = None,
        guidance_scale: Optional[float] = None,
        seed: int = 42
    ) -> "GenerationResult":
        """
        Simplified method for instruction-based image editing.

        Unlike generate(), the source image is used at its own size
        (no aspect-ratio resize is applied).

        Args:
            source_image: The image to edit
            instruction: Natural language editing instruction
                Examples:
                - "Change the background to a sunset beach"
                - "Make the person wear a red dress"
                - "Add snow to the scene"
                - "Change the cat to a dog"
            negative_prompt: What to avoid in the output
            num_inference_steps: Denoising steps (default: 50)
            guidance_scale: CFG scale (default: 4.5)
            seed: Random seed for reproducibility

        Returns:
            GenerationResult with the edited image (error result on failure).
        """
        if not self._loaded:
            if not self.load_model():
                return GenerationResult.error_result("Failed to load LongCat-Image-Edit model")

        if num_inference_steps is None:
            num_inference_steps = self.DEFAULT_STEPS
        if guidance_scale is None:
            guidance_scale = self.DEFAULT_GUIDANCE

        try:
            start_time = time.time()

            # Ensure RGB: the pipeline does not accept alpha/palette modes.
            source_image = source_image.convert('RGB')

            logger.info(f"Editing image with instruction: {instruction[:100]}...")
            with torch.inference_mode():
                output = self.pipe(
                    image=source_image,
                    prompt=instruction,
                    negative_prompt=negative_prompt,
                    guidance_scale=guidance_scale,
                    num_inference_steps=num_inference_steps,
                    num_images_per_prompt=1,
                    generator=torch.Generator("cpu").manual_seed(seed),
                )
            image = output.images[0]

            generation_time = time.time() - start_time
            logger.info(f"Edit completed in {generation_time:.2f}s")
            return GenerationResult.success_result(
                image=image,
                message=f"Edited with instruction in {generation_time:.2f}s",
                generation_time=generation_time
            )
        except Exception as e:
            logger.error(f"Instruction-based edit failed: {e}", exc_info=True)
            return GenerationResult.error_result(f"Edit error: {str(e)}")

    def _get_dimensions(self, aspect_ratio: str) -> tuple:
        """Get pixel dimensions for aspect ratio (instance-side alias)."""
        return self.get_dimensions(aspect_ratio)

    def is_healthy(self) -> bool:
        """Check if model is loaded and ready."""
        return self._loaded and self.pipe is not None

    @classmethod
    def get_dimensions(cls, aspect_ratio: str) -> tuple:
        """Get pixel dimensions for aspect ratio.

        Accepts plain labels ("16:9") or labels with a trailing description
        ("16:9 (widescreen)") — only the part before the first space is used.
        Unknown ratios fall back to 1024x1024.
        """
        # FIX: original defined this with a `cls` parameter but no
        # @classmethod decorator, so any call misbound the first argument.
        ratio = aspect_ratio.split()[0] if " " in aspect_ratio else aspect_ratio
        return cls.ASPECT_RATIOS.get(ratio, (1024, 1024))