"""
LongCat-Image-Edit Client
=========================

Client for Meituan's LongCat-Image-Edit model.
Supports instruction-following image editing with bilingual (Chinese-English) support.

This is a SOTA open-source image editing model with excellent:
- Global editing, local editing, text modification
- Reference-guided editing
- Consistency preservation (layout, texture, color tone, identity)
- Multi-turn editing capabilities
"""

import logging
import time
from typing import Optional, List
from PIL import Image

import torch

from .models import GenerationRequest, GenerationResult


logger = logging.getLogger(__name__)


class LongCatEditClient:
    """
    Client for LongCat-Image-Edit model from Meituan.

    Features:
    - Instruction-following image editing
    - Bilingual support (Chinese-English)
    - Excellent consistency preservation
    - Multi-turn editing

    Requires ~18GB VRAM with CPU offload.
    """

    MODEL_ID = "meituan-longcat/LongCat-Image-Edit"

    # Aspect ratio label -> (width, height) in pixels.
    ASPECT_RATIOS = {
        "1:1": (1024, 1024),
        "16:9": (1344, 768),
        "9:16": (768, 1344),
        "21:9": (1536, 640),    # Cinematic ultra-wide
        "3:2": (1248, 832),
        "2:3": (832, 1248),
        "3:4": (896, 1152),
        "4:3": (1152, 896),
        "4:5": (896, 1120),
        "5:4": (1120, 896),
    }

    # Default generation settings
    DEFAULT_STEPS = 50
    DEFAULT_GUIDANCE = 4.5
    DEFAULT_SEED = 42

    def __init__(
        self,
        device: str = "cuda",
        dtype: torch.dtype = torch.bfloat16,
        enable_cpu_offload: bool = True,
    ):
        """
        Initialize LongCat-Image-Edit client.

        Args:
            device: Device to use (cuda or cpu)
            dtype: Data type for model weights (bfloat16 recommended)
            enable_cpu_offload: Enable CPU offload to save VRAM (~18GB required)
        """
        self.device = device
        self.dtype = dtype
        self.enable_cpu_offload = enable_cpu_offload
        self.pipe = None       # Lazily populated by load_model()
        self._loaded = False

        logger.info(f"LongCatEditClient initialized (cpu_offload: {enable_cpu_offload})")

    def load_model(self) -> bool:
        """
        Load the pipeline into memory.

        Idempotent: returns True immediately if already loaded.

        Returns:
            True on success, False if loading failed (error is logged).
        """
        if self._loaded:
            return True

        try:
            logger.info(f"Loading LongCat-Image-Edit from {self.MODEL_ID}...")

            start_time = time.time()

            # Import is deferred so the module can be imported without diffusers.
            # Requires latest diffusers:
            #   pip install git+https://github.com/huggingface/diffusers
            from diffusers import LongCatImageEditPipeline

            self.pipe = LongCatImageEditPipeline.from_pretrained(
                self.MODEL_ID,
                torch_dtype=self.dtype,
            )

            # Memory strategy: offload trades throughput for VRAM headroom.
            if self.enable_cpu_offload:
                self.pipe.enable_model_cpu_offload()
                logger.info("CPU offload enabled (~18GB VRAM)")
            else:
                self.pipe.to(self.device, self.dtype)
                logger.info(f"Model moved to {self.device} (high VRAM mode)")

            load_time = time.time() - start_time
            logger.info(f"LongCat-Image-Edit loaded in {load_time:.1f}s")

            self._loaded = True
            return True

        except Exception as e:
            logger.error(f"Failed to load LongCat-Image-Edit: {e}", exc_info=True)
            return False

    def unload_model(self):
        """Unload model from memory and release cached CUDA allocations."""
        if self.pipe is not None:
            del self.pipe
            self.pipe = None

        self._loaded = False

        if torch.cuda.is_available():
            torch.cuda.empty_cache()

        logger.info("LongCat-Image-Edit unloaded")

    def _ensure_loaded(self) -> Optional["GenerationResult"]:
        """Load the model on demand; return an error result on failure, else None."""
        if self._loaded or self.load_model():
            return None
        return GenerationResult.error_result("Failed to load LongCat-Image-Edit model")

    def _resolve_settings(
        self,
        num_inference_steps: Optional[int],
        guidance_scale: Optional[float],
    ) -> tuple:
        """Fill in class defaults for any unspecified sampler settings."""
        return (
            self.DEFAULT_STEPS if num_inference_steps is None else num_inference_steps,
            self.DEFAULT_GUIDANCE if guidance_scale is None else guidance_scale,
        )

    def generate(
        self,
        request: GenerationRequest,
        num_inference_steps: Optional[int] = None,
        guidance_scale: Optional[float] = None,
        seed: int = DEFAULT_SEED,
    ) -> GenerationResult:
        """
        Edit image using LongCat-Image-Edit.

        Args:
            request: GenerationRequest object with:
                - prompt: The editing instruction (e.g., "Change the background to a forest")
                - input_images: List with the source image to edit
                - aspect_ratio: Output aspect ratio
            num_inference_steps: Number of denoising steps (default: 50)
            guidance_scale: Classifier-free guidance scale (default: 4.5)
            seed: Random seed for reproducibility (default: 42, the previous
                hard-coded value, so existing callers are unaffected)

        Returns:
            GenerationResult object
        """
        load_error = self._ensure_loaded()
        if load_error is not None:
            return load_error

        num_inference_steps, guidance_scale = self._resolve_settings(
            num_inference_steps, guidance_scale
        )

        try:
            start_time = time.time()

            if not request.has_input_images:
                return GenerationResult.error_result("LongCat-Image-Edit requires an input image to edit")

            # First non-None entry is the source image.
            input_image = next(
                (img for img in request.input_images if img is not None), None
            )
            if input_image is None:
                return GenerationResult.error_result("No valid input image provided")

            # Resize the source to the requested output dimensions.
            width, height = self._get_dimensions(request.aspect_ratio)
            input_image = input_image.convert('RGB')
            input_image = input_image.resize((width, height), Image.Resampling.LANCZOS)

            logger.info(f"Editing with LongCat: steps={num_inference_steps}, guidance={guidance_scale}")
            logger.info(f"Edit instruction: {request.prompt[:100]}...")

            # Build generation kwargs
            gen_kwargs = {
                "image": input_image,
                "prompt": request.prompt,
                "negative_prompt": request.negative_prompt or "",
                "guidance_scale": guidance_scale,
                "num_inference_steps": num_inference_steps,
                "num_images_per_prompt": 1,
                # CPU generator keeps the seed reproducible across devices.
                "generator": torch.Generator("cpu").manual_seed(seed),
            }

            with torch.inference_mode():
                output = self.pipe(**gen_kwargs)
                image = output.images[0]

            generation_time = time.time() - start_time
            logger.info(f"Edited in {generation_time:.2f}s: {image.size}")

            return GenerationResult.success_result(
                image=image,
                message=f"Edited with LongCat-Image-Edit in {generation_time:.2f}s",
                generation_time=generation_time
            )

        except Exception as e:
            logger.error(f"LongCat-Image-Edit generation failed: {e}", exc_info=True)
            return GenerationResult.error_result(f"LongCat-Image-Edit error: {str(e)}")

    def edit_with_instruction(
        self,
        source_image: Image.Image,
        instruction: str,
        negative_prompt: str = "",
        num_inference_steps: Optional[int] = None,
        guidance_scale: Optional[float] = None,
        seed: int = 42
    ) -> GenerationResult:
        """
        Simplified method for instruction-based image editing.

        Unlike generate(), the source image is passed through at its own size
        (no aspect-ratio resize).

        Args:
            source_image: The image to edit
            instruction: Natural language editing instruction
                Examples:
                - "Change the background to a sunset beach"
                - "Make the person wear a red dress"
                - "Add snow to the scene"
                - "Change the cat to a dog"
            negative_prompt: What to avoid in the output
            num_inference_steps: Denoising steps (default: 50)
            guidance_scale: CFG scale (default: 4.5)
            seed: Random seed for reproducibility

        Returns:
            GenerationResult with the edited image
        """
        load_error = self._ensure_loaded()
        if load_error is not None:
            return load_error

        num_inference_steps, guidance_scale = self._resolve_settings(
            num_inference_steps, guidance_scale
        )

        try:
            start_time = time.time()

            # Ensure RGB (model expects 3-channel input)
            source_image = source_image.convert('RGB')

            logger.info(f"Editing image with instruction: {instruction[:100]}...")

            with torch.inference_mode():
                output = self.pipe(
                    image=source_image,
                    prompt=instruction,
                    negative_prompt=negative_prompt,
                    guidance_scale=guidance_scale,
                    num_inference_steps=num_inference_steps,
                    num_images_per_prompt=1,
                    generator=torch.Generator("cpu").manual_seed(seed),
                )
                image = output.images[0]

            generation_time = time.time() - start_time
            logger.info(f"Edit completed in {generation_time:.2f}s")

            return GenerationResult.success_result(
                image=image,
                message=f"Edited with instruction in {generation_time:.2f}s",
                generation_time=generation_time
            )

        except Exception as e:
            logger.error(f"Instruction-based edit failed: {e}", exc_info=True)
            return GenerationResult.error_result(f"Edit error: {str(e)}")

    def _get_dimensions(self, aspect_ratio: str) -> tuple:
        """Get pixel dimensions for aspect ratio (delegates to get_dimensions)."""
        return self.get_dimensions(aspect_ratio)

    def is_healthy(self) -> bool:
        """Check if model is loaded and ready."""
        return self._loaded and self.pipe is not None

    @classmethod
    def get_dimensions(cls, aspect_ratio: str) -> tuple:
        """
        Get pixel dimensions for an aspect-ratio label.

        Accepts labels with a trailing description (e.g. "16:9 Widescreen");
        only the first whitespace-separated token is used. Unknown ratios
        fall back to 1024x1024.
        """
        ratio = aspect_ratio.split()[0] if " " in aspect_ratio else aspect_ratio
        return cls.ASPECT_RATIOS.get(ratio, (1024, 1024))