LongCat-Image

Running on Zero

File size: 9,059 Bytes

99582fb
de41cc1
 
 
 
 
51779c5
de41cc1
36abdd4
 
 
 
 
 
 
de41cc1
d2176ba
40c2e73
 
 
 
51779c5
85df522
de41cc1
d0e03bd
 
 
 
be5446b
85df522
 
 
d0e03bd
be5446b
85df522
be5446b
85df522
 
 
2cefbad
d0e03bd
 
 
85df522
2cefbad
85df522
d0e03bd
 
 
 
 
 
be5446b
d0e03bd
 
 
 
be5446b
d0e03bd
be5446b
d0e03bd
 
 
 
 
 
 
 
 
 
 
cf3fdd8
 
 
 
d0e03bd
cf3fdd8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4002df7
d0e03bd
4002df7
 
 
 
 
 
 
 
 
 
d0e03bd
2cefbad
4002df7
 
 
 
 
 
ed65bd6
4002df7
 
 
 
 
 
 
de41cc1
6e4901a
de41cc1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6e4901a
de41cc1
d0e03bd
de41cc1
 
be5446b
 
 
de41cc1
 
 
 
4670eb3
de41cc1
 
 
f81c51a
4002df7
 
 
 
de41cc1
d2176ba
4002df7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
be5446b
4002df7
16c8236
4002df7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
36abdd4
4002df7
 
 
 
 
 
 
 
 
 
 
d0e03bd
4002df7
 
de41cc1
4002df7
 
 
 
 
d0e03bd
4002df7
 
 
be5446b
4002df7

import spaces
import gradio as gr
import torch
from PIL import Image
from transformers import AutoProcessor
from longcat_image.models import LongCatImageTransformer2DModel
from longcat_image.pipelines import LongCatImageEditPipeline, LongCatImagePipeline
import numpy as np
import random

import os
import requests
import tempfile
import shutil
from urllib.parse import urlparse


# NOTE(review): not referenced elsewhere in the visible file; presumably the
# intended upper bound for the width/height sliders -- confirm.
MAX_IMAGE_SIZE = 2048
# Upper bound for randomly drawn seeds: the largest signed 32-bit integer.
MAX_SEED = np.iinfo(np.dtype("int32")).max


# --- Model Loading ---
# Everything below runs once at import time; both pipelines stay resident.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')


def _load_longcat_pipeline(model_id, pipeline_cls):
    """Build one LongCat pipeline and return its parts.

    Loads the text processor from the ``tokenizer`` subfolder and the
    transformer (bfloat16, safetensors) from the ``transformer`` subfolder of
    ``model_id``, assembles ``pipeline_cls`` around them and moves the result
    to ``device`` in bfloat16.

    Returns:
        A ``(text_processor, transformer, pipeline)`` tuple.
    """
    text_processor = AutoProcessor.from_pretrained(model_id, subfolder='tokenizer')

    transformer = LongCatImageTransformer2DModel.from_pretrained(
        model_id,
        subfolder='transformer',
        torch_dtype=torch.bfloat16,
        use_safetensors=True,
    )
    transformer = transformer.to(device)

    pipeline = pipeline_cls.from_pretrained(
        model_id,
        transformer=transformer,
        text_processor=text_processor,
    )
    pipeline.to(device, torch.bfloat16)
    return text_processor, transformer, pipeline


# Text-to-Image Model
t2i_model_id = 'meituan-longcat/LongCat-Image'
print(f"🔄 Loading Text-to-Image model from {t2i_model_id}...")
t2i_text_processor, t2i_transformer, pipe = _load_longcat_pipeline(
    t2i_model_id, LongCatImagePipeline
)
print("✅ Text-to-Image model loaded successfully")

# Image Edit Model
edit_model_id = 'meituan-longcat/LongCat-Image-Edit'
print(f"🔄 Loading Image Edit model from {edit_model_id}...")
edit_text_processor, edit_transformer, edit_pipe = _load_longcat_pipeline(
    edit_model_id, LongCatImageEditPipeline
)
print(f"✅ Image Edit model loaded successfully on {device}")
def load_lora_auto(pipe, lora_input):
    """Load LoRA weights into ``pipe`` from a repo id, a Hub page URL or a file URL.

    Accepted forms of ``lora_input``:

    * ``"author/model"`` -- handed to ``pipe.load_lora_weights`` as is.
    * ``"https://huggingface.co/author/model[/...]"`` without ``/blob/`` or
      ``/resolve/`` -- reduced to the repo id ``author/model``.
    * any other ``http(s)`` URL -- downloaded into a temporary directory and
      loaded from the local file; ``/blob/`` links are rewritten to
      ``/resolve/`` first.

    ``None``, empty and whitespace-only input is a no-op. A value that is
    neither a URL nor contains ``/`` is reported and ignored.

    Args:
        pipe: Object exposing ``load_lora_weights(reference)``.
        lora_input: Repo id or URL as described above.

    Raises:
        requests.RequestException: If the download fails or times out.
    """
    lora_input = (lora_input or "").strip()
    if not lora_input:
        return

    # Check the full scheme so that ids such as "httpfoo/bar" are not
    # mistaken for URLs.
    if not lora_input.startswith(("http://", "https://")):
        if "/" in lora_input:
            # Plain repo id like "author/model".
            pipe.load_lora_weights(lora_input)
        else:
            print(f"Ignoring unrecognised LoRA reference: {lora_input!r}")
        return

    url = lora_input

    # Repo page (no blob/resolve): keep only "<owner>/<name>" so that links
    # copied from sub-pages such as ".../tree/main" still resolve to the repo.
    if "huggingface.co" in url and "/blob/" not in url and "/resolve/" not in url:
        segments = [part for part in urlparse(url).path.split("/") if part]
        pipe.load_lora_weights("/".join(segments[:2]))
        return

    # Blob links point at an HTML viewer page; the resolve link serves the file.
    if "/blob/" in url:
        url = url.replace("/blob/", "/resolve/")

    tmp_dir = tempfile.mkdtemp()
    local_path = os.path.join(tmp_dir, os.path.basename(urlparse(url).path))

    try:
        print(f"Downloading LoRA from {url}...")
        # (connect, read) timeouts keep a stalled server from blocking the
        # worker indefinitely; the context manager releases the connection.
        with requests.get(url, stream=True, timeout=(10, 60)) as resp:
            resp.raise_for_status()
            with open(local_path, "wb") as f:
                for chunk in resp.iter_content(chunk_size=8192):
                    if chunk:
                        f.write(chunk)
        print(f"Saved LoRA to {local_path}")
        pipe.load_lora_weights(local_path)
    finally:
        # The weights are loaded by now (or loading failed); drop the download.
        shutil.rmtree(tmp_dir, ignore_errors=True)

@spaces.GPU(duration=120)
def infer(prompt, seed=42, randomize_seed=False, width=1024, height=1024, guidance_scale=4, num_inference_steps=28, lora_id=None, lora_scale=0.95, progress=gr.Progress(track_tqdm=True)):
    """Generate one image from a text prompt with the text-to-image pipeline.

    Args:
        prompt: Text description of the image to generate.
        seed: Seed for the random generator; ignored when ``randomize_seed`` is set.
        randomize_seed: Draw a fresh seed in ``[0, MAX_SEED]`` instead of using ``seed``.
        width: Output width in pixels.
        height: Output height in pixels.
        guidance_scale: Guidance scale forwarded to the pipeline.
        num_inference_steps: Number of inference steps forwarded to the pipeline.
        lora_id: Optional LoRA repo id or URL, loaded for this call only.
        lora_scale: Accepted for interface compatibility; see the note below.
        progress: Gradio progress tracker (tracks tqdm output).

    Returns:
        A ``(image, seed)`` tuple: the first generated image and the seed used.
    """
    # NOTE(review): lora_scale is received from the UI but is not applied
    # anywhere in this function -- confirm how the pipeline expects it.

    # API/MCP clients may send slider values as floats; the generator seed
    # must be an int, and the size/step arguments are integral quantities.
    seed = random.randint(0, MAX_SEED) if randomize_seed else int(seed)
    width = int(width)
    height = int(height)
    num_inference_steps = int(num_inference_steps)
    generator = torch.Generator().manual_seed(seed)

    # One normalised value decides both loading and unloading.
    lora_ref = (lora_id or "").strip()

    try:
        if lora_ref:
            # Loading sits inside the try so that a failed or partial load is
            # still unloaded and cannot affect the next request.
            pipe.unload_lora_weights()
            load_lora_auto(pipe, lora_ref)

        image = pipe(
            prompt=prompt,
            negative_prompt="",
            width=width,
            height=height,
            num_inference_steps=num_inference_steps,
            generator=generator,
            guidance_scale=guidance_scale,
        ).images[0]
        print("Image Generation Completed for: ", prompt, lora_id)
        return image, seed
    finally:
        # Unload LoRA weights if they were loaded
        if lora_ref:
            pipe.unload_lora_weights()

@spaces.GPU(duration=120)
def edit_image(
    input_image: Image.Image,
    prompt: str,
    seed: int,
    progress=gr.Progress()
):
    """Edit an image according to a text instruction.

    The image is converted to RGB if needed and passed to the edit pipeline
    with an empty negative prompt, a guidance scale of 4.5 and 50 inference
    steps; a single image is requested.

    Args:
        input_image: Image to edit.
        prompt: Edit instruction.
        seed: Seed for the random generator.
        progress: Gradio progress tracker, updated at each stage.

    Returns:
        The first image produced by the edit pipeline.

    Raises:
        gr.Error: If the image or the instruction is missing, or if anything
            fails during editing (the original exception is chained).
    """
    if input_image is None:
        raise gr.Error("Please upload an image first")
    if not prompt or prompt.strip() == "":
        raise gr.Error("Please enter an edit instruction")
    try:
        progress(0.1, desc="Preparing image...")
        if input_image.mode != 'RGB':
            input_image = input_image.convert('RGB')
        progress(0.2, desc="Generating edited image...")
        generator_device = "cuda" if torch.cuda.is_available() else "cpu"
        # int() guards against float seeds coming from sliders or API clients.
        generator = torch.Generator(generator_device).manual_seed(int(seed))
        with torch.inference_mode():
            output = edit_pipe(
                input_image,
                prompt,
                negative_prompt="",
                guidance_scale=4.5,
                num_inference_steps=50,
                num_images_per_prompt=1,
                generator=generator
            )
        progress(1.0, desc="Done!")
        return output.images[0]
    except Exception as e:
        # Shown to the user by Gradio; chaining keeps the traceback in the logs.
        raise gr.Error(f"Error during image editing: {e}") from e


# Sample prompts offered through gr.Examples; selecting one fills the prompt box.
examples = [
    "a tiny astronaut hatching from an egg on the moon",
    "a cat holding a sign that says hello world",
    "an anime illustration of a wiener schnitzel",
]

# Custom stylesheet handed to gr.Blocks: centres the main column (capped at
# 960px) and styles elements carrying the "generate-btn" class.
css = """
#col-container {
   margin: 0 auto;
   max-width: 960px;
}
.generate-btn {
   background: linear-gradient(90deg, #4B79A1 0%, #283E51 100%) !important;
   border: none !important;
   color: white !important;
}
.generate-btn:hover {
   transform: translateY(-2px);
   box-shadow: 0 5px 15px rgba(0,0,0,0.2);
}
"""

# Text-to-image UI: prompt and advanced settings on the left, result on the right.
with gr.Blocks(css=css) as app:
    gr.HTML("<center><h1>LongCat-Image 6B</h1></center>")
    with gr.Column(elem_id="col-container"):
        with gr.Row():
            with gr.Column():
                with gr.Row():
                    text_prompt = gr.Textbox(label="Prompt", placeholder="Enter a prompt here", lines=3, elem_id="prompt-text-input")
                with gr.Row():
                    with gr.Accordion("Advanced Settings", open=False):
                        with gr.Row():
                            custom_lora = gr.Textbox(label="Custom LoRA (optional)", info="URL or the path to the LoRA weights", placeholder="kudzueye/boreal-qwen-image")
                            lora_scale = gr.Slider(
                                label="LoRA Scale",
                                minimum=0,
                                maximum=2,
                                step=0.01,
                                value=1,
                            )
                        with gr.Row():
                            # Upper bound comes from the shared module constant.
                            width = gr.Slider(label="Width", value=1024, minimum=64, maximum=MAX_IMAGE_SIZE, step=16)
                            height = gr.Slider(label="Height", value=1024, minimum=64, maximum=MAX_IMAGE_SIZE, step=16)
                        # -1 is only a placeholder: with "Randomize seed" checked,
                        # infer draws a seed and writes it back to this slider.
                        seed = gr.Slider(label="Seed", value=-1, minimum=-1, maximum=4294967296, step=1)
                        randomize_seed = gr.Checkbox(label="Randomize seed", value=True)
                        with gr.Row():
                            steps = gr.Slider(label="Inference steps", value=28, minimum=1, maximum=100, step=1)
                            cfg = gr.Slider(label="Guidance Scale", value=4.5, minimum=1, maximum=20, step=0.5)

                with gr.Row():
                    text_button = gr.Button("✨ Generate Image", variant='primary', elem_classes=["generate-btn"])
            with gr.Column():
                with gr.Row():
                    image_output = gr.Image(type="pil", label="Image Output", elem_id="gallery")

        with gr.Column():
            gr.Examples(
                examples=examples,
                inputs=[text_prompt],
            )

    # Clicking the button and submitting the prompt box both run infer; the
    # seed that was actually used is written back to the seed slider.
    gr.on(
        triggers=[text_button.click, text_prompt.submit],
        fn=infer,
        inputs=[text_prompt, seed, randomize_seed, width, height, cfg, steps, custom_lora, lora_scale],
        outputs=[image_output, seed],
    )

app.launch(share=True, mcp_server=True)