|
|
import gradio as gr |
|
|
import numpy as np |
|
|
from datetime import datetime |
|
|
import torch |
|
|
from PIL import Image |
|
|
|
|
|
import spaces |
|
|
from huggingface_hub import snapshot_download |
|
|
from pipeline_calligrapher import CalligrapherPipeline |
|
|
from models.calligrapher import Calligrapher |
|
|
from models.transformer_flux_inpainting import FluxTransformer2DModel |
|
|
from utils import process_gradio_source, get_bbox_from_mask, crop_image_from_bb, \ |
|
|
resize_img_and_pad, generate_context_reference_image |
|
|
|
|
|
|
|
|
|
|
|
def load_models():
    """Download the Calligrapher checkpoint and assemble the inference model.

    Requests ``calligrapher.bin`` from the ``Calligrapher2025/Calligrapher``
    Hub repository into the current working directory, loads the
    FLUX.1-Fill-dev transformer and pipeline in bfloat16, moves the pipeline
    to CUDA, and wraps it in a ``Calligrapher`` instance.

    Returns:
        The ``Calligrapher`` object built around the CUDA pipeline.
    """
    base_repo = "black-forest-labs/FLUX.1-Fill-dev"
    checkpoint_name = "calligrapher.bin"

    # allow_patterns limits the snapshot to the single checkpoint file.
    snapshot_download(
        repo_id="Calligrapher2025/Calligrapher",
        allow_patterns=checkpoint_name,
        local_dir="./",
    )
    print("calligrapher.bin successfully downloaded!")

    flux_transformer = FluxTransformer2DModel.from_pretrained(
        base_repo,
        subfolder="transformer",
        torch_dtype=torch.bfloat16,
    )
    flux_pipeline = CalligrapherPipeline.from_pretrained(
        base_repo,
        transformer=flux_transformer,
        torch_dtype=torch.bfloat16,
    )
    flux_pipeline = flux_pipeline.to("cuda")

    return Calligrapher(
        flux_pipeline,
        image_encoder_path="google/siglip-so400m-patch14-384",
        calligrapher_path=checkpoint_name,
        device="cuda",
        num_tokens=128,
    )
|
|
|
|
|
|
|
|
|
|
|
# Build the model once at import time; process_and_generate reads this
# module-level instance on every call instead of reloading the weights.
model = load_models()
print('Model loaded!')
|
|
|
|
|
|
|
|
def _prepend_context(source_image, mask_image, reference_context):
    """Stack the context strip above the source image and pad the mask to match.

    Args:
        source_image: PIL image already resized to the target resolution.
        mask_image: PIL mask with the same size as ``source_image``.
        reference_context: PIL image placed on top of the source image.

    Returns:
        Tuple ``(source_with_context, mask_with_context)``; both canvases are
        ``source width`` wide and ``context height + source height`` tall.
    """
    canvas_width, source_height = source_image.size
    context_height = reference_context.size[1]
    canvas_size = (canvas_width, context_height + source_height)

    source_with_context = Image.new(source_image.mode, canvas_size)
    source_with_context.paste(reference_context, (0, 0))
    source_with_context.paste(source_image, (0, context_height))

    # The mask canvas must use the image *height* for its vertical extent so
    # that it lines up with the stacked source for non-square resolutions.
    # The strip above the pasted mask is left at 0.
    mask_with_context = Image.new(mask_image.mode, canvas_size, color=0)
    mask_with_context.paste(mask_image, (0, context_height))

    return source_with_context, mask_with_context


@spaces.GPU()
def process_and_generate(editor_component, reference_image, prompt, height, width,
                         scale, steps=50, seed=42, use_context=True, num_images=1):
    """
    Process input images and generate customized text images using the Calligrapher model.

    This function handles the complete pipeline from processing user inputs through the image editor,
    preparing reference images, applying masks, and generating multiple customized text images
    based on the provided parameters.

    Args:
        editor_component: Gradio ImageEditor component containing the source image and mask drawings.
        reference_image: PIL Image object of the reference style image, or None for self-reference.
        prompt: String describing the desired text content.
        height: Integer height of the output image in pixels.
        width: Integer width of the output image in pixels.
        scale: Float value controlling the generation strength (0.0 to 2.0).
        steps: Integer number of inference steps for the generation process (default: 50).
        seed: Integer random seed for reproducible generation (default: 42).
        use_context: Boolean flag to include context reference in generation (default: True).
        num_images: Integer number of images to generate (default: 1).

    Returns:
        Tuple containing:
        - mask_vis: PIL Image of the binarized mask at the requested resolution (without the context strip).
        - reference_image_to_encoder: PIL Image of the resized reference image used by the encoder.
        - all_generated_images: List of tuples, each containing (generated_image, caption_string).
    """
    print('Begin processing!')

    # Integral parameters can arrive as floats (e.g. 2.0) from API clients;
    # normalise them once so resize(), range() and the seed arithmetic get ints.
    height, width = int(height), int(width)
    steps, seed, num_images = int(steps), int(seed), int(num_images)

    # The third value returned by the helper is not used here.
    source_image, mask_image, _ = process_gradio_source(editor_component)

    # Bring both inputs to the requested resolution; NEAREST keeps the mask
    # free of interpolated grey levels before it is binarized to {0, 255}.
    source_image = source_image.resize((width, height))
    mask_image = mask_image.resize((width, height), Image.NEAREST)
    mask_np = np.array(mask_image)
    mask_np[mask_np > 0] = 255
    mask_image = Image.fromarray(mask_np.astype(np.uint8))

    # The mask shown to the user is the one without the context strip. Keeping
    # a handle here avoids re-cropping it on every iteration and keeps the
    # return value defined even when no image is generated.
    mask_vis = mask_image

    # Self-reference: crop the masked region of the source as the style reference.
    if reference_image is None:
        tl, br = get_bbox_from_mask(mask_image)
        reference_image = crop_image_from_bb(source_image, tl, br)

    reference_image_to_encoder = resize_img_and_pad(reference_image, target_size=(512, 512))

    context_height = 0
    if use_context:
        reference_context = generate_context_reference_image(reference_image, width)
        context_height = reference_context.size[1]
        source_image, mask_image = _prepend_context(source_image, mask_image, reference_context)

    all_generated_images = []
    for i in range(num_images):
        res = model.generate(
            image=source_image,
            mask_image=mask_image,
            ref_image=reference_image_to_encoder,
            prompt=prompt,
            scale=scale,
            num_inference_steps=steps,
            width=source_image.size[0],
            height=source_image.size[1],
            seed=seed + i,
        )[0]
        if use_context:
            # Drop the context strip so only the edited image is displayed.
            res = res.crop((0, context_height, res.width, res.height))
        all_generated_images.append((res, f"Generating {i + 1} (Seed: {seed + i})"))

    return mask_vis, reference_image_to_encoder, all_generated_images
|
|
|
|
|
|
|
|
|
|
|
# Gradio UI definition. The left column holds the image editor, the result
# gallery and usage tips; the right column holds the generation settings, the
# run button and two preview images (mask and encoder reference).
# NOTE(review): the container nesting below was reconstructed from a source
# whose indentation had been flattened — confirm against the deployed layout.
with gr.Blocks(theme="default", css=".image-editor img {max-width: 70%; height: 70%;}") as demo:
    gr.Markdown(
        """
        # 🖌️ Calligrapher: Freestyle Text Image Customization    [[Code]](https://github.com/Calligrapher2025/Calligrapher) [[Project Page]](https://calligrapher2025.github.io/Calligrapher/)
        ### Consider giving a star to the [project](https://github.com/Calligrapher2025/Calligrapher) if you find it useful!
        """
    )

    with gr.Row():
        with gr.Column(scale=3):
            gr.Markdown("### 🎨 Image Editing Panel")
            # Single fixed white brush: whatever the user paints becomes the mask.
            editor_component = gr.ImageEditor(
                label="Upload or Draw",
                type="pil",
                brush=gr.Brush(colors=["#FFFFFF"], default_size=30, color_mode="fixed"),
                layers=True,
                interactive=True,
            )

            gr.Markdown("### 📤 Output Result")
            gallery = gr.Gallery(label="🖼️ Result Gallery")
            gr.Markdown(
                """<br>

                ### ✨User Tips:

                1. **Speed vs Quality Trade-off.** Use fewer steps (e.g., 10-step which takes ~4s/image on a single A6000 GPU) for faster generation, but quality may be lower.

                2. **Inpaint Position Freedom.** Inpainting positions are flexible - they don't necessarily need to match the original text locations in the input image.

                3. **Iterative Editing.** Drag outputs from the gallery to the Image Editing Panel (clean the Editing Panel first) for quick refinements.

                4. **Mask Optimization.** Adjust mask size/aspect ratio to match your desired content. The model tends to fill the masks, and harmonizes the generation with background in terms of color and lighting.

                5. **Reference Image Tip.** White-background references improve style consistency - the encoder also considers background context of the given reference image.

                6. **Resolution Balance.** Very high-resolution generation sometimes triggers spelling errors. 512/768px are recommended considering the model is trained under the resolution of 512.
                """
            )
        with gr.Column(scale=1):
            gr.Markdown("### ⚙️Settings")
            reference_image = gr.Image(
                label="🧩 Reference Image (skip this if self-reference)",
                sources=["upload"],
                type="pil",
            )
            prompt = gr.Textbox(
                label="📝 Prompt",
                placeholder="The text is 'Image'...",
                value="The text is 'Image'."
            )

            with gr.Accordion("🔧 Additional Settings", open=True):
                with gr.Row():
                    height = gr.Number(label="Height", value=512, precision=0)
                    width = gr.Number(label="Width", value=512, precision=0)
                # Sliders take (minimum, maximum, value) positionally; the
                # initial value is given once, matching the sliders below.
                scale = gr.Slider(0.0, 2.0, 1.0, step=0.1, label="🎚️ Strength")
                steps = gr.Slider(1, 100, 50, step=1, label="🔁 Steps")
                with gr.Row():
                    seed = gr.Number(label="🎲 Seed", value=56, precision=0)
                    use_context = gr.Checkbox(value=True, label="🔍 Use Context", interactive=True)
                num_images = gr.Slider(1, 16, 2, step=1, label="🖼️ Sample Amount")

            run_btn = gr.Button("🚀 Run", variant="primary")

            mask_output = gr.Image(label="🟩 Mask Demo")
            reference_demo = gr.Image(label="🧩 Reference Demo")

    # Wire the run button: the inputs are listed in the positional order of
    # process_and_generate's parameters, and the outputs in the order of the
    # tuple it returns.
    run_btn.click(
        fn=process_and_generate,
        inputs=[
            editor_component,
            reference_image,
            prompt,
            height,
            width,
            scale,
            steps,
            seed,
            use_context,
            num_images
        ],
        outputs=[
            mask_output,
            reference_demo,
            gallery
        ]
    )
|
|
|
|
|
# Script entry point: launch the Gradio app with mcp_server turned on.
if __name__ == "__main__":
    demo.launch(mcp_server=True)