Spaces:

TSXu
/

UniCalli_Dev

Running on Zero

App Files Files Community

TSXu committed on Jan 27

Commit

89e2699

1 Parent(s): e7cbbce

UI improvements: move status bar to right side, simplify layout, update defaults to Wang Xizhi

Browse files

Files changed (4) hide show

app.py +80 -86
inference.py +2 -2
src/flux/modules/layers.py +8 -6
src/flux/xflux_pipeline.py +4 -1

app.py CHANGED Viewed

@@ -1,6 +1,7 @@
 # -*- coding: utf-8 -*-
 """
 Gradio Demo for Chinese Calligraphy Generation - HuggingFace Space Version
 """
 # IMPORTANT: import spaces first before any CUDA-related packages
@@ -9,6 +10,7 @@ import spaces
 import gradio as gr
 import json
 import csv
 # Load author and font mappings from CSV
 def load_author_fonts_from_csv(csv_path):
@@ -83,84 +85,81 @@ def init_generator():
 def update_font_choices(author: str):
     """
     Update available font choices based on selected author
-    Args:
-        author: Selected author name
-    Returns:
-        Updated dropdown with available fonts for the author
     """
     if author == "None (Synthetic / 合成风格)" or author not in AUTHOR_FONTS:
-        # If no author or synthetic, show all font types
         choices = list(FONT_STYLE_NAMES.values())
     else:
-        # Show only fonts available for this author
         available_fonts = AUTHOR_FONTS[author]
         choices = [FONT_STYLE_NAMES[font] for font in available_fonts if font in FONT_STYLE_NAMES]
-    # Return updated dropdown with first choice as default
     return gr.Dropdown(choices=choices, value=choices[0] if choices else None)
-@spaces.GPU(duration=300)  # 5 minutes for model loading + generation
-def generate_calligraphy(
     text: str,
     author_dropdown: str,
     font_style: str,
     num_steps: int,
-    seed: int,
-    random_seed: bool,
-    batch_size: int = 1,
 ):
     """
-    Generate calligraphy based on user inputs
     Args:
         text: Input text (1-7 characters)
-        author_dropdown: Selected author from dropdown
-        font_style: Selected font style (display name)
-        num_steps: Number of denoising steps
-        seed: Random seed
-        random_seed: Whether to use random seed
-        batch_size: Number of images to generate
-    Returns:
-        Generated images (gallery) and seed info
     """
     import torch
-    # Validate text - must be 1-7 characters
     if len(text) < 1:
         raise gr.Error("文本不能为空 / Text cannot be empty")
     if len(text) > 7:
         raise gr.Error(f"文本最多7个字符 / Text must be at most 7 characters. Current: {len(text)}")
-    # Extract font style value from display name
-    font = None
-    for font_key, font_display in FONT_STYLE_NAMES.items():
-        if font_display == font_style:
-            font = font_key
-            break
     if font is None:
         raise gr.Error(f"无法识别的字体风格 / Unknown font style: {font_style}")
     # Determine author
     author = author_dropdown if author_dropdown != "None (Synthetic / 合成风格)" else None
-    # Handle seed
-    if random_seed:
-        seed = torch.randint(0, 2**32, (1,)).item()
-    # Initialize generator if needed
     gen = init_generator()
-    # Generate batch of images
     results = []
     seeds_used = []
-    for i in range(batch_size):
-        current_seed = seed + i  # Increment seed for each image in batch
         result_img, cond_img = gen.generate(
             text=text,
             font_style=font,
@@ -168,16 +167,19 @@ def generate_calligraphy(
             num_steps=num_steps,
             seed=current_seed,
         )
-        results.append(result_img)
         seeds_used.append(current_seed)
-    # Format seed info
-    if batch_size == 1:
-        seed_info = f"Seed: {seeds_used[0]}"
     else:
-        seed_info = f"Seeds: {seeds_used[0]} - {seeds_used[-1]} ({batch_size} images)"
-    return results, seed_info
 # Create Gradio interface
@@ -199,8 +201,8 @@ with gr.Blocks(title="UniCalli - Chinese Calligraphy Generator / 中国书法生
             text_input = gr.Textbox(
                 label="输入文本 / Input Text (1-7个字符 / 1-7 characters)",
-                placeholder="请输入1-7个汉字 / Enter 1-7 Chinese characters, e.g.: 春风得意马蹄疾",
-                value="春风得意马蹄疾",
                 max_lines=1
             )
@@ -209,19 +211,19 @@ with gr.Blocks(title="UniCalli - Chinese Calligraphy Generator / 中国书法生
             author_dropdown = gr.Dropdown(
                 label="1. 选择书法家 / Select Calligrapher",
                 choices=["None (Synthetic / 合成风格)"] + AUTHOR_LIST,
-                value="赵佶\\宋徽宗",
                 info="先选择历史书法家 / Choose a historical calligrapher first"
             )
-            # Get initial fonts for default author (赵佶\宋徽宗)
-            initial_author = "赵佶\\宋徽宗"
             initial_fonts = AUTHOR_FONTS.get(initial_author, ["楷", "草", "行"])
             initial_font_choices = [FONT_STYLE_NAMES[f] for f in initial_fonts if f in FONT_STYLE_NAMES]
             font_style = gr.Dropdown(
                 label="2. 选择字体风格 / Select Font Style",
                 choices=initial_font_choices,
-                value="楷 (Regular Script)",
                 info="根据所选书法家显示可用字体 / Shows available fonts for selected calligrapher"
             )
@@ -236,45 +238,40 @@ with gr.Blocks(title="UniCalli - Chinese Calligraphy Generator / 中国书法生
                 info="更多步数 = 更高质量，但更慢 / More steps = higher quality, but slower"
             )
-            with gr.Row():
-                seed = gr.Number(
-                    label="随机种子 / Seed",
-                    value=42,
-                    precision=0
-                )
-                random_seed = gr.Checkbox(
-                    label="随机种子 / Random Seed",
-                    value=False
-                )
-            batch_size = gr.Slider(
-                label="批量生成数量 / Batch Size",
                 minimum=1,
-                maximum=4,
                 value=1,
-                step=1,
-                info="生成多张图片以选择最佳效果 / Generate multiple images to pick the best"
             )
-            generate_btn = gr.Button("🎨 生成书法 / Generate Calligraphy", variant="primary", size="lg")
         with gr.Column(scale=1):
             # Output section
-            gr.Markdown("### 🖼️ 生成结果 / Generated Result")
-            gr.Markdown("")  # Add spacing
             output_gallery = gr.Gallery(
                 label="生成结果 / Generated Results",
                 show_label=False,
                 columns=2,
                 rows=2,
-                height=650,
                 object_fit="contain",
                 allow_preview=True
             )
-            seed_info = gr.Textbox(
-                label="种子信息 / Seed Info",
                 interactive=False
             )
@@ -291,45 +288,42 @@ with gr.Blocks(title="UniCalli - Chinese Calligraphy Generator / 中国书法生
         gr.Markdown(author_info_md)
     # Event handlers
-    # Update font choices when author changes
     author_dropdown.change(
         fn=update_font_choices,
         inputs=[author_dropdown],
         outputs=[font_style]
     )
-    # Generate button click
     generate_btn.click(
-        fn=generate_calligraphy,
         inputs=[
             text_input,
             author_dropdown,
             font_style,
             num_steps,
-            seed,
-            random_seed,
-            batch_size,
         ],
-        outputs=[output_gallery, seed_info]
     )
     # Examples
     gr.Markdown("### 📋 示例 / Examples")
     gr.Examples(
         examples=[
-            ["春风得意马蹄疾", "赵佶\\宋徽宗", "楷 (Regular Script)", 25, 42, False, 1],
-            ["海内存知己", "黄庭坚", "行 (Running Script)", 25, 42, False, 1],
-            ["天道酬勤", "王羲之", "草 (Cursive Script)", 25, 42, False, 1],
-            ["宁静致远", "None (Synthetic / 合成风格)", "楷 (Regular Script)", 25, 42, False, 1],
         ],
         inputs=[
             text_input,
             author_dropdown,
             font_style,
             num_steps,
-            seed,
-            random_seed,
-            batch_size,
         ],
     )

 # -*- coding: utf-8 -*-
 """
 Gradio Demo for Chinese Calligraphy Generation - HuggingFace Space Version
+With interactive session mode to avoid model reloading
 """
 # IMPORTANT: import spaces first before any CUDA-related packages
 import gradio as gr
 import json
 import csv
+import time
 # Load author and font mappings from CSV
 def load_author_fonts_from_csv(csv_path):
 def update_font_choices(author: str):
     """
     Update available font choices based on selected author
     """
     if author == "None (Synthetic / 合成风格)" or author not in AUTHOR_FONTS:
         choices = list(FONT_STYLE_NAMES.values())
     else:
         available_fonts = AUTHOR_FONTS[author]
         choices = [FONT_STYLE_NAMES[font] for font in available_fonts if font in FONT_STYLE_NAMES]
     return gr.Dropdown(choices=choices, value=choices[0] if choices else None)
+def parse_font_style(font_style: str) -> str:
+    """Extract font key from display name"""
+    for font_key, font_display in FONT_STYLE_NAMES.items():
+        if font_display == font_style:
+            return font_key
+    return None
+@spaces.GPU(duration=600)  # 10 minutes session for multiple generations
+def interactive_session(
     text: str,
     author_dropdown: str,
     font_style: str,
     num_steps: int,
+    start_seed: int,
+    num_images: int,
+    progress=gr.Progress()
 ):
     """
+    Interactive session: load model once, generate multiple images
     Args:
         text: Input text (1-7 characters)
+        author_dropdown: Selected author
+        font_style: Font style
+        num_steps: Inference steps
+        start_seed: Starting seed
+        num_images: Number of images to generate (each with different seed)
+    Yields:
+        Progress status, gallery of results
     """
     import torch
+    # Validate text
     if len(text) < 1:
         raise gr.Error("文本不能为空 / Text cannot be empty")
     if len(text) > 7:
         raise gr.Error(f"文本最多7个字符 / Text must be at most 7 characters. Current: {len(text)}")
+    # Parse font style
+    font = parse_font_style(font_style)
     if font is None:
         raise gr.Error(f"无法识别的字体风格 / Unknown font style: {font_style}")
     # Determine author
     author = author_dropdown if author_dropdown != "None (Synthetic / 合成风格)" else None
+    # Step 1: Load model (only once per session)
+    yield "⏳ 正在加载模型... / Loading model...", []
     gen = init_generator()
+    yield "✅ 模型加载完成！开始生成... / Model loaded! Starting generation...", []
+    # Step 2: Generate multiple images
     results = []
     seeds_used = []
+    for i in range(num_images):
+        current_seed = start_seed + i
+        progress((i + 1) / num_images, desc=f"生成第 {i+1}/{num_images} 张...")
+        yield f"🎨 正在生成第 {i+1}/{num_images} 张 (Seed: {current_seed})...", results
         result_img, cond_img = gen.generate(
             text=text,
             font_style=font,
             num_steps=num_steps,
             seed=current_seed,
         )
+        results.append((result_img, f"Seed: {current_seed}"))
         seeds_used.append(current_seed)
+        # Yield intermediate results so user can see progress
+        yield f"✅ 已完成 {i+1}/{num_images} 张 (Seed: {current_seed})", results
+    # Final yield with all seeds info
+    if num_images > 1:
+        final_status = f"✅ 全部完成！共 {num_images} 张 (Seeds: {seeds_used[0]}-{seeds_used[-1]})"
     else:
+        final_status = f"✅ 完成！Seed: {seeds_used[0]}"
+    yield final_status, results
 # Create Gradio interface
             text_input = gr.Textbox(
                 label="输入文本 / Input Text (1-7个字符 / 1-7 characters)",
+                placeholder="请输入1-7个汉字 / Enter 1-7 Chinese characters, e.g.: 天道酬勤",
+                value="天道酬勤",
                 max_lines=1
             )
             author_dropdown = gr.Dropdown(
                 label="1. 选择书法家 / Select Calligrapher",
                 choices=["None (Synthetic / 合成风格)"] + AUTHOR_LIST,
+                value="王羲之",
                 info="先选择历史书法家 / Choose a historical calligrapher first"
             )
+            # Get initial fonts for default author (王羲之)
+            initial_author = "王羲之"
             initial_fonts = AUTHOR_FONTS.get(initial_author, ["楷", "草", "行"])
             initial_font_choices = [FONT_STYLE_NAMES[f] for f in initial_fonts if f in FONT_STYLE_NAMES]
             font_style = gr.Dropdown(
                 label="2. 选择字体风格 / Select Font Style",
                 choices=initial_font_choices,
+                value="草 (Cursive Script)",
                 info="根据所选书法家显示可用字体 / Shows available fonts for selected calligrapher"
             )
                 info="更多步数 = 更高质量，但更慢 / More steps = higher quality, but slower"
             )
+            start_seed = gr.Number(
+                label="起始种子 / Start Seed",
+                value=42,
+                precision=0
+            )
+            num_images = gr.Slider(
+                label="生成数量 / Number of Images",
                 minimum=1,
+                maximum=8,
                 value=1,
+                step=1
             )
+            generate_btn = gr.Button("🎨 开始生成 / Start Generation", variant="primary", size="lg")
         with gr.Column(scale=1):
             # Output section
+            gr.Markdown("### 🖼️ 生成结果 / Generated Results")
+            gr.Markdown("*点击图片可放大查看 / Click image to enlarge*")
             output_gallery = gr.Gallery(
                 label="生成结果 / Generated Results",
                 show_label=False,
                 columns=2,
                 rows=2,
+                height=550,
                 object_fit="contain",
                 allow_preview=True
             )
+            status_text = gr.Textbox(
+                label="状态 / Status",
+                value="准备就绪 / Ready",
                 interactive=False
             )
         gr.Markdown(author_info_md)
     # Event handlers
     author_dropdown.change(
         fn=update_font_choices,
         inputs=[author_dropdown],
         outputs=[font_style]
     )
+    # Generate button - uses streaming for live updates
     generate_btn.click(
+        fn=interactive_session,
         inputs=[
             text_input,
             author_dropdown,
             font_style,
             num_steps,
+            start_seed,
+            num_images,
         ],
+        outputs=[status_text, output_gallery]
     )
     # Examples
     gr.Markdown("### 📋 示例 / Examples")
     gr.Examples(
         examples=[
+            ["天道酬勤", "王羲之", "草 (Cursive Script)", 25, 42, 1],
+            ["春风得意马蹄疾", "赵佶\\宋徽宗", "楷 (Regular Script)", 25, 42, 1],
+            ["海内存知己", "黄庭坚", "行 (Running Script)", 25, 42, 1],
+            ["宁静致远", "None (Synthetic / 合成风格)", "楷 (Regular Script)", 25, 42, 1],
         ],
         inputs=[
             text_input,
             author_dropdown,
             font_style,
             num_steps,
+            start_seed,
+            num_images,
         ],
     )

inference.py CHANGED Viewed

@@ -341,8 +341,8 @@ class CalligraphyGenerator:
         # Move to GPU only if NOT using DeepSpeed (DeepSpeed will handle device placement)
         if not use_deepspeed:
-            print(f"Moving model to {self.device}...")
-            model = model.to(self.device)
             # Enable optimized attention backends
             try:

         # Move to GPU only if NOT using DeepSpeed (DeepSpeed will handle device placement)
         if not use_deepspeed:
+            print(f"Moving model to {self.device} and converting to float32...")
+            model = model.to(device=self.device, dtype=torch.float32)
             # Enable optimized attention backends
             try:

src/flux/modules/layers.py CHANGED Viewed

@@ -34,19 +34,21 @@ def timestep_embedding(t: Tensor, dim, max_period=10000, time_factor: float = 10
     :param max_period: controls the minimum frequency of the embeddings.
     :return: an (N, D) Tensor of positional embeddings.
     """
     t = time_factor * t
     half = dim // 2
-    freqs = torch.exp(-math.log(max_period) * torch.arange(start=0, end=half, dtype=torch.float32) / half).to(
-        t.device
-    )
     args = t[:, None].float() * freqs[None]
     embedding = torch.cat([torch.cos(args), torch.sin(args)], dim=-1)
     if dim % 2:
         embedding = torch.cat([embedding, torch.zeros_like(embedding[:, :1])], dim=-1)
-    if torch.is_floating_point(t):
-        embedding = embedding.to(t)
-    return embedding
 class MLPEmbedder(nn.Module):

     :param max_period: controls the minimum frequency of the embeddings.
     :return: an (N, D) Tensor of positional embeddings.
     """
+    # Store original dtype and device
+    orig_dtype = t.dtype
+    orig_device = t.device
     t = time_factor * t
     half = dim // 2
+    freqs = torch.exp(-math.log(max_period) * torch.arange(start=0, end=half, dtype=torch.float32, device=orig_device) / half)
     args = t[:, None].float() * freqs[None]
     embedding = torch.cat([torch.cos(args), torch.sin(args)], dim=-1)
     if dim % 2:
         embedding = torch.cat([embedding, torch.zeros_like(embedding[:, :1])], dim=-1)
+    # Always convert to original dtype
+    return embedding.to(dtype=orig_dtype, device=orig_device)
 class MLPEmbedder(nn.Module):

src/flux/xflux_pipeline.py CHANGED Viewed

@@ -225,6 +225,7 @@ class XFluxPipeline:
             if self.controlnet_loaded:
                 controlnet_image = self.annotator(controlnet_image, width, height)
             controlnet_image = torch.from_numpy((np.array(controlnet_image) / 127.5) - 1)
             controlnet_image = controlnet_image.permute(
                 2, 0, 1).unsqueeze(0).to(torch.float32).to(self.device)
@@ -311,6 +312,7 @@ class XFluxPipeline:
         neg_ip_scale=1.0,
         is_generation=True,
     ):
         x = get_noise(
             1, height, width, device=self.device,
             dtype=torch.float32, seed=seed
@@ -328,7 +330,8 @@ class XFluxPipeline:
             if not self.controlnet_loaded and controlnet_image is not None:  # tianshuo
                 # width //= 2
-                cond_latent = self.ae.encode(controlnet_image.to(self.device, dtype=torch.float32))
             inp_cond = prepare(t5=self.t5, clip=self.clip, img=x, prompt=prompt)
             neg_inp_cond = prepare(t5=self.t5, clip=self.clip, img=x, prompt=neg_prompt)

             if self.controlnet_loaded:
                 controlnet_image = self.annotator(controlnet_image, width, height)
             controlnet_image = torch.from_numpy((np.array(controlnet_image) / 127.5) - 1)
+            # Keep as float32 for VAE encoding, will be converted to model dtype after
             controlnet_image = controlnet_image.permute(
                 2, 0, 1).unsqueeze(0).to(torch.float32).to(self.device)
         neg_ip_scale=1.0,
         is_generation=True,
     ):
+        # Use float32 for stable inference
         x = get_noise(
             1, height, width, device=self.device,
             dtype=torch.float32, seed=seed
             if not self.controlnet_loaded and controlnet_image is not None:  # tianshuo
                 # width //= 2
+                # VAE expects float32 (controlnet_image is already float32)
+                cond_latent = self.ae.encode(controlnet_image)
             inp_cond = prepare(t5=self.t5, clip=self.clip, img=x, prompt=prompt)
             neg_inp_cond = prepare(t5=self.t5, clip=self.clip, img=x, prompt=neg_prompt)