Alexander Bagus committed on
Commit
b30e434
·
1 Parent(s): 8009598

Initial commit

Browse files
Files changed (6) hide show
  1. README.md +2 -3
  2. app.py +421 -0
  3. requirements.txt +8 -0
  4. utils/image_utils.py +76 -0
  5. utils/prompt_utils.py +71 -0
  6. utils/repo_utils.py +33 -0
README.md CHANGED
@@ -1,11 +1,10 @@
1
  ---
2
  title: Z Image To LoRA
3
- emoji: 🌖
4
  colorFrom: gray
5
  colorTo: green
6
  sdk: gradio
7
- sdk_version: 6.5.1
8
- python_version: '3.12'
9
  app_file: app.py
10
  pinned: false
11
  license: apache-2.0
 
1
  ---
2
  title: Z Image To LoRA
3
+ emoji: 🥹
4
  colorFrom: gray
5
  colorTo: green
6
  sdk: gradio
7
+ sdk_version: 6.1.0
 
8
  app_file: app.py
9
  pinned: false
10
  license: apache-2.0
app.py ADDED
@@ -0,0 +1,421 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import numpy as np
3
+ import torch, random, json, spaces, time
4
+ from ulid import ULID
5
+ from diffsynth.pipelines.z_image import (
6
+ ModelConfig, ZImageUnit_Image2LoRAEncode, ZImageUnit_Image2LoRADecode
7
+ )
8
+ from diffsynth.pipelines.z_image import ZImagePipeline as ZImagePipelineDs
9
+ from diffusers import ZImagePipeline
10
+ from safetensors.torch import save_file, load_file
11
+ import torch
12
+ from PIL import Image
13
+ from pathlib import Path
14
+ from utils import repo_utils, image_utils, prompt_utils
15
+ from huggingface_hub import snapshot_download
16
+ import glob
17
+
18
+ # repo_utils.clone_repo_if_not_exists("git clone https://huggingface.co/DiffSynth-Studio/General-Image-Encoders", "app/repos")
19
+ # repo_utils.clone_repo_if_not_exists("https://huggingface.co/apple/starflow", "app/models")
20
+
21
+ URL_PUBLIC = "https://huggingface.co/spaces/AiSudo/Qwen-Image-to-LoRA/blob/main"
22
+ DTYPE = torch.bfloat16
23
+ MAX_SEED = np.iinfo(np.int32).max
24
+
25
+ MODELS_DIR = Path("./models")
26
def download_hf_models(output_dir: Path) -> dict:
    """
    Download required models from Hugging Face using huggingface_hub.

    Downloads:
    - DiffSynth-Studio/Z-Image-i2L
    - Tongyi-MAI/Z-Image
    - DiffSynth-Studio/General-Image-Encoders
    - Tongyi-MAI/Z-Image-Turbo

    Repos already present on disk (detected by any *.safetensors file under
    their target directory) are skipped.

    Returns:
        dict mapping repo_id -> local Path of the downloaded snapshot.

    Raises:
        Re-raises any exception from snapshot_download after logging it.
    """

    output_dir.mkdir(parents=True, exist_ok=True)

    models = [
        {
            "repo_id": "DiffSynth-Studio/General-Image-Encoders",
            "description": "General Image Encoders (SigLIP2-G384, DINOv3-7B)",
            "allow_patterns": None,
        },
        {
            "repo_id": "Tongyi-MAI/Z-Image-Turbo",
            "description": "Z-Image Turbo",
        },
        {
            "repo_id": "Tongyi-MAI/Z-Image",
            "description": "Z-Image base model (transformer)",
            "allow_patterns": ["transformer/*.safetensors"],
        },
        {
            "repo_id": "DiffSynth-Studio/Z-Image-i2L",
            "description": "Z-Image-i2L (Image to LoRA model)",
            "allow_patterns": ["*.safetensors"],
        },
    ]

    downloaded_paths = {}

    for model in models:
        repo_id = model["repo_id"]
        local_dir = output_dir / repo_id

        # Check if already downloaded
        if local_dir.exists() and any(local_dir.rglob("*.safetensors")):
            print(f" ✓ {repo_id} (already downloaded)")
            downloaded_paths[repo_id] = local_dir
            continue

        print(f" 📥 Downloading {repo_id}...")
        print(f" {model['description']}")

        try:
            result_path = snapshot_download(
                repo_id=repo_id,
                local_dir=str(local_dir),
                # BUG FIX: the Z-Image-Turbo entry has no "allow_patterns" key,
                # so model["allow_patterns"] raised KeyError; .get() defaults
                # to None (= download everything).
                allow_patterns=model.get("allow_patterns"),
                local_dir_use_symlinks=False,
                resume_download=True,
            )
            downloaded_paths[repo_id] = Path(result_path)
            print(f" ✓ {repo_id}")
        except Exception as e:
            print(f" ❌ Error downloading {repo_id}: {e}")
            raise

    return downloaded_paths
93
+
94
def get_model_files(base_path: Path, pattern: str) -> list:
    """Return the sorted file paths under *base_path* that match the glob *pattern*."""
    return sorted(glob.glob(str(base_path / pattern)))
99
+
100
# Fetch all model weights up front (a no-op for repos already cached in ./models).
downloaded_paths = download_hf_models(MODELS_DIR)

# Z-Image base model: only the transformer weights are fetched/used.
zimage_path = MODELS_DIR / "Tongyi-MAI" / "Z-Image"
zimage_transformer_files = get_model_files(zimage_path, "transformer/*.safetensors")

# Z-Image-Turbo: supplies the text encoder, VAE and tokenizer.
zimage_turbo_path = MODELS_DIR / "Tongyi-MAI" / "Z-Image-Turbo"
text_encoder_files = get_model_files(zimage_turbo_path, "text_encoder/*.safetensors")
vae_file = get_model_files(zimage_turbo_path, "vae/diffusion_pytorch_model.safetensors")
tokenizer_path = zimage_turbo_path / "tokenizer"

# General Image Encoders (SigLIP2 + DINOv3), consumed by the image-to-LoRA encoder.
encoders_path = MODELS_DIR / "DiffSynth-Studio" / "General-Image-Encoders"
siglip_file = get_model_files(encoders_path, "SigLIP2-G384/model.safetensors")
dino_file = get_model_files(encoders_path, "DINOv3-7B/model.safetensors")

# Z-Image-i2L: the image-to-LoRA model itself.
zimage_i2l_path = MODELS_DIR / "DiffSynth-Studio" / "Z-Image-i2L"
zimage_i2l_file = get_model_files(zimage_i2l_path, "model.safetensors")

# Startup sanity log: every count should be >= 1 if the downloads succeeded.
print(f" Z-Image transformer: {len(zimage_transformer_files)} file(s)")
print(f" Text encoder: {len(text_encoder_files)} file(s)")
print(f" VAE: {len(vae_file)} file(s)")
print(f" Tokenizer: {tokenizer_path}")
print(f" SigLIP2: {len(siglip_file)} file(s)")
print(f" DINOv3: {len(dino_file)} file(s)")
print(f" Z-Image-i2L: {len(zimage_i2l_file)} file(s)")
127
+
128
################
# Pipeline construction
################

# Run every stage (offload/onload/prepare/compute) in bfloat16 on the GPU.
vram_config = {
    "offload_dtype": torch.bfloat16,
    "offload_device": "cuda",
    "onload_dtype": torch.bfloat16,
    "onload_device": "cuda",
    "preparing_dtype": torch.bfloat16,
    "preparing_device": "cuda",
    "computation_dtype": torch.bfloat16,
    "computation_device": "cuda",
}

model_configs = [
    # All models from HuggingFace - use path= for local files
    ModelConfig(path=zimage_transformer_files, **vram_config),
    ModelConfig(path=text_encoder_files),
    ModelConfig(path=vae_file),
    ModelConfig(path=siglip_file),
    ModelConfig(path=dino_file),
    ModelConfig(path=zimage_i2l_file),
]

# DiffSynth pipeline: encodes reference images into LoRA weights (image-to-LoRA).
pipe_lora = ZImagePipelineDs.from_pretrained(
    torch_dtype=torch.bfloat16,
    device="cuda",
    model_configs=model_configs,
    tokenizer_config=ModelConfig(path=str(tokenizer_path)),
)

# Diffusers pipeline: text-to-image generation with the produced LoRA applied.
pipe_imagen = ZImagePipeline.from_pretrained(
    "./models/Tongyi-MAI/Z-Image-Turbo",
    torch_dtype=torch.bfloat16,
    low_cpu_mem_usage=False,
)
pipe_imagen.to("cuda")
214
+
215
+
216
@spaces.GPU
def generate_lora(
    input_images,
    progress=gr.Progress(track_tqdm=True),
):
    """
    Encode the uploaded gallery images into a LoRA safetensors file.

    Args:
        input_images: Gradio Gallery value — a list of (filepath, caption) tuples.
        progress: Gradio progress tracker.

    Returns:
        (lora_file_name, DownloadButton update, imagen-button update) — exactly
        the three outputs wired up in the click handler.

    Raises:
        gr.Error: when no images were uploaded.
    """
    ulid = str(ULID()).lower()[:12]
    print(f"ulid: {ulid}")

    if not input_images:
        # BUG FIX: previously returned a bare `False`, but the click handler
        # expects three outputs; raise a user-visible Gradio error instead.
        raise gr.Error("Please upload at least one image.")

    progress(0.1, desc="Processing images...")
    # Gallery entries are (filepath, caption) pairs; captions are ignored.
    pil_images = [Image.open(filepath).convert("RGB") for filepath, _ in input_images]

    progress(0.3, desc="Encoding images to LoRA...")
    # Model inference
    with torch.no_grad():
        embs = ZImageUnit_Image2LoRAEncode().process(pipe_lora, image2lora_images=pil_images)
        progress(0.7, desc="Decoding LoRA weights...")
        lora = ZImageUnit_Image2LoRADecode().process(pipe_lora, **embs)["lora"]

    progress(0.9, desc="Saving LoRA file...")
    lora_name = f"{ulid}.safetensors"
    # BUG FIX: save_file does not create parent directories — ensure it exists.
    lora_dir = Path("loras")
    lora_dir.mkdir(exist_ok=True)
    lora_path = str(lora_dir / lora_name)

    save_file(lora, lora_path)
    # Report completion only after the file is actually on disk.
    progress(1.0, desc="Done!")

    return lora_name, gr.update(interactive=True, value=lora_path), gr.update(interactive=True)
248
+
249
@spaces.GPU
def generate_image(
    lora_name,
    prompt,
    negative_prompt="blurry ugly bad",
    width=1024,
    height=1024,
    seed=42,
    randomize_seed=True,
    guidance_scale=3.5,
    num_inference_steps=8,
    progress=gr.Progress(track_tqdm=True),
):
    """
    Generate an image with the previously created LoRA applied.

    Args:
        lora_name: file name of a LoRA inside the local `loras/` directory.
        prompt / negative_prompt: text conditioning.
        width, height: output resolution in pixels.
        seed: RNG seed; replaced by a random one when randomize_seed is True.
        randomize_seed: draw a fresh seed in [0, MAX_SEED].
        guidance_scale: accepted for UI compatibility; currently not forwarded
            to the pipeline (distilled/turbo model uses its own default).
        num_inference_steps: denoising steps.

    Returns:
        (generated PIL image, seed actually used).
    """
    lora_path = f"loras/{lora_name}"
    pipe_imagen.clear_lora()
    pipe_imagen.load_lora(pipe_imagen.dit, lora_path)

    if randomize_seed:
        seed = random.randint(0, MAX_SEED)

    # BUG FIX: the generator was created but never passed to the pipeline,
    # so the returned seed did not actually reproduce the image.
    generator = torch.Generator().manual_seed(seed)

    result = pipe_imagen(
        prompt=prompt,
        negative_prompt=negative_prompt,
        num_inference_steps=num_inference_steps,
        width=width,
        height=height,
        generator=generator,
        # true_cfg_scale=guidance_scale,
        # guidance_scale=1.0 # Use a fixed default for distilled guidance
    )

    # Diffusers pipelines return an output object whose images live in
    # `.images`; fall back to the raw result for pipelines that return
    # the image directly. (NOTE(review): confirm against ZImagePipeline.)
    output_image = result.images[0] if hasattr(result, "images") else result

    # BUG FIX: removed the unreachable `return True` that followed the return.
    return output_image, seed
285
+
286
+
287
def read_file(path: str) -> str:
    """Read the file at *path* as UTF-8 text and return its full contents."""
    with open(path, 'r', encoding='utf-8') as handle:
        return handle.read()
291
+
292
# Page-level CSS for the Gradio UI: center the main column and cap its width,
# and center all h3 headings.
css = """
#col-container {
    margin: 0 auto;
    max-width: 960px;
}
h3{
    text-align: center;
    display:block;
}

"""
303
+
304
+
305
+
306
# Gallery examples shipped with the Space (list of example inputs for gr.Examples).
with open('examples/0_examples.json', 'r', encoding='utf-8') as file:
    examples = json.load(file)
308
# UI layout: image gallery -> LoRA generation on the left, LoRA download on the
# right, and a text-to-image section that unlocks once a LoRA exists.
# BUG FIX: `css` is a gr.Blocks argument — demo.launch() does not accept it.
with gr.Blocks(css=css) as demo:
    with gr.Column(elem_id="col-container"):
        with gr.Column():
            gr.HTML(read_file("static/header.html"))
        with gr.Row():
            with gr.Column():
                input_images = gr.Gallery(
                    label="Input images",
                    file_types=["image"],
                    show_label=False,
                    elem_id="gallery",
                    columns=2,
                    object_fit="cover",
                    height=300)

                lora_button = gr.Button("Generate LoRA", variant="primary")

            with gr.Column():
                lora_name = gr.Textbox(label="Generated LoRA path", lines=2, interactive=False)
                lora_download = gr.DownloadButton(label="Download LoRA", interactive=False)
        with gr.Column(elem_id='imagen-container') as imagen_container:
            gr.Markdown("### After your LoRA is ready, you can try generate image here.")
            with gr.Row():
                with gr.Column():
                    prompt = gr.Textbox(
                        label="Prompt",
                        show_label=False,
                        lines=2,
                        placeholder="Enter your prompt",
                        value="a man in a fishing boat.",
                        container=False,
                    )

                    # Disabled until generate_lora succeeds (see outputs below).
                    imagen_button = gr.Button("Generate Image", variant="primary", interactive=False)
                    with gr.Accordion("Advanced Settings", open=False):
                        negative_prompt = gr.Textbox(
                            label="Negative prompt",
                            lines=2,
                            container=False,
                            placeholder="Enter your negative prompt",
                            value="blurry ugly bad"
                        )
                        num_inference_steps = gr.Slider(
                            label="Steps",
                            minimum=1,
                            maximum=50,
                            step=1,
                            value=25,
                        )
                        with gr.Row():
                            width = gr.Slider(
                                label="Width",
                                minimum=512,
                                maximum=1280,
                                step=32,
                                value=768,
                            )

                            height = gr.Slider(
                                label="Height",
                                minimum=512,
                                maximum=1280,
                                step=32,
                                value=1024,
                            )
                        with gr.Row():
                            seed = gr.Slider(
                                label="Seed",
                                minimum=0,
                                maximum=MAX_SEED,
                                step=1,
                                value=42,
                            )
                            guidance_scale = gr.Slider(
                                label="Guidance scale",
                                minimum=0.0,
                                maximum=10.0,
                                step=0.1,
                                value=3.5,
                            )
                        randomize_seed = gr.Checkbox(label="Randomize seed", value=False)

                with gr.Column():
                    output_image = gr.Image(label="Generated image", show_label=False)

        gr.Examples(examples=examples, inputs=[input_images])
        gr.Markdown(read_file("static/footer.md"))

    lora_button.click(
        fn=generate_lora,
        inputs=[
            input_images
        ],
        outputs=[lora_name, lora_download, imagen_button],
    )
    imagen_button.click(
        fn=generate_image,
        inputs=[
            lora_name,
            prompt,
            negative_prompt,
            width,
            height,
            seed,
            randomize_seed,
            guidance_scale,
            num_inference_steps,
        ],
        outputs=[output_image, seed],
    )


if __name__ == "__main__":
    demo.launch(mcp_server=True)
requirements.txt ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ gradio
2
+ torch
3
+ transformers
4
+ accelerate
5
+ spaces
6
+ git+https://github.com/huggingface/diffusers.git
7
+ git+https://github.com/modelscope/DiffSynth-Studio.git
8
+ python-ulid
utils/image_utils.py ADDED
@@ -0,0 +1,76 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from PIL import Image
3
+ import numpy as np
4
+
5
def rescale_image(img, scale, nearest=32, max_size=1280):
    """
    Resize *img* by *scale*, capping both sides at *max_size* and snapping
    each side down to a multiple of *nearest*.

    Args:
        img: PIL image.
        scale: multiplicative scale factor.
        nearest: both output sides are rounded down to a multiple of this.
        max_size: maximum allowed width/height before snapping.

    Returns:
        (resized_image, new_width, new_height).
    """
    w, h = img.size
    new_w = int(w * scale)
    new_h = int(h * scale)

    if new_w > max_size or new_h > max_size:
        # Shrink further, keeping the aspect ratio, so both sides fit.
        fit = min(max_size / new_w, max_size / new_h)
        new_w = int(new_w * fit)
        new_h = int(new_h * fit)

    # BUG FIX: snap down to a multiple of `nearest`, but never below `nearest`
    # itself — tiny inputs previously rounded to a 0-sized (invalid) image.
    new_w = max((new_w // nearest) * nearest, nearest)
    new_h = max((new_h // nearest) * nearest, nearest)

    return img.resize((new_w, new_h), Image.LANCZOS), new_w, new_h
21
+
22
def padding_image(images, new_width, new_height):
    """
    Letterbox *images* onto a white (new_width, new_height) canvas.

    The input is resized to the largest size that fits inside the canvas while
    preserving its aspect ratio, then pasted centered on a white background.

    Args:
        images: a single PIL image (name kept for interface compatibility).
        new_width, new_height: target canvas size in pixels.

    Returns:
        A new RGB PIL image of size (new_width, new_height).
    """
    new_image = Image.new('RGB', (new_width, new_height), (255, 255, 255))

    # BUG FIX (cleanup): the original if/else on the canvas aspect contained
    # two byte-identical branches; the single fit computation below is the
    # same behavior without the duplication.
    aspect_ratio = images.width / images.height
    if aspect_ratio > new_width / new_height:
        # Image is wider than the canvas: width is the limiting side.
        new_img_width = new_width
        new_img_height = int(new_img_width / aspect_ratio)
    else:
        # Image is taller than the canvas: height is the limiting side.
        new_img_height = new_height
        new_img_width = int(new_img_height * aspect_ratio)

    resized_img = images.resize((new_img_width, new_img_height))

    # Center the resized image on the canvas.
    paste_x = (new_width - new_img_width) // 2
    paste_y = (new_height - new_img_height) // 2

    new_image.paste(resized_img, (paste_x, paste_y))

    return new_image
49
+
50
+
51
def get_image_latent(ref_image=None, sample_size=None, padding=False):
    """
    Convert a reference image into a normalized float tensor.

    Args:
        ref_image: a file path, a PIL image, or an array-like already holding
            pixel data; None is passed through unchanged.
        sample_size: (height, width) target size — note the (H, W) order; the
            resize call receives (W, H).
        padding: when True, letterbox via padding_image before resizing.

    Returns:
        A tensor of shape (1, C, 1, H, W) with values in [0, 1], or None when
        ref_image is None.
    """
    def _pil_to_tensor(pil_img):
        # Shared conversion for the path and PIL branches (previously duplicated).
        if padding:
            pil_img = padding_image(pil_img, sample_size[1], sample_size[0])
        pil_img = pil_img.resize((sample_size[1], sample_size[0]))
        tensor = torch.from_numpy(np.array(pil_img))
        # (H, W, C) -> (1, H, W, C) -> (C, 1, H, W) -> (1, C, 1, H, W), scaled to [0, 1].
        return tensor.unsqueeze(0).permute([3, 0, 1, 2]).unsqueeze(0) / 255

    if ref_image is not None:
        if isinstance(ref_image, str):
            ref_image = _pil_to_tensor(Image.open(ref_image).convert("RGB"))
        elif isinstance(ref_image, Image.Image):
            ref_image = _pil_to_tensor(ref_image.convert("RGB"))
        else:
            # Already array-like: convert directly, with no padding/resize
            # (matches the original behavior for this branch).
            tensor = torch.from_numpy(np.array(ref_image))
            ref_image = tensor.unsqueeze(0).permute([3, 0, 1, 2]).unsqueeze(0) / 255

    return ref_image
utils/prompt_utils.py ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from huggingface_hub import InferenceClient
3
+
4
+ # source: https://huggingface.co/spaces/InstantX/Qwen-Image-ControlNet/blob/main/app.py
5
# source: https://huggingface.co/spaces/InstantX/Qwen-Image-ControlNet/blob/main/app.py
def polish_prompt(original_prompt):
    """
    Rewrite *original_prompt* with a hosted LLM for better text-to-image results.

    Uses the Hugging Face InferenceClient (provider "nebius") with HF_TOKEN
    from the environment. Best-effort: on a missing token or any API error the
    original prompt is returned unchanged; an empty prompt yields a generic
    quality-boosting prompt.
    """
    magic_prompt = "Ultra HD, 4K, cinematic composition"
    system_prompt = """
    You are a prompt engineering expert for text-to-image models. Since text-to-image models have limited capabilities in understanding user prompts, you need to identify the core theme and intent of the user's input and improve the model's understanding accuracy and generation quality through optimization and rewriting. The rewrite must strictly retain all information from the user's original prompt without deleting or distorting any details.
    Specific requirements are as follows:
    1. The rewrite must not affect any information expressed in the user's original prompt; the rewritten prompt should use coherent natural language, avoid low-information redundant descriptions, and keep the rewritten prompt length as concise as possible.
    2. Ensure consistency between input and output languages: Chinese input yields Chinese output, and English input yields English output. The rewritten token count should not exceed 512.
    3. The rewritten description should further refine subject characteristics and aesthetic techniques appearing in the original prompt, such as lighting and textures.
    4. If the original prompt does not specify an image style, ensure the rewritten prompt uses a **realistic photography style**. If the user specifies a style, retain the user's style.
    5. When the original prompt requires reasoning to clarify user intent, use logical reasoning based on world knowledge to convert vague abstract descriptions into specific tangible objects (e.g., convert "the tallest animal" to "a giraffe").
    6. When the original prompt requires text generation, please use double quotes to enclose the text part (e.g., `"50% OFF"`).
    7. When the original prompt requires generating text-heavy scenes like webpages, logos, UIs, or posters, and no specific text content is specified, you need to infer appropriate text content and enclose it in double quotes. For example, if the user inputs: "A tourism flyer with a grassland theme," it should be rewritten as: "A tourism flyer with the image title 'Grassland'."
    8. When negative words exist in the original prompt, ensure the rewritten prompt does not contain negative words. For example, "a lakeside without boats" should be rewritten such that the word "boat" does not appear at all.
    9. Except for text content explicitly requested by the user, **adding any extra text content is prohibited**.
    Here are examples of rewrites for different types of prompts:
    # Examples (Few-Shot Learning)
    1. User Input: An animal with nine lives.
    Rewrite Output: A cat bathed in soft sunlight, its fur soft and glossy. The background is a comfortable home environment with light from the window filtering through curtains, creating a warm light and shadow effect. The shot uses a medium distance perspective to highlight the cat's leisurely and stretched posture. Light cleverly hits the cat's face, emphasizing its spirited eyes and delicate whiskers, adding depth and affinity to the image.
    2. User Input: Create an anime-style tourism flyer with a grassland theme.
    Rewrite Output: In the lower right of the center, a short-haired girl sits sideways on a gray, irregularly shaped rock. She wears a white short-sleeved dress and brown flat shoes, holding a bunch of small white flowers in her left hand, smiling with her legs hanging naturally. The girl has dark brown shoulder-length hair with bangs covering her forehead, brown eyes, and a slightly open mouth. The rock surface has textures of varying depths. To the girl's left and front is lush grass, with long, yellow-green blades, some glowing golden in the sunlight. The grass extends into the distance, forming rolling green hills that fade in color as they recede. The sky occupies the upper half of the picture, pale blue dotted with a few fluffy white clouds. In the upper left corner, there is a line of text in italic, dark green font reading "Explore Nature's Peace". Colors are dominated by green, blue, and yellow, fluid lines, and distinct light and shadow contrast, creating a quiet and comfortable atmosphere.
    3. User Input: A Christmas sale poster with a red background, promoting a Buy 1 Get 1 Free milk tea offer.
    Rewrite Output: The poster features an overall red tone, embellished with white snowflake patterns on the top and left side. The upper right features a bunch of holly leaves with red berries and a pine cone. In the upper center, golden 3D text reads "Christmas Heartwarming Feedback" centered, along with red bold text "Buy 1 Get 1". Below, two transparent cups filled with bubble tea are placed side by side; the tea is light brown with dark brown pearls scattered at the bottom and middle. Below the cups, white snow piles up, decorated with pine branches, red berries, and pine cones. A blurry Christmas tree is faintly visible in the lower right corner. The image has high clarity, accurate text content, a unified design style, a prominent Christmas theme, and a reasonable layout, providing strong visual appeal.
    4. User Input: A woman indoors shot in natural light, smiling with arms crossed, showing a relaxed and confident posture.
    Rewrite Output: The image features a young Asian woman with long dark brown hair naturally falling over her shoulders, with some strands illuminated by light, showing a soft sheen. Her features are delicate, with long eyebrows, bright and spirited dark brown eyes looking directly at the camera, revealing peace and confidence. She has a high nose bridge, full lips with nude lipstick, and corners of the mouth slightly raised in a faint smile. Her skin is fair, with cheeks and collarbones illuminated by warm light, showing a healthy ruddiness. She wears a black spaghetti strap tank top revealing graceful collarbone lines, and a thin gold necklace with small beads and metal bars glinting in the light. Her outer layer is a beige knitted cardigan, soft in texture with visible knitting patterns on the sleeves. Her arms are crossed over her chest, hands covered by the cardigan sleeves, in a relaxed posture. The background is a pure dark brown without extra decoration, making the figure the absolute focus. The figure is located in the center of the frame. Light enters from the upper right, creating bright spots on her left cheek, neck, and collarbone, while the right side is slightly shadowed, creating a three-dimensional and soft tone. Image details are clear, showcasing skin texture, hair, and clothing materials well. Colors are dominated by warm tones, with the combination of beige and dark brown creating a warm and comfortable atmosphere. The overall style is natural, elegant, and artistic.
    5. User Input: Create a series of images showing the growth process of an apple from seed to fruit. The series should include four stages: 1. Sowing, 2. Seedling growth, 3. Plant maturity, 4. Fruit harvesting.
    Rewrite Output: A 4-panel exquisite illustration depicting the growth process of an apple, capturing each stage precisely and clearly. 1. "Sowing": A close-up shot of a hand gently placing a small apple seed into fertile dark soil, with visible soil texture and the seed's smooth surface. The background is a soft-focus garden dotted with green leaves and sunlight filtering through. 2. "Seedling Growth": A young apple sapling breaks through the soil, stretching tender green leaves toward the sky. The scene is set in a vibrant garden illuminated by warm golden light, highlighting the seedling's delicate structure. 3. "Plant Maturity": A mature apple tree, lush with branches and leaves, covered in tender green foliage and developing small apples. The background is a vibrant orchard under a clear blue sky, with dappled sunlight creating a peaceful atmosphere. 4. "Fruit Harvesting": A hand reaches into the tree to pick a ripe red apple, its smooth skin glistening in the sun. The scene shows the abundance of the orchard, with baskets of apples in the background, giving a sense of fulfillment. Each illustration uses a realistic style, focusing on details and harmonious colors to showcase the natural beauty and development of the apple's life cycle.
    6. User Input: If 1 represents red, 2 represents green, 3 represents purple, and 4 represents yellow, please generate a four-color rainbow based on this rule. The color order from top to bottom is 3142.
    Rewrite Output: The image consists of four horizontally arranged colored stripes, ordered from top to bottom as purple, red, yellow, and green. A white number is centered on each stripe. The top purple stripe features the number "3", the red stripe below it has the number "1", the yellow stripe further down has the number "4", and the bottom green stripe has the number "2". All numbers use a sans-serif font in pure white, forming a sharp contrast with the background colors to ensure good readability. The stripes have high color saturation and a slight texture. The overall layout is simple and clear, with distinct visual effects and no extra decorative elements, emphasizing the numerical information. The image is high definition, with accurate colors and a consistent style, offering strong visual appeal.
    7. User Input: A stone tablet carved with "Guan Guan Ju Jiu, On the River Isle", natural light, background is a Chinese garden.
    Rewrite Output: An ancient stone tablet carved with "Guan Guan Ju Jiu, On the River Isle", the surface covered with traces of time, the writing clear and deep. Natural light falls from above, softly illuminating every detail of the stone tablet and enhancing its sense of history. The background is an elegant Chinese garden featuring lush bamboo forests, winding paths, and quiet pools, creating a serene and distant atmosphere. The overall picture uses a realistic style with rich details and natural light and shadow effects, highlighting the cultural heritage of the stone tablet and the classical beauty of the garden.
    # Output Format
    Please directly output the rewritten and optimized Prompt content. Do not include any explanatory language or JSON formatting, and do not add opening or closing quotes yourself.
    """

    api_key = os.environ.get("HF_TOKEN")
    if not api_key:
        print("Warning: HF_TOKEN is not set. Prompt enhancement is disabled.")
        return original_prompt

    if not original_prompt:
        # Nothing to rewrite: fall back to a generic quality-boosting prompt.
        return magic_prompt

    client = InferenceClient(provider="nebius", api_key=api_key)

    try:
        completion = client.chat.completions.create(
            model="Qwen/Qwen3-Coder-30B-A3B-Instruct",
            # NOTE(review): system prompt allows up to 512 tokens but only 256
            # are requested here — confirm whether truncation is intended.
            max_tokens=256,
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": original_prompt}
            ],
        )
        polished_prompt = completion.choices[0].message.content
        # Collapse newlines so the result is a single-line prompt.
        return polished_prompt.strip().replace("\n", " ")
    except Exception as e:
        # Best-effort: any API failure falls back to the unmodified prompt.
        print(f"Error during prompt enhancement: {e}")
        return original_prompt
+
utils/repo_utils.py ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os, subprocess
2
+
3
def clone_repo_if_not_exists(git_url, git_dir):
    """
    Clone a Git repository into ~/<git_dir>/<repo-name> if it isn't there yet.

    Args:
        git_url: URL of the repository (with or without a trailing ".git").
        git_dir: directory (joined under the user's home) to clone into.

    Returns:
        None. Clone failures are printed, not raised (best-effort behavior).
    """
    home_dir = os.path.expanduser("~")
    models_dir = os.path.join(home_dir, git_dir)

    # Extract repository name from the Git URL, dropping a ".git" suffix.
    git_name = git_url.split('/')[-1]
    if git_name.endswith(".git"):
        git_name = git_name[:-4]

    repo_path = os.path.join(models_dir, git_name)

    if not os.path.exists(models_dir):
        # BUG FIX: exist_ok avoids a crash if the directory appears between
        # the check above and this call (e.g. two workers starting at once).
        os.makedirs(models_dir, exist_ok=True)
        print(f"Created directory: {models_dir}")

    if not os.path.exists(repo_path):
        print(f"Repository '{git_name}' not found in '{models_dir}'. Cloning from {git_url}...")
        try:
            subprocess.run(["git", "clone", git_url, repo_path], check=True)
            print(f"Successfully cloned '{git_name}' to '{repo_path}'.")
        except subprocess.CalledProcessError as e:
            print(f"Error cloning repository: {e}")
        except FileNotFoundError:
            print("Error: 'git' command not found. Please ensure Git is installed and in your PATH.")
    else:
        print(f"Repository '{git_name}' already exists at '{repo_path}'. Skipping clone.")
+