""" Text-to-Image Generator + Evaluation Metrics Dataset : rhli/genarena | Model: runwayml/stable-diffusion-v1-5 Deploy on: Hugging Face Spaces (Gradio SDK) Evaluation metrics ────────────────── • CLIP Score – prompt-image alignment (higher = better; 0-100) Analogue of recall: did the image capture the prompt? • FID – Fréchet Inception Distance vs. a reference batch (lower = better; 0 = identical distributions) Analogue of precision: are generated images realistic? • Aesthetic Score – LAION aesthetic predictor (higher = better; 1-10) """ import torch import torch.nn as nn import gradio as gr import numpy as np import random from PIL import Image from diffusers import StableDiffusionPipeline, DPMSolverMultistepScheduler from datasets import load_dataset # ───────────────────────────────────────────────────────────────────────────── # 1. Device / dtype # ───────────────────────────────────────────────────────────────────────────── DEVICE = "cuda" if torch.cuda.is_available() else "cpu" DTYPE = torch.float16 if DEVICE == "cuda" else torch.float32 # ───────────────────────────────────────────────────────────────────────────── # 2. Generation pipeline # ───────────────────────────────────────────────────────────────────────────── MODEL_ID = "runwayml/stable-diffusion-v1-5" print(f"Loading generation model on {DEVICE} ...") pipe = StableDiffusionPipeline.from_pretrained( MODEL_ID, torch_dtype=DTYPE, safety_checker=None, requires_safety_checker=False, ) pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config) pipe = pipe.to(DEVICE) if DEVICE == "cuda": pipe.enable_attention_slicing() print("Generation model ready") # ───────────────────────────────────────────────────────────────────────────── # 3. 
# Evaluation models (lazy-loaded on first use to save startup time)
# ─────────────────────────────────────────────────────────────────────────────
_clip_model = None
_clip_processor = None
_aesthetic_model = None
# ViT-L/14 vision tower + processor that produce the embeddings consumed by
# the aesthetic predictor. Cached like the other evaluation models so they are
# not re-instantiated on every scoring call.
_aesthetic_clip_model = None
_aesthetic_clip_processor = None

_CLIP_SCORE_MODEL_ID = "openai/clip-vit-base-patch32"
_AESTHETIC_CLIP_MODEL_ID = "openai/clip-vit-large-patch14"


def _load_clip():
    """Return the cached ``(CLIPModel, CLIPProcessor)`` pair used for CLIP score.

    Both objects are created on the first call and reused afterwards. The
    globals are only assigned once *both* loads succeeded, so a failure while
    loading the processor cannot leave a model cached next to a ``None``
    processor.
    """
    global _clip_model, _clip_processor
    if _clip_model is None or _clip_processor is None:
        from transformers import CLIPModel, CLIPProcessor

        print("Loading CLIP ViT-B/32 ...")
        model = CLIPModel.from_pretrained(_CLIP_SCORE_MODEL_ID).to(DEVICE)
        processor = CLIPProcessor.from_pretrained(_CLIP_SCORE_MODEL_ID)
        model.eval()
        _clip_model, _clip_processor = model, processor
        print("CLIP ready")
    return _clip_model, _clip_processor


class _AestheticPredictor(nn.Module):
    """Small MLP trained on LAION human ratings — predicts aesthetic score from CLIP embeddings.

    The layer layout (sizes and order inside ``self.layers``) has to match the
    parameter names stored in the downloaded checkpoint, because the weights
    are applied with ``load_state_dict``.
    """

    def __init__(self, input_size: int = 768):
        super().__init__()
        self.layers = nn.Sequential(
            nn.Linear(input_size, 1024),
            nn.Dropout(0.2),
            nn.Linear(1024, 128),
            nn.Dropout(0.2),
            nn.Linear(128, 64),
            nn.Dropout(0.1),
            nn.Linear(64, 16),
            nn.Linear(16, 1),
        )

    def forward(self, x):
        """Map embeddings ``x`` (last dim = ``input_size``) to one score each."""
        return self.layers(x)


def _load_aesthetic():
    """Return the cached aesthetic predictor, downloading its weights on first use.

    The module-level cache is only populated after the checkpoint was loaded
    successfully. Otherwise a failed download/deserialisation would leave a
    randomly initialised network cached, and later calls would silently return
    meaningless scores.

    Raises:
        Exception: whatever ``hf_hub_download``, ``torch.load`` or
            ``load_state_dict`` raise; nothing is cached in that case.
    """
    global _aesthetic_model
    if _aesthetic_model is None:
        from huggingface_hub import hf_hub_download

        print("Loading aesthetic predictor ...")
        weights_path = hf_hub_download(
            "camenduru/improved-aesthetic-predictor",
            filename="sac+logos+ava1-l14-linearMSE.pth",
        )
        model = _AestheticPredictor(input_size=768)
        # SECURITY NOTE(review): weights_only=False lets torch.load unpickle
        # arbitrary objects from a file downloaded at runtime. It was set
        # because this is a legacy .pth checkpoint (PyTorch 2.x changed the
        # default); verify whether weights_only=True works for this file and
        # switch to it if so.
        state = torch.load(weights_path, map_location="cpu", weights_only=False)
        model.load_state_dict(state)
        model.eval().to(DEVICE)
        _aesthetic_model = model
        print("Aesthetic predictor ready")
    return _aesthetic_model


def _load_aesthetic_clip():
    """Return the cached CLIP ViT-L/14 ``(vision model, processor)`` pair.

    ``CLIPVisionModelWithProjection`` is used (not ``CLIPModel``) so the
    forward pass exposes ``image_embeds`` directly.
    """
    global _aesthetic_clip_model, _aesthetic_clip_processor
    if _aesthetic_clip_model is None or _aesthetic_clip_processor is None:
        from transformers import CLIPProcessor, CLIPVisionModelWithProjection

        print("Loading CLIP ViT-L/14 ...")
        model = CLIPVisionModelWithProjection.from_pretrained(
            _AESTHETIC_CLIP_MODEL_ID
        ).to(DEVICE)
        model.eval()
        processor = CLIPProcessor.from_pretrained(_AESTHETIC_CLIP_MODEL_ID)
        _aesthetic_clip_model, _aesthetic_clip_processor = model, processor
        print("CLIP ViT-L/14 ready")
    return _aesthetic_clip_model, _aesthetic_clip_processor


# ─────────────────────────────────────────────────────────────────────────────
# 4. Metric helpers
# ─────────────────────────────────────────────────────────────────────────────
def compute_clip_score(image: Image.Image, prompt: str) -> float:
    """Return the CLIP score of ``image`` for ``prompt`` in [0, 100].

    The score is the cosine similarity between the CLIP image and text
    embeddings, multiplied by 100, clipped to [0, 100] and rounded to two
    decimals. Higher = better prompt alignment — recall analogue.

    Args:
        image: The generated image.
        prompt: The text prompt; tokenised with ``truncation=True`` and
            ``max_length=77`` so over-long prompts are cut instead of
            overflowing CLIP's 77-token limit.

    Returns:
        The rounded score as a Python float.
    """
    model, processor = _load_clip()
    inputs = processor(
        text=[prompt],
        images=image,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=77,
    )
    # Move each tensor individually — BatchEncoding.to() is unreliable across versions
    inputs = {k: v.to(DEVICE) for k, v in inputs.items()}
    with torch.no_grad():
        out = model(**inputs)
    img_emb = out.image_embeds
    txt_emb = out.text_embeds
    # L2-normalise so the dot product below is a cosine similarity.
    img_emb = img_emb / img_emb.norm(dim=-1, keepdim=True)
    txt_emb = txt_emb / txt_emb.norm(dim=-1, keepdim=True)
    sim = (img_emb * txt_emb).sum().item()
    return round(float(np.clip(sim * 100, 0, 100)), 2)


def compute_aesthetic_score(image: Image.Image) -> float:
    """Return the LAION aesthetic score of ``image`` in [1, 10].

    Higher = more visually pleasing. The image is embedded with the cached
    CLIP ViT-L/14 vision model, the embedding is L2-normalised and passed
    through the aesthetic MLP; the result is clipped to [1, 10] and rounded to
    two decimals.

    This is a best-effort metric: any failure (model download, inference, ...)
    is printed and reported through the sentinel return value instead of
    being raised.

    Args:
        image: The image to score.

    Returns:
        The score, or ``-1.0`` if it could not be computed.
    """
    try:
        clip_v, proc_v = _load_aesthetic_clip()
        aes = _load_aesthetic()
        pixel_values = proc_v(images=image, return_tensors="pt")["pixel_values"].to(DEVICE)
        with torch.no_grad():
            emb = clip_v(pixel_values=pixel_values).image_embeds
            emb = emb / emb.norm(dim=-1, keepdim=True)
            score = aes(emb).item()
        return round(float(np.clip(score, 1, 10)), 2)
    except Exception as e:
        print(f"Aesthetic score skipped: {e}")
        return -1.0


# ─────────────────────────────────────────────────────────────────────────────
# 5.
# Dataset prompts
# ─────────────────────────────────────────────────────────────────────────────
PROMPT_COLUMN = "prompt"

# Used whenever the dataset cannot be loaded or yields no usable prompt.
_FALLBACK_PROMPTS = [
    "a futuristic city at sunset",
    "a cozy cottage in a misty forest",
    "a robot painting a watercolor",
    "an astronaut on a purple alien planet",
]

try:
    _ds = load_dataset("rhli/genarena", split="train")
    # Only the first 200 rows are sampled, hence the bounded index loop.
    _candidates = [_ds[i][PROMPT_COLUMN] for i in range(min(200, len(_ds)))]
    # Drop missing / blank entries so random_prompt() never returns them.
    DATASET_PROMPTS = [p for p in _candidates if isinstance(p, str) and p.strip()]
    print(f"Loaded {len(DATASET_PROMPTS)} prompts from rhli/genarena")
except Exception as e:
    # Best effort: the app must start even without the dataset.
    print(f"Dataset load failed: {e}")
    DATASET_PROMPTS = []

if not DATASET_PROMPTS:
    # random.choice() raises IndexError on an empty list.
    DATASET_PROMPTS = list(_FALLBACK_PROMPTS)


# ─────────────────────────────────────────────────────────────────────────────
# 6. Core inference helpers wired to Gradio callbacks
# ─────────────────────────────────────────────────────────────────────────────
def _run_pipe(prompt, negative_prompt, num_steps, guidance_scale, seed):
    """Run the module-level ``pipe`` once and return the first generated image.

    Args:
        prompt: Text prompt.
        negative_prompt: Negative prompt; an empty value is passed as ``None``.
        num_steps: Number of inference steps (coerced with ``int``).
        guidance_scale: Guidance scale (coerced with ``float``).
        seed: Seed for the ``torch.Generator`` created on ``DEVICE``
            (coerced with ``int``).

    Returns:
        ``result.images[0]`` of the pipeline call (512x512 requested).
    """
    generator = torch.Generator(DEVICE).manual_seed(int(seed))
    # torch.amp.autocast is the stable API across PyTorch versions
    if DEVICE == "cuda":
        ctx = torch.amp.autocast(device_type="cuda")
    else:
        ctx = torch.no_grad()
    with ctx:
        result = pipe(
            prompt,
            negative_prompt=negative_prompt or None,
            num_inference_steps=int(num_steps),
            guidance_scale=float(guidance_scale),
            generator=generator,
            height=512,
            width=512,
        )
    return result.images[0]


def generate_image(prompt, negative_prompt, num_steps, guidance_scale, seed):
    """Gradio callback: generate one image.

    Returns:
        ``(image, status)``. ``image`` is ``None`` when the prompt is missing
        or blank, or when generation raised; ``status`` then carries the
        reason.
    """
    # `not prompt` also covers None, which has no .strip().
    if not prompt or not prompt.strip():
        return None, "Please enter a prompt."
    try:
        image = _run_pipe(prompt, negative_prompt, num_steps, guidance_scale, seed)
        return image, f"Generated with seed {int(seed)}"
    except Exception as e:
        # UI boundary: report the failure in the status field instead of raising.
        return None, f"Error: {e}"


def _format_eval_report(clip, aes, aesthetic_requested):
    """Build the Markdown results table shown below the evaluated image.

    Args:
        clip: CLIP score (0-100).
        aes: Aesthetic score; a value <= 0 means "no score available".
        aesthetic_requested: Whether the user asked for the aesthetic score.
            Distinguishes "not requested" from "requested but failed".
    """
    clip_status = "Good" if clip >= 25 else "Moderate" if clip >= 15 else "Low"
    rows = [
        "### Evaluation Results",
        "",
        "| Metric | Value | Status |",
        "|--------|-------|--------|",
        f"| **CLIP Score** (0-100, recall analogue) | `{clip:.1f}` | {clip_status} |",
    ]
    if aes > 0:
        aes_status = "Good" if aes >= 5 else "Moderate" if aes >= 3 else "Low"
        rows.append(f"| **Aesthetic Score** (1-10) | `{aes:.2f}` | {aes_status} |")
    elif aesthetic_requested:
        # The checkbox was ticked but scoring failed; telling the user to
        # "enable the checkbox" here would be misleading.
        rows.append("| **Aesthetic Score** | `unavailable` | computation failed, see server log |")
    else:
        rows.append("| **Aesthetic Score** | `skipped` | enable checkbox to compute |")
    rows += [
        "",
        "**CLIP Score** — how well the image matches the prompt (recall analogue).",
        "**Aesthetic Score** — perceived visual quality via LAION predictor.",
    ]
    return "\n".join(rows)


def evaluate_single(prompt, negative_prompt, num_steps, guidance_scale, seed, run_aesthetic):
    """Generate one image and compute CLIP score + optionally aesthetic score.

    Returns:
        ``(image, clip_score, aesthetic_score, markdown_report)``. The
        aesthetic score is ``0.0`` when it was not requested or could not be
        computed. On a missing/blank prompt or on any error the tuple is
        ``(None, 0.0, 0.0, message)``.
    """
    if not prompt or not prompt.strip():
        return None, 0.0, 0.0, "Please enter a prompt."
    try:
        image = _run_pipe(prompt, negative_prompt, num_steps, guidance_scale, seed)
        clip = compute_clip_score(image, prompt)
        # -1.0 is the "no score" sentinel, also returned by
        # compute_aesthetic_score() when scoring fails.
        aes = compute_aesthetic_score(image) if run_aesthetic else -1.0
        report = _format_eval_report(clip, aes, bool(run_aesthetic))
        return image, clip, aes if aes > 0 else 0.0, report
    except Exception as e:
        # UI boundary: surface the failure in the report area instead of raising.
        return None, 0.0, 0.0, f"Error: {e}"


def random_prompt():
    """Return a random prompt from ``DATASET_PROMPTS``."""
    return random.choice(DATASET_PROMPTS)


def random_seed():
    """Return a random seed in ``[0, 2**31 - 1]``."""
    return random.randint(0, 2**31 - 1)


# ─────────────────────────────────────────────────────────────────────────────
# 7.
# Gradio UI
# ─────────────────────────────────────────────────────────────────────────────
# NOTE(review): the widget nesting below was reconstructed from a
# whitespace-collapsed source (which widgets sit inside the accordions / rows)
# — confirm against the intended layout.

# Kept at column 0 with real line breaks: the Markdown table, headings and
# bullet lists only render when each of them starts on its own line.
_METRIC_GUIDE_MD = """
## Evaluation Metrics

| Metric | Range | Better when | Analogue | Method |
|--------|-------|-------------|----------|--------|
| CLIP Score | 0 – 100 | Higher | **Recall** | Cosine sim of CLIP image & text embeddings |
| Aesthetic Score | 1 – 10 | Higher | Quality | LAION linear head on CLIP ViT-L/14 features |

---

### CLIP Score — Recall analogue
- **What it measures:** Did the image capture the content described in the prompt?
- **How:** CLIP encodes the image and text into a shared embedding space; cosine similarity is computed and scaled to 0-100.
- **Threshold:** ≥ 25 is generally good alignment for SD v1.5.
- **Limit:** CLIP can miss subtle semantic errors and spatial relationships. Prompts are truncated to 77 tokens.

### Aesthetic Score
- **What it measures:** Perceived visual quality, independent of the prompt.
- **How:** A small MLP trained on human LAION ratings predicts a score from CLIP ViT-L/14 embeddings.
- **Threshold:** ≥ 5.0 is considered aesthetically pleasing.
"""

with gr.Blocks(title="Text-to-Image Generator", theme=gr.themes.Soft()) as demo:
    gr.Markdown(
        "# Text-to-Image Generator\n"
        "Stable Diffusion v1.5 · Dataset: "
        "[rhli/genarena](https://huggingface.co/datasets/rhli/genarena)"
    )

    with gr.Tabs():

        # ── Tab 1: Generate ──────────────────────────────────────────────────
        with gr.TabItem("Generate"):
            with gr.Row():
                # Left column: inputs.
                with gr.Column(scale=1):
                    prompt_box = gr.Textbox(
                        label="Prompt",
                        lines=3,
                        placeholder="Describe the image you want...",
                    )
                    surprise_btn = gr.Button(
                        "Surprise me (dataset prompt)",
                        variant="secondary",
                        size="sm",
                    )
                    neg_box = gr.Textbox(
                        label="Negative prompt (optional)",
                        value="blurry, low quality, ugly, distorted",
                        lines=2,
                    )
                    with gr.Accordion("Advanced settings", open=False):
                        steps_sl = gr.Slider(10, 50, 20, step=1, label="Inference steps")
                        guide_sl = gr.Slider(1.0, 20.0, 7.5, step=0.5, label="Guidance scale")
                        with gr.Row():
                            seed_box = gr.Number(label="Seed", value=42, precision=0)
                            rand_seed_btn = gr.Button("Random seed", size="sm")
                    gen_btn = gr.Button("Generate", variant="primary")

                # Right column: outputs.
                with gr.Column(scale=1):
                    gen_image = gr.Image(label="Generated image", type="pil")
                    gen_status = gr.Markdown("")

            gen_btn.click(
                generate_image,
                inputs=[prompt_box, neg_box, steps_sl, guide_sl, seed_box],
                outputs=[gen_image, gen_status],
            )
            surprise_btn.click(random_prompt, outputs=prompt_box)
            rand_seed_btn.click(random_seed, outputs=seed_box)

            # Each example row is (prompt, negative prompt, steps, guidance, seed),
            # matching the order of `inputs`.
            gr.Examples(
                examples=[
                    ["a golden sunset over a calm ocean, photorealistic", "blurry, low quality", 20, 7.5, 42],
                    ["a watercolor painting of a Japanese cherry blossom garden", "", 25, 8.0, 7],
                    ["a futuristic robot chef in a neon-lit kitchen", "low quality", 20, 7.5, 99],
                    ["an ancient library filled with glowing magical books", "", 20, 9.0, 12],
                ],
                inputs=[prompt_box, neg_box, steps_sl, guide_sl, seed_box],
                outputs=[gen_image, gen_status],
                fn=generate_image,
                cache_examples=False,
            )

        # ── Tab 2: Single-image evaluation ───────────────────────────────────
        with gr.TabItem("Evaluate Single Image"):
            gr.Markdown(
                "Generate one image and measure:\n"
                "- **CLIP Score** (0-100) — prompt alignment. *Recall analogue.*\n"
                "- **Aesthetic Score** (1-10) — visual quality. *(adds ~30 s, loads an extra model)*"
            )
            with gr.Row():
                # Left column: inputs.
                with gr.Column(scale=1):
                    eval_prompt = gr.Textbox(
                        label="Prompt",
                        lines=3,
                        placeholder="Enter your prompt...",
                    )
                    eval_neg = gr.Textbox(
                        label="Negative prompt",
                        value="blurry, low quality, ugly, distorted",
                        lines=2,
                    )
                    with gr.Accordion("Settings", open=False):
                        eval_steps = gr.Slider(10, 50, 20, step=1, label="Inference steps")
                        eval_guide = gr.Slider(1.0, 20.0, 7.5, step=0.5, label="Guidance scale")
                        with gr.Row():
                            eval_seed = gr.Number(label="Seed", value=42, precision=0)
                            eval_rand_btn = gr.Button("Random seed", size="sm")
                    eval_aes_chk = gr.Checkbox(label="Compute Aesthetic Score (slower)", value=False)
                    eval_btn = gr.Button("Generate + Evaluate", variant="primary")

                # Right column: image, numeric scores and the Markdown report.
                with gr.Column(scale=1):
                    eval_image = gr.Image(label="Generated image", type="pil")
                    clip_num = gr.Number(label="CLIP Score (0-100)", precision=2)
                    aes_num = gr.Number(label="Aesthetic Score (1-10)", precision=2)
                    eval_md = gr.Markdown("")

            eval_btn.click(
                evaluate_single,
                inputs=[eval_prompt, eval_neg, eval_steps, eval_guide, eval_seed, eval_aes_chk],
                outputs=[eval_image, clip_num, aes_num, eval_md],
            )
            eval_rand_btn.click(random_seed, outputs=eval_seed)

        # ── Tab 3: Metric guide ──────────────────────────────────────────────
        with gr.TabItem("Metric Guide"):
            gr.Markdown(_METRIC_GUIDE_MD)

# ─────────────────────────────────────────────────────────────────────────────
# 8. Launch
# ─────────────────────────────────────────────────────────────────────────────
if __name__ == "__main__":
    # Enable the request queue explicitly: the callbacks here (image
    # generation, metric evaluation) are long-running.
    demo.queue().launch()