Spaces:

Chyd19
/

Chyd-Text-Image

Sleeping

App Files Files Community

Chyd19 committed on Dec 2, 2025

Commit

afb1906

verified ·

1 Parent(s): 936d869

my app.py

Browse files

Files changed (1) hide show

app.py +373 -0

app.py ADDED Viewed

	@@ -0,0 +1,373 @@

+# Section 1
+# ==============================
+# SECTION 1
+# ==============================
+# Libraries
+import torch
+import gradio as gr
+from PIL import Image
+from diffusers import DiffusionPipeline
+from transformers import pipeline, BlipProcessor, BlipForQuestionAnswering
+import lpips
+import clip
+from bert_score import score
+import torchvision.transforms as T
+device = "cuda" if torch.cuda.is_available() else "cpu"
+def free_gpu_cache():
+    if device == "cuda":
+        torch.cuda.empty_cache()
+# ==============================
+# MODELS
+# ==============================
+# Text-to-image pipelines. Half precision is used only when CUDA is available;
+# on CPU the weights are loaded as float32.
+gen_pipe = DiffusionPipeline.from_pretrained(
+    "stabilityai/sdxl-turbo",
+    torch_dtype=torch.float16 if device=="cuda" else torch.float32
+).to(device)
+dreamshaper_pipe = DiffusionPipeline.from_pretrained(
+    "Lykon/dreamshaper-7",
+    torch_dtype=torch.float16 if device=="cuda" else torch.float32
+).to(device)
+# Image captioner used for both the uploaded reference image and the generated images.
+# NOTE(review): "temperature" is passed together with beam search and without
+# do_sample — it presumably has no effect; confirm against the transformers
+# generation documentation.
+captioner = pipeline(
+    "image-to-text",
+    model="Salesforce/blip-image-captioning-large",
+    device=0 if device=="cuda" else -1,
+    generate_kwargs={"max_new_tokens":256, "num_beams":5, "temperature":0.7}
+)
+# Text pipelines applied to captions: sentiment, named entities, zero-shot topics.
+sentiment_model = pipeline("sentiment-analysis", model="distilbert-base-uncased-finetuned-sst-2-english",
+                           device=0 if device=="cuda" else -1)
+ner_model = pipeline("ner", model="dbmdz/bert-large-cased-finetuned-conll03-english",
+                     aggregation_strategy="simple", device=0 if device=="cuda" else -1)
+topic_model = pipeline("zero-shot-classification", model="facebook/bart-large-mnli",
+                       device=0 if device=="cuda" else -1)
+# The VQA model is placed on the CPU regardless of `device`.
+vqa_processor = BlipProcessor.from_pretrained("Salesforce/blip-vqa-base")
+vqa_model = BlipForQuestionAnswering.from_pretrained("Salesforce/blip-vqa-base").to("cpu")
+# Image-similarity models; both are loaded onto `device`.
+clip_model, clip_preprocess = clip.load("ViT-B/32", device=device)
+lpips_model = lpips.LPIPS(net='alex').to(device)
+# LPIPS input preparation: convert to a tensor, then resize to 256x256.
+lpips_transform = T.Compose([T.ToTensor(), T.Resize((256,256))])
+# Style name -> prompt suffix appended to the prompt by the generation functions.
+style_map = {
+    "Photorealistic": "photorealistic, ultra-detailed, 8k, cinematic lighting",
+    "Real Life": "natural lighting, true-to-life colors, DSLR",
+    "Documentary": "documentary handheld muted colors",
+    "iPhone Camera": "iPhone photo natural HDR",
+    "Street Photography": "candid street ambient shadows",
+    "Cinematic": "cinematic lighting dramatic depth",
+    "Anime": "anime cel shaded vibrant",
+    "Watercolor": "watercolor soft wash art",
+    "Macro": "macro lens shallow DOF",
+    "Cyberpunk": "neon cyberpunk futuristic",
+}
+# Section 2
+# ==============================
+# SECTION 2 — FUNCTIONS
+# ==============================
+def generate_image_with_enhancer(base_caption, enhancer, negative, seed, style, images):
+    images = images or []
+    base_caption = base_caption or ""
+    enhancer = enhancer or ""
+    final_prompt = f"{base_caption}, {enhancer}".strip(", ")
+    final_prompt = f"{final_prompt}, {style_map.get(style,'')}".strip(", ")
+    try:
+        seed = int(seed)
+    except:
+        seed = 42
+    generator = torch.Generator(device="cpu").manual_seed(seed)
+    try:
+        with torch.no_grad():
+            out = gen_pipe(prompt=final_prompt, negative_prompt=negative, generator=generator)
+        img = out.images[0]
+    except Exception as e:
+        print("SD Turbo failed:", e)
+        img = None
+    if img:
+        images.append(img)
+    free_gpu_cache()
+    return img, images
+def generate_dreamshaper_with_enhancer(base_caption, enhancer, negative, seed, style, images):
+    images = images or []
+    base_caption = base_caption or ""
+    enhancer = enhancer or ""
+    final_prompt = f"{base_caption}, {enhancer}".strip(", ")
+    final_prompt = f"{final_prompt}, {style_map.get(style,'')}".strip(", ")
+    try:
+        seed = int(seed)
+    except:
+        seed = 42
+    generator = torch.Generator(device="cpu").manual_seed(seed)
+    try:
+        with torch.no_grad():
+            out = dreamshaper_pipe(prompt=final_prompt, negative_prompt=negative, generator=generator)
+        img = out.images[0]
+    except Exception as e:
+        print("DreamShaper failed:", e)
+        img = None
+    if img:
+        images.append(img)
+    free_gpu_cache()
+    return img, images
+def caption_for_image(img):
+    try:
+        out = captioner(img)
+        return out[0]["generated_text"]
+    except:
+        return "Caption failed."
+def answer_vqa(question, image):
+    if not image or not question.strip():
+        return "Provide image + question."
+    try:
+        inputs_raw = vqa_processor(images=image, text=question, return_tensors="pt")
+        inputs = {k:v.to("cpu") for k,v in inputs_raw.items()}
+        with torch.no_grad():
+            out = vqa_model(**inputs)
+        ans_id = out.logits.argmax(-1)
+        return vqa_processor.decode(ans_id[0], skip_special_tokens=True)
+    except:
+        return "VQA failed."
+def compute_metrics(images, captions, i1, i2):
+    img1 = images[i1]
+    img2 = images[i2]
+    cap1 = captions[i1]
+    cap2 = captions[i2]
+    # CLIP
+    t1 = clip_preprocess(img1).unsqueeze(0).to("cpu")
+    t2 = clip_preprocess(img2).unsqueeze(0).to("cpu")
+    with torch.no_grad():
+        f1 = clip_model.encode_image(t1)
+        f2 = clip_model.encode_image(t2)
+        clip_sim = float(torch.cosine_similarity(f1, f2))
+    # LPIPS
+    L1 = (lpips_transform(img1).unsqueeze(0)*2 - 1)
+    L2 = (lpips_transform(img2).unsqueeze(0)*2 - 1)
+    with torch.no_grad():
+        lp = float(lpips_model(L1, L2))
+    # BERTScore
+    if cap1 and cap2:
+        _, _, F = score([cap1],[cap2], lang="en", verbose=False)
+        bert_f1 = float(F.mean())
+    else:
+        bert_f1 = 0.0
+    return clip_sim, lp, bert_f1
+# Section 3
+# ---------------- Build Gradio UI with Custom Look ----------------
+def build_ui_with_custom_ui():
+    """Build and return the Gradio Blocks app.
+
+    The layout has five sections: reference-image upload with captioning,
+    image generation with SD-Turbo and DreamShaper, pairwise metrics,
+    NLP analysis of the captions, and visual question answering. The event
+    handlers are defined as nested functions and wired to their buttons here.
+    """
+    with gr.Blocks(title="Multimodal AI Image Studio") as demo:
+        # ---------------- CSS Styling ----------------
+        gr.HTML("""
+        <style>
+        .heading-orange h2, .heading-orange h3 { color: #ff5500 !important; }
+        .orange-btn button { background-color: #ff5500 !important; color: white !important; border-radius: 6px !important; height: 36px !important; font-weight: bold; }
+        .teal-btn button { background-color: #008080 !important; color: white !important; border-radius: 6px !important; height: 40px !important; font-weight: bold; }
+        /* Horizontal thin spinner */
+        .loading-line {
+            height: 4px;
+            background: linear-gradient(90deg, #008080 0%, #00cccc 50%, #008080 100%);
+            background-size: 200% 100%;
+            animation: loading 1s linear infinite;
+        }
+        @keyframes loading {
+            0% { background-position: 200% 0; }
+            100% { background-position: -200% 0; }
+        }
+        /* Match enhancer box to upload button */
+        .enhancer-box textarea {
+            width: 100% !important;
+            height: 36px !important;
+            box-sizing: border-box;
+            font-size: 14px;
+        }
+        /* Equal-height styling for Step-1 columns */
+        .equal-height-row {
+            display: flex;
+            align-items: stretch;
+        }
+        .equal-height-row > .gr-column {
+            display: flex;
+            flex-direction: column;
+        }
+        </style>
+        """)
+        # ---------------- Heading ----------------
+        gr.Markdown("## Multimodal AI Image Studio: An Integrated Comparative Perspective", elem_classes="heading-orange")
+        # ---------------- States ----------------
+        images_state = gr.State([])
+        captions_state = gr.State([])
+        # ---------------- Step 1: Upload Reference Image ----------------
+        gr.Markdown("### Upload Reference Image", elem_classes="heading-orange")
+        # ✅ APPLY equal-height class here
+        with gr.Row(elem_classes="equal-height-row"):
+            with gr.Column(scale=1):
+                upload_input = gr.Image(label="Drag & Drop Image", type="pil")
+                upload_btn = gr.Button("Upload Image & Generate Caption", elem_classes="orange-btn")
+            with gr.Column(scale=1):
+                upload_preview = gr.Image(label="Uploaded Image", interactive=False)
+                enhancer_box = gr.Textbox(
+                    label="Add Prompt Enhancer (Optional)",
+                    placeholder="Example: 'at night with neon lights', 'wearing a red jacket', etc.",
+                    elem_classes="enhancer-box"
+                )
+                caption_out = gr.Markdown(label="Generated Caption")
+        # Robust captioning
+        def upload_and_generate_caption_ui(img, images_state, captions_state):
+            if img is None:
+                return None, "No image uploaded.", [], []
+            images = [img]
+            try:
+                output = captioner(img)
+                caption = output[0]["generated_text"] if len(output) > 0 and "generated_text" in output[0] else "Caption failed."
+            except Exception as e:
+                print("Captioning error:", e)
+                caption = "Caption failed."
+            captions = [caption]
+            return img, caption, images, captions
+        upload_btn.click(
+            upload_and_generate_caption_ui,
+            inputs=[upload_input, images_state, captions_state],
+            outputs=[upload_preview, caption_out, images_state, captions_state]
+        )
+        # ---------------- Step 2: Generate SD-Turbo & DreamShaper ----------------
+        gr.Markdown("### Generate Images from Caption", elem_classes="heading-orange")
+        with gr.Row():
+            with gr.Column(scale=1, min_width=300):
+                sd_btn = gr.Button("Generate SD-Turbo Image", elem_classes="orange-btn")
+                sd_preview = gr.Image(label="SD-Turbo Image", interactive=False)
+            with gr.Column(scale=1, min_width=300):
+                ds_btn = gr.Button("Generate DreamShaper Image", elem_classes="orange-btn")
+                ds_preview = gr.Image(label="DreamShaper Image", interactive=False)
+        def generate_sd_from_caption_ui(caption, enhancer, images_state, captions_state):
+            final_prompt = f"{caption}, {enhancer}".strip(", ")
+            img, images = generate_image_with_enhancer(final_prompt, enhancer="", negative="", seed=42, style="Photorealistic", images=images_state)
+            try:
+                generated_caption = captioner(img)[0]["generated_text"]
+            except:
+                generated_caption = "Caption failed."
+            captions_state[1:2] = [generated_caption]
+            return img, images, captions_state
+        def generate_ds_from_caption_ui(caption, enhancer, images_state, captions_state):
+            final_prompt = f"{caption}, {enhancer}".strip(", ")
+            img, images = generate_dreamshaper_with_enhancer(final_prompt, enhancer="", negative="", seed=123, style="Photorealistic", images=images_state)
+            try:
+                generated_caption = captioner(img)[0]["generated_text"]
+            except:
+                generated_caption = "Caption failed."
+            captions_state[2:3] = [generated_caption]
+            return img, images, captions_state
+        sd_btn.click(generate_sd_from_caption_ui, inputs=[caption_out, enhancer_box, images_state, captions_state],
+                     outputs=[sd_preview, images_state, captions_state])
+        ds_btn.click(generate_ds_from_caption_ui, inputs=[caption_out, enhancer_box, images_state, captions_state],
+                     outputs=[ds_preview, images_state, captions_state])
+        # ---------------- Step 3: Compute Pairwise Metrics ----------------
+        gr.Markdown("### Compute Pairwise Metrics", elem_classes="heading-orange")
+        metrics_btn = gr.Button("Compute Metrics for All Pairs", elem_classes="teal-btn")
+        with gr.Row():
+            metrics_A = gr.Markdown()
+            metrics_B = gr.Markdown()
+            metrics_C = gr.Markdown()
+        def compute_metrics_all_pairs_ui(images, captions):
+            yield ("<div class='loading-line'></div>", "<div class='loading-line'></div>", "<div class='loading-line'></div>")
+            if len(images) < 3:
+                msg = "All three images and captions are required to compute metrics."
+                yield msg, msg, msg
+            else:
+                A = compute_metrics(images, captions, 0, 1)
+                B = compute_metrics(images, captions, 0, 2)
+                C = compute_metrics(images, captions, 1, 2)
+                yield (f"**Reference ↔ SD-Turbo**\n{A}",
+                       f"**Reference ↔ DreamShaper**\n{B}",
+                       f"**SD-Turbo ↔ DreamShaper**\n{C}")
+        metrics_btn.click(compute_metrics_all_pairs_ui, inputs=[images_state, captions_state],
+                          outputs=[metrics_A, metrics_B, metrics_C])
+        # ---------------- Step 4: NLP Analysis ----------------
+        gr.Markdown("### NLP Analysis of Captions", elem_classes="heading-orange")
+        nlp_btn = gr.Button("Analyze Captions", elem_classes="teal-btn")
+        nlp_out = gr.HTML()
+        def analyze_caption_pipeline_ui(captions):
+            yield "<div class='loading-line'></div>"
+            if len(captions) < 3:
+                yield "<b>All three captions are required for NLP analysis.</b>"
+            else:
+                labels = ["Reference Image", "SD-Turbo", "DreamShaper"]
+                blocks = []
+                for label, caption in zip(labels, captions):
+                    sentiment = "<br>".join([f"{s['label']}: {s['score']:.2f}" for s in sentiment_model(caption)])
+                    ents = "<br>".join([f"{e['entity_group']}: {e['word']}" for e in ner_model(caption)]) or "None"
+                    topics_data = topic_model(caption, candidate_labels=['people','animals','objects','food','nature'])
+                    topics = "<br>".join([f"{l}: {sc:.2f}" for l, sc in zip(topics_data['labels'], topics_data['scores'])])
+                    block = f"<div style='flex:1;padding:10px;min-width:250px;'><h3><u>{label}</u></h3><b>Sentiment</b><br>{sentiment}<br><br><b>Entities</b><br>{ents}<br><br><b>Topics</b><br>{topics}</div>"
+                    blocks.append(block)
+                yield f"<div style='display:flex; gap:20px; justify-content:space-between;'>{''.join(blocks)}</div>"
+        nlp_btn.click(analyze_caption_pipeline_ui, inputs=[captions_state], outputs=[nlp_out])
+        # ---------------- Step 5: Visual Question Answering ----------------
+        gr.Markdown("### Visual Question Answering (VQA)", elem_classes="heading-orange")
+        with gr.Row():
+            with gr.Column(scale=1):
+                vqa_input = gr.Textbox(label="Enter a question about the reference image")
+                vqa_btn = gr.Button("Get Answer", elem_classes="teal-btn")
+            with gr.Column(scale=1):
+                vqa_out = gr.Markdown(label="VQA Output")
+        def answer_vqa_ui(question, image):
+            yield "<div class='loading-line'></div>"
+            ans = answer_vqa(question, image)
+            yield ans
+        vqa_btn.click(answer_vqa_ui, inputs=[vqa_input, upload_preview], outputs=[vqa_out])
+    return demo
+# Launch the interface: the Blocks app is built when this module is executed
+# and the Gradio server is started immediately afterwards.
+demo = build_ui_with_custom_ui()
+demo.launch()