Spaces:

Chyd19
/

Chyd-Text-Image

Sleeping

App Files Files Community

Chyd19 committed on Dec 3, 2025

Commit

8ddb2d1

verified ·

1 Parent(s): 24b980d

Update app.py

Browse files

Files changed (1) hide show

app.py +260 -6

app.py CHANGED Viewed

@@ -1,4 +1,259 @@
 # ==============================
 # SECTION 1
 # ==============================
@@ -181,7 +436,7 @@ def compute_metrics(images, captions, i1, i2):
 def build_ui_with_custom_ui():
     with gr.Blocks(title="Multimodal AI Image Studio") as demo:
         # ---------------- CSS Styling ----------------
-        gr.HTML("""
         <style>
         .heading-orange h2, .heading-orange h3 { color: #ff5500 !important; }
         .orange-btn button { background-color: #ff5500 !important; color: white !important; border-radius: 6px !important; height: 36px !important; font-weight: bold; }
@@ -217,7 +472,7 @@ def build_ui_with_custom_ui():
             flex-direction: column;
         }
         </style>
-        """)
         # ---------------- Heading ----------------
         gr.Markdown("## Multimodal AI Image Studio: An Integrated Comparative Perspective", elem_classes="heading-orange")
@@ -404,8 +659,8 @@ def build_ui_with_custom_ui():
 demo = build_ui_with_custom_ui()
 demo.launch()
-"""
 # Section 3
 # ---------------- Build Gradio UI with Custom Look ----------------
 def build_ui_with_custom_ui():
@@ -597,6 +852,5 @@ def build_ui_with_custom_ui():
 # Launch the interface
 demo = build_ui_with_custom_ui()
-demo.launch()
-"""

+# ==============================
+# Libraries
+# ==============================
+import torch
+import gradio as gr
+from PIL import Image
+from diffusers import DiffusionPipeline
+from transformers import pipeline, BlipProcessor, BlipForQuestionAnswering
+import lpips
+import clip
+from bert_score import score
+import torchvision.transforms as T
+device = "cuda" if torch.cuda.is_available() else "cpu"
+def free_gpu_cache():
+    if device == "cuda":
+        torch.cuda.empty_cache()
+# ==============================
+# Load Models (HF-ready, memory safe)
+# ==============================
+# SDXL-Turbo
+gen_pipe = DiffusionPipeline.from_pretrained(
+    "stabilityai/sdxl-turbo",
+    torch_dtype=torch.float16 if device=="cuda" else torch.float32
+).to(device)
+# DreamShaper
+dreamshaper_pipe = DiffusionPipeline.from_pretrained(
+    "Lykon/dreamshaper-7",
+    torch_dtype=torch.float16 if device=="cuda" else torch.float32
+).to(device)
+# BLIP Captioning
+captioner = pipeline(
+    "image-to-text",
+    model="Salesforce/blip-image-captioning-large",
+    device=0 if device=="cuda" else -1,
+    generate_kwargs={"max_new_tokens":256, "num_beams":5, "temperature":0.7}
+)
+# Sentiment / NER / Topic
+sentiment_model = pipeline("sentiment-analysis", model="distilbert-base-uncased-finetuned-sst-2-english",
+                           device=0 if device=="cuda" else -1)
+ner_model = pipeline("ner", model="dbmdz/bert-large-cased-finetuned-conll03-english",
+                     aggregation_strategy="simple", device=0 if device=="cuda" else -1)
+topic_model = pipeline("zero-shot-classification", model="facebook/bart-large-mnli",
+                       device=0 if device=="cuda" else -1)
+# BLIP VQA
+vqa_processor = BlipProcessor.from_pretrained("Salesforce/blip-vqa-base")
+vqa_model = BlipForQuestionAnswering.from_pretrained("Salesforce/blip-vqa-base").to("cpu")
+# CLIP / LPIPS
+clip_model, clip_preprocess = clip.load("ViT-B/32", device=device)
+lpips_model = lpips.LPIPS(net='alex').to(device)
+lpips_transform = T.Compose([T.ToTensor(), T.Resize((256,256))])
+# Style map
+style_map = {
+    "Photorealistic": "photorealistic, ultra-detailed, 8k, cinematic lighting",
+    "Real Life": "natural lighting, true-to-life colors, DSLR",
+    "Documentary": "documentary handheld muted colors",
+    "iPhone Camera": "iPhone photo natural HDR",
+    "Street Photography": "candid street ambient shadows",
+    "Cinematic": "cinematic lighting dramatic depth",
+    "Anime": "anime cel shaded vibrant",
+    "Watercolor": "watercolor soft wash art",
+    "Macro": "macro lens shallow DOF",
+    "Cyberpunk": "neon cyberpunk futuristic",
+}
+# ==============================
+# Functions
+# ==============================
+def generate_image(pipe, caption, enhancer, negative, seed, style):
+    final_prompt = f"{caption}, {enhancer}".strip(", ")
+    final_prompt = f"{final_prompt}, {style_map.get(style,'')}".strip(", ")
+    try:
+        seed = int(seed)
+    except:
+        seed = 42
+    generator = torch.Generator(device="cpu").manual_seed(seed)
+    img = None
+    try:
+        with torch.no_grad():
+            out = pipe(prompt=final_prompt, negative_prompt=negative, generator=generator, height=512, width=512)
+        img = out.images[0]
+    except Exception as e:
+        print(f"{pipe} generation failed:", e)
+    free_gpu_cache()
+    return img
+def caption_for_image(img):
+    try:
+        out = captioner(img)
+        return out[0]["generated_text"]
+    except:
+        return "Caption failed."
+def compute_metrics(images, captions, i1, i2):
+    img1, img2 = images[i1], images[i2]
+    cap1, cap2 = captions[i1], captions[i2]
+    # CLIP similarity
+    t1, t2 = clip_preprocess(img1).unsqueeze(0).to(device), clip_preprocess(img2).unsqueeze(0).to(device)
+    with torch.no_grad():
+        f1, f2 = clip_model.encode_image(t1), clip_model.encode_image(t2)
+        clip_sim = float(torch.cosine_similarity(f1, f2))
+    # LPIPS
+    L1 = (lpips_transform(img1).unsqueeze(0)*2 - 1).to(device)
+    L2 = (lpips_transform(img2).unsqueeze(0)*2 - 1).to(device)
+    with torch.no_grad():
+        lp = float(lpips_model(L1, L2))
+    # BERTScore
+    if cap1 and cap2:
+        _, _, F = score([cap1],[cap2], lang="en", verbose=False)
+        bert_f1 = float(F.mean())
+    else:
+        bert_f1 = 0.0
+    return clip_sim, lp, bert_f1
+def answer_vqa(question, image):
+    if not image or not question.strip():
+        return "Provide image + question."
+    try:
+        inputs_raw = vqa_processor(images=image, text=question, return_tensors="pt")
+        inputs = {k:v.to("cpu") for k,v in inputs_raw.items()}
+        with torch.no_grad():
+            out = vqa_model(**inputs)
+        ans_id = out.logits.argmax(-1)
+        return vqa_processor.decode(ans_id[0], skip_special_tokens=True)
+    except:
+        return "I could not determine the answer."
+# ==============================
+# Gradio UI
+# ==============================
+def build_ui():
+    with gr.Blocks(title="Multimodal AI Image Studio") as demo:
+        images_state = gr.State([None, None, None])
+        captions_state = gr.State(["", "", ""])
+        gr.Markdown("## Multimodal AI Image Studio (HF-ready)")
+        # --- Step 1: Upload Reference ---
+        upload_input = gr.Image(label="Upload Reference Image", type="pil")
+        upload_btn = gr.Button("Upload & Caption")
+        upload_preview = gr.Image(interactive=False)
+        caption_out = gr.Markdown()
+        def upload_and_caption(img, images_state, captions_state):
+            if img is None:
+                return None, "No image uploaded.", images_state, captions_state
+            caption = caption_for_image(img)
+            images_state[0] = img
+            captions_state[0] = caption
+            return img, caption, images_state, captions_state
+        upload_btn.click(upload_and_caption, inputs=[upload_input, images_state, captions_state],
+                         outputs=[upload_preview, caption_out, images_state, captions_state])
+        # --- Step 2: Generate SDXL & DreamShaper ---
+        sd_btn = gr.Button("Generate SD-Turbo")
+        ds_btn = gr.Button("Generate DreamShaper")
+        sd_preview = gr.Image(interactive=False)
+        ds_preview = gr.Image(interactive=False)
+        def gen_sd(caption, images_state, captions_state):
+            img = generate_image(gen_pipe, caption, enhancer="", negative="", seed=42, style="Photorealistic")
+            if img:
+                images_state[1] = img
+                captions_state[1] = caption_for_image(img)
+            return img, images_state, captions_state
+        def gen_ds(caption, images_state, captions_state):
+            img = generate_image(dreamshaper_pipe, caption, enhancer="", negative="", seed=123, style="Photorealistic")
+            if img:
+                images_state[2] = img
+                captions_state[2] = caption_for_image(img)
+            return img, images_state, captions_state
+        sd_btn.click(gen_sd, inputs=[caption_out, images_state, captions_state],
+                     outputs=[sd_preview, images_state, captions_state])
+        ds_btn.click(gen_ds, inputs=[caption_out, images_state, captions_state],
+                     outputs=[ds_preview, images_state, captions_state])
+        # --- Step 3: Metrics ---
+        metrics_btn = gr.Button("Compute Metrics")
+        metrics_out = gr.Markdown()
+        def metrics_ui(images_state, captions_state):
+            imgs = images_state or []
+            caps = captions_state or []
+            if None in imgs or "" in caps:
+                return "All three images and captions are required."
+            A = compute_metrics(imgs, caps, 0, 1)
+            B = compute_metrics(imgs, caps, 0, 2)
+            C = compute_metrics(imgs, caps, 1, 2)
+            return f"Reference ↔ SD-Turbo: {A}\nReference ↔ DreamShaper: {B}\nSD-Turbo ↔ DreamShaper: {C}"
+        metrics_btn.click(metrics_ui, inputs=[images_state, captions_state], outputs=[metrics_out])
+        # --- Step 4: NLP ---
+        nlp_btn = gr.Button("Analyze Captions")
+        nlp_out = gr.HTML()
+        def analyze_nlp(captions_state):
+            caps = captions_state or []
+            if "" in caps:
+                return "<b>All three captions are required.</b>"
+            labels = ["Reference", "SD-Turbo", "DreamShaper"]
+            html_blocks = []
+            for label, cap in zip(labels, caps):
+                # Sentiment
+                sentiment = "<br>".join([f"{s['label']}: {s['score']:.2f}" for s in sentiment_model(cap)])
+                # Entities
+                ents_list = ner_model(cap)
+                ents = "<br>".join([f"{e['entity_group']}: {e['word']}" for e in ents_list])
+                # Topics
+                topics_data = topic_model(cap, candidate_labels=['people','animals','objects','food','nature'])
+                topics = "<br>".join([f"{l}: {sc:.2f}" for l, sc in zip(topics_data['labels'], topics_data['scores'])])
+                html_blocks.append(f"<div style='padding:10px;'><h3>{label}</h3><b>Sentiment</b><br>{sentiment}<br><b>Entities</b><br>{ents}<br><b>Topics</b><br>{topics}</div>")
+            return "<div style='display:flex;gap:20px;'>" + "".join(html_blocks) + "</div>"
+        nlp_btn.click(analyze_nlp, inputs=[captions_state], outputs=[nlp_out])
+        # --- Step 5: VQA ---
+        vqa_input = gr.Textbox(label="Ask about reference image")
+        vqa_btn = gr.Button("Get Answer")
+        vqa_out = gr.Markdown()
+        def vqa_ui(question, img):
+            return answer_vqa(question, img)
+        vqa_btn.click(vqa_ui, inputs=[vqa_input, upload_preview], outputs=[vqa_out])
+    return demo
+# Launch
+demo = build_ui()
+demo.launch()
+# Dumped section
+"""
+####################################################################################
 # ==============================
 # SECTION 1
 # ==============================
 def build_ui_with_custom_ui():
     with gr.Blocks(title="Multimodal AI Image Studio") as demo:
         # ---------------- CSS Styling ----------------
+        gr.HTML(
         <style>
         .heading-orange h2, .heading-orange h3 { color: #ff5500 !important; }
         .orange-btn button { background-color: #ff5500 !important; color: white !important; border-radius: 6px !important; height: 36px !important; font-weight: bold; }
             flex-direction: column;
         }
         </style>
+        )
         # ---------------- Heading ----------------
         gr.Markdown("## Multimodal AI Image Studio: An Integrated Comparative Perspective", elem_classes="heading-orange")
 demo = build_ui_with_custom_ui()
 demo.launch()
+   ####################################################################################
 # Section 3
 # ---------------- Build Gradio UI with Custom Look ----------------
 def build_ui_with_custom_ui():
 # Launch the interface
 demo = build_ui_with_custom_ui()
+demo.launch()"""