Spaces:

ProfRom
/

TestSpace

Sleeping

App Files Files Community

ProfRom committed 20 days ago

Commit

faa5acc

verified ·

1 Parent(s): d2ca300

Stetson - Final submission

Browse files

Files changed (1) hide show

app.py +74 -105

app.py CHANGED Viewed

@@ -1,135 +1,104 @@
-import gradio as gr
 import torch
 from transformers import pipeline
-PROJECT_LABELS = ["person", "skis", "cell phone", "spoon", "stop sign"]
-def get_device():
-    return 0 if torch.cuda.is_available() else -1
-DEVICE = get_device()
-detector = pipeline(
-    task="object-detection",
-    model="facebook/detr-resnet-50",
-    device=DEVICE,
 )
-captioner = pipeline(task="image-text-to-text", model="Salesforce/blip-image-captioning-base", device=DEVICE)
-zero_shot = pipeline(
-    task="zero-shot-classification",
-    model="typeform/distilbert-base-uncased-mnli",
-    device=DEVICE,
 )
-def normalize_label(text):
-    text = str(text).strip().lower()
-    aliases = {
-        "cellphone": "cell phone",
-        "mobile phone": "cell phone",
-        "phone": "cell phone",
-        "human": "person",
-        "people": "person",
-        "stopsign": "stop sign",
-    }
-    return aliases.get(text, text)
-def run_object_detection(image):
-    detections = detector(image)
-    rows = []
-    for detection in detections[:10]:
-        box = detection.get("box", {})
-        rows.append(
-            [
-                normalize_label(detection.get("label", "")),
-                round(float(detection.get("score", 0.0)), 4),
-                round(float(box.get("xmin", 0.0)), 1),
-                round(float(box.get("ymin", 0.0)), 1),
-                round(float(box.get("xmax", 0.0)), 1),
-                round(float(box.get("ymax", 0.0)), 1),
-            ]
-        )
-    return rows
-def run_captioning(image):
-    result = captioner(image, "")
-    if not result:
-        return ""
-    return result[0]["generated_text"].strip()
-def classify_caption(caption):
-    if not caption:
-        return "No caption generated.", []
-    result = zero_shot(caption, PROJECT_LABELS)
-    top_label = normalize_label(result["labels"][0])
-    top_score = float(result["scores"][0])
-    score_rows = [
-        [normalize_label(label), round(float(score), 4)]
-        for label, score in zip(result["labels"], result["scores"])
-    ]
-    return f"{top_label} ({top_score:.4f})", score_rows
-def analyze_image(image):
-    if image is None:
-        return [], "", "", []
-    detection_rows = run_object_detection(image)
-    caption = run_captioning(image)
-    top_caption_label, zero_shot_rows = classify_caption(caption)
-    return detection_rows, caption, top_caption_label, zero_shot_rows
-with gr.Blocks(title="Multimodal Image Analysis") as demo:
-    gr.Markdown("# Multimodal Image Analysis")
     gr.Markdown(
-        "Upload an image to run object detection, image captioning, "
-        "and zero-shot classification of the generated caption."
     )
     with gr.Row():
-        image_input = gr.Image(type="pil", label="Upload Image")
-        with gr.Column():
-            detection_output = gr.Dataframe(
-                headers=["label", "score", "xmin", "ymin", "xmax", "ymax"],
-                label="Object Detection Results",
-                interactive=False,
             )
-            caption_output = gr.Textbox(label="Generated Caption")
-            top_label_output = gr.Textbox(label="Caption-Based Class Prediction")
-            zero_shot_output = gr.Dataframe(
-                headers=["candidate label", "score"],
-                label="Zero-Shot Classification Scores",
-                interactive=False,
             )
-    analyze_button = gr.Button("Analyze Image", variant="primary")
-    analyze_button.click(
         fn=analyze_image,
-        inputs=image_input,
-        outputs=[
-            detection_output,
-            caption_output,
-            top_label_output,
-            zero_shot_output,
-        ],
     )
 if __name__ == "__main__":
     demo.launch()

 import torch
 from transformers import pipeline
+from PIL import Image
+import gradio as gr
+# ── Device: GPU if available, otherwise CPU ───────────────────────────────────
+device = 0 if torch.cuda.is_available() else -1
+# ── Pipeline 1: Image Captioning (BLIP) ──────────────────────────────────────
+captioner = pipeline(
+    "image-to-text",
+    model="Salesforce/blip-image-captioning-base",
+    device=device,
 )
+# ── Pipeline 2: Image Classification (ViT) ───────────────────────────────────
+classifier = pipeline(
+    "image-classification",
+    model="google/vit-base-patch16-224",
+    device=device,
+)
+# ── Pipeline 3: Sentiment Analysis on Caption (DistilBERT) ───────────────────
+sentiment_analyzer = pipeline(
+    "sentiment-analysis",
+    model="distilbert-base-uncased-finetuned-sst-2-english",
+    device=device,
 )
+def analyze_image(image: Image.Image):
+    """Run all three pipelines and return formatted results."""
+    if image is None:
+        return "Upload an image to begin.", "", ""
+    # Pipeline 1 - BLIP caption
+    caption_result = captioner(image)
+    caption = caption_result[0]["generated_text"]
+    # Pipeline 2 - ViT top-5 classifications
+    cls_results = classifier(image)
+    top5_lines = [
+        f"{i + 1}. {r['label'].replace('_', ' ').title()}: {r['score']:.2%}"
+        for i, r in enumerate(cls_results[:5])
+    ]
+    top5_text = "\n".join(top5_lines)
+    # Pipeline 3 - DistilBERT sentiment on the caption
+    sent = sentiment_analyzer(caption)[0]
+    sentiment_text = f"{sent['label'].capitalize()}  (confidence: {sent['score']:.2%})"
+    return caption, top5_text, sentiment_text
+# ── Gradio UI (Blocks for layout control) ────────────────────────────────────
+with gr.Blocks(title="Multimodal AI Image Analyzer") as demo:
     gr.Markdown(
+        """
+        # Multimodal AI Image Analyzer
+        Upload any image to run three AI pipelines simultaneously:
+        - **BLIP** generates a natural-language caption (computer vision → text)
+        - **ViT** classifies the image content from 1,000 ImageNet categories
+        - **DistilBERT** analyzes the sentiment of the generated caption (NLP)
+        """
     )
     with gr.Row():
+        with gr.Column(scale=1):
+            img_input = gr.Image(type="pil", label="Input Image")
+            analyze_btn = gr.Button("Analyze", variant="primary")
+        with gr.Column(scale=1):
+            caption_out = gr.Textbox(
+                label="Pipeline 1 — BLIP Caption (image-to-text)",
+                lines=3,
+            )
+            cls_out = gr.Textbox(
+                label="Pipeline 2 — ViT Top-5 Classifications (image-classification)",
+                lines=6,
             )
+            sentiment_out = gr.Textbox(
+                label="Pipeline 3 — DistilBERT Caption Sentiment (sentiment-analysis)",
+                lines=2,
             )
+    analyze_btn.click(
         fn=analyze_image,
+        inputs=img_input,
+        outputs=[caption_out, cls_out, sentiment_out],
     )
+    gr.Markdown(
+        """
+        ---
+        **Models used:**
+        [Salesforce/blip-image-captioning-base](https://huggingface.co/Salesforce/blip-image-captioning-base) ·
+        [google/vit-base-patch16-224](https://huggingface.co/google/vit-base-patch16-224) ·
+        [distilbert-base-uncased-finetuned-sst-2-english](https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english)
+        """
+    )
 if __name__ == "__main__":
     demo.launch()