ProfRom committed on
Commit 648030e · verified · 1 Parent(s): 13a9703

Poojary - Final Assignment submission

Files changed (1)
  1. app.py +95 -173
app.py CHANGED
@@ -1,177 +1,99 @@
- # ==============================================================================
- # Josh Guimond
- # Unit 8 Assignment: End-to-End AI Solution Implementation
- # ARIN 460
- # 12/03/2025
-
- # Description: This script implements a multimodal AI web app using Gradio to
- # run two image captioning models, a text “vibe” classifier, and NLP metrics on
- # uploaded images, allowing direct comparison of model captions to ground-truth
- # descriptions.
- # ==============================================================================
-
- # Video: https://youtu.be/pXCO00lK2UE
- # Space: https://huggingface.co/spaces/jguimond/assignment_8_v3
-
- # ==============================================================================
- # SECTION 1: SETUP & INSTALLATIONS
- # ==============================================================================
- # Install libraries
  import gradio as gr
- from transformers import pipeline, AutoTokenizer, AutoModelForImageTextToText
- from sentence_transformers import SentenceTransformer, util
- import evaluate
- import warnings
- import logging
-
- # Filter out the "FutureWarning" and "UserWarning" to keep the console clean
- warnings.filterwarnings("ignore", category=FutureWarning)
- warnings.filterwarnings("ignore", category=UserWarning)
- logging.getLogger("transformers").setLevel(logging.ERROR)
-
-
- # ==============================================================================
- # SECTION 2: LOAD MODELS
- # ==============================================================================
-
- # --- 1. Load Image Captioning Models ---
-
- # Model 1: BLIP (Base)
- print("Loading Model 1 (BLIP)...")
- captioner_model1 = pipeline("image-to-text", model="Salesforce/blip-image-captioning-base")
-
- # Model 2: ViT-GPT2 (With Tokenizer Fix)
- print("Loading Model 2 (ViT-GPT2)...")
- # Load the tokenizer manually to set the pad_token and fix the warning
- vit_tokenizer = AutoTokenizer.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
- vit_tokenizer.pad_token = vit_tokenizer.eos_token # <--- THE FIX
- captioner_model2 = pipeline("image-to-text", model="nlpconnect/vit-gpt2-image-captioning", tokenizer=vit_tokenizer)
-
- # --- 2. Load NLP Analysis Models (Unit 4 Techniques) ---
-
- # A. Zero-Shot Classifier (For Nuanced Vibe/Sentiment)
- print("Loading Zero-Shot Classifier...")
- classifier = pipeline("zero-shot-classification", model="MoritzLaurer/deberta-v3-xsmall-zeroshot-v1.1-all-33")
-
- # B. Semantic Similarity (For Model Agreement)
- print("Loading Sentence Transformer...")
- similarity_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
-
- # C. ROUGE Metric (For Accuracy vs Ground Truth)
- print("Loading ROUGE Metric...")
- rouge = evaluate.load("rouge")
-
- # Define Nuanced Labels based on the image list
- # These cover: Peaceful dog, Sad funeral, Happy kids, Angry man, Scared people, Fighting tigers
- VIBE_LABELS = ["Peaceful/Calm", "Happy/Joy", "Sad/Sorrow", "Angry/Upset", "Fear/Scared", "Action/Violence"]
-
- # ==============================================================================
- # SECTION 3: ANALYSIS FUNCTIONS
- # ==============================================================================
-
- # --- Analysis Function ---
- def analyze_image(image, ground_truth):
-
-     # -- A. Generate Captions --
-     res1 = captioner_model1(image)
-     cap1 = res1[0]['generated_text']
-
-     res2 = captioner_model2(image)
-     cap2 = res2[0]['generated_text']
-
-     # -- B. Analyze Vibe (Zero-Shot) --
-     # Model 1 Vibe
-     vibe1_result = classifier(cap1, VIBE_LABELS)
-     vibe1_label = vibe1_result['labels'][0]
-     vibe1_score = vibe1_result['scores'][0]
-
-     # Model 2 Vibe
-     vibe2_result = classifier(cap2, VIBE_LABELS)
-     vibe2_label = vibe2_result['labels'][0]
-     vibe2_score = vibe2_result['scores'][0]
-
-     # -- C. Calculate Statistics --
-
-     # 1. Semantic Similarity (Do the models agree?)
-     emb1 = similarity_model.encode(cap1, convert_to_tensor=True)
-     emb2 = similarity_model.encode(cap2, convert_to_tensor=True)
-     sim_score = util.pytorch_cos_sim(emb1, emb2).item()
-
-     # 2. ROUGE Scores (How accurate are they vs Ground Truth?)
-     rouge_output = "N/A (No Ground Truth provided)"
-     if ground_truth and ground_truth.strip() != "":
-         # Calculate scores
-         r1 = rouge.compute(predictions=[cap1], references=[ground_truth])
-         r2 = rouge.compute(predictions=[cap2], references=[ground_truth])
 
-         # Format the ROUGE output nicely
-         rouge_output = (
-             f"Model 1 ROUGE-L: {r1['rougeL']:.3f}\n"
-             f"Model 2 ROUGE-L: {r2['rougeL']:.3f}\n"
-             f"(Higher is better)"
-         )
-
-     # -- D. Format Output Strings --
-     # Create clean, formatted strings for the large textboxes
-
-     out1 = (
-         f"CAPTION: {cap1}\n"
-         f"-----------------------------\n"
-         f"DETECTED VIBE: {vibe1_label}\n"
-         f"CONFIDENCE: {vibe1_score:.1%}"
-     )
-
-     out2 = (
-         f"CAPTION: {cap2}\n"
-         f"-----------------------------\n"
-         f"DETECTED VIBE: {vibe2_label}\n"
-         f"CONFIDENCE: {vibe2_score:.1%}"
-     )
-
-     stats = (
-         f"--- 1. MODEL AGREEMENT (Semantic Similarity) ---\n"
-         f"Score: {sim_score:.3f}\n"
-         f"(Scale: 0.0 = Different, 1.0 = Identical)\n\n"
-         f"--- 2. OBJECT IDENTIFICATION ACCURACY (ROUGE) ---\n"
-         f"Ground Truth: '{ground_truth}'\n"
-         f"{rouge_output}"
-     )
-
-     return out1, out2, stats
-
- # ==============================================================================
- # SECTION 4: GRADIO INTERFACE
- # ==============================================================================
-
- # Define Inputs
- image_input = gr.Image(type="pil", label="Upload Image")
- text_input = gr.Textbox(label="Ground Truth Description", placeholder="e.g. 'A peaceful dog on a beach'")
-
- # Define Outputs with LARGER viewing areas (lines=5 or 10)
- output_m1 = gr.Textbox(label="Model 1 (BLIP) Analysis", lines=4)
- output_m2 = gr.Textbox(label="Model 2 (ViT-GPT2) Analysis", lines=4)
- output_stats = gr.Textbox(label="Comparison Metrics & Statistics", lines=10)
-
- # Create Interface
- interface = gr.Interface(
-     fn=analyze_image,
-     inputs=[image_input, text_input],
-     outputs=[output_m1, output_m2, output_stats],
-     title="Multimodal AI: Nuanced Image Analysis",
-     description="This application uses two Image Captioning models (BLIP & ViT-GPT2) to identify objects, Zero-Shot Classification to detect emotional vibes (Happy, Sad, Angry, etc.), and calculates ROUGE/Similarity metrics.",
-     examples=[
-         ["images/1.png", "A peaceful dog on a sunny beach"],
-         ["images/2.png", "Sad men carrying a casket at a funeral"],
-         ["images/3.png", "Happy kids at a birthday party"],
-         ["images/4.png", "An angry man in a car"],
-         ["images/5.png", "Two people happy mountain biking"],
-         ["images/6.png", "A man upset about his food at a restaurant"],
-         ["images/7.png", "A couple happy at a restaurant"],
-         ["images/8.png", "A sad woman reading a book"],
-         ["images/9.png", "People scared at a movie"],
-         ["images/10.png", "Two tigers fighting"]
-     ]
  )
 
- if __name__ == "__main__":
-     interface.launch()

  import gradio as gr
+ import torch
+ import os
+ import tempfile
+ from huggingface_hub import login
+ from transformers import AutoProcessor, AutoModelForVisualQuestionAnswering, infer_device, PaliGemmaForConditionalGeneration
+ from accelerate import Accelerator
+
+ # login to Hugging Face using the HF_TOKEN environment variable
+ login(token=os.getenv('HF_TOKEN'))
+
+ # Set the device
+ device = infer_device()
+
+ # MODEL 1: BLIP-VQA
+ processor = AutoProcessor.from_pretrained("Salesforce/blip-vqa-base")
+ model = AutoModelForVisualQuestionAnswering.from_pretrained("Salesforce/blip-vqa-base").to(device)
+
+ # Define inference function for Model 1
+ def process_image(image, prompt):
+     # Keep inputs in the default dtype (float32) so they match the model weights loaded above
+     inputs = processor(image, text=prompt, return_tensors="pt").to(device)
+
+     try:
+         # Generate output from the model
+         output = model.generate(**inputs, max_new_tokens=10)
+
+         # Decode and return the output
+         decoded_output = processor.batch_decode(output, skip_special_tokens=True)[0].strip()
+
+         # remove prompt from output
+         if decoded_output.startswith(prompt):
+             return decoded_output[len(prompt):].strip()
+         return decoded_output
+     except Exception as e:
+         print(f"Error in Model 1: {e}")
+         return "An error occurred during processing for Model 1."
+
+
+ # MODEL 2: PaliGemma
+ # Load the processor from the same checkpoint as the model so the two stay consistent
+ processor2 = AutoProcessor.from_pretrained("google/paligemma-3b-mix-224")
+ model2 = PaliGemmaForConditionalGeneration.from_pretrained(
+     "google/paligemma-3b-mix-224",
+     torch_dtype=torch.bfloat16
+ ).to(device)
+
+
+ # Define inference function for Model 2
+ def process_image2(image, prompt):
+     inputs2 = processor2(
+         text=prompt,
+         images=image,
+         return_tensors="pt"
+     ).to(device, model2.dtype)
+
+     try:
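+         # Generate an answer, then decode only the newly generated tokens (the slice drops the prompt tokens)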
+         output = model2.generate(**inputs2, max_new_tokens=10)
+         decoded_output = processor2.batch_decode(
+             output[:, inputs2["input_ids"].shape[1]:],
+             skip_special_tokens=True
+         )[0].strip()
+
+         return decoded_output
+     except Exception as e:
+         print(f"Error in Model 2: {e}")
+         return "An error occurred during processing for Model 2. Ensure your hardware supports bfloat16 or adjust the torch_dtype."
+
+
+ # GRADIO INTERFACE
+ inputs_model1 = [
+     gr.Image(type="pil"),
+     gr.Textbox(label="Prompt", placeholder="Enter your question")
+ ]
+ inputs_model2 = [
+     gr.Image(type="pil"),
+     gr.Textbox(label="Prompt", placeholder="Enter your question")
+ ]
+
+ outputs_model1 = gr.Textbox(label="Answer")
+ outputs_model2 = gr.Textbox(label="Answer")
+
+ # Create the Gradio apps for each model
+ model1_inf = gr.Interface(
+     fn=process_image,
+     inputs=inputs_model1,
+     outputs=outputs_model1,
+     title="Model 1: BLIP-VQA-Base",
+     description="Ask a question about the uploaded image using BLIP."
  )

+ model2_inf = gr.Interface(
+     fn=process_image2,
+     inputs=inputs_model2,
+     outputs=outputs_model2,
+     title="Model 2: PaliGemma",
+     description="Ask a question about the uploaded image using PaliGemma."
+ )
+
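+ # Combine both model apps into a single tabbed interface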
+ demo = gr.TabbedInterface([model1_inf, model2_inf], ["Model 1 (BLIP)", "Model 2 (PaliGemma)"])
+ demo.launch(share=True)
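
For a quick sanity check of the two inference functions outside the web UI, a few lines like the following could be appended temporarily to the new app.py, just before the demo.launch call. This is only a sketch: it assumes HF_TOKEN is set, access to the gated PaliGemma weights has been granted, and "example.jpg" is a placeholder for any local test image.

from PIL import Image

test_image = Image.open("example.jpg").convert("RGB")  # placeholder path, not part of the repo

print("BLIP-VQA:", process_image(test_image, "What is in the picture?"))
print("PaliGemma:", process_image2(test_image, "What is in the picture?"))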