ProfRom committed
Commit ea35d18 · verified
1 Parent(s): f2f22f7

Guimond - Final Assignment submission

Files changed (2)
  1. app.py +169 -314
  2. requirements.txt +9 -9
app.py CHANGED
@@ -1,322 +1,177 @@
-
- # app.py — Lazy Loaded Multimodal AI System
- #
- # Models load ONLY when needed to avoid memory overflow
- # Works on Hugging Face free CPU Spaces
-
- import torch
  import gradio as gr
-
- device = torch.device("cpu")
-
-
- # ---------------------------------------------------------
- # LAZY MODEL LOADERS
- # ---------------------------------------------------------
-
- def load_caption_model():
-     from transformers import BlipProcessor, BlipForConditionalGeneration
-     model_name = "Salesforce/blip-image-captioning-base"
-     processor = BlipProcessor.from_pretrained(model_name)
-     model = BlipForConditionalGeneration.from_pretrained(model_name).to(device)
-     return processor, model
-
-
- def load_sentiment_model():
-     from transformers import pipeline
-     return pipeline(
-         "sentiment-analysis",
-         model="distilbert-base-uncased-finetuned-sst-2-english"
-     )
-
-
- def load_vqa_model():
-     from transformers import BlipProcessor, BlipForQuestionAnswering
-     model_name = "Salesforce/blip-vqa-base"
-     processor = BlipProcessor.from_pretrained(model_name)
-     model = BlipForQuestionAnswering.from_pretrained(model_name).to(device)
-     return processor, model
-
-
- def load_detr_model():
-     from transformers import DetrImageProcessor, DetrForObjectDetection
-     processor = DetrImageProcessor.from_pretrained("facebook/detr-resnet-50")
-     model = DetrForObjectDetection.from_pretrained("facebook/detr-resnet-50").to(device)
-     return processor, model
-
-
- def load_vit_model():
-     from transformers import ViTImageProcessor, ViTForImageClassification
-     model_name = "google/vit-base-patch16-224"
-     processor = ViTImageProcessor.from_pretrained(model_name)
-     model = ViTForImageClassification.from_pretrained(model_name).to(device)
-     return processor, model
-
-
- # NEW — more verbose, less repetitive rewrite model
- def load_llm():
-     from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
-     name = "google/flan-t5-large"
-     tokenizer = AutoTokenizer.from_pretrained(name)
-     model = AutoModelForSeq2SeqLM.from_pretrained(name).to(device)
-     return tokenizer, model
-
-
- # ---------------------------------------------------------
- # TASKS
- # ---------------------------------------------------------
-
- def generate_caption(image):
-     processor, model = load_caption_model()
-     inputs = processor(images=image, return_tensors="pt").to(device)
-     with torch.no_grad():
-         out_ids = model.generate(**inputs, max_new_tokens=30)
-     return processor.decode(out_ids[0], skip_special_tokens=True)
-
-
- def analyze_sentiment(text):
-     sentiment = load_sentiment_model()
-     out = sentiment(text)[0]
-     return out["label"], round(out["score"] * 100, 2)
-
-
- def vqa_answer(image, question):
-     processor, model = load_vqa_model()
-     inputs = processor(images=image, text=question, return_tensors="pt").to(device)
-     with torch.no_grad():
-         out = model.generate(**inputs)
-     return processor.decode(out[0], skip_special_tokens=True)
-
-
- def detect_objects(image):
-     processor, model = load_detr_model()
-     inputs = processor(images=image, return_tensors="pt").to(device)
-
-     with torch.no_grad():
-         outputs = model(**inputs)
-
-     target_sizes = torch.tensor([image.size[::-1]])
-     results = processor.post_process_object_detection(outputs, target_sizes=target_sizes)[0]
-
-     detections = []
-     for score, label, box in zip(results["scores"], results["labels"], results["boxes"]):
-         if score > 0.3:
-             detections.append(
-                 f"{model.config.id2label[label.item()]} (score {round(score.item(), 2)})"
-             )
-     if len(detections) == 0:
-         return ["No high-confidence objects detected"]
-     return detections
-
-
- def classify_scene(image):
-     processor, model = load_vit_model()
-     inputs = processor(images=image, return_tensors="pt").to(device)
-     with torch.no_grad():
-         logits = model(**inputs).logits
-     label = logits.argmax(-1).item()
-     return model.config.id2label[label]
-
-
- # ---------------------------------------------------------
- # REWRITE CAPTIONS (8 STYLE SYSTEM + LENGTH SLIDER)
- # ---------------------------------------------------------
-
- def _build_style_prompt(caption, style):
-     base = (
-         "Rewrite the following image caption. "
-         "Keep the original meaning and important details, "
-         "but change the wording significantly and avoid repeating sentences verbatim. "
-         "Do not just copy the original text.\n\n"
-         f"Original caption:\n{caption}\n\n"
-     )
-
-     if style == "Short":
-         return (
-             base
-             + "Now produce a shorter, compact version in one or two sentences."
-         )
-     elif style == "Creative":
-         return (
-             base
-             + "Rewrite it in a colorful, imaginative, and richly descriptive style."
          )
-     elif style == "Technical":
-         return (
-             base
-             + "Rewrite it in a highly technical, analytical style using precise visual terminology."
-         )
-     elif style == "Humorous":
-         return (
-             base
-             + "Rewrite it with a fun, humorous, witty tone while keeping the meaning."
-         )
-     elif style == "Poetic":
-         return (
-             base
-             + "Rewrite it in a poetic, rhythmic, metaphorical style using sensory language."
-         )
-     elif style == "Cinematic":
-         return (
-             base
-             + "Rewrite it as if describing an epic cinematic movie scene with dramatic, vivid imagery."
-         )
-     elif style == "Journalistic":
-         return (
-             base
-             + "Rewrite it in a factual, neutral, journalistic news-reporting style."
-         )
-     elif style == "Academic":
-         return (
-             base
-             + "Rewrite it in a formal, academic style with clear, analytical phrasing."
-         )
-     else:
-         # Fallback: treat unknown style as creative rewrite
-         return (
-             base
-             + "Rewrite it in a natural, descriptive style."
-         )
-
-
- def rewrite_caption(caption, style, length):
-     tokenizer, model = load_llm()
-
-     prompt = _build_style_prompt(caption, style)
-
-     # Tokenize
-     inputs = tokenizer(prompt, return_tensors="pt").to(device)
-
-     # First pass: normal creative decoding
-     with torch.no_grad():
-         outputs = model.generate(
-             **inputs,
-             max_new_tokens=length,
-             do_sample=True,
-             temperature=0.9,
-             top_p=0.9,
-             no_repeat_ngram_size=3,
-             repetition_penalty=1.2,
-         )
-
-     rewritten = tokenizer.decode(outputs[0], skip_special_tokens=True).strip()
-
-     # If the model basically echoed the caption, try a second, more forceful pass.
-     if rewritten.lower().strip() == caption.lower().strip():
-         strong_prompt = (
-             "Paraphrase and expand the following caption. "
-             "Use different wording and add extra detail, but keep the meaning. "
-             "Do not repeat the original sentence exactly.\n\n"
-             f"Original caption:\n{caption}"
-         )
-         strong_inputs = tokenizer(strong_prompt, return_tensors="pt").to(device)
-
-         with torch.no_grad():
-             outputs2 = model.generate(
-                 **strong_inputs,
-                 max_new_tokens=length,
-                 do_sample=True,
-                 temperature=1.0,
-                 top_p=0.95,
-                 no_repeat_ngram_size=3,
-                 repetition_penalty=1.3,
-             )
-         rewritten2 = tokenizer.decode(outputs2[0], skip_special_tokens=True).strip()
-
-         # Only replace if it actually changed something
-         if rewritten2 and rewritten2.lower().strip() != caption.lower().strip():
-             rewritten = rewritten2
-
-     return rewritten

-
- def extract_metadata(image):
-     width, height = image.size
-     meta = f"Dimensions: {width} x {height}\n"
-     meta += "EXIF data detected\n" if "exif" in image.info else "No EXIF data available\n"
-     return meta
-
-
- # ---------------------------------------------------------
- # MAIN LOOP
- # ---------------------------------------------------------
-
- def process_all(image, question, style, length):
-     if image is None:
-         return ["No image"] * 8
-
-     caption = generate_caption(image)
-     sentiment_label, sentiment_score = analyze_sentiment(caption)
-     vqa = vqa_answer(image, question) if question else "No question asked"
-     objects = detect_objects(image)
-     scene = classify_scene(image)
-     rewritten = rewrite_caption(caption, style, length)
-     metadata = extract_metadata(image)
-
-     return caption, sentiment_label, sentiment_score, vqa, objects, scene, rewritten, metadata
-
-
- # ---------------------------------------------------------
- # GRADIO UI
- # ---------------------------------------------------------
-
- with gr.Blocks(title="Multimodal AI System (Lazy Loaded)") as demo:
-     gr.Markdown("# **Multimodal AI System**")
-
-     with gr.Row():
-         image_input = gr.Image(type="pil", label="Upload Image")
-         question_input = gr.Textbox(label="Ask a Question")
-
-     style_input = gr.Dropdown(
-         [
-             "Short",
-             "Creative",
-             "Technical",
-             "Humorous",
-             "Poetic",
-             "Cinematic",
-             "Journalistic",
-             "Academic"
-         ],
-         label="Rewrite Style"
-     )
-
-     # New: length slider
-     length_slider = gr.Slider(
-         minimum=20,
-         maximum=200,
-         value=80,
-         step=10,
-         label="Rewrite Length (Max Tokens)"
      )

-     run_btn = gr.Button("Run All Tools")
-
-     caption = gr.Textbox(label="Generated Caption")
-     sentiment_label = gr.Textbox(label="Sentiment Label")
-     sentiment_score = gr.Number(label="Sentiment Score")
-     vqa_output = gr.Textbox(label="VQA Answer")
-     objects_output = gr.JSON(label="Detected Objects")
-     scene_output = gr.Textbox(label="Scene Classification")
-     rewritten_output = gr.Textbox(label="Rewritten Caption")
-     metadata_output = gr.Textbox(label="Image Metadata")
-
-     run_btn.click(
-         process_all,
-         [image_input, question_input, style_input, length_slider],
-         [
-             caption,
-             sentiment_label,
-             sentiment_score,
-             vqa_output,
-             objects_output,
-             scene_output,
-             rewritten_output,
-             metadata_output
-         ]
      )
-

  if __name__ == "__main__":
-     demo.launch()
-

+ # ==============================================================================
+ # Josh Guimond
+ # Unit 8 Assignment: End-to-End AI Solution Implementation
+ # ARIN 460
+ # 12/03/2025
+
+ # Description: This script implements a multimodal AI web app using Gradio to
+ # run two image captioning models, a text “vibe” classifier, and NLP metrics on
+ # uploaded images, allowing direct comparison of model captions to ground-truth
+ # descriptions.
+ # ==============================================================================
+
+ # Video: https://youtu.be/pXCO00lK2UE
+ # Space: https://huggingface.co/spaces/jguimond/assignment_8_v3
+
+ # ==============================================================================
+ # SECTION 1: SETUP & INSTALLATIONS
+ # ==============================================================================
+ # Install libraries
  import gradio as gr
+ from transformers import pipeline, AutoTokenizer, AutoModelForImageTextToText
+ from sentence_transformers import SentenceTransformer, util
+ import evaluate
+ import warnings
+ import logging
+
+ # Filter out the "FutureWarning" and "UserWarning" to keep the console clean
+ warnings.filterwarnings("ignore", category=FutureWarning)
+ warnings.filterwarnings("ignore", category=UserWarning)
+ logging.getLogger("transformers").setLevel(logging.ERROR)
+
+
+ # ==============================================================================
+ # SECTION 2: LOAD MODELS
+ # ==============================================================================
+
+ # --- 1. Load Image Captioning Models ---
+
+ # Model 1: BLIP (Base)
+ print("Loading Model 1 (BLIP)...")
+ captioner_model1 = pipeline("image-to-text", model="Salesforce/blip-image-captioning-base")
+
+ # Model 2: ViT-GPT2 (With Tokenizer Fix)
+ print("Loading Model 2 (ViT-GPT2)...")
+ # Load the tokenizer manually to set the pad_token and fix the warning
+ vit_tokenizer = AutoTokenizer.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
+ vit_tokenizer.pad_token = vit_tokenizer.eos_token  # <--- THE FIX
+ captioner_model2 = pipeline("image-to-text", model="nlpconnect/vit-gpt2-image-captioning", tokenizer=vit_tokenizer)
+
+ # --- 2. Load NLP Analysis Models (Unit 4 Techniques) ---
+
+ # A. Zero-Shot Classifier (For Nuanced Vibe/Sentiment)
+ print("Loading Zero-Shot Classifier...")
+ classifier = pipeline("zero-shot-classification", model="MoritzLaurer/deberta-v3-xsmall-zeroshot-v1.1-all-33")
+
+ # B. Semantic Similarity (For Model Agreement)
+ print("Loading Sentence Transformer...")
+ similarity_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
+
+ # C. ROUGE Metric (For Accuracy vs Ground Truth)
+ print("Loading ROUGE Metric...")
+ rouge = evaluate.load("rouge")
+
+ # Define Nuanced Labels based on the image list
+ # These cover: Peaceful dog, Sad funeral, Happy kids, Angry man, Scared people, Fighting tigers
+ VIBE_LABELS = ["Peaceful/Calm", "Happy/Joy", "Sad/Sorrow", "Angry/Upset", "Fear/Scared", "Action/Violence"]
+
+ # ==============================================================================
+ # SECTION 3: ANALYSIS FUNCTIONS
+ # ==============================================================================
+
+ # --- Analysis Function ---
+ def analyze_image(image, ground_truth):
+
+     # -- A. Generate Captions --
+     res1 = captioner_model1(image)
+     cap1 = res1[0]['generated_text']
+
+     res2 = captioner_model2(image)
+     cap2 = res2[0]['generated_text']
+
+     # -- B. Analyze Vibe (Zero-Shot) --
+     # Model 1 Vibe
+     vibe1_result = classifier(cap1, VIBE_LABELS)
+     vibe1_label = vibe1_result['labels'][0]
+     vibe1_score = vibe1_result['scores'][0]
+
+     # Model 2 Vibe
+     vibe2_result = classifier(cap2, VIBE_LABELS)
+     vibe2_label = vibe2_result['labels'][0]
+     vibe2_score = vibe2_result['scores'][0]
+
+     # -- C. Calculate Statistics --
+
+     # 1. Semantic Similarity (Do the models agree?)
+     emb1 = similarity_model.encode(cap1, convert_to_tensor=True)
+     emb2 = similarity_model.encode(cap2, convert_to_tensor=True)
+     sim_score = util.pytorch_cos_sim(emb1, emb2).item()
+
+     # 2. ROUGE Scores (How accurate are they vs Ground Truth?)
+     rouge_output = "N/A (No Ground Truth provided)"
+     if ground_truth and ground_truth.strip() != "":
+         # Calculate scores
+         r1 = rouge.compute(predictions=[cap1], references=[ground_truth])
+         r2 = rouge.compute(predictions=[cap2], references=[ground_truth])
+
+         # Format the ROUGE output nicely
+         rouge_output = (
+             f"Model 1 ROUGE-L: {r1['rougeL']:.3f}\n"
+             f"Model 2 ROUGE-L: {r2['rougeL']:.3f}\n"
+             f"(Higher is better)"
          )

+     # -- D. Format Output Strings --
+     # Create clean, formatted strings for the large textboxes
+
+     out1 = (
+         f"CAPTION: {cap1}\n"
+         f"-----------------------------\n"
+         f"DETECTED VIBE: {vibe1_label}\n"
+         f"CONFIDENCE: {vibe1_score:.1%}"
      )

+     out2 = (
+         f"CAPTION: {cap2}\n"
+         f"-----------------------------\n"
+         f"DETECTED VIBE: {vibe2_label}\n"
+         f"CONFIDENCE: {vibe2_score:.1%}"
      )
+
+     stats = (
+         f"--- 1. MODEL AGREEMENT (Semantic Similarity) ---\n"
+         f"Score: {sim_score:.3f}\n"
+         f"(Scale: 0.0 = Different, 1.0 = Identical)\n\n"
+         f"--- 2. OBJECT IDENTIFICATION ACCURACY (ROUGE) ---\n"
+         f"Ground Truth: '{ground_truth}'\n"
+         f"{rouge_output}"
+     )
+
+     return out1, out2, stats
+
+ # ==============================================================================
+ # SECTION 4: GRADIO INTERFACE
+ # ==============================================================================
+
+ # Define Inputs
+ image_input = gr.Image(type="pil", label="Upload Image")
+ text_input = gr.Textbox(label="Ground Truth Description", placeholder="e.g. 'A peaceful dog on a beach'")
+
+ # Define Outputs with LARGER viewing areas (lines=5 or 10)
+ output_m1 = gr.Textbox(label="Model 1 (BLIP) Analysis", lines=4)
+ output_m2 = gr.Textbox(label="Model 2 (ViT-GPT2) Analysis", lines=4)
+ output_stats = gr.Textbox(label="Comparison Metrics & Statistics", lines=10)
+
+ # Create Interface
+ interface = gr.Interface(
+     fn=analyze_image,
+     inputs=[image_input, text_input],
+     outputs=[output_m1, output_m2, output_stats],
+     title="Multimodal AI: Nuanced Image Analysis",
+     description="This application uses two Image Captioning models (BLIP & ViT-GPT2) to identify objects, Zero-Shot Classification to detect emotional vibes (Happy, Sad, Angry, etc.), and calculates ROUGE/Similarity metrics.",
+     examples=[
+         ["images/1.png", "A peaceful dog on a sunny beach"],
+         ["images/2.png", "Sad men carrying a casket at a funeral"],
+         ["images/3.png", "Happy kids at a birthday party"],
+         ["images/4.png", "An angry man in a car"],
+         ["images/5.png", "Two people happy mountain biking"],
+         ["images/6.png", "A man upset about his food at a restaurant"],
+         ["images/7.png", "A couple happy at a restaurant"],
+         ["images/8.png", "A sad woman reading a book"],
+         ["images/9.png", "People scared at a movie"],
+         ["images/10.png", "Two tigers fighting"]
+     ]
+ )

  if __name__ == "__main__":
+     interface.launch()
 
requirements.txt CHANGED
@@ -1,11 +1,11 @@
- torch
- torchvision
  transformers
- timm
  gradio
- Pillow
- numpy
- scipy
- accelerate
- pycocotools
- exifread
 
+ # requirements.txt
+
  transformers
+ torch
  gradio
+ pillow
+ sentence-transformers
+ evaluate
+ rouge_score
+ absl-py
+ scikit-learn