Spaces:

ProfRom
/

TestSpace3

Sleeping

App Files Files Community

ProfRom committed on Dec 3, 2025

Commit

adbc5fd

verified ·

1 Parent(s): e95fe57

Smallwood - Sanity Check 3

Browse files

Files changed (1) hide show

app.py +122 -199

app.py CHANGED Viewed

@@ -1,204 +1,127 @@
-# app.py — Lazy Loaded Multimodal AI System
-#
-# Models load ONLY when needed to avoid memory overflow
-# Works on Hugging Face free CPU Spaces
-import torch
 import gradio as gr
-device = torch.device("cpu")
-# ---------------------------------------------------------
-# LAZY MODEL LOADERS
-# ---------------------------------------------------------
-def load_caption_model():
-    from transformers import BlipProcessor, BlipForConditionalGeneration
-    model_name = "Salesforce/blip-image-captioning-base"
-    processor = BlipProcessor.from_pretrained(model_name)
-    model = BlipForConditionalGeneration.from_pretrained(model_name).to(device)
-    return processor, model
-def load_sentiment_model():
-    from transformers import pipeline
-    return pipeline(
-        "sentiment-analysis",
-        model="distilbert-base-uncased-finetuned-sst-2-english"
-    )
-def load_vqa_model():
-    from transformers import BlipProcessor, BlipForQuestionAnswering
-    model_name = "Salesforce/blip-vqa-base"
-    processor = BlipProcessor.from_pretrained(model_name)
-    model = BlipForQuestionAnswering.from_pretrained(model_name).to(device)
-    return processor, model
-def load_detr_model():
-    from transformers import DetrImageProcessor, DetrForObjectDetection
-    processor = DetrImageProcessor.from_pretrained("facebook/detr-resnet-50")
-    model = DetrForObjectDetection.from_pretrained("facebook/detr-resnet-50").to(device)
-    return processor, model
-def load_vit_model():
-    from transformers import ViTImageProcessor, ViTForImageClassification
-    model_name = "google/vit-base-patch16-224"
-    processor = ViTImageProcessor.from_pretrained(model_name)
-    model = ViTForImageClassification.from_pretrained(model_name).to(device)
-    return processor, model
-def load_llm():
-    from transformers import AutoTokenizer, AutoModelForCausalLM
-    name = "gpt2"
-    tokenizer = AutoTokenizer.from_pretrained(name)
-    model = AutoModelForCausalLM.from_pretrained(name).to(device)
-    return tokenizer, model
-# ---------------------------------------------------------
-# TASKS
-# ---------------------------------------------------------
-def generate_caption(image):
-    processor, model = load_caption_model()
-    inputs = processor(images=image, return_tensors="pt").to(device)
-    with torch.no_grad():
-        out_ids = model.generate(**inputs, max_new_tokens=30)
-    return processor.decode(out_ids[0], skip_special_tokens=True)
-def analyze_sentiment(text):
-    sentiment = load_sentiment_model()
-    out = sentiment(text)[0]
-    return out["label"], round(out["score"] * 100, 2)
-def vqa_answer(image, question):
-    processor, model = load_vqa_model()
-    inputs = processor(images=image, text=question, return_tensors="pt").to(device)
-    with torch.no_grad():
-        out = model.generate(**inputs)
-    return processor.decode(out[0], skip_special_tokens=True)
-def detect_objects(image):
-    processor, model = load_detr_model()
-    inputs = processor(images=image, return_tensors="pt").to(device)
-    with torch.no_grad():
-        outputs = model(**inputs)
-    target_sizes = torch.tensor([image.size[::-1]])
-    results = processor.post_process_object_detection(outputs, target_sizes=target_sizes)[0]
-    detections = []
-    for score, label, box in zip(results["scores"], results["labels"], results["boxes"]):
-        if score > 0.3:
-            detections.append(
-                f"{model.config.id2label[label.item()]} (score {round(score.item(), 2)})"
-            )
-    if len(detections) == 0:
-        return ["No high-confidence objects detected"]
-    return detections
-def classify_scene(image):
-    processor, model = load_vit_model()
-    inputs = processor(images=image, return_tensors="pt").to(device)
-    with torch.no_grad():
-        logits = model(**inputs).logits
-    label = logits.argmax(-1).item()
-    return model.config.id2label[label]
-def rewrite_caption(caption, style):
-    tokenizer, model = load_llm()
-    if style == "Short":
-        prompt = f"Summarize: {caption}"
-    elif style == "Creative":
-        prompt = f"Rewrite creatively: {caption}"
-    elif style == "Technical":
-        prompt = f"Rewrite in technical detail: {caption}"
-    else:
-        prompt = caption
-    inputs = tokenizer.encode(prompt, return_tensors="pt").to(device)
-    with torch.no_grad():
-        outputs = model.generate(inputs, max_new_tokens=60)
-    return tokenizer.decode(outputs[0], skip_special_tokens=True)
-def extract_metadata(image):
-    width, height = image.size
-    meta = f"Dimensions: {width} x {height}\n"
-    meta += "EXIF data detected\n" if "exif" in image.info else "No EXIF data available\n"
-    return meta
-# ---------------------------------------------------------
-# MAIN LOOP
-# ---------------------------------------------------------
-def process_all(image, question, style):
-    if image is None:
-        return ["No image"] * 8
-    caption = generate_caption(image)
-    sentiment_label, sentiment_score = analyze_sentiment(caption)
-    vqa = vqa_answer(image, question) if question else "No question asked"
-    objects = detect_objects(image)
-    scene = classify_scene(image)
-    rewritten = rewrite_caption(caption, style)
-    metadata = extract_metadata(image)
-    return caption, sentiment_label, sentiment_score, vqa, objects, scene, rewritten, metadata
-# ---------------------------------------------------------
-# GRADIO UI
-# ---------------------------------------------------------
-with gr.Blocks(title="Multimodal AI System (Lazy Loaded)") as demo:
-    gr.Markdown("# **Multimodal AI System**")
-    with gr.Row():
-        image_input = gr.Image(type="pil", label="Upload Image")
-        question_input = gr.Textbox(label="Ask a Question")
-        style_input = gr.Dropdown(["Short", "Creative", "Technical"], label="Caption Style")
-    run_btn = gr.Button("Run All Tools")
-    caption = gr.Textbox(label="Generated Caption")
-    sentiment_label = gr.Textbox(label="Sentiment Label")
-    sentiment_score = gr.Number(label="Sentiment Score")
-    vqa_output = gr.Textbox(label="VQA Answer")
-    objects_output = gr.JSON(label="Detected Objects")
-    scene_output = gr.Textbox(label="Scene Classification")
-    rewritten_output = gr.Textbox(label="Rewritten Caption")
-    metadata_output = gr.Textbox(label="Image Metadata")
-    run_btn.click(
-        process_all,
-        [image_input, question_input, style_input],
         [
-            caption,
-            sentiment_label,
-            sentiment_score,
-            vqa_output,
-            objects_output,
-            scene_output,
-            rewritten_output,
-            metadata_output
-        ]
     )
-if __name__ == "__main__":
-    demo.launch()

 import gradio as gr
+from transformers import pipeline
+from PIL import ImageDraw, ImageFont
+import textwrap
# --- LOAD MODELS ---
# The three pipelines are created once, at import time, and reused by every
# request handled by multimodal_analysis.
print("Loading Models...")
caption_pipeline = pipeline("image-to-text", model="Salesforce/blip-image-captioning-base")
classification_pipeline = pipeline("image-classification", model="google/vit-base-patch16-224")
# Name the sentiment checkpoint explicitly instead of relying on the task's
# implicit default model, so the model in use cannot change silently.
sentiment_pipeline = pipeline(
    "sentiment-analysis",
    model="distilbert-base-uncased-finetuned-sst-2-english",
)
+# --- DRAWING FUNCTION ---
def add_caption_to_image(image, text):
    """Draw ``text`` as a centred caption banner near the bottom of ``image``.

    The image is modified in place and also returned, so a caller that needs
    the original pixels untouched must pass a copy.

    Args:
        image: PIL image to draw on.
        text: Caption string; it is word-wrapped to fit the image width.

    Returns:
        The same ``image`` object with a dark box and white caption text drawn
        on it. If ``text`` wraps to no lines (empty or whitespace-only), the
        image is returned unchanged.
    """
    image_width, image_height = image.size

    # Layout constants; the character width and line height are rough
    # estimates chosen for the 20 px font requested below.
    font_size = 20
    avg_char_width = 12
    line_height = 24
    margin = 20
    padding = 10

    # 1. Setup Font
    try:
        font = ImageFont.truetype("DejaVuSans.ttf", font_size)
    except OSError:
        # Font file not available on this host: use Pillow's built-in font.
        font = ImageFont.load_default()

    # 2. Wrap Text
    chars_per_line = max(10, int((image_width - 2 * margin) / avg_char_width))
    lines = textwrap.wrap(text, width=chars_per_line)
    if not lines:
        # Nothing to render; avoid drawing an empty box.
        return image

    # For an RGB image, opening the drawer in "RGBA" mode makes Pillow blend
    # the alpha of the box fill into the picture. Without it the alpha value
    # in the fill tuple has no effect and the box comes out fully opaque.
    # Any other mode pairing is rejected by Pillow, so only RGB opts in.
    if image.mode == "RGB":
        draw = ImageDraw.Draw(image, "RGBA")
    else:
        draw = ImageDraw.Draw(image)

    # 3. Calculate Box Size (each line is measured once and reused below)
    line_widths = []
    for line in lines:
        left, _, right, _ = draw.textbbox((0, 0), line, font=font)
        line_widths.append(right - left)
    max_line_width = max(line_widths)
    total_text_height = len(lines) * line_height
    y_start = image_height - total_text_height - margin
    box_x = (image_width - max_line_width) / 2

    # 4. Draw Box
    draw.rectangle(
        [
            (box_x - padding, y_start - padding),
            (box_x + max_line_width + padding, y_start + total_text_height + padding),
        ],
        fill=(0, 0, 0, 180),
    )

    # 5. Draw Text, each line centred horizontally on the image
    for index, (line, line_width) in enumerate(zip(lines, line_widths)):
        line_x = (image_width - line_width) / 2
        draw.text((line_x, y_start + index * line_height), line, font=font, fill="white")

    return image
+# --- ANALYSIS FUNCTION ---
def multimodal_analysis(input_image):
    """Caption an image, draw the caption on a copy, then classify and score it.

    Args:
        input_image: PIL image supplied by the UI, or ``None`` when nothing
            has been uploaded.

    Returns:
        A 3-tuple ``(image, classification, sentiment)``:

        * ``image`` - copy of the input with the caption drawn on it; the
          undecorated copy if captioning failed; ``None`` if no image given.
        * ``classification`` - ``"<label> (<score>)"`` for the top
          classification result, ``"Error"`` on failure, or
          ``"Upload image first"`` if no image was given.
        * ``sentiment`` - sentiment label of the caption, ``"Error"`` on
          failure, or ``"N/A"`` if no image was given.
    """
    if input_image is None:
        return None, "Upload image first", "N/A"

    # The caption is drawn in place, so work on a copy and keep the original
    # clean for the classifier.
    processed_image = input_image.copy()

    # 1. Caption -- every later step depends on it, so a failure ends the run.
    try:
        caption = caption_pipeline(input_image)[0]['generated_text']
    except Exception as exc:
        # `except Exception` (not a bare except) lets KeyboardInterrupt and
        # SystemExit propagate; the cause is printed instead of being lost.
        print(f"Captioning failed: {exc!r}")
        return processed_image, "Error", "Error"

    # 2. Draw
    final_img = add_caption_to_image(processed_image, caption)

    # 3. Classify (best effort: a failure only degrades this one field)
    try:
        res = classification_pipeline(input_image)
        cls_str = f"{res[0]['label']} ({res[0]['score']:.2f})"
    except Exception as exc:
        print(f"Classification failed: {exc!r}")
        cls_str = "Error"

    # 4. Sentiment (best effort, same policy as classification)
    try:
        sent = sentiment_pipeline(caption)[0]['label']
    except Exception as exc:
        print(f"Sentiment analysis failed: {exc!r}")
        sent = "Error"

    return final_img, cls_str, sent
# --- INTERFACE (Removed Theme to fix crash) ---
# Two-column layout: the input image and the trigger button on the left; the
# captioned image plus the classification and sentiment text boxes on the right.
with gr.Blocks() as demo:
    gr.Markdown("# 🤖 Multimodal AI Analyst")
    gr.Markdown("Select an example image below to see: **Image Captioning**, **Vision Classification**, and **NLP Sentiment Analysis** working together.")
    with gr.Row():
        with gr.Column():
            # type="pil" so the handler receives a PIL image (it calls .copy()).
            image_input = gr.Image(type="pil", label="Input Image")
            submit_btn = gr.Button("🔍 Analyze Image", variant="primary")
        with gr.Column():
            output_image = gr.Image(label="AI Caption Result")
            with gr.Row():
                output_class = gr.Textbox(label="Object Class")
                output_sent = gr.Textbox(label="Caption Sentiment")
    # EXACT FILES FROM YOUR LIST
    # NOTE(review): presumably these files sit next to app.py in the Space
    # repository -- confirm they exist under exactly these names.
    examples = [
        ["Ashe Catcum with Pikachu.png"],
        ["Beautiful sunrise over ocean.png"],
        ["Cat on a couch.png"],
        ["Female Crying.png"],
        ["Lions Football team huddle.png"],
        ["michael jordan trophy.png"],
        ["Puppies playing in grass.png"],
        ["Red Ferrari.png"],
        ["Siamese cat.png"],
        ["Stormy dark sky lightning.png"],
    ]
    gr.Examples(examples=examples, inputs=image_input)
    # The three outputs line up with the tuple returned by multimodal_analysis.
    submit_btn.click(fn=multimodal_analysis, inputs=image_input, outputs=[output_image, output_class, output_sent])

# Start the server only when this file is run as a script, so importing the
# module does not launch the app as a side effect.
if __name__ == "__main__":
    demo.launch()