ProfRom committed
Commit 5206179 · verified · 1 Parent(s): 01c9097

Gailey - Sanity Check 2

Files changed (1): app.py +202 -98
app.py CHANGED
@@ -1,100 +1,204 @@
 
-import gradio as gr
+# app.py Lazy Loaded Multimodal AI System
+#
+# Models load ONLY when needed to avoid memory overflow
+# Works on Hugging Face free CPU Spaces
+
 import torch
-import os
-import tempfile
-from huggingface_hub import login
-from transformers import AutoProcessor, AutoModelForVisualQuestionAnswering, infer_device, PaliGemmaForConditionalGeneration
-from accelerate import Accelerator
-
-# login to Hugging Face
-# login(token=os.getenv('HF_TOKEN'))
-
-# Set the device
-device = infer_device()
-
-# MODEL 1: BLIP-VQA
-processor = AutoProcessor.from_pretrained("Salesforce/blip-vqa-base")
-model = AutoModelForVisualQuestionAnswering.from_pretrained("Salesforce/blip-vqa-base").to(device)
-
-# Define inference function for Model 1
-def process_image(image, prompt):
-    inputs = processor(image, text=prompt, return_tensors="pt").to(device, torch.float16)
-
-    try:
-        # Generate output from the model
-        output = model.generate(**inputs, max_new_tokens=10)
-
-        # Decode and return the output
-        decoded_output = processor.batch_decode(output, skip_special_tokens=True)[0].strip()
-
-        # remove prompt from output
-        if decoded_output.startswith(prompt):
-            return decoded_output[len(prompt):].strip()
-        return decoded_output
-    except Exception as e:
-        print(f"Error in Model 1: {e}")
-        return "An error occurred during processing for Model 1."
-
-
-# MODEL 2: PaliGemma
-processor2 = AutoProcessor.from_pretrained("google/paligemma-3b-pt-224")
-model2 = PaliGemmaForConditionalGeneration.from_pretrained(
-    "google/paligemma-3b-mix-224",
-    torch_dtype=torch.bfloat16
-).to(device)
-
-
-# Define inference function for Model 2
-def process_image2(image, prompt):
-    inputs2 = processor2(
-        text=prompt,
-        images=image,
-        return_tensors="pt"
-    ).to(device, model2.dtype)
-
-    try:
-        output = model2.generate(**inputs2, max_new_tokens=10)
-        decoded_output = processor2.batch_decode(
-            output[:, inputs2["input_ids"].shape[1]:],
-            skip_special_tokens=True
-        )[0].strip()
-
-        return decoded_output
-    except Exception as e:
-        print(f"Error in Model 2: {e}")
-        return "An error occurred during processing for Model 2. Ensure your hardware supports bfloat16 or adjust the torch_dtype."
-
-
-# GRADIO INTERFACE
-inputs_model1 = [
-    gr.Image(type="pil"),
-    gr.Textbox(label="Prompt", placeholder="Enter your question")
-]
-inputs_model2 = [
-    gr.Image(type="pil"),
-    gr.Textbox(label="Prompt", placeholder="Enter your question")
-]
-
-outputs_model1 = gr.Textbox(label="Answer")
-outputs_model2 = gr.Textbox(label="Answer")
-
-# Create the Gradio apps for each model
-model1_inf = gr.Interface(
-    fn=process_image,
-    inputs=inputs_model1,
-    outputs=outputs_model1,
-    title="Model 1: BLIP-VQA-Base",
-    description="Ask a question about the uploaded image using BLIP."
-)
-
-model2_inf = gr.Interface(
-    fn=process_image2,
-    inputs=inputs_model2,
-    outputs=outputs_model2,
-    title="Model 2: PaliGemma",
-    description="Ask a question about the uploaded image using PaliGemma."
-)
-
-demo = gr.TabbedInterface([model1_inf, model2_inf], ["Model 1 (BLIP)", "Model 2 (PaliGemma)"])
-demo.launch(share=True)
+import gradio as gr
+
+device = torch.device("cpu")
+
+
+# ---------------------------------------------------------
+# LAZY MODEL LOADERS
+# ---------------------------------------------------------
+
+def load_caption_model():
+    from transformers import BlipProcessor, BlipForConditionalGeneration
+    model_name = "Salesforce/blip-image-captioning-base"
+    processor = BlipProcessor.from_pretrained(model_name)
+    model = BlipForConditionalGeneration.from_pretrained(model_name).to(device)
+    return processor, model
+
+
+def load_sentiment_model():
+    from transformers import pipeline
+    return pipeline(
+        "sentiment-analysis",
+        model="distilbert-base-uncased-finetuned-sst-2-english"
+    )
+
+
+def load_vqa_model():
+    from transformers import BlipProcessor, BlipForQuestionAnswering
+    model_name = "Salesforce/blip-vqa-base"
+    processor = BlipProcessor.from_pretrained(model_name)
+    model = BlipForQuestionAnswering.from_pretrained(model_name).to(device)
+    return processor, model
+
+
+def load_detr_model():
+    from transformers import DetrImageProcessor, DetrForObjectDetection
+    processor = DetrImageProcessor.from_pretrained("facebook/detr-resnet-50")
+    model = DetrForObjectDetection.from_pretrained("facebook/detr-resnet-50").to(device)
+    return processor, model
+
+
+def load_vit_model():
+    from transformers import ViTImageProcessor, ViTForImageClassification
+    model_name = "google/vit-base-patch16-224"
+    processor = ViTImageProcessor.from_pretrained(model_name)
+    model = ViTForImageClassification.from_pretrained(model_name).to(device)
+    return processor, model
+
+
+def load_llm():
+    from transformers import AutoTokenizer, AutoModelForCausalLM
+    name = "gpt2"
+    tokenizer = AutoTokenizer.from_pretrained(name)
+    model = AutoModelForCausalLM.from_pretrained(name).to(device)
+    return tokenizer, model
+
+
+# ---------------------------------------------------------
+# TASKS
+# ---------------------------------------------------------
+
+def generate_caption(image):
+    processor, model = load_caption_model()
+    inputs = processor(images=image, return_tensors="pt").to(device)
+    with torch.no_grad():
+        out_ids = model.generate(**inputs, max_new_tokens=30)
+    return processor.decode(out_ids[0], skip_special_tokens=True)
+
+
+def analyze_sentiment(text):
+    sentiment = load_sentiment_model()
+    out = sentiment(text)[0]
+    return out["label"], round(out["score"] * 100, 2)
+
+
+def vqa_answer(image, question):
+    processor, model = load_vqa_model()
+    inputs = processor(images=image, text=question, return_tensors="pt").to(device)
+    with torch.no_grad():
+        out = model.generate(**inputs)
+    return processor.decode(out[0], skip_special_tokens=True)
+
+
+def detect_objects(image):
+    processor, model = load_detr_model()
+    inputs = processor(images=image, return_tensors="pt").to(device)
+
+    with torch.no_grad():
+        outputs = model(**inputs)
+
+    target_sizes = torch.tensor([image.size[::-1]])
+    results = processor.post_process_object_detection(outputs, target_sizes=target_sizes)[0]
+
+    detections = []
+    for score, label, box in zip(results["scores"], results["labels"], results["boxes"]):
+        if score > 0.3:
+            detections.append(
+                f"{model.config.id2label[label.item()]} (score {round(score.item(), 2)})"
+            )
+    if len(detections) == 0:
+        return ["No high-confidence objects detected"]
+    return detections
+
+
+def classify_scene(image):
+    processor, model = load_vit_model()
+    inputs = processor(images=image, return_tensors="pt").to(device)
+    with torch.no_grad():
+        logits = model(**inputs).logits
+    label = logits.argmax(-1).item()
+    return model.config.id2label[label]
+
+
+def rewrite_caption(caption, style):
+    tokenizer, model = load_llm()
+
+    if style == "Short":
+        prompt = f"Summarize: {caption}"
+    elif style == "Creative":
+        prompt = f"Rewrite creatively: {caption}"
+    elif style == "Technical":
+        prompt = f"Rewrite in technical detail: {caption}"
+    else:
+        prompt = caption
+
+    inputs = tokenizer.encode(prompt, return_tensors="pt").to(device)
+    with torch.no_grad():
+        outputs = model.generate(inputs, max_new_tokens=60)
+    return tokenizer.decode(outputs[0], skip_special_tokens=True)
+
+
+def extract_metadata(image):
+    width, height = image.size
+    meta = f"Dimensions: {width} x {height}\n"
+    meta += "EXIF data detected\n" if "exif" in image.info else "No EXIF data available\n"
+    return meta
+
+
+# ---------------------------------------------------------
+# MAIN LOOP
+# ---------------------------------------------------------
+
+def process_all(image, question, style):
+    if image is None:
+        return ["No image"] * 8
+
+    caption = generate_caption(image)
+    sentiment_label, sentiment_score = analyze_sentiment(caption)
+    vqa = vqa_answer(image, question) if question else "No question asked"
+    objects = detect_objects(image)
+    scene = classify_scene(image)
+    rewritten = rewrite_caption(caption, style)
+    metadata = extract_metadata(image)
+
+    return caption, sentiment_label, sentiment_score, vqa, objects, scene, rewritten, metadata
+
+
+# ---------------------------------------------------------
+# GRADIO UI
+# ---------------------------------------------------------
+
+with gr.Blocks(title="Multimodal AI System (Lazy Loaded)") as demo:
+    gr.Markdown("# **Multimodal AI System**")
+
+    with gr.Row():
+        image_input = gr.Image(type="pil", label="Upload Image")
+        question_input = gr.Textbox(label="Ask a Question")
+        style_input = gr.Dropdown(["Short", "Creative", "Technical"], label="Caption Style")
+
+    run_btn = gr.Button("Run All Tools")
+
+    caption = gr.Textbox(label="Generated Caption")
+    sentiment_label = gr.Textbox(label="Sentiment Label")
+    sentiment_score = gr.Number(label="Sentiment Score")
+    vqa_output = gr.Textbox(label="VQA Answer")
+    objects_output = gr.JSON(label="Detected Objects")
+    scene_output = gr.Textbox(label="Scene Classification")
+    rewritten_output = gr.Textbox(label="Rewritten Caption")
+    metadata_output = gr.Textbox(label="Image Metadata")
+
+    run_btn.click(
+        process_all,
+        [image_input, question_input, style_input],
+        [
+            caption,
+            sentiment_label,
+            sentiment_score,
+            vqa_output,
+            objects_output,
+            scene_output,
+            rewritten_output,
+            metadata_output
+        ]
+    )
+
+
+if __name__ == "__main__":
+    demo.launch()
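
A note on the lazy-loading pattern this commit introduces: process_all calls each load_* function on every click of "Run All Tools", so every model is re-instantiated per request. If repeated clicks prove slow, one possible refinement is to memoize the loaders with functools.lru_cache, which keeps the lazy first-use behavior while trading resident memory for per-click latency. A minimal sketch, not part of this commit (the _cached name is hypothetical):

# Sketch only: memoized variant of one loader from the diff above.
# functools.lru_cache keeps the (processor, model) pair resident after
# the first call, so later clicks skip re-instantiation entirely.
from functools import lru_cache

@lru_cache(maxsize=1)
def load_caption_model_cached():  # hypothetical name, mirrors load_caption_model
    from transformers import BlipProcessor, BlipForConditionalGeneration
    model_name = "Salesforce/blip-image-captioning-base"
    processor = BlipProcessor.from_pretrained(model_name)
    model = BlipForConditionalGeneration.from_pretrained(model_name)
    return processor, model

The uncached version keeps steady-state memory lower (models are garbage-collected after each call), which may matter more than speed on a free CPU Space; the cached version suits Spaces with headroom for all six models.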
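
Since the commit message is a sanity check, the pipeline can also be exercised once without starting the UI. A hypothetical smoke test, assuming the new file is saved as app.py (the first run downloads all six models):

# Hypothetical smoke test for the committed pipeline (not in the diff).
# Importing app builds the Blocks UI but does not launch it, because
# demo.launch() is guarded by the __main__ check.
from PIL import Image
from app import process_all

img = Image.new("RGB", (224, 224), color="gray")  # synthetic test image
results = process_all(img, "What color is the image?", "Short")
print(results[0])  # generated caption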