Ngo - Final Assignment submission
- app.py +34 -92
- requirements.txt +3 -3
app.py
CHANGED
@@ -1,99 +1,41 @@
 import gradio as gr
+from transformers import Blip2Processor, Blip2ForConditionalGeneration
 import torch
-import os
-import tempfile
-from huggingface_hub import login
-from transformers import AutoProcessor, AutoModelForVisualQuestionAnswering, infer_device, PaliGemmaForConditionalGeneration
-from accelerate import Accelerator
-
-# MODEL 1: BLIP-VQA
-processor = AutoProcessor.from_pretrained("Salesforce/blip-vqa-base")
-model = AutoModelForVisualQuestionAnswering.from_pretrained("Salesforce/blip-vqa-base").to(device)
-
-# Define inference function for Model 1
-def process_image(image, prompt):
-    inputs = processor(image, text=prompt, return_tensors="pt").to(device, torch.float16)
-
-    try:
-        # Generate output from the model
-        output = model.generate(**inputs, max_new_tokens=10)
-
-        # Decode and return the output
-        decoded_output = processor.batch_decode(output, skip_special_tokens=True)[0].strip()
-
-        # remove prompt from output
-        if decoded_output.startswith(prompt):
-            return decoded_output[len(prompt):].strip()
-        return decoded_output
-    except Exception as e:
-        print(f"Error in Model 1: {e}")
-        return "An error occurred during processing for Model 1."
-
-
-# MODEL 2: PaliGemma
-processor2 = AutoProcessor.from_pretrained("google/paligemma-3b-pt-224")
-model2 = PaliGemmaForConditionalGeneration.from_pretrained(
-    "google/paligemma-3b-mix-224",
-    torch_dtype=torch.bfloat16
-).to(device)
-
-
-# Define inference function for Model 2
-def process_image2(image, prompt):
-    inputs2 = processor2(
-        text=prompt,
-        images=image,
-        return_tensors="pt"
-    ).to(device, model2.dtype)
-
-    try:
-        output = model2.generate(**inputs2, max_new_tokens=10)
-        decoded_output = processor2.batch_decode(
-            output[:, inputs2["input_ids"].shape[1]:],
-            skip_special_tokens=True
-        )[0].strip()
-
-        return decoded_output
-    except Exception as e:
-        print(f"Error in Model 2: {e}")
-        return "An error occurred during processing for Model 2. Ensure your hardware supports bfloat16 or adjust the torch_dtype."
-
-
-# GRADIO INTERFACE
-inputs_model1 = [
-    gr.Image(type="pil"),
-    gr.Textbox(label="Prompt", placeholder="Enter your question")
-]
-inputs_model2 = [
-    gr.Image(type="pil"),
-    gr.Textbox(label="Prompt", placeholder="Enter your question")
-]
-
-outputs_model1 = gr.Textbox(label="Answer")
-outputs_model2 = gr.Textbox(label="Answer")
-
-# Create the Gradio apps for each model
-model1_inf = gr.Interface(
-    fn=process_image,
-    inputs=inputs_model1,
-    outputs=outputs_model1,
-    title="Model 1: BLIP-VQA-Base",
-    description="Ask a question about the uploaded image using BLIP."
-)
-
-demo.launch(share=True)
+
+# Load pre-trained BLIP-2 model and processor
+processor = Blip2Processor.from_pretrained("Salesforce/blip2-opt-2.7b")
+model = Blip2ForConditionalGeneration.from_pretrained(
+    "Salesforce/blip2-opt-2.7b",
+    torch_dtype=torch.float16
+)
+
+def predict(image, question=None):
+    # If no question provided, generate a caption
+    if question is None or question.strip() == "":
+        inputs = processor(image, return_tensors="pt")
+    else:
+        inputs = processor(image, question, return_tensors="pt")
+
+    # Move to GPU if available
+    device = "cuda" if torch.cuda.is_available() else "cpu"
+    inputs = inputs.to(device)
+    model.to(device)
+
+    # Generate output
+    out = model.generate(**inputs, max_new_tokens=50)
+    result = processor.decode(out[0], skip_special_tokens=True)
+    return result
+
+# Gradio interface
+iface = gr.Interface(
+    fn=predict,
+    inputs=[
+        gr.Image(type="pil", label="Upload Image"),
+        gr.Textbox(label="Optional Question", placeholder="Ask something about the image...")
+    ],
+    outputs=gr.Textbox(label="Result"),
+    title="BLIP-2 Multimodal Assistant",
+    description="Upload an image and get a caption. Optionally, ask a question about the image."
+)
+
+iface.launch()
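One caveat on the new predict function, offered as a sketch under stated assumptions rather than a description of this Space's behavior: the model is loaded in torch.float16, but inputs.to(device) moves the tensors without casting them, so on a GPU the float32 pixel values emitted by the processor can raise a dtype mismatch, while on CPU half-precision generation is often unsupported. A defensive variant, reusing the processor and model defined above (the .to(device, dtype) form matches the usage shown on the BLIP-2 model card), could look like this:

import torch

def predict(image, question=None):
    # Caption when no question is given; otherwise answer the question.
    if question is None or question.strip() == "":
        inputs = processor(image, return_tensors="pt")
    else:
        inputs = processor(image, question, return_tensors="pt")

    # Use float16 only where it is actually supported (CUDA); fall back
    # to float32 on CPU, where half-precision matmuls commonly fail.
    device = "cuda" if torch.cuda.is_available() else "cpu"
    dtype = torch.float16 if device == "cuda" else torch.float32
    model.to(device, dtype)
    # This casts only the floating-point tensors (pixel_values);
    # input_ids remain integer tensors.
    inputs = inputs.to(device, dtype)

    out = model.generate(**inputs, max_new_tokens=50)
    return processor.decode(out[0], skip_special_tokens=True)

Hoisting the model.to(...) call out of predict would also avoid repeating the transfer on every request; it only needs to run once.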
requirements.txt
CHANGED
@@ -1,4 +1,4 @@
-
+gradio>=4.0
+transformers>=4.30
 torch
-
-gradio
+pillow
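The pillow pin makes the PIL dependency explicit, since gr.Image(type="pil") hands PIL images straight to the processor. As a hypothetical sanity check, not part of the commit, the pinned environment can be verified before launching the Space:

# Hypothetical check: confirm the pinned packages import and report versions.
import gradio
import transformers
import torch
import PIL

print(f"gradio {gradio.__version__} (>=4.0 required)")
print(f"transformers {transformers.__version__} (>=4.30 required)")
print(f"torch {torch.__version__}")
print(f"pillow {PIL.__version__}")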