Spaces:

manaskhan
/

imagecaption-generatorBlip-2

Sleeping

App Files Files Community

manaskhan committed on Sep 13, 2025

Commit

3abbd00

verified ·

1 Parent(s): b4acfa1

Create app.py

Browse files

Files changed (1) hide show

app.py +89 -0

app.py ADDED Viewed

	@@ -0,0 +1,89 @@

+import streamlit as st
+import torch
+from transformers import Blip2Processor, Blip2ForConditionalGeneration
+from PIL import Image
+# Set up the Streamlit page configuration
+st.set_page_config(
+    page_title="BLIP-2 Image Captioning",
+    page_icon="📸",
+    layout="wide",
+)
+# --- Model Loading (using caching for efficiency) ---
+# The @st.cache_resource decorator ensures the model and processor are loaded only once.
+# This is crucial for a performant Streamlit app on Hugging Face Spaces.
+@st.cache_resource
+def load_model():
+    """
+    Loads the BLIP-2 model and processor from Hugging Face Hub.
+    We're using a smaller version (`blip2-opt-2.7b`) that is more suitable for
+    Hugging Face's free tier, though it may still require significant resources.
+    We load the model in 8-bit to reduce memory usage.
+    """
+    # Check if a CUDA-enabled GPU is available. If not, use CPU.
+    device = "cuda" if torch.cuda.is_available() else "cpu"
+    try:
+        # Load the processor and model
+        processor = Blip2Processor.from_pretrained("Salesforce/blip2-opt-2.7b")
+        # We load the model in 8-bit to save memory, which is important for
+        # deployment on platforms like Hugging Face Spaces.
+        model = Blip2ForConditionalGeneration.from_pretrained(
+            "Salesforce/blip2-opt-2.7b",
+            device_map="auto",
+            load_in_8bit=True,
+            torch_dtype=torch.float16
+        )
+        return processor, model, device
+    except Exception as e:
+        st.error(f"Error loading the model: {e}")
+        st.info("The model is very large and may require a GPU with at least 15GB of VRAM. "
+                "If you're seeing this error, the free tier of Hugging Face Spaces might not be enough.")
+        return None, None, None
+# --- Main App Interface ---
+st.title("📸 BLIP-2 Image Captioning AI")
+st.write(
+    "Upload an image, and this application will generate a descriptive caption using the powerful "
+    "[BLIP-2 model](https://huggingface.co/Salesforce/blip2-opt-2.7b) from Hugging Face."
+)
+# Load the model and processor
+processor, model, device = load_model()
+if model and processor:
+    # Create a file uploader widget
+    uploaded_file = st.file_uploader(
+        "Choose an image...",
+        type=["jpg", "jpeg", "png", "bmp"],
+        help="Upload an image file to get a caption."
+    )
+    if uploaded_file is not None:
+        # Display the uploaded image
+        st.image(uploaded_file, caption="Uploaded Image", use_column_width=True)
+        st.write("")
+        st.info("Generating caption...")
+        try:
+            # Open the uploaded image file as a PIL Image
+            raw_image = Image.open(uploaded_file).convert("RGB")
+            # Preprocess the image and generate the caption
+            inputs = processor(images=raw_image, return_tensors="pt").to(device, torch.float16)
+            out = model.generate(**inputs, max_new_tokens=50) # Increased max_new_tokens for longer captions
+            # Decode the generated tokens to text
+            caption = processor.decode(out[0], skip_special_tokens=True).strip()
+            # Display the generated caption
+            st.success(f"**Caption:** {caption}")
+        except Exception as e:
+            st.error(f"An error occurred during caption generation: {e}")
+else:
+    st.warning("The application could not be initialized. Please check the logs for details.")