# app.py — BLIP-2 image-captioning Streamlit app
# (originally uploaded by manaskhan; commit 3abbd00)
import streamlit as st
import torch
from transformers import Blip2Processor, Blip2ForConditionalGeneration
from PIL import Image
# Configure the Streamlit page (tab title, icon, layout).
# Must run before any other Streamlit UI call.
_PAGE_SETTINGS = {
    "page_title": "BLIP-2 Image Captioning",
    "page_icon": "📸",
    "layout": "wide",
}
st.set_page_config(**_PAGE_SETTINGS)
# --- Model Loading (using caching for efficiency) ---
# The @st.cache_resource decorator ensures the model and processor are loaded only once
# per process. This is crucial for a performant Streamlit app on Hugging Face Spaces.
@st.cache_resource
def load_model():
    """
    Load the BLIP-2 processor and model from the Hugging Face Hub.

    Uses `Salesforce/blip2-opt-2.7b`, the smaller BLIP-2 variant. On a CUDA
    machine the model is loaded in 8-bit with fp16 weights to reduce memory
    usage; on CPU it falls back to a plain float32 load, because 8-bit
    quantization (bitsandbytes) and fp16 inference require a GPU.

    Returns:
        (processor, model, device) on success, or (None, None, None) if
        loading fails (the error is surfaced in the UI via st.error/st.info).
    """
    # Check if a CUDA-enabled GPU is available. If not, use CPU.
    device = "cuda" if torch.cuda.is_available() else "cpu"
    try:
        # The processor handles image preprocessing and token decoding.
        processor = Blip2Processor.from_pretrained("Salesforce/blip2-opt-2.7b")
        if device == "cuda":
            # 8-bit loading saves memory, which matters on constrained
            # deployments such as Hugging Face Spaces.
            model = Blip2ForConditionalGeneration.from_pretrained(
                "Salesforce/blip2-opt-2.7b",
                device_map="auto",
                load_in_8bit=True,
                torch_dtype=torch.float16,
            )
        else:
            # bitsandbytes 8-bit quantization and fp16 are CUDA-only;
            # on CPU we must load full-precision weights instead.
            model = Blip2ForConditionalGeneration.from_pretrained(
                "Salesforce/blip2-opt-2.7b",
                torch_dtype=torch.float32,
            )
        return processor, model, device
    except Exception as e:
        st.error(f"Error loading the model: {e}")
        st.info("The model is very large and may require a GPU with at least 15GB of VRAM. "
                "If you're seeing this error, the free tier of Hugging Face Spaces might not be enough.")
        return None, None, None
# --- Main App Interface ---
# Page heading plus a short explanation of what the app does.
st.title("📸 BLIP-2 Image Captioning AI")
intro_text = (
    "Upload an image, and this application will generate a descriptive caption using the powerful "
    "[BLIP-2 model](https://huggingface.co/Salesforce/blip2-opt-2.7b) from Hugging Face."
)
st.write(intro_text)
# Load the (cached) model and processor; all three are None on failure.
processor, model, device = load_model()
if model and processor:
    # Create a file uploader widget for the image to caption.
    uploaded_file = st.file_uploader(
        "Choose an image...",
        type=["jpg", "jpeg", "png", "bmp"],
        help="Upload an image file to get a caption."
    )
    if uploaded_file is not None:
        # Display the uploaded image.
        # NOTE: use_container_width replaces the deprecated use_column_width,
        # which newer Streamlit releases have removed.
        st.image(uploaded_file, caption="Uploaded Image", use_container_width=True)
        st.write("")
        st.info("Generating caption...")
        try:
            # Open the uploaded file as an RGB PIL image (BLIP-2 expects 3 channels).
            raw_image = Image.open(uploaded_file).convert("RGB")
            # Preprocess the image into model-ready tensors on the right device.
            inputs = processor(images=raw_image, return_tensors="pt").to(device)
            if device == "cuda":
                # The model weights are fp16 on GPU, so the pixel inputs must
                # match; fp16 is not supported for CPU generation, so we only
                # cast when running on CUDA.
                inputs = inputs.to(torch.float16)
            # Generate up to 50 new tokens so captions can be reasonably long.
            out = model.generate(**inputs, max_new_tokens=50)
            # Decode the generated token ids back into text.
            caption = processor.decode(out[0], skip_special_tokens=True).strip()
            # Display the generated caption.
            st.success(f"**Caption:** {caption}")
        except Exception as e:
            st.error(f"An error occurred during caption generation: {e}")
else:
    st.warning("The application could not be initialized. Please check the logs for details.")