# Hugging Face Spaces app — the deployed Space's build log reported "Runtime error".
# -----------------------
# Imports
# -----------------------
import os

import streamlit as st
import torch
from PIL import Image
from transformers import Blip2Processor, Blip2ForConditionalGeneration

# -----------------------
# Streamlit config
# -----------------------
st.set_page_config(page_title="Multimodal Image Understanding AI", layout="centered")
st.title("📸 Multimodal Image Understanding & Storytelling AI")
st.markdown(
    "Upload an image or use live camera, and get:\n"
    "- Caption\n"
    "- Summary\n"
    "- Detected objects\n"
    "- Emotion/mood\n"
    "- Short story inspired by the image"
)

# -----------------------
# Model settings
# -----------------------
MODEL_NAME = "Salesforce/blip2-flan-t5-xl"
# Prefer GPU when the Space hardware provides one; fall back to CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
HF_TOKEN = os.getenv("HF_TOKEN")  # Add HF_TOKEN as secret in Spaces (recommended)
@st.cache_resource
def load_model():
    """Load the BLIP-2 processor and model once per process.

    BUGFIX: without st.cache_resource, Streamlit re-executes this on every
    rerun (every widget interaction), reloading the multi-GB checkpoint each
    time — which exhausts the Space's memory and crashes it.

    Returns:
        (Blip2Processor, Blip2ForConditionalGeneration): processor/model pair,
        with the model in eval mode.
    """
    processor = Blip2Processor.from_pretrained(MODEL_NAME, use_fast=False, token=HF_TOKEN)
    model = Blip2ForConditionalGeneration.from_pretrained(
        MODEL_NAME,
        # fp16 halves GPU memory; fp32 keeps CPU inference numerically safe.
        torch_dtype=torch.float16 if DEVICE == "cuda" else torch.float32,
        # device_map="auto" lets accelerate shard/place the model on GPU.
        device_map="auto" if DEVICE == "cuda" else None,
        token=HF_TOKEN,
    )
    model.eval()  # inference only — disable dropout etc.
    return processor, model


processor, model = load_model()
# -----------------------
# Image input
# -----------------------
image_file = st.file_uploader("Upload an image", type=["jpg", "jpeg", "png"])
camera_image = st.camera_input("Or take a live picture")

# A live camera capture takes priority over an uploaded file.
image = None
source = camera_image if camera_image else image_file
if source is not None:
    image = Image.open(source).convert("RGB")
if image:
    # use_container_width replaces the deprecated (now removed) use_column_width.
    st.image(image, caption="Your Image", use_container_width=True)

    # -----------------------
    # Helper function
    # -----------------------
    def ask_model(prompt):
        """Run one text prompt against BLIP-2 for the currently selected image.

        Args:
            prompt: Instruction/question to condition generation on.

        Returns:
            The decoded model answer as a plain string.
        """
        inputs = processor(images=image, text=prompt, return_tensors="pt")
        # BUGFIX: on CUDA the model weights are float16 but the processor emits
        # float32 pixel_values, so generate() fails with a dtype-mismatch
        # RuntimeError. BatchFeature.to casts only floating-point tensors,
        # so input_ids stay integral.
        if DEVICE == "cuda":
            inputs = inputs.to(DEVICE, torch.float16)
        else:
            inputs = inputs.to(DEVICE)
        with torch.no_grad():  # inference only — skip autograd bookkeeping
            out = model.generate(**inputs, max_new_tokens=150)
        return processor.decode(out[0], skip_special_tokens=True)

    with st.spinner("🧠 Analyzing image..."):
        caption = ask_model("Describe this image in one factual sentence.")
        summary = ask_model("Give a concise 3–5 line descriptive summary of this image.")
        objects = ask_model("List the main objects and entities visible in this image.")
        emotion = ask_model("Detect the emotional tone or mood of this image (happy, calm, tense, etc.).")
        story = ask_model("Write a short story (5–10 lines) inspired by this image.")

    # -----------------------
    # Output
    # -----------------------
    st.subheader("📝 Caption")
    st.write(caption)

    st.subheader("📄 Summary")
    st.write(summary)

    st.subheader("📦 Detected Objects")
    st.write(objects)

    st.subheader("🎭 Emotional Tone")
    st.write(emotion)

    st.subheader("📖 Short Story")
    st.write(story)
else:
    st.info("⬆️ Upload an image or use the camera above to begin.")