# Aqib_prototype / app.py
# (Hugging Face Space page header: commit 35d6f94 "Update app.py" by Maham930, verified)
import streamlit as st
from PIL import Image
import torch
from transformers import Blip2Processor, Blip2ForConditionalGeneration
import os
# -----------------------
# Streamlit page setup
# -----------------------
st.set_page_config(page_title="Multimodal Image Understanding AI", layout="centered")
st.title("📸 Multimodal Image Understanding & Storytelling AI")

# Intro blurb listing everything the app produces for an image.
_intro_lines = [
    "Upload an image or use live camera, and get:",
    "- Caption",
    "- Summary",
    "- Detected objects",
    "- Emotion/mood",
    "- Short story inspired by the image",
]
st.markdown("\n".join(_intro_lines))

# -----------------------
# Model settings
# -----------------------
MODEL_NAME = "Salesforce/blip2-flan-t5-xl"  # BLIP-2 with a Flan-T5-XL language head
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
HF_TOKEN = os.getenv("HF_TOKEN")  # Add HF_TOKEN as secret in Spaces (recommended)
@st.cache_resource(show_spinner="🔄 Loading AI model, please wait...")
def load_model():
    """Load the BLIP-2 processor and model once and cache them for the session.

    Returns:
        tuple: (processor, model) ready for inference on DEVICE.
    """
    # Half precision and automatic device placement only make sense on GPU.
    if DEVICE == "cuda":
        dtype, placement = torch.float16, "auto"
    else:
        dtype, placement = torch.float32, None

    processor = Blip2Processor.from_pretrained(MODEL_NAME, use_fast=False, token=HF_TOKEN)
    model = Blip2ForConditionalGeneration.from_pretrained(
        MODEL_NAME,
        torch_dtype=dtype,
        device_map=placement,
        token=HF_TOKEN,
    )
    model.eval()  # inference only: disables dropout etc.
    return processor, model


processor, model = load_model()
# -----------------------
# Image input
# -----------------------
image_file = st.file_uploader("Upload an image", type=["jpg", "jpeg", "png"])
camera_image = st.camera_input("Or take a live picture")

# A live camera shot takes priority over an uploaded file when both exist.
source = camera_image if camera_image else image_file
image = Image.open(source).convert("RGB") if source else None
if image:
    # Fix: `use_column_width` is deprecated (removed in recent Streamlit
    # releases); `use_container_width` is the supported replacement.
    st.image(image, caption="Your Image", use_container_width=True)

    # -----------------------
    # Helper function
    # -----------------------
    def ask_model(prompt):
        """Run `prompt` against the loaded BLIP-2 model for the current image.

        Args:
            prompt (str): Instruction/question about the image.

        Returns:
            str: The model's decoded text answer.
        """
        inputs = processor(images=image, text=prompt, return_tensors="pt").to(DEVICE)
        # Bug fix: on CUDA the model weights are float16 but the processor
        # emits float32 pixel_values, so generate() raises a dtype mismatch.
        # Cast only the image tensor; token ids must stay integral.
        if DEVICE == "cuda" and "pixel_values" in inputs:
            inputs["pixel_values"] = inputs["pixel_values"].to(torch.float16)
        with torch.inference_mode():  # skip autograd bookkeeping at inference
            out = model.generate(**inputs, max_new_tokens=150)
        return processor.decode(out[0], skip_special_tokens=True)

    with st.spinner("🧠 Analyzing image..."):
        caption = ask_model("Describe this image in one factual sentence.")
        summary = ask_model("Give a concise 3–5 line descriptive summary of this image.")
        objects = ask_model("List the main objects and entities visible in this image.")
        emotion = ask_model("Detect the emotional tone or mood of this image (happy, calm, tense, etc.).")
        story = ask_model("Write a short story (5–10 lines) inspired by this image.")

    # -----------------------
    # Output
    # -----------------------
    st.subheader("📝 Caption")
    st.write(caption)
    st.subheader("📄 Summary")
    st.write(summary)
    st.subheader("📦 Detected Objects")
    st.write(objects)
    st.subheader("😊 Emotional Tone")
    st.write(emotion)
    st.subheader("📖 Short Story")
    st.write(story)
else:
    st.info("⬆️ Upload an image or use the camera above to begin.")