# Hugging Face Spaces app — the deployed Space's build log reported "Runtime error".
# -----------------------
# Imports
# -----------------------
import os

import streamlit as st
import torch
from PIL import Image
from transformers import Blip2Processor, Blip2ForConditionalGeneration

# -----------------------
# Streamlit config
# -----------------------
st.set_page_config(page_title="Multimodal Image Understanding AI", layout="centered")
st.title("📸 Multimodal Image Understanding & Storytelling AI")
st.markdown(
    "Upload an image or use live camera, and get:\n"
    "- Caption\n"
    "- Summary\n"
    "- Detected objects\n"
    "- Emotion/mood\n"
    "- Short story inspired by the image"
)

# -----------------------
# Model settings
# -----------------------
MODEL_NAME = "Salesforce/blip2-flan-t5-xl"
# Prefer GPU when the Space hardware provides one; fall back to CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
HF_TOKEN = os.getenv("HF_TOKEN")  # Add HF_TOKEN as secret in Spaces (recommended)
@st.cache_resource
def load_model():
    """Load the BLIP-2 processor and model once per process.

    BUGFIX: without st.cache_resource, Streamlit re-executes this on every
    rerun (every widget interaction), reloading the multi-GB checkpoint each
    time — which exhausts the Space's memory and crashes it.

    Returns:
        (Blip2Processor, Blip2ForConditionalGeneration): processor/model pair,
        with the model in eval mode.
    """
    processor = Blip2Processor.from_pretrained(MODEL_NAME, use_fast=False, token=HF_TOKEN)
    model = Blip2ForConditionalGeneration.from_pretrained(
        MODEL_NAME,
        # fp16 halves GPU memory; fp32 keeps CPU inference numerically safe.
        torch_dtype=torch.float16 if DEVICE == "cuda" else torch.float32,
        # device_map="auto" lets accelerate shard/place the model on GPU.
        device_map="auto" if DEVICE == "cuda" else None,
        token=HF_TOKEN,
    )
    model.eval()  # inference only — disable dropout etc.
    return processor, model


processor, model = load_model()
# -----------------------
# Image input
# -----------------------
image_file = st.file_uploader("Upload an image", type=["jpg", "jpeg", "png"])
camera_image = st.camera_input("Or take a live picture")

# A live camera capture takes priority over an uploaded file.
image = None
source = camera_image if camera_image else image_file
if source is not None:
    image = Image.open(source).convert("RGB")
if image:
    # use_container_width replaces the deprecated (now removed) use_column_width.
    st.image(image, caption="Your Image", use_container_width=True)

    # -----------------------
    # Helper function
    # -----------------------
    def ask_model(prompt):
        """Run one text prompt against BLIP-2 for the currently selected image.

        Args:
            prompt: Instruction/question to condition generation on.

        Returns:
            The decoded model answer as a plain string.
        """
        inputs = processor(images=image, text=prompt, return_tensors="pt")
        # BUGFIX: on CUDA the model weights are float16 but the processor emits
        # float32 pixel_values, so generate() fails with a dtype-mismatch
        # RuntimeError. BatchFeature.to casts only floating-point tensors,
        # so input_ids stay integral.
        if DEVICE == "cuda":
            inputs = inputs.to(DEVICE, torch.float16)
        else:
            inputs = inputs.to(DEVICE)
        with torch.no_grad():  # inference only — skip autograd bookkeeping
            out = model.generate(**inputs, max_new_tokens=150)
        return processor.decode(out[0], skip_special_tokens=True)

    with st.spinner("🧠 Analyzing image..."):
        caption = ask_model("Describe this image in one factual sentence.")
        summary = ask_model("Give a concise 3–5 line descriptive summary of this image.")
        objects = ask_model("List the main objects and entities visible in this image.")
        emotion = ask_model("Detect the emotional tone or mood of this image (happy, calm, tense, etc.).")
        story = ask_model("Write a short story (5–10 lines) inspired by this image.")

    # -----------------------
    # Output
    # -----------------------
    st.subheader("📝 Caption")
    st.write(caption)

    st.subheader("📄 Summary")
    st.write(summary)

    st.subheader("📦 Detected Objects")
    st.write(objects)

    st.subheader("🎭 Emotional Tone")
    st.write(emotion)

    st.subheader("📖 Short Story")
    st.write(story)
else:
    st.info("⬆️ Upload an image or use the camera above to begin.")