"""Streamlit app: multimodal image understanding with BLIP-2 (FLAN-T5-XL).

Given an uploaded or live-camera image, the app generates a caption,
a summary, an object list, an emotional-tone estimate, and a short story
by prompting the same vision-language model with different instructions.
"""

import os

import streamlit as st
import torch
from PIL import Image
from transformers import Blip2Processor, Blip2ForConditionalGeneration

# -----------------------
# Streamlit config
# -----------------------
st.set_page_config(page_title="Multimodal Image Understanding AI", layout="centered")

st.title("📸 Multimodal Image Understanding & Storytelling AI")
st.markdown(
    "Upload an image or use live camera, and get:\n"
    "- Caption\n"
    "- Summary\n"
    "- Detected objects\n"
    "- Emotion/mood\n"
    "- Short story inspired by the image"
)

# -----------------------
# Model settings
# -----------------------
MODEL_NAME = "Salesforce/blip2-flan-t5-xl"
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
# Half precision on GPU keeps the 4B-parameter model within memory limits.
DTYPE = torch.float16 if DEVICE == "cuda" else torch.float32
HF_TOKEN = os.getenv("HF_TOKEN")  # Add HF_TOKEN as secret in Spaces (recommended)


@st.cache_resource(show_spinner="🔄 Loading AI model, please wait...")
def load_model():
    """Load the BLIP-2 processor and model once per server process.

    Returns:
        tuple: (Blip2Processor, Blip2ForConditionalGeneration) — the model is
        placed in eval mode; on CUDA it is sharded automatically via
        ``device_map="auto"`` and loaded in float16.
    """
    processor = Blip2Processor.from_pretrained(MODEL_NAME, use_fast=False, token=HF_TOKEN)
    model = Blip2ForConditionalGeneration.from_pretrained(
        MODEL_NAME,
        torch_dtype=DTYPE,
        device_map="auto" if DEVICE == "cuda" else None,
        token=HF_TOKEN,
    )
    model.eval()
    return processor, model


processor, model = load_model()

# -----------------------
# Image input
# -----------------------
image_file = st.file_uploader("Upload an image", type=["jpg", "jpeg", "png"])
camera_image = st.camera_input("Or take a live picture")

# Camera input wins over an uploaded file when both are present.
image = None
if camera_image:
    image = Image.open(camera_image).convert("RGB")
elif image_file:
    image = Image.open(image_file).convert("RGB")

if image:
    # use_container_width replaces the deprecated use_column_width parameter.
    st.image(image, caption="Your Image", use_container_width=True)

    # -----------------------
    # Helper function
    # -----------------------
    def ask_model(prompt):
        """Run one instruction prompt against the current image.

        Args:
            prompt: Natural-language instruction for the model.

        Returns:
            str: Decoded model output with special tokens stripped.
        """
        # Cast inputs to the model's dtype: the processor emits float32
        # pixel values, which would otherwise crash a float16 model on CUDA.
        inputs = processor(images=image, text=prompt, return_tensors="pt").to(DEVICE, DTYPE)
        with torch.no_grad():  # inference only — skip autograd bookkeeping
            out = model.generate(**inputs, max_new_tokens=150)
        return processor.decode(out[0], skip_special_tokens=True)

    with st.spinner("🧠 Analyzing image..."):
        caption = ask_model("Describe this image in one factual sentence.")
        summary = ask_model("Give a concise 3–5 line descriptive summary of this image.")
        objects = ask_model("List the main objects and entities visible in this image.")
        emotion = ask_model("Detect the emotional tone or mood of this image (happy, calm, tense, etc.).")
        story = ask_model("Write a short story (5–10 lines) inspired by this image.")

    # -----------------------
    # Output
    # -----------------------
    st.subheader("📝 Caption")
    st.write(caption)

    st.subheader("📄 Summary")
    st.write(summary)

    st.subheader("📦 Detected Objects")
    st.write(objects)

    st.subheader("😊 Emotional Tone")
    st.write(emotion)

    st.subheader("📖 Short Story")
    st.write(story)
else:
    st.info("⬆️ Upload an image or use the camera above to begin.")