import streamlit as st
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
from PIL import Image
from gtts import gTTS
import torch
import tempfile

# --- Page setup ---
# Browser-tab title and icon come first, followed by the visible heading and
# the short introduction shown to the user.
st.set_page_config(
    page_title="🧸 Story Generator for Kids",
    page_icon="📚",
)
st.title("🖼️ Image to Story Generator (Zephyr + BLIP)")
st.write(
    "Upload an image and enjoy a magical story with voice, "
    "designed for kids aged 3–10!"
)

# Image upload widget (JPEG/PNG only); the rest of the script checks the
# result against None before using it.
uploaded_file = st.file_uploader(
    label="Upload an image",
    type=["jpg", "jpeg", "png"],
)

BLIP_MODEL_ID = "Salesforce/blip-image-captioning-large"
ZEPHYR_MODEL_ID = "HuggingFaceH4/zephyr-7b-beta"
MAX_STORY_WORDS = 100


@st.cache_resource(show_spinner=False)
def _load_captioner():
    """Return the BLIP image-captioning pipeline, loading it only once.

    The pipeline is cached by Streamlit so that repeated button clicks (each
    of which re-runs this script) reuse the already-loaded model.
    """
    # pipeline() takes a GPU index, or -1 for CPU; do not assume a GPU exists.
    device = 0 if torch.cuda.is_available() else -1
    return pipeline("image-to-text", model=BLIP_MODEL_ID, device=device)


@st.cache_resource(show_spinner=False)
def _load_story_model():
    """Return ``(tokenizer, model)`` for Zephyr 7B, loading them only once.

    Half precision is requested only when CUDA is available; otherwise the
    model is loaded in float32.
    """
    tokenizer = AutoTokenizer.from_pretrained(ZEPHYR_MODEL_ID)
    dtype = torch.float16 if torch.cuda.is_available() else torch.float32
    model = AutoModelForCausalLM.from_pretrained(
        ZEPHYR_MODEL_ID,
        torch_dtype=dtype,
        device_map="auto",
    )
    return tokenizer, model


def _limit_words(text: str, max_words: int = MAX_STORY_WORDS) -> str:
    """Return *text* cut to at most *max_words* words.

    Text within the limit is returned unchanged; longer text is re-joined
    with single spaces and suffixed with ``"..."``.
    """
    words = text.split()
    if len(words) <= max_words:
        return text
    return " ".join(words[:max_words]) + "..."


def _generate_story(caption: str) -> str:
    """Generate a short children's story from an image caption.

    Args:
        caption: Text description of the uploaded image.

    Returns:
        The generated story, stripped of surrounding whitespace and limited
        to ``MAX_STORY_WORDS`` words. May be empty if nothing was generated.
    """
    tokenizer, model = _load_story_model()

    # Instruction prompt using Zephyr-style role markers.
    prompt = (
        "<|system|>\nYou are a friendly AI assistant who writes short stories for children.\n"
        "<|user|>\nWrite a short, vivid, and imaginative story (under 100 words) suitable for children aged 3 to 10, "
        f"based on this image description: {caption}\n<|assistant|>\n"
    )

    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    outputs = model.generate(
        **inputs,
        max_new_tokens=180,
        do_sample=True,
        temperature=0.8,
        top_p=0.95,
    )

    # Decode only the tokens produced after the prompt. Splitting the full
    # decoded text on "<|assistant|>" breaks when the caption or the
    # completion happens to contain that marker.
    prompt_length = inputs["input_ids"].shape[1]
    completion = tokenizer.decode(
        outputs[0][prompt_length:], skip_special_tokens=True
    )
    return _limit_words(completion.strip())


def _synthesize_speech(text: str) -> bytes:
    """Return MP3 audio bytes for *text* using gTTS (English).

    The audio is written to an anonymous temporary file that is removed as
    soon as the bytes have been read back, so nothing is left on disk.
    """
    tts = gTTS(text=text, lang='en')
    with tempfile.TemporaryFile() as audio_file:
        tts.write_to_fp(audio_file)
        audio_file.seek(0)
        return audio_file.read()


if uploaded_file is not None:
    image = Image.open(uploaded_file)
    st.image(image, caption="Uploaded Image", use_container_width=True)

    if st.button("Generate Story"):
        with st.spinner("📷 Generating caption..."):
            # Image description model (BLIP Large)
            captioner = _load_captioner()
            caption = captioner(image)[0]['generated_text'].strip()

        with st.spinner("✍️ Generating story with Zephyr..."):
            story = _generate_story(caption)

        if not story:
            # Nothing to display or read aloud.
            st.warning("The model did not produce a story. Please try again.")
        else:
            # Show the text first so it stays visible even if speech
            # synthesis fails afterwards.
            st.subheader("📖 Generated Story")
            st.write(story)

            with st.spinner("🔊 Converting story to speech..."):
                audio_bytes = _synthesize_speech(story)

            st.subheader("🔊 Listen to the Story")
            st.audio(audio_bytes, format="audio/mp3")