"""Streamlit app: three-stage AI storyteller for kids.

Stage 1 captions an uploaded image, stage 2 expands the caption into a
short child-friendly story, and stage 3 reads the story aloud.  Each
stage is backed by a Hugging Face ``transformers`` pipeline.
"""

import tempfile

import numpy as np
import streamlit as st
from PIL import Image
from scipy.io import wavfile
from transformers import pipeline


@st.cache_resource(show_spinner=False)
def _load_pipeline(task, model):
    """Build and cache a transformers pipeline.

    Streamlit reruns the whole script on every widget interaction; without
    caching, each stage would reload its (large) model on every rerun.
    ``st.cache_resource`` keeps one instance per (task, model) for the
    lifetime of the server process.
    """
    return pipeline(task, model=model)


# --- Stage 1: Image -> Caption ---
def generate_caption(image):
    """Return a short English caption for *image* (a PIL.Image)."""
    caption_pipeline = _load_pipeline(
        "image-to-text", "nlpconnect/vit-gpt2-image-captioning"
    )
    return caption_pipeline(image)[0]["generated_text"]


# --- Stage 2: Caption -> Story ---
def generate_story(caption):
    """Generate a child-friendly short story seeded by *caption*.

    Uses ``max_new_tokens`` rather than ``max_length``: ``max_length``
    counts the prompt tokens too, so a long caption left almost no budget
    for actual generation.  GPT-2 also echoes the prompt at the start of
    ``generated_text``, so the prompt prefix is stripped before returning.
    """
    story_pipeline = _load_pipeline("text-generation", "gpt2")
    prompt = f"Write a fun, short story (50-100 words) for a child based on: {caption}"
    generated = story_pipeline(prompt, max_new_tokens=120, do_sample=True)[0][
        "generated_text"
    ]
    if generated.startswith(prompt):
        # Drop the echoed instruction so only the story is shown/read aloud.
        generated = generated[len(prompt):].lstrip()
    return generated


# --- Stage 3: Story -> Audio ---
def generate_audio(story_text):
    """Synthesize *story_text* to speech; return the path of a WAV file.

    The TTS pipeline returns a dict with a raw float waveform (``audio``)
    and its ``sampling_rate``.  Writing the bare array to disk (as the
    original code did) produces a headerless, unplayable file —
    ``scipy.io.wavfile.write`` emits a proper WAV container instead.
    The temp file is created with ``delete=False`` so ``st.audio`` can
    read it after this function returns.
    """
    tts_pipeline = _load_pipeline(
        "text-to-speech", "espnet/kan-bayashi_ljspeech_vits"
    )
    speech = tts_pipeline(story_text)
    # Squeeze a possible leading channel axis; wavfile accepts float32 PCM.
    audio = np.asarray(speech["audio"]).squeeze().astype(np.float32)
    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as f:
        wavfile.write(f, speech["sampling_rate"], audio)
        return f.name


# --- Streamlit UI ---
def main():
    """Render the UI and run the three stages on an uploaded image."""
    st.title("📖 AI Storyteller for Kids (3 Stages)")
    st.write(
        "Upload a child-friendly image and let the app create a story and read it out loud!"
    )

    uploaded_image = st.file_uploader("Upload an image", type=["jpg", "jpeg", "png"])

    if uploaded_image:
        image = Image.open(uploaded_image)
        st.image(image, caption="Your uploaded image", use_column_width=True)

        with st.spinner("🔍 Generating caption..."):
            caption = generate_caption(image)
        st.success(f"🖼️ Caption: {caption}")

        with st.spinner("📝 Generating story..."):
            story = generate_story(caption)
        st.markdown("### 📚 Generated Story:")
        st.write(story)

        with st.spinner("🔊 Generating audio..."):
            audio_path = generate_audio(story)
        st.audio(audio_path, format="audio/wav")


if __name__ == "__main__":
    main()