"""Streamlit app: caption an uploaded image, write a kids' story, read it aloud."""

import streamlit as st
from PIL import Image
from transformers import BlipProcessor, BlipForConditionalGeneration, pipeline
from gtts import gTTS
import os
import tempfile

# Hugging Face checkpoint used for image captioning.
BLIP_CHECKPOINT = "Salesforce/blip-image-captioning-base"
# Token id passed to GPT-2 as both the padding and end-of-sequence token.
GPT2_EOS_TOKEN_ID = 50256
# Upper bound on newly generated tokens (the prompt is not counted).
MAX_NEW_STORY_TOKENS = 100
# Hard cap on the number of words shown and spoken.
MAX_STORY_WORDS = 100


# Load models
@st.cache_resource
def load_models():
    """Load the captioning and text-generation models.

    Decorated with ``st.cache_resource`` so the models are not reloaded on
    every Streamlit rerun.

    Returns:
        A ``(processor, blip_model, gpt2_pipeline)`` tuple.
    """
    processor = BlipProcessor.from_pretrained(BLIP_CHECKPOINT)
    blip_model = BlipForConditionalGeneration.from_pretrained(BLIP_CHECKPOINT)
    gpt2_pipeline = pipeline("text-generation", model="gpt2")
    return processor, blip_model, gpt2_pipeline


processor, blip_model, gpt2 = load_models()


def _caption_image(image):
    """Return a BLIP-generated caption for an RGB PIL image."""
    inputs = processor(images=image, return_tensors="pt")
    out = blip_model.generate(**inputs)
    return processor.decode(out[0], skip_special_tokens=True)


def _write_story(caption):
    """Return a short story about *caption*, capped at MAX_STORY_WORDS words.

    The result may be an empty string if the model produces no continuation.
    """
    prompt = (
        "Write a short, imaginative story for children aged 3-10 "
        f"about this: {caption}"
    )
    generated = gpt2(
        prompt,
        # max_new_tokens (rather than max_length) keeps the story budget
        # independent of how long the caption/prompt is.
        max_new_tokens=MAX_NEW_STORY_TOKENS,
        # Without this the pipeline returns prompt + continuation, and the
        # instruction sentence would be displayed and read aloud as "story".
        return_full_text=False,
        num_return_sequences=1,
        do_sample=True,
        temperature=0.9,
        top_p=0.95,
        top_k=50,
        repetition_penalty=1.2,
        pad_token_id=GPT2_EOS_TOKEN_ID,
        eos_token_id=GPT2_EOS_TOKEN_ID,
    )[0]["generated_text"]
    # split()/join collapses newlines and repeated whitespace in one pass and
    # truncates to ~100 words for safety.
    return " ".join(generated.split()[:MAX_STORY_WORDS])


def _synthesize_speech(story):
    """Return MP3 bytes of *story* spoken in English via gTTS.

    Raises whatever gTTS raises (e.g. on network failure); the caller decides
    how to report it.
    """
    tts = gTTS(text=story, lang="en")
    # Only reserve a unique path here; the handle is closed before gTTS writes
    # to it so that saving also works on platforms that forbid reopening an
    # open temporary file.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as fp:
        audio_path = fp.name
    try:
        tts.save(audio_path)
        with open(audio_path, "rb") as audio_file:
            return audio_file.read()
    finally:
        # delete=False means nobody else cleans this up; avoid leaking one
        # file per generated story.
        os.remove(audio_path)


# UI
st.title("🖼️📖 Storyteller for Kids")
st.write("Upload an image and let the app create and read a magical story just for kids!")

uploaded_file = st.file_uploader("Upload an image", type=["jpg", "jpeg", "png"])

if uploaded_file:
    image = Image.open(uploaded_file).convert("RGB")
    # NOTE(review): newer Streamlit releases deprecate use_column_width;
    # kept as-is for compatibility with the installed version — confirm.
    st.image(image, caption="Uploaded Image", use_column_width=True)

    with st.spinner("Generating image caption..."):
        caption = _caption_image(image)
    st.success("Caption generated!")
    st.write(f"**Caption:** {caption}")

    with st.spinner("Writing a children's story..."):
        story = _write_story(caption)

    if not story:
        # gTTS rejects empty text, so stop here with a clear message instead
        # of surfacing a confusing text-to-speech error.
        st.warning("The model did not produce a story. Please try again.")
    else:
        st.success("Story created!")
        st.write(f"**Story:**\n\n{story}")

        with st.spinner("Converting story to audio..."):
            try:
                audio_bytes = _synthesize_speech(story)
            except Exception as e:  # UI boundary: report, don't crash the app
                st.error(f"Text-to-speech failed: {e}")
            else:
                st.audio(audio_bytes, format="audio/mp3")
                st.success("Audio playback ready!")