import streamlit as st
from transformers import pipeline
from gtts import gTTS
import tempfile
import os


# --- Cached model loaders ----------------------------------------------------
# Without caching, Streamlit re-runs the whole script on every widget
# interaction and both models would be reloaded from disk each time.

@st.cache_resource
def _get_captioner():
    """Load (once per process) and cache the image-captioning pipeline."""
    return pipeline(
        "image-to-text",
        model="nlpconnect/vit-gpt2-image-captioning",  # relatively fast and accurate
    )


@st.cache_resource
def _get_generator():
    """Load (once per process) and cache the text-generation pipeline."""
    return pipeline(
        "text-generation",
        model="gpt2",  # relatively small but fast
    )


# --- Pipeline stages ---------------------------------------------------------

def img2text(img_path):
    """Return a caption (str) describing the image stored at *img_path*."""
    captioner = _get_captioner()
    result = captioner(img_path)
    return result[0]["generated_text"]


def text2story(scenario):
    """Generate a short children's story (str) from a scene description.

    The instruction prompt is stripped from the model output so callers
    receive only the story text.
    """
    generator = _get_generator()
    prompt = f"Create a children's story based on: {scenario}"
    # max_new_tokens bounds only the *generated* continuation; the older
    # max_length also counted the prompt tokens, silently shortening stories.
    generated = generator(
        prompt,
        max_new_tokens=200,     # maximum story length
        num_return_sequences=1, # number of variants to generate
    )[0]["generated_text"]
    # GPT-2 echoes the prompt at the start of generated_text — remove it.
    return generated.removeprefix(prompt).strip()


def text2audio(story_text):
    """Synthesize *story_text* to speech; return the path of a new MP3 file.

    The caller is responsible for deleting the returned file.
    """
    tts = gTTS(text=story_text, lang="en")
    # Reserve a unique path, then close the handle before gTTS writes to it:
    # on Windows a file cannot be reopened by name while the handle is open.
    audio_file = tempfile.NamedTemporaryFile(delete=False, suffix=".mp3")
    audio_file.close()
    tts.save(audio_file.name)
    return audio_file.name


# --- UI ----------------------------------------------------------------------

def main():
    """Streamlit app: uploaded image -> caption -> story -> narrated audio."""
    st.set_page_config(
        page_title="Image to Story",
        page_icon="📖"
    )
    st.header("Upload Your Image")
    uploaded_file = st.file_uploader(
        "Choose Image",
        type=["jpg", "png", "jpeg"]
    )
    if not uploaded_file:
        return

    # Persist the upload so the captioning pipeline can read it by path.
    # basename() guards against a client-supplied name containing separators.
    temp_img = os.path.join(
        tempfile.gettempdir(), os.path.basename(uploaded_file.name)
    )
    with open(temp_img, "wb") as f:
        f.write(uploaded_file.getvalue())
    st.image(uploaded_file)

    try:
        # Stage 1: Image to Text
        with st.status("🖼️ Processing image..."):
            scenario = img2text(temp_img)
        st.write("Image Caption:", scenario)

        # Stage 2: Text to Story
        with st.status("📖 Generating story..."):
            story = text2story(scenario)
        st.subheader("Story")
        st.write(story)

        # Stage 3: Story to Audio data. Read the bytes immediately so the
        # temp MP3 can be removed here rather than lingering if a rerun
        # skips the cleanup path.
        with st.status("🔊 Converting audio..."):
            audio_path = text2audio(story)
            with open(audio_path, "rb") as f:
                audio_bytes = f.read()
            os.unlink(audio_path)
    finally:
        # Always remove the uploaded-image copy, even if a stage raised.
        if os.path.exists(temp_img):
            os.unlink(temp_img)

    # Play button
    if st.button("▶️ Play Audio Story"):
        st.audio(audio_bytes, format="audio/mp3")


if __name__ == "__main__":
    main()