Spaces:
Build error
Build error
import hashlib
import io
import os
import tempfile

import streamlit as st
import torch
from gtts import gTTS
from transformers import pipeline
# Page chrome: browser-tab title and icon first, then the on-page heading.
st.set_page_config(
    page_title="Your Image to Audio Story",
    page_icon="🦜",
)
st.header("Turn Your Image📷 to a Short Audio Story🔊 for Children👶")

# Upload widget; the result is checked against None further down before use.
uploaded_file = st.file_uploader(
    "Select an Image After the Models are Loaded..."
)
# function part
# Preload models once
@st.cache_resource
def load_models():
    """Build the two Hugging Face pipelines used by the app.

    Streamlit re-executes this script from the top on every user
    interaction, so without caching both pipelines would be rebuilt on each
    rerun. ``st.cache_resource`` keeps a single shared instance alive and
    hands it back on subsequent calls.

    Returns:
        dict: ``"img_model"`` maps to the image-to-text pipeline and
        ``"story_model"`` to the text-generation pipeline.
    """
    return {
        "img_model": pipeline("image-to-text", "cnmoro/tiny-image-captioning"),
        "story_model": pipeline("text-generation", "Qwen/Qwen2.5-0.5B-Instruct"),
    }


models = load_models()
# img2text
def img2text(url):
    """Caption an image with the image-to-text pipeline.

    ``url`` is handed to ``models["img_model"]`` unchanged; the return value
    is the ``"generated_text"`` field of the first result.
    """
    captioner = models["img_model"]
    results = captioner(url)
    return results[0]["generated_text"]
# text2story
def text2story(text):
    """Generate a short story about ``text`` with the text-generation model.

    Args:
        text: The scenario to write about (the app passes the image caption).

    Returns:
        The ``"content"`` of the assistant message produced by the model.
    """
    prompt = f"Generate a brief 100-word story about: {text}"
    messages = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": prompt},
    ]
    # NOTE(review): the prompt asks for ~100 words but generation is capped
    # at 100 new tokens, which is likely fewer than 100 words, so the story
    # may end mid-sentence. Confirm whether the cap should be raised.
    outputs = models["story_model"](
        messages,
        max_new_tokens=100,
        do_sample=True,
        temperature=0.7,
    )
    conversation = outputs[0]["generated_text"]
    # The generated conversation ends with the assistant's reply. Taking the
    # last entry (rather than a fixed index) keeps this correct if messages
    # are added to or removed from the prompt above.
    return conversation[-1]["content"]
# text2audio
def text2audio(story_text):
    """Synthesize ``story_text`` to speech with gTTS, entirely in memory.

    Returns:
        dict: ``'audio'`` is an ``io.BytesIO`` holding the gTTS output,
        rewound to the start; ``'sampling_rate'`` is the constant 16000.
    """
    speech = gTTS(text=story_text, lang='en', slow=False)

    # Write into a buffer rather than a file on disk, then rewind so the
    # consumer reads from the beginning.
    buffer = io.BytesIO()
    speech.write_to_fp(buffer)
    buffer.seek(0)

    # NOTE(review): 16000 is carried over from the original code, which
    # claimed gTTS uses 16 kHz; that is not verified here and the value is
    # not read anywhere in this file.
    return {'audio': buffer, 'sampling_rate': 16000}
# Seed the per-session result slots (caption, story, audio) with None, but
# only when they are not already present so reruns keep earlier results.
if 'processed_data' not in st.session_state:
    st.session_state['processed_data'] = dict.fromkeys(
        ('scenario', 'story', 'audio')
    )
if uploaded_file is not None:
    image_bytes = uploaded_file.getvalue()
    st.image(uploaded_file, caption="Uploaded Image",
             use_container_width=True)

    # Identify the upload by its content rather than its name, so a
    # different image that happens to share a filename is still processed.
    upload_key = hashlib.sha256(image_bytes).hexdigest()

    # Only process if file is new
    if st.session_state.get('current_file') != upload_key:
        # Stage 1: Image to Text
        with st.spinner('Processing image...'):
            # The captioner is given a file path. Use an anonymous temp file
            # instead of the user-supplied name: writing to
            # ``uploaded_file.name`` in the working directory would let an
            # upload overwrite arbitrary files there (e.g. the app itself).
            with tempfile.NamedTemporaryFile(delete=False) as tmp:
                tmp.write(image_bytes)
            try:
                scenario = img2text(tmp.name)
            finally:
                os.unlink(tmp.name)

        # Stage 2: Text to Story
        with st.spinner('Generating story...'):
            story = text2story(scenario)

        # Stage 3: Story to Audio
        with st.spinner('Creating audio...'):
            audio = text2audio(story)

        # Commit results and mark the upload as handled only after every
        # stage succeeded; if one raises, the next rerun retries instead of
        # showing stale or missing data.
        st.session_state.processed_data = {
            'scenario': scenario,
            'story': story,
            'audio': audio,
        }
        st.session_state.current_file = upload_key

    # Display results
    st.write("Caption:", st.session_state.processed_data['scenario'])
    st.write("Story:", st.session_state.processed_data['story'])
# The playback button lives at top level, outside the upload-handling block.
if st.button("Play Audio of the Story Generated"):
    audio_entry = st.session_state.processed_data.get('audio')
    if not audio_entry:
        st.warning("Please generate a story first!")
    else:
        # Hand st.audio the raw bytes from the in-memory buffer and declare
        # the format explicitly (the original notes gTTS outputs MP3).
        st.audio(
            audio_entry['audio'].getvalue(),
            format="audio/mp3",
        )