"""Streamlit app: extract text from an uploaded image, wrap it in a story, and speak it."""

import os
import tempfile

import streamlit as st
from PIL import Image
from transformers import pipeline


# function part
@st.cache_resource(show_spinner=False)
def _load_pipeline(task, model):
    """Build a transformers pipeline once and reuse it across script reruns.

    Streamlit re-executes this script on every widget interaction; without
    caching, each rerun would construct the pipeline again.
    """
    return pipeline(task, model=model)


# img2text
def img2text(image_path):
    """Extract text from the image at ``image_path`` with the Donut model.

    Args:
        image_path: Path of the image file to read.

    Returns:
        The generated text, ``"No text detected"`` when the model returns an
        empty result, or a string starting with ``"Error: "`` when processing
        fails (the error is also shown in the UI via ``st.error``).
    """
    try:
        # Load the image-to-text model
        image_to_text_model = _load_pipeline(
            "image-to-text", "naver-clova-ix/donut-base"
        )
        # Open the image as a context manager so the file handle is released
        # before the caller deletes the file.
        with Image.open(image_path) as image:
            result = image_to_text_model(image)
        # Get the generated text
        return result[0]["generated_text"] if result else "No text detected"
    except Exception as e:  # UI boundary: report the failure instead of crashing
        st.error(f"Error processing image: {str(e)}")
        return f"Error: {str(e)}"


# text2story
def text2story(text):
    """Return a story string built from ``text``.

    For now this only prefixes the extracted text; it can be expanded later
    with more sophisticated story generation.
    """
    story_text = f"Here's a story based on the text: {text}"
    return story_text


# text2audio
def text2audio(story_text):
    """Synthesize speech for ``story_text`` with a text-to-speech pipeline.

    Returns:
        The pipeline output (the caller reads its ``'audio'`` and
        ``'sampling_rate'`` entries), or ``None`` when generation fails
        (the error is also shown in the UI via ``st.error``).
    """
    try:
        # Note: additional dependencies may be required depending on the model used
        tts_model = _load_pipeline(
            "text-to-speech", "espnet/kan-bayashi_ljspeech_vits"
        )
        # Generate audio from the story text
        return tts_model(story_text)
    except Exception as e:  # UI boundary: report the failure instead of crashing
        st.error(f"Error generating audio: {str(e)}")
        return None


# main part
st.set_page_config(page_title="Your Image to Audio Story", page_icon="🦜")
st.header("Turn Your Image to Audio Story")
st.subheader("Using Donut model for text extraction")

uploaded_file = st.file_uploader(
    "Select an Image...",
    type=['png', 'jpg', 'jpeg', 'gif', 'bmp', 'webp'],
)

if uploaded_file is not None:
    # Display the uploaded image
    st.image(uploaded_file, caption="Uploaded Image", use_column_width=True)

    # Save the upload under a generated temporary name rather than the
    # client-supplied file name, so it cannot overwrite files in the working
    # directory or collide with another session's upload.
    with tempfile.NamedTemporaryFile(delete=False) as temp_file:
        temp_file.write(uploaded_file.getvalue())
        temp_path = temp_file.name

    try:
        # Stage 1: Image to Text
        with st.spinner('Processing img2text...'):
            extracted_text = img2text(temp_path)
    finally:
        # Remove the temporary file even if text extraction raised
        if os.path.exists(temp_path):
            os.remove(temp_path)

    st.subheader("Extracted Text:")
    st.write(extracted_text)

    # Stage 2: Text to Story
    with st.spinner('Generating a story...'):
        story = text2story(extracted_text)
    st.subheader("Generated Story:")
    st.write(story)

    # Stage 3: Story to Audio data
    with st.spinner('Generating audio data...'):
        audio_data = text2audio(story)

    # Play button
    if st.button("Play Audio"):
        if audio_data:
            st.audio(
                audio_data['audio'],
                format="audio/wav",
                start_time=0,
                sample_rate=audio_data['sampling_rate'],
            )
        else:
            st.warning("Audio generation failed. Playing a placeholder audio.")
            try:
                st.audio("kids_playing_audio.wav")
            except FileNotFoundError:
                st.error("Placeholder audio file not found. Audio playback is unavailable.")