File size: 3,882 Bytes
90bef38
 
b038974
cd79461
118cd25
 
7c5a1e4
4e37056
7c5a1e4
b038974
118cd25
cd79461
4e37056
 
 
 
7c5a1e4
 
4e37056
7c5a1e4
 
4e37056
7c5a1e4
 
 
b038974
7c5a1e4
 
90bef38
7c5a1e4
 
 
 
 
90bef38
118cd25
7c5a1e4
b038974
118cd25
 
cd79461
4e37056
cd79461
 
 
 
 
b038974
cd79461
 
1fb1e8e
cd79461
b038974
7c5a1e4
 
90bef38
7c5a1e4
 
 
 
4e37056
 
 
 
 
 
 
 
 
 
 
 
 
 
90bef38
4e37056
 
 
 
 
7c5a1e4
4e37056
 
 
 
 
7c5a1e4
4e37056
 
 
 
7c5a1e4
4e37056
 
 
5b9e396
4e37056
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
import streamlit as st
from PIL import Image
import os
import tempfile
import sys

# function part
# img2text with a model that doesn't require sentencepiece
def img2text(image_path):
    """Generate a caption for the image stored at ``image_path``.

    The image is run through the ``transformers`` image-to-text pipeline
    using the ``Salesforce/blip-image-captioning-base`` checkpoint, and the
    first generated caption is returned.

    Args:
        image_path: Filesystem path of the image to caption.

    Returns:
        The generated caption, ``"No text detected"`` when the pipeline
        returns an empty result, or a string starting with ``"Error: "``
        when any step fails (the error is also reported via ``st.error``).
    """
    try:
        # Imported inside the try block so a missing/broken transformers
        # install is reported through the same error path as any other failure.
        from transformers import pipeline

        # Use the Salesforce model instead of Donut to avoid sentencepiece issues
        st.info("Using Salesforce/blip-image-captioning-base model for image-to-text")
        image_to_text_model = pipeline("image-to-text", model="Salesforce/blip-image-captioning-base")

        # The context manager closes the underlying file once captioning is
        # done, so the caller can safely delete the file afterwards.
        with Image.open(image_path) as image:
            result = image_to_text_model(image)

        # The pipeline returns a list of dicts; use the first caption.
        text = result[0]["generated_text"] if result else "No text detected"
        return text
    except Exception as e:
        st.error(f"Error processing image: {str(e)}")
        return f"Error: {str(e)}"

# text2story
def text2story(text):
    """Return ``text`` wrapped in a fixed introductory sentence.

    This is a placeholder for real story generation: the input is not
    expanded or rewritten, only prefixed.
    """
    return "Here's a story based on the text: {}".format(text)

# text2audio using Google Text-to-Speech
def text2audio(story_text):
    """Synthesize ``story_text`` to an MP3 file using Google Text-to-Speech.

    Args:
        story_text: The English text to convert to speech.

    Returns:
        Path of the generated ``.mp3`` temporary file, or ``None`` when
        synthesis fails (the error is reported via ``st.error``). The caller
        is responsible for deleting the returned file.
    """
    temp_audio_path = None
    try:
        from gtts import gTTS

        # Reserve a unique .mp3 path. delete=False keeps the file on disk
        # after the handle is closed so gTTS can write to it by name.
        with tempfile.NamedTemporaryFile(delete=False, suffix='.mp3') as temp_audio:
            temp_audio_path = temp_audio.name

        # Initialize gTTS and write the audio to the reserved path
        tts = gTTS(text=story_text, lang='en', slow=False)
        tts.save(temp_audio_path)

        return temp_audio_path
    except Exception as e:
        # Do not leave an empty or partial file behind when synthesis fails.
        if temp_audio_path is not None:
            try:
                os.remove(temp_audio_path)
            except OSError:
                pass  # best-effort cleanup; the original error is what matters
        st.error(f"Error generating audio: {str(e)}")
        return None

# main part
st.set_page_config(page_title="Your Image to Audio Story",
                   page_icon="🦜")
st.header("Turn Your Image to Audio Story")
st.subheader("Image to Text to Audio Conversion")

uploaded_file = st.file_uploader("Select an Image...", type=['png', 'jpg', 'jpeg', 'gif', 'bmp', 'webp'])

if uploaded_file is not None:
    # Persist the upload under a generated temporary name. Only the extension
    # of the client-supplied file name is reused, so the name can neither
    # steer the write to another path nor collide with an existing file.
    image_suffix = os.path.splitext(uploaded_file.name)[1]
    with tempfile.NamedTemporaryFile(delete=False, suffix=image_suffix) as image_file:
        image_file.write(uploaded_file.getvalue())
        image_temp_path = image_file.name

    audio_file_path = None
    try:
        # Display the uploaded image
        st.image(uploaded_file, caption="Uploaded Image",
                 use_column_width=True)

        # Stage 1: Image to Text
        with st.spinner('Processing img2text...'):
            extracted_text = img2text(image_temp_path)
            st.subheader("Extracted Text:")
            st.write(extracted_text)

        # Stage 2: Text to Story
        with st.spinner('Generating a story...'):
            story = text2story(extracted_text)
            st.subheader("Generated Story:")
            st.write(story)

        # Stage 3: Story to Audio data
        with st.spinner('Generating audio data...'):
            audio_file_path = text2audio(story)
    finally:
        # Remove the temporary image file even if one of the stages raised
        if os.path.exists(image_temp_path):
            os.remove(image_temp_path)

    # Load the audio into memory and delete the temporary file right away.
    # The script is re-executed on each widget interaction, so a file that is
    # only removed after "Play Audio" is clicked would otherwise be left
    # behind on every run where the button is not pressed.
    audio_bytes = None
    if audio_file_path and os.path.exists(audio_file_path):
        with open(audio_file_path, "rb") as audio_file:
            audio_bytes = audio_file.read()
        try:
            os.remove(audio_file_path)
        except OSError:
            pass  # best-effort cleanup; playback does not depend on it

    # Play button
    if st.button("Play Audio"):
        if audio_bytes is not None:
            # Play the generated audio
            st.audio(audio_bytes, format="audio/mp3")
        else:
            st.warning("Audio generation failed. Playing a placeholder audio.")
            try:
                st.audio("kids_playing_audio.wav")
            except FileNotFoundError:
                st.error("Placeholder audio file not found. Audio playback is unavailable.")