File size: 2,932 Bytes
cd245d5
90bef38
8d5fabf
cd245d5
76abf5e
118cd25
cd245d5
 
 
b2cad31
cd245d5
 
8d5fabf
cd245d5
 
7df9b81
83842b8
 
7df9b81
76abf5e
 
 
 
3fd88eb
 
 
76abf5e
7df9b81
e5f2129
83842b8
e5f2129
cd9e32e
 
83842b8
 
 
7df9b81
 
 
cd9e32e
 
3fd88eb
8d5fabf
cd245d5
 
 
 
 
 
 
 
 
 
 
8d5fabf
cd245d5
f006a50
 
 
4e37056
 
f006a50
a084b90
cd245d5
f006a50
 
8d5fabf
f006a50
 
 
 
 
 
 
 
 
 
 
 
e5f2129
f006a50
 
 
76abf5e
 
 
7df9b81
76abf5e
f006a50
76abf5e
f006a50
 
e5f2129
f006a50
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
# import part
import streamlit as st
from transformers import pipeline
import os
import tempfile

# function part
# img2text
def img2text(image_path):
    """Generate a caption for the image at *image_path*.

    Args:
        image_path: Path to an image file readable by the HF pipeline.

    Returns:
        The generated caption string.
    """
    # Build the captioning pipeline once and reuse it across calls --
    # constructing a transformers pipeline is expensive (model load).
    if not hasattr(img2text, "_pipeline"):
        img2text._pipeline = pipeline(
            "image-to-text", model="sooh-j/blip-image-captioning-base"
        )
    return img2text._pipeline(image_path)[0]["generated_text"]

# text2audio
def text2audio(story_text):
    """Convert *story_text* to speech with the HelpingAI TTS pipeline.

    Args:
        story_text: The story to synthesize. Text beyond ~500 characters
            is truncated (preferring the last sentence boundary) to avoid
            model timeouts.

    Returns:
        The raw pipeline output (for HF text-to-speech pipelines, a dict
        carrying the audio data), or None if synthesis failed.
    """
    try:
        # Load the TTS model once and reuse it; rebuilding the pipeline
        # on every call re-loads the model each time.
        if not hasattr(text2audio, "_synthesizer"):
            text2audio._synthesizer = pipeline(
                "text-to-speech", model="HelpingAI/HelpingAI-TTS-v1"
            )
        synthesizer = text2audio._synthesizer

        # Limit text length to avoid timeouts; cut at the last sentence
        # boundary when one exists so the audio does not stop mid-word.
        max_chars = 500
        if len(story_text) > max_chars:
            last_period = story_text[:max_chars].rfind('.')
            if last_period > 0:
                story_text = story_text[:last_period + 1]
            else:
                story_text = story_text[:max_chars]

        # Generate speech (the leftover debug dump of the output keys
        # has been removed from the UI).
        st.write("Generating audio...")
        speech = synthesizer(story_text)

        # Return the raw audio payload; st.audio() can play it directly,
        # so no temporary file is needed.
        return speech

    except Exception as e:
        # Surface the failure in the UI instead of crashing the app.
        st.error(f"Error generating audio: {str(e)}")
        import traceback
        st.error(traceback.format_exc())
        return None

# Function to save temporary image file
def save_uploaded_image(uploaded_file):
    """Persist a Streamlit upload to ./temp and return its path.

    Args:
        uploaded_file: Object exposing ``.name`` and ``.getvalue()``
            (e.g. a Streamlit UploadedFile).

    Returns:
        Path of the saved file inside the "temp" directory.
    """
    # exist_ok avoids the check-then-create race of the original
    # os.path.exists() / os.makedirs() pair.
    os.makedirs("temp", exist_ok=True)

    # basename() strips any directory components from the client-supplied
    # filename so the upload cannot escape the temp directory.
    image_path = os.path.join("temp", os.path.basename(uploaded_file.name))

    with open(image_path, "wb") as f:
        f.write(uploaded_file.getvalue())

    return image_path

# main part
st.set_page_config(page_title="Your Image to Audio Story", page_icon="🦜")
st.header("Turn Your Image to Audio Story")
uploaded_file = st.file_uploader("Select an Image...")

if uploaded_file is not None:
    # Display the uploaded image
    st.image(uploaded_file, caption="Uploaded Image", use_container_width=True)

    # Save the image temporarily so the captioning pipeline can read it
    # from disk.
    image_path = save_uploaded_image(uploaded_file)

    # Stage 1: Image to Text
    st.text('Processing img2text...')
    caption = img2text(image_path)
    st.write(caption)

    # Stage 2: Text to Story
    # NOTE(review): text2story is not defined anywhere in this file --
    # confirm it is provided elsewhere, otherwise this line raises
    # NameError at runtime.
    st.text('Generating a story...')
    story = text2story(caption)
    st.write(story)

    # Stage 3: Story to Audio data
    st.text('Generating audio data...')
    speech = text2audio(story)

    # Play button
    if st.button("Play Audio"):
        # text2audio returns the raw TTS pipeline output (a dict), not a
        # file path -- the original os.path.exists() check on it could
        # never succeed. Play the audio payload directly instead.
        if speech is not None:
            st.audio(speech["audio"], sample_rate=speech["sampling_rate"])
        else:
            st.error("Audio generation failed. Please try again.")

    # Clean up the temporary image; ignore the error if it is already
    # gone (narrowed from a bare except that hid every failure).
    try:
        os.remove(image_path)
    except OSError:
        pass