File size: 5,582 Bytes
1ebc71c
90bef38
8d5fabf
118cd25
cd245d5
a4fc174
 
9e7cf7c
a4fc174
cd245d5
8d5fabf
a4fc174
5f21a2d
a4fc174
9e7cf7c
5f21a2d
9e7cf7c
a4fc174
5f21a2d
a4fc174
5f21a2d
 
a4fc174
5f21a2d
 
 
9e7cf7c
5f21a2d
 
 
 
 
 
 
9e7cf7c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5f21a2d
 
 
1ebc71c
cd245d5
7df9b81
e77741a
 
7df9b81
76abf5e
 
 
 
3fd88eb
 
 
76abf5e
7df9b81
a79c9ac
 
1ebc71c
 
a4fc174
5f21a2d
a79c9ac
7df9b81
 
 
3fd88eb
8d5fabf
cd245d5
f006a50
 
 
4e37056
 
f006a50
a084b90
cd245d5
1ebc71c
 
e77741a
f006a50
a4fc174
1ebc71c
a4fc174
f006a50
 
a4fc174
 
 
f006a50
 
a4fc174
 
f006a50
 
 
a79c9ac
 
 
 
 
 
 
 
 
 
1ebc71c
 
 
 
 
 
 
 
 
 
 
 
 
 
a79c9ac
 
7df9b81
1ebc71c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
# import part - only using the two requested imports
import streamlit as st
from transformers import pipeline

# function part
# img2text
# img2text
def img2text(image_path):
    """Caption an image with a BLIP image-captioning model.

    image_path: a file path, URL, or raw image bytes — anything the
    HuggingFace image-to-text pipeline accepts.
    Returns the generated caption as a string.
    """
    captioner = pipeline("image-to-text", model="sooh-j/blip-image-captioning-base")
    return captioner(image_path)[0]["generated_text"]

# text2story - IMPROVED to end naturally
# text2story - IMPROVED to end naturally
def text2story(text):
    """Turn an image caption into a short children's story.

    text: the caption string produced by img2text.
    Returns a story string that starts with "Once upon a time, " and is
    truncated to end on a complete sentence within roughly 100 words.
    """
    # Using a smaller text generation model
    generator = pipeline("text-generation", model="TinyLlama/TinyLlama-1.1B-Chat-v1.0")

    # Create a prompt for the story generation
    prompt = f"Write a fun children's story based on this: {text}. The story should be short and end naturally with a conclusion. Once upon a time, "

    # Generate the story.
    # FIX: use max_new_tokens instead of max_length — max_length counts the
    # prompt's tokens too, so a long caption would silently shrink the story
    # budget (and max_length alone is deprecated for this purpose).
    story_result = generator(
        prompt,
        max_new_tokens=200,  # budget for the story itself, independent of prompt size
        num_return_sequences=1,
        temperature=0.7,
        top_k=50,
        top_p=0.95,
        do_sample=True,
    )

    # Extract the generated text and strip the echoed prompt.
    # FIX: the old `.replace(prompt, ...)` replaced *every* occurrence and did
    # nothing if the model didn't echo the prompt verbatim; an explicit prefix
    # check is both safer and clearer about intent.
    story_text = story_result[0]['generated_text']
    if story_text.startswith(prompt):
        story_text = "Once upon a time, " + story_text[len(prompt):]

    # Find a natural ending point (end of sentence) within the first 100 words
    words = story_text.split()
    if len(words) > 100:
        shortened_text = " ".join(words[:100])

        # Last sentence-ending punctuation among '.', '?', '!'
        last_end = max(
            shortened_text.rfind('.'),
            shortened_text.rfind('?'),
            shortened_text.rfind('!'),
        )

        if last_end > 0:
            # Truncate at the end of the last complete sentence
            story_text = shortened_text[:last_end + 1]
        else:
            # No sentence ending found — fall back to the hard word cut
            story_text = shortened_text

    return story_text

# text2audio - Using HelpingAI-TTS-v1 model
# text2audio - Using HelpingAI-TTS-v1 model
def text2audio(story_text):
    """Synthesize speech for the story text.

    story_text: the story string to vocalize; trimmed to ~500 chars
    (at a sentence boundary when possible) to avoid timeouts.
    Returns the TTS pipeline's output dict, or None if synthesis failed
    (an error is shown in the Streamlit UI in that case).
    """
    try:
        # Use the HelpingAI TTS model as requested
        tts = pipeline("text-to-speech", model="HelpingAI/HelpingAI-TTS-v1")

        # Limit text length to avoid timeouts; prefer cutting at the
        # last full stop inside the window.
        max_chars = 500
        if len(story_text) > max_chars:
            clipped = story_text[:max_chars]
            cut = clipped.rfind('.')
            story_text = clipped[:cut + 1] if cut > 0 else clipped

        # Generate speech
        speech = tts(story_text)

        # Surface the output structure in the UI for debugging
        st.write(f"Speech output keys: {list(speech.keys())}")

        return speech

    except Exception as e:
        st.error(f"Error generating audio: {str(e)}")
        return None

# main part
# Top-level Streamlit flow: upload an image, caption it, expand the caption
# into a story, synthesize audio, and offer a play button.
st.set_page_config(page_title="Your Image to Audio Story", page_icon="🦜")
st.header("Turn Your Image to Audio Story")
uploaded_file = st.file_uploader("Select an Image...")

# NOTE(review): Streamlit reruns this whole script on every interaction, so
# clicking "Play Audio" below re-executes all three pipeline stages before the
# button state is read. Caching (st.cache_resource / session_state) would
# avoid the repeated model runs — confirm intended behavior.
if uploaded_file is not None:
    # Display the uploaded image
    st.image(uploaded_file, caption="Uploaded Image", use_container_width=True)
    
    # Create a temporary file in memory from the uploaded file
    image_bytes = uploaded_file.getvalue()
    
    # Stage 1: Image to Text
    st.text('Processing img2text...')
    caption = img2text(image_bytes)  # Pass bytes directly to pipeline
    st.write(caption)
    
    # Stage 2: Text to Story
    st.text('Generating a story...')
    story = text2story(caption)
    st.write(story)
    
    # Stage 3: Story to Audio data
    st.text('Generating audio data...')
    speech_output = text2audio(story)  # dict of TTS outputs, or None on failure
    
    # Play button
    if st.button("Play Audio"):
        if speech_output is not None:
            # Try to play the audio directly. The TTS pipeline's output key
            # names vary by model, so probe the common layouts in turn.
            try:
                if 'audio' in speech_output and 'sampling_rate' in speech_output:
                    st.audio(speech_output['audio'], sample_rate=speech_output['sampling_rate'])
                elif 'audio_array' in speech_output and 'sampling_rate' in speech_output:
                    st.audio(speech_output['audio_array'], sample_rate=speech_output['sampling_rate'])
                elif 'waveform' in speech_output and 'sample_rate' in speech_output:
                    st.audio(speech_output['waveform'], sample_rate=speech_output['sample_rate'])
                else:
                    # Fallback: treat the first long array-like value as audio
                    # data and pair it with whichever rate key is present.
                    for key, value in speech_output.items():
                        if hasattr(value, '__len__') and len(value) > 1000:
                            if 'rate' in speech_output:
                                st.audio(value, sample_rate=speech_output['rate'])
                            elif 'sample_rate' in speech_output:
                                st.audio(value, sample_rate=speech_output['sample_rate'])  
                            elif 'sampling_rate' in speech_output:
                                st.audio(value, sample_rate=speech_output['sampling_rate'])
                            else:
                                st.audio(value, sample_rate=24000)  # Default sample rate
                            break
                    # for/else: runs only if no array-like value was found above
                    else:
                        st.error(f"Could not find compatible audio format in: {list(speech_output.keys())}")
            except Exception as e:
                st.error(f"Error playing audio: {str(e)}")
        else:
            st.error("Audio generation failed. Please try again.")