File size: 5,563 Bytes
a79c9ac
90bef38
8d5fabf
f9b627f
 
118cd25
cd245d5
 
f9b627f
b2cad31
f9b627f
cd245d5
8d5fabf
a79c9ac
5f21a2d
 
 
 
 
a79c9ac
5f21a2d
 
 
 
a79c9ac
5f21a2d
 
 
 
 
 
 
 
 
 
 
a79c9ac
5f21a2d
 
a79c9ac
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5f21a2d
 
 
a79c9ac
cd245d5
7df9b81
a79c9ac
 
7df9b81
76abf5e
 
 
 
3fd88eb
 
 
76abf5e
7df9b81
a79c9ac
 
5f21a2d
a79c9ac
 
cd9e32e
a79c9ac
7df9b81
 
 
3fd88eb
8d5fabf
cd245d5
f006a50
 
 
4e37056
 
f006a50
a084b90
cd245d5
f9b627f
 
8d5fabf
f006a50
 
f9b627f
f006a50
 
 
 
 
 
 
 
 
a79c9ac
f006a50
 
 
a79c9ac
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7df9b81
a79c9ac
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
# import part - only using the two requested imports
import streamlit as st
from transformers import pipeline
from PIL import Image
import io

# function part
# img2text
@st.cache_resource
def _load_captioner():
    """Load the BLIP image-captioning pipeline once and reuse it.

    Streamlit re-executes the whole script on every interaction; without
    caching, the model would be re-instantiated on each caption request.
    """
    return pipeline("image-to-text", model="sooh-j/blip-image-captioning-base")


def img2text(image):
    """Return a generated caption (str) for *image* (a PIL.Image)."""
    image_to_text = _load_captioner()
    text = image_to_text(image)[0]["generated_text"]
    return text

# text2story - IMPROVED to end naturally
def _end_on_sentence(story_text, max_words=100):
    """Trim *story_text* to at most *max_words* words, preferring to cut
    at the last complete sentence (., ?, !) inside that window."""
    words = story_text.split()
    if len(words) <= max_words:
        return story_text
    shortened_text = " ".join(words[:max_words])
    # Latest sentence-ending punctuation inside the window.
    last_end = max(
        shortened_text.rfind('.'),
        shortened_text.rfind('?'),
        shortened_text.rfind('!'),
    )
    # If no sentence terminator was found, fall back to the hard word cut.
    return shortened_text[:last_end + 1] if last_end > 0 else shortened_text


def text2story(text):
    """Generate a short children's story from the caption *text*.

    Returns the story as a string, truncated so it ends on a complete
    sentence within roughly 100 words.
    """
    # Using a smaller text generation model
    generator = pipeline("text-generation", model="TinyLlama/TinyLlama-1.1B-Chat-v1.0")

    # Create a prompt for the story generation
    prompt = f"Write a fun children's story based on this: {text}. The story should be short and end naturally with a conclusion. Once upon a time, "

    # BUG FIX: the original passed max_length=250, which counts the prompt
    # tokens as well -- a long caption could leave almost no token budget
    # for the story itself. max_new_tokens bounds only the continuation.
    story_result = generator(
        prompt,
        max_new_tokens=200,
        num_return_sequences=1,
        temperature=0.7,
        top_k=50,
        top_p=0.95,
        do_sample=True,
    )

    # The pipeline echoes the prompt; strip it and restore the opener.
    story_text = story_result[0]['generated_text']
    story_text = story_text.replace(prompt, "Once upon a time, ")

    return _end_on_sentence(story_text, max_words=100)

# text2audio - Using HelpingAI-TTS-v1 model
def text2audio(story_text):
    """Synthesize speech for *story_text*.

    Returns the TTS pipeline's output dict (the audio-array and
    sampling-rate keys vary by model), or None if synthesis fails.
    """
    try:
        # Use the HelpingAI TTS model as requested
        synthesizer = pipeline("text-to-speech", model="HelpingAI/HelpingAI-TTS-v1")

        # Limit text length to avoid timeouts; prefer to cut at the last
        # full sentence inside the window.
        max_chars = 500
        if len(story_text) > max_chars:
            last_period = story_text[:max_chars].rfind('.')
            if last_period > 0:
                story_text = story_text[:last_period + 1]
            else:
                story_text = story_text[:max_chars]

        # Generate speech.
        # BUG FIX: removed the leftover debug st.write() that displayed the
        # output dict's keys to end users in the app UI.
        return synthesizer(story_text)

    except Exception as e:
        # Surface the failure in the UI rather than crashing the app.
        st.error(f"Error generating audio: {str(e)}")
        return None

# main part
def _play_audio(speech_output):
    """Play a TTS pipeline result, probing the common output-key layouts."""
    if 'audio' in speech_output and 'sampling_rate' in speech_output:
        st.audio(speech_output['audio'], sample_rate=speech_output['sampling_rate'])
    elif 'audio_array' in speech_output and 'sampling_rate' in speech_output:
        st.audio(speech_output['audio_array'], sample_rate=speech_output['sampling_rate'])
    elif 'waveform' in speech_output and 'sample_rate' in speech_output:
        st.audio(speech_output['waveform'], sample_rate=speech_output['sample_rate'])
    else:
        # Fallback: treat the first large array-like value as the audio data.
        for value in speech_output.values():
            if hasattr(value, '__len__') and len(value) > 1000:
                if 'rate' in speech_output:
                    rate = speech_output['rate']
                elif 'sample_rate' in speech_output:
                    rate = speech_output['sample_rate']
                elif 'sampling_rate' in speech_output:
                    rate = speech_output['sampling_rate']
                else:
                    rate = 24000  # Default sample rate
                st.audio(value, sample_rate=rate)
                break
        else:
            st.error(f"Could not find compatible audio format in: {list(speech_output.keys())}")


st.set_page_config(page_title="Your Image to Audio Story", page_icon="🦜")
st.header("Turn Your Image to Audio Story")
uploaded_file = st.file_uploader("Select an Image...")

if uploaded_file is not None:
    # Display the uploaded image
    st.image(uploaded_file, caption="Uploaded Image", use_container_width=True)

    # Convert the file to a PIL Image
    image = Image.open(uploaded_file)

    # BUG FIX: Streamlit reruns the entire script on every widget
    # interaction, so clicking "Play Audio" used to re-run all three
    # pipelines (slow -- and with do_sample=True the story and audio
    # silently changed on each click). Cache the stage results in
    # session_state, keyed by the uploaded file.
    file_key = (uploaded_file.name, uploaded_file.size)
    if st.session_state.get("file_key") != file_key:
        # Stage 1: Image to Text
        st.text('Processing img2text...')
        caption = img2text(image)  # Pass PIL image to pipeline

        # Stage 2: Text to Story
        st.text('Generating a story...')
        story = text2story(caption)

        # Stage 3: Story to Audio data
        st.text('Generating audio data...')
        speech_output = text2audio(story)

        st.session_state["file_key"] = file_key
        st.session_state["caption"] = caption
        st.session_state["story"] = story
        st.session_state["speech_output"] = speech_output

    st.write(st.session_state["caption"])
    st.write(st.session_state["story"])
    speech_output = st.session_state["speech_output"]

    # Play button
    if st.button("Play Audio"):
        if speech_output is not None:
            # Try to play the audio directly
            try:
                _play_audio(speech_output)
            except Exception as e:
                st.error(f"Error playing audio: {str(e)}")
        else:
            st.error("Audio generation failed. Please try again.")