File size: 5,789 Bytes
e77741a
90bef38
8d5fabf
f9b627f
118cd25
cd245d5
9e7cf7c
f9b627f
9e7cf7c
 
 
 
cd245d5
8d5fabf
9e7cf7c
5f21a2d
9e7cf7c
 
5f21a2d
9e7cf7c
 
5f21a2d
9e7cf7c
5f21a2d
 
9e7cf7c
5f21a2d
 
 
9e7cf7c
5f21a2d
 
 
 
 
 
 
9e7cf7c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5f21a2d
 
 
a79c9ac
cd245d5
7df9b81
e77741a
 
7df9b81
76abf5e
 
 
 
3fd88eb
 
 
76abf5e
7df9b81
a79c9ac
 
5f21a2d
a79c9ac
7df9b81
 
 
3fd88eb
8d5fabf
cd245d5
f006a50
 
 
4e37056
 
f006a50
a084b90
cd245d5
f9b627f
 
8d5fabf
e77741a
 
 
f006a50
e77741a
 
 
 
f006a50
 
e77741a
 
 
 
f006a50
 
e77741a
 
 
f006a50
 
 
a79c9ac
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7df9b81
a79c9ac
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
# import part
import hashlib

import streamlit as st
from PIL import Image
from transformers import pipeline

# function part
# img2text - Using the original model
@st.cache_resource(show_spinner=False)
def _load_captioner():
    """Build the image-captioning pipeline once and reuse it.

    Streamlit re-executes the whole script on every interaction, so without
    caching the model would be reloaded on each rerun.
    """
    return pipeline("image-to-text", model="sooh-j/blip-image-captioning-base")


def img2text(image):
    """Return a short caption describing ``image``.

    Args:
        image: The image handed to the ``image-to-text`` pipeline (the script
            passes a PIL image).

    Returns:
        The ``generated_text`` of the first pipeline result.
    """
    captioner = _load_captioner()
    # Cap the caption length to keep inference fast.
    results = captioner(image, max_new_tokens=30)
    return results[0]["generated_text"]

# text2story - Using the original model but with optimized parameters
@st.cache_resource(show_spinner=False)
def _load_story_generator():
    """Build the TinyLlama text-generation pipeline once and reuse it.

    Streamlit re-executes the whole script on every interaction, so without
    caching the model would be reloaded on each rerun.
    """
    return pipeline("text-generation", model="TinyLlama/TinyLlama-1.1B-Chat-v1.0")


def _trim_to_complete_sentence(story_text, max_words=100):
    """Cap ``story_text`` at ``max_words`` words and end it on a full sentence.

    The text is cut back to the last ``.``, ``?`` or ``!``. If no terminator
    is found, the (possibly word-capped) text is returned as is.
    """
    words = story_text.split()
    if len(words) > max_words:
        story_text = " ".join(words[:max_words])

    # Generation stops at a token budget, so even a story under the word cap
    # usually ends mid-sentence; always cut back to the last terminator.
    last_end = max(story_text.rfind(mark) for mark in ".?!")
    if last_end > 0:
        return story_text[:last_end + 1]
    return story_text


def text2story(text):
    """Generate a short children's story from an image caption.

    Args:
        text: The caption the story should be based on.

    Returns:
        A story that starts with "Once upon a time, ", is at most 100 words
        long, and ends at the last complete sentence when one is found.
    """
    generator = _load_story_generator()

    opening = "Once upon a time, "
    prompt = f"Write a brief children's story based on this: {text}. {opening}"

    # Sampling parameters are kept tight so generation stays fast.
    story_result = generator(
        prompt,
        max_new_tokens=150,
        num_return_sequences=1,
        temperature=0.7,
        top_k=50,
        top_p=0.95,
        do_sample=True,
    )

    # The generated text echoes the prompt; keep only the story opening.
    story_text = story_result[0]['generated_text'].replace(prompt, opening)
    return _trim_to_complete_sentence(story_text)

# text2audio - Using HelpingAI-TTS-v1 model
@st.cache_resource(show_spinner=False)
def _load_synthesizer():
    """Build the text-to-speech pipeline once and reuse it.

    Streamlit re-executes the whole script on every interaction, so without
    caching the model would be reloaded on each rerun.
    """
    return pipeline("text-to-speech", model="HelpingAI/HelpingAI-TTS-v1")


def text2audio(story_text):
    """Synthesize speech for ``story_text``.

    Text longer than 500 characters is shortened first, preferably at the
    last sentence terminator (``.``, ``?`` or ``!``) inside that window.

    Args:
        story_text: The story to read aloud.

    Returns:
        Whatever the text-to-speech pipeline returns, or ``None`` if loading
        the model or synthesizing failed (the error is shown via ``st.error``).
    """
    try:
        synthesizer = _load_synthesizer()

        # Limit text length to avoid timeouts.
        max_chars = 500
        if len(story_text) > max_chars:
            head = story_text[:max_chars]
            # Same sentence terminators as the story truncation uses.
            last_end = max(head.rfind(mark) for mark in ".?!")
            story_text = head[:last_end + 1] if last_end > 0 else head

        return synthesizer(story_text)

    except Exception as e:
        # UI boundary: report the failure to the user instead of crashing.
        st.error(f"Error generating audio: {str(e)}")
        return None

# main part
st.set_page_config(page_title="Your Image to Audio Story", page_icon="🦜")
st.header("Turn Your Image to Audio Story")
uploaded_file = st.file_uploader("Select an Image...")

if uploaded_file is not None:
    # The uploader accepts any file type, so reject non-images with a message
    # instead of letting the PIL error surface as a traceback.
    try:
        image = Image.open(uploaded_file)
    except OSError:
        st.error("Could not read the uploaded file as an image. Please select an image file.")
        st.stop()

    # Display the uploaded image
    st.image(uploaded_file, caption="Uploaded Image", use_container_width=True)

    # Streamlit reruns this script on every interaction (including the
    # "Play Audio" click). Keep each stage's result in session state, keyed by
    # the upload's content, so a rerun does not regenerate everything.
    state = st.session_state
    upload_digest = hashlib.sha256(uploaded_file.getvalue()).hexdigest()
    if state.get("story_upload_digest") != upload_digest:
        state["story_upload_digest"] = upload_digest
        for stale_key in ("story_caption", "story_text", "story_speech"):
            state.pop(stale_key, None)

    # Progress indicator
    progress_bar = st.progress(0)

    # Stage 1: Image to Text
    if "story_caption" not in state:
        with st.spinner('Processing image caption...'):
            state["story_caption"] = img2text(image)
    caption = state["story_caption"]
    progress_bar.progress(33)
    st.write(f"**Image caption:** {caption}")

    # Stage 2: Text to Story
    if "story_text" not in state:
        with st.spinner('Creating story...'):
            state["story_text"] = text2story(caption)
    story = state["story_text"]
    progress_bar.progress(66)
    st.write(f"**Story:** {story}")

    # Stage 3: Story to Audio data
    # A failed synthesis is stored as None, so the next rerun retries it.
    speech_output = state.get("story_speech")
    if speech_output is None:
        with st.spinner('Generating audio...'):
            speech_output = text2audio(story)
        state["story_speech"] = speech_output
    progress_bar.progress(100)

    # Play button
    if st.button("Play Audio"):
        if speech_output is None:
            st.error("Audio generation failed. Please try again.")
        else:
            try:
                # Known (audio key, sample-rate key) layouts, tried in order.
                known_layouts = (
                    ("audio", "sampling_rate"),
                    ("audio_array", "sampling_rate"),
                    ("waveform", "sample_rate"),
                )
                for data_key, rate_key in known_layouts:
                    if data_key in speech_output and rate_key in speech_output:
                        st.audio(speech_output[data_key], sample_rate=speech_output[rate_key])
                        break
                else:
                    # Unknown layout: use the first long array-like value as
                    # the audio data, with whichever rate key is present.
                    sample_rate = 24000  # Default sample rate
                    for rate_key in ("rate", "sample_rate", "sampling_rate"):
                        if rate_key in speech_output:
                            sample_rate = speech_output[rate_key]
                            break
                    for value in speech_output.values():
                        if hasattr(value, '__len__') and len(value) > 1000:
                            st.audio(value, sample_rate=sample_rate)
                            break
                    else:
                        st.error(f"Could not find compatible audio format in: {list(speech_output.keys())}")
            except Exception as e:
                st.error(f"Error playing audio: {str(e)}")