Spaces:
Sleeping
Sleeping
File size: 5,252 Bytes
e77741a 90bef38 8d5fabf f9b627f 118cd25 cd245d5 e77741a f9b627f e77741a cd245d5 8d5fabf e77741a 5f21a2d e77741a 5f21a2d e77741a 5f21a2d e77741a 5f21a2d e77741a 5f21a2d e77741a 5f21a2d a79c9ac cd245d5 7df9b81 e77741a 7df9b81 76abf5e 3fd88eb 76abf5e 7df9b81 a79c9ac 5f21a2d a79c9ac 7df9b81 3fd88eb 8d5fabf cd245d5 f006a50 4e37056 f006a50 a084b90 cd245d5 f9b627f 8d5fabf e77741a f006a50 e77741a f006a50 e77741a f006a50 e77741a f006a50 a79c9ac 7df9b81 a79c9ac |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 |
# import part
from functools import lru_cache

import streamlit as st
from PIL import Image
from transformers import pipeline
# function part
# img2text - caption an image with a lightweight captioning model
@lru_cache(maxsize=1)
def _get_captioner():
    """Load the BLIP captioning pipeline once and reuse it across calls.

    Building a transformers pipeline loads model weights, which is far too
    expensive to repeat on every Streamlit rerun.
    """
    return pipeline("image-to-text", model="Salesforce/blip-image-captioning-base")


def img2text(image):
    """Generate a short caption for an image.

    Args:
        image: A PIL image (anything the image-to-text pipeline accepts).

    Returns:
        str: The generated caption text.
    """
    # max_new_tokens keeps the caption short and the call fast
    return _get_captioner()(image, max_new_tokens=20)[0]["generated_text"]
# text2story - expand a caption into a short children's story
@lru_cache(maxsize=1)
def _get_story_generator():
    """Load the distilgpt2 text-generation pipeline once per process."""
    return pipeline("text-generation", model="distilgpt2")


def text2story(text):
    """Turn a short image caption into a children's story.

    Args:
        text: Image caption used to seed the story.

    Returns:
        str: A story starting with "Once upon a time, ", truncated at the
        last complete sentence when one exists.
    """
    generator = _get_story_generator()
    # Constrained prompt keeps generation short, on-topic, and fast
    prompt = f"A short children's story about {text}: Once upon a time, "
    story_result = generator(
        prompt,
        max_new_tokens=100,  # hard cap on generated length for speed
        num_return_sequences=1,
        temperature=0.7,
        top_k=50,
        do_sample=True,
    )
    generated = story_result[0]['generated_text']
    # The pipeline echoes the prompt verbatim at the start of its output.
    # Slice it off by length instead of str.replace(), which would also
    # clobber any matching substring the model happened to generate later.
    if generated.startswith(prompt):
        story_text = "Once upon a time, " + generated[len(prompt):]
    else:
        story_text = generated
    # Truncate at the last sentence-ending punctuation mark, if any
    # (>= 0 rather than > 0: rfind returns -1 only when nothing was found)
    last_end = max(story_text.rfind(ch) for ch in ".?!")
    if last_end >= 0:
        story_text = story_text[:last_end + 1]
    return story_text
# text2audio - synthesize speech for the story with HelpingAI-TTS-v1
def text2audio(story_text):
    """Synthesize speech for *story_text* with HelpingAI/HelpingAI-TTS-v1.

    The input is trimmed to at most 500 characters (preferring a sentence
    boundary) so the TTS call does not time out.

    Returns:
        The pipeline's output on success, or None if synthesis fails
        (an error message is shown in the Streamlit UI instead).
    """
    try:
        # Build the TTS pipeline for the requested model
        synthesizer = pipeline("text-to-speech", model="HelpingAI/HelpingAI-TTS-v1")
        # Trim overly long stories so synthesis stays responsive
        max_chars = 500
        if len(story_text) > max_chars:
            head = story_text[:max_chars]
            cut = head.rfind('.')
            story_text = head[:cut + 1] if cut > 0 else head
        return synthesizer(story_text)
    except Exception as e:
        st.error(f"Error generating audio: {str(e)}")
        return None
# main part
def _play_speech(speech_output):
    """Render a TTS pipeline result with st.audio.

    TTS pipelines are not consistent about the dict keys they return, so try
    the known (audio key, rate key) pairs first, then fall back to scanning
    for any sufficiently long array-like value.
    """
    # Known (audio key, sample-rate key) combinations, in preference order
    known_formats = (
        ("audio", "sampling_rate"),
        ("audio_array", "sampling_rate"),
        ("waveform", "sample_rate"),
    )
    for audio_key, rate_key in known_formats:
        if audio_key in speech_output and rate_key in speech_output:
            st.audio(speech_output[audio_key], sample_rate=speech_output[rate_key])
            return
    # Fallback: treat the first long array-like value as the audio data
    for value in speech_output.values():
        if hasattr(value, '__len__') and len(value) > 1000:
            for rate_key in ('rate', 'sample_rate', 'sampling_rate'):
                if rate_key in speech_output:
                    st.audio(value, sample_rate=speech_output[rate_key])
                    break
            else:
                st.audio(value, sample_rate=24000)  # default sample rate
            return
    st.error(f"Could not find compatible audio format in: {list(speech_output.keys())}")


st.set_page_config(page_title="Your Image to Audio Story", page_icon="🦜")
st.header("Turn Your Image to Audio Story")

uploaded_file = st.file_uploader("Select an Image...")

if uploaded_file is not None:
    # Decode the upload into a PIL image before anything else reads the
    # UploadedFile buffer, so the read position cannot interfere with decoding
    image = Image.open(uploaded_file)
    st.image(uploaded_file, caption="Uploaded Image", use_container_width=True)

    # Progress indicator spanning the three pipeline stages
    progress_bar = st.progress(0)

    # Stage 1: Image to Text
    with st.spinner('Processing image caption...'):
        caption = img2text(image)
    progress_bar.progress(33)
    st.write(f"**Image caption:** {caption}")

    # Stage 2: Text to Story
    with st.spinner('Creating story...'):
        story = text2story(caption)
    progress_bar.progress(66)
    st.write(f"**Story:** {story}")

    # Stage 3: Story to Audio data
    with st.spinner('Generating audio...'):
        speech_output = text2audio(story)
    progress_bar.progress(100)

    # Play button. NOTE(review): pressing the button reruns the whole script,
    # regenerating caption/story/audio; consider caching the three stage
    # results in st.session_state to avoid the recomputation.
    if st.button("Play Audio"):
        if speech_output is not None:
            try:
                _play_speech(speech_output)
            except Exception as e:
                st.error(f"Error playing audio: {str(e)}")
        else:
            st.error("Audio generation failed. Please try again.")