Spaces:
Sleeping
Sleeping
File size: 5,290 Bytes
e77741a 90bef38 8d5fabf 118cd25 cd245d5 a4fc174 9e7cf7c a4fc174 cd245d5 8d5fabf a4fc174 5f21a2d a4fc174 9e7cf7c 5f21a2d 9e7cf7c a4fc174 5f21a2d a4fc174 5f21a2d a4fc174 5f21a2d 9e7cf7c 5f21a2d 9e7cf7c 5f21a2d a4fc174 cd245d5 7df9b81 e77741a 7df9b81 76abf5e 3fd88eb 76abf5e 7df9b81 a79c9ac a4fc174 a79c9ac a4fc174 5f21a2d a4fc174 a79c9ac 7df9b81 a4fc174 3fd88eb 8d5fabf a4fc174 cd245d5 f006a50 4e37056 f006a50 a084b90 cd245d5 a4fc174 e77741a f006a50 a4fc174 f006a50 a4fc174 f006a50 a4fc174 f006a50 a79c9ac a4fc174 a79c9ac 7df9b81 a4fc174 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 |
# import part
import os

import streamlit as st
from transformers import pipeline
# function part
# img2text
def img2text(image_path):
    """Return a caption for the image at *image_path* using a BLIP captioning model."""
    captioner = pipeline("image-to-text", model="sooh-j/blip-image-captioning-base")
    # The pipeline returns a list of dicts; take the first caption.
    return captioner(image_path)[0]["generated_text"]
# text2story - IMPROVED to end naturally
def text2story(text):
    """Generate a short children's story seeded by an image caption.

    Parameters
    ----------
    text : str
        Image caption used as the story premise.

    Returns
    -------
    str
        Story text beginning with "Once upon a time, ", truncated to end
        on a complete sentence within roughly 100 words.
    """
    # Using a smaller text generation model
    generator = pipeline("text-generation", model="TinyLlama/TinyLlama-1.1B-Chat-v1.0")
    # Create a prompt for the story generation
    prompt = f"Write a fun children's story based on this: {text}. The story should be short and end naturally with a conclusion. Once upon a time, "
    # Generate the story.  max_new_tokens (rather than max_length) counts only
    # the generated continuation, so the long prompt does not eat into the
    # story budget — max_length=250 included prompt tokens.
    story_result = generator(
        prompt,
        max_new_tokens=250,
        num_return_sequences=1,
        temperature=0.7,
        top_k=50,
        top_p=0.95,
        do_sample=True,
    )
    # Extract the generated text and strip the instruction prompt,
    # keeping the story opener.
    story_text = story_result[0]['generated_text']
    story_text = story_text.replace(prompt, "Once upon a time, ")
    return _truncate_to_sentence(story_text, max_words=100)


def _truncate_to_sentence(story_text, max_words=100):
    """Truncate *story_text* at the last complete sentence within *max_words* words."""
    words = story_text.split()
    if len(words) <= max_words:
        return story_text
    # Join the first max_words words
    shortened_text = " ".join(words[:max_words])
    # Index of the last sentence-ending punctuation mark, or -1 if none.
    last_end = max(
        shortened_text.rfind('.'),
        shortened_text.rfind('?'),
        shortened_text.rfind('!'),
    )
    if last_end > 0:
        # Truncate at the end of the last complete sentence
        return shortened_text[:last_end + 1]
    # If no sentence ending found, just use the shortened text
    return shortened_text
# text2audio - Simplified without numpy/scipy
def text2audio(story_text):
    """Synthesize speech for *story_text*; return the raw pipeline output, or None on failure."""
    try:
        # Use the HelpingAI TTS model as requested
        synthesizer = pipeline("text-to-speech", model="HelpingAI/HelpingAI-TTS-v1")
        # Limit text length to avoid timeouts; prefer cutting at a sentence end.
        max_chars = 500
        if len(story_text) > max_chars:
            cut = story_text[:max_chars].rfind('.')
            story_text = story_text[:cut + 1] if cut > 0 else story_text[:max_chars]
        # Generate speech
        st.write("Generating audio...")
        speech = synthesizer(story_text)
        st.write(f"Speech output keys: {list(speech.keys())}")
        # Hand the audio data straight back instead of saving to a file —
        # Streamlit's st.audio() can take raw audio data.
        return speech
    except Exception as e:
        st.error(f"Error generating audio: {str(e)}")
        import traceback
        st.error(traceback.format_exc())
        return None
# Function to save temporary image file
def save_uploaded_image(uploaded_file):
    """Persist an uploaded file under ./temp and return its path.

    Parameters
    ----------
    uploaded_file : file-like object exposing ``.name`` and ``.getvalue()``
        (e.g. a Streamlit UploadedFile).

    Returns
    -------
    str
        Path of the written file inside the ``temp`` directory.
    """
    # exist_ok=True replaces the race-prone exists()/makedirs() pair.
    os.makedirs("temp", exist_ok=True)
    image_path = os.path.join("temp", uploaded_file.name)
    with open(image_path, "wb") as f:
        f.write(uploaded_file.getvalue())
    return image_path
# main part
st.set_page_config(page_title="Your Image to Audio Story", page_icon="🦜")
st.header("Turn Your Image to Audio Story")
uploaded_file = st.file_uploader("Select an Image...")

if uploaded_file is not None:
    # Display the uploaded image
    st.image(uploaded_file, caption="Uploaded Image", use_container_width=True)

    # Save the image temporarily so the captioning pipeline gets a file path
    image_path = save_uploaded_image(uploaded_file)

    # Stage 1: Image to Text
    st.text('Processing img2text...')
    caption = img2text(image_path)
    st.write(caption)

    # Stage 2: Text to Story
    st.text('Generating a story...')
    story = text2story(caption)
    st.write(story)

    # Stage 3: Story to Audio data
    st.text('Generating audio data...')
    speech_output = text2audio(story)

    # Play button.  NOTE(review): pressing a Streamlit button reruns the
    # whole script, so the pipelines above execute again before playback —
    # consider st.session_state caching if that becomes too slow.
    if st.button("Play Audio"):
        if speech_output is not None:
            # Try to play the audio directly; TTS pipelines differ in the
            # key names they use for the waveform and sample rate.
            try:
                if 'audio' in speech_output and 'sampling_rate' in speech_output:
                    st.audio(speech_output['audio'], sample_rate=speech_output['sampling_rate'])
                elif 'audio_array' in speech_output and 'sampling_rate' in speech_output:
                    st.audio(speech_output['audio_array'], sample_rate=speech_output['sampling_rate'])
                elif 'waveform' in speech_output and 'sample_rate' in speech_output:
                    st.audio(speech_output['waveform'], sample_rate=speech_output['sample_rate'])
                else:
                    st.error(f"Could not find compatible audio format in: {list(speech_output.keys())}")
            except Exception as e:
                st.error(f"Error playing audio: {str(e)}")
        else:
            st.error("Audio generation failed. Please try again.")

    # Clean up the temporary file; ignore OS-level failures (e.g. the file
    # was already removed) instead of swallowing every exception with a
    # bare except, which hides real bugs like NameError.
    try:
        os.remove(image_path)
    except OSError:
        pass