Spaces:

CR7CAD
/

Assignment1

Sleeping

File size: 4,958 Bytes

cd245d5
90bef38
8d5fabf
cd245d5
7df9b81
 
 
118cd25
cd245d5
 
 
b2cad31
cd245d5
 
8d5fabf
cd245d5
 
f006a50
6f17888
cd245d5
 
f006a50
d996989
 
 
 
 
 
 
 
 
 
 
6f17888
f006a50
 
 
 
 
b9c5fcd
 
 
 
f006a50
cd245d5
8d5fabf
7df9b81
cd245d5
7df9b81
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8d5fabf
cd245d5
 
 
 
 
 
 
 
 
 
 
8d5fabf
cd245d5
f006a50
 
 
4e37056
 
f006a50
a084b90
cd245d5
f006a50
 
8d5fabf
f006a50
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7df9b81
 
 
 
 
 
 
 
 
f006a50

# import part
import streamlit as st
from transformers import pipeline
import os
import numpy as np
import io
import scipy.io.wavfile as wavfile

# function part
# img2text
def img2text(image_path):
    """Return a caption for the image stored at *image_path*.

    Builds an "image-to-text" pipeline on each call and returns the
    ``generated_text`` field of the first result it produces.
    """
    captioner = pipeline("image-to-text", model="sooh-j/blip-image-captioning-base")
    predictions = captioner(image_path)
    first_prediction = predictions[0]
    return first_prediction["generated_text"]

# text2story
def text2story(text):
    """Generate a short children's story (at most 100 words) from *text*.

    The caption is embedded in a fixed prompt, a text-generation pipeline
    continues it, and the instruction part of the prompt is stripped so the
    result starts with "Once upon a time, ".

    Args:
        text: Caption or description the story should be based on.

    Returns:
        The story as a string, truncated to its first 100
        whitespace-separated words when the model produces more.
    """
    # Using a smaller text generation model
    generator = pipeline("text-generation", model="TinyLlama/TinyLlama-1.1B-Chat-v1.0")

    # Create a prompt for the story generation
    prompt = f"Write a fun children's story based on this: {text}. Once upon a time, "

    # Bound only the newly generated tokens. `max_length` also counts the
    # prompt tokens, so a long caption would shrink (or exhaust) the budget
    # left for the story itself.
    story_result = generator(
        prompt,
        max_new_tokens=150,
        num_return_sequences=1,
        temperature=0.7,
        top_k=50,
        top_p=0.95,
        do_sample=True,
    )

    # The pipeline returns prompt + continuation; keep only the story opener.
    story_text = story_result[0]['generated_text']
    story_text = story_text.replace(prompt, "Once upon a time, ")

    # Cap the story at 100 words (it may be shorter if the model stops early).
    words = story_text.split()
    if len(words) > 100:
        story_text = " ".join(words[:100])

    return story_text

# text2audio - REVISED to use facebook/mms-tts-eng model
def text2audio(story_text):
    """Synthesize *story_text* to speech and return it as WAV bytes.

    The text is split into chunks of at most 200 characters on word
    boundaries, each chunk is synthesized separately, and the resulting
    waveforms are joined into a single in-memory WAV file.

    Args:
        story_text: The text to read aloud.

    Returns:
        A dict with keys ``"audio"`` (WAV file content as bytes) and
        ``"sampling_rate"`` (int). If synthesis fails, an error is shown via
        ``st.error`` and the content of ``fallback_audio.wav`` is returned
        instead (with a sampling rate of 22050); ``None`` is returned when
        that file cannot be read either.
    """
    # Local import so this function stays self-contained.
    import textwrap

    try:
        # Use a smaller and more reliable TTS model
        synthesizer = pipeline("text-to-speech", model="facebook/mms-tts-eng")

        # Break the text into smaller chunks (prevent timeout). textwrap
        # splits on whitespace without losing any words; a fixed-stride
        # slice that is trimmed back to the last space would silently drop
        # the trimmed tail and start the next chunk mid-word.
        max_chunk_size = 200  # characters
        chunks = textwrap.wrap(
            story_text,
            width=max_chunk_size,
            break_on_hyphens=False,
        )

        # Process each chunk
        audio_arrays = []
        sampling_rate = None

        for chunk in chunks:
            speech = synthesizer(chunk)
            if sampling_rate is None:
                sampling_rate = speech["sampling_rate"]

            # Flatten to 1-D before joining: chunks have different lengths,
            # so arrays with a leading batch axis, e.g. (1, N), could not be
            # concatenated, and wavfile.write expects samples on axis 0.
            # NOTE(review): assumes mono output from the model - confirm.
            audio_arrays.append(np.asarray(speech["audio"]).reshape(-1))

        if not audio_arrays:
            # Empty or whitespace-only input produces no chunks.
            raise ValueError("no text to synthesize")

        # Combine all audio chunks
        combined_audio = np.concatenate(audio_arrays)

        # Create a BytesIO object to store the wave file
        wav_buffer = io.BytesIO()
        wavfile.write(wav_buffer, sampling_rate, combined_audio)

        return {
            "audio": wav_buffer.getvalue(),
            "sampling_rate": sampling_rate
        }

    except Exception as e:
        # Deliberately broad: any synthesis failure is reported in the UI and
        # answered with the fallback clip rather than crashing the app.
        st.error(f"Error generating audio: {str(e)}")
        # Fallback to a pre-recorded audio file if available
        try:
            with open("fallback_audio.wav", "rb") as f:
                return {
                    "audio": f.read(),
                    "sampling_rate": 22050  # Common sample rate
                }
        except OSError:
            # No readable fallback file; the caller handles None.
            return None

# Function to save temporary image file
def save_uploaded_image(uploaded_file):
    """Write an uploaded file into the local ``temp`` directory.

    Args:
        uploaded_file: Object exposing ``name`` (str) and ``getvalue()``
            (returning the file content as bytes).

    Returns:
        The path of the written file, always directly inside ``temp``.
    """
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs("temp", exist_ok=True)

    # The name is supplied by the client: keep only its final component so a
    # value such as "../../x" cannot place the file outside the temp folder.
    safe_name = os.path.basename(uploaded_file.name.replace("\\", "/"))
    if safe_name in ("", ".", ".."):
        safe_name = "uploaded_image"

    image_path = os.path.join("temp", safe_name)

    with open(image_path, "wb") as f:
        f.write(uploaded_file.getvalue())

    return image_path

# main part
st.set_page_config(page_title="Your Image to Audio Story", page_icon="🦜")
st.header("Turn Your Image to Audio Story")
uploaded_file = st.file_uploader("Select an Image...")

if uploaded_file is not None:
    # Display the uploaded image
    st.image(uploaded_file, caption="Uploaded Image", use_container_width=True)

    # Streamlit re-executes this script on every widget interaction, so a
    # click on "Play Audio" would otherwise re-run all three models and
    # produce a different (sampled) story. Keep the results in the session,
    # keyed by the uploaded file, and only compute what is missing.
    image_bytes = uploaded_file.getvalue()
    upload_key = (uploaded_file.name, hash(image_bytes))
    results = st.session_state.get("story_results")
    if results is None or results.get("key") != upload_key:
        results = {"key": upload_key}
        st.session_state["story_results"] = results

    # Stage 1: Image to Text
    st.text('Processing img2text...')
    if "caption" not in results:
        # Save the image temporarily; remove it even if captioning fails.
        image_path = save_uploaded_image(uploaded_file)
        try:
            results["caption"] = img2text(image_path)
        finally:
            try:
                os.remove(image_path)
            except OSError:
                # Best-effort cleanup: a leftover temp file is not fatal.
                pass
    caption = results["caption"]
    st.write(caption)

    # Stage 2: Text to Story
    st.text('Generating a story...')
    if "story" not in results:
        results["story"] = text2story(caption)
    story = results["story"]
    st.write(story)

    # Stage 3: Story to Audio data
    st.text('Generating audio data...')
    if results.get("audio") is None:
        # A failed attempt (None) is not cached, so the next rerun retries.
        results["audio"] = text2audio(story)
    audio_data = results["audio"]

    # Play button
    if st.button("Play Audio"):
        if audio_data:
            # The payload is a complete WAV file (bytes) that already carries
            # its sample rate; `sample_rate` only applies to raw numpy data.
            st.audio(
                audio_data["audio"],
                format="audio/wav",
                start_time=0,
            )
        else:
            st.error("Failed to generate audio. Please try again.")