# NOTE: Hugging Face Spaces page residue (status lines, commit hashes, and the
# line-number gutter) removed so this file is a valid Python module again.
# Imports
import streamlit as st
from transformers import pipeline
from PIL import Image
import torch
import os
import tempfile
# For TTS, try multiple options in order of preference
# TTS backends in order of preference: gTTS (network-based) first, then a
# local transformers text-to-speech pipeline as a fallback when gTTS is
# not installed.
try:
    # Try gTTS first
    from gtts import gTTS

    def text2audio(story_text):
        """Convert story_text to speech using gTTS.

        Returns:
            tuple: (mp3_bytes, 'audio/mp3') — raw MP3 data plus a MIME type
            string, suitable for st.audio(data, format=...).
        """
        # gTTS can only save to a path, so create a named temporary file.
        temp_file = tempfile.NamedTemporaryFile(delete=False, suffix='.mp3')
        temp_filename = temp_file.name
        temp_file.close()
        try:
            # Use gTTS to convert text to speech
            tts = gTTS(text=story_text, lang='en', slow=False)
            tts.save(temp_filename)
            # Read the generated MP3 back into memory
            with open(temp_filename, 'rb') as audio_file:
                audio_bytes = audio_file.read()
        finally:
            # Always remove the temp file, even when gTTS raises
            # (the original only cleaned up on the success path, leaking
            # one file per failed synthesis).
            os.unlink(temp_filename)
        return audio_bytes, 'audio/mp3'

except ImportError:
    st.warning("gTTS not available. Using alternative text-to-speech method.")

    def text2audio(story_text):
        """Convert story_text to speech with a transformers TTS pipeline.

        Returns:
            tuple: (audio_array, sampling_rate) — suitable for
            st.audio(data, sample_rate=...).

        Raises:
            Exception: if the pipeline output contains no recognizable
            audio key.
        """
        # Imported lazily so module import succeeds even if the TTS extras
        # are missing; the failure surfaces only when audio is requested.
        from transformers import pipeline

        synthesizer = pipeline("text-to-speech", model="facebook/mms-tts-eng")
        speech = synthesizer(story_text)
        # Output keys vary across transformers versions, so probe both.
        if 'audio' in speech:
            return speech['audio'], speech.get('sampling_rate', 16000)
        elif 'audio_array' in speech:
            return speech['audio_array'], speech.get('sampling_rate', 16000)
        else:
            raise Exception("Failed to generate audio with any available method")
def img2text(image):
    """Produce a short English caption for *image* via a BLIP captioning model."""
    captioner = pipeline("image-to-text", model="sooh-j/blip-image-captioning-base")
    predictions = captioner(image)
    return predictions[0]["generated_text"]
def _trim_to_natural_ending(story_text, min_story_length=100):
    """Cut *story_text* at a sentence boundary so it ends on a full thought.

    Args:
        story_text: generated story, possibly trailing off mid-sentence.
        min_story_length: minimum character index a usable sentence ending
            must reach (was a hard-coded 100; now a tunable default).

    Returns:
        str: the story truncated at a '.', '?' or '!' boundary, or the
        original text unchanged when no suitable boundary exists.
    """
    # Single pass collecting every sentence-ending index (the original
    # scanned the string three times, once per punctuation character).
    endings = [i for i, ch in enumerate(story_text) if ch in '.?!']
    # Keep only endings far enough in to leave a real story.
    suitable = [i for i in endings if i >= min_story_length]
    if not suitable:
        # No good ending found; return the text as is.
        return story_text
    if len(suitable) > 2:
        # Use the third sentence ending or later for a more complete story.
        return story_text[:suitable[2] + 1]
    # Few sentences available: end at the last boundary we found.
    return story_text[:suitable[-1] + 1]


def text2story(text):
    """Expand a caption into a short children's story with a natural ending.

    Args:
        text: image caption used as the story seed.

    Returns:
        str: a story starting with "Once upon a time, ", trimmed to a
        sentence boundary by _trim_to_natural_ending.
    """
    generator = pipeline("text-generation", model="TinyLlama/TinyLlama-1.1B-Chat-v1.0")
    prompt = f"Write a short children's story based on this: {text}. The story should have a clear beginning, middle, and end. Keep it under 150 words. Once upon a time, "
    # Generate a longer text to ensure we get a complete story.
    story_result = generator(
        prompt,
        max_length=300,
        num_return_sequences=1,
        temperature=0.7,
        do_sample=True
    )
    story_text = story_result[0]['generated_text']
    # Strip the instruction prompt, keeping only the story opener.
    story_text = story_text.replace(prompt, "Once upon a time, ")
    return _trim_to_natural_ending(story_text)
# --- Streamlit UI: upload an image, caption it, write a story, narrate it ---
st.title("Image to Audio Story")

uploaded_file = st.file_uploader("Upload an image")

if uploaded_file is not None:
    # Echo the uploaded picture back to the user.
    st.image(uploaded_file, caption="Uploaded Image")
    image = Image.open(uploaded_file)

    # Step 1: image -> caption.
    with st.spinner("Generating caption..."):
        caption = img2text(image)
        st.write(f"Caption: {caption}")

    # Step 2: caption -> story.
    with st.spinner("Creating story..."):
        story = text2story(caption)
        st.write(f"Story: {story}")

    # Step 3: story -> narration.
    with st.spinner("Generating audio..."):
        try:
            audio_data, audio_format = text2audio(story)
            # The gTTS backend returns a MIME type string; the transformers
            # fallback returns a sampling rate, so dispatch on the type.
            if isinstance(audio_format, str) and audio_format.startswith('audio/'):
                st.audio(audio_data, format=audio_format)
            else:
                st.audio(audio_data, sample_rate=audio_format)
        except Exception as exc:
            st.error(f"Error generating or playing audio: {exc}")
            st.info("There was an issue with the text-to-speech conversion.")