Spaces:
Sleeping
Sleeping
File size: 6,343 Bytes
8fe6281 90bef38 8d5fabf ab8ead3 8fe6281 862568a ce9aea5 862568a ce9aea5 862568a ce9aea5 862568a ce9aea5 118cd25 ad4186a ab8ead3 ad4186a cd245d5 8d5fabf b6b91c6 5f21a2d ad4186a 5f21a2d b6b91c6 5f21a2d b6b91c6 5f21a2d b6b91c6 7c4bc18 b6b91c6 7c4bc18 b6b91c6 7c4bc18 b6b91c6 7c4bc18 b6b91c6 5f21a2d ad4186a 4e37056 ab8ead3 ad4186a f006a50 ad4186a ab8ead3 f006a50 ad4186a 8fe6281 ad4186a 8fe6281 b6b91c6 ad4186a 8fe6281 ce9aea5 8fe6281 ce9aea5 8fe6281 ce9aea5 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 |
# Imports
import streamlit as st
from transformers import pipeline
from PIL import Image
import torch
import os
import tempfile
# For TTS, try multiple options in order of preference
try:
    # Preferred backend: gTTS (requires the gtts package and network access).
    from gtts import gTTS

    def text2audio(story_text):
        """Convert story_text to speech using gTTS.

        Returns:
            tuple: (mp3_bytes, 'audio/mp3'). The caller distinguishes this
            backend from the transformers fallback by the string MIME type
            in the second element.
        """
        # gTTS.save() needs a file path, so use a named temp file we manage
        # ourselves (delete=False) rather than an auto-deleting one.
        temp_file = tempfile.NamedTemporaryFile(delete=False, suffix='.mp3')
        temp_filename = temp_file.name
        temp_file.close()
        try:
            # Synthesize speech and write the MP3 to the temp file.
            tts = gTTS(text=story_text, lang='en', slow=False)
            tts.save(temp_filename)
            # Read the generated audio back into memory.
            with open(temp_filename, 'rb') as audio_file:
                audio_bytes = audio_file.read()
        finally:
            # Always remove the temp file, even if synthesis or the read
            # fails (the original version leaked the file on error).
            os.unlink(temp_filename)
        return audio_bytes, 'audio/mp3'
except ImportError:
    st.warning("gTTS not available. Using alternative text-to-speech method.")

    def text2audio(story_text):
        """Fallback: synthesize speech with a transformers TTS pipeline.

        Returns:
            tuple: (audio_array, sampling_rate). The caller detects this
            backend by the integer sample rate instead of a MIME string.

        Raises:
            Exception: if the pipeline output contains no recognizable
            audio key.
        """
        # Imported lazily so the module still loads if transformers'
        # TTS extras are missing and this function is never called.
        from transformers import pipeline
        # A simple TTS model that should work with base transformers.
        synthesizer = pipeline("text-to-speech", model="facebook/mms-tts-eng")
        speech = synthesizer(story_text)
        # The output key varies across transformers versions; 16 kHz is the
        # documented default rate for MMS-TTS when none is reported.
        if 'audio' in speech:
            return speech['audio'], speech.get('sampling_rate', 16000)
        elif 'audio_array' in speech:
            return speech['audio_array'], speech.get('sampling_rate', 16000)
        else:
            raise Exception("Failed to generate audio with any available method")
# Caption generation: image -> short descriptive sentence.
def img2text(image):
    """Return a one-sentence caption for *image* using a BLIP captioning model."""
    captioner = pipeline("image-to-text", model="sooh-j/blip-image-captioning-base")
    return captioner(image)[0]["generated_text"]
# Improved text-to-story function with longer stories (approaching 100 words)
def _trim_to_sentence_end(story_text, target_word_count=100, min_word_count=80):
    """Trim *story_text* at a sentence boundary near *target_word_count* words.

    Collects every '.', '?' and '!' position, keeps candidates whose prefix
    has at least *min_word_count* words, and returns the prefix whose word
    count is closest to the target. If no boundary yields enough words, the
    text is returned unchanged.
    """
    sentence_ends = [i for i, ch in enumerate(story_text) if ch in '.?!']
    candidates = []
    for end_idx in sentence_ends:
        candidate = story_text[:end_idx + 1]
        word_count = len(candidate.split())
        if word_count >= min_word_count:
            candidates.append((end_idx, word_count))
    if candidates:
        # Pick the ending closest to the target word count.
        candidates.sort(key=lambda c: abs(c[1] - target_word_count))
        return story_text[:candidates[0][0] + 1]
    # No sentence boundary reaches min_word_count: return the story as is.
    # (The original also had a second backward scan here, but it repeated the
    # identical >= min_word_count test and so could never match — removed.)
    return story_text


def text2story(text):
    """Generate a children's story (~100 words) from a caption *text*.

    Prompts TinyLlama for a 150-200 word story, strips the prompt from the
    generated text, and trims the result at a natural sentence ending.
    """
    generator = pipeline("text-generation", model="TinyLlama/TinyLlama-1.1B-Chat-v1.0")
    # Specifically ask for a longer story (150-200 words) to ensure we get at least 100
    prompt = f"""Write a children's story based on this: {text}.
The story should have a clear beginning, middle, and end.
Make the story approximately 150-200 words long with descriptive language.
Start with "Once upon a time, "
"""
    # Generate a longer text with higher max_length to ensure a complete story.
    story_result = generator(
        prompt,
        max_length=500,
        num_return_sequences=1,
        temperature=0.7,
        do_sample=True
    )
    story_text = story_result[0]['generated_text']
    # Extract just the story part (after the prompt). The model was told to
    # start with "Once upon a time, ", so anchor on that phrase if present.
    if "Once upon a time, " in story_text:
        start_idx = story_text.find("Once upon a time, ")
        story_text = story_text[start_idx:]
    else:
        # Fall back to stripping the prompt and supplying the opening phrase.
        story_text = story_text.replace(prompt, "Once upon a time, ")
    # Cut the story at a natural sentence ending near 100 words.
    return _trim_to_sentence_end(story_text)
# Basic Streamlit interface: upload image -> caption -> story -> audio.
st.title("Image to Audio Story")
uploaded_file = st.file_uploader("Upload an image")
if uploaded_file is not None:
    # Show the uploaded image back to the user.
    st.image(uploaded_file, caption="Uploaded Image")
    # Convert the uploaded buffer to a PIL Image for the captioning model.
    image = Image.open(uploaded_file)
    # Step 1: image -> text caption.
    with st.spinner("Generating caption..."):
        caption = img2text(image)
        st.write(f"Caption: {caption}")
    # Step 2: caption -> children's story.
    with st.spinner("Creating story..."):
        story = text2story(caption)
        # Display word count for transparency.
        word_count = len(story.split())
        st.write(f"Story ({word_count} words):")
        st.write(story)
    # Step 3: story -> audio.
    with st.spinner("Generating audio..."):
        try:
            audio_data, audio_format = text2audio(story)
            # The gTTS backend returns a MIME string ('audio/mp3'); the
            # transformers fallback returns an integer sample rate instead.
            if isinstance(audio_format, str) and audio_format.startswith('audio/'):
                st.audio(audio_data, format=audio_format)
            else:
                st.audio(audio_data, sample_rate=audio_format)
        except Exception as e:
            # Best-effort playback: surface the error instead of crashing the app.
            st.error(f"Error generating or playing audio: {e}")
            st.info("There was an issue with the text-to-speech conversion.")