# Imports
import streamlit as st
from transformers import pipeline
from PIL import Image
import torch
import os
import tempfile

# For TTS, try multiple options in order of preference
try:
    # Preferred backend: gTTS, which produces MP3 data
    from gtts import gTTS

    def text2audio(story_text):
        """Convert *story_text* to spoken English audio using gTTS.

        Args:
            story_text: The text to read aloud.

        Returns:
            A ``(audio_bytes, 'audio/mp3')`` tuple: the raw MP3 bytes and the
            MIME type string describing them.

        Raises:
            Whatever gTTS or the file operations raise; the temporary file is
            removed in every case.
        """
        # Reserve a temp file path; the handle is closed on leaving the
        # ``with`` so that gTTS can write to the path by name afterwards.
        with tempfile.NamedTemporaryFile(delete=False, suffix='.mp3') as temp_file:
            temp_filename = temp_file.name

        try:
            # Synthesize the speech and write it to the reserved path
            tts = gTTS(text=story_text, lang='en', slow=False)
            tts.save(temp_filename)

            # Load the MP3 back into memory so the file itself can be discarded
            with open(temp_filename, 'rb') as audio_file:
                audio_bytes = audio_file.read()
        finally:
            # delete=False means nothing else cleans this up; remove it even
            # when synthesis or reading failed so temp files do not pile up.
            os.unlink(temp_filename)

        return audio_bytes, 'audio/mp3'

except ImportError:
    st.warning("gTTS not available. Using alternative text-to-speech method.")

    # Fallback backend: the transformers text-to-speech pipeline
    def text2audio(story_text):
        """Convert *story_text* to audio with the transformers TTS pipeline.

        Args:
            story_text: The text to read aloud.

        Returns:
            An ``(audio, sampling_rate)`` tuple taken from the pipeline
            output. ``sampling_rate`` falls back to 16000 when the output has
            no ``'sampling_rate'`` entry.

        Raises:
            RuntimeError: If the pipeline output contains neither an
                ``'audio'`` nor an ``'audio_array'`` entry.
        """
        synthesizer = pipeline("text-to-speech", model="facebook/mms-tts-eng")
        speech = synthesizer(story_text)

        # Accept either key for the waveform, preferring 'audio'
        for audio_key in ('audio', 'audio_array'):
            if audio_key in speech:
                return speech[audio_key], speech.get('sampling_rate', 16000)

        raise RuntimeError("Failed to generate audio with any available method")

# Simple image-to-text function
def img2text(image):
    """Return a generated caption for *image*.

    Builds an ``image-to-text`` pipeline with the
    ``sooh-j/blip-image-captioning-base`` model, runs it on *image*, and
    returns the ``generated_text`` of the first result.
    """
    captioner = pipeline(
        "image-to-text",
        model="sooh-j/blip-image-captioning-base",
    )
    first_result = captioner(image)[0]
    return first_result["generated_text"]

# Improved text-to-story function with longer stories (approaching 100 words)
def _extract_story(generated_text, prompt, opener="Once upon a time, "):
    """Return the story part of *generated_text*, beginning with the opener.

    Args:
        generated_text: Full text returned by the generator (prompt included).
        prompt: The exact prompt that was sent to the generator.
        opener: Phrase the story is expected to start with.

    Returns:
        The text from the story's opening phrase onward. If the model did not
        produce the phrase, *opener* is prepended to the model's continuation.
    """
    # Remove the echoed prompt before searching: the prompt itself quotes the
    # opener, so searching the full text would match the instruction line
    # instead of the actual story.
    if generated_text.startswith(prompt):
        body = generated_text[len(prompt):]
    else:
        body = generated_text.replace(prompt, "")
    body = body.lstrip()

    # Match the phrase without its trailing ", " so that a continuation such
    # as "Once upon a time there was..." is recognised as well.
    start_idx = body.find(opener.rstrip(" ,"))
    if start_idx != -1:
        return body[start_idx:]
    return opener + body


def _trim_to_sentence(story_text, target_word_count=100, min_word_count=80):
    """Cut *story_text* at the sentence end closest to the target length.

    Args:
        story_text: The story to trim.
        target_word_count: Desired number of whitespace-separated words.
        min_word_count: Sentence ends before this many words are ignored.

    Returns:
        *story_text* truncated just after the '.', '?' or '!' whose word count
        is at least *min_word_count* and closest to *target_word_count*
        (earliest one on ties). If no such sentence end exists, the text is
        returned unchanged.
    """
    best_end = None
    best_distance = None

    for idx, char in enumerate(story_text):
        if char not in ".?!":
            continue

        word_count = len(story_text[:idx + 1].split())
        if word_count < min_word_count:
            continue

        distance = abs(word_count - target_word_count)
        if best_distance is None or distance < best_distance:
            best_end, best_distance = idx, distance

        # Word counts only grow with the index, so once the target is reached
        # every later sentence end is at least as far from it.
        if word_count >= target_word_count:
            break

    if best_end is None:
        return story_text
    return story_text[:best_end + 1]


def text2story(text):
    """Generate a children's story of roughly 100 words based on *text*.

    Args:
        text: Short description (e.g. an image caption) to build the story on.

    Returns:
        The generated story, starting with "Once upon a time" and, when a
        suitable sentence end exists, trimmed to the sentence end of at least
        80 words that is closest to 100 words.
    """
    generator = pipeline("text-generation", model="TinyLlama/TinyLlama-1.1B-Chat-v1.0")

    # Ask for 150-200 words so that at least ~100 usable words come back.
    # Written with explicit newlines so the exact whitespace sent to the
    # model is visible.
    prompt = (
        f"Write a children's story based on this: {text}. \n"
        "    The story should have a clear beginning, middle, and end.\n"
        "    Make the story approximately 150-200 words long with descriptive language.\n"
        '    Start with "Once upon a time, "\n'
        "    "
    )

    story_result = generator(
        prompt,
        max_length=500,
        num_return_sequences=1,
        temperature=0.7,
        do_sample=True,
    )

    story_text = _extract_story(story_result[0]['generated_text'], prompt)
    return _trim_to_sentence(story_text)

# Basic Streamlit interface
# Page flow: upload an image, caption it, expand the caption into a story,
# then read the story aloud.
st.title("Image to Audio Story")
uploaded_file = st.file_uploader("Upload an image")

# Everything below only runs once the user has supplied a file
if uploaded_file is not None:
    # Show the upload, then load it as a PIL image for the captioning model
    st.image(uploaded_file, caption="Uploaded Image")
    image = Image.open(uploaded_file)

    # Step 1: caption the image
    with st.spinner("Generating caption..."):
        caption = img2text(image)
    st.write(f"Caption: {caption}")

    # Step 2: expand the caption into a story and show its length
    with st.spinner("Creating story..."):
        story = text2story(caption)
        word_count = len(story.split())
        st.write(f"Story ({word_count} words):")
        st.write(story)

    # Step 3: synthesize and play the narration
    with st.spinner("Generating audio..."):
        try:
            audio_data, audio_format = text2audio(story)

            # text2audio returns either a MIME type string (encoded bytes) or
            # a sampling rate (raw waveform); pass the matching keyword.
            has_mime_type = (
                isinstance(audio_format, str)
                and audio_format.startswith('audio/')
            )
            playback_options = (
                {"format": audio_format}
                if has_mime_type
                else {"sample_rate": audio_format}
            )
            st.audio(audio_data, **playback_options)
        except Exception as e:
            st.error(f"Error generating or playing audio: {e}")
            st.info("There was an issue with the text-to-speech conversion.")