"""Streamlit app: upload an image, caption it with BLIP, expand the caption
into a short children's story with TinyLlama, and read the story aloud with
Microsoft SpeechT5 text-to-speech."""

import streamlit as st
import torch
from PIL import Image
from transformers import pipeline


@st.cache_resource(show_spinner=False)
def _load_pipeline(task: str, model: str):
    """Load a transformers pipeline once and cache it across Streamlit reruns.

    Streamlit re-executes the whole script on every interaction; without
    caching, each rerun would re-load (and possibly re-download) the models.
    """
    return pipeline(task, model=model)


def img2text(image) -> str:
    """Return a one-line caption for a PIL image using a BLIP captioning model."""
    image_to_text = _load_pipeline("image-to-text", "sooh-j/blip-image-captioning-base")
    return image_to_text(image)[0]["generated_text"]


def _trim_to_sentence_end(story_text: str, min_story_length: int = 100) -> str:
    """Cut generated text at a natural sentence boundary ('.', '?' or '!').

    Only boundaries at or beyond ``min_story_length`` characters are
    considered; the third such boundary is preferred (so the story is more
    than one thought), falling back to the last one found. If no suitable
    boundary exists, the text is returned unchanged.
    """
    suitable_endings = [
        i for i, char in enumerate(story_text)
        if char in ".?!" and i >= min_story_length
    ]
    if suitable_endings:
        # Third sentence ending when available, otherwise the last one found.
        cut = suitable_endings[min(2, len(suitable_endings) - 1)]
        return story_text[:cut + 1]
    return story_text


def text2story(text: str) -> str:
    """Generate a short children's story from a caption and end it cleanly."""
    generator = _load_pipeline("text-generation", "TinyLlama/TinyLlama-1.1B-Chat-v1.0")
    prompt = (
        f"Write a short children's story based on this: {text}. "
        "The story should have a clear beginning, middle, and end. "
        "Keep it under 150 words. Once upon a time, "
    )
    # Generate a longer text than needed so a complete story can be trimmed out.
    story_result = generator(
        prompt,
        max_length=300,
        num_return_sequences=1,
        temperature=0.7,
        do_sample=True,
    )
    story_text = story_result[0]["generated_text"]
    # Drop the instruction part of the prompt but keep the story opener.
    story_text = story_text.replace(prompt, "Once upon a time, ")
    return _trim_to_sentence_end(story_text)


def text2audio(story_text: str) -> dict:
    """Synthesize speech for ``story_text`` with SpeechT5.

    Returns the pipeline's output dict: ``{"audio": np.ndarray,
    "sampling_rate": int}``.
    """
    synthesizer = _load_pipeline("text-to-speech", "microsoft/speecht5_tts")

    # SpeechT5 requires a (1, 512) x-vector speaker embedding. The previous
    # code read `processor.speaker_embeddings["female"]`, but SpeechT5Processor
    # has no such attribute, so synthesis failed every time. A fixed random
    # embedding gives a consistent (if generic) voice.
    # TODO(review): for a natural voice, load a real x-vector, e.g. from the
    # "Matthijs/cmu-arctic-xvectors" dataset.
    rng = torch.Generator().manual_seed(0)
    speaker_embeddings = torch.randn((1, 512), generator=rng)

    # Truncate long stories, preferably at a sentence boundary, to keep
    # synthesis fast and avoid model input-length issues.
    max_chars = 500
    if len(story_text) > max_chars:
        last_period = story_text[:max_chars].rfind(".")
        if last_period > 0:
            story_text = story_text[:last_period + 1]
        else:
            story_text = story_text[:max_chars]

    # The text-to-speech pipeline takes the raw string and handles
    # tokenization and the HiFi-GAN vocoder internally; speaker embeddings
    # are passed through forward_params.
    return synthesizer(
        story_text,
        forward_params={"speaker_embeddings": speaker_embeddings},
    )


# ---- Streamlit interface -------------------------------------------------
st.title("Image to Audio Story")

uploaded_file = st.file_uploader("Upload an image")

if uploaded_file is not None:
    # Display image
    st.image(uploaded_file, caption="Uploaded Image")
    image = Image.open(uploaded_file)

    # Image to Text
    with st.spinner("Generating caption..."):
        caption = img2text(image)
    st.write(f"Caption: {caption}")

    # Text to Story
    with st.spinner("Creating story..."):
        story = text2story(caption)
    st.write(f"Story: {story}")

    # Text to Audio
    with st.spinner("Generating audio..."):
        try:
            speech_output = text2audio(story)
            st.audio(
                speech_output["audio"],
                sample_rate=speech_output["sampling_rate"],
            )
        except Exception as e:
            st.error(f"Error generating or playing audio: {e}")
            st.write("Try installing the latest transformers library with: pip install --upgrade transformers")