# import part - only using the two requested imports
import streamlit as st
from transformers import pipeline

# function part
# img2text
def img2text(image_path):
    """Caption an image with the BLIP image-to-text pipeline.

    Args:
        image_path: Anything the pipeline accepts directly (e.g. a local
            path or URL string), or the raw encoded image as a bytes-like
            object. The parameter name is kept for backward compatibility.

    Returns:
        The ``generated_text`` of the first caption returned by the pipeline.

    Note:
        The pipeline is constructed on every call; callers should avoid
        invoking this repeatedly for the same image.
    """
    import os
    import tempfile

    image_to_text = pipeline("image-to-text", model="sooh-j/blip-image-captioning-base")

    if not isinstance(image_path, (bytes, bytearray, memoryview)):
        return image_to_text(image_path)[0]["generated_text"]

    # Raw bytes are not among the pipeline's documented input types, so they
    # are spooled to disk and handed over as a local path instead.
    # delete=False: the file has to be closed before it is reopened by name
    # (mandatory on Windows), so cleanup is done manually below.
    with tempfile.NamedTemporaryFile(delete=False) as tmp:
        tmp.write(image_path)
        tmp_path = tmp.name
    try:
        return image_to_text(tmp_path)[0]["generated_text"]
    finally:
        os.remove(tmp_path)

# text2story - IMPROVED to end naturally
def text2story(text):
    """Generate a short children's story from an image caption.

    The story always opens with "Once upon a time, ", is capped at 100
    words, and is cut after its last complete sentence so that it does not
    stop mid-sentence when generation runs out of tokens.

    Args:
        text: The caption (or any short description) to base the story on.

    Returns:
        The story as a string.

    Note:
        The pipeline is constructed on every call, and sampling is enabled,
        so repeated calls with the same caption yield different stories.
    """
    generator = pipeline("text-generation", model="TinyLlama/TinyLlama-1.1B-Chat-v1.0")

    opening = "Once upon a time, "
    prompt = (
        f"Write a fun children's story based on this: {text}. "
        "The story should be short and end naturally with a conclusion. "
        f"{opening}"
    )

    outputs = generator(
        prompt,
        # max_new_tokens budgets only the continuation; max_length would
        # also count the prompt, so a long caption would shrink the story.
        max_new_tokens=250,
        num_return_sequences=1,
        temperature=0.7,
        top_k=50,
        top_p=0.95,
        do_sample=True,
        # Ask the pipeline for the continuation only instead of stripping
        # the prompt afterwards with an exact-match str.replace.
        return_full_text=False,
    )

    story_text = opening + outputs[0]["generated_text"].lstrip()
    return _trim_to_last_sentence(story_text, max_words=100)


def _trim_to_last_sentence(story, max_words=100):
    """Cap ``story`` at ``max_words`` words and cut after its last sentence end."""
    words = story.split()
    if len(words) > max_words:
        story = " ".join(words[:max_words])
    else:
        story = story.strip()

    # rfind returns -1 when a mark is absent, so max() picks the right-most
    # terminator that is actually present.
    last_end = max(story.rfind(mark) for mark in ".?!")
    if last_end > 0:
        return story[:last_end + 1]
    # No sentence terminator at all: keep the (possibly shortened) text.
    return story

# text2audio - Using HelpingAI-TTS-v1 model
def text2audio(story_text):
    """Synthesize speech for a story with the HelpingAI TTS pipeline.

    Text longer than 500 characters is shortened first, preferably at the
    last sentence end ('.', '?' or '!') inside that window, to keep
    synthesis time bounded.

    Args:
        story_text: The text to read aloud.

    Returns:
        Whatever the text-to-speech pipeline returns for the text (not
        inspected here), or ``None`` if the text is empty or synthesis
        fails. Failures are reported in the UI via ``st.error`` rather than
        raised.

    Note:
        The pipeline is constructed on every call.
    """
    try:
        if not story_text or not story_text.strip():
            st.error("Error generating audio: the story text is empty.")
            return None

        synthesizer = pipeline("text-to-speech", model="HelpingAI/HelpingAI-TTS-v1")

        max_chars = 500
        if len(story_text) > max_chars:
            clipped = story_text[:max_chars]
            # Same terminators as the story trimming; rfind yields -1 for
            # a mark that does not occur.
            last_end = max(clipped.rfind(mark) for mark in ".?!")
            story_text = clipped[:last_end + 1] if last_end > 0 else clipped

        return synthesizer(story_text)

    except Exception as e:
        # UI boundary: surface the problem to the user instead of crashing
        # the whole script run.
        st.error(f"Error generating audio: {str(e)}")
        return None

# main part
st.set_page_config(page_title="Your Image to Audio Story", page_icon="🦜")
st.header("Turn Your Image to Audio Story")
uploaded_file = st.file_uploader("Select an Image...")

if uploaded_file is not None:
    # Display the uploaded image
    st.image(uploaded_file, caption="Uploaded Image", use_container_width=True)

    image_bytes = uploaded_file.getvalue()

    # Streamlit re-executes this script from the top on every widget
    # interaction, including a click on "Play Audio". Results are therefore
    # kept in session state per uploaded image; otherwise each click would
    # rerun all three stages and sample a different story before playing.
    state = st.session_state
    if state.get("story_app_image") != image_bytes:
        for stale_key in ("story_app_caption", "story_app_story", "story_app_speech"):
            state.pop(stale_key, None)
        state["story_app_image"] = image_bytes

    # Stage 1: Image to Text
    st.text('Processing img2text...')
    if "story_app_caption" not in state:
        state["story_app_caption"] = img2text(image_bytes)
    caption = state["story_app_caption"]
    st.write(caption)

    # Stage 2: Text to Story
    st.text('Generating a story...')
    if "story_app_story" not in state:
        state["story_app_story"] = text2story(caption)
    story = state["story_app_story"]
    st.write(story)

    # Stage 3: Story to Audio data
    st.text('Generating audio data...')
    # A failed synthesis is stored as None, so it is retried on the next rerun.
    if state.get("story_app_speech") is None:
        state["story_app_speech"] = text2audio(story)
    speech_output = state["story_app_speech"]

    # Play button
    if st.button("Play Audio"):
        if speech_output is None:
            st.error("Audio generation failed. Please try again.")
        else:
            try:
                # Look for the samples under the known key names first.
                samples = next(
                    (speech_output[key]
                     for key in ("audio", "audio_array", "waveform")
                     if key in speech_output),
                    None,
                )
                if samples is None:
                    # Fall back to the first long array-like value.
                    samples = next(
                        (value for value in speech_output.values()
                         if hasattr(value, '__len__') and len(value) > 1000),
                        None,
                    )

                sample_rate = next(
                    (speech_output[key]
                     for key in ("sampling_rate", "sample_rate", "rate")
                     if key in speech_output),
                    24000,  # Default sample rate
                )

                if samples is None:
                    st.error(f"Could not find compatible audio format in: {list(speech_output.keys())}")
                else:
                    st.audio(samples, sample_rate=sample_rate)
            except Exception as e:
                st.error(f"Error playing audio: {str(e)}")