import streamlit as st
from PIL import Image
import os
import tempfile
import sys

# function part
# img2text with a model that doesn't require sentencepiece
def img2text(image_path):
    """Generate a text caption for the image stored at *image_path*.

    Runs the Hugging Face ``image-to-text`` pipeline with the
    ``Salesforce/blip-image-captioning-base`` model on the image. The model
    name indicates an image-captioning model, so the returned text is a
    description of the picture rather than text read from it.

    Args:
        image_path: Path of the image file to open with Pillow.

    Returns:
        The ``generated_text`` of the first pipeline result,
        ``"No text detected"`` when the pipeline returns nothing, or
        ``"Error: <message>"`` when any step raises (the error is also shown
        in the page through ``st.error``).
    """
    try:
        # Imported lazily so that a missing/broken transformers install is
        # reported in the page instead of crashing the script at import time.
        from transformers import pipeline

        # Use the Salesforce model instead of Donut to avoid sentencepiece issues
        st.info("Using Salesforce/blip-image-captioning-base model for image-to-text")
        # NOTE: the pipeline is rebuilt on every call to this function.
        image_to_text_model = pipeline("image-to-text", model="Salesforce/blip-image-captioning-base")

        # Keep the image open only while the model consumes it. Closing the
        # file handle here lets the caller delete the file afterwards.
        with Image.open(image_path) as image:
            result = image_to_text_model(image)

        # Get the generated text
        return result[0]["generated_text"] if result else "No text detected"
    except Exception as e:
        st.error(f"Error processing image: {e}")
        return f"Error: {e}"

# text2story
def text2story(text):
    """Wrap *text* in a fixed introductory sentence and return it.

    No story is generated yet; the input is simply appended to the
    introduction.

    Args:
        text: The text to embed in the story sentence.

    Returns:
        The introduction followed by *text*.
    """
    return "Here's a story based on the text: {}".format(text)

# text2audio using Google Text-to-Speech
def text2audio(story_text):
    """Convert *story_text* to speech with gTTS and save it as an MP3 file.

    Args:
        story_text: The text to synthesize (English, normal speed).

    Returns:
        The path of a newly created temporary ``.mp3`` file, or ``None`` when
        any step fails (the error is also shown through ``st.error``). The
        caller is responsible for deleting the returned file.
    """
    temp_audio_path = None
    try:
        # Imported lazily so a missing gTTS install is reported in the page.
        from gtts import gTTS

        # Reserve a unique file name; the handle is closed immediately so
        # gTTS can reopen the path for writing.
        with tempfile.NamedTemporaryFile(delete=False, suffix='.mp3') as temp_audio:
            temp_audio_path = temp_audio.name

        # Initialize gTTS and write the audio to the reserved path
        tts = gTTS(text=story_text, lang='en', slow=False)
        tts.save(temp_audio_path)

        return temp_audio_path
    except Exception as e:
        # The temp file was created with delete=False, so it must be removed
        # by hand when synthesis fails; otherwise it is left behind on disk.
        if temp_audio_path is not None:
            try:
                os.remove(temp_audio_path)
            except OSError:
                # Best-effort cleanup: the original error is what matters.
                pass
        st.error(f"Error generating audio: {e}")
        return None

# main part
st.set_page_config(page_title="Your Image to Audio Story",
                   page_icon="🦜")
st.header("Turn Your Image to Audio Story")
st.subheader("Image to Text to Audio Conversion")

uploaded_file = st.file_uploader("Select an Image...", type=['png', 'jpg', 'jpeg', 'gif', 'bmp', 'webp'])

if uploaded_file is not None:
    # Save the upload under a generated, unique temp name. Only the extension
    # of the client-supplied file name is reused: the name itself is untrusted
    # input and identical names from different sessions would otherwise
    # overwrite each other in the shared temp directory.
    image_suffix = os.path.splitext(os.path.basename(uploaded_file.name))[1]
    with tempfile.NamedTemporaryFile(delete=False, suffix=image_suffix) as image_file:
        image_file.write(uploaded_file.getvalue())
        image_temp_path = image_file.name

    try:
        # Display the uploaded image
        st.image(uploaded_file, caption="Uploaded Image",
                 use_column_width=True)

        # Stage 1: Image to Text
        with st.spinner('Processing img2text...'):
            extracted_text = img2text(image_temp_path)
            st.subheader("Extracted Text:")
            st.write(extracted_text)
    finally:
        # The image file is only needed by stage 1; remove it even if a step
        # above raised.
        if os.path.exists(image_temp_path):
            os.remove(image_temp_path)

    # Stage 2: Text to Story
    with st.spinner('Generating a story...'):
        story = text2story(extracted_text)
        st.subheader("Generated Story:")
        st.write(story)

    # Stage 3: Story to Audio data
    audio_bytes = None
    with st.spinner('Generating audio data...'):
        audio_file_path = text2audio(story)

    # Load the audio into memory and delete the temp file right away. Every
    # run of this script generates a new audio file, so deleting it only when
    # the button is pressed would leave the other files behind.
    if audio_file_path and os.path.exists(audio_file_path):
        try:
            with open(audio_file_path, "rb") as audio_file:
                audio_bytes = audio_file.read()
        finally:
            try:
                os.remove(audio_file_path)
            except OSError:
                # Best-effort cleanup; playback does not depend on it.
                pass

    # Play button
    if st.button("Play Audio"):
        if audio_bytes is not None:
            # Play the generated audio
            st.audio(audio_bytes, format="audio/mp3")
        else:
            st.warning("Audio generation failed. Playing a placeholder audio.")
            try:
                st.audio("kids_playing_audio.wav")
            except FileNotFoundError:
                st.error("Placeholder audio file not found. Audio playback is unavailable.")