import streamlit as st
from PIL import Image
from transformers import pipeline

@st.cache_resource
def _load_caption_pipeline():
    """Build the BLIP image-captioning pipeline once and reuse it.

    Streamlit re-executes the script on every interaction; caching the
    pipeline as a resource avoids re-loading the model on each rerun.
    """
    return pipeline("image-to-text", model="Salesforce/blip-image-captioning-base")


def generate_image_caption(image):
    """Generate a caption for the given image using a pre-trained model.

    Args:
        image: The image to caption. It is passed unchanged to the
            ``image-to-text`` pipeline.

    Returns:
        The ``generated_text`` entry of the first result produced by the
        pipeline.
    """
    img2caption = _load_caption_pipeline()

    # The pipeline returns a list of result dicts; only the first is used.
    result = img2caption(image)
    return result[0]['generated_text']

@st.cache_resource
def _load_story_pipeline():
    """Build the story text-generation pipeline once and reuse it.

    Streamlit re-executes the script on every interaction; caching the
    pipeline as a resource avoids re-loading the model on each rerun.
    """
    return pipeline("text-generation", model="pranavpsv/genre-story-generator-v2")


def text2story(text):
    """Generate a short story from a text prompt.

    The prompt is fed to a text-generation pipeline with up to 150 new
    tokens. If the generated text is longer than 100 whitespace-separated
    words it is cut to the first 100 words and closed with a full stop.

    Args:
        text: The prompt to generate the story from.

    Returns:
        The generated story text, at most 100 words long.
    """
    text_to_story_model = _load_story_pipeline()
    story_text = text_to_story_model(text, max_new_tokens=150)[0]['generated_text']

    words = story_text.split()
    if len(words) > 100:
        story_text = ' '.join(words[:100])
        # Close the cut-off text with a full stop, but do not stack it on
        # top of punctuation that is already there ("end..", "why?.").
        if not story_text.endswith(('.', '!', '?')):
            story_text = story_text.rstrip(' ,;:') + '.'
    return story_text

@st.cache_resource
def _load_speech_pipeline():
    """Build the text-to-speech pipeline once and reuse it.

    Streamlit re-executes the script on every interaction; caching the
    pipeline as a resource avoids re-loading the model on each rerun.
    """
    return pipeline("text-to-speech", model="facebook/mms-tts-eng")


def text2speech(text):
    """Convert text to speech using a pre-trained model.

    Args:
        text: The text to synthesise.

    Returns:
        The text-to-speech pipeline's output, returned unchanged. The
        caller in this file reads its ``"audio"`` and ``"sampling_rate"``
        entries.
    """
    speech_pipe = _load_speech_pipeline()
    return speech_pipe(text)

def main():
    """Render the Streamlit page and run the image -> caption -> story -> audio flow.

    Shows the title and an image uploader. Without an upload, a warning is
    displayed and nothing else happens. With an upload, the image is shown,
    captioned, turned into a story, and the story is played back as audio.
    """
    st.title("Storyteller on Hugging Face")
    st.write("Welcome to the image to story audio app!")

    upload = st.file_uploader("Upload an image (jpg, jpeg, png)", type=["jpg", "jpeg", "png"])

    # Nothing to process until the user provides a file.
    if upload is None:
        st.warning("⚠️ Please upload an image file")
        return

    # Normalise to RGB before display and captioning.
    picture = Image.open(upload).convert("RGB")
    st.image(picture, caption="Uploaded Image", use_column_width=True)

    # Step 1: caption the image.
    st.text('Processing img2text...')
    caption = generate_image_caption(picture)
    st.write(caption)

    # Step 2: expand the caption into a story.
    st.text('Processing text2story...')
    story = text2story(caption)
    st.write("Generated Story:", story)

    # Step 3: synthesise the story and play it back.
    st.text('Processing story2speech...')
    speech = text2speech(story)
    st.audio(speech["audio"], sample_rate=speech["sampling_rate"])

# Run the app only when this file is executed as the main module,
# not when it is imported from elsewhere.
if __name__ == "__main__":
    main()