"""Streamlit app that turns an uploaded image into a narrated short story.

The flow is: image -> caption -> generated story -> synthesized speech,
with each stage backed by a Hugging Face ``transformers`` pipeline.
"""

import streamlit as st
from PIL import Image
from transformers import pipeline

CAPTION_MODEL = "Salesforce/blip-image-captioning-base"
STORY_MODEL = "pranavpsv/genre-story-generator-v2"
SPEECH_MODEL = "facebook/mms-tts-eng"


@st.cache_resource
def _load_pipeline(task, model):
    """Build a ``transformers`` pipeline once and reuse it afterwards.

    Streamlit re-executes the script on every user interaction; without
    caching, each stage would reload its model on every rerun.
    """
    return pipeline(task, model=model)


def generate_image_caption(image):
    """Generate a caption for the given image using a pre-trained model.

    Args:
        image: The image to describe (``main`` passes an RGB PIL image).

    Returns:
        The ``generated_text`` of the first result from the captioning
        pipeline.
    """
    img2caption = _load_pipeline("image-to-text", CAPTION_MODEL)
    result = img2caption(image)
    return result[0]["generated_text"]


def text2story(text, *, max_words=100):
    """Expand a short text prompt into a story of at most ``max_words`` words.

    Args:
        text: Prompt handed to the text-generation model.
        max_words: Upper bound on the number of whitespace-separated words
            in the returned story.

    Returns:
        The generated text. If it exceeds ``max_words`` words it is cut at
        that word boundary and closed with a period.
    """
    text_to_story_model = _load_pipeline("text-generation", STORY_MODEL)
    story_text = text_to_story_model(text, max_new_tokens=150)[0]["generated_text"]

    words = story_text.split()
    if len(words) > max_words:
        story_text = " ".join(words[:max_words])
        # Only add a closing period when the cut did not already land on
        # sentence-ending punctuation, to avoid producing "..".
        if not story_text.endswith((".", "!", "?")):
            story_text += "."
    return story_text


def text2speech(text):
    """Convert text to speech using a pre-trained model.

    Args:
        text: The text to synthesize.

    Returns:
        The raw output of the text-to-speech pipeline; ``main`` reads its
        ``"audio"`` and ``"sampling_rate"`` entries.
    """
    speech_pipe = _load_pipeline("text-to-speech", SPEECH_MODEL)
    return speech_pipe(text)


def main():
    """Render the app: upload an image, then caption, narrate and play it."""
    st.title("Storyteller on Hugging Face")
    st.write("Welcome to the image to story audio app!")

    uploaded_image = st.file_uploader(
        "Upload an image (jpg, jpeg, png)", type=["jpg", "jpeg", "png"]
    )
    if uploaded_image is None:
        st.warning("⚠️ Please upload an image file")
        return

    try:
        image = Image.open(uploaded_image).convert("RGB")
    except OSError:
        # Pillow reports unreadable or truncated image data as OSError
        # (UnidentifiedImageError is a subclass); show a message instead
        # of a traceback.
        st.error("Could not read the uploaded file as an image.")
        return
    st.image(image, caption="Uploaded Image", use_column_width=True)

    # Stage 1: Image to Text
    st.text('Processing img2text...')
    image_caption = generate_image_caption(image)
    st.write(image_caption)

    # Stage 2: Text to Story
    st.text('Processing text2story...')
    story = text2story(image_caption)
    st.write("Generated Story:", story)

    # Stage 3: Story to Speech
    st.text('Processing story2speech...')
    speech_output = text2speech(story)
    st.audio(speech_output["audio"], sample_rate=speech_output["sampling_rate"])


if __name__ == "__main__":
    main()