"""Streamlit app that turns an uploaded image into a caption, a story, and audio.

Pipeline: image -> caption (BLIP) -> story (text generation) -> speech (MMS TTS).
"""

import os
import tempfile
import time

import streamlit as st
from PIL import Image
from transformers import pipeline

# Hugging Face model identifiers used by each stage.
CAPTION_MODEL = "Salesforce/blip-image-captioning-base"
STORY_MODEL = "pranavpsv/genre-story-generator-v2"
TTS_MODEL = "facebook/mms-tts-eng"

# Upper bound on the generated story length passed to the text-generation pipeline.
STORY_MAX_LENGTH = 200
# Only this many leading characters of the story are sent to the TTS model.
TTS_MAX_CHARS = 1000


@st.cache_resource
def _load_pipeline(task, model):
    """Build a transformers pipeline once and reuse it across Streamlit reruns.

    Streamlit re-executes the script on every interaction; without caching,
    each rerun would construct (and reload the weights of) every model again.
    """
    return pipeline(task, model=model)


def generate_image_caption(image_path):
    """Generate a caption for the image at ``image_path``.

    Args:
        image_path: Path to an image file readable by the captioning pipeline.

    Returns:
        The ``generated_text`` of the first result returned by the pipeline.
    """
    img2caption = _load_pipeline("image-to-text", CAPTION_MODEL)
    result = img2caption(image_path)
    return result[0]['generated_text']


def text2story(text):
    """Generate a story that continues the given prompt text.

    Args:
        text: Prompt passed to the text-generation pipeline.

    Returns:
        The ``generated_text`` of the first result returned by the pipeline.
    """
    pipe = _load_pipeline("text-generation", STORY_MODEL)
    return pipe(text, max_length=STORY_MAX_LENGTH)[0]['generated_text']


def text_to_speech(text):
    """Convert text to speech audio.

    Failures are reported in the UI via ``st.error`` rather than raised, so a
    TTS problem does not hide the caption and story that were already shown.

    Args:
        text: Text to narrate; only the first ``TTS_MAX_CHARS`` characters
            are sent to the model.

    Returns:
        ``(audio, sampling_rate)`` taken from the pipeline output, or
        ``(None, None)`` if speech generation failed.
    """
    try:
        tts_pipe = _load_pipeline("text-to-audio", TTS_MODEL)
        audio_output = tts_pipe(text[:TTS_MAX_CHARS])
        return audio_output['audio'], audio_output['sampling_rate']
    except Exception as e:
        st.error(f"Speech generation failed: {str(e)}")
        return None, None


def _save_as_temp_jpeg(image):
    """Write ``image`` to a new temporary ``.jpg`` file and return its path.

    The caller is responsible for deleting the returned file.
    """
    # Reserve a unique path, then close the handle before writing: writing to
    # a path whose NamedTemporaryFile handle is still open fails on Windows.
    with tempfile.NamedTemporaryFile(suffix=".jpg", delete=False) as temp_file:
        temp_path = temp_file.name
    try:
        # JPEG has no alpha/palette support, so PNG uploads in RGBA or P mode
        # must be converted to RGB before saving.
        image.convert("RGB").save(temp_path, format="JPEG")
    except Exception:
        # Do not leak the reserved file if the write fails.
        if os.path.exists(temp_path):
            os.remove(temp_path)
        raise
    return temp_path


def main():
    """Render the upload UI and run the caption -> story -> audio pipeline."""
    st.title("Image to Story with Speech")
    st.write("Upload an image to generate a caption, story, and audio narration")

    uploaded_image = st.file_uploader("Upload an image", type=["jpg", "jpeg", "png"])
    if uploaded_image is None:
        return

    # Bound up front so the cleanup in ``finally`` never has to probe locals().
    image_path = None
    try:
        with st.spinner("Processing image..."):
            image = Image.open(uploaded_image)
            st.image(image, caption="Uploaded Image", use_column_width=True)
            image_path = _save_as_temp_jpeg(image)

        with st.spinner("Generating caption..."):
            caption = generate_image_caption(image_path)
        st.subheader("Generated Caption")
        st.write(caption)

        with st.spinner("Generating story..."):
            story = text2story(caption)
        st.subheader("Generated Story")
        st.write(story)

        with st.spinner("Generating audio..."):
            audio_array, sample_rate = text_to_speech(story)
        if audio_array is not None:
            st.subheader("Audio Narration")
            st.audio(audio_array, sample_rate=sample_rate)
    except Exception as e:
        # Top-level UI boundary: surface the failure to the user instead of
        # showing a raw traceback.
        st.error(f"An error occurred: {str(e)}")
    finally:
        # Clean up the temporary image file, if one was created.
        if image_path is not None and os.path.exists(image_path):
            os.remove(image_path)


if __name__ == "__main__":
    main()