File size: 3,122 Bytes
ea5ce1f
 
 
 
 
 
 
64553ef
ea5ce1f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
import streamlit as st
from PIL import Image
import time
from transformers import pipeline
import tempfile
import os


# Function to generate image caption
def generate_image_caption(image_path):
    """Caption an image with the BLIP base captioning model.

    Builds an ``image-to-text`` pipeline, runs it on ``image_path`` and
    returns the ``generated_text`` field of the first prediction.
    """
    captioner = pipeline(
        "image-to-text",
        model="Salesforce/blip-image-captioning-base",
    )
    predictions = captioner(image_path)
    first_prediction = predictions[0]
    return first_prediction['generated_text']

# Function to generate story from text
def text2story(text):
    """Expand ``text`` into a story with a text-generation pipeline.

    The prompt is fed to the ``pranavpsv/genre-story-generator-v2`` model
    with ``max_length=200``; the ``generated_text`` of the first result is
    returned.
    """
    generator = pipeline(
        "text-generation",
        model="pranavpsv/genre-story-generator-v2",
    )
    outputs = generator(text, max_length=200)
    return outputs[0]['generated_text']

# Function to convert text to speech
def text_to_speech(text):
    """Synthesize speech for ``text`` with the ``facebook/mms-tts-eng`` model.

    Returns an ``(audio, sampling_rate)`` pair taken from the pipeline
    output. If anything in the synthesis step raises, the error is shown
    via ``st.error`` and ``(None, None)`` is returned instead.
    """
    try:
        synthesizer = pipeline("text-to-audio", model="facebook/mms-tts-eng")

        # Only the first 1000 characters are sent to the model.
        clipped_text = text[:1000]
        speech = synthesizer(clipped_text)

        waveform = speech['audio']
        rate = speech['sampling_rate']
    except Exception as e:
        st.error(f"Speech generation failed: {str(e)}")
        return None, None
    return waveform, rate

# Main application
def main():
    """Render the page: upload an image, then show caption, story and audio.

    The uploaded image is displayed, written to a temporary ``.jpg`` file,
    captioned with ``generate_image_caption``, turned into a story with
    ``text2story`` and narrated with ``text_to_speech``. Any exception in
    that chain is reported with ``st.error``; the temporary file is removed
    in all cases.
    """
    st.title("Image to Story with Speech")
    st.write("Upload an image to generate a caption, story, and audio narration")

    uploaded_image = st.file_uploader("Upload an image", type=["jpg", "jpeg", "png"])
    if uploaded_image is None:
        return

    # Bound before the try so the cleanup in `finally` can rely on it
    # instead of probing locals().
    image_path = None
    try:
        # Process image
        with st.spinner("Processing image..."):
            image = Image.open(uploaded_image)
            # NOTE(review): newer Streamlit releases deprecate
            # `use_column_width` in favour of `use_container_width` --
            # confirm against the deployed version before switching.
            st.image(image, caption="Uploaded Image", use_column_width=True)

            # JPEG cannot store alpha or palette modes, so PNG uploads
            # (RGBA / LA / P) must be converted before they are written to
            # the .jpg temp file; otherwise Pillow raises on save.
            jpeg_ready = image if image.mode in ("RGB", "L") else image.convert("RGB")

            with tempfile.NamedTemporaryFile(suffix=".jpg", delete=False) as temp_file:
                # Record the path first so a failed save still gets cleaned up.
                image_path = temp_file.name
                # Write through the open handle: reopening the file by name
                # while it is still open is not possible on Windows.
                jpeg_ready.save(temp_file, format="JPEG")

        # Generate caption
        with st.spinner("Generating caption..."):
            caption = generate_image_caption(image_path)
            st.subheader("Generated Caption")
            st.write(caption)

        # Generate story
        with st.spinner("Generating story..."):
            story = text2story(caption)
            st.subheader("Generated Story")
            st.write(story)

        # Generate speech
        with st.spinner("Generating audio..."):
            audio_array, sample_rate = text_to_speech(story)
            if audio_array is not None:
                st.subheader("Audio Narration")
                st.audio(audio_array, sample_rate=sample_rate)

    except Exception as e:
        # Top-level UI boundary: surface the failure to the user.
        st.error(f"An error occurred: {str(e)}")
    finally:
        # Clean up temporary file (created with delete=False above).
        if image_path is not None and os.path.exists(image_path):
            os.remove(image_path)

# Entry point: build the page only when this file is executed as the main
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()