Spaces:

CR7CAD
/

Assignment1

Sleeping

File size: 3,275 Bytes

90bef38
5b9e396
90bef38
b038974
e1ee436
7c5a1e4
 
 
b038974
7c5a1e4
 
 
 
 
 
 
 
 
b038974
7c5a1e4
 
90bef38
7c5a1e4
 
 
 
 
 
90bef38
7c5a1e4
 
b038974
7c5a1e4
 
 
b038974
7c5a1e4
 
1fb1e8e
7c5a1e4
b038974
7c5a1e4
 
90bef38
7c5a1e4
 
 
 
 
90bef38
7c5a1e4
 
 
 
 
 
 
90bef38
 
7c5a1e4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5b9e396
7c5a1e4

import os
import tempfile

import streamlit as st
from PIL import Image
from transformers import pipeline

# function part
# img2text
@st.cache_resource(show_spinner=False)
def _load_image_to_text_model():
    """Build the Donut image-to-text pipeline once and reuse it.

    Streamlit re-executes the script on every widget interaction, so an
    uncached ``pipeline(...)`` call would rebuild the model on each rerun.
    """
    return pipeline("image-to-text", model="naver-clova-ix/donut-base")


def img2text(image_path):
    """Extract text from an image with the Donut image-to-text pipeline.

    Args:
        image_path: Location of the image; handed to ``PIL.Image.open``.

    Returns:
        The ``"generated_text"`` of the first pipeline result,
        ``"No text detected"`` when the pipeline returns nothing, or an
        ``"Error: ..."`` string when loading or inference raised (the error
        is also reported through ``st.error``).
    """
    try:
        image_to_text_model = _load_image_to_text_model()
        # The context manager closes the underlying file handle as soon as
        # inference is done, so the caller can delete the file afterwards.
        with Image.open(image_path) as image:
            result = image_to_text_model(image)
        return result[0]["generated_text"] if result else "No text detected"
    except Exception as e:
        # UI boundary: surface any failure to the user instead of crashing
        # the app, and keep returning a string as callers expect.
        st.error(f"Error processing image: {e}")
        return f"Error: {e}"

# text2story
def text2story(text):
    """Wrap the extracted text in a fixed introductory sentence.

    Placeholder implementation: no real story generation happens yet; the
    input is embedded unchanged after a constant lead-in so later stages
    have something to work with.

    Args:
        text: The text extracted from the image.

    Returns:
        The lead-in sentence followed by ``text``.
    """
    story_template = "Here's a story based on the text: {}"
    return story_template.format(text)

# text2audio
@st.cache_resource(show_spinner=False)
def _load_tts_model():
    """Build the text-to-speech pipeline once and reuse it.

    Streamlit re-executes the script on every widget interaction, so an
    uncached ``pipeline(...)`` call would rebuild the model on each rerun.
    """
    # Note: You may need to install additional dependencies depending on the model used
    return pipeline("text-to-speech", model="espnet/kan-bayashi_ljspeech_vits")


def text2audio(story_text):
    """Generate audio for the story text with the text-to-speech pipeline.

    Args:
        story_text: The text to synthesize.

    Returns:
        The pipeline's output for ``story_text``, or ``None`` when loading
        the model or synthesis raised (the error is also reported through
        ``st.error``).
    """
    try:
        tts_model = _load_tts_model()
        return tts_model(story_text)
    except Exception as e:
        # UI boundary: report the failure and let the caller fall back.
        st.error(f"Error generating audio: {e}")
        return None

# main part
st.set_page_config(page_title="Your Image to Audio Story",
                   page_icon="🦜")
st.header("Turn Your Image to Audio Story")
st.subheader("Using Donut model for text extraction")

uploaded_file = st.file_uploader("Select an Image...", type=['png', 'jpg', 'jpeg', 'gif', 'bmp', 'webp'])

if uploaded_file is not None:
    # Display the uploaded image
    st.image(uploaded_file, caption="Uploaded Image",
             use_column_width=True)

    # Save the upload to a private temporary file because img2text is given
    # a path. The client-supplied filename is used only for its extension:
    # writing to that name in the working directory could overwrite (and the
    # cleanup below would then delete) an unrelated file with the same name.
    suffix = os.path.splitext(uploaded_file.name)[1]
    with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as temp_file:
        temp_file.write(uploaded_file.getvalue())
        temp_image_path = temp_file.name

    try:
        # Stage 1: Image to Text
        with st.spinner('Processing img2text...'):
            extracted_text = img2text(temp_image_path)
            st.subheader("Extracted Text:")
            st.write(extracted_text)
    finally:
        # Remove the temporary file even if the stage above raised
        if os.path.exists(temp_image_path):
            os.remove(temp_image_path)

    # Stage 2: Text to Story
    with st.spinner('Generating a story...'):
        story = text2story(extracted_text)
        st.subheader("Generated Story:")
        st.write(story)

    # Stage 3: Story to Audio data
    with st.spinner('Generating audio data...'):
        audio_data = text2audio(story)

    # Play button
    # NOTE: pressing the button makes Streamlit rerun the script, so the
    # stages above execute again before playback starts.
    if st.button("Play Audio"):
        if audio_data:
            st.audio(audio_data['audio'],
                     format="audio/wav",
                     start_time=0,
                     sample_rate=audio_data['sampling_rate'])
        else:
            st.warning("Audio generation failed. Playing a placeholder audio.")
            try:
                st.audio("kids_playing_audio.wav")
            except FileNotFoundError:
                st.error("Placeholder audio file not found. Audio playback is unavailable.")