# ysuneu's picture
# Update app.py
# a3cc323 verified
import streamlit as st
from PIL import Image
from transformers import pipeline
def _build_caption_pipeline():
    """Construct the BLIP image-captioning pipeline."""
    return pipeline("image-to-text", model="Salesforce/blip-image-captioning-base")


def generate_image_caption(image):
    """Generate a caption for the given image using a pre-trained model.

    Args:
        image: The image to describe, in a form accepted by the
            ``image-to-text`` pipeline (``main`` passes an RGB PIL image).

    Returns:
        The ``generated_text`` entry of the first pipeline result.
    """
    # Building the pipeline is the expensive step. Routing it through
    # st.cache_resource stores the pipeline in Streamlit's resource cache so
    # later calls reuse it instead of constructing a new one every time.
    img2caption = st.cache_resource(_build_caption_pipeline)()
    # Generate caption
    result = img2caption(image)
    return result[0]['generated_text']
def _build_story_pipeline():
    """Construct the text-generation pipeline used to write stories."""
    return pipeline("text-generation", model="pranavpsv/genre-story-generator-v2")


def _limit_words(text, max_words):
    """Cap *text* at *max_words* whitespace-separated words.

    Text within the limit is returned unchanged. Longer text is cut after
    ``max_words`` words and closed with a period, unless the cut already
    lands on sentence-ending punctuation.
    """
    words = text.split()
    if len(words) <= max_words:
        return text
    # Drop a dangling separator so the result never ends in ",." or ":.".
    clipped = ' '.join(words[:max_words]).rstrip(' ,;:')
    # Only add a period when the cut did not already end a sentence;
    # unconditionally appending one produced "..", "!." or "?.".
    if not clipped.endswith(('.', '!', '?')):
        clipped += '.'
    return clipped


def text2story(text):
    """Expand a short text into a generated story of at most 100 words.

    Args:
        text: Prompt passed to the text-generation pipeline (``main`` passes
            the image caption).

    Returns:
        The ``generated_text`` of the first pipeline result, truncated to
        100 words when it is longer.
    """
    # Reuse the pipeline from Streamlit's resource cache rather than
    # constructing a new one on every call.
    text_to_story_model = st.cache_resource(_build_story_pipeline)()
    story_text = text_to_story_model(text, max_new_tokens=150)[0]['generated_text']
    return _limit_words(story_text, 100)
def _build_speech_pipeline():
    """Construct the text-to-speech pipeline."""
    return pipeline("text-to-speech", model="facebook/mms-tts-eng")


def text2speech(text):
    """Convert text to speech using a pre-trained model.

    Args:
        text: The text to synthesize.

    Returns:
        The pipeline's output, unchanged. ``main`` reads its ``"audio"`` and
        ``"sampling_rate"`` entries.
    """
    # Reuse the pipeline from Streamlit's resource cache rather than
    # constructing a new one on every call.
    speech_pipe = st.cache_resource(_build_speech_pipeline)()
    return speech_pipe(text)
def main():
    """Render the app: upload an image, caption it, write a story, narrate it.

    With no upload, a warning is shown and nothing else runs. With an upload
    that cannot be decoded as an image, an error is shown instead of letting
    the exception reach the page. Otherwise the three stages run in order:
    image -> caption -> story -> audio.
    """
    # App title
    st.title("Storyteller on Hugging Face")
    st.write("Welcome to the image to story audio app!")
    uploaded_image = st.file_uploader(
        "Upload an image (jpg, jpeg, png)",
        type=["jpg", "jpeg", "png"],
    )
    if uploaded_image is None:
        st.warning("⚠️ Please upload an image file")
        return

    # The uploader restricts files by extension only, which does not guarantee
    # decodable contents. Pillow reports unreadable image data with OSError
    # (UnidentifiedImageError is a subclass of it).
    try:
        image = Image.open(uploaded_image).convert("RGB")
    except OSError:
        st.error("Could not read the uploaded file as an image. Please upload a valid jpg, jpeg or png file.")
        return
    st.image(image, caption="Uploaded Image", use_column_width=True)

    # Stage 1: Image to Text
    st.text('Processing img2text...')
    image_caption = generate_image_caption(image)
    st.write(image_caption)

    # Stage 2: Text to Story
    st.text('Processing text2story...')
    story = text2story(image_caption)
    st.write("Generated Story:", story)

    # Stage 3: Story to Speech
    st.text('Processing story2speech...')
    speech_output = text2speech(story)
    st.audio(speech_output["audio"], sample_rate=speech_output["sampling_rate"])
# Start the app only when this file is run as a script, not when it is imported.
if __name__ == "__main__":
    main()