Spaces:

frankai98
/

ISOM5240_Assignment1

Build error

App Files Files Community

ISOM5240_Assignment1 / app.py

frankai98

Update app.py

0ea388d verified 12 months ago

raw

history blame contribute delete

3.54 kB

	import io
	import os
	import tempfile

	import streamlit as st
	import torch
	from gtts import gTTS
	from transformers import pipeline

	# Page chrome: browser-tab title and icon, the visible heading, and the
	# upload widget. set_page_config stays the first Streamlit call.
	st.set_page_config(
		page_title="Your Image to Audio Story",
		page_icon="🦜",
	)
	st.header("Turn Your Image📷 to a Short Audio Story🔊 for Children👶")
	# The result is compared against None further down before any processing.
	uploaded_file = st.file_uploader(
		"Select an Image After the Models are Loaded..."
	)

	# function part

	# Preload models once
	@st.cache_resource
	def load_models():
		"""Build the two Hugging Face pipelines used by the app.

		The function is wrapped in ``st.cache_resource``.

		Returns:
			A dict with the keys ``"img_model"`` (an ``image-to-text``
			pipeline) and ``"story_model"`` (a ``text-generation`` pipeline).
		"""
		captioner = pipeline("image-to-text", "cnmoro/tiny-image-captioning")
		storyteller = pipeline("text-generation", "Qwen/Qwen2.5-0.5B-Instruct")
		return {"img_model": captioner, "story_model": storyteller}


	# Module-level handle used by the helper functions below.
	models = load_models()


	# img2text
	def img2text(url):
		"""Caption an image with the ``"img_model"`` pipeline.

		Args:
			url: Image reference handed straight to the pipeline; the caller
				in this file passes a local file path.

		Returns:
			The ``"generated_text"`` field of the first pipeline result.
		"""
		first_result = models["img_model"](url)[0]
		return first_result["generated_text"]

	# text2story
	def text2story(text):
		"""Generate a short story about ``text`` with the ``"story_model"`` pipeline.

		Args:
			text: Subject of the story (the image caption in this app).

		Returns:
			The ``"content"`` of the last message in the conversation found
			under ``"generated_text"`` in the first pipeline result.
		"""
		prompt = f"Generate a brief 100-word story about: {text}"
		messages = [
			{"role": "system", "content": "You are a helpful assistant."},
			{"role": "user", "content": prompt},
		]
		conversation = models["story_model"](
			messages,
			max_new_tokens=100,
			do_sample=True,
			temperature=0.7,
		)[0]["generated_text"]
		# Take the final message rather than a fixed position: the reply is
		# appended after the messages that were sent, so indexing from the
		# end keeps working if the number of input messages ever changes.
		return conversation[-1]["content"]

	# text2audio
	def text2audio(story_text):
		"""Synthesize ``story_text`` to speech with gTTS into an in-memory buffer.

		Args:
			story_text: Text to be spoken; synthesized with ``lang='en'``.

		Returns:
			A dict with ``'audio'`` (a ``BytesIO`` holding the gTTS output,
			rewound to position 0) and ``'sampling_rate'`` (the constant
			16000).
		"""
		buffer = io.BytesIO()
		gTTS(text=story_text, lang='en', slow=False).write_to_fp(buffer)
		# Rewind so a consumer that reads the stream starts at the first byte.
		buffer.seek(0)
		# NOTE(review): 16000 is carried over unchanged; it has not been
		# verified against what gTTS emits, and the visible code only ever
		# reads the 'audio' entry.
		return {'audio': buffer, 'sampling_rate': 16000}

	# Main script flow: keep the caption/story/audio of the current upload in
	# session state so that reruns (e.g. a button click) do not recompute them.

	# Initialize session state variables
	if 'processed_data' not in st.session_state:
		st.session_state.processed_data = {
			'scenario': None,
			'story': None,
			'audio': None,
		}

	if uploaded_file is not None:
		st.image(
			uploaded_file,
			caption="Uploaded Image",
			use_container_width=True,
		)

		# Only process if file is new
		if st.session_state.get('current_file') != uploaded_file.name:
			# Forget the previous upload's results first, so a failure in any
			# stage below cannot leave an old story or audio paired with the
			# new image.
			st.session_state.processed_data.update(
				scenario=None,
				story=None,
				audio=None,
			)

			# Stage 1: Image to Text
			# img2text is given a file path, so spill the upload to a
			# temporary file. The name is chosen by tempfile, not taken from
			# the client-supplied filename, and the file is removed as soon
			# as the caption has been produced.
			with tempfile.NamedTemporaryFile(delete=False) as tmp_image:
				tmp_image.write(uploaded_file.getvalue())
				image_path = tmp_image.name
			try:
				with st.spinner('Processing image...'):
					st.session_state.processed_data['scenario'] = img2text(image_path)
			finally:
				os.remove(image_path)

			# Stage 2: Text to Story
			with st.spinner('Generating story...'):
				st.session_state.processed_data['story'] = text2story(
					st.session_state.processed_data['scenario']
				)

			# Stage 3: Story to Audio
			with st.spinner('Creating audio...'):
				st.session_state.processed_data['audio'] = text2audio(
					st.session_state.processed_data['story']
				)

			# Record the file as handled only after every stage succeeded;
			# if a stage raised, the next rerun retries instead of skipping.
			st.session_state.current_file = uploaded_file.name

		# Display results
		st.write("Caption:", st.session_state.processed_data['scenario'])
		st.write("Story:", st.session_state.processed_data['story'])

	# Keep audio button OUTSIDE file processing block
	if st.button("Play Audio of the Story Generated"):
		audio_data = st.session_state.processed_data.get('audio')
		if audio_data:
			# Convert BytesIO to bytes and specify format
			st.audio(
				audio_data['audio'].getvalue(),
				format="audio/mp3",
			)
		else:
			st.warning("Please generate a story first!")