# Hugging Face Space: image -> caption -> story -> audio (Streamlit app)
# Imports
import os
import tempfile

import streamlit as st
import torch
from PIL import Image
from transformers import pipeline
# For TTS, try multiple options in order of preference.
try:
    # Preferred backend: gTTS (Google Text-to-Speech), which produces MP3 bytes.
    from gtts import gTTS

    def text2audio(story_text):
        """Convert story_text to speech with gTTS.

        Returns:
            tuple: ``(audio_bytes, 'audio/mp3')`` — raw MP3 bytes plus a MIME
            string, suitable for ``st.audio(data, format=...)``.
        """
        # delete=False so gTTS can write to the file by name after we close it
        # (reopening an open temp file fails on Windows).
        temp_file = tempfile.NamedTemporaryFile(delete=False, suffix='.mp3')
        temp_filename = temp_file.name
        temp_file.close()
        try:
            # Use gTTS to convert text to speech, then read the result back.
            tts = gTTS(text=story_text, lang='en', slow=False)
            tts.save(temp_filename)
            with open(temp_filename, 'rb') as audio_file:
                audio_bytes = audio_file.read()
        finally:
            # Fix: clean up inside `finally` so a failed gTTS call (e.g. no
            # network) does not leak a temp file on every attempt.
            os.unlink(temp_filename)
        return audio_bytes, 'audio/mp3'
except ImportError:
    st.warning("gTTS not available. Using alternative text-to-speech method.")

    # Fallback: a transformers text-to-speech pipeline.
    def text2audio(story_text):
        """Convert story_text to speech with a transformers TTS pipeline.

        Returns:
            tuple: ``(audio_array, sampling_rate)`` — the caller distinguishes
            the two backends by whether the second element is a MIME string.

        Raises:
            Exception: if the pipeline output has no recognizable audio key.
        """
        # `pipeline` is already imported at module level; no re-import needed.
        synthesizer = pipeline("text-to-speech", model="facebook/mms-tts-eng")
        speech = synthesizer(story_text)
        # Different pipeline versions expose the waveform under different keys.
        if 'audio' in speech:
            return speech['audio'], speech.get('sampling_rate', 16000)
        elif 'audio_array' in speech:
            return speech['audio_array'], speech.get('sampling_rate', 16000)
        else:
            raise Exception("Failed to generate audio with any available method")
# Simple image-to-text function
def img2text(image):
    """Generate a one-line caption for *image* with a BLIP captioning model."""
    captioner = pipeline("image-to-text", model="sooh-j/blip-image-captioning-base")
    results = captioner(image)
    return results[0]["generated_text"]
# Improved text-to-story function with longer stories (approaching 100 words)
def text2story(text):
    """Expand a caption into a ~100-word children's story.

    Generates 150-200 words with TinyLlama, strips the echoed prompt, then
    trims the result back to the sentence boundary whose word count is
    closest to 100 (accepting anything >= 80 words).

    Args:
        text: the image caption to build the story around.

    Returns:
        str: the trimmed story text.
    """
    generator = pipeline("text-generation", model="TinyLlama/TinyLlama-1.1B-Chat-v1.0")
    # Specifically ask for a longer story (150-200 words) so that trimming
    # down to ~100 words always has a complete story to work with.
    prompt = f"""Write a children's story based on this: {text}.
The story should have a clear beginning, middle, and end.
Make the story approximately 150-200 words long with descriptive language.
Start with "Once upon a time, "
"""
    story_result = generator(
        prompt,
        max_length=500,  # generous cap so the story is not cut mid-sentence
        num_return_sequences=1,
        temperature=0.7,
        do_sample=True,
    )
    story_text = story_result[0]['generated_text']
    # The generated text echoes the prompt; keep only the story itself.
    if "Once upon a time, " in story_text:
        story_text = story_text[story_text.find("Once upon a time, "):]
    else:
        # Fallback: drop the prompt and re-attach the opening phrase.
        story_text = story_text.replace(prompt, "Once upon a time, ")
    return _trim_to_sentence(story_text)


def _trim_to_sentence(story_text, target_word_count=100, min_word_count=80):
    """Trim *story_text* at the sentence boundary closest to the target length.

    Returns the prefix ending in '.', '?' or '!' whose word count is >=
    *min_word_count* and closest to *target_word_count*; returns the text
    unchanged when no boundary reaches the minimum.
    """
    # Indices of every sentence-ending punctuation mark, in order.
    endings = [i for i, ch in enumerate(story_text) if ch in '.?!']
    # Candidate cut points that give a long-enough story.
    suitable = []
    for idx in endings:
        candidate = story_text[:idx + 1]
        word_count = len(candidate.split())
        if word_count >= min_word_count:
            suitable.append((idx, word_count))
    if suitable:
        # Pick the cut whose word count is closest to the target. (The old
        # fallback loop after this point was unreachable: it re-checked the
        # same >= 80-word condition that already failed, so it is removed.)
        best_idx = min(suitable, key=lambda item: abs(item[1] - target_word_count))[0]
        return story_text[:best_idx + 1]
    # No sentence boundary reached the minimum; return the story as is.
    return story_text
# --- Basic Streamlit interface -----------------------------------------------
st.title("Image to Audio Story")

uploaded = st.file_uploader("Upload an image")
if uploaded is not None:
    # Show the raw upload, then open it as a PIL image for the caption model.
    st.image(uploaded, caption="Uploaded Image")
    pil_image = Image.open(uploaded)

    # Step 1: caption the image.
    with st.spinner("Generating caption..."):
        caption = img2text(pil_image)
    st.write(f"Caption: {caption}")

    # Step 2: expand the caption into a short story.
    with st.spinner("Creating story..."):
        story = text2story(caption)
    # Display word count for transparency.
    word_count = len(story.split())
    st.write(f"Story ({word_count} words):")
    st.write(story)

    # Step 3: narrate the story.
    with st.spinner("Generating audio..."):
        try:
            audio_data, audio_format = text2audio(story)
            # gTTS returns a MIME string ('audio/mp3'); the fallback backend
            # returns a numeric sampling rate instead.
            is_mime = isinstance(audio_format, str) and audio_format.startswith('audio/')
            if is_mime:
                st.audio(audio_data, format=audio_format)
            else:
                st.audio(audio_data, sample_rate=audio_format)
        except Exception as e:
            st.error(f"Error generating or playing audio: {e}")
            st.info("There was an issue with the text-to-speech conversion.")