Spaces:
Sleeping
Sleeping
File size: 6,343 Bytes
8fe6281 90bef38 8d5fabf ab8ead3 8fe6281 862568a ce9aea5 862568a ce9aea5 862568a ce9aea5 862568a ce9aea5 118cd25 ad4186a ab8ead3 ad4186a cd245d5 8d5fabf b6b91c6 5f21a2d ad4186a 5f21a2d b6b91c6 5f21a2d b6b91c6 5f21a2d b6b91c6 7c4bc18 b6b91c6 7c4bc18 b6b91c6 7c4bc18 b6b91c6 7c4bc18 b6b91c6 5f21a2d ad4186a 4e37056 ab8ead3 ad4186a f006a50 ad4186a ab8ead3 f006a50 ad4186a 8fe6281 ad4186a 8fe6281 b6b91c6 ad4186a 8fe6281 ce9aea5 8fe6281 ce9aea5 8fe6281 ce9aea5 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 |
# Imports
import streamlit as st
from transformers import pipeline
from PIL import Image
import torch
import os
import tempfile
# For TTS, try multiple options in order of preference
try:
    # Preferred backend: gTTS (requires the gtts package and network access).
    from gtts import gTTS

    def text2audio(story_text):
        """Convert story_text to speech using gTTS.

        Returns:
            tuple: (mp3_bytes, 'audio/mp3'). The caller distinguishes this
            backend from the transformers fallback by the string MIME type
            in the second element.
        """
        # gTTS.save() needs a file path, so use a named temp file we manage
        # ourselves (delete=False) rather than an auto-deleting one.
        temp_file = tempfile.NamedTemporaryFile(delete=False, suffix='.mp3')
        temp_filename = temp_file.name
        temp_file.close()
        try:
            # Synthesize speech and write the MP3 to the temp file.
            tts = gTTS(text=story_text, lang='en', slow=False)
            tts.save(temp_filename)
            # Read the generated audio back into memory.
            with open(temp_filename, 'rb') as audio_file:
                audio_bytes = audio_file.read()
        finally:
            # Always remove the temp file, even if synthesis or the read
            # fails (the original version leaked the file on error).
            os.unlink(temp_filename)
        return audio_bytes, 'audio/mp3'
except ImportError:
    st.warning("gTTS not available. Using alternative text-to-speech method.")

    def text2audio(story_text):
        """Fallback: synthesize speech with a transformers TTS pipeline.

        Returns:
            tuple: (audio_array, sampling_rate). The caller detects this
            backend by the integer sample rate instead of a MIME string.

        Raises:
            Exception: if the pipeline output contains no recognizable
            audio key.
        """
        # Imported lazily so the module still loads if transformers'
        # TTS extras are missing and this function is never called.
        from transformers import pipeline
        # A simple TTS model that should work with base transformers.
        synthesizer = pipeline("text-to-speech", model="facebook/mms-tts-eng")
        speech = synthesizer(story_text)
        # The output key varies across transformers versions; 16 kHz is the
        # documented default rate for MMS-TTS when none is reported.
        if 'audio' in speech:
            return speech['audio'], speech.get('sampling_rate', 16000)
        elif 'audio_array' in speech:
            return speech['audio_array'], speech.get('sampling_rate', 16000)
        else:
            raise Exception("Failed to generate audio with any available method")
# Caption generation: image -> short descriptive sentence.
def img2text(image):
    """Return a one-sentence caption for *image* using a BLIP captioning model."""
    captioner = pipeline("image-to-text", model="sooh-j/blip-image-captioning-base")
    return captioner(image)[0]["generated_text"]
# Improved text-to-story function with longer stories (approaching 100 words)
def _trim_to_sentence_end(story_text, target_word_count=100, min_word_count=80):
    """Trim *story_text* at a sentence boundary near *target_word_count* words.

    Collects every '.', '?' and '!' position, keeps candidates whose prefix
    has at least *min_word_count* words, and returns the prefix whose word
    count is closest to the target. If no boundary yields enough words, the
    text is returned unchanged.
    """
    sentence_ends = [i for i, ch in enumerate(story_text) if ch in '.?!']
    candidates = []
    for end_idx in sentence_ends:
        candidate = story_text[:end_idx + 1]
        word_count = len(candidate.split())
        if word_count >= min_word_count:
            candidates.append((end_idx, word_count))
    if candidates:
        # Pick the ending closest to the target word count.
        candidates.sort(key=lambda c: abs(c[1] - target_word_count))
        return story_text[:candidates[0][0] + 1]
    # No sentence boundary reaches min_word_count: return the story as is.
    # (The original also had a second backward scan here, but it repeated the
    # identical >= min_word_count test and so could never match — removed.)
    return story_text


def text2story(text):
    """Generate a children's story (~100 words) from a caption *text*.

    Prompts TinyLlama for a 150-200 word story, strips the prompt from the
    generated text, and trims the result at a natural sentence ending.
    """
    generator = pipeline("text-generation", model="TinyLlama/TinyLlama-1.1B-Chat-v1.0")
    # Specifically ask for a longer story (150-200 words) to ensure we get at least 100
    prompt = f"""Write a children's story based on this: {text}.
The story should have a clear beginning, middle, and end.
Make the story approximately 150-200 words long with descriptive language.
Start with "Once upon a time, "
"""
    # Generate a longer text with higher max_length to ensure a complete story.
    story_result = generator(
        prompt,
        max_length=500,
        num_return_sequences=1,
        temperature=0.7,
        do_sample=True
    )
    story_text = story_result[0]['generated_text']
    # Extract just the story part (after the prompt). The model was told to
    # start with "Once upon a time, ", so anchor on that phrase if present.
    if "Once upon a time, " in story_text:
        start_idx = story_text.find("Once upon a time, ")
        story_text = story_text[start_idx:]
    else:
        # Fall back to stripping the prompt and supplying the opening phrase.
        story_text = story_text.replace(prompt, "Once upon a time, ")
    # Cut the story at a natural sentence ending near 100 words.
    return _trim_to_sentence_end(story_text)
# Basic Streamlit interface: upload image -> caption -> story -> audio.
st.title("Image to Audio Story")
uploaded_file = st.file_uploader("Upload an image")
if uploaded_file is not None:
    # Show the uploaded image back to the user.
    st.image(uploaded_file, caption="Uploaded Image")
    # Convert the uploaded buffer to a PIL Image for the captioning model.
    image = Image.open(uploaded_file)
    # Step 1: image -> text caption.
    with st.spinner("Generating caption..."):
        caption = img2text(image)
        st.write(f"Caption: {caption}")
    # Step 2: caption -> children's story.
    with st.spinner("Creating story..."):
        story = text2story(caption)
        # Display word count for transparency.
        word_count = len(story.split())
        st.write(f"Story ({word_count} words):")
        st.write(story)
    # Step 3: story -> audio.
    with st.spinner("Generating audio..."):
        try:
            audio_data, audio_format = text2audio(story)
            # The gTTS backend returns a MIME string ('audio/mp3'); the
            # transformers fallback returns an integer sample rate instead.
            if isinstance(audio_format, str) and audio_format.startswith('audio/'):
                st.audio(audio_data, format=audio_format)
            else:
                st.audio(audio_data, sample_rate=audio_format)
        except Exception as e:
            # Best-effort playback: surface the error instead of crashing the app.
            st.error(f"Error generating or playing audio: {e}")
            st.info("There was an issue with the text-to-speech conversion.")