Spaces:
Sleeping
Sleeping
File size: 5,290 Bytes
e77741a 90bef38 8d5fabf 118cd25 cd245d5 a4fc174 9e7cf7c a4fc174 cd245d5 8d5fabf a4fc174 5f21a2d a4fc174 9e7cf7c 5f21a2d 9e7cf7c a4fc174 5f21a2d a4fc174 5f21a2d a4fc174 5f21a2d 9e7cf7c 5f21a2d 9e7cf7c 5f21a2d a4fc174 cd245d5 7df9b81 e77741a 7df9b81 76abf5e 3fd88eb 76abf5e 7df9b81 a79c9ac a4fc174 a79c9ac a4fc174 5f21a2d a4fc174 a79c9ac 7df9b81 a4fc174 3fd88eb 8d5fabf a4fc174 cd245d5 f006a50 4e37056 f006a50 a084b90 cd245d5 a4fc174 e77741a f006a50 a4fc174 f006a50 a4fc174 f006a50 a4fc174 f006a50 a79c9ac a4fc174 a79c9ac 7df9b81 a4fc174 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 |
# import part
import os

import streamlit as st
from transformers import pipeline
# function part
# img2text
def img2text(image_path):
    """Return a caption for the image at *image_path* using a BLIP captioning model."""
    captioner = pipeline("image-to-text", model="sooh-j/blip-image-captioning-base")
    # The pipeline returns a list of dicts; take the first caption.
    return captioner(image_path)[0]["generated_text"]
# text2story - IMPROVED to end naturally
def text2story(text):
    """Generate a short children's story seeded by an image caption.

    Parameters
    ----------
    text : str
        Image caption used as the story premise.

    Returns
    -------
    str
        Story text beginning with "Once upon a time, ", truncated to end
        on a complete sentence within roughly 100 words.
    """
    # Using a smaller text generation model
    generator = pipeline("text-generation", model="TinyLlama/TinyLlama-1.1B-Chat-v1.0")
    # Create a prompt for the story generation
    prompt = f"Write a fun children's story based on this: {text}. The story should be short and end naturally with a conclusion. Once upon a time, "
    # Generate the story.  max_new_tokens (rather than max_length) counts only
    # the generated continuation, so the long prompt does not eat into the
    # story budget — max_length=250 included prompt tokens.
    story_result = generator(
        prompt,
        max_new_tokens=250,
        num_return_sequences=1,
        temperature=0.7,
        top_k=50,
        top_p=0.95,
        do_sample=True,
    )
    # Extract the generated text and strip the instruction prompt,
    # keeping the story opener.
    story_text = story_result[0]['generated_text']
    story_text = story_text.replace(prompt, "Once upon a time, ")
    return _truncate_to_sentence(story_text, max_words=100)


def _truncate_to_sentence(story_text, max_words=100):
    """Truncate *story_text* at the last complete sentence within *max_words* words."""
    words = story_text.split()
    if len(words) <= max_words:
        return story_text
    # Join the first max_words words
    shortened_text = " ".join(words[:max_words])
    # Index of the last sentence-ending punctuation mark, or -1 if none.
    last_end = max(
        shortened_text.rfind('.'),
        shortened_text.rfind('?'),
        shortened_text.rfind('!'),
    )
    if last_end > 0:
        # Truncate at the end of the last complete sentence
        return shortened_text[:last_end + 1]
    # If no sentence ending found, just use the shortened text
    return shortened_text
# text2audio - Simplified without numpy/scipy
def text2audio(story_text):
    """Synthesize speech for *story_text*; return the raw pipeline output, or None on failure."""
    try:
        # Use the HelpingAI TTS model as requested
        synthesizer = pipeline("text-to-speech", model="HelpingAI/HelpingAI-TTS-v1")
        # Limit text length to avoid timeouts; prefer cutting at a sentence end.
        max_chars = 500
        if len(story_text) > max_chars:
            cut = story_text[:max_chars].rfind('.')
            story_text = story_text[:cut + 1] if cut > 0 else story_text[:max_chars]
        # Generate speech
        st.write("Generating audio...")
        speech = synthesizer(story_text)
        st.write(f"Speech output keys: {list(speech.keys())}")
        # Hand the audio data straight back instead of saving to a file —
        # Streamlit's st.audio() can take raw audio data.
        return speech
    except Exception as e:
        st.error(f"Error generating audio: {str(e)}")
        import traceback
        st.error(traceback.format_exc())
        return None
# Function to save temporary image file
def save_uploaded_image(uploaded_file):
    """Persist an uploaded file under ./temp and return its path.

    Parameters
    ----------
    uploaded_file : file-like object exposing ``.name`` and ``.getvalue()``
        (e.g. a Streamlit UploadedFile).

    Returns
    -------
    str
        Path of the written file inside the ``temp`` directory.
    """
    # exist_ok=True replaces the race-prone exists()/makedirs() pair.
    os.makedirs("temp", exist_ok=True)
    image_path = os.path.join("temp", uploaded_file.name)
    with open(image_path, "wb") as f:
        f.write(uploaded_file.getvalue())
    return image_path
# main part
st.set_page_config(page_title="Your Image to Audio Story", page_icon="🦜")
st.header("Turn Your Image to Audio Story")
uploaded_file = st.file_uploader("Select an Image...")

if uploaded_file is not None:
    # Display the uploaded image
    st.image(uploaded_file, caption="Uploaded Image", use_container_width=True)

    # Save the image temporarily so the captioning pipeline gets a file path
    image_path = save_uploaded_image(uploaded_file)

    # Stage 1: Image to Text
    st.text('Processing img2text...')
    caption = img2text(image_path)
    st.write(caption)

    # Stage 2: Text to Story
    st.text('Generating a story...')
    story = text2story(caption)
    st.write(story)

    # Stage 3: Story to Audio data
    st.text('Generating audio data...')
    speech_output = text2audio(story)

    # Play button.  NOTE(review): pressing a Streamlit button reruns the
    # whole script, so the pipelines above execute again before playback —
    # consider st.session_state caching if that becomes too slow.
    if st.button("Play Audio"):
        if speech_output is not None:
            # Try to play the audio directly; TTS pipelines differ in the
            # key names they use for the waveform and sample rate.
            try:
                if 'audio' in speech_output and 'sampling_rate' in speech_output:
                    st.audio(speech_output['audio'], sample_rate=speech_output['sampling_rate'])
                elif 'audio_array' in speech_output and 'sampling_rate' in speech_output:
                    st.audio(speech_output['audio_array'], sample_rate=speech_output['sampling_rate'])
                elif 'waveform' in speech_output and 'sample_rate' in speech_output:
                    st.audio(speech_output['waveform'], sample_rate=speech_output['sample_rate'])
                else:
                    st.error(f"Could not find compatible audio format in: {list(speech_output.keys())}")
            except Exception as e:
                st.error(f"Error playing audio: {str(e)}")
        else:
            st.error("Audio generation failed. Please try again.")

    # Clean up the temporary file; ignore OS-level failures (e.g. the file
    # was already removed) instead of swallowing every exception with a
    # bare except, which hides real bugs like NameError.
    try:
        os.remove(image_path)
    except OSError:
        pass