Spaces:
Sleeping
Sleeping
File size: 5,252 Bytes
e77741a 90bef38 8d5fabf f9b627f 118cd25 cd245d5 e77741a f9b627f e77741a cd245d5 8d5fabf e77741a 5f21a2d e77741a 5f21a2d e77741a 5f21a2d e77741a 5f21a2d e77741a 5f21a2d e77741a 5f21a2d a79c9ac cd245d5 7df9b81 e77741a 7df9b81 76abf5e 3fd88eb 76abf5e 7df9b81 a79c9ac 5f21a2d a79c9ac 7df9b81 3fd88eb 8d5fabf cd245d5 f006a50 4e37056 f006a50 a084b90 cd245d5 f9b627f 8d5fabf e77741a f006a50 e77741a f006a50 e77741a f006a50 e77741a f006a50 a79c9ac 7df9b81 a79c9ac |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 |
# import part
from functools import lru_cache

import streamlit as st
from PIL import Image
from transformers import pipeline
# function part
# img2text - caption an image with a lightweight captioning model
@lru_cache(maxsize=1)
def _get_captioner():
    """Load the BLIP captioning pipeline once and reuse it across calls.

    Building a transformers pipeline loads model weights, which is far too
    expensive to repeat on every Streamlit rerun.
    """
    return pipeline("image-to-text", model="Salesforce/blip-image-captioning-base")


def img2text(image):
    """Generate a short caption for an image.

    Args:
        image: A PIL image (anything the image-to-text pipeline accepts).

    Returns:
        str: The generated caption text.
    """
    # max_new_tokens keeps the caption short and the call fast
    return _get_captioner()(image, max_new_tokens=20)[0]["generated_text"]
# text2story - expand a caption into a short children's story
@lru_cache(maxsize=1)
def _get_story_generator():
    """Load the distilgpt2 text-generation pipeline once per process."""
    return pipeline("text-generation", model="distilgpt2")


def text2story(text):
    """Turn a short image caption into a children's story.

    Args:
        text: Image caption used to seed the story.

    Returns:
        str: A story starting with "Once upon a time, ", truncated at the
        last complete sentence when one exists.
    """
    generator = _get_story_generator()
    # Constrained prompt keeps generation short, on-topic, and fast
    prompt = f"A short children's story about {text}: Once upon a time, "
    story_result = generator(
        prompt,
        max_new_tokens=100,  # hard cap on generated length for speed
        num_return_sequences=1,
        temperature=0.7,
        top_k=50,
        do_sample=True,
    )
    generated = story_result[0]['generated_text']
    # The pipeline echoes the prompt verbatim at the start of its output.
    # Slice it off by length instead of str.replace(), which would also
    # clobber any matching substring the model happened to generate later.
    if generated.startswith(prompt):
        story_text = "Once upon a time, " + generated[len(prompt):]
    else:
        story_text = generated
    # Truncate at the last sentence-ending punctuation mark, if any
    # (>= 0 rather than > 0: rfind returns -1 only when nothing was found)
    last_end = max(story_text.rfind(ch) for ch in ".?!")
    if last_end >= 0:
        story_text = story_text[:last_end + 1]
    return story_text
# text2audio - synthesize speech for the story with HelpingAI-TTS-v1
def text2audio(story_text):
    """Synthesize speech for *story_text* with HelpingAI/HelpingAI-TTS-v1.

    The input is trimmed to at most 500 characters (preferring a sentence
    boundary) so the TTS call does not time out.

    Returns:
        The pipeline's output on success, or None if synthesis fails
        (an error message is shown in the Streamlit UI instead).
    """
    try:
        # Build the TTS pipeline for the requested model
        synthesizer = pipeline("text-to-speech", model="HelpingAI/HelpingAI-TTS-v1")
        # Trim overly long stories so synthesis stays responsive
        max_chars = 500
        if len(story_text) > max_chars:
            head = story_text[:max_chars]
            cut = head.rfind('.')
            story_text = head[:cut + 1] if cut > 0 else head
        return synthesizer(story_text)
    except Exception as e:
        st.error(f"Error generating audio: {str(e)}")
        return None
# main part
def _play_speech(speech_output):
    """Render a TTS pipeline result with st.audio.

    TTS pipelines are not consistent about the dict keys they return, so try
    the known (audio key, rate key) pairs first, then fall back to scanning
    for any sufficiently long array-like value.
    """
    # Known (audio key, sample-rate key) combinations, in preference order
    known_formats = (
        ("audio", "sampling_rate"),
        ("audio_array", "sampling_rate"),
        ("waveform", "sample_rate"),
    )
    for audio_key, rate_key in known_formats:
        if audio_key in speech_output and rate_key in speech_output:
            st.audio(speech_output[audio_key], sample_rate=speech_output[rate_key])
            return
    # Fallback: treat the first long array-like value as the audio data
    for value in speech_output.values():
        if hasattr(value, '__len__') and len(value) > 1000:
            for rate_key in ('rate', 'sample_rate', 'sampling_rate'):
                if rate_key in speech_output:
                    st.audio(value, sample_rate=speech_output[rate_key])
                    break
            else:
                st.audio(value, sample_rate=24000)  # default sample rate
            return
    st.error(f"Could not find compatible audio format in: {list(speech_output.keys())}")


st.set_page_config(page_title="Your Image to Audio Story", page_icon="🦜")
st.header("Turn Your Image to Audio Story")

uploaded_file = st.file_uploader("Select an Image...")

if uploaded_file is not None:
    # Decode the upload into a PIL image before anything else reads the
    # UploadedFile buffer, so the read position cannot interfere with decoding
    image = Image.open(uploaded_file)
    st.image(uploaded_file, caption="Uploaded Image", use_container_width=True)

    # Progress indicator spanning the three pipeline stages
    progress_bar = st.progress(0)

    # Stage 1: Image to Text
    with st.spinner('Processing image caption...'):
        caption = img2text(image)
    progress_bar.progress(33)
    st.write(f"**Image caption:** {caption}")

    # Stage 2: Text to Story
    with st.spinner('Creating story...'):
        story = text2story(caption)
    progress_bar.progress(66)
    st.write(f"**Story:** {story}")

    # Stage 3: Story to Audio data
    with st.spinner('Generating audio...'):
        speech_output = text2audio(story)
    progress_bar.progress(100)

    # Play button. NOTE(review): pressing the button reruns the whole script,
    # regenerating caption/story/audio; consider caching the three stage
    # results in st.session_state to avoid the recomputation.
    if st.button("Play Audio"):
        if speech_output is not None:
            try:
                _play_speech(speech_output)
            except Exception as e:
                st.error(f"Error playing audio: {str(e)}")
        else:
            st.error("Audio generation failed. Please try again.")