Spaces:
Sleeping
Sleeping
File size: 3,138 Bytes
82363f6 7730cd8 2ba3da6 82363f6 d4c3cef 82363f6 7730cd8 82363f6 753fdd3 db4cdf7 753fdd3 68c3b1f 753fdd3 7730cd8 753fdd3 7730cd8 753fdd3 5f79f2b 753fdd3 5f79f2b 2ba3da6 5f79f2b 2ba3da6 7730cd8 2ba3da6 82363f6 68c3b1f 82363f6 7730cd8 82363f6 7730cd8 82363f6 7730cd8 37a58fb 82363f6 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 |
#Import part
from transformers import pipeline
import streamlit as st
import torch
# Helper functions for the image -> caption -> story -> audio pipeline
# img2text
def img2text(img):
    """Generate a short text caption for an image using BLIP.

    Args:
        img: Image input accepted by the HF image-to-text pipeline
            (here a file path; PIL images/URLs also work).

    Returns:
        str: The generated caption.
    """
    captioner = pipeline(
        "image-to-text",
        model="Salesforce/blip-image-captioning-base",
    )
    results = captioner(img)
    return results[0]["generated_text"]
# text2story
def text2story(text):
    """Expand a short caption into a longer story with distilgpt2.

    Args:
        text (str): Seed text (typically an image caption).

    Returns:
        str: The generated story (includes the seed prompt, since
        text-generation pipelines return prompt + continuation).
    """
    story_pipe = pipeline("text-generation", model="distilbert/distilgpt2")
    generations = story_pipe(
        text,
        min_length=150,          # force a reasonably long continuation
        max_length=300,          # cap on total tokens (prompt included)
        num_return_sequences=1,
        no_repeat_ngram_size=2,  # discourage verbatim repetition
        early_stopping=False,    # do not stop generation early
    )
    return generations[0]["generated_text"]
# text2audio
def text2audio(story_text):
    """Synthesize speech for the story text with Facebook's MMS English TTS.

    Args:
        story_text (str): Text to convert to speech.

    Returns:
        dict: The pipeline output, used downstream via its "audio" and
        "sampling_rate" keys.
    """
    tts = pipeline("text-to-speech", model="facebook/mms-tts-eng")
    # The pipeline already returns the audio dict directly — no extra
    # buffering or resampling needed here.
    return tts(story_text)
# program main part
st.set_page_config(page_title="Your Image to Audio Story",
                   page_icon="🦜")
st.header("Turn Your Image to Audio Story")

uploaded_file = st.file_uploader("Select an Image...")

if uploaded_file is not None:
    # Persist the upload to disk so img2text can read it by file name.
    bytes_data = uploaded_file.getvalue()
    with open(uploaded_file.name, "wb") as file:
        file.write(bytes_data)

    # NOTE(review): use_column_width is deprecated in newer Streamlit in
    # favor of use_container_width — kept as-is for older-version compat.
    st.image(uploaded_file, caption="Uploaded Image",
             use_column_width=True)

    # Streamlit reruns this whole script on every widget interaction,
    # including the "Play Audio" button below. Cache the expensive model
    # outputs in session_state, keyed by file name, so clicking Play does
    # not regenerate the caption, story, and audio from scratch.
    if st.session_state.get("processed_file") != uploaded_file.name:
        # Stage 1: Image to Text
        st.text('Processing img2text...')
        st.session_state.scenario = img2text(uploaded_file.name)

        # Stage 2: Text to Story
        st.text('Generating a story...')
        st.session_state.story = text2story(st.session_state.scenario)

        # Stage 3: Story to Audio data
        st.text('Generating audio data...')
        st.session_state.audio_data = text2audio(st.session_state.story)

        st.session_state.processed_file = uploaded_file.name

    st.write(st.session_state.scenario)
    st.write(st.session_state.story)

    # Play button — the cached audio dict provides "audio" and
    # "sampling_rate" (see text2audio).
    if st.button("Play Audio"):
        audio_data = st.session_state.audio_data
        st.audio(audio_data['audio'],
                 format="audio/wav",
                 start_time=0,
                 sample_rate=audio_data['sampling_rate'])
|