Spaces:
Sleeping
Sleeping
File size: 1,383 Bytes
a82cf9e c5e27c9 8a21543 afb7934 a82cf9e c5e27c9 a82cf9e afb7934 a82cf9e afb7934 a82cf9e c5e27c9 afb7934 c5e27c9 afb7934 c5e27c9 afb7934 c5e27c9 a82cf9e |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 |
import gradio as gr
import numpy as np
from transformers import pipeline
# Models are loaded once at import time and shared by every handler below.
# Alternative captioning checkpoint that can be swapped in:
#   "Salesforce/blip-image-captioning-large"
caption = pipeline(
    task="image-to-text",
    model="nlpconnect/vit-gpt2-image-captioning",
)
generate = pipeline(
    task="text-generation",
    model="openai-community/gpt2",
)
tts = pipeline(
    task="text-to-speech",
    model="facebook/mms-tts-eng",
)
def run_caption(img):
    """Caption an image with the module-level ``caption`` pipeline.

    Args:
        img: Image forwarded unchanged to the pipeline (the interfaces in
            this file supply it via ``gr.Image(type="pil")``).

    Returns:
        The ``"generated_text"`` field of the first pipeline result.
    """
    # Generation is capped at 128 new tokens.
    return caption(img, max_new_tokens=128)[0]["generated_text"]
def run_generate(txt):
    """Continue ``txt`` with the module-level ``generate`` pipeline.

    Args:
        txt: Prompt text to continue.

    Returns:
        The ``"generated_text"`` field of the first pipeline result.
    """
    # Bound only the continuation. Per the Transformers generation docs,
    # ``max_length`` counts prompt tokens as well as generated ones, so a
    # prompt of 50 or more tokens left no budget for any new text.
    res = generate(txt, max_new_tokens=50)
    return res[0]["generated_text"]
def run_tts(txt):
    """Synthesize speech for ``txt`` and return it as 16-bit PCM samples.

    Args:
        txt: Text passed to the module-level ``tts`` pipeline.

    Returns:
        A ``(sampling_rate, samples)`` tuple, where ``samples`` is a flat
        ``np.int16`` array built from the pipeline's ``"audio"`` output.
    """
    res = tts(txt)
    # Flatten the waveform and scale it towards the int16 range.
    # NOTE(review): assumes float samples nominally in [-1.0, 1.0] - confirm.
    scaled = res["audio"].reshape(-1) * 2 ** 15
    # A sample of exactly +1.0 scales to 32768, one past the int16 maximum.
    # Clip before casting so peaks saturate instead of wrapping to -32768.
    pcm_range = np.iinfo(np.int16)
    audio = np.clip(scaled, pcm_range.min, pcm_range.max).astype(np.int16)
    return res["sampling_rate"], audio
def run_caption_tts(img):
    """Caption ``img`` and speak the caption.

    Args:
        img: Image handed to ``run_caption``.

    Returns:
        Whatever ``run_tts`` returns for the caption text.
    """
    caption_text = run_caption(img)
    return run_tts(caption_text)
def run_caption_generate_tts(img):
    """Caption ``img``, extend the caption with generated text, and speak it.

    Args:
        img: Image handed to ``run_caption``.

    Returns:
        Whatever ``run_tts`` returns for the generated text.
    """
    caption_text = run_caption(img)
    story_text = run_generate(caption_text)
    return run_tts(story_text)
# Assemble the UI: one Interface per pipeline stage or chain, created in order
# inside a single Blocks container.
with gr.Blocks() as demo:
    # Image -> caption text.
    gr.Interface(fn=run_caption, inputs=gr.Image(type="pil"), outputs="text")

    # Prompt text -> generated text.
    gr.Interface(fn=run_generate, inputs="text", outputs="text")

    # Text -> speech.
    gr.Interface(fn=run_tts, inputs=gr.Textbox(), outputs="audio")

    # Image -> spoken caption.
    gr.Interface(fn=run_caption_tts, inputs=gr.Image(type="pil"), outputs="audio")

    # Image -> caption -> generated text -> speech.
    gr.Interface(
        fn=run_caption_generate_tts,
        inputs=gr.Image(type="pil"),
        outputs="audio",
    )

# Start the app only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()
|