import gradio as gr
from transformers import pipeline
from helper import load_image_from_url, render_results_in_image
from helper import summarize_predictions_natural_language
# Object-detection pipeline: DETR with a ResNet-50 backbone.
# Built at import time so the model is downloaded/cached once, not per request.
od_pipe = pipeline("object-detection", model="facebook/detr-resnet-50")
def get_pipeline_prediction(pil_image):
    """Run object detection on an image and summarize the detections.

    Parameters
    ----------
    pil_image : PIL.Image.Image
        The input image supplied by the Gradio Image component.

    Returns
    -------
    tuple
        ``(processed_image, text)`` where ``processed_image`` is the input
        image annotated with the predicted instances and ``text`` is a
        natural-language summary of the predictions.
    """
    # Raw detections: list of dicts with labels, scores and bounding boxes.
    pipeline_output = od_pipe(pil_image)
    # Turn the raw detection dicts into a readable sentence.
    text = summarize_predictions_natural_language(pipeline_output)
    # Draw the predicted boxes/labels onto the image for display.
    processed_image = render_results_in_image(pil_image,
                                              pipeline_output)
    return processed_image, text
# Gradio UI: one image in; annotated image + text summary out.
demo = gr.Interface(
    fn=get_pipeline_prediction,
    inputs=gr.Image(label="Input image", type="pil"),
    outputs=[
        gr.Image(label="Output image with predicted instances", type="pil"),
        gr.Textbox(label="Prediction Summary"),
    ],
)

# Start the web server (blocks until the app is stopped).
demo.launch()
# NOTE: remnants of an earlier text-to-speech experiment, kept for reference.
# text = itt_pipe(input)
# tts_pipe = pipeline("text-to-speech",
#                     model="kakao-enterprise/vits-ljs")
# narrated_text = tts_pipe(tts_pipe[0]['generated_text'])
# def launch(text):
#     out = tts_pipe(text)
#     audio = IPythonAudio(out["audio"][0],
#                          rate=out["sampling_rate"])
#     return audio
# iface = gr.Interface(launch,
#                      inputs=gr.Image(type='pil'),
#                      outputs="text")
# iface.launch()