"""Gradio demo: detect objects in an uploaded image and summarize them as text."""

import gradio as gr
from transformers import pipeline

from helper import load_image_from_url, render_results_in_image
from helper import summarize_predictions_natural_language

# Built once at import time so every request reuses the same loaded model.
od_pipe = pipeline("object-detection", model="facebook/detr-resnet-50")

# NOTE: spoken narration of the summary is currently disabled. Re-enabling it
# would mean creating a "text-to-speech" pipeline (the earlier experiment used
# model="kakao-enterprise/vits-ljs"), returning (sampling_rate, audio) as a
# third value from get_pipeline_prediction, and adding a
# gr.Audio(label="Generated Speech") output to the interface below.


def get_pipeline_prediction(pil_image):
    """Run object detection on an image and build the two UI outputs.

    Args:
        pil_image: The image supplied by the Gradio input component, which is
            configured with ``type="pil"``.

    Returns:
        A ``(processed_image, text)`` tuple: the result of
        ``render_results_in_image`` for the input image and the detections,
        and the result of ``summarize_predictions_natural_language`` for the
        same detections.
    """
    pipeline_output = od_pipe(pil_image)
    text = summarize_predictions_natural_language(pipeline_output)
    processed_image = render_results_in_image(pil_image, pipeline_output)
    return processed_image, text


demo = gr.Interface(
    fn=get_pipeline_prediction,
    inputs=gr.Image(label="Input image", type="pil"),
    outputs=[
        gr.Image(label="Output image with predicted instances", type="pil"),
        gr.Textbox(label="Prediction Summary"),
    ],
)

# Launched at module level (no __main__ guard) to keep the script's original
# behaviour of starting the app as soon as the file is executed.
demo.launch()