File size: 1,383 Bytes
a82cf9e
 
 
 
 
c5e27c9
 
 
8a21543
afb7934
 
 
 
 
 
 
 
 
 
 
a82cf9e
 
c5e27c9
 
 
 
 
 
a82cf9e
afb7934
 
 
 
a82cf9e
afb7934
 
 
 
 
 
 
 
 
 
 
 
a82cf9e
 
 
 
 
 
c5e27c9
afb7934
c5e27c9
afb7934
c5e27c9
 
 
afb7934
c5e27c9
 
 
 
a82cf9e
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
import gradio as gr
import numpy as np

from transformers import pipeline

# Hugging Face pipelines, built once at import time and shared by every
# handler defined below.

# Image -> caption text.
# Alternative checkpoint: "Salesforce/blip-image-captioning-large"
caption = pipeline(
  task="image-to-text",
  model="nlpconnect/vit-gpt2-image-captioning",
)

# Prompt text -> continued text.
generate = pipeline(
  task="text-generation",
  model="openai-community/gpt2",
)

# Text -> speech waveform.
tts = pipeline(
  task="text-to-speech",
  model="facebook/mms-tts-eng",
)

def run_caption(img):
  """Describe ``img`` in words using the module-level ``caption`` pipeline.

  Args:
    img: Image handed straight to the pipeline (the UI passes a PIL image).

  Returns:
    The ``generated_text`` field of the first pipeline result.
  """
  predictions = caption(img, max_new_tokens=128)
  first = predictions[0]
  return first["generated_text"]

def run_generate(txt):
  """Continue ``txt`` using the module-level ``generate`` pipeline.

  Args:
    txt: Prompt text to extend.

  Returns:
    The ``generated_text`` field of the first pipeline result.
  """
  # Bound the continuation with max_new_tokens rather than max_length:
  # max_length also counts the prompt's own tokens, so a prompt of 50 or
  # more tokens would leave no budget for any newly generated text.
  res = generate(txt, max_new_tokens=50)
  return res[0]["generated_text"]

def run_tts(txt):
  """Synthesize speech for ``txt`` and return it as 16-bit PCM samples.

  Args:
    txt: Text handed to the module-level ``tts`` pipeline.

  Returns:
    A ``(sampling_rate, samples)`` tuple, where ``samples`` is a flat
    ``np.int16`` array and ``sampling_rate`` is taken from the pipeline
    result.
  """
  res = tts(txt)
  # Flatten to one dimension and scale up to the int16 range.
  # NOTE(review): assumes res["audio"] is a float array in [-1.0, 1.0] -- confirm.
  scaled = res["audio"].reshape(-1) * 2 ** 15
  # Clip before casting: a sample at +1.0 scales to 32768, which does not
  # fit in int16 and would wrap around to a large negative value.
  limits = np.iinfo(np.int16)
  audio = np.clip(scaled, limits.min, limits.max).astype(np.int16)
  return res["sampling_rate"], audio


def run_caption_tts(img):
  """Caption ``img`` and speak the caption.

  Returns whatever ``run_tts`` returns for the caption text.
  """
  description = run_caption(img)
  return run_tts(description)

def run_caption_generate_tts(img):
  """Caption ``img``, extend the caption with generated text, and speak it.

  Returns whatever ``run_tts`` returns for the extended text.
  """
  description = run_caption(img)
  extended = run_generate(description)
  return run_tts(extended)


# One Interface per individual stage, followed by the two chained
# image-to-speech variants, all declared inside a single Blocks app.
with gr.Blocks() as demo:
  # Image -> caption text
  gr.Interface(fn=run_caption, inputs=gr.Image(type="pil"), outputs="text")

  # Prompt text -> continued text
  gr.Interface(fn=run_generate, inputs="text", outputs="text")

  # Text -> speech
  gr.Interface(fn=run_tts, inputs=gr.Textbox(), outputs="audio")

  # Image -> spoken caption
  gr.Interface(fn=run_caption_tts, inputs=gr.Image(type="pil"), outputs="audio")

  # Image -> caption -> continued text -> speech
  gr.Interface(
    fn=run_caption_generate_tts,
    inputs=gr.Image(type="pil"),
    outputs="audio",
  )

# Start the web app only when this file is executed as a script.
if __name__ == "__main__":
  demo.launch()