thiagohersan committed on
Commit
c5e27c9
·
verified ·
1 Parent(s): bf6e397

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +24 -2
app.py CHANGED
@@ -5,10 +5,20 @@ from transformers import pipeline
5
 
6
  tts = pipeline(task="text-to-speech", model="facebook/mms-tts-eng")
7
 
 
 
 
8
  def run_tts(txt):
9
  res = tts(txt)
10
- audio = (res['audio'].reshape(-1) * 2 ** 15).astype(np.int16)
11
- return res['sampling_rate'], audio
 
 
 
 
 
 
 
12
 
13
  with gr.Blocks() as demo:
14
  gr.Interface(
@@ -17,5 +27,17 @@ with gr.Blocks() as demo:
17
  outputs="audio",
18
  )
19
 
 
 
 
 
 
 
 
 
 
 
 
 
20
  if __name__ == "__main__":
21
  demo.launch()
 
5
 
6
# Model pipelines are constructed once at module import so every Gradio
# request reuses the already-loaded weights instead of reloading them.
tts = pipeline(task="text-to-speech", model="facebook/mms-tts-eng")

# Alternative captioner kept for reference (larger, slower):
# caption = pipeline("image-to-text", model="Salesforce/blip-image-captioning-large")
caption = pipeline("image-to-text", model="nlpconnect/vit-gpt2-image-captioning")
11
def run_tts(txt):
    """Synthesize speech for *txt* with the module-level TTS pipeline.

    Returns a ``(sampling_rate, audio)`` tuple in the shape Gradio's
    ``audio`` output component expects, where ``audio`` is a 1-D int16 array.

    The pipeline yields float samples (presumably in [-1.0, 1.0] — the usual
    TTS contract). Scaling by ``2 ** 15`` and casting directly would wrap a
    sample of exactly 1.0 to -32768 (int16 overflow), producing an audible
    click; clipping to the int16 range first prevents that.
    """
    res = tts(txt)
    samples = res["audio"].reshape(-1) * 2 ** 15
    audio = np.clip(samples, -(2 ** 15), 2 ** 15 - 1).astype(np.int16)
    return res["sampling_rate"], audio
15
+
16
def run_caption(img):
    """Return a text caption for the image *img* (a PIL image, per the UI).

    ``max_new_tokens`` bounds the generated caption length.
    """
    outputs = caption(img, max_new_tokens=128)
    best = outputs[0]
    return best["generated_text"]
19
+
20
def run_caption_tts(img):
    """Caption *img*, then speak the caption.

    Composes ``run_caption`` and ``run_tts``; returns the
    ``(sampling_rate, audio)`` tuple produced by ``run_tts``.
    """
    text = run_caption(img)
    return run_tts(text)
22
 
23
# NOTE(review): this span is a rendered diff. The context omits the first
# gr.Interface's fn/inputs arguments (file lines 25-26 are not shown), so
# the block is kept verbatim rather than reconstructed. The two added
# Interfaces wire up image captioning ("text" output) and caption+TTS
# ("audio" output) demos inside the same gr.Blocks container.
  with gr.Blocks() as demo:
24
  gr.Interface(

27
  outputs="audio",
28
  )
29

30
+ gr.Interface(
31
+ run_caption,
32
+ inputs=gr.Image(type="pil"),
33
+ outputs="text",
34
+ )
35
+
36
+ gr.Interface(
37
+ run_caption_tts,
38
+ inputs=gr.Image(type="pil"),
39
+ outputs="audio",
40
+ )
41
+
42
  if __name__ == "__main__":
43
  demo.launch()