Spaces:

IDMNYU
/

9103D-2025S-api-example

Sleeping

thiagohersan committed on Jan 25, 2025

Commit

c5e27c9

verified ·

1 Parent(s): bf6e397

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -5,10 +5,20 @@ from transformers import pipeline
 tts = pipeline(task="text-to-speech", model="facebook/mms-tts-eng")
 def run_tts(txt):
   res = tts(txt)
-  audio = (res['audio'].reshape(-1) * 2 ** 15).astype(np.int16)
-  return res['sampling_rate'], audio
 with gr.Blocks() as demo:
   gr.Interface(
@@ -17,5 +27,17 @@ with gr.Blocks() as demo:
     outputs="audio",
   )
 if __name__ == "__main__":
    demo.launch()

 tts = pipeline(task="text-to-speech", model="facebook/mms-tts-eng")
+# caption = pipeline("image-to-text", model="Salesforce/blip-image-captioning-large")
+caption = pipeline("image-to-text", model="nlpconnect/vit-gpt2-image-captioning")
 def run_tts(txt):
   res = tts(txt)
+  audio = (res["audio"].reshape(-1) * 2 ** 15).astype(np.int16)
+  return res["sampling_rate"], audio
+def run_caption(img):
+  res = caption(img, max_new_tokens=128)
+  return res[0]["generated_text"]
+def run_caption_tts(img):
+  return run_tts(run_caption(img))
 with gr.Blocks() as demo:
   gr.Interface(
     outputs="audio",
   )
+  gr.Interface(
+    run_caption,
+    inputs=gr.Image(type="pil"),
+    outputs="text",
+  )
+  gr.Interface(
+    run_caption_tts,
+    inputs=gr.Image(type="pil"),
+    outputs="audio",
+  )
 if __name__ == "__main__":
    demo.launch()