Spaces:

Matthijs
/

whisper_word_timestamps

Runtime error

App Files Files Community

Matthijs Hollemans committed on Jun 20, 2023

Commit

44f5cb7

1 Parent(s): 1aa521f

16-bit floats, draw one word at a time, optimize making video

Browse files

Files changed (1) hide show

app.py +76 -22

app.py CHANGED Viewed

@@ -2,13 +2,14 @@ import gradio as gr
 import librosa
 import numpy as np
 import moviepy.editor as mpy
 from PIL import Image, ImageDraw, ImageFont
 from transformers import pipeline
-fps = 25
 max_duration = 60  # seconds
 video_width = 640
 video_height = 480
 margin_left = 20
@@ -21,28 +22,46 @@ font = ImageFont.truetype("Lato-Regular.ttf", 40)
 text_color = (255, 200, 200)
 highlight_color = (255, 255, 255)
-checkpoint = "openai/whisper-tiny"
 # checkpoint = "openai/whisper-base"
-# checkpoint = "openai/whisper-small"
-pipe = pipeline(model=checkpoint)
 # TODO: no longer need to set these manually once the models have been updated on the Hub
 # whisper-tiny
-pipe.model.generation_config.alignment_heads = [[2, 2], [3, 0], [3, 2], [3, 3], [3, 4], [3, 5]]
 # whisper-base
 # pipe.model.generation_config.alignment_heads = [[3, 1], [4, 2], [4, 3], [4, 7], [5, 1], [5, 2], [5, 4], [5, 6]]
 # whisper-small
-# pipe.model.generation_config.alignment_heads = [[5, 3], [5, 9], [8, 0], [8, 4], [8, 7], [8, 8], [9, 0], [9, 7], [9, 9], [10, 5]]
 chunks = []
-def make_frame(t):
-    global chunks
-    # TODO speed optimization: could cache the last image returned and if the
-    # active chunk and active word didn't change, use that last image instead
-    # of drawing the exact same thing again
     # TODO in the Henry V example, the word "desires" has an ending timestamp
     # that's too far into the future, and so the word stays highlighted.
@@ -55,29 +74,60 @@ def make_frame(t):
     # for debugging: draw frame time
     #draw.text((20, 20), str(t), fill=text_color, font=font)
     x = margin_left
     y = margin_top
-    for chunk in chunks:
         chunk_start = chunk["timestamp"][0]
         chunk_end = chunk["timestamp"][1]
         if chunk_end is None: chunk_end = max_duration
-        if chunk_start <= t <= chunk_end:
-            word = chunk["text"]
-            word_length = draw.textlength(word, font)
-            x = (video_width - word_length) / 2
-            y = video_height / 2 - 20
-            draw.text((x, y), word, fill=highlight_color, font=font)
-            break
-    return np.array(image)
 def predict(audio_path):
-    global chunks
     audio_data, sr = librosa.load(audio_path, mono=True)
     duration = librosa.get_duration(y=audio_data, sr=sr)
@@ -103,7 +153,11 @@ title = "Word-level timestamps with Whisper"
 description = """
 This demo shows Whisper <b>word-level timestamps</b> in action using Hugging Face Transformers. It creates a video showing subtitled audio with the current word highlighted. It can even do music lyrics!
-This demo uses the <b>openai/whisper-small</b> checkpoint. Since it's only a demo, the output is limited to the first 60 seconds of audio.
 """
 article = """

 import librosa
 import numpy as np
 import moviepy.editor as mpy
+import torch
 from PIL import Image, ImageDraw, ImageFont
 from transformers import pipeline
 max_duration = 60  # seconds
+fps = 25
 video_width = 640
 video_height = 480
 margin_left = 20
 text_color = (255, 200, 200)
 highlight_color = (255, 255, 255)
+# checkpoint = "openai/whisper-tiny"
 # checkpoint = "openai/whisper-base"
+checkpoint = "openai/whisper-small"
+if torch.cuda.is_available() and torch.cuda.device_count() > 0:
+    from transformers import (
+        AutomaticSpeechRecognitionPipeline,
+        WhisperForConditionalGeneration,
+        WhisperProcessor,
+    )
+    model = WhisperForConditionalGeneration.from_pretrained(checkpoint).to("cuda").half()
+    processor = WhisperProcessor.from_pretrained(checkpoint)
+    pipe = AutomaticSpeechRecognitionPipeline(
+        model=model,
+        tokenizer=processor.tokenizer,
+        feature_extractor=processor.feature_extractor,
+        batch_size=8,
+        torch_dtype=torch.float16,
+        device="cuda:0"
+    )
+else:
+    pipe = pipeline(model=checkpoint)
 # TODO: no longer need to set these manually once the models have been updated on the Hub
 # whisper-tiny
+# pipe.model.generation_config.alignment_heads = [[2, 2], [3, 0], [3, 2], [3, 3], [3, 4], [3, 5]]
 # whisper-base
 # pipe.model.generation_config.alignment_heads = [[3, 1], [4, 2], [4, 3], [4, 7], [5, 1], [5, 2], [5, 4], [5, 6]]
 # whisper-small
+pipe.model.generation_config.alignment_heads = [[5, 3], [5, 9], [8, 0], [8, 4], [8, 7], [8, 8], [9, 0], [9, 7], [9, 9], [10, 5]]
 chunks = []
+start_chunk = 0
+last_draws = []
+last_image = None
+def make_frame(t):
+    global chunks, start_chunk, last_draws, last_image
     # TODO in the Henry V example, the word "desires" has an ending timestamp
     # that's too far into the future, and so the word stays highlighted.
     # for debugging: draw frame time
     #draw.text((20, 20), str(t), fill=text_color, font=font)
+    space_length = draw.textlength(" ", font)
     x = margin_left
     y = margin_top
+    # Create a list of drawing commands
+    draws = []
+    for i in range(start_chunk, len(chunks)):
+        chunk = chunks[i]
         chunk_start = chunk["timestamp"][0]
         chunk_end = chunk["timestamp"][1]
+        if chunk_start > t: break
         if chunk_end is None: chunk_end = max_duration
+        word = chunk["text"]
+        word_length = draw.textlength(word + " ", font) - space_length
+        if x + word_length >= video_width - margin_right:
+            x = margin_left
+            y += line_height
+            # restart page when end is reached
+            if y >= margin_top + line_height * 7:
+                start_chunk = i
+                break
+        highlight = (chunk_start <= t < chunk_end)
+        draws.append([x, y, word, word_length, highlight])
+        x += word_length + space_length
+    # If the drawing commands didn't change, then reuse the last image,
+    # otherwise draw a new image
+    if draws != last_draws:
+        for x, y, word, word_length, highlight in draws:
+            if highlight:
+                color = highlight_color
+                draw.rectangle([x, y + line_height, x + word_length, y + line_height + 4], fill=color)
+            else:
+                color = text_color
+            draw.text((x, y), word, fill=color, font=font)
+        last_image = np.array(image)
+        last_draws = draws
+    return last_image
 def predict(audio_path):
+    global chunks, start_chunk, last_draws, last_image
+    start_chunk = 0
+    last_draws = []
+    last_image = None
     audio_data, sr = librosa.load(audio_path, mono=True)
     duration = librosa.get_duration(y=audio_data, sr=sr)
 description = """
 This demo shows Whisper <b>word-level timestamps</b> in action using Hugging Face Transformers. It creates a video showing subtitled audio with the current word highlighted. It can even do music lyrics!
+This demo uses the <b>openai/whisper-small</b> checkpoint.
+Since it's only a demo, the output is limited to the first 60 seconds of audio.
+To use this on longer audio, <a href="https://huggingface.co/spaces/Matthijs/whisper_word_timestamps/settings?duplicate=true">duplicate the space</a>
+and in <b>app.py</b> change the value of `max_duration`.
 """
 article = """