Spaces:
Runtime error
Runtime error
| import gradio as gr | |
| import librosa | |
| import numpy as np | |
| import moviepy.editor as mpy | |
| import torch | |
| from PIL import Image, ImageDraw, ImageFont | |
| from transformers import pipeline | |
# Whisper checkpoint used by the demo. Smaller alternatives:
#   "openai/whisper-tiny"
#   "openai/whisper-base"
checkpoint = "openai/whisper-small"

# alignment_heads has to be set on the model's generation_config (at least
# until the models on the hub have been updated). The right values depend on
# the checkpoint; for other versions of Whisper see:
# https://gist.github.com/hollance/42e32852f24243b748ae6bc1f985b13a
#
# whisper-tiny:
#   [[2, 2], [3, 0], [3, 2], [3, 3], [3, 4], [3, 5]]
# whisper-base:
#   [[3, 1], [4, 2], [4, 3], [4, 7], [5, 1], [5, 2], [5, 4], [5, 6]]
# whisper-small:
alignment_heads = [
    [5, 3], [5, 9], [8, 0], [8, 4], [8, 7],
    [8, 8], [9, 0], [9, 7], [9, 9], [10, 5],
]

max_duration = 60  # seconds
fps = 25

# Frame size and text layout used when rendering the subtitles.
video_width, video_height = 640, 480
margin_left = margin_right = margin_top = 20
line_height = 44

# Assets are loaded from the working directory at import time.
background_image = Image.open("background.png")
font = ImageFont.truetype("Lato-Regular.ttf", 40)

# Colors passed as `fill` when drawing normal and highlighted words.
text_color = (255, 200, 200)
highlight_color = (255, 255, 255)
# Language code -> lowercase language name.
LANGUAGES = {
    "en": "english",
    "zh": "chinese",
    "de": "german",
    "es": "spanish",
    "ru": "russian",
    "ko": "korean",
    "fr": "french",
    "ja": "japanese",
    "pt": "portuguese",
    "tr": "turkish",
    "pl": "polish",
    "ca": "catalan",
    "nl": "dutch",
    "ar": "arabic",
    "sv": "swedish",
    "it": "italian",
    "id": "indonesian",
    "hi": "hindi",
    "fi": "finnish",
    "vi": "vietnamese",
    "he": "hebrew",
    "uk": "ukrainian",
    "el": "greek",
    "ms": "malay",
    "cs": "czech",
    "ro": "romanian",
    "da": "danish",
    "hu": "hungarian",
    "ta": "tamil",
    "no": "norwegian",
    "th": "thai",
    "ur": "urdu",
    "hr": "croatian",
    "bg": "bulgarian",
    "lt": "lithuanian",
    "la": "latin",
    "mi": "maori",
    "ml": "malayalam",
    "cy": "welsh",
    "sk": "slovak",
    "te": "telugu",
    "fa": "persian",
    "lv": "latvian",
    "bn": "bengali",
    "sr": "serbian",
    "az": "azerbaijani",
    "sl": "slovenian",
    "kn": "kannada",
    "et": "estonian",
    "mk": "macedonian",
    "br": "breton",
    "eu": "basque",
    "is": "icelandic",
    "hy": "armenian",
    "ne": "nepali",
    "mn": "mongolian",
    "bs": "bosnian",
    "kk": "kazakh",
    "sq": "albanian",
    "sw": "swahili",
    "gl": "galician",
    "mr": "marathi",
    "pa": "punjabi",
    "si": "sinhala",
    "km": "khmer",
    "sn": "shona",
    "yo": "yoruba",
    "so": "somali",
    "af": "afrikaans",
    "oc": "occitan",
    "ka": "georgian",
    "be": "belarusian",
    "tg": "tajik",
    "sd": "sindhi",
    "gu": "gujarati",
    "am": "amharic",
    "yi": "yiddish",
    "lo": "lao",
    "uz": "uzbek",
    "fo": "faroese",
    "ht": "haitian creole",
    "ps": "pashto",
    "tk": "turkmen",
    "nn": "nynorsk",
    "mt": "maltese",
    "sa": "sanskrit",
    "lb": "luxembourgish",
    "my": "myanmar",
    "bo": "tibetan",
    "tl": "tagalog",
    "mg": "malagasy",
    "as": "assamese",
    "tt": "tatar",
    "haw": "hawaiian",
    "ln": "lingala",
    "ha": "hausa",
    "ba": "bashkir",
    "jw": "javanese",
    "su": "sundanese",
}
# Reverse lookup (language name -> language code), extended with a few
# alternative names that map to the same codes.
TO_LANGUAGE_CODE = {name: code for code, name in LANGUAGES.items()}
TO_LANGUAGE_CODE.update(
    {
        "burmese": "my",
        "valencian": "ca",
        "flemish": "nl",
        "haitian": "ht",
        "letzeburgesch": "lb",
        "pushto": "ps",
        "panjabi": "pa",
        "moldavian": "ro",
        "moldovan": "ro",
        "sinhalese": "si",
        "castilian": "es",
    }
)
# Build the speech recognition pipeline. With a CUDA device the model is
# moved to the GPU and run in half precision; otherwise the default pipeline
# for the checkpoint is used.
use_cuda = torch.cuda.is_available() and torch.cuda.device_count() > 0

if not use_cuda:
    pipe = pipeline(model=checkpoint)
else:
    from transformers import (
        AutomaticSpeechRecognitionPipeline,
        WhisperForConditionalGeneration,
        WhisperProcessor,
    )

    model = WhisperForConditionalGeneration.from_pretrained(checkpoint)
    model = model.to("cuda").half()
    processor = WhisperProcessor.from_pretrained(checkpoint)
    pipe = AutomaticSpeechRecognitionPipeline(
        model=model,
        tokenizer=processor.tokenizer,
        feature_extractor=processor.feature_extractor,
        batch_size=8,
        torch_dtype=torch.float16,
        device="cuda:0",
    )

# Applies to both branches: the alignment heads are set on the loaded model's
# generation config.
pipe.model.generation_config.alignment_heads = alignment_heads
# Rendering state shared between predict() and make_frame().
chunks = []  # word chunks of the most recent transcription
start_chunk = 0  # index of the first chunk on the page currently shown
last_draws = last_image = None  # drawing commands / pixels of the last frame
def make_frame(t):
    """Render the subtitle frame for time `t`.

    Every word of the current page that has started by time `t` is laid out
    on a copy of `background_image`, wrapping at the right margin. When the
    text would run past seven lines, the page restarts at the word that did
    not fit. The word currently being spoken is drawn in `highlight_color`
    with a bar underneath it.

    The function reads the module-level `chunks` and updates `start_chunk`,
    `last_draws` and `last_image`. If the drawing commands are the same as
    for the previous frame, the previously rendered array is returned again
    instead of drawing a new one.

    Args:
        t: Frame time in seconds, compared against the chunk timestamps.

    Returns:
        The frame as a NumPy array created from the PIL image.
    """
    global start_chunk, last_draws, last_image

    image = background_image.copy()
    draw = ImageDraw.Draw(image)

    # for debugging: draw frame time
    # draw.text((20, 20), str(t), fill=text_color, font=font)

    space_length = draw.textlength(" ", font)
    x = margin_left
    y = margin_top

    # Create a list of drawing commands: [x, y, word, word_length, highlight]
    draws = []
    page_full = False
    for i in range(start_chunk, len(chunks)):
        chunk = chunks[i]
        chunk_start = chunk["timestamp"][0]
        chunk_end = chunk["timestamp"][1]
        if chunk_start > t:
            break
        if chunk_end is None:
            chunk_end = max_duration

        word = chunk["text"]
        word_length = draw.textlength(word + " ", font) - space_length

        if x + word_length >= video_width - margin_right:
            x = margin_left
            y += line_height

            # restart page when end is reached
            if y >= margin_top + line_height * 7:
                start_chunk = i
                page_full = True
                break

        highlight = chunk_start <= t < chunk_end
        draws.append([x, y, word, word_length, highlight])
        x += word_length + space_length

    # A word whose ending timestamp lies too far in the future (e.g.
    # "desires" in the Henry V example) would otherwise stay highlighted
    # while later words are spoken. Only the most recently started word may
    # be highlighted, so clear the flag on everything before it.
    for command in draws[:-1]:
        command[4] = False
    # When the page is full, the most recently started word is the one that
    # did not fit, so nothing on this page is the current word.
    if page_full and draws:
        draws[-1][4] = False

    # If the drawing commands didn't change, then reuse the last image,
    # otherwise draw a new image
    if draws != last_draws:
        for word_x, word_y, word, word_length, highlight in draws:
            if highlight:
                color = highlight_color
                # Bar of 4 units directly below the line of the word.
                draw.rectangle(
                    [
                        word_x,
                        word_y + line_height,
                        word_x + word_length,
                        word_y + line_height + 4,
                    ],
                    fill=color,
                )
            else:
                color = text_color

            draw.text((word_x, word_y), word, fill=color, font=font)

        last_image = np.array(image)
        last_draws = draws

    return last_image
def predict(audio_path, language=None):
    """Transcribe an audio file and render a video with highlighted subtitles.

    The audio is loaded as mono at the sampling rate of the pipeline's
    feature extractor, limited to the first `max_duration` seconds, and run
    through the Whisper pipeline with word-level timestamps. The resulting
    chunks are stored in the module-level `chunks`, which `make_frame` reads
    while the video is written.

    Args:
        audio_path: Path of the audio file to transcribe.
        language: Optional language name passed to the tokenizer's
            `get_decoder_prompt_ids`. When empty or None, the model's
            current `forced_decoder_ids` are left untouched.

    Returns:
        The path of the written video file, "my_video.mp4".

    Raises:
        ValueError: If `audio_path` is None.
    """
    global chunks, start_chunk, last_draws, last_image

    if audio_path is None:
        raise ValueError("No audio file was provided.")

    # Reset the rendering state left over from a previous request.
    start_chunk = 0
    last_draws = None
    last_image = None

    # Decode straight to the model's sampling rate and stop reading after
    # max_duration seconds, rather than loading the whole file at librosa's
    # default rate and resampling a second time.
    audio_inputs, sr = librosa.load(
        audio_path,
        sr=pipe.feature_extractor.sampling_rate,
        mono=True,
        duration=max_duration,
    )
    duration = librosa.get_duration(y=audio_inputs, sr=sr)
    duration = min(max_duration, duration)

    if language:
        # NOTE(review): this setting persists on the shared model, so a later
        # call without a language keeps using the previous one - confirm
        # whether that is intended.
        pipe.model.config.forced_decoder_ids = (
            pipe.tokenizer.get_decoder_prompt_ids(
                language=language,
                task="transcribe",
            )
        )

    # Run Whisper to get word-level timestamps.
    output = pipe(
        audio_inputs,
        chunk_length_s=30,
        stride_length_s=[4, 2],
        return_timestamps="word",
    )
    chunks = output["chunks"]

    # Create the video. The audio clip is closed even when writing fails.
    audio_clip = mpy.AudioFileClip(audio_path)
    try:
        clip = mpy.VideoClip(make_frame, duration=duration)
        clip = clip.set_audio(audio_clip.set_duration(duration))
        clip.write_videofile("my_video.mp4", fps=fps, codec="libx264", audio_codec="aac")
    finally:
        audio_clip.close()

    return "my_video.mp4"
title = "Word-level timestamps with Whisper"

# The checkpoint name and the duration limit are taken from `checkpoint` and
# `max_duration`, so the text stays correct when either setting is changed.
description = f"""
This demo shows Whisper <b>word-level timestamps</b> in action using Hugging Face Transformers. It creates a video showing subtitled audio with the current word highlighted. It can even do music lyrics!
This demo uses the <b>{checkpoint}</b> checkpoint.
Since it's only a demo, the output is limited to the first {max_duration} seconds of audio.
To use this on longer audio, <a href="https://huggingface.co/spaces/Matthijs/whisper_word_timestamps/settings?duplicate=true">duplicate the space</a>
and in <b>app.py</b> change the value of `max_duration`.
"""
# Credits shown below the interface, as an HTML fragment.
article = """
<div style='margin:20px auto;'>
<p>Credits:</p>
<ul>
<li>Shakespeare's "Henry V" speech from <a href="https://freesound.org/people/acclivity/sounds/24096/">acclivity</a> (CC BY-NC 4.0 license)</li>
<li>"Here's to the Crazy Ones" speech by Steve Jobs</li>
<li>"Stupid People" comedy routine by Bill Engvall</li>
<li>"BeOS, It's The OS" song by The Cotton Squares</li>
<li>Lato font by Łukasz Dziedzic (licensed under Open Font License)</li>
<li>Whisper model by OpenAI</li>
</ul>
</div>
"""
# Example inputs offered in the UI: [audio file path, language name].
examples = [
    ["examples/steve_jobs_crazy_ones.mp3", "english"],
    ["examples/henry5.wav", "english"],
    ["examples/stupid_people.mp3", "english"],
    ["examples/beos_song.mp3", "english"],
    ["examples/johan_cruijff.mp3", "dutch"],
]
# Build the web interface: an audio upload and a language dropdown go in, the
# rendered video comes out. The interface is launched as soon as the module
# is executed.
demo = gr.Interface(
    fn=predict,
    inputs=[
        gr.Audio(label="Upload Audio", source="upload", type="filepath"),
        # Iterating a dict yields its keys, so sorted() needs no list()/keys().
        gr.Dropdown(label="Language", choices=sorted(TO_LANGUAGE_CODE)),
    ],
    outputs=[
        gr.Video(label="Output Video"),
    ],
    title=title,
    description=description,
    article=article,
    examples=examples,
)
demo.launch()