Spaces:
Runtime error
Runtime error
import os
import subprocess
import tempfile

import streamlit as st
import torch
import whisperx
# Name of the environment variable that supplies the Hugging Face access token
# used for speaker diarization. The token must never be committed to source.
# NOTE(review): a literal token was previously hard-coded in this file; it
# should be treated as leaked and revoked.
HF_TOKEN_ENV_VAR = "HF_TOKEN"


def _stage_upload_as_mono_wav(upload, wav_path):
    """Save an uploaded file to disk and convert it to 16 kHz mono PCM WAV.

    Args:
        upload: The object returned by ``st.file_uploader``; this function
            uses its ``getvalue()`` bytes and the extension of its ``name``.
        wav_path: Destination path of the converted WAV file. The raw upload
            is written next to it, in the same directory.

    Raises:
        FileNotFoundError: If the ``ffmpeg`` executable cannot be found.
        subprocess.CalledProcessError: If ffmpeg exits with a non-zero status;
            ``stderr`` on the exception holds ffmpeg's diagnostics.
    """
    # Keep only the extension of the client-supplied name; the rest of the
    # file name is never used to build a path.
    extension = os.path.splitext(upload.name)[1]
    source_path = os.path.join(os.path.dirname(wav_path), "upload" + extension)
    with open(source_path, "wb") as source:
        source.write(upload.getvalue())

    # An argv list (no shell) keeps paths with spaces or shell
    # metacharacters from being interpreted by a shell.
    command = [
        "ffmpeg", "-y",
        "-i", source_path,
        "-acodec", "pcm_s16le",
        "-ar", "16000",
        "-ac", "1",
        wav_path,
    ]
    subprocess.run(command, check=True, capture_output=True)


def transcribe(audio_file):
    """Transcribe, align and diarize an audio file, rendering each stage.

    The intermediate and final segments are written to the Streamlit page
    with ``st.write``. Speaker diarization needs a Hugging Face access token,
    read from the ``HF_TOKEN`` environment variable; when it is not set the
    diarization stage is skipped with a warning.

    Args:
        audio_file: Path to the audio file to process. It is the only input
            this function reads.
    """
    device = "cuda" if torch.cuda.is_available() else "cpu"
    batch_size = 16  # reduce if low on GPU mem
    compute_type = "int8"  # change to "float16" if high on GPU mem (may reduce accuracy)

    # 1. Transcribe with original whisper (batched)
    model = whisperx.load_model("tiny", device=device, compute_type=compute_type)
    audio = whisperx.load_audio(audio_file)
    result = model.transcribe(audio, batch_size=batch_size)
    st.write("Transcribed! Here's what we have so far:")
    st.write(result["segments"])  # before alignment

    # 2. Align whisper output
    model_a, metadata = whisperx.load_align_model(
        language_code=result["language"], device=device
    )
    result = whisperx.align(
        result["segments"],
        model_a,
        metadata,
        audio,
        device,
        return_char_alignments=False,
    )
    st.write("Aligned! Here's what we have so far:")
    st.write(result["segments"])  # after alignment

    # 3. Assign speaker labels
    hf_token = os.environ.get(HF_TOKEN_ENV_VAR)
    if not hf_token:
        st.warning(
            f"Environment variable {HF_TOKEN_ENV_VAR} is not set; "
            "skipping speaker diarization."
        )
        return

    diarize_model = whisperx.DiarizationPipeline(use_auth_token=hf_token, device=device)
    # add min/max number of speakers if known, e.g.
    # diarize_model(audio_file, min_speakers=min_speakers, max_speakers=max_speakers)
    diarize_segments = diarize_model(audio_file)
    result = whisperx.assign_word_speakers(diarize_segments, result)
    st.write(diarize_segments)
    st.write(result["segments"])  # segments are now assigned speaker IDs


st.title("Automated Transcription")
form = st.form(key='my_form')
uploaded_file = form.file_uploader("Choose a file")
submit = form.form_submit_button("Transcribe!")

if submit and uploaded_file is None:
    # The button can be pressed before any file has been selected.
    st.warning("Please choose an audio file before pressing Transcribe!")
elif submit:
    # The temporary directory (raw upload + converted WAV) is removed as soon
    # as processing finishes, whether it succeeded or not.
    with tempfile.TemporaryDirectory() as tmp_dir:
        temp_file = os.path.join(tmp_dir, "mono.wav")
        try:
            _stage_upload_as_mono_wav(uploaded_file, temp_file)
        except FileNotFoundError:
            st.error("ffmpeg is not installed or is not on PATH.")
        except subprocess.CalledProcessError as err:
            details = (err.stderr or b"").decode("utf-8", errors="replace")
            st.error("ffmpeg could not convert the uploaded file:\n" + details)
        else:
            transcribe(temp_file)