# Page residue from the hosting site (not part of the program):
#   Spaces: Runtime error -- File size: 5,261 Bytes
import whisper
import pandas as pd
import whisper
import subprocess
from simple_diarizer.diarizer import Diarizer
import streamlit as st
import base64
def create_download_link(val, filename, label):
    """Build an HTML anchor that downloads ``val`` as a file via a data URI.

    Args:
        val: Bytes-like payload; it is base64-encoded into the link's ``href``.
        filename: Name placed in the anchor's ``download`` attribute.
        label: Link text. Inserted verbatim, so callers may pass markup.

    Returns:
        The complete ``<a>`` element as a string.
    """
    import html  # local import: only needed for the attribute escaping below

    payload = base64.b64encode(val).decode()
    # Escape the filename so a quote or ampersand in it cannot terminate the
    # attribute early and corrupt the generated markup.
    safe_name = html.escape(str(filename), quote=True)
    return f'<a href="data:application/octet-stream;base64,{payload}" download="{safe_name}">{label}</a>'
def segment(nu_speakers):
    """Diarize ``mono.wav`` and number its speakers by order of appearance.

    Args:
        nu_speakers: Speaker count handed to the diarizer as ``num_speakers``.

    Returns:
        A DataFrame built from the diarizer's segments, with an added
        ``speaker`` column in which the first label encountered becomes 1,
        the next new label 2, and so on. Relies on every segment carrying a
        ``label`` field.
    """
    diarizer = Diarizer(embed_model='ecapa', cluster_method='sc')
    sdf = pd.DataFrame(diarizer.diarize('mono.wav', num_speakers=nu_speakers))

    # Renumber so that whoever is heard first is always speaker 1.
    first_seen = sdf['label'].drop_duplicates()
    renumber = {label: rank for rank, label in enumerate(first_seen, start=1)}
    sdf['speaker'] = sdf['label'].replace(renumber)
    return sdf
def audio_to_df(uploaded):
    """Convert ``uploaded`` to ``mono.wav``, transcribe it, and tabulate the result.

    Reads the module-level names ``model_size`` (which Whisper model to load)
    and ``task`` (forwarded to ``transcribe``); both must be set before this
    function is called.

    Args:
        uploaded: Input file handed to ``monotize`` for conversion.

    Returns:
        A DataFrame built from the ``'segments'`` entry of the Whisper result.
    """
    monotize(uploaded)
    transcription = whisper.load_model(model_size).transcribe(
        'mono.wav',
        verbose=True,
        without_timestamps=False,
        task=task,
    )
    return pd.DataFrame(transcription['segments'])
def monotize(uploaded):
    """Convert ``uploaded`` to 16 kHz, mono, 16-bit PCM and write it to ``mono.wav``.

    Args:
        uploaded: Path of the input media file.

    Raises:
        subprocess.CalledProcessError: ffmpeg exited with a non-zero status.
        FileNotFoundError: the ``ffmpeg`` executable could not be found.
    """
    cmd = [
        "ffmpeg",
        "-y",  # overwrite mono.wav left over from a previous run
        "-i", str(uploaded),
        "-acodec", "pcm_s16le",
        "-ar", "16000",
        "-ac", "1",
        "mono.wav",
    ]
    # An argument list (no shell) keeps paths containing spaces or shell
    # metacharacters intact. check=True surfaces a failed conversion here
    # instead of letting later steps read a stale or missing mono.wav.
    subprocess.run(cmd, check=True)
def add_preface(row):
    """Format one passage as ``"Speaker <n>: <text>"``.

    Args:
        row: Mapping-like record with a ``'text'`` string and a ``'speaker'``
            value.

    Returns:
        The text with every newline removed, prefixed by the speaker tag.
    """
    # Dropping the newlines collapses the passage onto a single line.
    flattened = ''.join(row['text'].split('\n'))
    who = row['speaker']
    return 'Speaker {}: {}'.format(who, flattened)
def transcribe(uploaded, nu_speakers):
    """Transcribe a media file and attribute each passage to a speaker.

    The file is converted to ``mono.wav`` and shown in an audio player, then
    transcribed and diarized. Each transcript segment is tagged with a
    speaker, consecutive segments by the same speaker are merged, and every
    merged passage is written to the Streamlit page.

    Reads the module-level ``model_size`` name for the progress message.

    Args:
        uploaded: Path of the audio/video file to process.
        nu_speakers: Number of speakers handed to ``segment``.

    Returns:
        A dict with ``'text'`` (list of ``"Speaker N: ..."`` strings) and
        ``'df'`` (DataFrame with columns speaker, start, end, text). Both are
        empty when the transcription produced no segments.
    """
    tdf_cols = ['speaker', 'start', 'end', 'text']

    with st.spinner(text="Converting file..."):
        # Convert the file we were actually given, not a hard-coded name.
        monotize(uploaded)
    st.audio('mono.wav', format='audio/wav')

    with st.spinner(text=f"Transcribing using {model_size} model..."):
        tdf = audio_to_df(uploaded)
    if tdf.empty:
        # Nothing was transcribed, so there is nothing to attribute.
        return {'text': [], 'df': pd.DataFrame(columns=tdf_cols)}

    with st.spinner(text="Segmenting..."):
        sdf = segment(nu_speakers)

    # Mark the transcript line whose start is nearest to the start of each
    # diarized turn; lines in between inherit the speaker by filling below.
    tdf['speaker'] = float('nan')
    for turn in sdf[['start', 'speaker']].to_dict(orient='records'):
        nearest = (tdf['start'] - turn['start']).abs().idxmin()
        tdf.loc[nearest, 'speaker'] = turn['speaker']

    # Assign the filled column back instead of filling in place on a column
    # selection, which is not guaranteed to modify the frame.
    # Forward fill covers lines after a marked turn; backward fill covers
    # lines before the first one.
    tdf['speaker'] = tdf['speaker'].ffill().bfill()

    # A new passage starts wherever the speaker differs from the line before.
    tdf['n1'] = tdf['speaker'] != tdf['speaker'].shift(1)
    tdf['speach'] = tdf['n1'].cumsum()
    binned_df = tdf.groupby(['speach', 'speaker'])['text'].apply('\n'.join).reset_index()
    binned_df['speaker'] = binned_df['speaker'].astype(int)
    binned_df['output'] = binned_df.apply(add_preface, axis=1)

    lines = binned_df['output'].tolist()
    for line in lines:
        st.write(line)

    tdf['speaker'] = tdf['speaker'].astype(int)
    return {'text': lines, 'df': tdf[tdf_cols]}
# Introductory Markdown shown under the page title.
descript = ("This web app creates transcripts using OpenAI's [Whisper](https://github.com/openai/whisper) to transcribe "
            "audio files combined with [Chau](https://github.com/cvqluu)'s [Simple Diarizer](https://github.com/cvqluu/simple_diarizer) "
            "to partition the text by speaker.\n"
            "* You can upload an audio or video file of up to 200MBs.\n"
            "* Creating the transcript takes some time. "
            "The process takes approximately 20% of the length of the audio file using the base Whisper model.\n "
            "* The transcription process handles a variety of languages, and can also translate the audio to English. The tiny model is not good at translating. \n"
            "* Speaker segmentation seems to work best with the base model. The small model produces better transcripts, but something seems off with the timecodes, degrading the speaker attribution. \n"
            "* After uploading the file, be sure to select the number of speakers." )

# Menu label -> Whisper model name. The selectbox options are taken from the
# keys, so the menu and the lookup cannot drift apart.
MODEL_CHOICES = {
    'Tiny (fast)': 'tiny',
    'Base (good)': 'base',
    'Small (great but slow)': 'small',
}

st.title("Automated Transcription")
st.markdown(descript)

form = st.form(key='my_form')
uploaded = form.file_uploader("Choose a file")
nu_speakers = form.slider('Number of speakers in recording:', min_value=1, max_value=8, value=2, step=1)
models = form.selectbox(
    'Which Whisper model?',
    tuple(MODEL_CHOICES), index=1)
translate = form.checkbox('Translate to English?')
submit = form.form_submit_button("Transcribe!")

if submit:
    if uploaded is None:
        # The form can be submitted without a file; say so instead of
        # crashing on uploaded.getvalue().
        st.error("Please choose a file before pressing Transcribe!")
    else:
        # model_size and task stay module-level on purpose: audio_to_df()
        # and transcribe() read them as globals.
        model_size = MODEL_CHOICES[models]
        task = 'translate' if translate else 'transcribe'

        # Write the upload to disk so ffmpeg can read it by path.
        bytes_data = uploaded.getvalue()
        with open('temp_audio', 'wb') as outfile:
            outfile.write(bytes_data)

        transcript = transcribe('temp_audio', nu_speakers)

        csv = transcript['df'].to_csv(float_format='%.2f', index=False).encode('utf-8')
        text = '\n'.join(transcript['text']).encode('utf-8')
        download_url = create_download_link(text, 'transcript.txt', 'Download transcript as plain text.')
        st.markdown(download_url, unsafe_allow_html=True)
        download_url = create_download_link(csv, 'transcript.csv', 'Download transcript as CSV (with time codes)')
        st.markdown(download_url, unsafe_allow_html=True)
|