Spaces:
Runtime error
Runtime error
| import whisper | |
| import pandas as pd | |
| import whisper | |
| import subprocess | |
| from simple_diarizer.diarizer import Diarizer | |
| import streamlit as st | |
def speech_to_text(uploaded):
    """Transcribe an audio file with Whisper's ``base`` model.

    Args:
        uploaded: Audio input handed straight to ``model.transcribe``.

    Returns:
        The transcribed text prefixed with ``"You said: "``.
    """
    # The model is loaded on every call; transcription progress is echoed
    # because verbose=True.
    result = whisper.load_model('base').transcribe(uploaded, verbose=True)
    return f'You said: {result["text"]}'
def segment(nu_speakers):
    """Diarize ``mono.wav`` and number the speakers by order of appearance.

    Args:
        nu_speakers: Number of speakers passed to the diarizer as
            ``num_speakers``.

    Returns:
        A DataFrame built from the diarizer's segments, with an extra
        ``speaker`` column in which the first label encountered is 1, the
        next new label is 2, and so on.
    """
    diarizer = Diarizer(embed_model='xvec', cluster_method='sc')
    sdf = pd.DataFrame(diarizer.diarize('mono.wav', num_speakers=nu_speakers))

    # Map each raw cluster label to its 1-based rank of first appearance so
    # that whoever speaks first is always "speaker 1".
    first_seen = sdf['label'].drop_duplicates()
    renumber = {label: rank for rank, label in enumerate(first_seen, start=1)}
    sdf['speaker'] = sdf['label'].replace(renumber)
    return sdf
def audio_to_df(uploaded):
    """Convert *uploaded* to ``mono.wav`` and transcribe it into a DataFrame.

    Args:
        uploaded: Path of the media file to convert before transcription.

    Returns:
        A DataFrame with one row per entry of the ``segments`` list in the
        Whisper transcription result.
    """
    # Whisper reads the normalised mono file, not the upload itself.
    monotize(uploaded)
    transcription = whisper.load_model('base').transcribe(
        'mono.wav',
        verbose=True,
        without_timestamps=False,
    )
    return pd.DataFrame(transcription['segments'])
def monotize(uploaded):
    """Convert *uploaded* to a 16 kHz, mono, 16-bit PCM WAV file ``mono.wav``.

    The command is passed to ffmpeg as an argument list rather than a shell
    string, so paths containing spaces or shell metacharacters are handled
    literally and cannot inject extra commands.

    Args:
        uploaded: Path of the input media file.

    Raises:
        FileNotFoundError: If the ``ffmpeg`` executable cannot be found.
        subprocess.CalledProcessError: If ffmpeg exits with a non-zero
            status. Failing loudly prevents later steps from silently
            reusing a stale ``mono.wav`` left over from an earlier run.
    """
    cmd = [
        "ffmpeg",
        "-y",                    # overwrite mono.wav without prompting
        "-i", str(uploaded),
        "-acodec", "pcm_s16le",  # 16-bit little-endian PCM
        "-ar", "16000",          # 16 kHz sample rate
        "-ac", "1",              # single (mono) channel
        "mono.wav",
    ]
    subprocess.run(cmd, check=True)
def add_preface(row):
    """Render one transcript row as ``Speaker <speaker>: <text>``.

    Args:
        row: Mapping-like object with ``'text'`` and ``'speaker'`` entries.

    Returns:
        The formatted line, with every newline character removed from the
        text.
    """
    # Splitting on newlines and re-joining with nothing drops them all.
    text = ''.join(row['text'].split('\n'))
    speaker = row['speaker']
    return f'Speaker {speaker}: {text}'
def transcribe(uploaded, nu_speakers):
    """Transcribe *uploaded*, attribute each passage to a speaker, and show it.

    Steps: convert the file to ``mono.wav``, transcribe it, diarize it, tag
    each transcript line with a speaker, merge consecutive lines by the same
    speaker, then write every merged passage to the Streamlit page.

    Args:
        uploaded: Path of the media file to process.
        nu_speakers: Number of speakers to ask the diarizer for.

    Returns:
        The full transcript as one string, one ``Speaker N: ...`` passage
        per line.
    """
    with st.spinner(text="Converting file..."):
        # Convert the file the caller passed in. audio_to_df() converts it
        # again below; this first pass only exists to give the user a
        # dedicated progress message.
        monotize(uploaded)
    with st.spinner(text="Transcribing..."):
        tdf = audio_to_df(uploaded)
    with st.spinner(text="Segmenting..."):
        sdf = segment(nu_speakers)

    # For each diarized segment, tag the transcript line whose start time is
    # nearest to the segment's start with that segment's speaker.
    tdf['speaker'] = float('nan')
    for seg in sdf[['start', 'speaker']].to_dict(orient='records'):
        nearest_row = (tdf['start'] - seg['start']).abs().idxmin()
        nearest_id = tdf.loc[nearest_row, 'id']
        tdf.loc[tdf['id'] == nearest_id, 'speaker'] = seg['speaker']

    # Lines that were not tagged inherit the previous speaker; lines before
    # the first tag take the first speaker. Assigning the result back avoids
    # the chained in-place fillna, which does not modify the frame under
    # pandas Copy-on-Write.
    tdf['speaker'] = tdf['speaker'].ffill().bfill()

    # A new passage starts wherever the speaker differs from the line before;
    # the running count of those change points identifies each passage.
    tdf['n1'] = tdf['speaker'] != tdf['speaker'].shift(1)
    tdf['speach'] = tdf['n1'].cumsum()
    binned_df = tdf.groupby(['speach', 'speaker'])['text'].apply('\n'.join).reset_index()
    binned_df['speaker'] = binned_df['speaker'].astype(int)
    binned_df['output'] = binned_df.apply(add_preface, axis=1)

    lines = binned_df['output'].tolist()
    for line in lines:
        st.write(line)
    return '\n'.join(lines)
# Markdown blurb rendered under the page title: an introductory sentence
# followed by three bullet points. Each list entry below is one rendered line.
descript = "".join([
    "This web app creates transcripts using OpenAI's "
    "[Whisper](https://github.com/openai/whisper) to transcribe "
    "audio files combined with [Chau](https://github.com/cvqluu)'s "
    "[Simple Diarizer](https://github.com/cvqluu/simple_diarizer) "
    "to partition the text by speaker.\n",
    "* You can upload a audio or video file of up to 200MBs.\n",
    # NOTE: this entry ends with a newline followed by a single space, so the
    # last bullet is preceded by one space in the final string.
    "* Creating the transcript takes some time. "
    "Using the default base transcription model, the process takes "
    "approximately 20% of the length of the audio file.\n ",
    "* After uploading the file, **be sure to select the number of speakers**.",
])
# --- Page layout -----------------------------------------------------------
st.title("Automated Transcription")
st.markdown(descript)

# The widgets live in a form so their values are read together when the
# submit button is pressed.
form = st.form(key='my_form')
uploaded = form.file_uploader("Choose a file")
nu_speakers = form.slider('Number of speakers in audio file:', min_value=1, max_value=6, value=2, step=1)
submit = form.form_submit_button("Transcribe!")

if submit:
    if uploaded is None:
        # The uploader yields None when no file was chosen; without this
        # guard the .getvalue() call below raises AttributeError.
        st.error("Please choose a file before pressing Transcribe!")
    else:
        # Persist the upload to disk so ffmpeg can read it by path.
        with open('temp_audio', 'wb') as outfile:
            outfile.write(uploaded.getvalue())
        text = transcribe('temp_audio', nu_speakers)