Spaces:
Runtime error
Runtime error
Neal Caren
committed on
Commit
·
9ec01b8
1
Parent(s):
555bd19
testing
Browse files
app.py
CHANGED
|
@@ -1,13 +1,99 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
import streamlit as st
|
| 2 |
|
| 3 |
|
| 4 |
-
uploaded_file = st.file_uploader("Choose a file")
|
| 5 |
|
| 6 |
-
|
|
|
|
|
|
|
|
|
|
| 7 |
|
| 8 |
-
|
| 9 |
|
| 10 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 11 |
# To read file as bytes:
|
| 12 |
-
bytes_data = uploaded_file.getvalue()
|
| 13 |
-
st.write(bytes_data)
|
|
|
|
| 1 |
+
import subprocess

import pandas as pd
import streamlit as st
import whisper
from simple_diarizer.diarizer import Diarizer
|
| 7 |
|
| 8 |
|
|
|
|
| 9 |
|
| 10 |
+
def speech_to_text(uploaded):
    """Transcribe an audio file with Whisper's 'tiny' model.

    Parameters:
        uploaded: path (or file) readable by whisper.transcribe.

    Returns:
        A string of the form 'You said: <transcript>'.
    """
    tiny_model = whisper.load_model('tiny')
    transcription = tiny_model.transcribe(uploaded)
    return 'You said: {}'.format(transcription["text"])
|
| 14 |
|
| 15 |
+
def segment(nu_speakers, wav_file='mono.wav'):
    """Diarize a mono WAV file into speaker segments.

    Parameters:
        nu_speakers: number of speakers to cluster the audio into.
        wav_file: path to the 16 kHz mono WAV to diarize. Defaults to
            'mono.wav', the file monotize() writes, so existing callers
            are unaffected.

    Returns:
        A pandas DataFrame of the diarizer's segments with an added
        'speaker' column, renumbered so the first speaker heard is 1.
    """
    diar = Diarizer(embed_model='xvec', cluster_method='sc')
    segments = diar.diarize(wav_file, num_speakers=nu_speakers)

    sdf = pd.DataFrame(segments)

    # Renumber the raw cluster labels in order of first appearance so
    # the first speaker is always speaker 1, whatever ids the
    # clusterer happened to assign.
    speaker_s = sdf['label'].drop_duplicates().reset_index()['label']
    speaker_d = dict((v, k + 1) for k, v in speaker_s.items())

    sdf['speaker'] = sdf['label'].replace(speaker_d)
    return sdf
|
| 28 |
+
|
| 29 |
+
def audio_to_df(uploaded):
    """Transcribe an uploaded audio file with Whisper's 'base' model.

    Normalizes the upload to 16 kHz mono first (via monotize), then
    transcribes 'mono.wav' with per-segment timestamps.

    Returns:
        A pandas DataFrame with one row per Whisper segment (downstream
        code reads its 'id', 'start', and 'text' columns).
    """
    monotize(uploaded)
    base_model = whisper.load_model('base')
    transcription = base_model.transcribe('mono.wav', verbose=True,
                                          without_timestamps=False)
    return pd.DataFrame(transcription['segments'])
|
| 36 |
+
|
| 37 |
+
def monotize(uploaded):
    """Convert an uploaded audio file to 16 kHz mono PCM WAV ('mono.wav').

    Both the transcriber and the diarizer read 'mono.wav', so every
    upload is normalized through ffmpeg first.

    Parameters:
        uploaded: object with a .name attribute naming the audio file
            on disk (e.g. a Streamlit UploadedFile).

    Raises:
        subprocess.CalledProcessError: if ffmpeg exits non-zero (the
            old Popen(...).wait() form silently ignored failures).
    """
    # Pass the command as an argument list with shell=False: the
    # previous f-string + shell=True version broke on filenames
    # containing spaces and allowed shell injection via the uploaded
    # file's name.
    cmd = ['ffmpeg', '-y', '-i', uploaded.name,
           '-acodec', 'pcm_s16le', '-ar', '16000', '-ac', '1', 'mono.wav']
    subprocess.run(cmd, check=True)
|
| 41 |
+
|
| 42 |
+
def add_preface(row):
    """Format one transcript row as 'Speaker N: <text>'.

    Newlines inside the text are stripped so each speaking turn renders
    as a single prefixed line.
    """
    cleaned = row['text'].replace('\n', '')
    return 'Speaker {}: {}'.format(row['speaker'], cleaned)
|
| 46 |
+
|
| 47 |
+
def transcribe(uploaded, nu_speakers):
    """Produce a speaker-labeled transcript for an uploaded audio file.

    Parameters:
        uploaded: uploaded audio file object (needs a .name attribute).
        nu_speakers: number of speakers to diarize the audio into.

    Returns:
        A single string, one 'Speaker N: ...' block per speaking turn,
        joined with newlines.
    """
    # NOTE: audio_to_df() runs monotize() itself; the extra
    # monotize(uploaded) call that used to be here ran ffmpeg twice
    # for no effect and has been removed.
    tdf = audio_to_df(uploaded)
    sdf = segment(nu_speakers)

    ns_list = sdf[['start', 'speaker']].to_dict(orient='records')

    # Find the transcript line whose start time is nearest to each
    # diarized segment's start, and tag it with that segment's speaker.
    for row in ns_list:
        seg_start = row['start']  # renamed: 'input' shadowed a builtin
        nearest_id = tdf.iloc[(tdf['start'] - seg_start).abs().argsort()[:1]]['id'].values[0]
        tdf.loc[tdf['id'] == nearest_id, 'speaker'] = row['speaker']

    # Untagged lines inherit the previous speaker; back-fill covers any
    # lines before the first tagged one. (ffill()/bfill() replace the
    # deprecated fillna(method=...) + inplace-on-a-column pattern.)
    tdf['speaker'] = tdf['speaker'].ffill().bfill()

    # Group consecutive lines by the same speaker into one speech turn.
    tdf['n1'] = tdf['speaker'] != tdf['speaker'].shift(1)
    tdf['speach'] = tdf['n1'].cumsum()
    binned_df = tdf.groupby(['speach', 'speaker'])['text'].apply('\n'.join).reset_index()

    binned_df['speaker'] = binned_df['speaker'].astype(int)
    binned_df['output'] = binned_df.apply(add_preface, axis=1)

    return '\n'.join(binned_df['output'].values)
|
| 76 |
+
|
| 77 |
+
|
| 78 |
+
# User-facing description rendered in the app UI. Typos fixed:
# "requies" -> "requires", "to be cap" -> "to be a cap".
descript = ("This web app creates transcripts using OpenAI's [Whisper](https://github.com/openai/whisper) to transcribe "
            "audio files combined with [Chau](https://github.com/cvqluu)'s [Simple Diarizer](https://github.com/cvqluu/simple_diarizer) "
            "to partition the text by speaker.\n"
            "* Creating the transcript takes some time. "
            "Using the default base transcription model, the process takes approximately 20% of the length of the audio file.\n "
            "* There seems to be a cap on the uploaded file size of about 20MBs. My [colab](https://colab.research.google.com/drive/18AD-mb3bT4s8k3UNhZu-ghPq2DT5il3V?usp=sharing) version "
            "can handle any file size, but requires some Python knowledge.\n"
            "* After uploading the file, **be sure to select the number of speakers**." )
|
| 86 |
+
|
| 87 |
+
|
| 88 |
+
# Input form: the file upload and speaker count are submitted together,
# so the app only reruns the expensive transcription when the user
# clicks the submit button rather than on every widget change.
with st.form(key='my_form'):
    uploaded = st.file_uploader("Choose a file")
    nu_speakers = st.slider('Number of speakers in audio file:', min_value=1, max_value=6, value=2, step=1)
    submit = st.form_submit_button("Transcribe!")
|
| 92 |
+
|
| 93 |
+
|
| 94 |
+
# Handle a form submission: persist the upload to disk, transcribe it,
# and show the result.
if submit:
    if uploaded is None:
        # Guard: the form can be submitted before any file is chosen;
        # uploaded.getvalue() previously raised AttributeError on None.
        st.error('Please upload an audio file before transcribing.')
    else:
        # Write the uploaded bytes to a real file so whisper can read
        # them from a path.
        bytes_data = uploaded.getvalue()
        with open('temp_audio', 'wb') as outfile:
            outfile.write(bytes_data)
        # Display the transcription; the return value was previously
        # computed and then silently discarded.
        st.write(speech_to_text('temp_audio'))
|
| 99 |
# To read file as bytes:
|
|
|
|
|
|