File size: 5,261 Bytes
9ec01b8
 
 
 
 
faf2a27
ee99df3
faf2a27
4ccc97f
ee99df3
 
 
 
 
4ccc97f
 
da8c6a9
 
1ac8b8b
da8c6a9
 
 
 
 
 
 
 
 
 
 
 
 
4ccc97f
da8c6a9
1ac8b8b
 
da8c6a9
 
 
9e9f496
 
 
faf2a27
da8c6a9
 
 
 
 
 
6178c80
 
4ccc97f
 
 
 
 
9908ddd
6178c80
 
 
da8c6a9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4ccc97f
 
 
c0d73db
ee99df3
da8c6a9
649cc15
80f0f94
 
 
4ccc97f
80f0f94
4ccc97f
1ac8b8b
 
4ccc97f
80f0f94
 
 
 
014d79d
 
4ccc97f
9908ddd
 
 
1ac8b8b
014d79d
9ec01b8
 
c089e11
9908ddd
 
 
 
 
 
 
1ac8b8b
 
 
 
 
9ec01b8
 
 
ee99df3
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
import whisper
import pandas as pd
import whisper
import subprocess
from simple_diarizer.diarizer import Diarizer
import streamlit as st
import base64




def create_download_link(val, filename, label):
    """Return an HTML anchor that downloads *val* as *filename*.

    Args:
        val: bytes payload to embed in the data URI.
        filename: name the browser should save the file as.
        label: visible link text.

    Returns:
        An ``<a>`` tag string suitable for ``st.markdown(..., unsafe_allow_html=True)``.
    """
    b64 = base64.b64encode(val)
    # BUG FIX: the download attribute previously hard-coded "(unknown)";
    # use the filename argument so the saved file gets the intended name.
    return (
        f'<a href="data:application/octet-stream;base64,{b64.decode()}" '
        f'download="{filename}">{label}</a>'
    )


def segment(nu_speakers):
    """Diarize mono.wav and return a DataFrame with a 1-based speaker column.

    Args:
        nu_speakers: number of speakers to ask the diarizer to find.

    Returns:
        DataFrame of diarization segments with an added ``speaker`` column
        numbered in order of first appearance (speaker 1 talks first).
    """
    diarizer = Diarizer(embed_model='ecapa', cluster_method='sc')
    raw_segments = diarizer.diarize('mono.wav', num_speakers=nu_speakers)

    sdf = pd.DataFrame(raw_segments)

    # Renumber speaker labels by order of first appearance so the first
    # voice heard is always "speaker 1".
    first_seen = sdf['label'].drop_duplicates().reset_index()['label']
    relabel = {label: position + 1 for position, label in first_seen.items()}

    sdf['speaker'] = sdf['label'].replace(relabel)
    return sdf

def audio_to_df(uploaded):
    # Transcribe *uploaded* with Whisper and return its segments as a DataFrame.
    # NOTE(review): reads module-level globals `model_size` and `task`, which
    # are only bound inside the `if submit:` branch at the bottom of the file —
    # confirm this is never called before form submission.
    monotize(uploaded)  # (re)writes mono.wav; transcribe() already converts once
    model = whisper.load_model(model_size)
    result = model.transcribe('mono.wav',verbose=True,
                          without_timestamps=False,
                           task = task)
    # Downstream code (transcribe) relies on 'id', 'start', 'end', 'text'
    # columns being present in this frame.
    tdf = pd.DataFrame(result['segments'])
    return tdf

def monotize(uploaded):
    """Convert *uploaded* (any ffmpeg-readable file) to 16 kHz mono PCM 'mono.wav'.

    Overwrites mono.wav in the working directory.

    Args:
        uploaded: path of the input audio/video file.
    """
    # Pass arguments as a list with the default shell=False so a filename
    # containing spaces or shell metacharacters cannot break (or inject into)
    # the command; the old shell=True f-string was injection-prone.
    cmd = ['ffmpeg', '-y', '-i', str(uploaded),
           '-acodec', 'pcm_s16le', '-ar', '16000', '-ac', '1', 'mono.wav']
    # Like the original Popen(...).wait(), the exit code is deliberately
    # not checked (best-effort conversion).
    subprocess.run(cmd)

def add_preface(row):
    """Prefix a transcript row's text with its speaker label.

    Args:
        row: mapping with 'text' and 'speaker' entries (e.g. a DataFrame row).

    Returns:
        The row's text, newlines removed, prefixed with "Speaker N: ".
    """
    # Strip embedded newlines so each utterance renders on one line.
    cleaned = row['text'].replace('\n', '')
    return 'Speaker {}: {}'.format(row['speaker'], cleaned)

def transcribe(uploaded, nu_speakers):
    """Transcribe *uploaded*, attribute speakers, and render results in Streamlit.

    Args:
        uploaded: path to the audio file saved from the upload form.
        nu_speakers: number of speakers to pass to the diarizer.

    Returns:
        dict with 'text' (list of "Speaker N: ..." lines) and 'df'
        (DataFrame with speaker/start/end/text columns).
    """
    with st.spinner(text="Converting file..."):
        monotize('temp_audio')

    # FIX: the original opened mono.wav and read it into an unused variable,
    # leaking the file handle; st.audio only needs the path.
    st.audio('mono.wav', format='audio/wav')

    with st.spinner(text=f"Transcribing using {model_size} model..."):
        tdf = audio_to_df(uploaded)
    with st.spinner(text="Segmenting..."):
        sdf = segment(nu_speakers)

    ns_list = sdf[['start', 'speaker']].to_dict(orient='records')

    # Tag the transcript line whose start time is nearest each diarized
    # speaker turn. (Renamed locals that shadowed builtins `input` and `id`.)
    for row in ns_list:
        turn_start = row['start']
        nearest_id = tdf.iloc[(tdf['start'] - turn_start).abs().argsort()[:1]]['id'].values[0]
        tdf.loc[tdf['id'] == nearest_id, 'speaker'] = row['speaker']

    # Propagate speaker labels to untagged lines: forward first, then backward
    # for any leading lines before the first tagged one.
    # (fillna(method=...) is deprecated in modern pandas; use ffill/bfill.)
    tdf['speaker'] = tdf['speaker'].ffill().bfill()

    # Group consecutive same-speaker rows into single "speech" turns.
    tdf['n1'] = tdf['speaker'] != tdf['speaker'].shift(1)
    tdf['speach'] = tdf['n1'].cumsum()
    binned_df = tdf.groupby(['speach', 'speaker'])['text'].apply('\n'.join).reset_index()

    binned_df['speaker'] = binned_df['speaker'].astype(int)
    binned_df['output'] = binned_df.apply(add_preface, axis=1)

    # Display each speaker turn as it is collected.
    lines = []
    for line in binned_df['output'].values:
        st.write(line)
        lines.append(line)
    tdf['speaker'] = tdf['speaker'].astype(int)

    tdf_cols = ['speaker', 'start', 'end', 'text']
    return {'text': lines, 'df': tdf[tdf_cols]}


# Intro text shown above the upload form (markdown; links to the upstream
# Whisper and Simple Diarizer projects).
descript = ("This web app creates transcripts using OpenAI's [Whisper](https://github.com/openai/whisper) to transcribe "
            "audio files combined with [Chau](https://github.com/cvqluu)'s [Simple Diarizer](https://github.com/cvqluu/simple_diarizer) "
            "to partition the text by speaker.\n"
            "* You can upload an audio or video file of up to 200MBs.\n"
            "* Creating the transcript takes some time. "
            "The process takes approximately 20% of the length of the audio file using the base Whisper model.\n "
            "* The transcription process handles a variety of languages, and can also translate the audio to English. The tiny model is not good at translating. \n"
            "* Speaker segmentation seems to work best with the base model. The small model produces better transcripts, but something seems off with the timecodes, degrading the speaker attribution. \n"
            "* After uploading the file, be sure to select the number of speakers." )

st.title("Automated Transcription")
st.markdown(descript)

# Upload form: everything inside is submitted atomically on "Transcribe!".
form = st.form(key='my_form')
uploaded = form.file_uploader("Choose a file")  # None until a file is chosen
nu_speakers = form.slider('Number of speakers in recording:', min_value=1, max_value=8, value=2, step=1)
models = form.selectbox(
    'Which Whisper model?',
    ('Tiny (fast)', 'Base (good)', 'Small (great but slow)'), index=1)
translate = form.checkbox('Translate to English?')
submit = form.form_submit_button("Transcribe!")


if submit:
    # Map the friendly selectbox labels to Whisper model names.
    # (model_size and task are read as globals by audio_to_df/transcribe.)
    model_size = {
        'Tiny (fast)': 'tiny',
        'Base (good)': 'base',
        'Small (great but slow)': 'small',
    }[models]

    task = 'translate' if translate else 'transcribe'

    if uploaded is None:
        # FIX: file_uploader returns None until a file is chosen; the original
        # crashed here with AttributeError on uploaded.getvalue().
        st.error('Please upload a file before transcribing.')
    else:
        # Persist the upload to disk so ffmpeg can read it.
        bytes_data = uploaded.getvalue()
        with open('temp_audio', 'wb') as outfile:
            outfile.write(bytes_data)
        transcript = transcribe('temp_audio', nu_speakers)

        csv = transcript['df'].to_csv(float_format='%.2f', index=False).encode('utf-8')
        text = '\n'.join(transcript['text']).encode('utf-8')

        download_url = create_download_link(text, 'transcript.txt', 'Download transcript as plain text.')
        st.markdown(download_url, unsafe_allow_html=True)

        download_url = create_download_link(csv, 'transcript.csv', 'Download transcript as CSV (with time codes)')
        st.markdown(download_url, unsafe_allow_html=True)