Spaces:

NealCaren
/

transcript

Runtime error

Neal Caren

full?

80f0f94 over 3 years ago

3.59 kB

	import whisper
	import pandas as pd
	import whisper
	import subprocess
	from simple_diarizer.diarizer import Diarizer
	import streamlit as st

	def speech_to_text(uploaded):
	    """Transcribe ``uploaded`` with the Whisper ``base`` model.

	    Args:
	        uploaded: Audio input handed straight to ``model.transcribe``.

	    Returns:
	        The transcribed text prefixed with ``"You said: "``.
	    """
	    transcription = whisper.load_model('base').transcribe(uploaded, verbose=True)
	    spoken = transcription["text"]
	    return f'You said: {spoken}'

	def segment(nu_speakers):
	    """Diarize ``mono.wav`` and number the speakers by first appearance.

	    Args:
	        nu_speakers: Number of speakers passed to the diarizer.

	    Returns:
	        A DataFrame built from the diarizer output, with an added
	        ``speaker`` column in which the first label encountered is 1,
	        the next new label is 2, and so on.
	    """
	    diarizer = Diarizer(embed_model='xvec', cluster_method='sc')
	    # presumably a list of per-segment records; only the 'label' field is
	    # relied on here -- confirm against simple_diarizer.
	    sdf = pd.DataFrame(diarizer.diarize('mono.wav', num_speakers=nu_speakers))

	    # Rank each distinct label by the order in which it first occurs so the
	    # first voice heard is always speaker 1.
	    first_seen = sdf['label'].drop_duplicates()
	    label_to_speaker = {
	        label: rank for rank, label in enumerate(first_seen, start=1)
	    }

	    sdf['speaker'] = sdf['label'].replace(label_to_speaker)
	    return sdf

	def audio_to_df(uploaded):
	    """Convert ``uploaded`` to ``mono.wav`` and transcribe it into a table.

	    Args:
	        uploaded: Path of the media file, forwarded to ``monotize``.

	    Returns:
	        A DataFrame built from the ``segments`` entry of the Whisper
	        ``base`` model's transcription result.
	    """
	    monotize(uploaded)
	    transcription = whisper.load_model('base').transcribe(
	        'mono.wav',
	        without_timestamps=False,
	        verbose=True,
	    )
	    return pd.DataFrame(transcription['segments'])

	def monotize(uploaded):
	    """Convert a media file to 16 kHz, single-channel, 16-bit PCM ``mono.wav``.

	    The command is passed to ffmpeg as an argument list rather than through
	    a shell, so paths containing spaces or shell metacharacters are handled
	    literally instead of being split or executed.

	    Args:
	        uploaded: Path of the input file given to ffmpeg's ``-i`` option.

	    Raises:
	        FileNotFoundError: If the ``ffmpeg`` executable cannot be found.
	        subprocess.CalledProcessError: If ffmpeg exits with a non-zero
	            status. Failing loudly prevents later steps from silently
	            reusing a stale ``mono.wav`` left by an earlier run.
	    """
	    cmd = [
	        'ffmpeg',
	        '-y',  # overwrite any existing mono.wav without prompting
	        '-i', str(uploaded),
	        '-acodec', 'pcm_s16le',
	        '-ar', '16000',
	        '-ac', '1',
	        'mono.wav',
	    ]
	    subprocess.run(cmd, check=True)

	def add_preface(row):
	    """Format one passage as ``"Speaker <n>: <text>"``.

	    Args:
	        row: Mapping-like record with ``text`` and ``speaker`` entries.

	    Returns:
	        The passage text with every newline removed, prefixed by its
	        speaker number.
	    """
	    # Splitting on newlines and re-joining with nothing strips them all.
	    flattened = ''.join(row['text'].split('\n'))
	    who = row['speaker']
	    return f'Speaker {who}: {flattened}'

	def transcribe(uploaded, nu_speakers):
	    """Transcribe a media file and label each passage with its speaker.

	    The transcript segments and the diarization segments are produced
	    separately; each diarization segment then tags the transcript line whose
	    start time is closest to it, untagged lines inherit the neighbouring
	    speaker, and consecutive lines by the same speaker are merged.

	    Args:
	        uploaded: Path of the media file to transcribe.
	        nu_speakers: Number of speakers passed to ``segment``.

	    Returns:
	        The speaker-prefixed passages joined by newlines. Each passage is
	        also written to the Streamlit page. An empty string is returned
	        when the transcript contains no segments.
	    """
	    # audio_to_df() converts `uploaded` to mono.wav itself, so a separate
	    # conversion pass here would only repeat the same ffmpeg run.
	    with st.spinner(text="Transcribing..."):
	        tdf = audio_to_df(uploaded)
	    with st.spinner(text="Segmenting..."):
	        sdf = segment(nu_speakers)

	    if tdf.empty:
	        return ''

	    # Tag the transcript line nearest to the start of each speaker segment.
	    tdf['speaker'] = float('nan')
	    for start, speaker in zip(sdf['start'], sdf['speaker']):
	        nearest = (tdf['start'] - start).abs().idxmin()
	        tdf.loc[nearest, 'speaker'] = speaker

	    # Untagged lines take the previous speaker; lines before the first tag
	    # take the following one. Assigning the result back (instead of an
	    # in-place fill on a column selection) guarantees the frame is updated.
	    tdf['speaker'] = tdf['speaker'].ffill().bfill()

	    # A new turn starts whenever the speaker differs from the previous line.
	    new_turn = tdf['speaker'] != tdf['speaker'].shift(1)
	    tdf['turn'] = new_turn.cumsum()
	    binned_df = (
	        tdf.groupby(['turn', 'speaker'])['text']
	        .apply('\n'.join)
	        .reset_index()
	    )

	    binned_df['speaker'] = binned_df['speaker'].astype(int)
	    binned_df['output'] = binned_df.apply(add_preface, axis=1)

	    lines = list(binned_df['output'])
	    for line in lines:
	        st.write(line)

	    return '\n'.join(lines)


	# Introductory text rendered as Markdown at the top of the page.
	descript = ("This web app creates transcripts using OpenAI's [Whisper](https://github.com/openai/whisper) to transcribe "
	    "audio files combined with [Chau](https://github.com/cvqluu)'s [Simple Diarizer](https://github.com/cvqluu/simple_diarizer) "
	    "to partition the text by speaker.\n"
	    "* You can upload a audio or video file of up to 200MBs.\n"
	    "* Creating the transcript takes some time. "
	    "Using the default base transcription model, the process takes approximately 20% of the length of the audio file.\n "
	    "* After uploading the file, be sure to select the number of speakers." )

	st.title("Automated Transcription")
	st.markdown(descript)

	# Collect the file and the speaker count in one form with a submit button.
	form = st.form(key='my_form')
	uploaded = form.file_uploader("Choose a file")
	nu_speakers = form.slider('Number of speakers in audio file:', min_value=1, max_value=6, value=2, step=1)
	submit = form.form_submit_button("Transcribe!")


	if submit:
	    if uploaded is None:
	        # The form can be submitted before a file is chosen; without this
	        # guard the .getvalue() call below fails on None.
	        st.warning("Please choose a file before transcribing.")
	    else:
	        # Write the upload to disk because the conversion step takes a path.
	        with open('temp_audio', 'wb') as outfile:
	            outfile.write(uploaded.getvalue())
	        text = transcribe('temp_audio', nu_speakers)