Spaces:
Running
Running
Upload 3 files
Browse files- app.py +34 -0
- deepgram_transcribe.py +129 -0
- requirements.txt +2 -0
app.py
ADDED
|
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import gradio as gr
|
| 2 |
+
from pathlib import Path
|
| 3 |
+
from deepgram_transcribe import process
|
| 4 |
+
import os
|
| 5 |
+
|
| 6 |
+
def upload_file(files):
    """Return the on-disk paths of the files received from the UploadButton.

    Parameters
    ----------
    files : list
        Uploaded file objects as delivered by Gradio; each exposes a
        ``.name`` attribute holding its temporary path.

    Returns
    -------
    list[str]
        One path string per uploaded file, in upload order.
    """
    paths = []
    for uploaded in files:
        paths.append(uploaded.name)
    return paths
|
| 9 |
+
|
| 10 |
+
def process_submit(title, file, upload_button, progress=gr.Progress()):
    """Gradio submit handler: transcribe/split the uploaded audio.

    Parameters
    ----------
    title, upload_button :
        Extra Interface inputs; not used by the computation itself.
    file :
        The uploaded audio file to hand to ``process``.
    progress : gr.Progress
        Injected by Gradio to report progress in the UI.

    Returns
    -------
    str
        Path to the zip archive produced by ``process``.
    """
    progress(0, desc="Starting...")
    return process(file, progress)
|
| 14 |
+
|
| 15 |
+
# --- UI construction (runs at import time, outside any Blocks context) ---

# NOTE(review): `track_title` is created but never passed to the Interface
# below — it appears to be unused; confirm whether it should be an input.
track_title = gr.Markdown(
    """
    You can track your progress here
    """)

file_output = gr.File()
upload_button = gr.UploadButton("Click to upload a file", file_types=["audio","video"], file_count="multiple")

title = gr.Markdown(
    """
    # Playground
    Upload your audio file here.
    """)

# NOTE(review): gr.Interface is normally constructed directly rather than used
# as a `with` context (that pattern belongs to gr.Blocks); this relies on
# Interface inheriting Blocks' context-manager behavior — confirm the upload
# wiring renders as intended.
with gr.Interface(fn=process_submit, inputs=[title, file_output, upload_button], outputs="file", allow_flagging="never") as demo:
    # When files are uploaded, mirror their paths into the gr.File component.
    upload_button.upload(upload_file, upload_button, file_output)

# Launches immediately on import; there is no `if __name__ == "__main__":` guard.
demo.launch()
|
| 33 |
+
|
| 34 |
+
|
deepgram_transcribe.py
ADDED
|
@@ -0,0 +1,129 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# this is a script that transcribes a downloaded youtube video using deepgram
|
| 2 |
+
# the audio should be cleaned with UVR5 first, so the file is flac
|
| 3 |
+
# it will upload the full length of interview or podcast to deepgram
|
| 4 |
+
# and will return the speaker id. The user must manually listen to the audio clips to find out which speaker is wanted
|
| 5 |
+
# discard remaining speakers and short length audio
|
| 6 |
+
#
|
| 7 |
+
|
| 8 |
+
import os
|
| 9 |
+
from dotenv import load_dotenv
|
| 10 |
+
from pydub import AudioSegment
|
| 11 |
+
import math
|
| 12 |
+
from os.path import join
|
| 13 |
+
import shutil
|
| 14 |
+
|
| 15 |
+
from deepgram import (
|
| 16 |
+
DeepgramClient,
|
| 17 |
+
PrerecordedOptions,
|
| 18 |
+
FileSource,
|
| 19 |
+
)
|
| 20 |
+
|
| 21 |
+
def process(audio_file, progress):
    """Transcribe *audio_file* with Deepgram, cut it into per-sentence clips
    grouped by diarized speaker, write a pipe-separated index, and zip it all.

    Parameters
    ----------
    audio_file : str | os.PathLike
        Path to the audio file (expected pre-cleaned, e.g. flac).
    progress :
        Gradio progress callback: callable with a float in [0, 1] (optional
        ``desc=``) and exposing ``.tqdm`` for iteration with progress.

    Returns
    -------
    str
        ``"output.zip"`` — the archive created in the current working directory.

    Raises
    ------
    Exception
        Re-raises any Deepgram transcription failure after printing it.
    """
    load_dotenv("myenv-variable.env")

    AUDIO_FILE = audio_file  # path of the audio to process
    TAGS = "vogue"           # youtube source, for categorization of output files

    API_KEY = os.getenv('API_DEEPGRAM')
    original_parent_folder = os.getcwd()
    print(original_parent_folder)

    # Per-speaker folders and the index file all live under ./output.
    speaker_folder = join(original_parent_folder, "output")
    os.makedirs(speaker_folder, exist_ok=True)  # race-free vs. isdir+mkdir

    deepgram = DeepgramClient(API_KEY)

    with open(AUDIO_FILE, "rb") as audio_fh:
        buffer_data = audio_fh.read()

    payload: FileSource = {
        "buffer": buffer_data,
    }

    # Deepgram options: diarize=True is required so clips can be grouped by speaker.
    options = PrerecordedOptions(
        model="nova-2",
        smart_format=True,
        filler_words=True,
        diarize=True
    )

    progress(0.20)

    try:
        response = deepgram.listen.prerecorded.v("1").transcribe_file(payload, options)
    except Exception as e:
        # BUGFIX: the original printed the error and fell through, so the
        # later use of `response` crashed with a NameError. Re-raise so the
        # caller (and the Gradio UI) sees the real failure.
        print(e)
        raise

    progress(0.30)

    audio = AudioSegment.from_file(AUDIO_FILE)
    data = response

    paragraphs = data['results']['channels'][0]['alternatives'][0]['paragraphs']['paragraphs']

    # Pipe-separated index rows; header row first.
    csv_data = [["filename", "speaker", "text", "start_time", "end_time", "duration"]]

    i = 1  # sentence counter across ALL paragraphs (used in clip file names)

    progress(0.40)

    for paragraph in progress.tqdm(paragraphs, desc="Generating..."):
        for text in paragraph['sentences']:
            # Sentence boundaries in ms with a 5 ms safety buffer on each side.
            # BUGFIX: clamp the start at 0 so a sentence beginning within the
            # first 5 ms cannot produce a negative slice index.
            start_time_ms = max(0, math.floor(text['start'] * 1000) - 5)
            end_time_ms = math.ceil(text['end'] * 1000) + 5
            duration_s = round(text['end'] - text['start'], 3)
            duration_ms = str(end_time_ms - start_time_ms).zfill(6)
            speaker_id = paragraph['speaker']

            folder_path = join(speaker_folder, "Speaker_" + str(speaker_id))
            os.makedirs(folder_path, exist_ok=True)

            # NOTE(review): speaker 10 is renamed AFTER its folder is created,
            # so those clips land in Speaker_10/ but are named
            # ..._Speaker_Tayr_... Behavior preserved — confirm intentional.
            if speaker_id == 10:
                speaker_id = "Tayr"

            # Slice the sentence out of the full audio (pydub works in ms).
            segment = audio[start_time_ms:end_time_ms]

            # Single canonical clip name, reused for the index row (prefixed
            # with "wavs/", presumably for downstream training tooling) and
            # for the actual export path — previously duplicated f-strings.
            clip_name = f"{TAGS}_Speaker_{speaker_id}_i{str(i).zfill(3)}_d{duration_ms}.wav"
            file_name = join("wavs", clip_name)
            export_path = join(folder_path, clip_name)
            segment.export(export_path, format="wav")

            # Record the clip in the index.
            csv_data.append([file_name, speaker_id, text['text'], start_time_ms, end_time_ms, duration_s])

            i += 1

    # Write the pipe-separated index next to the speaker folders.
    csv_filename = join(speaker_folder, f"{TAGS}_output.txt")
    with open(csv_filename, 'w', encoding='utf-8') as out_file:
        for row in csv_data:
            out_file.write('|'.join(str(item) for item in row) + '\n')

    progress(0.90)

    # Zip the whole output folder into ./output.zip for download.
    shutil.make_archive("output", 'zip', speaker_folder)
    print(f"Data written to {csv_filename}")

    progress(1.00)
    return "output.zip"
|
requirements.txt
ADDED
|
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
|
|
|
| 1 |
+
python-dotenv
|
| 2 |
+
deepgram-sdk==3.2.6
|