Spaces:
Running
Running
Upload 3 files
Browse files- app.py +34 -0
- deepgram_transcribe.py +129 -0
- requirements.txt +2 -0
app.py
ADDED
|
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import gradio as gr
|
| 2 |
+
from pathlib import Path
|
| 3 |
+
from deepgram_transcribe import process
|
| 4 |
+
import os
|
| 5 |
+
|
| 6 |
+
def upload_file(files):
    """Return the on-disk paths of the files received from the UploadButton.

    Parameters
    ----------
    files : list
        Uploaded file objects as delivered by Gradio; each exposes a
        ``.name`` attribute holding its temporary path.

    Returns
    -------
    list[str]
        One path string per uploaded file, in upload order.
    """
    paths = []
    for uploaded in files:
        paths.append(uploaded.name)
    return paths
|
| 9 |
+
|
| 10 |
+
def process_submit(title, file, upload_button, progress=gr.Progress()):
    """Gradio submit handler: transcribe/split the uploaded audio.

    Parameters
    ----------
    title, upload_button :
        Extra Interface inputs; not used by the computation itself.
    file :
        The uploaded audio file to hand to ``process``.
    progress : gr.Progress
        Injected by Gradio to report progress in the UI.

    Returns
    -------
    str
        Path to the zip archive produced by ``process``.
    """
    progress(0, desc="Starting...")
    return process(file, progress)
|
| 14 |
+
|
| 15 |
+
# --- UI construction (runs at import time, outside any Blocks context) ---

# NOTE(review): `track_title` is created but never passed to the Interface
# below — it appears to be unused; confirm whether it should be an input.
track_title = gr.Markdown(
    """
    You can track your progress here
    """)

file_output = gr.File()
upload_button = gr.UploadButton("Click to upload a file", file_types=["audio","video"], file_count="multiple")

title = gr.Markdown(
    """
    # Playground
    Upload your audio file here.
    """)

# NOTE(review): gr.Interface is normally constructed directly rather than used
# as a `with` context (that pattern belongs to gr.Blocks); this relies on
# Interface inheriting Blocks' context-manager behavior — confirm the upload
# wiring renders as intended.
with gr.Interface(fn=process_submit, inputs=[title, file_output, upload_button], outputs="file", allow_flagging="never") as demo:
    # When files are uploaded, mirror their paths into the gr.File component.
    upload_button.upload(upload_file, upload_button, file_output)

# Launches immediately on import; there is no `if __name__ == "__main__":` guard.
demo.launch()
|
| 33 |
+
|
| 34 |
+
|
deepgram_transcribe.py
ADDED
|
@@ -0,0 +1,129 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# this is a script that transcribes a downloaded youtube video using deepgram
|
| 2 |
+
# the audio should be cleaned with UVR5 first, so the file is flac
|
| 3 |
+
# it will upload the full length of interview or podcast to deepgram
|
| 4 |
+
# and will return the speaker id. The user must manually listen to the audio clips to find out which speaker is wanted
|
| 5 |
+
# discard remaining speakers and short length audio
|
| 6 |
+
#
|
| 7 |
+
|
| 8 |
+
import os
|
| 9 |
+
from dotenv import load_dotenv
|
| 10 |
+
from pydub import AudioSegment
|
| 11 |
+
import math
|
| 12 |
+
from os.path import join
|
| 13 |
+
import shutil
|
| 14 |
+
|
| 15 |
+
from deepgram import (
|
| 16 |
+
DeepgramClient,
|
| 17 |
+
PrerecordedOptions,
|
| 18 |
+
FileSource,
|
| 19 |
+
)
|
| 20 |
+
|
| 21 |
+
def process(audio_file, progress):
    """Transcribe *audio_file* with Deepgram, cut it into per-sentence clips
    grouped by diarized speaker, write a pipe-separated index, and zip it all.

    Parameters
    ----------
    audio_file : str | os.PathLike
        Path to the audio file (expected pre-cleaned, e.g. flac).
    progress :
        Gradio progress callback: callable with a float in [0, 1] (optional
        ``desc=``) and exposing ``.tqdm`` for iteration with progress.

    Returns
    -------
    str
        ``"output.zip"`` — the archive created in the current working directory.

    Raises
    ------
    Exception
        Re-raises any Deepgram transcription failure after printing it.
    """
    load_dotenv("myenv-variable.env")

    AUDIO_FILE = audio_file  # path of the audio to process
    TAGS = "vogue"           # youtube source, for categorization of output files

    API_KEY = os.getenv('API_DEEPGRAM')
    original_parent_folder = os.getcwd()
    print(original_parent_folder)

    # Per-speaker folders and the index file all live under ./output.
    speaker_folder = join(original_parent_folder, "output")
    os.makedirs(speaker_folder, exist_ok=True)  # race-free vs. isdir+mkdir

    deepgram = DeepgramClient(API_KEY)

    with open(AUDIO_FILE, "rb") as audio_fh:
        buffer_data = audio_fh.read()

    payload: FileSource = {
        "buffer": buffer_data,
    }

    # Deepgram options: diarize=True is required so clips can be grouped by speaker.
    options = PrerecordedOptions(
        model="nova-2",
        smart_format=True,
        filler_words=True,
        diarize=True
    )

    progress(0.20)

    try:
        response = deepgram.listen.prerecorded.v("1").transcribe_file(payload, options)
    except Exception as e:
        # BUGFIX: the original printed the error and fell through, so the
        # later use of `response` crashed with a NameError. Re-raise so the
        # caller (and the Gradio UI) sees the real failure.
        print(e)
        raise

    progress(0.30)

    audio = AudioSegment.from_file(AUDIO_FILE)
    data = response

    paragraphs = data['results']['channels'][0]['alternatives'][0]['paragraphs']['paragraphs']

    # Pipe-separated index rows; header row first.
    csv_data = [["filename", "speaker", "text", "start_time", "end_time", "duration"]]

    i = 1  # sentence counter across ALL paragraphs (used in clip file names)

    progress(0.40)

    for paragraph in progress.tqdm(paragraphs, desc="Generating..."):
        for text in paragraph['sentences']:
            # Sentence boundaries in ms with a 5 ms safety buffer on each side.
            # BUGFIX: clamp the start at 0 so a sentence beginning within the
            # first 5 ms cannot produce a negative slice index.
            start_time_ms = max(0, math.floor(text['start'] * 1000) - 5)
            end_time_ms = math.ceil(text['end'] * 1000) + 5
            duration_s = round(text['end'] - text['start'], 3)
            duration_ms = str(end_time_ms - start_time_ms).zfill(6)
            speaker_id = paragraph['speaker']

            folder_path = join(speaker_folder, "Speaker_" + str(speaker_id))
            os.makedirs(folder_path, exist_ok=True)

            # NOTE(review): speaker 10 is renamed AFTER its folder is created,
            # so those clips land in Speaker_10/ but are named
            # ..._Speaker_Tayr_... Behavior preserved — confirm intentional.
            if speaker_id == 10:
                speaker_id = "Tayr"

            # Slice the sentence out of the full audio (pydub works in ms).
            segment = audio[start_time_ms:end_time_ms]

            # Single canonical clip name, reused for the index row (prefixed
            # with "wavs/", presumably for downstream training tooling) and
            # for the actual export path — previously duplicated f-strings.
            clip_name = f"{TAGS}_Speaker_{speaker_id}_i{str(i).zfill(3)}_d{duration_ms}.wav"
            file_name = join("wavs", clip_name)
            export_path = join(folder_path, clip_name)
            segment.export(export_path, format="wav")

            # Record the clip in the index.
            csv_data.append([file_name, speaker_id, text['text'], start_time_ms, end_time_ms, duration_s])

            i += 1

    # Write the pipe-separated index next to the speaker folders.
    csv_filename = join(speaker_folder, f"{TAGS}_output.txt")
    with open(csv_filename, 'w', encoding='utf-8') as out_file:
        for row in csv_data:
            out_file.write('|'.join(str(item) for item in row) + '\n')

    progress(0.90)

    # Zip the whole output folder into ./output.zip for download.
    shutil.make_archive("output", 'zip', speaker_folder)
    print(f"Data written to {csv_filename}")

    progress(1.00)
    return "output.zip"
|
requirements.txt
ADDED
|
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
|
|
|
| 1 |
+
python-dotenv
|
| 2 |
+
deepgram-sdk==3.2.6
|