3417543_models

Paused

ajayarora1235 committed on Apr 3, 2024

Commit

6736ecf

1 Parent(s): afeb57e

get rid of cutoff time

Files changed (1) hide show

app.py CHANGED Viewed

@@ -1530,6 +1530,11 @@ def run(seed, stop_repetition, sample_batch_size, left_margin, right_margin, cod
     os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
     os.environ["CUDA_VISIBLE_DEVICES"] = "0"
     os.environ["USER"] = "USER"
     # take a look at demo/temp/mfa_alignment, decide which part of the audio to use as prompt
     cut_off_sec = cutoff_value  # NOTE: according to forced-alignment file, the word "common" stop as 3.01 sec, this should be different for different audio
     target_transcript = transcribed_text + target_transcript
@@ -1603,15 +1608,13 @@ def run_joint(input_audio_fn, seed, stop_repetition, sample_batch_size, left_mar
     os.environ["CUDA_VISIBLE_DEVICES"] = "0"
     os.environ["USER"] = "USER"
     # take a look at demo/temp/mfa_alignment, decide which part of the audio to use as prompt
-    # cut_off_sec = cutoff_value  # NOTE: according to forced-alignment file, the word "common" stop as 3.01 sec, this should be different for different audio
     target_transcript = transcribed_text + ' ' + target_transcript
     print(target_transcript)
     info = torchaudio.info(audio_fn)
     audio_dur = info.num_frames / info.sample_rate
-    cut_off_sec = audio_dur - 0.1
     assert cut_off_sec < audio_dur, f"cut_off_sec {cut_off_sec} is larger than the audio duration {audio_dur}"
     prompt_end_frame = int(cut_off_sec * info.sample_rate)

     os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
     os.environ["CUDA_VISIBLE_DEVICES"] = "0"
     os.environ["USER"] = "USER"
+    print("Transcribing the input audio")
+    transcribe_btn_click(input_audio_fn)
+    print("Transcription complete")
     # take a look at demo/temp/mfa_alignment, decide which part of the audio to use as prompt
     cut_off_sec = cutoff_value  # NOTE: according to forced-alignment file, the word "common" stop as 3.01 sec, this should be different for different audio
     target_transcript = transcribed_text + target_transcript
     os.environ["CUDA_VISIBLE_DEVICES"] = "0"
     os.environ["USER"] = "USER"
     # take a look at demo/temp/mfa_alignment, decide which part of the audio to use as prompt
+    cut_off_sec = cutoff_value  # NOTE: according to forced-alignment file, the word "common" stop as 3.01 sec, this should be different for different audio
     target_transcript = transcribed_text + ' ' + target_transcript
     print(target_transcript)
     info = torchaudio.info(audio_fn)
     audio_dur = info.num_frames / info.sample_rate
     assert cut_off_sec < audio_dur, f"cut_off_sec {cut_off_sec} is larger than the audio duration {audio_dur}"
     prompt_end_frame = int(cut_off_sec * info.sample_rate)