Commit
·
6736ecf
1
Parent(s):
afeb57e
get rid of cutoff time
Browse files
app.py
CHANGED
|
@@ -1530,6 +1530,11 @@ def run(seed, stop_repetition, sample_batch_size, left_margin, right_margin, cod
|
|
| 1530 |
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
|
| 1531 |
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
|
| 1532 |
os.environ["USER"] = "USER"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1533 |
# take a look at demo/temp/mfa_alignment, decide which part of the audio to use as prompt
|
| 1534 |
cut_off_sec = cutoff_value # NOTE: according to the forced-alignment file, the word "common" stops at 3.01 sec; this should be different for different audio
|
| 1535 |
target_transcript = transcribed_text + target_transcript
|
|
@@ -1603,15 +1608,13 @@ def run_joint(input_audio_fn, seed, stop_repetition, sample_batch_size, left_mar
|
|
| 1603 |
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
|
| 1604 |
os.environ["USER"] = "USER"
|
| 1605 |
# take a look at demo/temp/mfa_alignment, decide which part of the audio to use as prompt
|
| 1606 |
-
|
| 1607 |
|
| 1608 |
target_transcript = transcribed_text + ' ' + target_transcript
|
| 1609 |
print(target_transcript)
|
| 1610 |
info = torchaudio.info(audio_fn)
|
| 1611 |
audio_dur = info.num_frames / info.sample_rate
|
| 1612 |
|
| 1613 |
-
cut_off_sec = audio_dur - 0.1
|
| 1614 |
-
|
| 1615 |
assert cut_off_sec < audio_dur, f"cut_off_sec {cut_off_sec} is larger than the audio duration {audio_dur}"
|
| 1616 |
prompt_end_frame = int(cut_off_sec * info.sample_rate)
|
| 1617 |
|
|
|
|
| 1530 |
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
|
| 1531 |
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
|
| 1532 |
os.environ["USER"] = "USER"
|
| 1533 |
+
|
| 1534 |
+
print("Transcribing the input audio")
|
| 1535 |
+
transcribe_btn_click(input_audio_fn)
|
| 1536 |
+
print("Transcription complete")
|
| 1537 |
+
|
| 1538 |
# take a look at demo/temp/mfa_alignment, decide which part of the audio to use as prompt
|
| 1539 |
cut_off_sec = cutoff_value # NOTE: according to the forced-alignment file, the word "common" stops at 3.01 sec; this should be different for different audio
|
| 1540 |
target_transcript = transcribed_text + target_transcript
|
|
|
|
| 1608 |
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
|
| 1609 |
os.environ["USER"] = "USER"
|
| 1610 |
# take a look at demo/temp/mfa_alignment, decide which part of the audio to use as prompt
|
| 1611 |
+
cut_off_sec = cutoff_value # NOTE: according to the forced-alignment file, the word "common" stops at 3.01 sec; this should be different for different audio
|
| 1612 |
|
| 1613 |
target_transcript = transcribed_text + ' ' + target_transcript
|
| 1614 |
print(target_transcript)
|
| 1615 |
info = torchaudio.info(audio_fn)
|
| 1616 |
audio_dur = info.num_frames / info.sample_rate
|
| 1617 |
|
|
|
|
|
|
|
| 1618 |
assert cut_off_sec < audio_dur, f"cut_off_sec {cut_off_sec} is larger than the audio duration {audio_dur}"
|
| 1619 |
prompt_end_frame = int(cut_off_sec * info.sample_rate)
|
| 1620 |
|