Refactor audio processing in app.py for improved tensor handling and update example inputs in the UI
- app.py +10 -15
- example/example1.wav +3 -0
app.py
CHANGED
@@ -362,9 +362,8 @@ def generate_speech(text, temperature=0.6, top_p=0.95, repetition_penalty=1.1, m
 
 def tokenize_audio(audio_file_path, snac_model):
     audio_array, sample_rate = librosa.load(audio_file_path, sr=24000)
-    waveform = torch.from_numpy(audio_array).unsqueeze(0)
-    waveform = waveform.to(dtype=torch.float32)
-    waveform = waveform.unsqueeze(0)
+    waveform = torch.from_numpy(audio_array).unsqueeze(0).unsqueeze(0)
+    waveform = waveform.to(dtype=torch.float32, device=device)
     with torch.inference_mode():
         codes = snac_model.encode(waveform)
     all_codes = []
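For reference, the refactored tokenize_audio builds a (batch=1, channels=1, samples) tensor and moves it to the model's device in one step, which is the input shape the SNAC encoder expects. Below is a minimal standalone sketch of that path; the `device` definition and the 24 kHz SNAC checkpoint name are assumptions for illustration, not taken from this commit (the Space loads its own snac_model elsewhere in app.py).

import librosa
import torch
from snac import SNAC

device = "cuda" if torch.cuda.is_available() else "cpu"  # assumed; app.py defines its own `device`
# Assumed checkpoint name for illustration only.
snac_model = SNAC.from_pretrained("hubertsiuzdak/snac_24khz").eval().to(device)

audio_array, sample_rate = librosa.load("example/example1.wav", sr=24000)   # 1-D float32 array at 24 kHz
waveform = torch.from_numpy(audio_array).unsqueeze(0).unsqueeze(0)          # shape (1, 1, num_samples)
waveform = waveform.to(dtype=torch.float32, device=device)

with torch.inference_mode():
    codes = snac_model.encode(waveform)  # list of codebook tensors, one per SNAC layer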
@@ -469,9 +468,9 @@ def redistribute_codes_zeroshot(code_list, snac_model):
         layer_3.append(code_list[7 * i + 5] - (5 * 4096))
         layer_3.append(code_list[7 * i + 6] - (6 * 4096))
     codes = [
-        torch.tensor(layer_1).unsqueeze(0),
-        torch.tensor(layer_2).unsqueeze(0),
-        torch.tensor(layer_3).unsqueeze(0)
+        torch.tensor(layer_1, device=device).unsqueeze(0),
+        torch.tensor(layer_2, device=device).unsqueeze(0),
+        torch.tensor(layer_3, device=device).unsqueeze(0)
     ]
     audio_hat = snac_model.decode(codes)
     return audio_hat
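The hunk above shows only the tail of redistribute_codes_zeroshot. The visible offsets (position 7*i+5 minus 5*4096, position 7*i+6 minus 6*4096) are consistent with a flattened stream of 7 codes per frame being split back across SNAC's three codebook layers, with k*4096 subtracted from the k-th position. The sketch below fills in the unseen loop under that assumption (the layer assignment of positions 0-4 is inferred, not taken from this commit). The point of the change itself is that torch.tensor(...) without device= creates CPU tensors, which would mismatch a CUDA-resident snac_model at decode time.

import torch

device = "cuda" if torch.cuda.is_available() else "cpu"  # assumed module-level variable, as implied by the diff

def redistribute_codes_zeroshot(code_list, snac_model):
    # Assumed frame layout: 1 code for layer 1, 2 for layer 2, 4 for layer 3,
    # with an offset of k * 4096 removed from the k-th position of each frame.
    layer_1, layer_2, layer_3 = [], [], []
    for i in range(len(code_list) // 7):
        layer_1.append(code_list[7 * i])
        layer_2.append(code_list[7 * i + 1] - (1 * 4096))
        layer_3.append(code_list[7 * i + 2] - (2 * 4096))
        layer_3.append(code_list[7 * i + 3] - (3 * 4096))
        layer_2.append(code_list[7 * i + 4] - (4 * 4096))
        layer_3.append(code_list[7 * i + 5] - (5 * 4096))
        layer_3.append(code_list[7 * i + 6] - (6 * 4096))
    codes = [
        torch.tensor(layer_1, device=device).unsqueeze(0),  # built directly on the model's device
        torch.tensor(layer_2, device=device).unsqueeze(0),
        torch.tensor(layer_3, device=device).unsqueeze(0),
    ]
    audio_hat = snac_model.decode(codes)  # reconstructed waveform tensor, shape (1, 1, samples)
    return audio_hat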
@@ -777,16 +776,12 @@ with gr.Blocks(title="Khmer Text-to-Speech", css=css, theme=gr.themes.Soft()) as
             )
 
             # Zero-shot examples
-            zs_examples = [
-                ["ααααΆααα½α αααα»αααααα αα»ααΆα", "αα½ααααΈ α’ααααα»ααααααΆααα?"],
-                ["αααααααα’αΆααΆαααΆαα»ααα’α", "αααα»ααααααααααα½αααααΆαα"],
-                ["αααα»αααΌααα·αααααΆαααΆαα", "ααΎα’αααααΌααα·αααααα αΌαα’αααΈ?"]
-            ]
-
             gr.Examples(
-                examples=
-
-
+                examples=[
+                    ["example/example1.wav", "ααΊααααααααααΆαααα»αααααααΆαα ααααΌαα·ααΆααα αααΉαα²ααααααΆααΆαααα»αααααααΆαα ααΊαααααααααααΆαααα αααΉα", "αα½ααααΈ α’ααααα»ααααααΆααα?"]
+                ],
+                inputs=[ref_audio, ref_transcript, target_text_input],
+                label="π Example"
             )
 
             # Zero-shot event handlers
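On the UI side, the commit drops the standalone zs_examples list and inlines a single three-column row directly into gr.Examples, using the newly committed example/example1.wav as the reference audio. Each column of an example row fills the corresponding component in inputs positionally. A minimal sketch of that wiring; the component names ref_audio, ref_transcript, and target_text_input come from the diff, while the component constructors, labels, and placeholder strings standing in for the Khmer text are assumptions.

import gradio as gr

with gr.Blocks() as demo:
    # Illustrative components; app.py defines its own with Khmer labels and styling.
    ref_audio = gr.Audio(label="Reference audio", type="filepath")
    ref_transcript = gr.Textbox(label="Reference transcript")
    target_text_input = gr.Textbox(label="Text to synthesize")

    gr.Examples(
        # One row per example; clicking a row populates `inputs` column by column.
        examples=[
            ["example/example1.wav", "<Khmer reference transcript>", "<Khmer target text>"],
        ],
        inputs=[ref_audio, ref_transcript, target_text_input],
        label="Example",
    )

demo.launch()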
example/example1.wav
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2f5b0884ffc6253de1490d0c09b89e268bfebb482a94ad6d60679f8c4c24c656
+size 282718