mrrtmob committed
Commit
5f8fcc3
1 Parent(s): 31bd2cf

Refactor audio processing in app.py for improved tensor handling and update example inputs in the UI

Files changed (2)
  1. app.py +10 -15
  2. example/example1.wav +3 -0
app.py CHANGED
@@ -362,9 +362,8 @@ def generate_speech(text, temperature=0.6, top_p=0.95, repetition_penalty=1.1, m
 
 def tokenize_audio(audio_file_path, snac_model):
     audio_array, sample_rate = librosa.load(audio_file_path, sr=24000)
-    waveform = torch.from_numpy(audio_array).unsqueeze(0)
-    waveform = waveform.to(dtype=torch.float32)
-    waveform = waveform.unsqueeze(0)
+    waveform = torch.from_numpy(audio_array).unsqueeze(0).unsqueeze(0)
+    waveform = waveform.to(dtype=torch.float32, device=device)
     with torch.inference_mode():
         codes = snac_model.encode(waveform)
     all_codes = []
@@ -469,9 +468,9 @@ def redistribute_codes_zeroshot(code_list, snac_model):
         layer_3.append(code_list[7 * i + 5] - (5 * 4096))
         layer_3.append(code_list[7 * i + 6] - (6 * 4096))
     codes = [
-        torch.tensor(layer_1).unsqueeze(0),
-        torch.tensor(layer_2).unsqueeze(0),
-        torch.tensor(layer_3).unsqueeze(0)
+        torch.tensor(layer_1, device=device).unsqueeze(0),
+        torch.tensor(layer_2, device=device).unsqueeze(0),
+        torch.tensor(layer_3, device=device).unsqueeze(0)
     ]
     audio_hat = snac_model.decode(codes)
     return audio_hat
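Both hunks above are about getting the codec's inputs into the right shape and onto the right device. A minimal sketch of that flow, with illustrative function names (the real ones are tokenize_audio and redistribute_codes_zeroshot, which do more work) and assuming device is defined at module scope in app.py and that snac_model.encode expects a (batch, channels, samples) float tensor:

# Minimal sketch of the tensor handling after this commit, not the full app.py.
# Assumptions: `device` is defined at module scope (e.g. "cuda" or "cpu") and
# snac_model is a loaded SNAC codec whose encode() takes a float tensor shaped
# (batch, channels, samples) and whose decode() takes a list of code tensors
# living on the same device as the model weights.
import librosa
import torch

device = "cuda" if torch.cuda.is_available() else "cpu"

def encode_reference(audio_file_path, snac_model):
    # librosa returns a mono float32 array resampled to 24 kHz
    audio_array, _ = librosa.load(audio_file_path, sr=24000)
    # Chain both unsqueeze calls to build the (1, 1, T) layout in one step,
    # then cast and move to the codec's device in a single .to() call.
    waveform = torch.from_numpy(audio_array).unsqueeze(0).unsqueeze(0)
    waveform = waveform.to(dtype=torch.float32, device=device)
    with torch.inference_mode():
        return snac_model.encode(waveform)

def decode_layers(layer_1, layer_2, layer_3, snac_model):
    # The code tensors are created directly on `device` so they match the
    # codec weights; a CPU/GPU mismatch here would raise at decode time.
    codes = [
        torch.tensor(layer_1, device=device).unsqueeze(0),
        torch.tensor(layer_2, device=device).unsqueeze(0),
        torch.tensor(layer_3, device=device).unsqueeze(0),
    ]
    with torch.inference_mode():
        return snac_model.decode(codes)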
@@ -777,16 +776,12 @@ with gr.Blocks(title="Khmer Text-to-Speech", css=css, theme=gr.themes.Soft()) as
     )
 
     # Zero-shot examples
-    zs_examples = [
-        ["αž‡αŸ†αžšαžΆαž”αžŸαž½αžš αžαŸ’αž‰αž»αŸ†αžˆαŸ’αž˜αŸ„αŸ‡ αžŸαž»αžαžΆαŸ”", "αžŸαž½αžŸαŸ’αžαžΈ αž’αŸ’αž“αž€αžŸαž»αžαžŸαž”αŸ’αž”αžΆαž™αž‘αŸ?"],
-        ["αžαŸ’αž„αŸƒαž“αŸαŸ‡αž’αžΆαž€αžΆαžŸαž’αžΆαžαž»αž›αŸ’αž’αŸ”", "αžαŸ’αž‰αž»αŸ†αž…αž„αŸ‹αž‘αŸ…αž›αŸαž„αžŸαž½αž“αž…αŸ’αž”αžΆαžšαŸ”"],
-        ["αžαŸ’αž‰αž»αŸ†αž…αžΌαž›αž…αž·αžαŸ’αžαž‰αžΆαŸ†αž”αžΆαž™αŸ”", "αžαžΎαž’αŸ’αž“αž€αž…αžΌαž›αž…αž·αžαŸ’αžαž˜αŸ’αž αžΌαž”αž’αŸ’αžœαžΈ?"]
-    ]
-
     gr.Examples(
-        examples=zs_examples,
-        inputs=[ref_transcript, target_text_input],
-        label="πŸ“ Example Transcript & Target Text Pairs"
+        examples=[
+            ["example/example1.wav", "αž‚αžΊαž“αŸ…αž–αŸαž›αžŠαŸ‚αž›αž˜αžΆαž“αž˜αž“αž»αžŸαŸ’αžŸαžŸαŸ’αž›αžΆαž”αŸ‹ αž“αŸ…αž—αžΌαž˜αž·αž‚αžΆαžαŸ‹αž αŸ’αž“αžΉαž„αž²αŸ’αž™αžαŸ‚αžαžΆαž˜αžΆαž“αž˜αž“αž»αžŸαŸ’αžŸαžŸαŸ’αž›αžΆαž”αŸ‹ αž‚αžΊαž‚αŸαžαŸ‚αž„αžαŸ‚αž™αž€αžŸαžΆαž€αžŸαž–αž αŸ’αž“αžΉαž„", "αžŸαž½αžŸαŸ’αžαžΈ αž’αŸ’αž“αž€αžŸαž»αžαžŸαž”αŸ’αž”αžΆαž™αž‘αŸ?"]
+        ],
+        inputs=[ref_audio, ref_transcript, target_text_input],
+        label="πŸ“ Example"
     )
 
     # Zero-shot event handlers
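On the UI side, the text-only example pairs are replaced by a single row that also carries the bundled reference clip. In gr.Examples each row maps positionally onto the components listed in inputs, so adding ref_audio to that list is what lets the first column (a file path) populate the reference-audio field. A stripped-down sketch, assuming ref_audio is a gr.Audio component and the other two inputs are gr.Textbox (their real definitions live elsewhere in app.py):

import gradio as gr

with gr.Blocks() as demo:
    # Hypothetical component definitions; the actual labels/types are in app.py.
    ref_audio = gr.Audio(type="filepath", label="Reference audio")
    ref_transcript = gr.Textbox(label="Reference transcript")
    target_text_input = gr.Textbox(label="Target text")

    # Each example row has one entry per input component, in the same order.
    gr.Examples(
        examples=[
            ["example/example1.wav", "reference transcript goes here", "target text goes here"],
        ],
        inputs=[ref_audio, ref_transcript, target_text_input],
        label="πŸ“ Example",
    )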
example/example1.wav ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2f5b0884ffc6253de1490d0c09b89e268bfebb482a94ad6d60679f8c4c24c656
+size 282718