BurhaanZargar committed on
Commit
bf328a0
·
verified ·
1 Parent(s): ea4e599

Upload folder using huggingface_hub

Browse files
Files changed (1) hide show
  1. app.py +14 -13
app.py CHANGED
@@ -42,8 +42,9 @@ def load_models():
42
 
43
  model, vocoder = load_models()
44
 
 
45
  @torch.inference_mode()
46
- def process(text):
47
  # 1. Kashmiri script normalization
48
  text = text.replace("ي", "ی").replace("ك", "ک").strip()
49
 
@@ -53,12 +54,9 @@ def process(text):
53
  x = torch.tensor(intersperse(sequence, 0), dtype=torch.long, device=DEVICE)[None]
54
  x_lengths = torch.tensor([x.shape[-1]], dtype=torch.long, device=DEVICE)
55
 
56
- # 3. Handle Speaker ID for Multi-speaker Models
57
- # Check if the model expects speaker embeddings
58
- spks = None
59
- if model.n_spks > 1:
60
- # Default to speaker ID 0; change this if you have multiple voices
61
- spks = torch.tensor([0], device=DEVICE, dtype=torch.long)
62
 
63
  # 4. Generate Mel-spectrogram
64
  output = model.synthesise(
@@ -66,7 +64,7 @@ def process(text):
66
  x_lengths,
67
  n_timesteps=10,
68
  temperature=0.667,
69
- spks=spks, # Pass the speaker tensor here
70
  length_scale=1.0
71
  )
72
 
@@ -76,13 +74,16 @@ def process(text):
76
  sf.write(output_path, audio, 22050)
77
  return output_path
78
 
79
- gr.Interface(
 
80
  fn=process,
81
- # Add a slider if model.n_spks > 1
82
  inputs=[
83
- gr.Textbox(label="Kashmiri Text"),
84
- gr.Slider(0, model.n_spks - 1, step=1, label="Speaker ID") if model.n_spks > 1 else gr.Number(visible=False)
 
85
  ],
86
  outputs=gr.Audio(label="Audio", type="filepath"),
87
  title="GAASH-Lab: Kashmiri TTS"
88
- ).launch()
 
 
 
42
 
43
  model, vocoder = load_models()
44
 
45
+ # --- Update the function signature to accept two arguments ---
46
  @torch.inference_mode()
47
+ def process(text, speaker_id):
48
  # 1. Kashmiri script normalization
49
  text = text.replace("ي", "ی").replace("ك", "ک").strip()
50
 
 
54
  x = torch.tensor(intersperse(sequence, 0), dtype=torch.long, device=DEVICE)[None]
55
  x_lengths = torch.tensor([x.shape[-1]], dtype=torch.long, device=DEVICE)
56
 
57
+ # 3. Use the Speaker ID from the interface
58
+ # Even if you only use one voice, the model requires this tensor
59
+ spks = torch.tensor([int(speaker_id)], device=DEVICE, dtype=torch.long)
 
 
 
60
 
61
  # 4. Generate Mel-spectrogram
62
  output = model.synthesise(
 
64
  x_lengths,
65
  n_timesteps=10,
66
  temperature=0.667,
67
+ spks=spks,
68
  length_scale=1.0
69
  )
70
 
 
74
  sf.write(output_path, audio, 22050)
75
  return output_path
76
 
77
+ # --- Update the Interface inputs to match (2 inputs) ---
78
+ demo = gr.Interface(
79
  fn=process,
 
80
  inputs=[
81
+ gr.Textbox(label="Kashmiri Text", placeholder="کٲشِر زَبانہِ مَنٛز لِکھِو..."),
82
+ # Added a slider so you can select the voice (0 is usually the default)
83
+ gr.Slider(0, model.n_spks - 1, step=1, value=0, label="Speaker ID")
84
  ],
85
  outputs=gr.Audio(label="Audio", type="filepath"),
86
  title="GAASH-Lab: Kashmiri TTS"
87
+ )
88
+
89
+ demo.launch()