BurhaanZargar committed on
Commit
bf328a0
·
verified ·
1 Parent(s): ea4e599

Upload folder using huggingface_hub

Browse files
Files changed (1) hide show
  1. app.py +14 -13
app.py CHANGED
@@ -42,8 +42,9 @@ def load_models():
42
 
43
  model, vocoder = load_models()
44
 
 
45
  @torch.inference_mode()
46
- def process(text):
47
  # 1. Kashmiri script normalization
48
  text = text.replace("ي", "ی").replace("ك", "ک").strip()
49
 
@@ -53,12 +54,9 @@ def process(text):
53
  x = torch.tensor(intersperse(sequence, 0), dtype=torch.long, device=DEVICE)[None]
54
  x_lengths = torch.tensor([x.shape[-1]], dtype=torch.long, device=DEVICE)
55
 
56
- # 3. Handle Speaker ID for Multi-speaker Models
57
- # Check if the model expects speaker embeddings
58
- spks = None
59
- if model.n_spks > 1:
60
- # Default to speaker ID 0; change this if you have multiple voices
61
- spks = torch.tensor([0], device=DEVICE, dtype=torch.long)
62
 
63
  # 4. Generate Mel-spectrogram
64
  output = model.synthesise(
@@ -66,7 +64,7 @@ def process(text):
66
  x_lengths,
67
  n_timesteps=10,
68
  temperature=0.667,
69
- spks=spks, # Pass the speaker tensor here
70
  length_scale=1.0
71
  )
72
 
@@ -76,13 +74,16 @@ def process(text):
76
  sf.write(output_path, audio, 22050)
77
  return output_path
78
 
79
- gr.Interface(
 
80
  fn=process,
81
- # Add a slider if model.n_spks > 1
82
  inputs=[
83
- gr.Textbox(label="Kashmiri Text"),
84
- gr.Slider(0, model.n_spks - 1, step=1, label="Speaker ID") if model.n_spks > 1 else gr.Number(visible=False)
 
85
  ],
86
  outputs=gr.Audio(label="Audio", type="filepath"),
87
  title="GAASH-Lab: Kashmiri TTS"
88
- ).launch()
 
 
 
42
 
43
  model, vocoder = load_models()
44
 
45
+ # --- Update the function signature to accept two arguments ---
46
  @torch.inference_mode()
47
+ def process(text, speaker_id):
48
  # 1. Kashmiri script normalization
49
  text = text.replace("ي", "ی").replace("ك", "ک").strip()
50
 
 
54
  x = torch.tensor(intersperse(sequence, 0), dtype=torch.long, device=DEVICE)[None]
55
  x_lengths = torch.tensor([x.shape[-1]], dtype=torch.long, device=DEVICE)
56
 
57
+ # 3. Use the Speaker ID from the interface
58
+ # Even if you only use one voice, the model requires this tensor
59
+ spks = torch.tensor([int(speaker_id)], device=DEVICE, dtype=torch.long)
 
 
 
60
 
61
  # 4. Generate Mel-spectrogram
62
  output = model.synthesise(
 
64
  x_lengths,
65
  n_timesteps=10,
66
  temperature=0.667,
67
+ spks=spks,
68
  length_scale=1.0
69
  )
70
 
 
74
  sf.write(output_path, audio, 22050)
75
  return output_path
76
 
77
+ # --- Update the Interface inputs to match (2 inputs) ---
78
+ demo = gr.Interface(
79
  fn=process,
 
80
  inputs=[
81
+ gr.Textbox(label="Kashmiri Text", placeholder="کٲشِر زَبانہِ مَنٛز لِکھِو..."),
82
+ # Added a slider so you can select the voice (0 is usually the default)
83
+ gr.Slider(0, model.n_spks - 1, step=1, value=0, label="Speaker ID")
84
  ],
85
  outputs=gr.Audio(label="Audio", type="filepath"),
86
  title="GAASH-Lab: Kashmiri TTS"
87
+ )
88
+
89
+ demo.launch()