Spaces:

GAASH-Lab
/

Matcha-TTS-Kashmiri-Demo

Running

BurhaanZargar committed on Jan 28

Commit

ea4e599

verified ·

1 Parent(s): 592300c

Upload folder using huggingface_hub

Files changed (1) hide show

app.py CHANGED Viewed

@@ -47,27 +47,30 @@ def process(text):
     # 1. Kashmiri script normalization
     text = text.replace("ي", "ی").replace("ك", "ک").strip()
-    # 2. Convert text to sequence using the correct cleaner
-    # We use 'basic_cleaners' here because the model was trained to
-    # map Kashmiri characters directly to audio features.
     cleaner = "basic_cleaners"
-    x = torch.tensor(
-        intersperse(text_to_sequence(text, [cleaner])[0], 0),
-        dtype=torch.long,
-        device=DEVICE,
-    )[None]
     x_lengths = torch.tensor([x.shape[-1]], dtype=torch.long, device=DEVICE)
-    # 3. Generate Mel-spectrogram
     output = model.synthesise(
         x,
         x_lengths,
         n_timesteps=10,
         temperature=0.667,
         length_scale=1.0
     )
-    # 4. Generate Audio Waveform
     audio = vocoder(output['mel']).clamp(-1, 1).cpu().squeeze().numpy()
     output_path = "out.wav"
     sf.write(output_path, audio, 22050)
@@ -75,7 +78,11 @@ def process(text):
 gr.Interface(
     fn=process,
-    inputs=gr.Textbox(label="Kashmiri Text"),
     outputs=gr.Audio(label="Audio", type="filepath"),
     title="GAASH-Lab: Kashmiri TTS"
 ).launch()

     # 1. Kashmiri script normalization
     text = text.replace("ي", "ی").replace("ك", "ک").strip()
+    # 2. Text to Sequence
     cleaner = "basic_cleaners"
+    sequence, _ = text_to_sequence(text, [cleaner])
+    x = torch.tensor(intersperse(sequence, 0), dtype=torch.long, device=DEVICE)[None]
     x_lengths = torch.tensor([x.shape[-1]], dtype=torch.long, device=DEVICE)
+    # 3. Handle Speaker ID for Multi-speaker Models
+    # Check if the model expects speaker embeddings
+    spks = None
+    if model.n_spks > 1:
+        # Default to speaker ID 0; change this if you have multiple voices
+        spks = torch.tensor([0], device=DEVICE, dtype=torch.long)
+    # 4. Generate Mel-spectrogram
     output = model.synthesise(
         x,
         x_lengths,
         n_timesteps=10,
         temperature=0.667,
+        spks=spks,           # Pass the speaker tensor here
         length_scale=1.0
     )
+    # 5. Generate Waveform
     audio = vocoder(output['mel']).clamp(-1, 1).cpu().squeeze().numpy()
     output_path = "out.wav"
     sf.write(output_path, audio, 22050)
 gr.Interface(
     fn=process,
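+    # Show a speaker-ID slider only when the model has more than one speaker (hidden Number otherwise).
+    # NOTE(review): the hunk header still shows `def process(text)` — confirm it accepts this second input.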
+    inputs=[
+        gr.Textbox(label="Kashmiri Text"),
+        gr.Slider(0, model.n_spks - 1, step=1, label="Speaker ID") if model.n_spks > 1 else gr.Number(visible=False)
+    ],
     outputs=gr.Audio(label="Audio", type="filepath"),
     title="GAASH-Lab: Kashmiri TTS"
 ).launch()