Upload folder using huggingface_hub
Browse files
app.py
CHANGED
|
@@ -42,8 +42,9 @@ def load_models():
|
|
| 42 |
|
| 43 |
model, vocoder = load_models()
|
| 44 |
|
|
|
|
| 45 |
@torch.inference_mode()
|
| 46 |
-
def process(text):
|
| 47 |
# 1. Kashmiri script normalization
|
| 48 |
text = text.replace("ي", "ی").replace("ك", "ک").strip()
|
| 49 |
|
|
@@ -53,12 +54,9 @@ def process(text):
|
|
| 53 |
x = torch.tensor(intersperse(sequence, 0), dtype=torch.long, device=DEVICE)[None]
|
| 54 |
x_lengths = torch.tensor([x.shape[-1]], dtype=torch.long, device=DEVICE)
|
| 55 |
|
| 56 |
-
# 3.
|
| 57 |
-
#
|
| 58 |
-
spks =
|
| 59 |
-
if model.n_spks > 1:
|
| 60 |
-
# Default to speaker ID 0; change this if you have multiple voices
|
| 61 |
-
spks = torch.tensor([0], device=DEVICE, dtype=torch.long)
|
| 62 |
|
| 63 |
# 4. Generate Mel-spectrogram
|
| 64 |
output = model.synthesise(
|
|
@@ -66,7 +64,7 @@ def process(text):
|
|
| 66 |
x_lengths,
|
| 67 |
n_timesteps=10,
|
| 68 |
temperature=0.667,
|
| 69 |
-
spks=spks,
|
| 70 |
length_scale=1.0
|
| 71 |
)
|
| 72 |
|
|
@@ -76,13 +74,16 @@ def process(text):
|
|
| 76 |
sf.write(output_path, audio, 22050)
|
| 77 |
return output_path
|
| 78 |
|
| 79 |
-
|
|
|
|
| 80 |
fn=process,
|
| 81 |
-
# Add a slider if model.n_spks > 1
|
| 82 |
inputs=[
|
| 83 |
-
gr.Textbox(label="Kashmiri Text"),
|
| 84 |
-
|
|
|
|
| 85 |
],
|
| 86 |
outputs=gr.Audio(label="Audio", type="filepath"),
|
| 87 |
title="GAASH-Lab: Kashmiri TTS"
|
| 88 |
-
)
|
|
|
|
|
|
|
|
|
| 42 |
|
| 43 |
model, vocoder = load_models()
|
| 44 |
|
| 45 |
+
# --- Update the function signature to accept two arguments ---
|
| 46 |
@torch.inference_mode()
|
| 47 |
+
def process(text, speaker_id):
|
| 48 |
# 1. Kashmiri script normalization
|
| 49 |
text = text.replace("ي", "ی").replace("ك", "ک").strip()
|
| 50 |
|
|
|
|
| 54 |
x = torch.tensor(intersperse(sequence, 0), dtype=torch.long, device=DEVICE)[None]
|
| 55 |
x_lengths = torch.tensor([x.shape[-1]], dtype=torch.long, device=DEVICE)
|
| 56 |
|
| 57 |
+
# 3. Use the Speaker ID from the interface
|
| 58 |
+
# The speaker tensor is now always built and passed as `spks` (previously it was only built when model.n_spks > 1)
|
| 59 |
+
spks = torch.tensor([int(speaker_id)], device=DEVICE, dtype=torch.long)
|
|
|
|
|
|
|
|
|
|
| 60 |
|
| 61 |
# 4. Generate Mel-spectrogram
|
| 62 |
output = model.synthesise(
|
|
|
|
| 64 |
x_lengths,
|
| 65 |
n_timesteps=10,
|
| 66 |
temperature=0.667,
|
| 67 |
+
spks=spks,
|
| 68 |
length_scale=1.0
|
| 69 |
)
|
| 70 |
|
|
|
|
| 74 |
sf.write(output_path, audio, 22050)
|
| 75 |
return output_path
|
| 76 |
|
| 77 |
+
# --- Update the Interface inputs to match (2 inputs) ---
|
| 78 |
+
demo = gr.Interface(
|
| 79 |
fn=process,
|
|
|
|
| 80 |
inputs=[
|
| 81 |
+
gr.Textbox(label="Kashmiri Text", placeholder="کٲشِر زَبانہِ مَنٛز لِکھِو..."),
|
| 82 |
+
# Slider for selecting the voice by speaker ID; the initial value is speaker 0
|
| 83 |
+
gr.Slider(0, model.n_spks - 1, step=1, value=0, label="Speaker ID")
|
| 84 |
],
|
| 85 |
outputs=gr.Audio(label="Audio", type="filepath"),
|
| 86 |
title="GAASH-Lab: Kashmiri TTS"
|
| 87 |
+
)
|
| 88 |
+
|
| 89 |
+
demo.launch()
|