BurhaanZargar committed on
Commit
ea4e599
·
verified ·
1 Parent(s): 592300c

Upload folder using huggingface_hub

Browse files
Files changed (1) hide show
  1. app.py +18 -11
app.py CHANGED
@@ -47,27 +47,30 @@ def process(text):
47
  # 1. Kashmiri script normalization
48
  text = text.replace("ي", "ی").replace("ك", "ک").strip()
49
 
50
- # 2. Convert text to sequence using the correct cleaner
51
- # We use 'basic_cleaners' here because the model was trained to
52
- # map Kashmiri characters directly to audio features.
53
  cleaner = "basic_cleaners"
54
- x = torch.tensor(
55
- intersperse(text_to_sequence(text, [cleaner])[0], 0),
56
- dtype=torch.long,
57
- device=DEVICE,
58
- )[None]
59
  x_lengths = torch.tensor([x.shape[-1]], dtype=torch.long, device=DEVICE)
60
 
61
- # 3. Generate Mel-spectrogram
 
 
 
 
 
 
 
62
  output = model.synthesise(
63
  x,
64
  x_lengths,
65
  n_timesteps=10,
66
  temperature=0.667,
 
67
  length_scale=1.0
68
  )
69
 
70
- # 4. Generate Audio Waveform
71
  audio = vocoder(output['mel']).clamp(-1, 1).cpu().squeeze().numpy()
72
  output_path = "out.wav"
73
  sf.write(output_path, audio, 22050)
@@ -75,7 +78,11 @@ def process(text):
75
 
76
  gr.Interface(
77
  fn=process,
78
- inputs=gr.Textbox(label="Kashmiri Text"),
 
 
 
 
79
  outputs=gr.Audio(label="Audio", type="filepath"),
80
  title="GAASH-Lab: Kashmiri TTS"
81
  ).launch()
 
47
  # 1. Kashmiri script normalization
48
  text = text.replace("ي", "ی").replace("ك", "ک").strip()
49
 
50
+ # 2. Text to Sequence
 
 
51
  cleaner = "basic_cleaners"
52
+ sequence, _ = text_to_sequence(text, [cleaner])
53
+ x = torch.tensor(intersperse(sequence, 0), dtype=torch.long, device=DEVICE)[None]
 
 
 
54
  x_lengths = torch.tensor([x.shape[-1]], dtype=torch.long, device=DEVICE)
55
 
56
+ # 3. Handle Speaker ID for Multi-speaker Models
57
+ # Check if the model expects speaker embeddings
58
+ spks = None
59
+ if model.n_spks > 1:
60
+ # Default to speaker ID 0; change this if you have multiple voices
61
+ spks = torch.tensor([0], device=DEVICE, dtype=torch.long)
62
+
63
+ # 4. Generate Mel-spectrogram
64
  output = model.synthesise(
65
  x,
66
  x_lengths,
67
  n_timesteps=10,
68
  temperature=0.667,
69
+ spks=spks, # Pass the speaker tensor here
70
  length_scale=1.0
71
  )
72
 
73
+ # 5. Generate Waveform
74
  audio = vocoder(output['mel']).clamp(-1, 1).cpu().squeeze().numpy()
75
  output_path = "out.wav"
76
  sf.write(output_path, audio, 22050)
 
78
 
79
  gr.Interface(
80
  fn=process,
81
+ # Add a slider if model.n_spks > 1
82
+ inputs=[
83
+ gr.Textbox(label="Kashmiri Text"),
84
+ gr.Slider(0, model.n_spks - 1, step=1, label="Speaker ID") if model.n_spks > 1 else gr.Number(visible=False)
85
+ ],
86
  outputs=gr.Audio(label="Audio", type="filepath"),
87
  title="GAASH-Lab: Kashmiri TTS"
88
  ).launch()