BurhaanZargar committed on
Commit
067912c
·
verified ·
1 Parent(s): a6bef00

Upload folder using huggingface_hub

Browse files
Files changed (1) hide show
  1. app.py +34 -5
app.py CHANGED
@@ -10,6 +10,8 @@ from matcha.models.matcha_tts import MatchaTTS
10
  from matcha.hifigan.models import Generator as HiFiGAN
11
  from matcha.hifigan.config import v1
12
  from matcha.hifigan.env import AttrDict
 
 
13
 
14
  HF_TOKEN = os.getenv("HF_TOKEN")
15
 
@@ -42,11 +44,38 @@ model, vocoder = load_models()
42
 
43
  @torch.inference_mode()
44
  def process(text):
45
- # Basic normalization for Kashmiri script
46
  text = text.replace("ي", "ی").replace("ك", "ک").strip()
47
- output = model.synthesise(text, n_timesteps=10, temperature=0.667, length_scale=1.0)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
48
  audio = vocoder(output['mel']).clamp(-1, 1).cpu().squeeze().numpy()
49
- sf.write("out.wav", audio, 22050)
50
- return "out.wav"
 
51
 
52
- gr.Interface(fn=process, inputs="text", outputs="audio", title="Kashmiri TTS").launch()
 
 
 
 
 
 
10
  from matcha.hifigan.models import Generator as HiFiGAN
11
  from matcha.hifigan.config import v1
12
  from matcha.hifigan.env import AttrDict
13
+ from matcha.text import text_to_sequence
14
+ from matcha.utils.utils import intersperse
15
 
16
  HF_TOKEN = os.getenv("HF_TOKEN")
17
 
 
44
 
@torch.inference_mode()
def process(text):
    """Synthesise speech for Kashmiri ``text`` and return the WAV file path.

    Steps: normalise the script, encode the text as a tensor of symbol IDs,
    run the acoustic model to obtain a mel-spectrogram, run the vocoder to
    obtain a waveform, and write that waveform to ``out.wav``.

    Args:
        text: Input string from the UI. ``None`` is treated as empty.

    Returns:
        The string ``"out.wav"``, i.e. the file the audio was written to.

    Raises:
        gr.Error: If nothing is left to synthesise after normalisation.
    """
    # 1. Kashmiri script normalization: swap the two look-alike letter
    #    variants for the forms on the right-hand side, then trim whitespace.
    text = (text or "").replace("ي", "ی").replace("ك", "ک").strip()

    # Refuse empty input up front rather than asking the model to synthesise
    # from no symbols; gr.Error surfaces the message to the user in the UI.
    if not text:
        raise gr.Error("Please enter some Kashmiri text to synthesise.")

    # 2. Convert text to a symbol-ID sequence. 'basic_cleaners' is used
    #    because, per the original author's note, the model was trained to
    #    map Kashmiri characters directly to audio features.
    cleaner = "basic_cleaners"
    symbol_ids = intersperse(text_to_sequence(text, [cleaner])[0], 0)
    # [None] adds a leading dimension of size 1 (a batch of one utterance).
    x = torch.tensor(symbol_ids, dtype=torch.long, device=DEVICE)[None]
    x_lengths = torch.tensor([x.shape[-1]], dtype=torch.long, device=DEVICE)

    # 3. Generate the mel-spectrogram.
    output = model.synthesise(
        x,
        x_lengths,
        n_timesteps=10,
        temperature=0.667,
        length_scale=1.0,
    )

    # 4. Generate the audio waveform: clamp to [-1, 1], move to the CPU,
    #    drop size-1 dimensions and convert to a NumPy array.
    audio = vocoder(output["mel"]).clamp(-1, 1).cpu().squeeze().numpy()

    # NOTE: this fixed path is overwritten on every call.
    output_path = "out.wav"
    # 22050: presumably the sample rate in Hz (sf looks like soundfile) - confirm.
    sf.write(output_path, audio, 22050)
    return output_path
75
 
76
# Build the web UI: one text box in, one audio player out (fed by the file
# path that `process` returns), then start serving it.
demo = gr.Interface(
    fn=process,
    inputs=gr.Textbox(label="Kashmiri Text"),
    outputs=gr.Audio(label="Audio", type="filepath"),
    title="GAASH-Lab: Kashmiri TTS",
)
demo.launch()