backtracking committed on
Commit
a0d48a5
·
verified ·
1 Parent(s): ebf9ad3

Upload tiny_tts/infer.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. tiny_tts/infer.py +14 -6
tiny_tts/infer.py CHANGED
@@ -56,10 +56,14 @@ def load_engine(checkpoint_path, device='cuda'):
56
 
57
  net_g.load_state_dict(new_state_dict, strict=False)
58
  net_g.eval()
 
 
 
 
59
  return net_g
60
 
61
 
62
- def synthesize(text, output_path, model, speaker="LJ", device='cuda'):
63
  print(f"Synthesizing: {text}")
64
 
65
  # Normalize text
@@ -93,12 +97,15 @@ def synthesize(text, output_path, model, speaker="LJ", device='cuda'):
93
  bert = torch.zeros(1024, len(phone_ids)).to(device).unsqueeze(0)
94
  ja_bert = torch.zeros(768, len(phone_ids)).to(device).unsqueeze(0)
95
 
 
 
 
96
  with torch.no_grad():
97
  audio, *_ = model.infer(
98
  x, x_lengths, sid, tone, language, bert, ja_bert,
99
  noise_scale=0.667,
100
  noise_scale_w=0.8,
101
- length_scale=1.0
102
  )
103
 
104
  audio = audio[0, 0].cpu().numpy()
@@ -124,8 +131,9 @@ def main():
124
  parser = argparse.ArgumentParser(description="TinyTTS — English Text-to-Speech Inference")
125
  parser.add_argument("--text", "-t", type=str, default="The weather is nice today, and I feel very relaxed.", help="Text to synthesize")
126
  parser.add_argument("--checkpoint", "-c", type=str, default=None, help="Path to checkpoint. Auto-downloads if not provided.")
127
- parser.add_argument("--output", "-o", type=str, default="english_test.wav", help="Output audio file path")
128
- parser.add_argument("--speaker", "-s", type=str, default="female", help="Speaker ID")
 
129
  parser.add_argument("--device", type=str, default="cuda", help="Device to use (cuda or cpu)")
130
 
131
  args = parser.parse_args()
@@ -174,10 +182,10 @@ def main():
174
  print(f"Synthesizing for all {len(SPK2ID)} speakers...")
175
  for spk in SPK2ID.keys():
176
  final_output = os.path.join(out_dir, f"{name}_step{step_str}_spk{spk}{ext}")
177
- synthesize(args.text, final_output, model, speaker=spk, device=args.device)
178
  else:
179
  final_output = os.path.join(out_dir, f"{name}_step{step_str}_spk{args.speaker}{ext}")
180
- synthesize(args.text, final_output, model, speaker=args.speaker, device=args.device)
181
 
182
  if __name__ == "__main__":
183
  main()
 
56
 
57
  net_g.load_state_dict(new_state_dict, strict=False)
58
  net_g.eval()
59
+
60
+ # Fold weight_norm into weight tensors for faster inference (~18% speedup)
61
+ net_g.dec.remove_weight_norm()
62
+
63
  return net_g
64
 
65
 
66
+ def synthesize(text, output_path, model, speaker="MALE", device='cuda', speed=1.0):
67
  print(f"Synthesizing: {text}")
68
 
69
  # Normalize text
 
97
  bert = torch.zeros(1024, len(phone_ids)).to(device).unsqueeze(0)
98
  ja_bert = torch.zeros(768, len(phone_ids)).to(device).unsqueeze(0)
99
 
100
+ # speed > 1.0 = faster speech, < 1.0 = slower speech
101
+ length_scale = 1.0 / speed
102
+
103
  with torch.no_grad():
104
  audio, *_ = model.infer(
105
  x, x_lengths, sid, tone, language, bert, ja_bert,
106
  noise_scale=0.667,
107
  noise_scale_w=0.8,
108
+ length_scale=length_scale
109
  )
110
 
111
  audio = audio[0, 0].cpu().numpy()
 
131
  parser = argparse.ArgumentParser(description="TinyTTS — English Text-to-Speech Inference")
132
  parser.add_argument("--text", "-t", type=str, default="The weather is nice today, and I feel very relaxed.", help="Text to synthesize")
133
  parser.add_argument("--checkpoint", "-c", type=str, default=None, help="Path to checkpoint. Auto-downloads if not provided.")
134
+ parser.add_argument("--output", "-o", type=str, default="output.wav", help="Output audio file path")
135
+ parser.add_argument("--speaker", "-s", type=str, default="MALE", help="Speaker ID")
136
+ parser.add_argument("--speed", type=float, default=1.0, help="Speech speed (1.0=normal, 1.5=faster, 0.7=slower)")
137
  parser.add_argument("--device", type=str, default="cuda", help="Device to use (cuda or cpu)")
138
 
139
  args = parser.parse_args()
 
182
  print(f"Synthesizing for all {len(SPK2ID)} speakers...")
183
  for spk in SPK2ID.keys():
184
  final_output = os.path.join(out_dir, f"{name}_step{step_str}_spk{spk}{ext}")
185
+ synthesize(args.text, final_output, model, speaker=spk, device=args.device, speed=args.speed)
186
  else:
187
  final_output = os.path.join(out_dir, f"{name}_step{step_str}_spk{args.speaker}{ext}")
188
+ synthesize(args.text, final_output, model, speaker=args.speaker, device=args.device, speed=args.speed)
189
 
190
  if __name__ == "__main__":
191
  main()