Spaces:
Running
Running
Upload tiny_tts/infer.py with huggingface_hub
Browse files- tiny_tts/infer.py +14 -6
tiny_tts/infer.py
CHANGED
|
@@ -56,10 +56,14 @@ def load_engine(checkpoint_path, device='cuda'):
|
|
| 56 |
|
| 57 |
net_g.load_state_dict(new_state_dict, strict=False)
|
| 58 |
net_g.eval()
|
|
|
|
|
|
|
|
|
|
|
|
|
| 59 |
return net_g
|
| 60 |
|
| 61 |
|
| 62 |
-
def synthesize(text, output_path, model, speaker="
|
| 63 |
print(f"Synthesizing: {text}")
|
| 64 |
|
| 65 |
# Normalize text
|
|
@@ -93,12 +97,15 @@ def synthesize(text, output_path, model, speaker="LJ", device='cuda'):
|
|
| 93 |
bert = torch.zeros(1024, len(phone_ids)).to(device).unsqueeze(0)
|
| 94 |
ja_bert = torch.zeros(768, len(phone_ids)).to(device).unsqueeze(0)
|
| 95 |
|
|
|
|
|
|
|
|
|
|
| 96 |
with torch.no_grad():
|
| 97 |
audio, *_ = model.infer(
|
| 98 |
x, x_lengths, sid, tone, language, bert, ja_bert,
|
| 99 |
noise_scale=0.667,
|
| 100 |
noise_scale_w=0.8,
|
| 101 |
-
length_scale=
|
| 102 |
)
|
| 103 |
|
| 104 |
audio = audio[0, 0].cpu().numpy()
|
|
@@ -124,8 +131,9 @@ def main():
|
|
| 124 |
parser = argparse.ArgumentParser(description="TinyTTS — English Text-to-Speech Inference")
|
| 125 |
parser.add_argument("--text", "-t", type=str, default="The weather is nice today, and I feel very relaxed.", help="Text to synthesize")
|
| 126 |
parser.add_argument("--checkpoint", "-c", type=str, default=None, help="Path to checkpoint. Auto-downloads if not provided.")
|
| 127 |
-
parser.add_argument("--output", "-o", type=str, default="
|
| 128 |
-
parser.add_argument("--speaker", "-s", type=str, default="
|
|
|
|
| 129 |
parser.add_argument("--device", type=str, default="cuda", help="Device to use (cuda or cpu)")
|
| 130 |
|
| 131 |
args = parser.parse_args()
|
|
@@ -174,10 +182,10 @@ def main():
|
|
| 174 |
print(f"Synthesizing for all {len(SPK2ID)} speakers...")
|
| 175 |
for spk in SPK2ID.keys():
|
| 176 |
final_output = os.path.join(out_dir, f"{name}_step{step_str}_spk{spk}{ext}")
|
| 177 |
-
synthesize(args.text, final_output, model, speaker=spk, device=args.device)
|
| 178 |
else:
|
| 179 |
final_output = os.path.join(out_dir, f"{name}_step{step_str}_spk{args.speaker}{ext}")
|
| 180 |
-
synthesize(args.text, final_output, model, speaker=args.speaker, device=args.device)
|
| 181 |
|
| 182 |
if __name__ == "__main__":
|
| 183 |
main()
|
|
|
|
| 56 |
|
| 57 |
net_g.load_state_dict(new_state_dict, strict=False)
|
| 58 |
net_g.eval()
|
| 59 |
+
|
| 60 |
+
# Fold weight_norm into weight tensors for faster inference (~18% speedup)
|
| 61 |
+
net_g.dec.remove_weight_norm()
|
| 62 |
+
|
| 63 |
return net_g
|
| 64 |
|
| 65 |
|
| 66 |
+
def synthesize(text, output_path, model, speaker="MALE", device='cuda', speed=1.0):
|
| 67 |
print(f"Synthesizing: {text}")
|
| 68 |
|
| 69 |
# Normalize text
|
|
|
|
| 97 |
bert = torch.zeros(1024, len(phone_ids)).to(device).unsqueeze(0)
|
| 98 |
ja_bert = torch.zeros(768, len(phone_ids)).to(device).unsqueeze(0)
|
| 99 |
|
| 100 |
+
# speed > 1.0 = faster speech, < 1.0 = slower speech
|
| 101 |
+
length_scale = 1.0 / speed
|
| 102 |
+
|
| 103 |
with torch.no_grad():
|
| 104 |
audio, *_ = model.infer(
|
| 105 |
x, x_lengths, sid, tone, language, bert, ja_bert,
|
| 106 |
noise_scale=0.667,
|
| 107 |
noise_scale_w=0.8,
|
| 108 |
+
length_scale=length_scale
|
| 109 |
)
|
| 110 |
|
| 111 |
audio = audio[0, 0].cpu().numpy()
|
|
|
|
| 131 |
parser = argparse.ArgumentParser(description="TinyTTS — English Text-to-Speech Inference")
|
| 132 |
parser.add_argument("--text", "-t", type=str, default="The weather is nice today, and I feel very relaxed.", help="Text to synthesize")
|
| 133 |
parser.add_argument("--checkpoint", "-c", type=str, default=None, help="Path to checkpoint. Auto-downloads if not provided.")
|
| 134 |
+
parser.add_argument("--output", "-o", type=str, default="output.wav", help="Output audio file path")
|
| 135 |
+
parser.add_argument("--speaker", "-s", type=str, default="MALE", help="Speaker ID")
|
| 136 |
+
parser.add_argument("--speed", type=float, default=1.0, help="Speech speed (1.0=normal, 1.5=faster, 0.7=slower)")
|
| 137 |
parser.add_argument("--device", type=str, default="cuda", help="Device to use (cuda or cpu)")
|
| 138 |
|
| 139 |
args = parser.parse_args()
|
|
|
|
| 182 |
print(f"Synthesizing for all {len(SPK2ID)} speakers...")
|
| 183 |
for spk in SPK2ID.keys():
|
| 184 |
final_output = os.path.join(out_dir, f"{name}_step{step_str}_spk{spk}{ext}")
|
| 185 |
+
synthesize(args.text, final_output, model, speaker=spk, device=args.device, speed=args.speed)
|
| 186 |
else:
|
| 187 |
final_output = os.path.join(out_dir, f"{name}_step{step_str}_spk{args.speaker}{ext}")
|
| 188 |
+
synthesize(args.text, final_output, model, speaker=args.speaker, device=args.device, speed=args.speed)
|
| 189 |
|
| 190 |
if __name__ == "__main__":
|
| 191 |
main()
|