#!/usr/bin/env python3
"""Command-line tool that synthesizes Khmer speech with a local MMS TTS model.

The script loads a tokenizer and a ``VitsModel`` from ``--model``, runs the
``--text`` string through the model, and writes the resulting waveform to
``--output`` as a WAV file at the sampling rate declared in the model config.
"""

import argparse
from pathlib import Path

import torch
from scipy.io.wavfile import write
from transformers import AutoTokenizer, VitsModel


def parse_args() -> argparse.Namespace:
    """Parse the command-line options.

    Returns:
        A namespace with three string attributes:

        * ``model``  -- path or identifier handed to ``from_pretrained``;
          defaults to the parent of the directory that contains this script.
        * ``text``   -- the text to synthesize (required).
        * ``output`` -- destination WAV path; defaults to ``khmer_tts.wav``.
    """
    parser = argparse.ArgumentParser(
        description="Generate Khmer speech with the local MMS TTS model."
    )
    parser.add_argument(
        "--model",
        default=str(Path(__file__).resolve().parents[1]),
    )
    parser.add_argument("--text", required=True)
    parser.add_argument("--output", default="khmer_tts.wav")
    return parser.parse_args()


def main() -> None:
    """Synthesize ``--text`` and write it to ``--output`` as a WAV file."""
    args = parse_args()

    tokenizer = AutoTokenizer.from_pretrained(args.model)
    model = VitsModel.from_pretrained(args.model)
    model.eval()

    inputs = tokenizer(args.text, return_tensors="pt")
    # Inference only: skip autograd bookkeeping.
    with torch.no_grad():
        # squeeze() drops size-1 dimensions so a single utterance is written
        # as a flat sample array.
        waveform = model(**inputs).waveform.squeeze().cpu().numpy()

    # The WAV writer does not create missing directories; make sure the
    # destination exists so a nested --output path does not fail after the
    # synthesis work has already been done.
    Path(args.output).parent.mkdir(parents=True, exist_ok=True)

    sampling_rate = model.config.sampling_rate
    write(args.output, rate=sampling_rate, data=waveform)
    print(f"Wrote {args.output} at {sampling_rate} Hz")


if __name__ == "__main__":
    main()