File size: 2,472 Bytes
af11ce4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
"""
This file reads the texts in given manifest and save the new cuts with prepared tokens.
"""

import argparse
import logging
from functools import partial
from pathlib import Path

from lhotse import load_manifest, split_parallelize_combine

from zipvoice.tokenizer.tokenizer import add_tokens


def get_args():
    """Parse command-line arguments for token preparation.

    Returns:
        argparse.Namespace with attributes ``input_file``, ``output_file``,
        ``num_jobs``, ``tokenizer``, and ``lang``.
    """
    parser = argparse.ArgumentParser()

    parser.add_argument(
        "--input-file",
        type=str,
        help="Input manifest without tokens",
    )

    parser.add_argument(
        "--output-file",
        type=str,
        help="Output manifest with tokens.",
    )

    parser.add_argument(
        "--num-jobs",
        type=int,
        default=20,
        help="Number of jobs to run in parallel.",
    )

    parser.add_argument(
        "--tokenizer",
        type=str,
        default="emilia",
        # Fixed: previous help text was copy-pasted from an unrelated
        # directory argument and described a "destination directory".
        help="The type of the tokenizer, e.g., emilia, espeak.",
    )

    parser.add_argument(
        "--lang",
        type=str,
        default="en-us",
        help="Language identifier, used when tokenizer type is espeak. see"
        "https://github.com/rhasspy/espeak-ng/blob/master/docs/languages.md",
    )

    return parser.parse_args()


def prepare_tokens(
    input_file: Path,
    output_file: Path,
    num_jobs: int,
    tokenizer: str,
    lang: str = "en-us",
):
    """Tokenize the texts in a manifest and save the augmented cuts.

    Args:
        input_file: Path to the input manifest (cuts without tokens).
        output_file: Path where the tokenized manifest is written.
        num_jobs: Number of parallel workers for tokenization.
        tokenizer: Tokenizer type passed through to ``add_tokens``.
        lang: Language identifier, used by espeak-based tokenizers.
    """
    logging.info(f"Processing {input_file}")

    # Skip work that has already been done on a previous run.
    if output_file.is_file():
        logging.info(f"{output_file} exists, skipping.")
        return

    logging.info(f"loading manifest from {input_file}")
    cuts = load_manifest(input_file)

    # Bind tokenizer configuration so the worker fn takes only a manifest.
    tokenize_fn = partial(add_tokens, tokenizer=tokenizer, lang=lang)

    logging.info("Adding tokens")
    cuts = split_parallelize_combine(num_jobs=num_jobs, manifest=cuts, fn=tokenize_fn)

    logging.info(f"Saving file to {output_file}")
    cuts.to_file(output_file)


if __name__ == "__main__":
    formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
    logging.basicConfig(format=formatter, level=logging.INFO, force=True)

    args = get_args()
    input_file = Path(args.input_file)
    output_file = Path(args.output_file)
    num_jobs = args.num_jobs
    tokenizer = args.tokenizer
    lang = args.lang

    output_file.parent.mkdir(parents=True, exist_ok=True)

    prepare_tokens(
        input_file=input_file,
        output_file=output_file,
        num_jobs=num_jobs,
        tokenizer=tokenizer,
        lang=lang,
    )

    logging.info("Done!")