| |
| """Select a DIVERSE training-text set to maximize phoneme/character coverage. |
| |
| Why this matters: at 4.63M params the model is NOT capacity-limited — but it can only |
| pronounce characters/words it has SEEN. A narrow corpus (e.g. a few hundred Han chars) |
| leaves most held-out characters unseen -> garbled output. This script builds a broad, |
| coverage-maximizing text set from Tatoeba so the teacher (and then the student) cover |
| the common vocabulary. |
| |
| zh-TW: Tatoeba `cmn` -> OpenCC s2twp (Taiwan traditional) -> greedy Han-CHAR coverage. |
| en : Tatoeba `eng` -> greedy WORD coverage (English phones are few; word/prosody variety matters). |
| |
| Usage: |
| python select_diverse_text.py --lang zh --n 6000 --out expand_zh.tsv |
| python select_diverse_text.py --lang en --n 6000 --out expand_en.tsv |
| Then feed the .tsv (id<TAB>text) to gen_breezy_corpus.py to synthesize the teacher audio. |
| |
| Deps: requests/urllib (download), opencc (zh only). Tatoeba dumps are CC-BY 2.0 FR. |
| """ |
| import argparse, bz2, os, re, random, urllib.request |
|
|
| TATOEBA = "https://downloads.tatoeba.org/exports/per_language/{lang}/{lang}_sentences.tsv.bz2" |
| HAN = lambda s: set(c for c in s if "一" <= c <= "鿿") |
|
|
|
|
| def download(lang): |
| f = f"{lang}_sentences.tsv" |
| if not os.path.exists(f): |
| url = TATOEBA.format(lang=lang) |
| print("downloading", url) |
| urllib.request.urlretrieve(url, f + ".bz2") |
| with bz2.open(f + ".bz2", "rt", encoding="utf-8") as i, open(f, "w", encoding="utf-8") as o: |
| for line in i: |
| o.write(line) |
| return f |
|
|
|
|
| def select_zh(path, n, seed=42): |
| import opencc |
| cc = opencc.OpenCC("s2twp") |
| allowed = set(",。!?、:;…") |
| seen_t, cands = set(), [] |
| for l in open(path, encoding="utf-8"): |
| p = l.rstrip("\n").split("\t") |
| if len(p) < 3: |
| continue |
| t = cc.convert(p[2].strip()).replace(",", ",").replace("!", "!").replace("?", "?") |
| h = HAN(t) |
| if not (6 <= len(h) <= 26): |
| continue |
| if any(("一" <= c <= "鿿") is False and c not in allowed for c in t): |
| continue |
| if t not in seen_t: |
| seen_t.add(t); cands.append(t) |
| return greedy_cover(cands, HAN, n, seed) |
|
|
|
|
| def select_en(path, n, seed=42): |
| words = lambda s: set(re.findall(r"[a-z']+", s.lower())) |
| seen_t, cands = set(), [] |
| for l in open(path, encoding="utf-8"): |
| p = l.rstrip("\n").split("\t") |
| if len(p) < 3: |
| continue |
| t = p[2].strip() |
| if not re.fullmatch(r"[A-Za-z0-9 ,.\-'?!]+", t): |
| continue |
| w = re.findall(r"[A-Za-z']+", t) |
| if not (4 <= len(w) <= 14) or any(len(x) > 15 for x in w): |
| continue |
| if t not in seen_t: |
| seen_t.add(t); cands.append(t) |
| return greedy_cover(cands, words, n, seed) |
|
|
|
|
| def greedy_cover(cands, unit, n, seed): |
| """Greedy max-coverage of `unit(text)` items, then random top-up to n for frequency.""" |
| random.seed(seed); random.shuffle(cands) |
| covered, selected, rest = set(), [], [] |
| cands.sort(key=lambda t: len(unit(t) - covered), reverse=True) |
| for t in cands: |
| if len(unit(t) - covered) >= 1 and len(selected) < n: |
| selected.append(t); covered |= unit(t) |
| else: |
| rest.append(t) |
| random.shuffle(rest) |
| selected += rest[: max(0, n - len(selected))] |
| print(f"selected {len(selected)} sentences | unique units covered: {len(covered)}") |
| return selected |
|
|
|
|
| def main(): |
| ap = argparse.ArgumentParser() |
| ap.add_argument("--lang", choices=["zh", "en"], required=True) |
| ap.add_argument("--n", type=int, default=6000) |
| ap.add_argument("--out", required=True) |
| args = ap.parse_args() |
| path = download("cmn" if args.lang == "zh" else "eng") |
| sents = select_zh(path, args.n) if args.lang == "zh" else select_en(path, args.n) |
| with open(args.out, "w", encoding="utf-8") as o: |
| for i, t in enumerate(sents): |
| o.write(f"{args.lang}e{i:05d}\t{t}\n") |
| print("wrote", args.out, len(sents)) |
|
|
|
|
| if __name__ == "__main__": |
| main() |
|
|