| |
|
|
| |
|
|
| """ |
| This script takes as input lang_dir containing lexicon_disambig.txt, |
| tokens.txt, and words.txt and generates the following files: |
| |
| - H.fst |
| - HL.fst |
| - HLG.fst |
| |
| Note that saved files are in OpenFst binary format. |
| |
| Usage: |
| |
| ./local/prepare_lang_fst.py \ |
| --lang-dir ./data/lang_phone \ |
| --has-silence 1 |
| |
| Or |
| |
| ./local/prepare_lang_fst.py \ |
| --lang-dir ./data/lang_bpe_500 |
| """ |
|
|
| import argparse |
| import logging |
| from pathlib import Path |
|
|
| import kaldifst |
|
|
| from icefall.ctc import ( |
| Lexicon, |
| add_disambig_self_loops, |
| add_one, |
| build_standard_ctc_topo, |
| make_lexicon_fst_no_silence, |
| make_lexicon_fst_with_silence, |
| ) |
| from icefall.utils import str2bool |
|
|
|
|
| def get_args(): |
| parser = argparse.ArgumentParser() |
| parser.add_argument( |
| "--lang-dir", |
| type=str, |
| help="""Input and output directory. |
| """, |
| ) |
|
|
| parser.add_argument( |
| "--has-silence", |
| type=str2bool, |
| default=False, |
| help="True if the lexicon has silence.", |
| ) |
|
|
| parser.add_argument( |
| "--ngram-G", |
| type=str, |
| help="""If not empty, it is the filename of G used to build HLG. |
| For instance, --ngram-G=./data/lm/G_3_fst.txt |
| """, |
| ) |
|
|
| return parser.parse_args() |
|
|
|
|
| def build_HL( |
| H: kaldifst.StdVectorFst, |
| L: kaldifst.StdVectorFst, |
| has_silence: bool, |
| lexicon: Lexicon, |
| ) -> kaldifst.StdVectorFst: |
| if has_silence: |
| |
| add_one(L, treat_ilabel_zero_specially=True, update_olabel=False) |
| else: |
| add_one(L, treat_ilabel_zero_specially=False, update_olabel=False) |
|
|
| |
| |
| add_disambig_self_loops( |
| H, |
| start=lexicon.token2id["#0"] + 1, |
| end=lexicon.max_disambig_id + 1, |
| ) |
|
|
| kaldifst.arcsort(H, sort_type="olabel") |
| kaldifst.arcsort(L, sort_type="ilabel") |
|
|
| HL = kaldifst.compose(H, L) |
| kaldifst.determinize_star(HL) |
|
|
| disambig0 = lexicon.token2id["#0"] + 1 |
| max_disambig = lexicon.max_disambig_id + 1 |
| for state in kaldifst.StateIterator(HL): |
| for arc in kaldifst.ArcIterator(HL, state): |
| |
| |
| if disambig0 <= arc.ilabel <= max_disambig: |
| arc.ilabel = 0 |
|
|
| |
| |
|
|
| return HL |
|
|
|
|
| def build_HLG( |
| H: kaldifst.StdVectorFst, |
| L: kaldifst.StdVectorFst, |
| G: kaldifst.StdVectorFst, |
| has_silence: bool, |
| lexicon: Lexicon, |
| ) -> kaldifst.StdVectorFst: |
| if has_silence: |
| |
| add_one(L, treat_ilabel_zero_specially=True, update_olabel=False) |
| else: |
| add_one(L, treat_ilabel_zero_specially=False, update_olabel=False) |
|
|
| |
| token_disambig0 = lexicon.token2id["#0"] + 1 |
| word_disambig0 = lexicon.word2id["#0"] |
|
|
| kaldifst.add_self_loops(L, isyms=[token_disambig0], osyms=[word_disambig0]) |
|
|
| kaldifst.arcsort(L, sort_type="olabel") |
| kaldifst.arcsort(G, sort_type="ilabel") |
| LG = kaldifst.compose(L, G) |
| kaldifst.determinize_star(LG) |
| kaldifst.minimize_encoded(LG) |
|
|
| kaldifst.arcsort(LG, sort_type="ilabel") |
|
|
| |
| |
| add_disambig_self_loops( |
| H, |
| start=lexicon.token2id["#0"] + 1, |
| end=lexicon.max_disambig_id + 1, |
| ) |
|
|
| kaldifst.arcsort(H, sort_type="olabel") |
|
|
| HLG = kaldifst.compose(H, LG) |
| kaldifst.determinize_star(HLG) |
|
|
| disambig0 = lexicon.token2id["#0"] + 1 |
| max_disambig = lexicon.max_disambig_id + 1 |
| for state in kaldifst.StateIterator(HLG): |
| for arc in kaldifst.ArcIterator(HLG, state): |
| |
| |
| if disambig0 <= arc.ilabel <= max_disambig: |
| arc.ilabel = 0 |
| return HLG |
|
|
|
|
| def copy_fst(fst): |
| |
| return kaldifst.StdVectorFst(fst) |
|
|
|
|
| def main(): |
| args = get_args() |
| lang_dir = args.lang_dir |
|
|
| lexicon = Lexicon(lang_dir) |
|
|
| logging.info("Building standard CTC topology") |
| max_token_id = max(lexicon.tokens) |
| H = build_standard_ctc_topo(max_token_id=max_token_id) |
|
|
| |
| |
| add_one(H, treat_ilabel_zero_specially=False, update_olabel=True) |
| H.write(f"{lang_dir}/H.fst") |
|
|
| logging.info("Building L") |
| |
|
|
| if args.has_silence: |
| L = make_lexicon_fst_with_silence(lexicon, attach_symbol_table=False) |
| else: |
| L = make_lexicon_fst_no_silence(lexicon, attach_symbol_table=False) |
|
|
| logging.info("Building HL") |
| HL = build_HL( |
| H=copy_fst(H), |
| L=copy_fst(L), |
| has_silence=args.has_silence, |
| lexicon=lexicon, |
| ) |
| HL.write(f"{lang_dir}/HL.fst") |
|
|
| if not args.ngram_G: |
| logging.info("Skip building HLG") |
| return |
|
|
| logging.info("Building HLG") |
| with open(args.ngram_G) as f: |
| G = kaldifst.compile( |
| s=f.read(), |
| acceptor=False, |
| ) |
|
|
| HLG = build_HLG(H=H, L=L, G=G, has_silence=args.has_silence, lexicon=lexicon) |
| HLG.write(f"{lang_dir}/HLG.fst") |
|
|
|
|
| if __name__ == "__main__": |
| formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s" |
|
|
| logging.basicConfig(format=formatter, level=logging.INFO) |
| main() |
|
|