| | |
| | |
| | |
| | |
| | |
| |
|
"""
Helper script to generate word (.wrd) and letter (.ltr) label files from the
transcripts of a flashlight (previously called wav2letter++) dataset
"""
| |
|
| | import argparse |
| | import os |
| |
|
| |
|
def _read_transcripts(path):
    """Parse a ``*.trans.txt`` file into a ``{utterance_id: text}`` dict.

    Each non-blank line is split on whitespace: the first token is used as
    the utterance id and the remaining tokens, re-joined with single spaces,
    as its transcription.
    """
    texts = {}
    with open(path, "r") as trans_f:
        for tline in trans_f:
            items = tline.strip().split()
            if not items:
                # Tolerate blank lines instead of failing on items[0].
                continue
            texts[items[0]] = " ".join(items[1:])
    return texts


def main():
    """Write word- and letter-level label files for a tsv manifest.

    Command-line arguments:
        tsv: manifest file. Its first line is the dataset root directory;
            every following line starts with an audio path relative to that
            root (anything after the first "." of the file name is ignored).
        --output-dir: directory for the label files (created if missing).
        --output-name: base name of the two output files.

    For every manifest entry the transcription is looked up in
    ``<root>/<dir>/<parent>-<leaf>.trans.txt``, where ``<dir>`` is the
    directory part of the entry and ``<parent>``/``<leaf>`` are its last two
    components (presumably the LibriSpeech speaker/chapter layout -- confirm
    against the dataset in use). One line per entry is then written to:

    * ``<output-name>.wrd`` -- the transcription as-is;
    * ``<output-name>.ltr`` -- the transcription spelled out character by
      character, with ``|`` replacing each space and a trailing ``|``.

    Raises:
        FileNotFoundError: if an expected ``*.trans.txt`` file is missing.
        KeyError: if an utterance has no entry in its transcript file.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("tsv")
    parser.add_argument("--output-dir", required=True)
    parser.add_argument("--output-name", required=True)
    args = parser.parse_args()

    os.makedirs(args.output_dir, exist_ok=True)

    # Cache of parsed transcript files, keyed by the entry's directory, so
    # each *.trans.txt is read only once.
    transcriptions = {}

    with open(args.tsv, "r") as tsv, open(
        os.path.join(args.output_dir, args.output_name + ".ltr"), "w"
    ) as ltr_out, open(
        os.path.join(args.output_dir, args.output_name + ".wrd"), "w"
    ) as wrd_out:
        root = next(tsv).strip()
        for line in tsv:
            line = line.strip()
            if not line:
                # A blank (e.g. trailing) manifest line carries no entry.
                continue

            rel_dir = os.path.dirname(line)
            if rel_dir not in transcriptions:
                parts = rel_dir.split(os.path.sep)
                trans_name = f"{parts[-2]}-{parts[-1]}.trans.txt"
                path = os.path.join(root, rel_dir, trans_name)
                # Explicit exception rather than `assert`, which is removed
                # when Python runs with -O.
                if not os.path.exists(path):
                    raise FileNotFoundError(
                        f"transcript file not found: {path}"
                    )
                transcriptions[rel_dir] = _read_transcripts(path)

            # Utterance id = file name up to its first ".".
            part = os.path.basename(line).split(".")[0]
            texts = transcriptions[rel_dir]
            if part not in texts:
                raise KeyError(
                    f"no transcription for utterance {part!r} in {rel_dir!r}"
                )

            text = texts[part]
            print(text, file=wrd_out)
            # Joining a str iterates its characters; "|" marks word ends.
            print(" ".join(text.replace(" ", "|")) + " |", file=ltr_out)
| |
|
| |
|
# Run the conversion only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
| |
|