odg123
/

ranjit-task-logs-analysis

Model card Files Files and versions

Metrics Training metrics Community

ranjit-task-logs-analysis / egs /aishell2 /ASR /local /prepare_words.py

odg123's picture

Upload icefall experiment results and logs

d596074 verified 3 months ago

history blame contribute delete

2.52 kB

	#!/usr/bin/env python
	# -- coding: utf-8 --

	# Copyright 2021 Xiaomi Corp. (authors: Mingshuang Luo)
	#
	# See ../../../../LICENSE for clarification regarding multiple authors
	#
	# Licensed under the Apache License, Version 2.0 (the "License");
	# you may not use this file except in compliance with the License.
	# You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing, software
	# distributed under the License is distributed on an "AS IS" BASIS,
	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	# See the License for the specific language governing permissions and
	# limitations under the License.


	"""
	This script takes as input words.txt without ids:
	- words_no_ids.txt
	and generates the new words.txt with related ids.
	- words.txt
	"""


	import argparse
	import logging

	from tqdm import tqdm


	def get_parser():
	parser = argparse.ArgumentParser(
	description="Prepare words.txt",
	formatter_class=argparse.ArgumentDefaultsHelpFormatter,
	)
	parser.add_argument(
	"--input-file",
	default="data/lang_char/words_no_ids.txt",
	type=str,
	help="the words file without ids for WenetSpeech",
	)
	parser.add_argument(
	"--output-file",
	default="data/lang_char/words.txt",
	type=str,
	help="the words file with ids for WenetSpeech",
	)

	return parser


	def main():
	parser = get_parser()
	args = parser.parse_args()

	input_file = args.input_file
	output_file = args.output_file

	f = open(input_file, "r", encoding="utf-8")
	lines = f.readlines()
	new_lines = []
	add_words = ["<eps> 0", "!SIL 1", "<SPOKEN_NOISE> 2", "<UNK> 3"]
	new_lines.extend(add_words)

	logging.info("Starting reading the input file")
	for i in tqdm(range(len(lines))):
	x = lines[i]
	idx = 4 + i
	new_line = str(x.strip("\n")) + " " + str(idx)
	new_lines.append(new_line)

	logging.info("Starting writing the words.txt")
	f_out = open(output_file, "w", encoding="utf-8")

	# LG decoding needs below symbols.
	id1, id2, id3 = (
	str(len(new_lines)),
	str(len(new_lines) + 1),
	str(len(new_lines) + 2),
	)
	add_words = ["#0 " + id1, "<s> " + id2, "</s> " + id3]
	new_lines.extend(add_words)

	for line in new_lines:
	f_out.write(line)
	f_out.write("\n")


	if __name__ == "__main__":
	main()