Spaces:

luoqiu
/

FireRedASR

Sleeping

App Files Files Community

FireRedASR / fireredasr /tokenizer /aed_tokenizer.py

luoqiu

添加相关代码

4ef9456 4 months ago

raw

history blame contribute delete

2.46 kB

	import logging
	import re

	import sentencepiece as spm

	from fireredasr.data.token_dict import TokenDict


	class ChineseCharEnglishSpmTokenizer:
	    """Tokenizer for mixed Chinese/English text.

	    - One Chinese char is a token.
	    - Split English word into SPM and one piece is a token.
	    - Ignore ' ' between Chinese char
	    - Replace ' ' between English word with "▁" by spm_model
	    - Need to put SPM piece into dict file
	    - If not set spm_model, will use English char and <space>
	    """

	    SPM_SPACE = "▁"

	    # One CJK ideograph (U+3400-U+4DBF or U+4E00-U+9FFF). The capture group
	    # makes split() keep the matched characters as separate list items.
	    _CJK_CHAR_RE = re.compile(r"([\u3400-\u4dbf\u4e00-\u9fff])")
	    # Full-width and ASCII sentence punctuation that tokenize() blanks out.
	    # Raw string: "\." in a plain literal is an invalid escape sequence.
	    _PUNC_RE = re.compile(r"[，。？！,\.?!]")

	    def __init__(self, dict_path, spm_model, unk="<unk>", space="<space>"):
	        """Load the token dictionary and, optionally, a SentencePiece model.

	        Args:
	            dict_path: Passed to ``TokenDict`` to build the token dictionary.
	            spm_model: Path of the SentencePiece model to load. When falsy,
	                no model is loaded and non-Chinese text is tokenized
	                character by character.
	            unk: Unknown-token symbol forwarded to ``TokenDict``.
	            space: Token emitted for ' ' in character mode (no SPM model).
	        """
	        self.dict = TokenDict(dict_path, unk=unk)
	        self.space = space
	        if spm_model:
	            self.sp = spm.SentencePieceProcessor()
	            self.sp.Load(spm_model)
	        else:
	            self.sp = None
	            print("[WARN] Not set spm_model, will use English char")
	            print("[WARN] Please check how to deal with ' '(space)")
	            # The space token is only produced in character mode, so the
	            # dictionary is only checked for it here.
	            if self.space not in self.dict:
	                print("Please add <space> to your dict, or it will be <unk>")

	    def tokenize(self, text, replace_punc=True):
	        """Split ``text`` into tokens and look up their dictionary entries.

	        The text is upper-cased first. Each Chinese character becomes one
	        token; every other non-blank run is encoded with the SentencePiece
	        model when one is loaded, otherwise emitted character by character
	        with ' ' mapped to ``self.space``.

	        Args:
	            text: Input string.
	            replace_punc: When true, replace ``，。？！,.?!`` with a space
	                before splitting.

	        Returns:
	            A ``(tokens, tokens_id)`` tuple of two equally long lists.
	            Tokens missing from the dictionary map to ``self.dict.unk``
	            (presumably the unknown-token id; confirm against TokenDict).
	        """
	        text = text.upper()
	        if replace_punc:
	            text = self._PUNC_RE.sub(" ", text)

	        tokens = []
	        for part in self._CJK_CHAR_RE.split(text.strip()):
	            part = part.strip()
	            if not part:
	                # Empty or whitespace-only gaps between Chinese characters.
	                continue
	            if self._CJK_CHAR_RE.fullmatch(part):
	                tokens.append(part)
	            elif self.sp:
	                tokens.extend(self.sp.EncodeAsPieces(part))
	            else:
	                # NOTE(review): every ' ' yields its own space token, so
	                # consecutive spaces (e.g. left by replaced punctuation)
	                # produce repeated space tokens. Kept as-is on purpose.
	                tokens.extend(
	                    self.space if char == " " else char for char in part
	                )

	        tokens_id = [self.dict.get(token, self.dict.unk) for token in tokens]
	        return tokens, tokens_id

	    def detokenize(self, inputs, join_symbol="", replace_spm_space=True):
	        """Join ids or tokens back into a string; ``self.sp`` is not needed.

	        Args:
	            inputs: Sequence of integer ids or of token strings. Only the
	                first element is inspected to tell the two apart; ids are
	                converted to tokens through ``self.dict[...]``.
	            join_symbol: Separator placed between tokens.
	            replace_spm_space: When true, replace the SPM space marker "▁"
	                with ' ' and strip the result.

	        Returns:
	            The reconstructed string ("" for empty ``inputs``).
	        """
	        # isinstance() also accepts int subclasses such as IntEnum members,
	        # which an exact type comparison would treat as token strings.
	        if len(inputs) > 0 and isinstance(inputs[0], int):
	            tokens = [self.dict[token_id] for token_id in inputs]
	        else:
	            tokens = inputs
	        text = f"{join_symbol}".join(tokens)
	        if replace_spm_space:
	            text = text.replace(self.SPM_SPACE, " ").strip()
	        return text