# Copyright (c) 2021, NVIDIA CORPORATION.  All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Shared constants: split names, languages, TN/ITN task prefixes, tagger
labels, modes, instance directions, tags, special words/tokens and a
Greek-letter-to-spoken-form table for English.
"""

# The size of the input context to be provided to the DuplexDecoderModel
DECODE_CTX_SIZE = 3
LABEL_PAD_TOKEN_ID = -100

# Split names
TRAIN, DEV, TEST = 'train', 'dev', 'test'
SPLIT_NAMES = [TRAIN, DEV, TEST]

# Languages
ENGLISH = 'en'
RUSSIAN = 'ru'
GERMAN = 'de'
MULTILINGUAL = 'multilingual'
SUPPORTED_LANGS = [ENGLISH, RUSSIAN, GERMAN, MULTILINGUAL]

# Task Prefixes (the string form of the integer task id)
ITN_TASK = 0
TN_TASK = 1
ITN_PREFIX = str(ITN_TASK)
TN_PREFIX = str(TN_TASK)

# Tagger Labels Prefixes
B_PREFIX = 'B-'  # Denote beginning
I_PREFIX = 'I-'  # Denote middle
TAGGER_LABELS_PREFIXES = [B_PREFIX, I_PREFIX]

# Modes
TN_MODE = 'tn'
ITN_MODE = 'itn'
JOINT_MODE = 'joint'
MODES = [TN_MODE, ITN_MODE, JOINT_MODE]
TASK_ID_TO_MODE = {ITN_TASK: ITN_MODE, TN_TASK: TN_MODE}
# Inverse of TASK_ID_TO_MODE; JOINT_MODE has no task id and is therefore absent.
MODE_TO_TASK_ID = {mode: task_id for task_id, mode in TASK_ID_TO_MODE.items()}

# Instance Directions
INST_BACKWARD = 'BACKWARD'
INST_FORWARD = 'FORWARD'
INST_DIRECTIONS = [INST_BACKWARD, INST_FORWARD]
DIRECTIONS_TO_ID = {INST_BACKWARD: ITN_TASK, INST_FORWARD: TN_TASK}
DIRECTIONS_ID_TO_NAME = {ITN_TASK: INST_BACKWARD, TN_TASK: INST_FORWARD}
# NOTE: despite its name, this mapping is keyed by *mode* and yields the
# direction. The name is kept as-is so existing callers keep working.
DIRECTIONS_TO_MODE = {ITN_MODE: INST_BACKWARD, TN_MODE: INST_FORWARD}

# TAGS
# Tag indicates that a token can be kept the same without any further transformation
SAME_TAG = 'SAME'
# Tag indicates that a token belongs to a task prefix
# (the prefix indicates whether the current task is TN or ITN)
TASK_TAG = 'TASK'
# Tag indicates that a token is a punctuation
PUNCT_TAG = 'PUNCT'
# Tag indicates that a token needs to be transformed by the decoder
TRANSFORM_TAG = 'TRANSFORM'
# NOTE: PUNCT_TAG is not part of ALL_TAGS, so no B-/I- label is generated for it.
ALL_TAGS = [TASK_TAG, SAME_TAG, TRANSFORM_TAG]

# ALL_TAG_LABELS: every prefix/tag combination, sorted so that label ids are
# deterministic. Built with a comprehension so no loop variables are left
# behind in the module namespace.
ALL_TAG_LABELS = sorted(
    prefix + tag for prefix in TAGGER_LABELS_PREFIXES for tag in ALL_TAGS
)
LABEL_IDS = {label: idx for idx, label in enumerate(ALL_TAG_LABELS)}

# Special Words
SIL_WORD = 'sil'
# NOTE(review): the angle-bracketed value had been stripped to '' in this file;
# restored to the presumed original marker -- confirm against upstream/data.
SELF_WORD = '<self>'
SPECIAL_WORDS = [SIL_WORD, SELF_WORD]

# IDs for special tokens for encoding inputs of the decoder models
# NOTE(review): both values had been stripped to '' (making the two tokens
# indistinguishable); restored to the presumed sentinel-token spellings --
# confirm against the tokenizer in use.
EXTRA_ID_0 = '<extra_id_0>'
EXTRA_ID_1 = '<extra_id_1>'

# Greek letter -> English spoken form. Upper- and lower-case entries are listed
# separately; only the letters present here are covered.
EN_GREEK_TO_SPOKEN = {
    'Τ': 'tau',
    'Ο': 'omicron',
    'Δ': 'delta',
    'Η': 'eta',
    'Κ': 'kappa',
    'Ι': 'iota',
    'Θ': 'theta',
    'Α': 'alpha',
    'Σ': 'sigma',
    'Υ': 'upsilon',
    'Μ': 'mu',
    'Χ': 'chi',
    'Π': 'pi',
    'Ν': 'nu',
    'Λ': 'lambda',
    'Γ': 'gamma',
    'Β': 'beta',
    'Ρ': 'rho',
    'τ': 'tau',
    'υ': 'upsilon',
    'φ': 'phi',
    'α': 'alpha',
    'λ': 'lambda',
    'ι': 'iota',
    'ς': 'sigma',
    'ο': 'omicron',
    'σ': 'sigma',
    'η': 'eta',
    'π': 'pi',
    'ν': 'nu',
    'γ': 'gamma',
    'κ': 'kappa',
    'ε': 'epsilon',
    'β': 'beta',
    'ρ': 'rho',
    'ω': 'omega',
    'χ': 'chi',
}