File size: 3,364 Bytes
7934b29
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
# Copyright (c) 2021, NVIDIA CORPORATION.  All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# The size of the input context to be provided to the DuplexDecoderModel.
DECODE_CTX_SIZE = 3

# Label id used for padded positions.
# NOTE(review): -100 presumably matches the default `ignore_index` of the loss
# function used downstream -- confirm against the training code.
LABEL_PAD_TOKEN_ID = -100

# Names of the dataset splits
TRAIN = 'train'
DEV = 'dev'
TEST = 'test'
# All split names, listed in train / dev / test order.
SPLIT_NAMES = [
    TRAIN,
    DEV,
    TEST,
]

# Language identifiers
ENGLISH, RUSSIAN, GERMAN = 'en', 'ru', 'de'
MULTILINGUAL = 'multilingual'
# Every accepted language identifier, including the multilingual one.
SUPPORTED_LANGS = [
    ENGLISH,
    RUSSIAN,
    GERMAN,
    MULTILINGUAL,
]

# Task ids and their prefixes
# Integer ids for the two tasks (ITN and TN).
ITN_TASK, TN_TASK = 0, 1
# The prefixes are the string forms of the task ids ('0' and '1').
ITN_PREFIX, TN_PREFIX = map(str, (ITN_TASK, TN_TASK))

# Prefixes of the tagger labels:
#   'B-' denotes the beginning,
#   'I-' denotes the middle.
B_PREFIX, I_PREFIX = 'B-', 'I-'
TAGGER_LABELS_PREFIXES = [
    B_PREFIX,
    I_PREFIX,
]

# Mode names
TN_MODE, ITN_MODE, JOINT_MODE = 'tn', 'itn', 'joint'
MODES = [
    TN_MODE,
    ITN_MODE,
    JOINT_MODE,
]

# Two-way lookup between task ids and mode names.
# Only the 'tn' and 'itn' modes have a task id; JOINT_MODE appears in neither dict.
TASK_ID_TO_MODE = {
    ITN_TASK: ITN_MODE,
    TN_TASK: TN_MODE,
}
MODE_TO_TASK_ID = {mode: task_id for task_id, mode in TASK_ID_TO_MODE.items()}

# Directions of an instance
INST_BACKWARD, INST_FORWARD = 'BACKWARD', 'FORWARD'
INST_DIRECTIONS = [
    INST_BACKWARD,
    INST_FORWARD,
]

# Direction name -> task id (BACKWARD is the ITN task, FORWARD is the TN task).
DIRECTIONS_TO_ID = {
    INST_BACKWARD: ITN_TASK,
    INST_FORWARD: TN_TASK,
}
# Task id -> direction name: the inverse of DIRECTIONS_TO_ID.
DIRECTIONS_ID_TO_NAME = {task_id: direction for direction, task_id in DIRECTIONS_TO_ID.items()}
# NOTE: despite its name, this dict is keyed by mode and its values are direction names.
DIRECTIONS_TO_MODE = {
    ITN_MODE: INST_BACKWARD,
    TN_MODE: INST_FORWARD,
}

# Tags
# A token that can be kept the same without any further transformation.
SAME_TAG = 'SAME'
# A token that belongs to a task prefix (the prefix indicates whether the
# current task is TN or ITN).
TASK_TAG = 'TASK'
# A token that is punctuation.
PUNCT_TAG = 'PUNCT'
# A token that needs to be transformed by the decoder.
TRANSFORM_TAG = 'TRANSFORM'
# NOTE: PUNCT_TAG is not part of ALL_TAGS.
ALL_TAGS = [
    TASK_TAG,
    SAME_TAG,
    TRANSFORM_TAG,
]

# ALL_TAG_LABELS
# Every prefix + tag combination (e.g. 'B-SAME'), sorted so that the label
# order -- and therefore the ids in LABEL_IDS -- is deterministic.
# Built with a generator expression rather than module-level `for` loops so
# that no loop variables are left behind as public module attributes (they
# would otherwise be exported by `from ... import *`).
ALL_TAG_LABELS = sorted(
    label_prefix + tag_name for label_prefix in TAGGER_LABELS_PREFIXES for tag_name in ALL_TAGS
)

# Label string -> integer id, following the sorted order of ALL_TAG_LABELS.
LABEL_IDS = {label: idx for idx, label in enumerate(ALL_TAG_LABELS)}

# Words with a special meaning
SIL_WORD, SELF_WORD = 'sil', '<self>'
SPECIAL_WORDS = [
    SIL_WORD,
    SELF_WORD,
]

# Special tokens used when encoding the inputs of the decoder models
EXTRA_ID_0, EXTRA_ID_1 = '<extra_id_0>', '<extra_id_1>'


# Spoken English names of Greek letters, keyed by the letter itself.
# Covers the upper- and lower-case form of each of the 24 letters, plus the
# final-sigma form 'ς'.
EN_GREEK_TO_SPOKEN = {
    'Τ': 'tau',
    'Ο': 'omicron',
    'Δ': 'delta',
    'Η': 'eta',
    'Κ': 'kappa',
    'Ι': 'iota',
    'Θ': 'theta',
    'Α': 'alpha',
    'Σ': 'sigma',
    'Υ': 'upsilon',
    'Μ': 'mu',
    'Χ': 'chi',
    'Π': 'pi',
    'Ν': 'nu',
    'Λ': 'lambda',
    'Γ': 'gamma',
    'Β': 'beta',
    'Ρ': 'rho',
    'τ': 'tau',
    'υ': 'upsilon',
    'φ': 'phi',
    'α': 'alpha',
    'λ': 'lambda',
    'ι': 'iota',
    'ς': 'sigma',
    'ο': 'omicron',
    'σ': 'sigma',
    'η': 'eta',
    'π': 'pi',
    'ν': 'nu',
    'γ': 'gamma',
    'κ': 'kappa',
    'ε': 'epsilon',
    'β': 'beta',
    'ρ': 'rho',
    'ω': 'omega',
    'χ': 'chi',
    # Letters that complete the alphabet in both cases. They are appended after
    # the pre-existing entries so the order of those entries is unchanged.
    'Ε': 'epsilon',
    'Ζ': 'zeta',
    'Ξ': 'xi',
    'Φ': 'phi',
    'Ψ': 'psi',
    'Ω': 'omega',
    'δ': 'delta',
    'ζ': 'zeta',
    'θ': 'theta',
    'μ': 'mu',
    'ξ': 'xi',
    'ψ': 'psi',
}