"""
Configuration constants and hyperparameters for the SinCode engine.
"""
import re
# ─── Model & Data Paths ─────────────────────────────────────────────────────

# Default model identifier.
DEFAULT_MODEL_NAME = "FacebookAI/xlm-roberta-base"

# Default location of the pickled dictionary.
DEFAULT_DICTIONARY_PATH = "dictionary.pkl"

# Source of the 20k-word English corpus (see MIN_ENGLISH_LEN).
ENGLISH_CORPUS_URL = (
    "https://raw.githubusercontent.com/first20hours/google-10000-english/master/20k.txt"
)

# ─── Scoring Weights (tunable hyperparameters) ──────────────────────────────

# Weight of the contextual language-model probability.
W_MLM: float = 0.55

# Weight of the source-aware transliteration fidelity.
W_FIDELITY: float = 0.45

# Weight of the dictionary rank prior (disabled — the dictionary is unordered).
W_RANK: float = 0.00

# ─── Decoding Parameters ────────────────────────────────────────────────────

# Upper bound on candidates kept per word position.
MAX_CANDIDATES: int = 8

# Width of the beam used by beam search.
DEFAULT_BEAM_WIDTH: int = 5

# Multiplier applied to the edit-distance penalty.
FIDELITY_SCALE: float = 10.0

# Decay rate of the dictionary bonus (higher = stricter filter).
DICT_FIDELITY_DAMP: float = 2.0

# Minimum word length for English detection against the 20k corpus.
MIN_ENGLISH_LEN: int = 3

# ─── Unicode Constants ──────────────────────────────────────────────────────

# Sinhala virama (hal) character.
SINHALA_VIRAMA: str = "\u0DCA"

# Zero-width joiner (used for conjuncts).
ZWJ: str = "\u200D"

# ─── Regex ──────────────────────────────────────────────────────────────────

# Splits a token into three groups: leading non-word characters, a lazily
# matched core, and trailing non-word characters.
# NOTE(review): Python's ``\W`` also matches combining marks (e.g. Sinhala
# vowel signs and the virama), so on Sinhala-script tokens a trailing mark
# would land in group 3 — presumably this is only applied to romanized
# input; confirm against callers.
PUNCT_PATTERN = re.compile(r"^(\W*)(.*?)(\W*)$")
|