| """ | |
| Configuration constants and hyperparameters for the SinCode engine. | |
| """ | |
| import re | |
| # βββ Model & Data Paths βββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| DEFAULT_MODEL_NAME = "FacebookAI/xlm-roberta-base" | |
| DEFAULT_DICTIONARY_PATH = "dictionary.pkl" | |
| ENGLISH_CORPUS_URL = ( | |
| "https://raw.githubusercontent.com/first20hours/google-10000-english/master/20k.txt" | |
| ) | |
| # βββ Scoring Weights (tunable hyperparameters) ββββββββββββββββββββββββββββββ | |
| W_MLM: float = 0.55 # Contextual language model probability | |
| W_FIDELITY: float = 0.45 # Source-aware transliteration fidelity | |
| W_RANK: float = 0.00 # Dictionary rank prior (disabled β dict is unordered) | |
| # βββ Decoding Parameters ββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| MAX_CANDIDATES: int = 8 # Max candidates per word position | |
| DEFAULT_BEAM_WIDTH: int = 5 # Beam search width | |
| FIDELITY_SCALE: float = 10.0 # Edit-distance penalty multiplier | |
| DICT_FIDELITY_DAMP: float = 2.0 # Decay rate for dict bonus (higher = stricter filter) | |
| MIN_ENGLISH_LEN: int = 3 # Min word length for 20k-corpus English detection | |
| # βββ Unicode Constants ββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| SINHALA_VIRAMA: str = '\u0DCA' # Sinhala virama (hal) character | |
| ZWJ: str = '\u200D' # Zero-width joiner (for conjuncts) | |
| # βββ Regex ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| PUNCT_PATTERN = re.compile(r"^(\W*)(.*?)(\W*)$") | |