File size: 1,999 Bytes
9906dbd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
"""
Configuration constants and hyperparameters for the SinCode engine.
"""

import re

# ─── Model & Data Paths ─────────────────────────────────────────────────────

# Default model identifier and default dictionary pickle file name.
# NOTE(review): the DEFAULT_ prefix suggests callers may override both;
# confirm against the code that imports them.
DEFAULT_MODEL_NAME = "FacebookAI/xlm-roberta-base"
DEFAULT_DICTIONARY_PATH = "dictionary.pkl"

# Location of the "20k.txt" English word list in the
# google-10000-english repository. Adjacent literals are joined at
# compile time, so the value is a single URL string.
ENGLISH_CORPUS_URL = (
    "https://raw.githubusercontent.com/"
    "first20hours/google-10000-english/master/20k.txt"
)

# ─── Scoring Weights (tunable hyperparameters) ──────────────────────────────

# Weight given to the contextual language-model probability.
W_MLM: float = 0.55

# Weight given to source-aware transliteration fidelity.
W_FIDELITY: float = 0.45

# Weight given to the dictionary-rank prior. Held at zero (disabled)
# because the dictionary is unordered, so rank carries no information.
W_RANK: float = 0.0

# ─── Decoding Parameters ────────────────────────────────────────────────────

# Upper bound on the number of candidates kept for each word position.
MAX_CANDIDATES: int = 8

# Width of the beam used by beam search.
DEFAULT_BEAM_WIDTH: int = 5

# Multiplier applied to the edit-distance penalty.
FIDELITY_SCALE: float = 10.0

# Decay rate of the dictionary bonus; a larger value filters more strictly.
DICT_FIDELITY_DAMP: float = 2.0

# Shortest word length considered when detecting English via the
# 20k-word corpus.
MIN_ENGLISH_LEN: int = 3

# ─── Unicode Constants ──────────────────────────────────────────────────────

# U+0DCA: the Sinhala virama (hal) character.
SINHALA_VIRAMA: str = "\u0DCA"

# U+200D: zero-width joiner, used when forming conjuncts.
ZWJ: str = "\u200D"

# ─── Regex ──────────────────────────────────────────────────────────────────

# Splits a token into three capture groups:
#   1. leading run of non-word characters (greedy),
#   2. the core text (lazy, so it yields trailing non-word characters to
#      group 3),
#   3. trailing run of non-word characters.
# Every group may be empty; "." does not match a newline here because
# no flags are passed.
# NOTE(review): Python's \w follows str.isalnum(), so \W also matches
# combining marks such as the Sinhala virama -- confirm this pattern is
# only applied to tokens where that is acceptable.
PUNCT_PATTERN = re.compile(
    r"^(\W*)(.*?)(\W*)$"
)