SinCode v3 — System Architecture
ByT5-small · XLM-RoBERTa · mBart50-large
flowchart TD
    %% ------------------------------------------------------------------
    %% Entry point: the Streamlit UI hands the input to a mode switch that
    %% selects one of two pipelines; both pipelines end at the same output.
    %% ------------------------------------------------------------------
    UI["🖥️ Streamlit UI\napp.py"]
    MODE{Mode?}
    UI --> MODE

    %% ---------------- Pipeline A: Full Sinhala Mode -------------------
    %% Whole-sentence transliteration with mBart50, then a fix-map step
    %% labelled as ZWJ / Virama corrections.
    subgraph MODE_FULL["Full Sinhala Mode"]
        direction TB
        ST["SentenceTransliterator\nseq2seq/mbart_infer.py"]
        MBART["mBart50-large\nKalana001/mbart50-large-singlish-sinhala\nHF Hub · 2.4 GB"]
        FIX["Compose Fix Map\nseq2seq/Compose_fix_map.json\nZWJ / Virama corrections"]
        ST --> MBART -->|"raw Sinhala output"| FIX
    end

    %% ---------------- Pipeline B: Code-Mixed Mode ---------------------
    %% Three phases: classify each word, generate candidates, rerank.
    subgraph MODE_MIXED["Code-Mixed Mode"]
        direction TB

        %% Phase 1: every word lands in one of three classes.
        %% NOTE(review): Mermaid documents that a subgraph's direction is
        %% ignored when its nodes link outside the subgraph; the P1x edges
        %% below cross from PHASE1 into PHASE2, so "direction LR" on these
        %% two phases may have no visible effect. Confirm in the renderer.
        subgraph PHASE1["Phase 1 · Word Classification"]
            direction LR
            P1A["Sinhala script?\n(U+0D80–0DFF)"]
            P1B["English vocab?\nenglish_20k.txt"]
            P1C["Singlish\n(everything else)"]
        end

        %% Phase 2: candidate list per word class, fed by one ByT5 batch.
        subgraph PHASE2["Phase 2 · Candidate Generation (single ByT5 batch)"]
            direction LR
            BYT5["ByT5-small\nKalana001/byt5-small-singlish-sinhala\nHF Hub · 1.2 GB\nbeam=5 → top-5 candidates"]
            SIN_PASS["Single candidate\n(word as-is)"]
            ENG_CAND["English word\n+ ByT5 Sinhala alternatives"]
            SIN_CAND["Top-5 ByT5\ncandidates"]
        end

        %% Phase 3: greedy draft first, then a rescoring pass; both passes
        %% go through the XLM-RoBERTa scorer and the softmax/argmax pick.
        subgraph PHASE3["Phase 3 · Two-Pass MLM Reranking"]
            direction LR
            GREEDY["Pass 1 – Greedy\nBuild draft sentence\n(stale right context)"]
            RESCORE["Pass 2 – Rescore\nActual decoded output\nas right context"]
            MLM["XLM-RoBERTa\nKalana001/xlm-roberta-base-finetuned-sinhala\nHF Hub\nMulti-mask log-probability"]
            SOFTMAX["Softmax normalise\npick argmax"]
        end

        %% Phase 1 -> Phase 2 hand-off, then per-class routing to candidates.
        PHASE1 --> PHASE2
        P1A -->|Sinhala| SIN_PASS
        P1B -->|English| ENG_CAND
        P1C -->|Singlish| SIN_CAND

        %% ByT5 supplies candidates for both English and Singlish words.
        BYT5 --> ENG_CAND & SIN_CAND

        %% Phase 2 -> Phase 3 hand-off, then the two-pass scoring path:
        %% greedy draft -> scorer -> pick -> rescore -> scorer again.
        PHASE2 --> PHASE3
        GREEDY --> MLM --> SOFTMAX --> RESCORE --> MLM
    end

    %% ---------------- Mode routing and shared output ------------------
    MODE -->|"Full Sinhala Output"| MODE_FULL
    MODE -->|"Code-Mixed Output"| MODE_MIXED

    OUT["✅ Sinhala Output"]
    MODE_FULL --> OUT
    MODE_MIXED --> OUT

    %% ---------------- Legend: hosted model checkpoints ----------------
    %% Stand-alone reference box; intentionally not wired into the flow.
    subgraph MODELS["Models on Hugging Face Hub (Kalana001)"]
        HF1["byt5-small-singlish-sinhala\n1.2 GB · ByT5-small"]
        HF2["xlm-roberta-base-finetuned-sinhala\nXLM-RoBERTa"]
        HF3["mbart50-large-singlish-sinhala\n2.4 GB · mBart50-large"]
    end

    %% ---------------- Subgraph colours --------------------------------
    style MODE_FULL fill:#e8f4fd,stroke:#4a9eda
    style MODE_MIXED fill:#fdf3e8,stroke:#e8974a
    style PHASE1 fill:#fff9e6,stroke:#cca800
    style PHASE2 fill:#e8fff0,stroke:#2ecc71
    style PHASE3 fill:#f4e8ff,stroke:#9b59b6
    style MODELS fill:#eaf4ee,stroke:#27ae60