SinCode v3 — System Architecture

ByT5-small · XLM-RoBERTa · mBart50-large

flowchart TD
    %% SinCode v3 system architecture.
    %% The Streamlit UI feeds a mode decision that routes to one of two
    %% pipelines, both of which end at the same Sinhala output node.

    %% Entry point and mode selection
    UI["🖥️ Streamlit UI\napp.py"]
    MODE{Mode?}
    UI --> MODE

    %% Full Sinhala mode: sentence-level transliteration with mBart50-large,
    %% followed by the compose fix map (ZWJ / Virama corrections).
    subgraph MODE_FULL["Full Sinhala Mode"]
        direction TB
        ST["SentenceTransliterator\nseq2seq/mbart_infer.py"]
        MBART["mBart50-large\nKalana001/mbart50-large-singlish-sinhala\nHF Hub · 2.4 GB"]
        FIX["Compose Fix Map\nseq2seq/Compose_fix_map.json\nZWJ / Virama corrections"]
        ST --> MBART
        MBART -->|"raw Sinhala output"| FIX
    end

    %% Code-mixed mode: three phases (classify, generate candidates, rerank).
    subgraph MODE_MIXED["Code-Mixed Mode"]
        direction TB

        %% Phase 1: each word is classified as Sinhala script, English, or Singlish.
        subgraph PHASE1["Phase 1 · Word Classification"]
            direction LR
            P1A["Sinhala script?\n(U+0D80–0DFF)"]
            P1B["English vocab?\nenglish_20k.txt"]
            P1C["Singlish\n(everything else)"]
        end

        %% Phase 2: candidates per word class; ByT5 supplies the Sinhala alternatives.
        subgraph PHASE2["Phase 2 · Candidate Generation (single ByT5 batch)"]
            direction LR
            BYT5["ByT5-small\nKalana001/byt5-small-singlish-sinhala\nHF Hub · 1.2 GB\nbeam=5 → top-5 candidates"]
            SIN_PASS["Single candidate\n(word as-is)"]
            ENG_CAND["English word\n+ ByT5 Sinhala alternatives"]
            SIN_CAND["Top-5 ByT5\ncandidates"]
        end

        %% Phase 3: two-pass reranking scored by the XLM-RoBERTa masked LM.
        subgraph PHASE3["Phase 3 · Two-Pass MLM Reranking"]
            direction LR
            GREEDY["Pass 1 – Greedy\nBuild draft sentence\n(stale right context)"]
            RESCORE["Pass 2 – Rescore\nActual decoded output\nas right context"]
            MLM["XLM-RoBERTa\nKalana001/xlm-roberta-base-finetuned-sinhala\nHF Hub\nMulti-mask log-probability"]
            SOFTMAX["Softmax normalise\npick argmax"]
        end

        %% Phase ordering and per-class routing from Phase 1 into Phase 2
        PHASE1 --> PHASE2
        P1A -->|Sinhala| SIN_PASS
        P1B -->|English| ENG_CAND
        P1C -->|Singlish| SIN_CAND
        BYT5 --> ENG_CAND
        BYT5 --> SIN_CAND
        PHASE2 --> PHASE3

        %% Reranking loop: the greedy pass is scored by the MLM, normalised,
        %% then the rescore pass goes back through the MLM.
        GREEDY --> MLM
        MLM --> SOFTMAX
        SOFTMAX --> RESCORE
        RESCORE --> MLM
    end

    %% Mode routing and the shared output node
    MODE -->|"Full Sinhala Output"| MODE_FULL
    MODE -->|"Code-Mixed Output"| MODE_MIXED
    MODE_FULL --> OUT["✅ Sinhala Output"]
    MODE_MIXED --> OUT

    %% Reference panel: the three hosted models (not linked to the flow above)
    subgraph MODELS["Models on Hugging Face Hub (Kalana001)"]
        HF1["byt5-small-singlish-sinhala\n1.2 GB · ByT5-small"]
        HF2["xlm-roberta-base-finetuned-sinhala\nXLM-RoBERTa"]
        HF3["mbart50-large-singlish-sinhala\n2.4 GB · mBart50-large"]
    end

    %% Subgraph colours
    style MODE_FULL fill:#e8f4fd,stroke:#4a9eda
    style MODE_MIXED fill:#fdf3e8,stroke:#e8974a
    style PHASE1 fill:#fff9e6,stroke:#cca800
    style PHASE2 fill:#e8fff0,stroke:#2ecc71
    style PHASE3 fill:#f4e8ff,stroke:#9b59b6
    style MODELS fill:#eaf4ee,stroke:#27ae60