<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8" />
<title>SinCode v3 — Architecture</title>
<script src="https://cdn.jsdelivr.net/npm/mermaid@11/dist/mermaid.min.js"></script>
<style>
/* Page layout: single centered column holding the heading, subtitle,
   and the diagram card. */
body {
font-family: sans-serif;
background: #f8f9fa;
display: flex;
flex-direction: column;
align-items: center;
padding: 2rem;
}
/* Tighten the heading/subtitle pair so they read as one unit. */
h1 { color: #2c3e50; margin-bottom: 0.25rem; }
p { color: #666; margin-top: 0; margin-bottom: 2rem; }
/* Card styling for the rendered Mermaid SVG: white panel with
   rounded corners and a soft drop shadow, capped at 1200px wide. */
.mermaid {
background: white;
border-radius: 12px;
padding: 2rem;
box-shadow: 0 2px 12px rgba(0,0,0,0.08);
max-width: 1200px;
width: 100%;
}
</style>
</head>
<body>
<h1>SinCode v3 — System Architecture</h1>
<p>ByT5-small · XLM-RoBERTa · mBart50-large</p>
<div class="mermaid">
flowchart TD
%% NOTE(review): quoted labels use "\n" for line breaks — confirm mermaid@11
%% renders these as newlines; the flowchart docs recommend <br/> instead.
%% Entry point: the Streamlit UI routes each request by the selected mode.
UI["🖥️ Streamlit UI\napp.py"]
MODE{Mode?}
UI --> MODE
%% Full-sentence path: mBART seq2seq translation followed by a
%% post-processing fix map for ZWJ/virama glyph corrections.
subgraph MODE_FULL["Full Sinhala Mode"]
direction TB
ST["SentenceTransliterator\nseq2seq/mbart_infer.py"]
MBART["mBart50-large\nKalana001/mbart50-large-singlish-sinhala\nHF Hub · 2.4 GB"]
FIX["Compose Fix Map\nseq2seq/Compose_fix_map.json\nZWJ / Virama corrections"]
ST --> MBART
MBART -->|"raw Sinhala output"| FIX
end
%% Word-level path: classify → generate candidates → rerank with an MLM.
subgraph MODE_MIXED["Code-Mixed Mode"]
direction TB
%% Phase 1 routes each token to one of three classes.
subgraph PHASE1["Phase 1 · Word Classification"]
direction LR
P1A["Sinhala script?\n(U+0D80–0DFF)"]
P1B["English vocab?\nenglish_20k.txt"]
P1C["Singlish\n(everything else)"]
end
%% Phase 2 produces candidate Sinhala spellings per token class.
subgraph PHASE2["Phase 2 · Candidate Generation (single ByT5 batch)"]
direction LR
BYT5["ByT5-small\nKalana001/byt5-small-singlish-sinhala\nHF Hub · 1.2 GB\nbeam=5 → top-5 candidates"]
SIN_PASS["Single candidate\n(word as-is)"]
ENG_CAND["English word\n+ ByT5 Sinhala alternatives"]
SIN_CAND["Top-5 ByT5\ncandidates"]
end
%% Phase 3 scores candidates in sentence context, twice: a greedy
%% first pass, then a rescore pass using the decoded output as context.
subgraph PHASE3["Phase 3 · Two-Pass MLM Reranking"]
direction LR
GREEDY["Pass 1 – Greedy\nBuild draft sentence\n(stale right context)"]
RESCORE["Pass 2 – Rescore\nActual decoded output\nas right context"]
MLM["XLM-RoBERTa\nKalana001/xlm-roberta-base-finetuned-sinhala\nHF Hub\nMulti-mask log-probability"]
SOFTMAX["Softmax normalise\npick argmax"]
end
PHASE1 --> PHASE2
P1A -->|Sinhala| SIN_PASS
P1B -->|English| ENG_CAND
P1C -->|Singlish| SIN_CAND
BYT5 --> ENG_CAND
BYT5 --> SIN_CAND
PHASE2 --> PHASE3
GREEDY --> MLM
MLM --> SOFTMAX
SOFTMAX --> RESCORE
RESCORE --> MLM
end
%% Both modes converge on a single output node.
MODE -->|"Full Sinhala Output"| MODE_FULL
MODE -->|"Code-Mixed Output"| MODE_MIXED
MODE_FULL --> OUT["✅ Sinhala Output"]
MODE_MIXED --> OUT
%% Legend-style subgraph listing the hosted model artifacts.
subgraph MODELS["Models on Hugging Face Hub (Kalana001)"]
HF1["byt5-small-singlish-sinhala\n1.2 GB · ByT5-small"]
HF2["xlm-roberta-base-finetuned-sinhala\nXLM-RoBERTa"]
HF3["mbart50-large-singlish-sinhala\n2.4 GB · mBart50-large"]
end
%% Per-subgraph fill/stroke colors to visually separate the stages.
style MODE_FULL fill:#e8f4fd,stroke:#4a9eda
style MODE_MIXED fill:#fdf3e8,stroke:#e8974a
style PHASE1 fill:#fff9e6,stroke:#cca800
style PHASE2 fill:#e8fff0,stroke:#2ecc71
style PHASE3 fill:#f4e8ff,stroke:#9b59b6
style MODELS fill:#eaf4ee,stroke:#27ae60
</div>
<script>
mermaid.initialize({ startOnLoad: true, theme: 'default', flowchart: { curve: 'basis' } });
</script>
</body>
</html>