File size: 3,607 Bytes
f6f45d5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
<!DOCTYPE html>
<html lang="en">
<head>
  <meta charset="UTF-8" />
  <!-- Without a viewport declaration mobile browsers lay the page out at a
       virtual desktop width and then shrink it, which makes the diagram
       unreadable on phones. -->
  <meta name="viewport" content="width=device-width, initial-scale=1" />
  <title>SinCode v3 — Architecture</title>
  <!-- Loaded synchronously (no defer or async) so that the global `mermaid`
       object already exists when the inline initialisation script at the end
       of the body runs. The @11 specifier resolves to the newest 11.x build
       published on the CDN. -->
  <script src="https://cdn.jsdelivr.net/npm/mermaid@11/dist/mermaid.min.js"></script>
  <style>
    /* Page shell: one centred column on a light grey background. */
    body {
      font-family: sans-serif;
      background: #f8f9fa;
      display: flex;
      flex-direction: column;
      align-items: center;
      padding: 2rem;
    }

    /* Page title. */
    h1 { color: #2c3e50; margin-bottom: 0.25rem; }

    /* Subtitle only. The child combinator limits the rule to paragraphs that
       sit directly under <body>; a bare `p` selector would also match any
       <p> elements Mermaid generates inside rendered node labels and
       override the diagram theme's text colour there. */
    body > p { color: #666; margin-top: 0; margin-bottom: 2rem; }

    /* White card that hosts the rendered diagram. */
    .mermaid {
      /* Count the padding inside the declared width. With the default
         content-box sizing, width: 100% plus 2rem of padding on each side
         makes the card 4rem wider than its container, so on narrow
         viewports it spills over the body padding. As a consequence the
         1200px cap below now applies to the whole card, padding included. */
      box-sizing: border-box;
      background: white;
      border-radius: 12px;
      padding: 2rem;
      box-shadow: 0 2px 12px rgba(0,0,0,0.08);
      max-width: 1200px;
      width: 100%;
    }
  </style>
</head>
<body>
  <!-- Page heading, followed by a subtitle naming the three models that
       appear in the diagram below. -->
  <h1>SinCode v3 — System Architecture</h1>
  <p>ByT5-small · XLM-RoBERTa · mBart50-large</p>

  <!-- Mermaid flowchart source. Mermaid reads this element's text and replaces
       it with the rendered SVG. Line breaks inside node labels use <br/>, the
       form documented for flowchart labels, instead of a literal backslash-n
       escape whose handling is not part of the documented syntax. -->
  <div class="mermaid">
flowchart TD
    %% Entry point: the UI branches into one of two pipelines.
    UI["🖥️ Streamlit UI<br/>app.py"]
    MODE{Mode?}

    UI --> MODE

    %% Pipeline A: whole-sentence transliteration followed by a fix map.
    subgraph MODE_FULL["Full Sinhala Mode"]
        direction TB
        ST["SentenceTransliterator<br/>seq2seq/mbart_infer.py"]
        MBART["mBart50-large<br/>Kalana001/mbart50-large-singlish-sinhala<br/>HF Hub · 2.4 GB"]
        FIX["Compose Fix Map<br/>seq2seq/Compose_fix_map.json<br/>ZWJ / Virama corrections"]
        ST --> MBART
        MBART -->|"raw Sinhala output"| FIX
    end

    %% Pipeline B: three phases, shown as nested subgraphs.
    subgraph MODE_MIXED["Code-Mixed Mode"]
        direction TB

        %% Phase 1 sorts each word into one of three classes.
        %% NOTE(review): Mermaid documents that a subgraph's direction is
        %% ignored when its nodes link to nodes outside it, so the LR hints
        %% on Phase 1 and Phase 2 may have no visible effect.
        subgraph PHASE1["Phase 1 · Word Classification"]
            direction LR
            P1A["Sinhala script?<br/>(U+0D80–0DFF)"]
            P1B["English vocab?<br/>english_20k.txt"]
            P1C["Singlish<br/>(everything else)"]
        end

        %% Phase 2 builds the candidate list for each word class.
        subgraph PHASE2["Phase 2 · Candidate Generation  (single ByT5 batch)"]
            direction LR
            BYT5["ByT5-small<br/>Kalana001/byt5-small-singlish-sinhala<br/>HF Hub · 1.2 GB<br/>beam=5 → top-5 candidates"]
            SIN_PASS["Single candidate<br/>(word as-is)"]
            ENG_CAND["English word<br/>+ ByT5 Sinhala alternatives"]
            SIN_CAND["Top-5 ByT5<br/>candidates"]
        end

        %% Phase 3 scores candidates with the masked language model in two passes.
        subgraph PHASE3["Phase 3 · Two-Pass MLM Reranking"]
            direction LR
            GREEDY["Pass 1 – Greedy<br/>Build draft sentence<br/>(stale right context)"]
            RESCORE["Pass 2 – Rescore<br/>Actual decoded output<br/>as right context"]
            MLM["XLM-RoBERTa<br/>Kalana001/xlm-roberta-base-finetuned-sinhala<br/>HF Hub<br/>Multi-mask log-probability"]
            SOFTMAX["Softmax normalise<br/>pick argmax"]
        end

        %% Each word class from Phase 1 feeds its own candidate node in Phase 2.
        PHASE1 --> PHASE2
        P1A -->|Sinhala| SIN_PASS
        P1B -->|English| ENG_CAND
        P1C -->|Singlish| SIN_CAND
        BYT5 --> ENG_CAND
        BYT5 --> SIN_CAND
        PHASE2 --> PHASE3
        %% Both passes go through the same MLM node, which forms a cycle
        %% between the MLM, softmax and rescore nodes.
        GREEDY --> MLM
        MLM --> SOFTMAX
        SOFTMAX --> RESCORE
        RESCORE --> MLM
    end

    MODE -->|"Full Sinhala Output"| MODE_FULL
    MODE -->|"Code-Mixed Output"| MODE_MIXED

    %% Both pipelines end at the same output node.
    MODE_FULL --> OUT["✅ Sinhala Output"]
    MODE_MIXED --> OUT

    %% Model inventory. This subgraph has no edges to the rest of the flow.
    subgraph MODELS["Models on Hugging Face Hub  (Kalana001)"]
        HF1["byt5-small-singlish-sinhala<br/>1.2 GB · ByT5-small"]
        HF2["xlm-roberta-base-finetuned-sinhala<br/>XLM-RoBERTa"]
        HF3["mbart50-large-singlish-sinhala<br/>2.4 GB · mBart50-large"]
    end

    %% Background and border colour for each subgraph.
    style MODE_FULL fill:#e8f4fd,stroke:#4a9eda
    style MODE_MIXED fill:#fdf3e8,stroke:#e8974a
    style PHASE1 fill:#fff9e6,stroke:#cca800
    style PHASE2 fill:#e8fff0,stroke:#2ecc71
    style PHASE3 fill:#f4e8ff,stroke:#9b59b6
    style MODELS fill:#eaf4ee,stroke:#27ae60
  </div>

  <script>
    // Configure Mermaid once the library (loaded in <head>) is available.
    mermaid.initialize({
      // Render every element with class "mermaid" when the page has loaded.
      startOnLoad: true,
      // Stock colour theme.
      theme: 'default',
      // Draw flowchart edges as smooth curves.
      flowchart: {
        curve: 'basis'
      }
    });
  </script>
</body>
</html>