github-actions[bot] committed on
Commit Β·
eba757f
0
Parent(s):
Sync snapshot from GitHub 17b212cf3169b2b640d4712b81102194dfa5e1b6
Browse files- .gitattributes +37 -0
- .github/workflows/sync-to-hf.yml +28 -0
- .gitignore +25 -0
- .streamlit/config.toml +6 -0
- README.md +106 -0
- app.py +475 -0
- core/__init__.py +3 -0
- core/constants.py +38 -0
- core/decoder.py +773 -0
- core/dictionary.py +76 -0
- core/english.py +97 -0
- core/mappings.py +214 -0
- core/scorer.py +186 -0
- core/transliterate.py +49 -0
- dictionary.pkl +3 -0
- english_20k.txt +0 -0
- evaluation/dataset_110.csv +111 -0
- evaluation/dataset_40.csv +41 -0
- evaluation/evaluation.py +306 -0
- feedback_schema.sql +19 -0
- feedback_store.py +126 -0
- fine_tuning/attempt_1_wikipedia/eval_diagnostics.json +522 -0
- fine_tuning/attempt_1_wikipedia/eval_predictions.csv +41 -0
- fine_tuning/attempt_1_wikipedia/experiment_documentation.txt +624 -0
- fine_tuning/attempt_2_informal_sinhala/compare_perplexity.py +86 -0
- fine_tuning/attempt_2_informal_sinhala/eval_diagnostics.json +1432 -0
- fine_tuning/attempt_2_informal_sinhala/eval_predictions.csv +111 -0
- fine_tuning/attempt_2_informal_sinhala/experiment_notes.txt +102 -0
- fine_tuning/attempt_2_informal_sinhala/plot_training.py +117 -0
- fine_tuning/attempt_2_informal_sinhala/training_loss.png +3 -0
- fine_tuning/train_mlm.py +196 -0
- images/SinCodeLogo.jpg +0 -0
- images/background.png +3 -0
- requirements.txt +5 -0
- sincode_model.py +24 -0
.gitattributes
ADDED
|
@@ -0,0 +1,37 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
*.7z filter=lfs diff=lfs merge=lfs -text
|
| 2 |
+
*.arrow filter=lfs diff=lfs merge=lfs -text
|
| 3 |
+
*.bin filter=lfs diff=lfs merge=lfs -text
|
| 4 |
+
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
| 5 |
+
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
| 6 |
+
*.ftz filter=lfs diff=lfs merge=lfs -text
|
| 7 |
+
*.gz filter=lfs diff=lfs merge=lfs -text
|
| 8 |
+
*.h5 filter=lfs diff=lfs merge=lfs -text
|
| 9 |
+
*.joblib filter=lfs diff=lfs merge=lfs -text
|
| 10 |
+
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
| 11 |
+
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
| 12 |
+
*.model filter=lfs diff=lfs merge=lfs -text
|
| 13 |
+
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
| 14 |
+
*.npy filter=lfs diff=lfs merge=lfs -text
|
| 15 |
+
*.npz filter=lfs diff=lfs merge=lfs -text
|
| 16 |
+
*.onnx filter=lfs diff=lfs merge=lfs -text
|
| 17 |
+
*.ot filter=lfs diff=lfs merge=lfs -text
|
| 18 |
+
*.parquet filter=lfs diff=lfs merge=lfs -text
|
| 19 |
+
*.pb filter=lfs diff=lfs merge=lfs -text
|
| 20 |
+
*.pickle filter=lfs diff=lfs merge=lfs -text
|
| 21 |
+
*.pkl filter=lfs diff=lfs merge=lfs -text
|
| 22 |
+
*.pt filter=lfs diff=lfs merge=lfs -text
|
| 23 |
+
*.pth filter=lfs diff=lfs merge=lfs -text
|
| 24 |
+
*.rar filter=lfs diff=lfs merge=lfs -text
|
| 25 |
+
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
| 26 |
+
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
| 27 |
+
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
| 28 |
+
*.tar filter=lfs diff=lfs merge=lfs -text
|
| 29 |
+
*.tflite filter=lfs diff=lfs merge=lfs -text
|
| 30 |
+
*.tgz filter=lfs diff=lfs merge=lfs -text
|
| 31 |
+
*.wasm filter=lfs diff=lfs merge=lfs -text
|
| 32 |
+
*.xz filter=lfs diff=lfs merge=lfs -text
|
| 33 |
+
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
+
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
+
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
+
images/background.png filter=lfs diff=lfs merge=lfs -text
|
| 37 |
+
fine_tuning/attempt_2_informal_sinhala/training_loss.png filter=lfs diff=lfs merge=lfs -text
|
.github/workflows/sync-to-hf.yml
ADDED
|
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
name: Sync to Hugging Face Hub
|
| 2 |
+
on:
|
| 3 |
+
push:
|
| 4 |
+
branches: [main]
|
| 5 |
+
workflow_dispatch:
|
| 6 |
+
|
| 7 |
+
jobs:
|
| 8 |
+
sync-to-hub:
|
| 9 |
+
runs-on: ubuntu-latest
|
| 10 |
+
steps:
|
| 11 |
+
- uses: actions/checkout@v4
|
| 12 |
+
with:
|
| 13 |
+
fetch-depth: 1
|
| 14 |
+
lfs: true
|
| 15 |
+
|
| 16 |
+
- name: Create clean snapshot commit for HF
|
| 17 |
+
run: |
|
| 18 |
+
git lfs install
|
| 19 |
+
git config user.name "github-actions[bot]"
|
| 20 |
+
git config user.email "github-actions[bot]@users.noreply.github.com"
|
| 21 |
+
git checkout --orphan hf-sync
|
| 22 |
+
git add -A
|
| 23 |
+
git commit -m "Sync snapshot from GitHub ${GITHUB_SHA}"
|
| 24 |
+
|
| 25 |
+
- name: Force-push snapshot to Hugging Face Space
|
| 26 |
+
env:
|
| 27 |
+
HF_TOKEN: ${{ secrets.HF_TOKEN }}
|
| 28 |
+
run: git push --force https://Kalana001:$HF_TOKEN@huggingface.co/spaces/Kalana001/SinCode hf-sync:main
|
.gitignore
ADDED
|
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Ignore local dev files
|
| 2 |
+
__pycache__/
|
| 3 |
+
.venv/
|
| 4 |
+
dump/
|
| 5 |
+
misc/
|
| 6 |
+
/feedback.csv
|
| 7 |
+
*.pyc
|
| 8 |
+
*.pkl
|
| 9 |
+
!dictionary.pkl
|
| 10 |
+
*.bak
|
| 11 |
+
|
| 12 |
+
# Local dev workspace config
|
| 13 |
+
.claude/
|
| 14 |
+
SKILL.md
|
| 15 |
+
|
| 16 |
+
# Training artifacts (model weights on HF Hub, too large for git)
|
| 17 |
+
/train_mlm.py
|
| 18 |
+
train_log.txt
|
| 19 |
+
training_log_*.txt
|
| 20 |
+
xlm-roberta-sinhala/
|
| 21 |
+
xlm-roberta-sinhala-v2/
|
| 22 |
+
|
| 23 |
+
# Root-level eval files
|
| 24 |
+
/eval_diagnostics.json
|
| 25 |
+
/eval_predictions.csv
|
.streamlit/config.toml
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[theme]
|
| 2 |
+
base="dark"
|
| 3 |
+
primaryColor="#FF4B4B"
|
| 4 |
+
backgroundColor="#0E1117"
|
| 5 |
+
secondaryBackgroundColor="#262730"
|
| 6 |
+
textColor="#FAFAFA"
|
README.md
ADDED
|
@@ -0,0 +1,106 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
title: SinCode
|
| 3 |
+
emoji: π»
|
| 4 |
+
colorFrom: indigo
|
| 5 |
+
colorTo: green
|
| 6 |
+
sdk: streamlit
|
| 7 |
+
app_file: app.py
|
| 8 |
+
pinned: false
|
| 9 |
+
license: mit
|
| 10 |
+
short_description: Context-Aware Singlish-to-Sinhala Transliteration
|
| 11 |
+
sdk_version: 1.53.1
|
| 12 |
+
---
|
| 13 |
+
|
| 14 |
+
# SinCode: Neuro-Symbolic Transliteration System
|
| 15 |
+
|
| 16 |
+
> **Context-Aware Singlish-to-Sinhala Transliteration with Code-Switching Support.**
|
| 17 |
+
|
| 18 |
+
**SinCode** is a final-year research project designed to solve the ambiguity of transliterating "Singlish" (phonetic Sinhala) into native Sinhala script.
|
| 19 |
+
|
| 20 |
+
## π Key Features
|
| 21 |
+
|
| 22 |
+
* **π§ Hybrid Neuro-Symbolic Engine:** Combines the speed of rule-based logic with the contextual understanding of Deep Learning (XLM-Roberta).
|
| 23 |
+
* **π Adaptive Code-Switching:** Intelligently detects English words (e.g., *"Assignment"*, *"Presentation"*) mixed within Sinhala sentences and preserves them automatically.
|
| 24 |
+
* **π Massive Vocabulary:** Powered by an optimized dictionary of **5.9 Million** Sinhala words to ensure high-accuracy suggestions.
|
| 25 |
+
* **β‘ Contextual Disambiguation:** Resolves ambiguous terms (e.g., detecting if *"nisa"* means *because* or *near*) based on the full sentence context.
|
| 26 |
+
|
| 27 |
+
## π οΈ How to Use
|
| 28 |
+
|
| 29 |
+
1. **Type** your Singlish sentence in the input box.
|
| 30 |
+
2. Click the **Transliterate** button.
|
| 31 |
+
3. View the **Result**.
|
| 32 |
+
4. (Optional) Expand the **"See How It Works"** section to view the real-time scoring logic used by the system.
|
| 33 |
+
|
| 34 |
+
## π Baseline Evaluation (New)
|
| 35 |
+
|
| 36 |
+
Use the evaluation script to measure current model quality before making tuning changes.
|
| 37 |
+
|
| 38 |
+
### 1) Prepare dataset
|
| 39 |
+
|
| 40 |
+
Create a CSV file with columns:
|
| 41 |
+
|
| 42 |
+
- `input` (Singlish / code-mixed input)
|
| 43 |
+
- `reference` (expected Sinhala output)
|
| 44 |
+
|
| 45 |
+
You can start from `eval_dataset_template.csv`.
|
| 46 |
+
|
| 47 |
+
### 2) Run evaluation
|
| 48 |
+
|
| 49 |
+
```bash
|
| 50 |
+
python evaluation.py --dataset eval_dataset_template.csv
|
| 51 |
+
```
|
| 52 |
+
|
| 53 |
+
Optional:
|
| 54 |
+
|
| 55 |
+
```bash
|
| 56 |
+
python evaluation.py --dataset your_dataset.csv --beam-width 5 --predictions-out eval_predictions.csv --diagnostics-out eval_diagnostics.json
|
| 57 |
+
```
|
| 58 |
+
|
| 59 |
+
### 3) Outputs
|
| 60 |
+
|
| 61 |
+
- `eval_predictions.csv`: per-sample prediction + metrics
|
| 62 |
+
- `eval_diagnostics.json`: per-word candidate scoring breakdown for error analysis
|
| 63 |
+
|
| 64 |
+
Reported aggregate metrics:
|
| 65 |
+
|
| 66 |
+
- Exact match
|
| 67 |
+
- Average Character Error Rate (CER)
|
| 68 |
+
- Average token accuracy
|
| 69 |
+
- Average English code-mix preservation
|
| 70 |
+
|
| 71 |
+
## π€ Hugging Face Spaces Notes
|
| 72 |
+
|
| 73 |
+
This project is compatible with Spaces. You can configure runtime paths with environment variables:
|
| 74 |
+
|
| 75 |
+
- `SINCODE_DICTIONARY_PATH` (default: `dictionary.pkl`)
|
| 76 |
+
- `SINCODE_MODEL_NAME` (default: `FacebookAI/xlm-roberta-base`)
|
| 77 |
+
- `SINCODE_ENGLISH_CACHE` (optional path for `english_20k.txt` cache)
|
| 78 |
+
|
| 79 |
+
Example:
|
| 80 |
+
|
| 81 |
+
```bash
|
| 82 |
+
SINCODE_DICTIONARY_PATH=dictionary.pkl
|
| 83 |
+
SINCODE_MODEL_NAME=FacebookAI/xlm-roberta-base
|
| 84 |
+
```
|
| 85 |
+
|
| 86 |
+
The engine now auto-selects a writable cache path for English corpus downloads when running in restricted environments.
|
| 87 |
+
|
| 88 |
+
## ποΈ System Architecture
|
| 89 |
+
|
| 90 |
+
The system utilizes a **Tiered Decoding Strategy**:
|
| 91 |
+
1. **Tier 1 (English Filter):** Checks the Google-20k English Corpus to filter out technical terms.
|
| 92 |
+
2. **Tier 2 (Dictionary Lookup):** Scans the 5.9M word database for exact Sinhala matches.
|
| 93 |
+
3. **Tier 3 (Phonetic Rules):** Generates Sinhala text for unknown words using a rule-based engine.
|
| 94 |
+
4. **Tier 4 (Neural Ranking):** The **XLM-R** model scores all possible candidates to pick the most grammatically correct sequence.
|
| 95 |
+
|
| 96 |
+
## β οΈ Disclaimer
|
| 97 |
+
|
| 98 |
+
* While accurate for common phrases, edge cases may still exist.
|
| 99 |
+
|
| 100 |
+
---
|
| 101 |
+
**Developer:** Kalana Chandrasekara
|
| 102 |
+
|
| 103 |
+
**Supervisor:** Hiruni Samarage
|
| 104 |
+
|
| 105 |
+
|
| 106 |
+
*Module: (2025) 6COSC023C.Y Computer Science Final Project (IIT Sri Lanka)*
|
app.py
ADDED
|
@@ -0,0 +1,475 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
SinCode Web UI β Streamlit interface for the transliteration engine.
|
| 3 |
+
"""
|
| 4 |
+
|
| 5 |
+
import streamlit as st
|
| 6 |
+
import time
|
| 7 |
+
import os
|
| 8 |
+
import hmac
|
| 9 |
+
import html as html_lib
|
| 10 |
+
import base64
|
| 11 |
+
from streamlit.errors import StreamlitSecretNotFoundError
|
| 12 |
+
from PIL import Image
|
| 13 |
+
from feedback_store import FeedbackStore, format_feedback_error
|
| 14 |
+
from sincode_model import BeamSearchDecoder
|
| 15 |
+
|
| 16 |
+
st.set_page_config(page_title="ΰ·ΰ·ΰΆCode", page_icon="π±π°", layout="centered")
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
# βββ Helpers βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 20 |
+
|
| 21 |
+
@st.cache_data
def _background_css(image_file: str) -> str:
    """Build the <style> tag that paints *image_file* as a dimmed, fixed page background.

    Returns an empty string when the image is missing so callers can skip
    injection entirely. Cached so the file is read and encoded only once.
    """
    try:
        with open(image_file, "rb") as fh:
            encoded = base64.b64encode(fh.read()).decode()
    except FileNotFoundError:
        # Missing background image is non-fatal; the app renders without it.
        return ""
    rules = (
        "background-image: linear-gradient(rgba(0,0,0,0.7),"
        f"rgba(0,0,0,0.7)),url(data:image/png;base64,{encoded});"
        "background-size:cover;background-position:center;"
        "background-attachment:fixed;"
    )
    return "<style>.stApp {" + rules + "}</style>"
| 35 |
+
|
| 36 |
+
|
| 37 |
+
def _set_background(image_file: str) -> None:
    """Inject the page-background CSS for *image_file*; a no-op when the image is absent."""
    style_tag = _background_css(image_file)
    if not style_tag:
        return
    st.markdown(style_tag, unsafe_allow_html=True)
| 41 |
+
|
| 42 |
+
|
| 43 |
+
@st.cache_data
def _load_logo(image_file: str):
    """Open the sidebar logo image, cached so reruns skip the disk read."""
    logo = Image.open(image_file)
    return logo
| 46 |
+
|
| 47 |
+
|
| 48 |
+
def _secret_or_env(name: str, default: str = "") -> str:
    """Resolve a config value: Streamlit secrets first, then the environment.

    Returns *default* when the name is found in neither place.
    """
    try:
        if name in st.secrets:
            value = st.secrets[name]
            return str(value)
    except StreamlitSecretNotFoundError:
        # secrets.toml may be absent on local runs; fall through to env vars.
        pass
    return os.getenv(name, default)
| 56 |
+
|
| 57 |
+
|
| 58 |
+
@st.cache_resource
def _load_feedback_store() -> FeedbackStore:
    """Construct the shared feedback backend from Supabase secrets/env config (cached)."""
    config = {
        "supabase_url": _secret_or_env("SUPABASE_URL"),
        "supabase_anon_key": _secret_or_env("SUPABASE_ANON_KEY"),
        "supabase_service_key": _secret_or_env("SUPABASE_SERVICE_ROLE_KEY"),
        "table_name": _secret_or_env("SUPABASE_FEEDBACK_TABLE", "feedback_submissions"),
    }
    return FeedbackStore(**config)
| 66 |
+
|
| 67 |
+
|
| 68 |
+
def _admin_credentials_configured() -> bool:
    """True when both ADMIN_USERNAME and ADMIN_PASSWORD are configured somewhere."""
    username = _secret_or_env("ADMIN_USERNAME")
    password = _secret_or_env("ADMIN_PASSWORD")
    return bool(username and password)
| 70 |
+
|
| 71 |
+
|
| 72 |
+
def _authenticate_admin(username: str, password: str) -> bool:
    """Check the supplied admin credentials against the configured ones.

    Returns False when credentials are not configured at all, so an empty
    config never authenticates anyone.
    """
    expected_username = _secret_or_env("ADMIN_USERNAME")
    expected_password = _secret_or_env("ADMIN_PASSWORD")
    if not (expected_username and expected_password):
        return False
    # Evaluate BOTH constant-time comparisons unconditionally. The original
    # chained them with `and`, so a username mismatch short-circuited past the
    # password check and leaked (via timing) which field was wrong — defeating
    # the point of using hmac.compare_digest.
    username_ok = hmac.compare_digest(username, expected_username)
    password_ok = hmac.compare_digest(password, expected_password)
    return username_ok and password_ok
| 81 |
+
|
| 82 |
+
|
| 83 |
+
def _save_feedback(
    input_sentence: str,
    original_output: str,
    corrected_output: str,
    user_comment: str,
    decode_mode: str,
) -> None:
    """Persist one user correction through the shared feedback store."""
    store = _load_feedback_store()
    store.save_submission(
        input_sentence=input_sentence,
        original_output=original_output,
        corrected_output=corrected_output,
        user_comment=user_comment,
        decode_mode=decode_mode,
    )
| 97 |
+
|
| 98 |
+
|
| 99 |
+
@st.dialog("Admin Login")
def _show_admin_login_dialog(store: FeedbackStore) -> None:
    """Modal login form; on success marks the session as admin and reruns."""
    st.caption(f"Feedback storage: {store.backend_label}")

    if not _admin_credentials_configured():
        # Nothing to authenticate against — show a notice and bail out.
        st.info("Admin credentials are not configured.")
        if st.button("Close", use_container_width=True):
            st.rerun()
        return

    username = st.text_input("Username", key="admin_username")
    password = st.text_input("Password", type="password", key="admin_password")

    login_col, cancel_col = st.columns(2)
    if login_col.button("Login", type="primary", use_container_width=True):
        if _authenticate_admin(username, password):
            st.session_state["admin_authenticated"] = True
            st.session_state["show_admin_panel"] = True
            # st.rerun() interrupts the script, so on success the error
            # message below is never reached.
            st.rerun()
        st.error("Invalid admin credentials.")

    if cancel_col.button("Cancel", use_container_width=True):
        st.rerun()
| 122 |
+
|
| 123 |
+
|
| 124 |
+
def _render_admin_panel(store: FeedbackStore) -> None:
    """Admin-only review screen: list, filter, and triage feedback submissions.

    Fixes over the original:
    - Search no longer crashes when the backend stores an explicit ``None`` in
      an optional text column: ``row.get(key, "")`` returns the stored ``None``
      (the default only applies to *missing* keys) and ``None.casefold()``
      raises ``AttributeError``. All searchable fields now go through
      ``(value or "")``.
    - The three identical approve/reject/pending button branches are collapsed
      into one data-driven loop (same keys, labels, statuses, and toasts).
    """
    st.title("Feedback Review")
    st.caption("Review submitted corrections, approve useful examples, and export them later for future retraining.")

    # Navigation: leave the panel, or drop the admin session entirely.
    panel_controls = st.columns([1, 1, 4])
    if panel_controls[0].button("Back", use_container_width=True):
        st.session_state["show_admin_panel"] = False
        st.rerun()
    if panel_controls[1].button("Log Out", use_container_width=True):
        st.session_state["admin_authenticated"] = False
        st.session_state["show_admin_panel"] = False
        st.rerun()

    try:
        all_rows = store.list_submissions(review_status=None, limit=500)
    except Exception as exc:
        st.error(f"Could not load feedback records: {format_feedback_error(exc)}")
        return

    # Status summary metrics across everything fetched (not just filtered rows).
    pending_count = sum(1 for row in all_rows if row.get("review_status") == "pending")
    approved_count = sum(1 for row in all_rows if row.get("review_status") == "approved")
    rejected_count = sum(1 for row in all_rows if row.get("review_status") == "rejected")

    metric_cols = st.columns(3)
    metric_cols[0].metric("Pending", pending_count)
    metric_cols[1].metric("Approved", approved_count)
    metric_cols[2].metric("Rejected", rejected_count)

    # Filter bar: status, row cap, free-text search.
    filter_cols = st.columns([1, 1, 2])
    status_filter = filter_cols[0].selectbox(
        "Status",
        options=["pending", "approved", "rejected", "all"],
        index=0,
    )
    row_limit = filter_cols[1].selectbox("Rows", options=[25, 50, 100, 200], index=1)
    search_term = filter_cols[2].text_input("Search", placeholder="Search input, output, or note")

    filtered_rows = all_rows
    if status_filter != "all":
        filtered_rows = [row for row in filtered_rows if row.get("review_status") == status_filter]

    if search_term:
        needle = search_term.casefold()
        searchable_keys = (
            "input_sentence",
            "original_output",
            "corrected_output",
            "user_comment",
            "admin_notes",
        )
        # `or ""` guards against explicit None values stored by the backend.
        filtered_rows = [
            row
            for row in filtered_rows
            if any(needle in (row.get(key) or "").casefold() for key in searchable_keys)
        ]

    filtered_rows = filtered_rows[:row_limit]

    if not filtered_rows:
        st.info("No feedback matches the current filters.")
        return

    for row in filtered_rows:
        with st.container(border=True):
            meta_cols = st.columns([2, 1, 1])
            meta_cols[0].caption(f"Submitted: {row.get('created_at', 'unknown')}")
            meta_cols[1].caption(f"Mode: {row.get('decode_mode') or 'n/a'}")
            meta_cols[2].caption(f"Status: {row.get('review_status', 'pending')}")

            st.markdown("**Input (Singlish)**")
            st.code(row.get("input_sentence", ""), language=None)
            st.markdown("**Model Output**")
            st.code(row.get("original_output", ""), language=None)
            st.markdown("**User Correction**")
            st.code(row.get("corrected_output", ""), language=None)

            if row.get("user_comment"):
                st.markdown("**User Note**")
                st.write(row["user_comment"])

            notes_key = f"admin_notes_{row['id']}"
            # `or ""` so a stored None does not reach st.text_area as a value.
            notes_value = st.text_area(
                "Admin Notes",
                value=row.get("admin_notes", "") or "",
                key=notes_key,
                height=80,
            )

            # One button per review state; all share the update/toast/rerun flow.
            # Key prefixes match the original per-button keys exactly.
            status_actions = (
                ("Approve", "approve", "approved", "Feedback approved."),
                ("Reject", "reject", "rejected", "Feedback rejected."),
                ("Mark Pending", "pending", "pending", "Feedback returned to pending."),
            )
            action_cols = st.columns(3)
            for col, (label, key_prefix, new_status, toast_message) in zip(action_cols, status_actions):
                if col.button(label, key=f"{key_prefix}_{row['id']}", use_container_width=True):
                    try:
                        store.update_submission_status(str(row["id"]), new_status, notes_value)
                        st.toast(toast_message)
                        # NOTE(review): on current Streamlit, st.rerun() raises a
                        # BaseException-derived control exception, so the
                        # `except Exception` below does not intercept it —
                        # confirm if targeting older Streamlit versions.
                        st.rerun()
                    except Exception as exc:
                        st.error(f"Could not update feedback: {format_feedback_error(exc)}")
| 231 |
+
|
| 232 |
+
|
| 233 |
+
@st.cache_resource
def _load_decoder() -> BeamSearchDecoder:
    """Build the transliteration engine once and reuse it across reruns.

    Honors SINCODE_MODEL_NAME and SINCODE_DICTIONARY_PATH environment overrides.
    """
    dict_path = os.getenv("SINCODE_DICTIONARY_PATH", "dictionary.pkl")
    model_name = os.getenv("SINCODE_MODEL_NAME")
    kwargs = {"dictionary_path": dict_path}
    if model_name:
        # Only pass model_name when the env override is set so the decoder's
        # own default model applies otherwise.
        kwargs["model_name"] = model_name
    return BeamSearchDecoder(**kwargs)
| 241 |
+
|
| 242 |
+
|
| 243 |
+
# ─── Layout ──────────────────────────────────────────────────────────────────

# Paint the fixed page background (no-op if the image file is missing).
_set_background("images/background.png")

# Shared feedback backend (cached resource; reused by the admin panel too).
feedback_store = _load_feedback_store()

with st.sidebar:
    st.image(_load_logo("images/SinCodeLogo.jpg"), width=200)
    # NOTE(review): leading chars are mojibake of Sinhala "සිං" from extraction;
    # left byte-identical because this is a runtime string.
    st.title("ΰ·ΰ·ΰΆCode Project")
    st.info("6COSC023C.Y Final Project")

    # Decode-mode selector; `decode_mode` is read by the transliterate handler
    # and the result caption further down the script.
    st.markdown("### βοΈ Settings")
    decode_mode = st.radio(
        "Decode Mode",
        options=["greedy", "beam"],
        index=0,
        help=(
            "**Greedy** (recommended) β Faster and more accurate. Picks the "
            "best candidate at each step using real context.\n\n"
            "**Beam** β Explores multiple paths but uses fixed context, "
            "so results are similar with more computation."
        ),
    )

    # Static architecture summary shown to every visitor.
    st.markdown("### π Architecture")
    st.success(
        "**Hybrid Neuro-Symbolic Engine**\n\n"
        "XLM-R contextual scoring (55%) "
        "+ transliteration fidelity (45%).\n\n"
        "**Common Word Overrides** β "
        "Curated table for high-frequency unambiguous words.\n\n"
        "**Adaptive Code-Switching** β "
        "Preserves English words in mixed input.\n\n"
        "**Contextual Disambiguation** β "
        "Resolves ambiguity via sentence-level probability."
    )
    st.markdown("---")
    st.write("Β© 2026 Kalana Chandrasekara")

    # Warn when Supabase secrets are absent and feedback runs in offline mode.
    # assumes this warning belongs inside the sidebar block — TODO confirm
    # original indentation (diff rendering loses it).
    if not feedback_store.is_remote_enabled:
        st.warning("Feedback storage is offline. Set Supabase secrets to enable submissions.")
+
# Page header with a compact admin entry point on the right.
header_cols = st.columns([6, 1])
with header_cols[0]:
    st.title("ΰ·ΰ·ΰΆCode: Context-Aware Transliteration")
with header_cols[1]:
    if st.session_state.get("admin_authenticated", False):
        # Already logged in: jump straight to the review panel.
        if st.button("Admin", use_container_width=True, key="open_admin_panel"):
            st.session_state["show_admin_panel"] = True
            st.rerun()
    else:
        # Not logged in: open the modal login dialog.
        if st.button("Login", use_container_width=True, key="open_admin_login"):
            _show_admin_login_dialog(feedback_store)

st.markdown(
    "Type Singlish sentences below. "
    "The system handles **code-mixing**, **ambiguity**, and **punctuation**."
)

# Admin panel replaces the whole page; st.stop() prevents the public UI
# from rendering underneath it.
if st.session_state.get("show_admin_panel", False):
    _render_admin_panel(feedback_store)
    st.stop()

input_text = st.text_area(
    "Input Text", height=100, placeholder="e.g., Singlish sentences type krnna"
)
| 309 |
+
|
| 310 |
+
# Run the engine when the button is pressed and input is non-empty.
if st.button("Transliterate", type="primary", use_container_width=True) and input_text:
    try:
        with st.spinner("Processing..."):
            decoder = _load_decoder()
            t0 = time.time()
            if decode_mode == "greedy":
                result, trace_logs, diagnostics = decoder.greedy_decode_with_diagnostics(input_text)
            else:
                result, trace_logs, diagnostics = decoder.decode_with_diagnostics(input_text)
            elapsed = time.time() - t0

            # Store results in session state for interactive word swapping.
            # NOTE: `result` itself is not used below — the rendered output is
            # rebuilt from the per-word selections so swaps persist across reruns.
            selected = [d.selected_candidate for d in diagnostics]
            st.session_state["diagnostics"] = diagnostics
            st.session_state["output_words"] = selected
            st.session_state["original_words"] = list(selected)
            st.session_state["input_sentence"] = input_text
            st.session_state["trace_logs"] = trace_logs
            st.session_state["elapsed"] = elapsed
            # Reset correction UI state for the fresh result.
            st.session_state["correction_mode"] = False
            st.session_state["correction_submitted_for"] = None
            st.session_state["feedback_comment"] = ""

    except Exception as e:
        st.error(f"Error: {e}")
| 336 |
+
# ─── Render output (persists across reruns for word swapping) ────────────────

if "output_words" in st.session_state and st.session_state["output_words"]:
    # Pull the last decode result; output_words may already contain user swaps.
    diagnostics = st.session_state["diagnostics"]
    output_words = st.session_state["output_words"]
    original_words = st.session_state.get("original_words", list(output_words))
    trace_logs = st.session_state["trace_logs"]
    elapsed = st.session_state["elapsed"]

    current_result = " ".join(output_words)
    original_result = " ".join(original_words)
    has_changes = output_words != original_words

    st.success("Transliteration Complete")

    # Output display with native copy button (st.code has built-in clipboard support)
    safe_display = html_lib.escape(current_result)
    st.markdown(
        f'<span style="font-size:1.4em;font-weight:700;">{safe_display}</span>',
        unsafe_allow_html=True,
    )
    st.code(current_result, language=None)
    st.caption(f"Mode: {decode_mode} Β· Time: {round(elapsed, 2)}s")

    # ── Correction mode toggle ────────────────────────────────────────
    correction_mode = st.toggle(
        "Correct this translation",
        value=st.session_state.get("correction_mode", False),
        key="correction_toggle",
    )

    if correction_mode:
        st.caption("Use the buttons below to swap alternative transliterations.")

        # ── Inline sentence display (natural text flow, no grid) ─────
        # Color coding: green = user-swapped word, blue dashed = has alternatives.
        word_spans = []
        for i, diag in enumerate(diagnostics):
            has_alts = len(diag.candidate_breakdown) > 1
            was_changed = output_words[i] != original_words[i]
            w = html_lib.escape(output_words[i])
            if was_changed:
                word_spans.append(
                    f'<span style="color:#68d391;font-weight:700;">{w} β</span>'
                )
            elif has_alts:
                word_spans.append(
                    f'<span style="color:#63b3ed;font-weight:700;'
                    f'border-bottom:2px dashed #63b3ed;cursor:default;">{w}</span>'
                )
            else:
                word_spans.append(f'<span style="font-weight:600;">{w}</span>')

        st.markdown(
            '<div style="font-size:1.15em;line-height:2.4;">'
            + "   ".join(word_spans)
            + "</div>",
            unsafe_allow_html=True,
        )
        # ── Popover buttons only for swappable words ─────────────────
        swappable = [
            (i, diag)
            for i, diag in enumerate(diagnostics)
            if len(diag.candidate_breakdown) > 1
        ]
        if swappable:
            # Column widths roughly track word length so chips line up.
            widths = [max(len(output_words[i]), 3) for i, _ in swappable]
            cols = st.columns(widths, gap="small")

            for col, (i, diag) in zip(cols, swappable):
                was_changed = output_words[i] != original_words[i]
                with col:
                    chip = (
                        f":green[**{output_words[i]}**] β"
                        if was_changed
                        else f":blue[**{output_words[i]}**]"
                    )
                    with st.popover(chip, use_container_width=True):
                        st.markdown(f"**`{diag.input_word}`** β pick alternative:")
                        for scored in diag.candidate_breakdown[:5]:
                            eng_tag = " π€" if scored.is_english else ""
                            is_sel = scored.text == output_words[i]
                            # "✅ " marks the currently selected candidate.
                            # NOTE(review): this literal was split by a NEL byte
                            # in the extraction; restored as U+2705 — confirm
                            # against the original file.
                            if st.button(
                                f"{'✅ ' if is_sel else ''}{scored.text}{eng_tag}",
                                key=f"alt_{i}_{scored.text}",
                                help=f"Score: {scored.combined_score:.2f}",
                                use_container_width=True,
                                type="primary" if is_sel else "secondary",
                            ):
                                st.session_state["output_words"][i] = scored.text
                                st.rerun()
                        st.markdown("---")
                        custom = st.text_input(
                            "Not listed? Type correct word:",
                            key=f"custom_{i}",
                            placeholder="Type Sinhala word",
                        )
                        if custom and st.button(
                            "Use this", key=f"custom_apply_{i}", use_container_width=True
                        ):
                            st.session_state["output_words"][i] = custom
                            st.rerun()

    # ── Submit correction button (only when changes exist, once per result) ──
    # Guard key: (original sentence, original output) — stable regardless of swaps
    submit_key = (st.session_state["input_sentence"], original_result)
    already_submitted = st.session_state.get("correction_submitted_for") == submit_key
    if has_changes and not already_submitted:
        st.info(f"**Original:** {original_result}\n\n**Corrected:** {current_result}")
        feedback_comment = st.text_area(
            "Optional note for reviewers",
            key="feedback_comment",
            placeholder="Example: The word 'kalaya' should mean time in this context.",
        )
        if st.button("Submit Correction", type="primary", use_container_width=True):
            try:
                _save_feedback(
                    input_sentence=st.session_state["input_sentence"],
                    original_output=original_result,
                    corrected_output=current_result,
                    user_comment=feedback_comment,
                    decode_mode=decode_mode,
                )
                # Record the guard key so this result can't be submitted twice.
                st.session_state["correction_submitted_for"] = submit_key
                st.session_state["correction_mode"] = False
                st.toast("Correction submitted for review β thank you!")
                st.rerun()
            except Exception as exc:
                st.error(f"Could not submit feedback: {format_feedback_error(exc)}")

    # Show outside toggle so it remains visible after submission closes the toggle
    input_sent = st.session_state.get("input_sentence", "")
    if st.session_state.get("correction_submitted_for") == (input_sent, original_result):
        st.success("Correction already submitted.")

    with st.expander("Scoring Breakdown", expanded=False):
        st.caption(
            "MLM = contextual fit Β· Fid = transliteration fidelity Β· "
            "Rank = dictionary prior Β· π€ = English"
        )
        st.markdown("\n\n---\n\n".join(trace_logs))
core/__init__.py
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
SinCode core package β modular transliteration engine components.
|
| 3 |
+
"""
|
core/constants.py
ADDED
|
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
Configuration constants and hyperparameters for the SinCode engine.

All tunable knobs for the transliteration pipeline live in this module so
that experiments only need to edit one place. The scoring weights and
decoding parameters are consumed by core.scorer and core.decoder.
"""

import re

# --- Model & Data Paths -----------------------------------------------------

# Earlier baseline checkpoint, kept for reference:
# DEFAULT_MODEL_NAME = "FacebookAI/xlm-roberta-base"
DEFAULT_MODEL_NAME = "Kalana001/xlm-roberta-sinhala-sincode"  # HF Hub checkpoint used by default
DEFAULT_DICTIONARY_PATH = "dictionary.pkl"  # pickled Singlish->Sinhala dictionary

# Frequency-ordered English word list; used to detect code-switched English
# tokens that should be preserved rather than transliterated.
ENGLISH_CORPUS_URL = (
    "https://raw.githubusercontent.com/first20hours/google-10000-english/master/20k.txt"
)

# --- Scoring Weights (tunable hyperparameters) ------------------------------

W_MLM: float = 0.55       # Contextual language model probability
W_FIDELITY: float = 0.45  # Source-aware transliteration fidelity
W_RANK: float = 0.00      # Dictionary rank prior (disabled -- dict is unordered)

# --- Decoding Parameters ----------------------------------------------------

MAX_CANDIDATES: int = 8          # Max candidates per word position
DEFAULT_BEAM_WIDTH: int = 5      # Beam search width
FIDELITY_SCALE: float = 10.0     # Edit-distance penalty multiplier
DICT_FIDELITY_DAMP: float = 2.0  # Decay rate for dict bonus (higher = stricter filter)
MIN_ENGLISH_LEN: int = 3         # Min word length for 20k-corpus English detection

# --- Unicode Constants ------------------------------------------------------

SINHALA_VIRAMA: str = '\u0DCA'  # Sinhala virama (hal) character
ZWJ: str = '\u200D'             # Zero-width joiner (for conjuncts)

# --- Regex ------------------------------------------------------------------

# Splits a token into (leading punctuation, core word, trailing punctuation);
# always matches, so .match(...).groups() is safe on any string.
PUNCT_PATTERN = re.compile(r"^(\W*)(.*?)(\W*)$")
|
core/decoder.py
ADDED
|
@@ -0,0 +1,773 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Beam search and greedy decoders for Singlish β Sinhala transliteration.
|
| 3 |
+
"""
|
| 4 |
+
|
| 5 |
+
import math
|
| 6 |
+
import re
|
| 7 |
+
import torch
|
| 8 |
+
import pickle
|
| 9 |
+
import logging
|
| 10 |
+
from typing import List, Tuple, Dict, Optional, Set
|
| 11 |
+
|
| 12 |
+
from transformers import AutoTokenizer, AutoModelForMaskedLM
|
| 13 |
+
|
| 14 |
+
from core.constants import (
|
| 15 |
+
DEFAULT_MODEL_NAME, DEFAULT_DICTIONARY_PATH,
|
| 16 |
+
DEFAULT_BEAM_WIDTH, MAX_CANDIDATES, MIN_ENGLISH_LEN,
|
| 17 |
+
PUNCT_PATTERN,
|
| 18 |
+
)
|
| 19 |
+
from core.mappings import COMMON_WORDS, CONTEXT_WORDS_STANDALONE
|
| 20 |
+
from core.english import ENGLISH_VOCAB
|
| 21 |
+
from core.scorer import CandidateScorer, ScoredCandidate, WordDiagnostic
|
| 22 |
+
from core.dictionary import DictionaryAdapter
|
| 23 |
+
|
| 24 |
+
logger = logging.getLogger(__name__)
|
| 25 |
+
|
| 26 |
+
# Sinhala Unicode block: U+0D80 β U+0DFF
|
| 27 |
+
_SINHALA_RE = re.compile(r"[\u0D80-\u0DFF]")
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
def _is_sinhala(text: str) -> bool:
|
| 31 |
+
"""Return True if the text already contains Sinhala script characters."""
|
| 32 |
+
return bool(_SINHALA_RE.search(text))
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
class BeamSearchDecoder:
|
| 36 |
+
"""
|
| 37 |
+
Contextual beam-search decoder for Singlish β Sinhala transliteration.
|
| 38 |
+
|
| 39 |
+
For each word position the decoder:
|
| 40 |
+
1. Generates candidates (dictionary + rule engine)
|
| 41 |
+
2. Scores them with XLM-R MLM in sentence context
|
| 42 |
+
3. Combines MLM score with fidelity & rank via CandidateScorer
|
| 43 |
+
4. Prunes to the top-k (beam width) hypotheses
|
| 44 |
+
"""
|
| 45 |
+
|
| 46 |
+
def __init__(
|
| 47 |
+
self,
|
| 48 |
+
model_name: str = DEFAULT_MODEL_NAME,
|
| 49 |
+
dictionary_path: str = DEFAULT_DICTIONARY_PATH,
|
| 50 |
+
device: Optional[str] = None,
|
| 51 |
+
):
|
| 52 |
+
self.device = device or ("cuda" if torch.cuda.is_available() else "cpu")
|
| 53 |
+
|
| 54 |
+
logger.info("Loading tokenizer & model: %s", model_name)
|
| 55 |
+
self.tokenizer = AutoTokenizer.from_pretrained(model_name)
|
| 56 |
+
self.model = AutoModelForMaskedLM.from_pretrained(model_name)
|
| 57 |
+
self.model.to(self.device)
|
| 58 |
+
self.model.eval()
|
| 59 |
+
|
| 60 |
+
logger.info("Loading dictionary: %s", dictionary_path)
|
| 61 |
+
with open(dictionary_path, "rb") as f:
|
| 62 |
+
d_data = pickle.load(f)
|
| 63 |
+
self.adapter = DictionaryAdapter(d_data)
|
| 64 |
+
self.scorer = CandidateScorer()
|
| 65 |
+
|
| 66 |
+
# ββ Normalization βββββββββββββββββββββββββββββββββββββββββββββ
|
| 67 |
+
|
| 68 |
+
@staticmethod
|
| 69 |
+
def _softmax_normalize(raw_scores: List[float]) -> List[float]:
|
| 70 |
+
"""
|
| 71 |
+
Normalize raw log-probability scores to [0, 1] via softmax.
|
| 72 |
+
|
| 73 |
+
Unlike min-max (which maps bestβ1.0, worstβ0.0 regardless of
|
| 74 |
+
the actual difference), softmax preserves the model's relative
|
| 75 |
+
confidence. When all candidates have similar log-probs the
|
| 76 |
+
output values cluster together; when the model is very
|
| 77 |
+
confident they spread apart.
|
| 78 |
+
|
| 79 |
+
The raw scores are already log-probs (negative), so we use
|
| 80 |
+
them directly as logits for softmax.
|
| 81 |
+
"""
|
| 82 |
+
if not raw_scores:
|
| 83 |
+
return []
|
| 84 |
+
if len(raw_scores) == 1:
|
| 85 |
+
return [1.0]
|
| 86 |
+
|
| 87 |
+
# Subtract max for numerical stability (standard log-sum-exp trick)
|
| 88 |
+
max_s = max(raw_scores)
|
| 89 |
+
exps = [math.exp(s - max_s) for s in raw_scores]
|
| 90 |
+
total = sum(exps)
|
| 91 |
+
return [e / total for e in exps]
|
| 92 |
+
|
| 93 |
+
# ββ MLM batch scoring ββββββββββββββββββββββββββββββββββββββββββββ
|
| 94 |
+
|
| 95 |
+
    def _batch_mlm_score(
        self,
        left_contexts: List[str],
        right_contexts: List[str],
        candidates: List[str],
    ) -> List[float]:
        """
        Score each candidate using masked LM log-probability with proper
        multi-mask scoring for multi-subword candidates.

        Instead of placing a single <mask> and summing subword log-probs
        at that one position, this method creates one <mask> per subword
        token and scores each subword at its own position:

        score = (1/N) * Ξ£_i log P(t_i | mask_position_i, context)

        Args:
            left_contexts: Per-candidate text appearing before the slot
                (parallel to ``candidates``).
            right_contexts: Per-candidate text appearing after the slot
                (parallel to ``candidates``).
            candidates: Candidate strings to score at the masked position.

        Returns:
            One length-normalised log-probability per candidate; -100.0
            when a candidate's masks were truncated away entirely.
        """
        if not candidates:
            return []

        mask = self.tokenizer.mask_token
        mask_token_id = self.tokenizer.mask_token_id

        # Pre-tokenize every candidate to determine subword count
        cand_token_ids: List[List[int]] = []
        for c in candidates:
            ids = self.tokenizer.encode(c, add_special_tokens=False)
            # Unencodable candidates fall back to a single UNK so each
            # entry still contributes exactly one or more mask slots.
            cand_token_ids.append(ids if ids else [self.tokenizer.unk_token_id])

        # Build context strings with the correct number of <mask> tokens
        batch_texts: List[str] = []
        for i in range(len(candidates)):
            n_masks = len(cand_token_ids[i])
            mask_str = " ".join([mask] * n_masks)
            # Drop empty context halves so we never join stray spaces.
            parts = [p for p in [left_contexts[i], mask_str, right_contexts[i]] if p]
            batch_texts.append(" ".join(parts))

        inputs = self.tokenizer(
            batch_texts,
            return_tensors="pt",
            padding=True,
            truncation=True,
        ).to(self.device)

        # Single forward pass over the whole candidate batch.
        with torch.no_grad():
            logits = self.model(**inputs).logits

        scores: List[float] = []
        for i, target_ids in enumerate(cand_token_ids):
            token_ids = inputs.input_ids[i]
            # Positions of the <mask> tokens inside this row.
            mask_positions = (token_ids == mask_token_id).nonzero(as_tuple=True)[0]

            if mask_positions.numel() == 0 or not target_ids:
                # All masks truncated away: sentinel "very unlikely" score.
                scores.append(-100.0)
                continue

            # Score each subword at its corresponding mask position.
            # Truncation may leave fewer masks than target subwords, so
            # only the first n aligned pairs are scored.
            n = min(len(target_ids), mask_positions.numel())
            total = 0.0
            for j in range(n):
                pos = mask_positions[j].item()
                log_probs = torch.log_softmax(logits[i, pos, :], dim=0)
                total += log_probs[target_ids[j]].item()

            # Length-normalise so multi-subword candidates stay comparable
            # with single-subword ones.
            scores.append(total / n)

        return scores
|
| 161 |
+
|
| 162 |
+
# ββ Main decode entry-point ββββββββββββββββββββββββββββββββββββββ
|
| 163 |
+
|
| 164 |
+
def decode(
|
| 165 |
+
self,
|
| 166 |
+
sentence: str,
|
| 167 |
+
beam_width: int = DEFAULT_BEAM_WIDTH,
|
| 168 |
+
mode: str = "greedy",
|
| 169 |
+
) -> Tuple[str, List[str]]:
|
| 170 |
+
"""
|
| 171 |
+
Transliterate a full Singlish sentence into Sinhala script.
|
| 172 |
+
|
| 173 |
+
Args:
|
| 174 |
+
mode: "greedy" (accurate, uses dynamic context) or
|
| 175 |
+
"beam" (uses fixed rule-based context)
|
| 176 |
+
|
| 177 |
+
Returns:
|
| 178 |
+
result β the best transliteration string
|
| 179 |
+
trace_logs β per-step markdown logs for the debug UI
|
| 180 |
+
"""
|
| 181 |
+
if mode == "greedy":
|
| 182 |
+
result, trace_logs, _ = self.greedy_decode_with_diagnostics(sentence)
|
| 183 |
+
else:
|
| 184 |
+
result, trace_logs, _ = self.decode_with_diagnostics(
|
| 185 |
+
sentence=sentence,
|
| 186 |
+
beam_width=beam_width,
|
| 187 |
+
)
|
| 188 |
+
return result, trace_logs
|
| 189 |
+
|
| 190 |
+
# ββ Greedy decode (dynamic context β more accurate) ββββββββββββββ
|
| 191 |
+
|
| 192 |
+
    def greedy_decode_with_diagnostics(
        self,
        sentence: str,
    ) -> Tuple[str, List[str], List[WordDiagnostic]]:
        """
        Greedy word-by-word decode using actual selected outputs as
        left context for subsequent MLM scoring.

        More accurate than beam search with fixed context because XLM-R
        sees the real transliteration built so far, not rule-based guesses.

        Args:
            sentence: Whitespace-separated Singlish input.

        Returns:
            result: The full transliterated sentence.
            trace_logs: Per-step markdown logs for the debug UI.
            diagnostics: One WordDiagnostic per input word.
        """
        words = sentence.split()
        if not words:
            return "", [], []

        # -- Phase 1: candidate generation (same as beam) -----------------
        word_infos: List[dict] = []

        for raw in words:
            # PUNCT_PATTERN always matches; split off surrounding punctuation
            # so only the core word is transliterated.
            match = PUNCT_PATTERN.match(raw)
            prefix, core, suffix = match.groups() if match else ("", raw, "")

            if not core:
                # Pure punctuation/symbol token: keep verbatim.
                word_infos.append({
                    "candidates": [raw],
                    "rule_output": raw,
                    "english_flags": [False],
                    "dict_flags": [False],
                    "prefix": prefix,
                    "suffix": suffix,
                    "sinhala_passthrough": False,
                })
                continue

            # Already-Sinhala text: pass through unchanged
            if _is_sinhala(core):
                word_infos.append({
                    "candidates": [raw],
                    "rule_output": raw,
                    "english_flags": [False],
                    "dict_flags": [False],
                    "prefix": prefix,
                    "suffix": suffix,
                    "sinhala_passthrough": True,
                })
                continue

            rule_output = self.adapter.get_rule_output(core)
            cands = self.adapter.get_candidates(core, rule_output)

            # Exact-case lookup first, then lowercase fallback.
            dict_entries: Set[str] = set()
            if core in self.adapter.dictionary:
                dict_entries.update(self.adapter.dictionary[core])
            elif core.lower() in self.adapter.dictionary:
                dict_entries.update(self.adapter.dictionary[core.lower()])

            # Ensure the rule output is always among the candidates.
            if rule_output and rule_output not in cands:
                cands.append(rule_output)
            if not cands:
                cands = [rule_output]

            english_flags = [c.lower() in ENGLISH_VOCAB for c in cands]
            dict_flags = [c in dict_entries for c in cands]

            # Re-attach the stripped punctuation to every candidate.
            full_cands = [prefix + c + suffix for c in cands]

            word_infos.append({
                "candidates": full_cands[:MAX_CANDIDATES],
                "rule_output": prefix + rule_output + suffix,
                "core_rule_output": rule_output,
                "n_dict_entries": len(dict_entries),
                "dict_entries": dict_entries,
                "english_flags": english_flags[:MAX_CANDIDATES],
                "dict_flags": dict_flags[:MAX_CANDIDATES],
                "prefix": prefix,
                "suffix": suffix,
                "sinhala_passthrough": False,
            })

        # Build right-side stable context (rule outputs for future words).
        # Prefer an English candidate when one exists so code-switched words
        # appear in the context as English.
        stable_right: List[str] = []
        for info in word_infos:
            eng_cands = [
                c for c, e in zip(info["candidates"], info["english_flags"]) if e
            ]
            stable_right.append(
                eng_cands[0] if eng_cands else info["rule_output"]
            )

        # -- Phase 2: greedy word-by-word with dynamic left context -------
        selected_words: List[str] = []
        trace_logs: List[str] = []
        diagnostics: List[WordDiagnostic] = []

        for t, info in enumerate(word_infos):
            candidates = info["candidates"]
            eng_flags = info["english_flags"]
            d_flags = info.get("dict_flags", [False] * len(candidates))
            rule_out = info["rule_output"]
            prefix = info.get("prefix", "")
            suffix = info.get("suffix", "")
            total_cands = len(candidates)

            # -- Sinhala passthrough --------------------------------------
            if info.get("sinhala_passthrough"):
                selected_words.append(words[t])
                trace_logs.append(
                    f"**Step {t + 1}: `{words[t]}`** β "
                    f"`{words[t]}` (Sinhala passthrough)\n"
                )
                diagnostics.append(WordDiagnostic(
                    step_index=t,
                    input_word=words[t],
                    rule_output=rule_out,
                    selected_candidate=words[t],
                    beam_score=0.0,
                    candidate_breakdown=[],
                ))
                continue

            # -- Common-word shortcut -------------------------------------
            # NOTE: this lowers/strips the whole raw token, so a COMMON_WORDS
            # hit skips MLM scoring entirely.
            core_lower = words[t].lower().strip()
            if core_lower in COMMON_WORDS:
                override = prefix + COMMON_WORDS[core_lower] + suffix
                selected_words.append(override)
                trace_logs.append(
                    f"**Step {t + 1}: `{words[t]}`** β "
                    f"`{override}` (common-word override)\n"
                )
                diagnostics.append(WordDiagnostic(
                    step_index=t,
                    input_word=words[t],
                    rule_output=rule_out,
                    selected_candidate=override,
                    beam_score=0.0,
                    candidate_breakdown=[],
                ))
                continue

            # -- Context-dependent standalone overrides -------------------
            # Apply the override only when the previous word is NOT English
            # (either a direct ENGLISH_VOCAB hit or a COMMON_WORDS entry
            # whose mapped value is ASCII, i.e. kept-as-English).
            if core_lower in CONTEXT_WORDS_STANDALONE:
                prev_word_lower = words[t - 1].lower() if t > 0 else ""
                prev_common_val = COMMON_WORDS.get(prev_word_lower, "")
                prev_is_english = (
                    t > 0
                    and (
                        prev_word_lower in ENGLISH_VOCAB
                        or prev_common_val.isascii() and prev_common_val != ""
                    )
                )
                if not prev_is_english:
                    override = prefix + CONTEXT_WORDS_STANDALONE[core_lower] + suffix
                    selected_words.append(override)
                    trace_logs.append(
                        f"**Step {t + 1}: `{words[t]}`** β "
                        f"`{override}` (standalone override)\n"
                    )
                    diagnostics.append(WordDiagnostic(
                        step_index=t,
                        input_word=words[t],
                        rule_output=rule_out,
                        selected_candidate=override,
                        beam_score=0.0,
                        candidate_breakdown=[],
                    ))
                    continue

            # -- English-word shortcut ------------------------------------
            # Preserve English immediately UNLESS the romanisation maps
            # to a genuine Sinhala word (rule output found in the
            # dictionary with 3+ entries => multiple meanings).
            # e.g. "game" whose rule output exists in the dict with 7
            # entries => ambiguous, so fall through to MLM scoring.
            # e.g. "meeting" whose rule output is in the dict with only 1
            # entry => loanword transliteration, keep English.
            core_rule = info.get("core_rule_output", "")
            core_dict = info.get("dict_entries", set())
            is_semantically_ambiguous = (
                core_rule in core_dict and len(core_dict) >= 3
            )
            if (
                len(core_lower) >= MIN_ENGLISH_LEN
                and core_lower in ENGLISH_VOCAB
                and not is_semantically_ambiguous
            ):
                selected_words.append(words[t])
                trace_logs.append(
                    f"**Step {t + 1}: `{words[t]}`** β "
                    f"`{words[t]}` (English preserved)\n"
                )
                diagnostics.append(WordDiagnostic(
                    step_index=t,
                    input_word=words[t],
                    rule_output=rule_out,
                    selected_candidate=words[t],
                    beam_score=0.0,
                    candidate_breakdown=[],
                ))
                continue

            # Dynamic left context = actual selected outputs so far
            left_ctx = " ".join(selected_words) if selected_words else ""
            # Right context = rule-based stable context for future words
            right_ctx = " ".join(stable_right[t + 1:]) if t + 1 < len(words) else ""

            # Score all candidates for this position in one batch
            batch_left = [left_ctx] * total_cands
            batch_right = [right_ctx] * total_cands

            mlm_scores = self._batch_mlm_score(batch_left, batch_right, candidates)

            # -- Softmax normalise MLM scores -----------------------------
            # Preserves the model's relative confidence: close raw
            # log-probs yield close normalised values, unlike min-max
            # which always maps best->1.0 / worst->0.0.
            mlm_scores = self._softmax_normalize(mlm_scores)

            # MLM floor for English code-switching.
            # Skip the floor for semantically ambiguous words (rule output
            # found in dict with 3+ entries) so the raw MLM context signal
            # can distinguish e.g. English "game" vs the Sinhala word.
            best_nonenglish_mlm = -1e9
            if not is_semantically_ambiguous:
                for i, mlm in enumerate(mlm_scores):
                    is_eng = eng_flags[i] if i < len(eng_flags) else False
                    if not is_eng and mlm > best_nonenglish_mlm:
                        best_nonenglish_mlm = mlm

            # Score & select best candidate
            step_log = f"**Step {t + 1}: `{words[t]}`** (rule β `{rule_out}`)\n\n"
            best_scored: Optional[ScoredCandidate] = None
            candidate_breakdown: List[ScoredCandidate] = []

            for i, mlm in enumerate(mlm_scores):
                cand = candidates[i]
                is_eng = eng_flags[i] if i < len(eng_flags) else False
                is_dict = d_flags[i] if i < len(d_flags) else False

                # Floor the English candidate's MLM at the best non-English
                # score so it is never eliminated on MLM alone.
                effective_mlm = mlm
                if is_eng and cand.lower() == words[t].lower() and not is_semantically_ambiguous:
                    effective_mlm = max(mlm, best_nonenglish_mlm)

                scored = self.scorer.score(
                    mlm_score=effective_mlm,
                    candidate=cand,
                    rule_output=rule_out,
                    rank=i,
                    total_candidates=total_cands,
                    is_english=is_eng,
                    original_input=words[t],
                    is_from_dict=is_dict,
                    is_ambiguous=is_semantically_ambiguous,
                )
                candidate_breakdown.append(scored)

                if best_scored is None or scored.combined_score > best_scored.combined_score:
                    best_scored = scored

                # Only log candidates above a sanity threshold to keep the
                # debug trace readable.
                if mlm > -25.0:
                    eng_tag = " π€" if is_eng else ""
                    step_log += (
                        f"- `{cand}`{eng_tag} "
                        f"MLM={scored.mlm_score:.2f} "
                        f"Fid={scored.fidelity_score:.2f} "
                        f"Rank={scored.rank_score:.2f} β "
                        f"**{scored.combined_score:.2f}**\n"
                    )

            trace_logs.append(step_log)

            selected = best_scored.text if best_scored else rule_out
            selected_words.append(selected)

            # Diagnostics list the candidates best-first.
            candidate_breakdown.sort(key=lambda s: s.combined_score, reverse=True)
            diagnostics.append(WordDiagnostic(
                step_index=t,
                input_word=words[t],
                rule_output=rule_out,
                selected_candidate=selected,
                beam_score=best_scored.combined_score if best_scored else 0.0,
                candidate_breakdown=candidate_breakdown,
            ))

        result = " ".join(selected_words)
        return result, trace_logs, diagnostics
|
| 476 |
+
|
| 477 |
+
# ββ Beam decode (fixed context β legacy comparison) ββββββββββββββ
|
| 478 |
+
|
| 479 |
+
def decode_with_diagnostics(
    self,
    sentence: str,
    beam_width: int = DEFAULT_BEAM_WIDTH,
) -> Tuple[str, List[str], List[WordDiagnostic]]:
    """
    Decode sentence using beam search and return detailed diagnostics.

    Uses fixed rule-based context for all beam paths. Kept for
    comparison with greedy decode in evaluation.

    Args:
        sentence: Romanized Singlish input; tokens are split on whitespace.
        beam_width: Number of beam paths retained after each word step.

    Returns:
        Tuple of (decoded sentence, per-step markdown trace strings,
        one ``WordDiagnostic`` per input word).
    """
    words = sentence.split()
    if not words:
        return "", [], []

    # ── Phase 1: candidate generation ────────────────────────────
    word_infos: List[dict] = []

    for raw in words:
        # Split leading/trailing punctuation off the core token so
        # candidate lookup sees the bare word.
        match = PUNCT_PATTERN.match(raw)
        prefix, core, suffix = match.groups() if match else ("", raw, "")

        # Punctuation-only token: keep it verbatim.
        if not core:
            word_infos.append({
                "candidates": [raw],
                "rule_output": raw,
                "english_flags": [False],
                "prefix": prefix,
                "suffix": suffix,
                "sinhala_passthrough": False,
            })
            continue

        # Already-Sinhala text: pass through unchanged
        if _is_sinhala(core):
            word_infos.append({
                "candidates": [raw],
                "rule_output": raw,
                "english_flags": [False],
                "prefix": prefix,
                "suffix": suffix,
                "sinhala_passthrough": True,
            })
            continue

        rule_output = self.adapter.get_rule_output(core)
        cands = self.adapter.get_candidates(core, rule_output)

        # Collect raw dictionary entries (case-sensitive first) so we
        # can flag which candidates came from the dictionary.
        dict_entries: Set[str] = set()
        if core in self.adapter.dictionary:
            dict_entries.update(self.adapter.dictionary[core])
        elif core.lower() in self.adapter.dictionary:
            dict_entries.update(self.adapter.dictionary[core.lower()])

        # The rule-engine output is always a candidate of last resort,
        # so `cands` is never empty past this point.
        if rule_output and rule_output not in cands:
            cands.append(rule_output)
        if not cands:
            cands = [rule_output]

        english_flags = [c.lower() in ENGLISH_VOCAB for c in cands]
        dict_flags = [c in dict_entries for c in cands]
        # Re-attach the stripped punctuation to every candidate.
        full_cands = [prefix + c + suffix for c in cands]

        word_infos.append({
            "candidates": full_cands[:MAX_CANDIDATES],
            "rule_output": prefix + rule_output + suffix,
            "core_rule_output": rule_output,
            "n_dict_entries": len(dict_entries),
            "dict_entries": dict_entries,
            "english_flags": english_flags[:MAX_CANDIDATES],
            "dict_flags": dict_flags[:MAX_CANDIDATES],
            "prefix": prefix,
            "suffix": suffix,
            "sinhala_passthrough": False,
        })

    # Build stable context (fixed for all beam paths): prefer the first
    # English candidate, otherwise the rule output. This is what makes
    # this decoder "fixed context" — MLM context does not depend on the
    # path taken so far.
    stable_context: List[str] = []
    for info in word_infos:
        eng_cands = [
            c for c, e in zip(info["candidates"], info["english_flags"]) if e
        ]
        stable_context.append(
            eng_cands[0] if eng_cands else info["rule_output"]
        )

    # ── Phase 2: beam search with data-driven scoring ────────────
    # Each beam entry is (selected-words-so-far, cumulative score).
    beam: List[Tuple[List[str], float]] = [([], 0.0)]
    trace_logs: List[str] = []
    diagnostics: List[WordDiagnostic] = []

    for t, info in enumerate(word_infos):
        candidates = info["candidates"]
        eng_flags = info["english_flags"]
        d_flags = info.get("dict_flags", [False] * len(candidates))
        rule_out = info["rule_output"]
        prefix = info.get("prefix", "")
        suffix = info.get("suffix", "")
        total_cands = len(candidates)

        # ── Sinhala passthrough ──────────────────────────────────
        if info.get("sinhala_passthrough"):
            next_beam_si = [(path + [words[t]], sc) for path, sc in beam]
            beam = next_beam_si[:beam_width]
            trace_logs.append(
                f"**Step {t + 1}: `{words[t]}`** → "
                f"`{words[t]}` (Sinhala passthrough)\n"
            )
            diagnostics.append(WordDiagnostic(
                step_index=t,
                input_word=words[t],
                rule_output=rule_out,
                selected_candidate=words[t],
                beam_score=beam[0][1] if beam else 0.0,
                candidate_breakdown=[],
            ))
            continue

        # ── Common-word shortcut ─────────────────────────────────
        # Unambiguous high-frequency words bypass scoring entirely.
        core_lower = words[t].lower().strip()
        if core_lower in COMMON_WORDS:
            override = prefix + COMMON_WORDS[core_lower] + suffix
            next_beam_cw = [(path + [override], sc) for path, sc in beam]
            beam = next_beam_cw[:beam_width]
            trace_logs.append(
                f"**Step {t + 1}: `{words[t]}`** → "
                f"`{override}` (common-word override)\n"
            )
            diagnostics.append(WordDiagnostic(
                step_index=t,
                input_word=words[t],
                rule_output=rule_out,
                selected_candidate=override,
                beam_score=beam[0][1] if beam else 0.0,
                candidate_breakdown=[],
            ))
            continue

        # ── Context-dependent standalone overrides ───────────────
        # e.g. "eka": only override when the previous word is NOT
        # English (after an English noun, scoring resolves it instead).
        if core_lower in CONTEXT_WORDS_STANDALONE:
            prev_word_lower = words[t - 1].lower() if t > 0 else ""
            prev_common_val = COMMON_WORDS.get(prev_word_lower, "")
            # NOTE: `or A and B` — the `and` binds tighter; the previous
            # word counts as English if it is in the vocab OR its
            # common-word override is a non-empty ASCII (English) string.
            prev_is_english = (
                t > 0
                and (
                    prev_word_lower in ENGLISH_VOCAB
                    or prev_common_val.isascii() and prev_common_val != ""
                )
            )
            if not prev_is_english:
                override = prefix + CONTEXT_WORDS_STANDALONE[core_lower] + suffix
                next_beam_ctx = [(path + [override], sc) for path, sc in beam]
                beam = next_beam_ctx[:beam_width]
                trace_logs.append(
                    f"**Step {t + 1}: `{words[t]}`** → "
                    f"`{override}` (standalone override)\n"
                )
                diagnostics.append(WordDiagnostic(
                    step_index=t,
                    input_word=words[t],
                    rule_output=rule_out,
                    selected_candidate=override,
                    beam_score=beam[0][1] if beam else 0.0,
                    candidate_breakdown=[],
                ))
                continue

        # ── English-word shortcut ────────────────────────────────
        # See greedy decode for detailed comment on criterion.
        core_rule = info.get("core_rule_output", "")
        core_dict = info.get("dict_entries", set())
        # "Semantically ambiguous": the rule output itself is a known
        # dictionary word AND there are several dictionary senses — then
        # the word is plausibly Sinhala and must be scored, not kept.
        is_semantically_ambiguous = (
            core_rule in core_dict and len(core_dict) >= 3
        )
        if (
            len(core_lower) >= MIN_ENGLISH_LEN
            and core_lower in ENGLISH_VOCAB
            and not is_semantically_ambiguous
        ):
            eng_word = words[t]
            next_beam_eng = [(path + [eng_word], sc) for path, sc in beam]
            beam = next_beam_eng[:beam_width]
            trace_logs.append(
                f"**Step {t + 1}: `{words[t]}`** → "
                f"`{eng_word}` (English preserved)\n"
            )
            diagnostics.append(WordDiagnostic(
                step_index=t,
                input_word=words[t],
                rule_output=rule_out,
                selected_candidate=eng_word,
                beam_score=beam[0][1] if beam else 0.0,
                candidate_breakdown=[],
            ))
            continue

        # Build left/right context pairs for multi-mask MLM scoring.
        # Because the context is the fixed `stable_context`, the same
        # (left, right, candidate) triple is queried once per beam path
        # by design.
        batch_left: List[str] = []
        batch_right: List[str] = []
        batch_tgt: List[str] = []
        batch_meta: List[Tuple[int, int]] = []  # (beam_idx, cand_idx)

        for p_idx, (path, _) in enumerate(beam):
            for c_idx, cand in enumerate(candidates):
                future = stable_context[t + 1:] if t + 1 < len(words) else []
                batch_left.append(" ".join(stable_context[:t]))
                batch_right.append(" ".join(future))
                batch_tgt.append(cand)
                batch_meta.append((p_idx, c_idx))

        # Candidates are never empty (rule-output fallback above), so
        # this only triggers if the beam itself is empty.
        if not batch_tgt:
            continue

        mlm_scores = self._batch_mlm_score(batch_left, batch_right, batch_tgt)

        # ── Softmax normalise MLM scores ─────────────────────────
        mlm_scores = self._softmax_normalize(mlm_scores)

        # ── MLM floor for English code-switching ─────────────────
        # See greedy decode for detailed comment on criterion.
        # Per beam path, track the best MLM score of any non-English
        # candidate; English candidates are later floored to it so the
        # MLM cannot single-handedly reject a code-switch.
        best_nonenglish_mlm: Dict[int, float] = {}
        if not is_semantically_ambiguous:
            for i, mlm in enumerate(mlm_scores):
                p_idx, c_idx = batch_meta[i]
                is_eng = eng_flags[c_idx] if c_idx < len(eng_flags) else False
                if not is_eng:
                    prev = best_nonenglish_mlm.get(p_idx, -1e9)
                    if mlm > prev:
                        best_nonenglish_mlm[p_idx] = mlm

        # ── Score & trace ────────────────────────────────────────
        next_beam: List[Tuple[List[str], float]] = []
        all_step_scores: List[Tuple[int, ScoredCandidate, float]] = []
        step_log = f"**Step {t + 1}: `{words[t]}`** (rule → `{rule_out}`)\n\n"

        for i, mlm in enumerate(mlm_scores):
            p_idx, c_idx = batch_meta[i]
            orig_path, orig_score = beam[p_idx]
            cand = batch_tgt[i]
            is_eng = eng_flags[c_idx] if c_idx < len(eng_flags) else False
            is_dict = d_flags[c_idx] if c_idx < len(d_flags) else False

            # Apply the English MLM floor only to the verbatim input word.
            effective_mlm = mlm
            if is_eng and cand.lower() == words[t].lower() and not is_semantically_ambiguous:
                floor = best_nonenglish_mlm.get(p_idx, mlm)
                effective_mlm = max(mlm, floor)

            scored = self.scorer.score(
                mlm_score=effective_mlm,
                candidate=cand,
                rule_output=rule_out,
                rank=c_idx,
                total_candidates=total_cands,
                is_english=is_eng,
                original_input=words[t],
                is_from_dict=is_dict,
                is_ambiguous=is_semantically_ambiguous,
            )

            new_total = orig_score + scored.combined_score
            next_beam.append((orig_path + [cand], new_total))
            all_step_scores.append((p_idx, scored, new_total))

            # Only trace candidates with non-negligible MLM scores.
            if mlm > -25.0:
                eng_tag = " 🔤" if is_eng else ""
                step_log += (
                    f"- `{cand}`{eng_tag} "
                    f"MLM={scored.mlm_score:.2f} "
                    f"Fid={scored.fidelity_score:.2f} "
                    f"Rank={scored.rank_score:.2f} → "
                    f"**{scored.combined_score:.2f}**\n"
                )

        trace_logs.append(step_log)

        # Prune: keep the top `beam_width` paths by cumulative score.
        beam = sorted(next_beam, key=lambda x: x[1], reverse=True)[:beam_width]

        # Diagnostics report the breakdown for the first (best) incoming
        # beam path only, sorted by the new cumulative score.
        root_scores = [item for item in all_step_scores if item[0] == 0]
        root_scores_sorted = sorted(root_scores, key=lambda x: x[2], reverse=True)

        selected = beam[0][0][t] if beam and beam[0][0] else ""
        selected_total = beam[0][1] if beam else float("-inf")
        candidate_breakdown = [item[1] for item in root_scores_sorted]

        diagnostics.append(WordDiagnostic(
            step_index=t,
            input_word=words[t],
            rule_output=rule_out,
            selected_candidate=selected,
            beam_score=selected_total,
            candidate_breakdown=candidate_breakdown,
        ))

    result = " ".join(beam[0][0]) if beam else ""
    return result, trace_logs, diagnostics
|
core/dictionary.py
ADDED
|
@@ -0,0 +1,76 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Dictionary adapter for retrieving Sinhala transliteration candidates.
|
| 3 |
+
"""
|
| 4 |
+
|
| 5 |
+
from typing import Dict, List, Set
|
| 6 |
+
|
| 7 |
+
from core.constants import MAX_CANDIDATES
|
| 8 |
+
from core.english import ENGLISH_VOCAB
|
| 9 |
+
from core.scorer import CandidateScorer
|
| 10 |
+
from core.transliterate import rule_based_transliterate
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
class DictionaryAdapter:
    """Retrieves transliteration candidates from the Sinhala dictionary."""

    def __init__(self, dictionary_dict: Dict[str, List[str]]):
        # Romanized word -> list of Sinhala spellings.
        self.dictionary = dictionary_dict

    def get_candidates(self, word: str, rule_output: str = "") -> List[str]:
        """
        Return candidate transliterations for a Romanized word.

        Priority:
        1. English corpus match -> keep original word
        2. Dictionary lookup -> exact / lowercase
        3. Subword decomposition -> only when 1 & 2 yield nothing

        When more candidates exist than MAX_CANDIDATES, results are
        sorted by Levenshtein distance to ``rule_output`` so the most
        phonetically plausible entries survive the cut.
        """
        lowered = word.lower()
        results: List[str] = []

        # 1. English corpus: a known English word is itself a candidate.
        if lowered in ENGLISH_VOCAB:
            results.append(word)

        # 2. Sinhala dictionary: exact-case entry takes precedence.
        entries = self.dictionary.get(word)
        if entries is None:
            entries = self.dictionary.get(lowered)
        if entries is not None:
            results.extend(entries)

        if results:
            # Deduplicate while preserving first-seen order.
            unique = list(dict.fromkeys(results))
            if rule_output and len(unique) > MAX_CANDIDATES:
                # English hits stay in front; Sinhala hits are ordered by
                # closeness to the rule-engine output (stable sort keeps
                # equally-distant entries in original order).
                english_hits: List[str] = []
                sinhala_hits: List[str] = []
                for cand in unique:
                    if cand.lower() in ENGLISH_VOCAB:
                        english_hits.append(cand)
                    else:
                        sinhala_hits.append(cand)
                sinhala_hits.sort(
                    key=lambda cand: CandidateScorer.levenshtein(cand, rule_output)
                )
                unique = english_hits + sinhala_hits
            return unique

        # 3. Subword fallback: split a possible compound into two parts
        #    that both appear in the dictionary.
        size = len(word)
        if size > 3:
            for cut in range(2, size - 1):
                left, right = word[:cut], word[cut:]
                left_hits = self.dictionary.get(left) or self.dictionary.get(left.lower())
                right_hits = self.dictionary.get(right) or self.dictionary.get(right.lower())

                if left_hits and right_hits:
                    # Combine the top few spellings of each half.
                    for first in left_hits[:3]:
                        for second in right_hits[:3]:
                            results.append(first + second)

        return list(dict.fromkeys(results)) if results else []

    @staticmethod
    def get_rule_output(word: str) -> str:
        """Generate Sinhala output via the phonetic rule engine."""
        return rule_based_transliterate(word)
|
core/english.py
ADDED
|
@@ -0,0 +1,97 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
English vocabulary loader and cache management for code-switch detection.
|
| 3 |
+
"""
|
| 4 |
+
|
| 5 |
+
import os
|
| 6 |
+
import logging
|
| 7 |
+
import requests
|
| 8 |
+
from typing import Set
|
| 9 |
+
|
| 10 |
+
from core.constants import ENGLISH_CORPUS_URL, MIN_ENGLISH_LEN
|
| 11 |
+
|
| 12 |
+
logger = logging.getLogger(__name__)
|
| 13 |
+
|
| 14 |
+
# Core English words always recognised (supplements the 20k corpus).
# These cover project/tech/chat vocabulary the generic frequency corpus
# may miss. Callers test membership with lowercased tokens
# (``c.lower() in ENGLISH_VOCAB``).
# NOTE(review): "exam" is listed twice — harmless in a set literal.
# NOTE(review): the uppercase entries "PR" and "DM" can never match a
# lowercased lookup; the lowercase forms live in COMMON_WORDS instead —
# confirm whether these entries are intentional.
CORE_ENGLISH_WORDS: Set[str] = {
    "transliteration", "sincode", "prototype", "assignment", "singlish",
    "rest", "complete", "tutorial", "small", "mistakes", "game", "play",
    "type", "test", "online", "code", "mixing", "project", "demo", "today",
    "tomorrow", "presentation", "slide", "submit", "feedback", "deploy",
    "merge", "update", "delete", "download", "upload", "install", "server",
    "meeting", "backlog", "comment", "reply", "chat", "selfie", "post",
    "share", "private", "message", "group", "study", "exam", "results",
    "viva", "prepared", "site", "redo", "story", "poll",
    "hall", "exam", "PR", "DM", "page", "app", "bug", "fix",
    "log", "push", "pull", "branch", "build", "run", "save",
    "link", "edit", "file", "open", "close", "live", "view",
    "deployments", "leaderboard", "instagram", "github", "standup",
}
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
def _resolve_english_cache_path() -> str:
|
| 32 |
+
"""
|
| 33 |
+
Resolve a writable cache path for the English corpus.
|
| 34 |
+
|
| 35 |
+
Hugging Face Spaces may run with constrained write locations, so we prefer:
|
| 36 |
+
1) explicit env override,
|
| 37 |
+
2) HF_HOME cache dir,
|
| 38 |
+
3) local working dir,
|
| 39 |
+
4) system temp dir.
|
| 40 |
+
"""
|
| 41 |
+
override = os.getenv("SINCODE_ENGLISH_CACHE")
|
| 42 |
+
if override:
|
| 43 |
+
return override
|
| 44 |
+
|
| 45 |
+
candidates = [
|
| 46 |
+
os.path.join(os.getenv("HF_HOME", ""), "english_20k.txt") if os.getenv("HF_HOME") else "",
|
| 47 |
+
os.path.join(os.getcwd(), "english_20k.txt"),
|
| 48 |
+
os.path.join(os.getenv("TMPDIR", os.getenv("TEMP", "/tmp")), "english_20k.txt"),
|
| 49 |
+
]
|
| 50 |
+
|
| 51 |
+
for path in candidates:
|
| 52 |
+
if not path:
|
| 53 |
+
continue
|
| 54 |
+
parent = os.path.dirname(path) or "."
|
| 55 |
+
try:
|
| 56 |
+
os.makedirs(parent, exist_ok=True)
|
| 57 |
+
with open(path, "a", encoding="utf-8"):
|
| 58 |
+
pass
|
| 59 |
+
return path
|
| 60 |
+
except OSError:
|
| 61 |
+
continue
|
| 62 |
+
|
| 63 |
+
return "english_20k.txt"
|
| 64 |
+
|
| 65 |
+
|
| 66 |
+
# Resolved once at import time; corpus download and read both use this path.
ENGLISH_CORPUS_CACHE = _resolve_english_cache_path()
|
| 67 |
+
|
| 68 |
+
|
| 69 |
+
def load_english_vocab() -> Set[str]:
    """Load and cache a ~20k English word list for code-switch detection.

    Returns:
        ``CORE_ENGLISH_WORDS`` plus every corpus word of length
        >= MIN_ENGLISH_LEN, lowercased. Falls back to the core set alone
        when the corpus can neither be downloaded nor read.
    """
    vocab = CORE_ENGLISH_WORDS.copy()

    # BUGFIX: treat a zero-byte cache the same as a missing one. An empty
    # file can be left behind by an interrupted download (or by older
    # path-probing code) and would otherwise suppress the download forever.
    try:
        cache_is_usable = os.path.getsize(ENGLISH_CORPUS_CACHE) > 0
    except OSError:
        cache_is_usable = False

    if not cache_is_usable:
        try:
            logger.info("Downloading English corpus...")
            response = requests.get(ENGLISH_CORPUS_URL, timeout=10)
            response.raise_for_status()
            with open(ENGLISH_CORPUS_CACHE, "wb") as f:
                f.write(response.content)
        except (requests.RequestException, OSError) as exc:
            # Best effort: the core vocabulary still enables basic
            # code-switch detection offline.
            logger.warning("Could not download English corpus: %s", exc)
            return vocab

    try:
        with open(ENGLISH_CORPUS_CACHE, "r", encoding="utf-8") as f:
            vocab.update(
                w for line in f
                if (w := line.strip().lower()) and len(w) >= MIN_ENGLISH_LEN
            )
    except OSError as exc:
        logger.warning("Could not read English corpus file: %s", exc)

    logger.info("English vocabulary loaded: %d words", len(vocab))
    return vocab
|
| 95 |
+
|
| 96 |
+
|
| 97 |
+
# Loaded eagerly at import time so lookups are a plain set-membership test.
ENGLISH_VOCAB: Set[str] = load_english_vocab()
|
core/mappings.py
ADDED
|
@@ -0,0 +1,214 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Static mapping tables for the SinCode engine.
|
| 3 |
+
|
| 4 |
+
Includes common-word overrides, context-dependent overrides,
|
| 5 |
+
and phonetic mapping tables (consonants, vowels, modifiers).
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
from typing import Dict, List
|
| 9 |
+
|
| 10 |
+
# βββ Common Word Overrides ββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 11 |
+
# High-frequency Singlish words whose romanisation is ambiguous (long vs.
|
| 12 |
+
# short vowel, retroflex vs. dental, etc.). When a word appears here the
|
| 13 |
+
# decoder uses the override directly, bypassing MLM/fidelity scoring.
|
| 14 |
+
# Only add words that are *unambiguous* β i.e. one dominant Sinhala form
|
| 15 |
+
# in colloquial written chat. Context-dependent words (e.g. "eka") should
|
| 16 |
+
# NOT be listed so that MLM can resolve them.
|
| 17 |
+
|
| 18 |
+
# Keys are lowercased Romanized forms; values are the Sinhala (or English)
# surface form substituted verbatim by the decoder's common-word shortcut.
COMMON_WORDS: Dict[str, str] = {
    # Pronouns & particles
    "oya": "ΰΆΰΆΊΰ·",  # you
    "oyaa": "ΰΆΰΆΊΰ·",
    "eya": "ΰΆΰΆΊΰ·",  # he/she
    "eyaa": "ΰΆΰΆΊΰ·",
    "api": "ΰΆ…ΰΆ΄ΰ·",  # we
    "mama": "ΰΆΈΰΆΈ",  # I
    "mage": "ΰΆΈΰΆΰ·",  # my
    "oyage": "ΰΆΰΆΊΰ·ΰΆΰ·",  # your
    # Common verbs (past tense)
    "awa": "ΰΆΰ·ΰ·",  # came
    "aawa": "ΰΆΰ·ΰ·",
    "giya": "ΰΆΰ·ΰΆΊΰ·",  # went
    "kala": "ΰΆΰ·…ΰ·",  # did
    "kiwa": "ΰΆΰ·ΰ·ΰ·ΰ·ΰ·",  # said
    "kiwwa": "ΰΆΰ·ΰ·ΰ·ΰ·ΰ·",
    "yewwa": "ΰΆΊΰ·ΰ·ΰ·ΰ·ΰ·",  # sent
    "gawa": "ΰΆΰ·ΰ·ΰ·ΰ·ΰ·",  # hit
    "katha": "ΰΆΰΆΰ·",  # talked / story
    # Time
    "heta": "ΰ·ΰ·ΰΆ§",  # tomorrow
    "ada": "ΰΆ…ΰΆ―",  # today
    "iye": "ΰΆΰΆΊΰ·",  # yesterday
    # Common adverbs / particles
    "one": "ΰΆΰΆ±ΰ·",  # need/want
    "oney": "ΰΆΰΆ±ΰ·",
    "naa": "ΰΆ±ΰ·",  # no (long form)
    "na": "ΰΆ±ΰ·",  # no
    "hari": "ΰ·ΰΆ»ΰ·",  # ok / right
    "wage": "ΰ·ΰΆΰ·",  # like
    "nisa": "ΰΆ±ΰ·ΰ·ΰ·",  # because
    "inne": "ΰΆΰΆ±ΰ·ΰΆ±ΰ·",  # being/staying (colloquial)
    "inna": "ΰΆΰΆ±ΰ·ΰΆ±",  # stay (imperative)
    "kalin": "ΰΆΰΆ½ΰ·ΰΆ±ΰ·",  # before / earlier
    "madi": "ΰΆΈΰΆ―ΰ·",  # insufficient / not enough
    # Common verb endings
    "giye": "ΰΆΰ·ΰΆΊΰ·",  # went (emphatic)
    "una": "ΰΆΰΆ±ΰ·",  # became / happened
    "wuna": "ΰΆΰΆ±ΰ·",  # became (alt spelling)
    # Locations / misc
    "gedaradi": "ΰΆΰ·ΰΆ―ΰΆ»ΰΆ―ΰ·",  # at home
    "gedara": "ΰΆΰ·ΰΆ―ΰΆ»",  # home
    # Common adjectives / other
    "honda": "ΰ·ΰ·ΰΆ³",  # good
    "ape": "ΰΆ…ΰΆ΄ΰ·",  # our
    "me": "ΰΆΈΰ·",  # this
    "passe": "ΰΆ΄ΰ·ΰ·ΰ·ΰ·",  # after / later
    "ba": "ΰΆΆΰ·",  # can't
    "bari": "ΰΆΆΰ·ΰΆ»ΰ·",  # impossible
    "bri": "ΰΆΆΰ·ΰΆ»ΰ·",  # can't (abbrev)
    "danne": "ΰΆ―ΰΆ±ΰ·ΰΆ±ΰ·",  # know
    "wada": "ΰ·ΰ·ΰΆ©",  # work (noun)
    "epa": "ΰΆΰΆ΄ΰ·",  # don't
    # Common ad-hoc abbreviations
    "mn": "ΰΆΈΰΆ",  # man (I, informal first person)
    "mta": "ΰΆΈΰΆ§",  # mata
    "oyta": "ΰΆΰΆΊΰ·ΰΆ§",  # oyata
    "oyata": "ΰΆΰΆΊΰ·ΰΆ§",  # to you
    "krnna": "ΰΆΰΆ»ΰΆ±ΰ·ΰΆ±",  # karanna
    "blnna": "ΰΆΆΰΆ½ΰΆ±ΰ·ΰΆ±",  # balanna
    "on": "ΰΆΰΆ±ΰ·",  # one (abbrev)
    # Common -nawa verb endings
    "thiyanawa": "ΰΆΰ·ΰΆΊΰ·ΰΆ±ΰ·ΰ·",  # is/has
    "wenawa": "ΰ·ΰ·ΰΆ±ΰ·ΰ·",  # becomes
    "enawa": "ΰΆΰΆ±ΰ·ΰ·",  # comes
    "yanawa": "ΰΆΊΰΆ±ΰ·ΰ·",  # goes
    "hithenawa": "ΰ·ΰ·ΰΆΰ·ΰΆ±ΰ·ΰ·",  # thinks/feels
    "penenawa": "ΰΆ΄ΰ·ΰΆ±ΰ·ΰ·",  # appears/visible
    "karamu": "ΰΆΰΆ»ΰΆΈΰ·",  # let's do
    "balamu": "ΰΆΆΰΆ½ΰΆΈΰ·",  # let's see
    "damu": "ΰΆ―ΰ·ΰΆΈΰ·",  # let's put
    "yamu": "ΰΆΊΰΆΈΰ·",  # let's go
    # Short English abbreviations (keys are lowercase for lookup)
    "pr": "PR",
    "dm": "DM",
    "ai": "AI",
    "it": "IT",
    "qa": "QA",
    "ui": "UI",
    "ok": "ok",
    # Common ad-hoc abbreviations (contd.)
    "ek": "ΰΆΰΆ",  # eka (short form)
    "ekta": "ΰΆΰΆΰΆ§",  # ekata = to that one
    "ekat": "ΰΆΰΆΰΆ§",  # that-thing + to (standalone form)
    "eke": "ΰΆΰΆΰ·",  # of that one
    "hta": "ΰ·ΰ·ΰΆ§",  # heta (abbrev)
    "damma": "ΰΆ―ΰ·ΰΆΈΰ·ΰΆΈΰ·",  # put/posted
    "gannako": "ΰΆΰΆ±ΰ·ΰΆ±ΰΆΰ·",  # take (imperative, long o)
    # Additional words for accuracy
    "gena": "ΰΆΰ·ΰΆ±",  # about
    "mata": "ΰΆΈΰΆ§",  # to me
    "laga": "ΰ·…ΰΆ",  # near
    "poth": "ΰΆ΄ΰ·ΰΆ",  # book
    "iwara": "ΰΆΰ·ΰΆ»",  # finished
    # BUGFIX: value previously contained U+FFFD replacement characters
    # ("ΰΆοΏ½οΏ½ΰΆ±ΰ·ΰΆ±"); corrected to match the "krnna" entry above.
    "karanna": "ΰΆΰΆ»ΰΆ±ΰ·ΰΆ±",  # to do
    "hadamu": "ΰ·ΰΆ―ΰΆΈΰ·",  # let's make
    "kiyawala": "ΰΆΰ·ΰΆΊΰ·ΰΆ½ΰ·",  # having read
    "baya": "ΰΆΆΰΆΊ",  # fear/scared
    # Ad-hoc and alternative spellings (accuracy fixes)
    "kema": "ΰΆΰ·ΰΆΈ",  # food (colloquial spelling)
    "kama": "ΰΆΰ·ΰΆΈ",  # food (alt spelling)
    "hodai": "ΰ·ΰ·ΰΆ³ΰΆΊΰ·",  # good! (no-n spelling)
    "oyge": "ΰΆΰΆΊΰ·ΰΆΰ·",  # your (shortened form)
    "iwra": "ΰΆΰ·ΰΆ»",  # finished (vowel-stripped)
    "krd": "ΰΆΰΆ»ΰ·ΰΆ―",  # did? (extreme abbreviation)
    "handawata": "ΰ·ΰ·ΰΆ±ΰ·ΰΆ―ΰ·ΰ·ΰΆ§",  # in the evening
    "wenwa": "ΰ·ΰ·ΰΆ±ΰ·ΰ·",  # becomes/happens
    "ep": "ΰΆΰΆ΄ΰ·",  # epa (single-syllable abbrev)
    "prashnya": "ΰΆ΄ΰ·\u200dΰΆ»ΰ·ΰ·\u200dΰΆ±ΰΆΊ",  # question (without final vowel)
    # ── Verb forms / participles (no English conflict) ─────────────────
    "penawa": "ΰΆ΄ΰ·ΰΆ±ΰ·ΰ·",  # appears/visible (alt spelling of penenawa)
    "thiyana": "ΰΆΰ·ΰΆΊΰ·ΰΆ±",  # that which is/exists (relative participle)
    "enakota": "ΰΆΰΆ±ΰΆΰ·ΰΆ§",  # when (you/they) come
    "hadanna": "ΰ·ΰΆ―ΰΆ±ΰ·ΰΆ±",  # to make/build (imperative)
    "yawwa": "ΰΆΊΰ·ΰ·ΰ·ΰ·ΰ·",  # sent (alt spelling of yewwa)
    "gihilla": "ΰΆΰ·ΰ·ΰ·ΰΆ½ΰ·ΰΆ½ΰ·",  # having gone
    "kewata": "ΰΆΰ·ΰ·ΰΆ§",  # having eaten / for the eating
    "kiyla": "ΰΆΰ·ΰΆΊΰΆ½ΰ·",  # having said (ad-hoc spelling)
    "krganna": "ΰΆΰΆ»ΰΆΰΆ±ΰ·ΰΆ±",  # to do-and-get (ad-hoc abbreviation)
    # ── Adjectives (no English conflict) ───────────────────────────────
    "amarui": "ΰΆ…ΰΆΈΰ·ΰΆ»ΰ·ΰΆΊΰ·",  # difficult / hard
    "hodama": "ΰ·ΰ·ΰΆ³ΰΆΈ",  # best (superlative of honda)
    # ── Particles / negation (no English conflict) ─────────────────────
    "nathi": "ΰΆ±ΰ·ΰΆΰ·",  # without / lacking (negation)
    "nati": "ΰΆ±ΰ·ΰΆΰ·",  # without (alt spelling)
    "naththe": "ΰΆ±ΰ·ΰΆΰ·ΰΆΰ·",  # negative participle (not ...ing)
    "dan": "ΰΆ―ΰ·ΰΆ±ΰ·",  # now
    "oni": "ΰΆΰΆ±ΰ·",  # need/want (alt spelling of one)
    # ── Time ───────────────────────────────────────────────────────────
    "udee": "ΰΆΰΆ―ΰ·",  # morning
    # ── Ad-hoc abbreviations (no English conflict) ─────────────────────
    "hri": "ΰ·ΰΆ»ΰ·",  # ok/right (shortened hari)
    "mge": "ΰΆΈΰΆΰ·",  # my (shortened mage)
}
|
| 153 |
+
|
| 154 |
+
# Context-dependent words: use this form ONLY when the previous word is
# NOT English. When "eka" follows an English noun (e.g., "assignment eka")
# the scorer resolves it naturally; the standalone form below is used
# otherwise. (The decoder checks the previous token against
# ENGLISH_VOCAB and COMMON_WORDS before applying these.)
CONTEXT_WORDS_STANDALONE: Dict[str, str] = {
    "eka": "ΰΆΰΆ",  # that thing (standalone)
    "ekak": "ΰΆΰΆΰΆΰ·",  # one of (quantifier — same either way)
}
|
| 161 |
+
|
| 162 |
+
|
| 163 |
+
# βββ Phonetic Mapping Tables ββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 164 |
+
# Singlish Romanized β Sinhala Unicode
|
| 165 |
+
# Tables are ordered longest-pattern-first so greedy replacement works.
|
| 166 |
+
|
| 167 |
+
CONSONANTS: List[str] = [
|
| 168 |
+
"nnd", "nndh", "nng",
|
| 169 |
+
"th", "dh", "gh", "ch", "ph", "bh", "jh", "sh",
|
| 170 |
+
"GN", "KN", "Lu", "kh", "Th", "Dh",
|
| 171 |
+
"S", "d", "c", "th", "t", "k", "D", "n", "p", "b", "m",
|
| 172 |
+
"\\y",
|
| 173 |
+
"Y", "y", "j", "l", "v", "w", "s", "h",
|
| 174 |
+
"N", "L", "K", "G", "P", "B", "f", "g", "r",
|
| 175 |
+
]
|
| 176 |
+
|
| 177 |
+
CONSONANTS_UNI: List[str] = [
|
| 178 |
+
"ΰΆ¬", "ΰΆ³", "ΰΆ",
|
| 179 |
+
"ΰΆ", "ΰΆ°", "ΰΆ", "ΰΆ ", "ΰΆ΅", "ΰΆ·", "ΰΆ£", "ΰ·",
|
| 180 |
+
"ΰΆ₯", "ΰΆ€", "ΰ·
ΰ·", "ΰΆ", "ΰΆ¨", "ΰΆͺ",
|
| 181 |
+
"ΰ·", "ΰΆ―", "ΰΆ ", "ΰΆ", "ΰΆ§", "ΰΆ", "ΰΆ©", "ΰΆ±", "ΰΆ΄", "ΰΆΆ", "ΰΆΈ",
|
| 182 |
+
"βΰΆΊ",
|
| 183 |
+
"βΰΆΊ", "ΰΆΊ", "ΰΆ’", "ΰΆ½", "ΰ·", "ΰ·", "ΰ·", "ΰ·",
|
| 184 |
+
"ΰΆ«", "ΰ·
", "ΰΆ", "ΰΆ", "ΰΆ΅", "ΰΆΉ", "ΰ·", "ΰΆ", "ΰΆ»",
|
| 185 |
+
]
|
| 186 |
+
|
| 187 |
+
VOWELS: List[str] = [
|
| 188 |
+
"oo", "o\\)", "oe", "aa", "a\\)", "Aa", "A\\)", "ae",
|
| 189 |
+
"ii", "i\\)", "ie", "ee", "ea", "e\\)", "ei",
|
| 190 |
+
"uu", "u\\)", "au",
|
| 191 |
+
"\\a", "a", "A", "i", "e", "u", "o", "I",
|
| 192 |
+
]
|
| 193 |
+
|
| 194 |
+
VOWELS_UNI: List[str] = [
|
| 195 |
+
"ΰΆ", "ΰΆ", "ΰΆ", "ΰΆ", "ΰΆ", "ΰΆ", "ΰΆ", "ΰΆ",
|
| 196 |
+
"ΰΆ", "ΰΆ", "ΰΆ", "ΰΆ", "ΰΆ", "ΰΆ", "ΰΆ",
|
| 197 |
+
"ΰΆ", "ΰΆ", "ΰΆ",
|
| 198 |
+
"ΰΆ", "ΰΆ
", "ΰΆ", "ΰΆ", "ΰΆ", "ΰΆ", "ΰΆ", "ΰΆ",
|
| 199 |
+
]
|
| 200 |
+
|
| 201 |
+
VOWEL_MODIFIERS_UNI: List[str] = [
|
| 202 |
+
"ΰ·", "ΰ·", "ΰ·", "ΰ·", "ΰ·", "ΰ·", "ΰ·", "ΰ·",
|
| 203 |
+
"ΰ·", "ΰ·", "ΰ·", "ΰ·", "ΰ·", "ΰ·", "ΰ·",
|
| 204 |
+
"ΰ·", "ΰ·", "ΰ·",
|
| 205 |
+
"ΰ·", "", "ΰ·", "ΰ·", "ΰ·", "ΰ·", "ΰ·", "ΰ·",
|
| 206 |
+
]
|
| 207 |
+
|
| 208 |
+
SPECIAL_CONSONANTS: List[str] = ["\\n", "\\h", "\\N", "\\R", "R", "\\r"]
|
| 209 |
+
SPECIAL_CONSONANTS_UNI: List[str] = ["ΰΆ", "ΰΆ", "ΰΆ", "ΰΆ", "ΰΆ»ΰ·\u200D", "ΰΆ»ΰ·\u200D"]
|
| 210 |
+
|
| 211 |
+
SPECIAL_CHARS: List[str] = ["ruu", "ru"]
|
| 212 |
+
SPECIAL_CHARS_UNI: List[str] = ["ΰ·²", "ΰ·"]
|
| 213 |
+
|
| 214 |
+
N_VOWELS: int = 26
|
core/scorer.py
ADDED
|
@@ -0,0 +1,186 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Data-driven candidate scorer combining MLM, fidelity, and rank signals.
|
| 3 |
+
"""
|
| 4 |
+
|
| 5 |
+
import math
|
| 6 |
+
from dataclasses import dataclass, field
|
| 7 |
+
from typing import List
|
| 8 |
+
|
| 9 |
+
from core.constants import (
|
| 10 |
+
W_MLM, W_FIDELITY, W_RANK,
|
| 11 |
+
FIDELITY_SCALE, DICT_FIDELITY_DAMP,
|
| 12 |
+
SINHALA_VIRAMA, ZWJ,
|
| 13 |
+
)
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
@dataclass
class ScoredCandidate:
    """Holds a candidate word and its scoring breakdown."""
    text: str                    # candidate surface form (Sinhala or English)
    mlm_score: float = 0.0       # contextual-fit signal from the masked LM
    fidelity_score: float = 0.0  # source-aware transliteration fidelity
    rank_score: float = 0.0      # log-decay rank prior (0.0 for first candidate)
    combined_score: float = 0.0  # weighted sum of the three signals above
    is_english: bool = False     # True when the candidate is an English word
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
@dataclass
class WordDiagnostic:
    """Structured per-word diagnostics for evaluation and error analysis."""
    step_index: int         # presumably the word's position in the decoded sequence — confirm against decoder
    input_word: str         # raw Romanized input token
    rule_output: str        # output of the rule-based transliterator for this token
    selected_candidate: str  # candidate ultimately chosen by the decoder
    beam_score: float       # score of the beam that selected this candidate
    candidate_breakdown: List[ScoredCandidate]  # per-candidate scoring details
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
class CandidateScorer:
    """
    Data-driven replacement for the old hardcoded penalty table.

    Ranks candidates by combining three signals into one weighted sum:

    1. **MLM score** (weight ``W_MLM``) — contextual fit from the masked
       language model.
    2. **Source-aware fidelity** (weight ``W_FIDELITY``) — see
       :meth:`compute_fidelity` for the per-source rules.
    3. **Rank prior** (weight ``W_RANK``) — log-decay prior over the
       candidate's rank; disabled when the weight is zero.
    """

    def __init__(
        self,
        w_mlm: float = W_MLM,
        w_fidelity: float = W_FIDELITY,
        w_rank: float = W_RANK,
        fidelity_scale: float = FIDELITY_SCALE,
    ):
        # Weights default to the project-wide constants but can be overridden
        # per-instance (e.g. for ablation experiments).
        self.w_mlm = w_mlm
        self.w_fidelity = w_fidelity
        self.w_rank = w_rank
        self.fidelity_scale = fidelity_scale

    # -- Levenshtein distance (pure Python, no dependencies) ------------------

    @staticmethod
    def levenshtein(s1: str, s2: str) -> int:
        """Return the edit distance (insert/delete/substitute) between two strings."""
        if not s1:
            return len(s2)
        if not s2:
            return len(s1)

        previous = list(range(len(s2) + 1))
        for row, ch1 in enumerate(s1, start=1):
            current = [row]
            for col, ch2 in enumerate(s2, start=1):
                substitution = previous[col - 1] + (ch1 != ch2)
                current.append(
                    min(previous[col] + 1, current[col - 1] + 1, substitution)
                )
            previous = current
        return previous[-1]

    # -- Scoring components ---------------------------------------------------

    def compute_fidelity(
        self, candidate: str, rule_output: str,
        original_input: str = "", is_from_dict: bool = False,
        is_ambiguous: bool = False,
    ) -> float:
        """
        Source-aware transliteration fidelity.

        Rules, applied in order:

        - English candidate matching *original_input* -> 0.0 (user intent).
        - Dictionary candidate equal to *rule_output* -> +2.0, reduced to
          +0.5 when *is_ambiguous* (many dictionary senses compete, so the
          MLM context should decide instead).
        - Other dictionary candidates -> bonus decaying from 1.0 toward 0.0
          with damped normalised edit distance.
        - Raw rule output not validated by the dictionary -> penalised by
          the density of bare viramas (consonant skeleton = likely malformed).
        - Non-matching ASCII candidate -> flat -0.5.
        - Any other Sinhala candidate -> scaled normalised edit distance.
        """
        # Case 1: English candidate reproducing the typed input exactly.
        if original_input and candidate.lower() == original_input.lower():
            return 0.0

        # Case 2: candidate validated by the dictionary.
        if is_from_dict:
            if candidate == rule_output:
                return 0.5 if is_ambiguous else 2.0
            longest = max(len(candidate), len(rule_output), 1)
            distance = self.levenshtein(candidate, rule_output) / longest
            return max(0.0, 1.0 - distance * DICT_FIDELITY_DAMP)

        # Case 3: the raw rule output itself, unvalidated. Count viramas not
        # followed by a ZWJ; a high ratio indicates a malformed skeleton.
        if candidate == rule_output:
            bare = 0
            for idx, ch in enumerate(candidate):
                if ch == SINHALA_VIRAMA:
                    if idx + 1 >= len(candidate) or candidate[idx + 1] != ZWJ:
                        bare += 1
            ratio = bare / max(len(candidate), 1)
            return -ratio * self.fidelity_scale * 2

        # Case 4: an English word that does not match the input -> uncertain.
        if candidate.isascii():
            return -0.5

        # Case 5: other Sinhala candidate -> normalised distance penalty.
        longest = max(len(candidate), len(rule_output), 1)
        distance = self.levenshtein(candidate, rule_output) / longest
        return -distance * self.fidelity_scale

    @staticmethod
    def compute_rank_prior(rank: int, total: int) -> float:
        """Log-decay rank prior: 0.0 for the first candidate, negative after."""
        return 0.0 if total <= 1 else math.log(1.0 / (rank + 1))

    # -- Combined score -------------------------------------------------------

    def score(
        self,
        mlm_score: float,
        candidate: str,
        rule_output: str,
        rank: int,
        total_candidates: int,
        is_english: bool = False,
        original_input: str = "",
        is_from_dict: bool = False,
        is_ambiguous: bool = False,
    ) -> ScoredCandidate:
        """Score *candidate* and return a :class:`ScoredCandidate` breakdown."""
        fidelity = self.compute_fidelity(
            candidate,
            rule_output,
            original_input,
            is_from_dict,
            is_ambiguous,
        )
        prior = self.compute_rank_prior(rank, total_candidates)

        total = (
            self.w_mlm * mlm_score
            + self.w_fidelity * fidelity
            + self.w_rank * prior
        )

        return ScoredCandidate(
            text=candidate,
            mlm_score=mlm_score,
            fidelity_score=fidelity,
            rank_score=prior,
            combined_score=total,
            is_english=is_english,
        )
|
core/transliterate.py
ADDED
|
@@ -0,0 +1,49 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Rule-based phonetic transliteration engine (Singlish β Sinhala Unicode).
|
| 3 |
+
"""
|
| 4 |
+
|
| 5 |
+
from core.mappings import (
|
| 6 |
+
CONSONANTS, CONSONANTS_UNI,
|
| 7 |
+
VOWELS, VOWELS_UNI, VOWEL_MODIFIERS_UNI,
|
| 8 |
+
SPECIAL_CONSONANTS, SPECIAL_CONSONANTS_UNI,
|
| 9 |
+
SPECIAL_CHARS, SPECIAL_CHARS_UNI,
|
| 10 |
+
N_VOWELS,
|
| 11 |
+
)
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
def rule_based_transliterate(text: str) -> str:
    """
    Convert Romanized Singlish text to Sinhala script using phonetic rules.

    Longer patterns are substituted before shorter ones, so greedy
    left-to-right replacement produces the intended output.
    """
    # 1. Special consonants (escape-style patterns) go first.
    for pattern, pattern_uni in zip(SPECIAL_CONSONANTS, SPECIAL_CONSONANTS_UNI):
        text = text.replace(pattern, pattern_uni)

    # 2. Consonant + special vocalic combinations (e.g. "kru").
    for special, special_uni in zip(SPECIAL_CHARS, SPECIAL_CHARS_UNI):
        for consonant, consonant_uni in zip(CONSONANTS, CONSONANTS_UNI):
            text = text.replace(consonant + special, consonant_uni + special_uni)

    # 3. Consonant + "r" clusters, with or without a following vowel.
    for consonant, consonant_uni in zip(CONSONANTS, CONSONANTS_UNI):
        for vowel, modifier in zip(VOWELS, VOWEL_MODIFIERS_UNI):
            text = text.replace(consonant + "r" + vowel, consonant_uni + "ΰ·βΰΆ»" + modifier)
        text = text.replace(consonant + "r", consonant_uni + "ΰ·βΰΆ»")

    # 4. Consonant + vowel combinations (only the first N_VOWELS entries,
    # matching the original index-based loop bound).
    for consonant, consonant_uni in zip(CONSONANTS, CONSONANTS_UNI):
        for vowel, modifier in zip(VOWELS[:N_VOWELS], VOWEL_MODIFIERS_UNI[:N_VOWELS]):
            text = text.replace(consonant + vowel, consonant_uni + modifier)

    # 5. Any remaining bare consonant takes the hal kirima (virama).
    for consonant, consonant_uni in zip(CONSONANTS, CONSONANTS_UNI):
        text = text.replace(consonant, consonant_uni + "ΰ·")

    # 6. Standalone vowels are converted last.
    for vowel, vowel_uni in zip(VOWELS, VOWELS_UNI):
        text = text.replace(vowel, vowel_uni)

    return text
|
dictionary.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:e214bca77be43a9705e84baa870cf6c26b6d77cbc297231905138193cc8aaf40
|
| 3 |
+
size 326599035
|
english_20k.txt
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
evaluation/dataset_110.csv
ADDED
|
@@ -0,0 +1,111 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
id,input,reference,split,has_code_mix,has_ambiguity,domain,notes
|
| 2 |
+
1,api kalin katha kala,ΰΆ
ΰΆ΄ΰ· ΰΆΰΆ½ΰ·ΰΆ±ΰ· ΰΆΰΆΰ· ΰΆΰ·
ΰ·,test,0,0,general,pure singlish
|
| 3 |
+
2,eka honda wage thiyanawa,ΰΆΰΆ ΰ·ΰ·ΰΆ³ ΰ·ΰΆΰ· ΰΆΰ·ΰΆΊΰ·ΰΆ±ΰ·ΰ·,test,0,1,general,wage=seems
|
| 4 |
+
3,meheta thadata wessa,ΰΆΈΰ·ΰ·ΰ·ΰΆ§ ΰΆΰΆ―ΰΆ§ ΰ·ΰ·ΰ·ΰ·ΰ·ΰ·,test,0,1,general,thadata=very
|
| 5 |
+
4,oya kiwwata mama giye,ΰΆΰΆΊΰ· ΰΆΰ·ΰ·ΰ·ΰ·ΰΆ§ ΰΆΈΰΆΈ ΰΆΰ·ΰΆΊΰ·,test,0,0,general,contextual past
|
| 6 |
+
5,mama danne na eka gena,ΰΆΈΰΆΈ ΰΆ―ΰΆ±ΰ·ΰΆ±ΰ· ΰΆ±ΰ· ΰΆΰΆ ΰΆΰ·ΰΆ±,test,0,1,general,eka pronoun
|
| 7 |
+
6,oya awa wage na,ΰΆΰΆΊΰ· ΰΆΰ·ΰ· ΰ·ΰΆΰ· ΰΆ±ΰ·,test,0,1,general,wage=seems
|
| 8 |
+
7,ekat ynna bri,ΰΆΰΆΰΆ§ ΰΆΊΰΆ±ΰ·ΰΆ± ΰΆΆΰ·ΰΆ»ΰ·,test,0,0,general,ad-hoc bri=bari
|
| 9 |
+
8,mama inne gedaradi,ΰΆΈΰΆΈ ΰΆΰΆ±ΰ·ΰΆ±ΰ· ΰΆΰ·ΰΆ―ΰΆ»ΰΆ―ΰ·,test,0,0,general,pure singlish
|
| 10 |
+
9,eka heta balamu,ΰΆΰΆ ΰ·ΰ·ΰΆ§ ΰΆΆΰΆ½ΰΆΈΰ·,test,0,0,general,eka pronoun
|
| 11 |
+
10,klya madi api passe yamu,ΰΆΰ·ΰΆ½ΰΆΊ ΰΆΈΰΆ―ΰ· ΰΆ
ΰΆ΄ΰ· ΰΆ΄ΰ·ΰ·ΰ·ΰ· ΰΆΊΰΆΈΰ·,test,0,0,general,ad-hoc klya=kalaya
|
| 12 |
+
11,assignment eka ada submit karanna one,assignment ΰΆΰΆ ΰΆ
ΰΆ― submit ΰΆΰΆ»ΰΆ±ΰ·ΰΆ± ΰΆΰΆ±ΰ·,test,1,0,education,eka after English noun
|
| 13 |
+
12,exam hall eka nisa mama baya una,exam hall ΰΆΰΆ ΰΆ±ΰ·ΰ·ΰ· ΰΆΈΰΆΈ ΰΆΆΰΆΊ ΰΆΰΆ±ΰ·,test,1,1,education,nisa=because
|
| 14 |
+
13,results blnna one,results ΰΆΆΰΆ½ΰΆ±ΰ·ΰΆ± ΰΆΰΆ±ΰ·,test,1,0,education,ad-hoc blnna=balanna
|
| 15 |
+
14,study group ekak hadamu,study group ΰΆΰΆΰΆΰ· ΰ·ΰΆ―ΰΆΈΰ·,test,1,0,education,ekak after English noun
|
| 16 |
+
15,viva ekta prepared wage na,viva ΰΆΰΆΰΆ§ prepared ΰ·ΰΆΰ· ΰΆ±ΰ·,test,1,1,education,wage=seems
|
| 17 |
+
16,mta project ek submit krnna one,ΰΆΈΰΆ§ project ΰΆΰΆ submit ΰΆΰΆ»ΰΆ±ΰ·ΰΆ± ΰΆΰΆ±ΰ·,test,1,0,education,ad-hoc mta krnna
|
| 18 |
+
17,hta parikshanaya thiyanawa,ΰ·ΰ·ΰΆ§ ΰΆ΄ΰΆ»ΰ·ΰΆΰ·βΰ·ΰΆ«ΰΆΊ ΰΆΰ·ΰΆΊΰ·ΰΆ±ΰ·ΰ·,test,0,0,education,ad-hoc hta=heta
|
| 19 |
+
18,mama potha kiyawala iwara kala,ΰΆΈΰΆΈ ΰΆ΄ΰ·ΰΆ ΰΆΰ·ΰΆΊΰ·ΰΆ½ΰ· ΰΆΰ·ΰΆ» ΰΆΰ·
ΰ·,test,0,0,education,pure singlish
|
| 20 |
+
19,prkku nisa api kalin giya,ΰΆ΄ΰΆ»ΰΆΰ·ΰΆΰ· ΰΆ±ΰ·ΰ·ΰ· ΰΆ
ΰΆ΄ΰ· ΰΆΰΆ½ΰ·ΰΆ±ΰ· ΰΆΰ·ΰΆΊΰ·,test,0,1,education,nisa=because
|
| 21 |
+
20,prashnaya hondai wage penenawa,ΰΆ΄ΰ·βΰΆ»ΰ·ΰ·ΰΆ±ΰΆΊ ΰ·ΰ·ΰΆ³ΰΆΊΰ· ΰ·ΰΆΰ· ΰΆ΄ΰ·ΰΆ±ΰ·ΰ·,test,0,1,education,wage=seems
|
| 22 |
+
21,deployments nisa site down wuna,deployments ΰΆ±ΰ·ΰ·ΰ· site down ΰΆΰΆ±ΰ·,test,1,1,work,nisa=because
|
| 23 |
+
22,PR eka merge karanna one,PR ΰΆΰΆ merge ΰΆΰΆ»ΰΆ±ΰ·ΰΆ± ΰΆΰΆ±ΰ·,test,1,0,work,eka after English noun
|
| 24 |
+
23,backlog eka update kala,backlog ΰΆΰΆ update ΰΆΰ·
ΰ·,test,1,0,work,eka after English noun
|
| 25 |
+
24,server down nisa work karanna ba,server down ΰΆ±ΰ·ΰ·ΰ· work ΰΆΰΆ»ΰΆ±ΰ·ΰΆ± ΰΆΆΰ·,test,1,1,work,nisa=because
|
| 26 |
+
25,meeting eka tomorrow damu,meeting ΰΆΰΆ tomorrow ΰΆ―ΰ·ΰΆΈΰ·,test,1,0,work,code-mix preserved
|
| 27 |
+
26,feedback nisa redo karanna una,feedback ΰΆ±ΰ·ΰ·ΰ· redo ΰΆΰΆ»ΰΆ±ΰ·ΰΆ± ΰΆΰΆ±ΰ·,test,1,1,work,nisa=because
|
| 28 |
+
27,ape wada ada iwara wenawa,ΰΆ
ΰΆ΄ΰ· ΰ·ΰ·ΰΆ© ΰΆ
ΰΆ― ΰΆΰ·ΰΆ» ΰ·ΰ·ΰΆ±ΰ·ΰ·,test,0,0,work,pure singlish
|
| 29 |
+
28,kalamanakaru hitpu nisa api katha kala,ΰΆΰΆ½ΰΆΈΰΆ±ΰ·ΰΆΰΆ»ΰ· ΰ·ΰ·ΰΆ§ΰΆ΄ΰ· ΰΆ±ΰ·ΰ·ΰ· ΰΆ
ΰΆ΄ΰ· ΰΆΰΆΰ· ΰΆΰ·
ΰ·,test,0,1,work,nisa=because; known failure (complex OOV)
|
| 30 |
+
29,me wada hondai wage penawa,ΰΆΈΰ· ΰ·ΰ·ΰΆ© ΰ·ΰ·ΰΆ³ΰΆΊΰ· ΰ·ΰΆΰ· ΰΆ΄ΰ·ΰΆ±ΰ·ΰ·,test,0,1,work,wage=seems
|
| 31 |
+
30,wada tika ada iwara karamu,ΰ·ΰ·ΰΆ© ΰΆ§ΰ·ΰΆ ΰΆ
ΰΆ― ΰΆΰ·ΰΆ» ΰΆΰΆ»ΰΆΈΰ·,test,0,0,work,pure singlish
|
| 32 |
+
31,story eke poll ekak damma,story ΰΆΰΆΰ· poll ΰΆΰΆΰΆΰ· ΰΆ―ΰ·ΰΆΈΰ·ΰΆΈΰ·,test,1,0,social,eke and ekak forms
|
| 33 |
+
32,oyata DM ekak yawwa,ΰΆΰΆΊΰ·ΰΆ§ DM ΰΆΰΆΰΆΰ· ΰΆΊΰ·ΰ·ΰ·ΰ·ΰ·,test,1,0,social,ekak after English noun
|
| 34 |
+
33,comment eka delete kala nisa mama danne na,comment ΰΆΰΆ delete ΰΆΰ·
ΰ· ΰΆ±ΰ·ΰ·ΰ· ΰΆΈΰΆΈ ΰΆ―ΰΆ±ΰ·ΰΆ±ΰ· ΰΆ±ΰ·,test,1,1,social,"nisa=because; known failure (ΰΆΰ·
ΰ·/ΰΆΰΆ½, ΰΆ―ΰΆ±ΰ·ΰΆ±ΰ·/ΰΆ―ΰΆ±ΰ·ΰΆ±ΰ·)"
|
| 35 |
+
34,selfie ekak gannako,selfie ΰΆΰΆΰΆΰ· ΰΆΰΆ±ΰ·ΰΆ±ΰΆΰ·,test,1,0,social,ekak after English noun
|
| 36 |
+
35,post eka private nisa share karanna epa,post ΰΆΰΆ private ΰΆ±ΰ·ΰ·ΰ· share ΰΆΰΆ»ΰΆ±ΰ·ΰΆ± ΰΆΰΆ΄ΰ·,test,1,1,social,nisa=because
|
| 37 |
+
36,oyta message krnna one,ΰΆΰΆΊΰ·ΰΆ§ message ΰΆΰΆ»ΰΆ±ΰ·ΰΆ± ΰΆΰΆ±ΰ·,test,1,0,social,ad-hoc oyta krnna on=one
|
| 38 |
+
37,api passe katha karamu,ΰΆ
ΰΆ΄ΰ· ΰΆ΄ΰ·ΰ·ΰ·ΰ· ΰΆΰΆΰ· ΰΆΰΆ»ΰΆΈΰ·,test,0,0,social,pure singlish
|
| 39 |
+
38,eya laga pinthurayk thiyanawa,ΰΆΰΆΊΰ· ΰ·
ΰΆ ΰΆ΄ΰ·ΰΆ±ΰ·ΰΆΰ·ΰΆ»ΰΆΊΰΆΰ· ΰΆΰ·ΰΆΊΰ·ΰΆ±ΰ·ΰ·,test,0,0,social,ad-hoc pinthurayk
|
| 40 |
+
39,oya awa wage mata hithenawa,ΰΆΰΆΊΰ· ΰΆΰ·ΰ· ΰ·ΰΆΰ· ΰΆΈΰΆ§ ΰ·ΰ·ΰΆΰ·ΰΆ±ΰ·ΰ·,test,0,1,social,wage=seems
|
| 41 |
+
40,api passe hambawemu,ΰΆ
ΰΆ΄ΰ· ΰΆ΄ΰ·ΰ·ΰ·ΰ· ΰ·ΰΆΈΰ·ΰΆΆΰ·ΰ·ΰΆΈΰ·,test,0,0,social,pure singlish
|
| 42 |
+
41,phone eka charge karanna one,phone ΰΆΰΆ charge ΰΆΰΆ»ΰΆ±ΰ·ΰΆ± ΰΆΰΆ±ΰ·,test,1,0,general,NEW: general code-mix (gap fix)
|
| 43 |
+
42,bus eka late una,bus ΰΆΰΆ late ΰΆΰΆ±ΰ·,test,1,0,general,NEW: general code-mix
|
| 44 |
+
43,mama online inne,ΰΆΈΰΆΈ online ΰΆΰΆ±ΰ·ΰΆ±ΰ·,test,1,0,general,NEW: English mid-sentence
|
| 45 |
+
44,time nathi nisa heta yamu,time ΰΆ±ΰ·ΰΆΰ· ΰΆ±ΰ·ΰ·ΰ· ΰ·ΰ·ΰΆ§ ΰΆΊΰΆΈΰ·,test,1,1,general,NEW: English+nisa in general
|
| 46 |
+
45,oya call eka ganna,ΰΆΰΆΊΰ· call ΰΆΰΆ ΰΆΰΆ±ΰ·ΰΆ±,test,1,0,general,NEW: general code-mix eka pattern
|
| 47 |
+
46,api game yanawa heta,ΰΆ
ΰΆ΄ΰ· ΰΆΰΆΈΰ· ΰΆΊΰΆ±ΰ·ΰ· ΰ·ΰ·ΰΆ§,test,0,1,general,NEW: game=ΰΆΰΆΈΰ·(village) ambig with English 'game'
|
| 48 |
+
47,man heta enne na,ΰΆΈΰΆ±ΰ· ΰ·ΰ·ΰΆ§ ΰΆΰΆ±ΰ·ΰΆ±ΰ· ΰΆ±ΰ·,test,0,1,general,NEW: man=ΰΆΈΰΆ(I) ambig with English 'man'
|
| 49 |
+
48,eka hari lassanai,ΰΆΰΆ ΰ·ΰΆ»ΰ· ΰΆ½ΰ·ΰ·ΰ·ΰΆ±ΰΆΊΰ·,test,0,1,general,NEW: hari=very (not OK/correct)
|
| 50 |
+
49,oya kiwwa hari,ΰΆΰΆΊΰ· ΰΆΰ·ΰ·ΰ·ΰ·ΰ· ΰ·ΰΆ»ΰ·,test,0,1,general,NEW: hari=correct (not very)
|
| 51 |
+
50,kalaya ithuru krganna one,ΰΆΰΆ½ΰΆΊ ΰΆΰΆΰ·ΰΆ»ΰ· ΰΆΰΆ»ΰΆΰΆ±ΰ·ΰΆ± ΰΆΰΆ±ΰ·,test,0,1,general,NEW: one=ΰΆΰΆ±ΰ·(need) ambig with English 'one'
|
| 52 |
+
51,date eka fix karanna one,date ΰΆΰΆ fix ΰΆΰΆ»ΰΆ±ΰ·ΰΆ± ΰΆΰΆ±ΰ·,test,1,1,general,NEW: date=English preserve; one=ΰΆΰΆ±ΰ·
|
| 53 |
+
52,rata yanna one,ΰΆ»ΰΆ§ ΰΆΊΰΆ±ΰ·ΰΆ± ΰΆΰΆ±ΰ·,test,0,0,general,"NEW: rata=country, pure singlish"
|
| 54 |
+
53,game eke leaderboard eka balanna,game ΰΆΰΆΰ· leaderboard ΰΆΰΆ ΰΆΆΰΆ½ΰΆ±ΰ·ΰΆ±,test,1,1,social,NEW: game=English(video game) not ΰΆΰΆΈΰ·
|
| 55 |
+
54,api thamai hodama,ΰΆ
ΰΆ΄ΰ· ΰΆΰΆΈΰΆΊΰ· ΰ·ΰ·ΰΆ³ΰΆΈ,test,0,1,general,NEW: thamai=emphatic we; hodama=best; looks English but Singlish
|
| 56 |
+
55,mama heta udee enawa oya enakota message ekk dnna,ΰΆΈΰΆΈ ΰ·ΰ·ΰΆ§ ΰΆΰΆ―ΰ· ΰΆΰΆ±ΰ·ΰ· ΰΆΰΆΊΰ· ΰΆΰΆ±ΰΆΰ·ΰΆ§ message ΰΆΰΆΰΆΰ· ΰΆ―ΰ·ΰΆ±ΰ·ΰΆ±,test,0,0,general,NEW: 8-word pure singlish
|
| 57 |
+
56,ape gedara langa thiyana kadeta yanna one,ΰΆ
ΰΆ΄ΰ· ΰΆΰ·ΰΆ―ΰΆ» ΰ·
ΰΆ ΰΆΰ·ΰΆΊΰ·ΰΆ± ΰΆΰΆ©ΰ·ΰΆ§ ΰΆΊΰΆ±ΰ·ΰΆ± ΰΆΰΆ±ΰ·,test,0,0,general,NEW: 7-word with ΰ·
ΰΆ
|
| 58 |
+
57,mama assignment eka karala submit karanawa ada raa,ΰΆΈΰΆΈ assignment ΰΆΰΆ ΰΆΰΆ»ΰΆ½ΰ· submit ΰΆΰΆ»ΰΆ±ΰ·ΰ· ΰΆ
ΰΆ― ΰΆ»ΰ·,test,1,0,education,NEW: 8-word code-mix long
|
| 59 |
+
58,oya enne naththe mokada kiyla mama danne na,ΰΆΰΆΊΰ· ΰΆΰΆ±ΰ·ΰΆ±ΰ· ΰΆ±ΰ·ΰΆΰ·ΰΆΰ· ΰΆΈΰ·ΰΆΰΆ― ΰΆΰ·ΰΆΊΰΆ½ΰ· ΰΆΈΰΆΈ ΰΆ―ΰΆ±ΰ·ΰΆ±ΰ· ΰΆ±ΰ·,test,0,0,general,NEW: 9-word complex clause
|
| 60 |
+
59,client ekka call karala feedback eka ahanna one,client ΰΆΰΆΰ·ΰΆ call ΰΆΰΆ»ΰΆ½ΰ· feedback ΰΆΰΆ ΰΆ
ΰ·ΰΆ±ΰ·ΰΆ± ΰΆΰΆ±ΰ·,test,1,0,work,NEW: 8-word heavy code-mix
|
| 61 |
+
60,mama gedara gihilla kewata passe call karannm,ΰΆΈΰΆΈ ΰΆΰ·ΰΆ―ΰΆ» ΰΆΰ·ΰ·ΰ·ΰΆ½ΰ·ΰΆ½ΰ· ΰΆΰ·ΰ·ΰΆ§ ΰΆ΄ΰ·ΰ·ΰ·ΰ· call ΰΆΰΆ»ΰΆ±ΰ·ΰΆ±ΰΆΈΰ·,test,1,0,general,NEW: 8-word code-mix + temporal
|
| 62 |
+
61,laptop eke software update karanna one,laptop ΰΆΰΆΰ· software update ΰΆΰΆ»ΰΆ±ΰ·ΰΆ± ΰΆΰΆ±ΰ·,test,1,0,work,NEW: 3 English words consecutive
|
| 63 |
+
62,office eke wifi password eka mokakda,office ΰΆΰΆΰ· wifi password ΰΆΰΆ ΰΆΈΰ·ΰΆΰΆΰ·ΰΆ―,test,1,0,work,NEW: 3 English words; question
|
| 64 |
+
63,online order eka track karanna ba,online order ΰΆΰΆ track ΰΆΰΆ»ΰΆ±ΰ·ΰΆ± ΰΆΆΰ·,test,1,0,general,NEW: 3 English words
|
| 65 |
+
64,email eke attachment eka download karanna,email ΰΆΰΆΰ· attachment ΰΆΰΆ download ΰΆΰΆ»ΰΆ±ΰ·ΰΆ±,test,1,0,work,NEW: 3 English words + double eka
|
| 66 |
+
65,Instagram story eke filter eka hadanna,Instagram story ΰΆΰΆΰ· filter ΰΆΰΆ ΰ·ΰΆ―ΰΆ±ΰ·ΰΆ±,test,1,0,social,NEW: 4 English words; social media
|
| 67 |
+
66,oyge wada iwra krd,ΰΆΰΆΊΰ·ΰΆΰ· ΰ·ΰ·ΰΆ© ΰΆΰ·ΰΆ» ΰΆΰΆ»ΰ·ΰΆ―,test,0,0,general,NEW: extreme vowel omission
|
| 68 |
+
67,mge phone ek hack una,ΰΆΈΰΆΰ· phone ΰΆΰΆ hack ΰΆΰΆ±ΰ·,test,1,0,general,"NEW: heavy ad-hoc mmge=mage, hrk=hack"
|
| 69 |
+
68,handawata ynna wenwa,ΰ·ΰ·ΰΆ±ΰ·ΰΆ―ΰ·ΰ·ΰΆ§ ΰΆΊΰΆ±ΰ·ΰΆ± ΰ·ΰ·ΰΆ±ΰ·ΰ·,test,0,0,general,"NEW: ad-hoc hndta=handeta, wenwa=wenawa"
|
| 70 |
+
69,prashnya krnna oni,ΰΆ΄ΰ·βΰΆ»ΰ·ΰ·βΰΆ±ΰΆΊ ΰΆΰΆ»ΰΆ±ΰ·ΰΆ± ΰΆΰΆ±ΰ·,test,0,0,education,NEW: replaced extreme ad-hoc with more readable form
|
| 71 |
+
70,apita gdra ynna oni,ΰΆ
ΰΆ΄ΰ·ΰΆ§ ΰΆΰ·ΰΆ―ΰΆ» ΰΆΊΰΆ±ΰ·ΰΆ± ΰΆΰΆ±ΰ·,test,0,0,general,NEW: ad-hoc gdra=gedara
|
| 72 |
+
71,mama oyata kiwwa,ΰΆΈΰΆΈ ΰΆΰΆΊΰ·ΰΆ§ ΰΆΰ·ΰ·ΰ·ΰ·ΰ·,test,0,0,general,"NEW: common words only (mama, oyata)"
|
| 73 |
+
72,oya hari hondai,ΰΆΰΆΊΰ· ΰ·ΰΆ»ΰ· ΰ·ΰ·ΰΆ³ΰΆΊΰ·,test,0,1,general,NEW: hari=very; common words
|
| 74 |
+
73,api heta yamu,ΰΆ
ΰΆ΄ΰ· ΰ·ΰ·ΰΆ§ ΰΆΊΰΆΈΰ·,test,0,0,general,NEW: common words bypass test
|
| 75 |
+
74,app eka crash wenawa phone eke,app ΰΆΰΆ crash ΰ·ΰ·ΰΆ±ΰ·ΰ· phone ΰΆΰΆΰ·,test,1,0,technology,NEW: tech domain
|
| 76 |
+
75,code eka push karanna github ekata,code ΰΆΰΆ push ΰΆΰΆ»ΰΆ±ΰ·ΰΆ± github ΰΆΰΆΰΆ§,test,1,0,technology,NEW: dev workflow code-mix
|
| 77 |
+
76,database eka slow nisa query eka optimize karanna one,database ΰΆΰΆ slow ΰΆ±ΰ·ΰ·ΰ· query ΰΆΰΆ optimize ΰΆΰΆ»ΰΆ±ΰ·ΰΆ± ΰΆΰΆ±ΰ·,test,1,1,technology,NEW: heavy tech code-mix + nisa; long
|
| 78 |
+
77,bug eka fix kala merge karanna,bug ΰΆΰΆ fix ΰΆΰ·
ΰ· merge ΰΆΰΆ»ΰΆ±ΰ·ΰΆ±,test,1,0,technology,NEW: sequential actions code-mix
|
| 79 |
+
78,internet eka slow wage thiyanawa,internet ΰΆΰΆ slow ΰ·ΰΆΰ· ΰΆΰ·ΰΆΊΰ·ΰΆ±ΰ·ΰ·,test,1,1,technology,NEW: tech + wage ambiguity
|
| 80 |
+
79,kema hodai ada,ΰΆΰ·ΰΆΈ ΰ·ΰ·ΰΆ³ΰΆΊΰ· ΰΆ
ΰΆ―,test,0,0,daily_life,NEW: daily life; short
|
| 81 |
+
80,mama bus eke enawa,ΰΆΈΰΆΈ bus ΰΆΰΆΰ· ΰΆΰΆ±ΰ·ΰ·,test,1,0,daily_life,NEW: transport code-mix
|
| 82 |
+
81,ganu depala ekka market giya,ΰΆΰ·ΰΆ±ΰ· ΰΆ―ΰ·ΰΆ΄ΰΆ½ ΰΆΰΆΰ·ΰΆ market ΰΆΰ·ΰΆΊΰ·,test,1,0,daily_life,NEW: colloquial + code-mix
|
| 83 |
+
82,watura bonna one,ΰ·ΰΆΰ·ΰΆ» ΰΆΆΰ·ΰΆ±ΰ·ΰΆ± ΰΆΰΆ±ΰ·,test,0,0,daily_life,NEW: health advice singlish
|
| 84 |
+
83,shop eke sugar nati nisa mama giye na,shop ΰΆΰΆΰ· sugar ΰΆ±ΰ·ΰΆΰ· ΰΆ±ΰ·ΰ·ΰ· ΰΆΈΰΆΈ ΰΆΰ·ΰΆΊΰ· ΰΆ±ΰ·,test,1,1,daily_life,NEW: daily code-mix + nisa; negative
|
| 85 |
+
84,hri hari,ΰ·ΰΆ»ΰ· ΰ·ΰΆ»ΰ·,test,0,0,general,NEW: 2-word repetition; common expression + ad-hoc hri=hari
|
| 86 |
+
85,mta ep,ΰΆΈΰΆ§ ΰΆΰΆ΄ΰ·,test,0,0,general,NEW: ad-hoc mta=mata ep=epa
|
| 87 |
+
86,ok hari,ok ΰ·ΰΆ»ΰ·,test,1,0,general,NEW: 2-word code-mix
|
| 88 |
+
87,ape game hari dewal wenne,ΰΆ
ΰΆ΄ΰ· ΰΆΰΆΈΰ· ΰ·ΰΆ»ΰ· ΰΆ―ΰ·ΰ·ΰΆ½ΰ· ΰ·ΰ·ΰΆ±ΰ·ΰΆ±ΰ·,test,0,1,general,"NEW: game=village, hari=nice; looks English"
|
| 89 |
+
88,mta dan one na,ΰΆΈΰΆ§ ΰΆ―ΰ·ΰΆ±ΰ· ΰΆΰΆ±ΰ· ΰΆ±ΰ·,test,0,1,general,NEW: man+one look English but Singlish
|
| 90 |
+
89,eka hari hondai wage dnuna nisa mama giya,ΰΆΰΆ ΰ·ΰΆ»ΰ· ΰ·ΰ·ΰΆ³ΰΆΊΰ· ΰ·ΰΆΰ· ΰΆ―ΰ·ΰΆ±ΰ·ΰΆ±ΰ· ΰΆ±ΰ·ΰ·ΰ· ΰΆΈΰΆΈ ΰΆΰ·ΰΆΊΰ·,test,0,1,general,NEW: hari+wage+nisa triple ambiguity; ref corrected to ΰ·ΰ·ΰΆ³ΰΆΊΰ·
|
| 91 |
+
90,game eke mission hari amarui,game ΰΆΰΆΰ· mission ΰ·ΰΆ»ΰ· ΰΆ
ΰΆΈΰ·ΰΆ»ΰ·ΰΆΊΰ·,test,0,1,general,NEW: game=video game hari=very amarui=difficult; looks English but Singlish
|
| 92 |
+
91,mama heta yanawa,ΰΆΈΰΆΈ ΰ·ΰ·ΰΆ§ ΰΆΊΰΆ±ΰ·ΰ·,test,0,0,general,NEW: future tense
|
| 93 |
+
92,ey iye aawa,ΰΆΰΆΊΰ· ΰΆΰΆΊΰ· ΰΆΰ·ΰ·,test,0,0,general,NEW: past tense
|
| 94 |
+
93,api dan yanawa,ΰΆ
ΰΆ΄ΰ· ΰΆ―ΰ·ΰΆ±ΰ· ΰΆΊΰΆ±ΰ·ΰ·,test,0,0,general,NEW: present tense
|
| 95 |
+
94,video eka balanna one,video ΰΆΰΆ ΰΆΆΰΆ½ΰΆ±ΰ·ΰΆ± ΰΆΰΆ±ΰ·,test,1,0,social,NEW: eka definite article
|
| 96 |
+
95,video ekak hadamu,video ΰΆΰΆΰΆΰ· ΰ·ΰΆ―ΰΆΈΰ·,test,1,0,social,NEW: ekak indefinite
|
| 97 |
+
96,video eke comment eka balanna,video ΰΆΰΆΰ· comment ΰΆΰΆ ΰΆΆΰΆ½ΰΆ±ΰ·ΰΆ±,test,1,0,social,NEW: eke possessive + double eka
|
| 98 |
+
97,video ekata like ekak danna,video ΰΆΰΆΰΆ§ like ΰΆΰΆΰΆΰ· ΰΆ―ΰ·ΰΆ±ΰ·ΰΆ±,test,1,0,social,NEW: ekata dative case
|
| 99 |
+
98,lecture eka record karala share karanna,lecture ΰΆΰΆ record ΰΆΰΆ»ΰΆ½ΰ· share ΰΆΰΆ»ΰΆ±ΰ·ΰΆ±,test,1,0,education,NEW: sequential code-mix actions
|
| 100 |
+
99,research paper eka liyanna one heta wge,research paper ΰΆΰΆ ΰΆ½ΰ·ΰΆΊΰΆ±ΰ·ΰΆ± ΰΆΰΆ±ΰ· ΰ·ΰ·ΰΆ§ ΰ·ΰΆΰ·,test,1,0,education,NEW: long + temporal; 8 words
|
| 101 |
+
100,exam eka hari amarui,exam ΰΆΰΆ ΰ·ΰΆ»ΰ· ΰΆ
ΰΆΈΰ·ΰΆ»ΰ·ΰΆΊΰ·,test,1,1,education,NEW: hari=very; difficulty context
|
| 102 |
+
101,sprint eka plan karamu Monday,sprint ΰΆΰΆ plan ΰΆΰΆ»ΰΆΈΰ· Monday,test,1,0,work,NEW: day name preserved
|
| 103 |
+
102,ape team eka deadline ekata kala,ΰΆ
ΰΆ΄ΰ· team ΰΆΰΆ deadline ΰΆΰΆΰΆ§ ΰΆΰ·
ΰ·,test,1,0,work,NEW: possessive + double English
|
| 104 |
+
103,standup eke mokada kiwwe,standup ΰΆΰΆΰ· ΰΆΈΰ·ΰΆΰΆ― ΰΆΰ·ΰ·ΰ·ΰ·ΰ·,test,1,0,work,NEW: question form code-mix
|
| 105 |
+
104,reel eka viral una,reel ΰΆΰΆ viral ΰΆΰΆ±ΰ·,test,1,0,social,NEW: social media terminology
|
| 106 |
+
105,group chat eke mokada wenne,group chat ΰΆΰΆΰ· ΰΆΈΰ·ΰΆΰΆ― ΰ·ΰ·ΰΆ±ΰ·ΰΆ±ΰ·,test,1,0,social,NEW: compound English + question
|
| 107 |
+
106,oyge profile picture eka lassanai,ΰΆΰΆΊΰ·ΰΆΰ· profile picture ΰΆΰΆ ΰΆ½ΰ·ΰ·ΰ·ΰΆ±ΰΆΊΰ·,test,1,0,social,NEW: compound English noun + eka; ref corrected to ΰΆΰΆΊΰ·ΰΆΰ·
|
| 108 |
+
107,mama enne na heta,ΰΆΈΰΆΈ ΰΆΰΆ±ΰ·ΰΆ±ΰ· ΰΆ±ΰ· ΰ·ΰ·ΰΆ§,test,0,0,general,NEW: negation at end
|
| 109 |
+
108,eka karanna epa,ΰΆΰΆ ΰΆΰΆ»ΰΆ±ΰ·ΰΆ± ΰΆΰΆ΄ΰ·,test,0,0,general,NEW: prohibition form
|
| 110 |
+
109,kawruwath enne na,ΰΆΰ·ΰ·ΰΆ»ΰ·ΰ·ΰΆΰ· ΰΆΰΆ±ΰ·ΰΆ±ΰ· ΰΆ±ΰ·,test,0,0,general,NEW: nobody negation
|
| 111 |
+
110,oya koheda ynne,ΰΆΰΆΊΰ· ΰΆΰ·ΰ·ΰ·ΰΆ― ΰΆΊΰΆ±ΰ·ΰΆ±ΰ·,test,0,0,general,NEW: question form where
|
evaluation/dataset_40.csv
ADDED
|
@@ -0,0 +1,41 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
id,input,reference,split,has_code_mix,has_ambiguity,domain,notes
|
| 2 |
+
1,api kalin katha kala,ΰΆ
ΰΆ΄ΰ· ΰΆΰΆ½ΰ·ΰΆ±ΰ· ΰΆΰΆΰ· ΰΆΰ·
ΰ·,train,0,0,general,pure singlish
|
| 3 |
+
2,eka honda wage thiyanawa,ΰΆΰΆ ΰ·ΰ·ΰΆ³ ΰ·ΰΆΰ· ΰΆΰ·ΰΆΊΰ·ΰΆ±ΰ·ΰ·,train,0,1,general,wage=seems
|
| 4 |
+
3,pola nisa gedara thiyanawa,ΰΆ΄ΰ·ΰΆ½ ΰΆ±ΰ·ΰ·ΰ· ΰΆΰ·ΰΆ―ΰΆ» ΰΆΰ·ΰΆΊΰ·ΰΆ±ΰ·ΰ·,train,0,1,general,nisa=because
|
| 5 |
+
4,oya kiwwata mama giye,ΰΆΰΆΊΰ· ΰΆΰ·ΰ·ΰ·ΰ·ΰΆ§ ΰΆΈΰΆΈ ΰΆΰ·ΰΆΊΰ·,train,0,0,general,contextual past
|
| 6 |
+
5,mama danne na eka gena,ΰΆΈΰΆΈ ΰΆ―ΰΆ±ΰ·ΰΆ±ΰ· ΰΆ±ΰ· ΰΆΰΆ ΰΆΰ·ΰΆ±,train,0,1,general,eka pronoun
|
| 7 |
+
6,oya awa wage na,ΰΆΰΆΊΰ· ΰΆΰ·ΰ· ΰ·ΰΆΰ· ΰΆ±ΰ·,train,0,1,general,wage=seems
|
| 8 |
+
7,ekat ynna bri,ΰΆΰΆΰΆ§ ΰΆΊΰΆ±ΰ·ΰΆ± ΰΆΆΰ·ΰΆ»ΰ·,train,0,0,general,ad hoc bri=bari
|
| 9 |
+
8,mama inne gedaradi,ΰΆΈΰΆΈ ΰΆΰΆ±ΰ·ΰΆ±ΰ· ΰΆΰ·ΰΆ―ΰΆ»ΰΆ―ΰ·,train,0,0,general,pure singlish
|
| 10 |
+
9,eka heta balamu,ΰΆΰΆ ΰ·ΰ·ΰΆ§ ΰΆΆΰΆ½ΰΆΈΰ·,train,0,0,general,eka pronoun
|
| 11 |
+
10,klya madi api passe yamu,ΰΆΰ·ΰΆ½ΰΆΊ ΰΆΈΰΆ―ΰ· ΰΆ
ΰΆ΄ΰ· ΰΆ΄ΰ·ΰ·ΰ·ΰ· ΰΆΊΰΆΈΰ·,train,0,0,general,ad hoc klya=kalaya
|
| 12 |
+
11,assignment eka ada submit karanna one,assignment ΰΆΰΆ ΰΆ
ΰΆ― submit ΰΆΰΆ»ΰΆ±ΰ·ΰΆ± ΰΆΰΆ±ΰ·,train,1,0,education,eka after English noun
|
| 13 |
+
12,exam hall eka nisa mama baya una,exam hall ΰΆΰΆ ΰΆ±ΰ·ΰ·ΰ· ΰΆΈΰΆΈ ΰΆΆΰΆΊ ΰΆΰΆ±ΰ·,train,1,1,education,nisa=because
|
| 14 |
+
13,results blnna one,results ΰΆΆΰΆ½ΰΆ±ΰ·ΰΆ± ΰΆΰΆ±ΰ·,train,1,0,education,ad hoc blnna=balanna
|
| 15 |
+
14,study group ekak hadamu,study group ΰΆΰΆΰΆΰ· ΰ·ΰΆ―ΰΆΈΰ·,train,1,0,education,ekak after English noun
|
| 16 |
+
15,viva ekta prepared wage na,viva ΰΆΰΆΰΆ§ prepared ΰ·ΰΆΰ· ΰΆ±ΰ·,train,1,1,education,wage=seems
|
| 17 |
+
16,mta project ek submit krnna one,ΰΆΈΰΆ§ project ΰΆΰΆ submit ΰΆΰΆ»ΰΆ±ΰ·ΰΆ± ΰΆΰΆ±ΰ·,train,1,0,education,ad hoc mta krnna
|
| 18 |
+
17,hta parikshanaya thiyanawa,ΰ·ΰ·ΰΆ§ ΰΆ΄ΰΆ»ΰ·ΰΆΰ·βΰ·ΰΆ«ΰΆΊ ΰΆΰ·ΰΆΊΰ·ΰΆ±ΰ·ΰ·,train,0,0,education,ad hoc hta=heta
|
| 19 |
+
18,mama poth kiyawala iwara kala,ΰΆΈΰΆΈ ΰΆ΄ΰ·ΰΆ ΰΆΰ·ΰΆΊΰ·ΰΆ½ΰ· ΰΆΰ·ΰΆ» ΰΆΰ·
ΰ·,train,0,0,education,pure singlish
|
| 20 |
+
19,guruwaraya nisa api kalin giya,ΰΆΰ·ΰΆ»ΰ·ΰ·ΰΆ»ΰΆΊΰ· ΰΆ±ΰ·ΰ·ΰ· ΰΆ
ΰΆ΄ΰ· ΰΆΰΆ½ΰ·ΰΆ±ΰ· ΰΆΰ·ΰΆΊΰ·,train,0,1,education,nisa=because
|
| 21 |
+
20,prashnaya honda wage penenawa,ΰΆ΄ΰ·βΰΆ»ΰ·ΰ·ΰΆ±ΰΆΊ ΰ·ΰ·ΰΆ³ ΰ·ΰΆΰ· ΰΆ΄ΰ·ΰΆ±ΰ·ΰ·,train,0,1,education,wage=seems
|
| 22 |
+
21,deploy nisa site down wuna,deploy ΰΆ±ΰ·ΰ·ΰ· site down ΰΆΰΆ±ΰ·,train,1,1,work,nisa=because
|
| 23 |
+
22,PR eka merge karanna one,PR ΰΆΰΆ merge ΰΆΰΆ»ΰΆ±ΰ·ΰΆ± ΰΆΰΆ±ΰ·,train,1,0,work,eka after English noun
|
| 24 |
+
23,backlog eka update kala,backlog ΰΆΰΆ update ΰΆΰ·
ΰ·,train,1,0,work,eka after English noun
|
| 25 |
+
24,server down nisa work karanna ba,server down ΰΆ±ΰ·ΰ·ΰ· work ΰΆΰΆ»ΰΆ±ΰ·ΰΆ± ΰΆΆΰ·,train,1,1,work,nisa=because
|
| 26 |
+
25,meeting eka tomorrow damu,meeting ΰΆΰΆ tomorrow ΰΆ―ΰ·ΰΆΈΰ·,train,1,0,work,code mix preserved
|
| 27 |
+
26,feedback nisa redo karanna una,feedback ΰΆ±ΰ·ΰ·ΰ· redo ΰΆΰΆ»ΰΆ±ΰ·ΰΆ± ΰΆΰΆ±ΰ·,train,1,1,work,nisa=because
|
| 28 |
+
27,ape wada ada iwara wenawa,ΰΆ
ΰΆ΄ΰ· ΰ·ΰ·ΰΆ© ΰΆ
ΰΆ― ΰΆΰ·ΰΆ» ΰ·ΰ·ΰΆ±ΰ·ΰ·,train,0,0,work,pure singlish
|
| 29 |
+
28,kalamanakaru apu nisa api katha kala,ΰΆΰΆ½ΰΆΈΰΆ«ΰ·ΰΆΰΆ»ΰ· ΰΆΰΆ΄ΰ· ΰΆ±ΰ·ΰ·ΰ· ΰΆ
ΰΆ΄ΰ· ΰΆΰΆΰ· ΰΆΰΆ½ΰ·,train,0,1,work,nisa=because
|
| 30 |
+
29,me wada honda wage penenawa,ΰΆΈΰ· ΰ·ΰ·ΰΆ© ΰ·ΰ·ΰΆ³ ΰ·ΰΆΰ· ΰΆ΄ΰ·ΰΆ±ΰ·ΰ·,train,0,1,work,wage=seems
|
| 31 |
+
30,wada tika ada iwara karamu,ΰ·ΰ·ΰΆ© ΰΆ§ΰ·ΰΆ ΰΆ
ΰΆ― ΰΆΰ·ΰΆ» ΰΆΰΆ»ΰΆΈΰ·,train,0,0,work,pure singlish
|
| 32 |
+
31,story eke poll ekak damma,story ΰΆΰΆΰ· poll ΰΆΰΆΰΆΰ· ΰΆ―ΰ·ΰΆΈΰ·ΰΆΈΰ·,train,1,0,social,eke and ekak forms
|
| 33 |
+
32,oyata DM ekak yewwa,ΰΆΰΆΊΰ·ΰΆ§ DM ΰΆΰΆΰΆΰ· ΰΆΊΰ·ΰ·ΰ·ΰ·ΰ·,train,1,0,social,ekak after English noun
|
| 34 |
+
33,comment eka delete kala nisa mama danne na,comment ΰΆΰΆ delete ΰΆΰΆ½ ΰΆ±ΰ·ΰ·ΰ· ΰΆΈΰΆΈ ΰΆ―ΰΆ±ΰ·ΰΆ±ΰ· ΰΆ±ΰ·,train,1,1,social,nisa=because
|
| 35 |
+
34,selfie ekak gannako,selfie ΰΆΰΆΰΆΰ· ΰΆΰΆ±ΰ·ΰΆ±ΰΆΰ·,train,1,0,social,ekak after English noun
|
| 36 |
+
35,post eka private nisa share karanna epa,post ΰΆΰΆ private ΰΆ±ΰ·ΰ·ΰ· share ΰΆΰΆ»ΰΆ±ΰ·ΰΆ± ΰΆΰΆ΄ΰ·,train,1,1,social,nisa=because
|
| 37 |
+
36,oyta message krnna on,ΰΆΰΆΊΰ·ΰΆ§ message ΰΆΰΆ»ΰΆ±ΰ·ΰΆ± ΰΆΰΆ±ΰ·,train,1,0,social,ad hoc oyta krnna
|
| 38 |
+
37,oya passe katha karamu,ΰΆΰΆΊΰ· ΰΆ΄ΰ·ΰ·ΰ·ΰ· ΰΆΰΆΰ· ΰΆΰΆ»ΰΆΈΰ·,train,0,0,social,pure singlish
|
| 39 |
+
38,eya laga pinthurayk thiyanawa,ΰΆΰΆΊΰ· ΰ·
ΰΆ ΰΆ΄ΰ·ΰΆ±ΰ·ΰΆΰ·ΰΆ»ΰΆΊΰΆΰ· ΰΆΰ·ΰΆΊΰ·ΰΆ±ΰ·ΰ·,train,0,0,social,ad hoc pinthurayk
|
| 40 |
+
39,oya awa wage mata hithenawa,ΰΆΰΆΊΰ· ΰΆΰ·ΰ· ΰ·ΰΆΰ· ΰΆΈΰΆ§ ΰ·ΰ·ΰΆΰ·ΰΆ±ΰ·ΰ·,train,0,1,social,wage=seems
|
| 41 |
+
40,api passe hambawemu,ΰΆ
ΰΆ΄ΰ· ΰΆ΄ΰ·ΰ·ΰ·ΰ· ΰ·ΰΆΈΰ·ΰΆΆΰ·ΰ·ΰΆΈΰ·,train,0,0,social,pure singlish
|
evaluation/evaluation.py
ADDED
|
@@ -0,0 +1,306 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import argparse
|
| 2 |
+
import csv
|
| 3 |
+
import json
|
| 4 |
+
import math
|
| 5 |
+
import os
|
| 6 |
+
import re
|
| 7 |
+
import sys
|
| 8 |
+
import time
|
| 9 |
+
from collections import Counter
|
| 10 |
+
from dataclasses import asdict
|
| 11 |
+
from typing import Dict, List, Tuple
|
| 12 |
+
|
| 13 |
+
# Ensure parent dir is on path so sincode_model can be imported from misc/
|
| 14 |
+
sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
|
| 15 |
+
|
| 16 |
+
from sincode_model import BeamSearchDecoder
|
| 17 |
+
|
| 18 |
+
ASCII_WORD_RE = re.compile(r"[A-Za-z][A-Za-z0-9_'-]*")
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
# ββ String-level metrics ββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 22 |
+
|
| 23 |
+
def levenshtein(a: str, b: str) -> int:
    """Return the character-level edit distance between two strings.

    Implemented as a rolling single-row dynamic program, so memory use is
    O(len(b)) rather than O(len(a) * len(b)).
    """
    if not a:
        return len(b)
    if not b:
        return len(a)

    row = list(range(len(b) + 1))
    for i, ch_a in enumerate(a, start=1):
        next_row = [i]
        for j, ch_b in enumerate(b, start=1):
            substitution = row[j - 1] + (ch_a != ch_b)
            deletion = row[j] + 1
            insertion = next_row[j - 1] + 1
            next_row.append(min(substitution, deletion, insertion))
        row = next_row
    return row[-1]
|
| 41 |
+
|
| 42 |
+
|
| 43 |
+
def cer(pred: str, ref: str) -> float:
    """Character error rate: edit distance normalised by reference length.

    An empty reference scores 0.0 only when the prediction is also empty.
    """
    if not ref:
        return 1.0 if pred else 0.0
    return levenshtein(pred, ref) / max(len(ref), 1)
|
| 47 |
+
|
| 48 |
+
|
| 49 |
+
def wer(pred: str, ref: str) -> float:
    """Word error rate: token-level edit distance over reference token count."""
    hyp = pred.split()
    gold = ref.split()
    if not gold:
        # No reference tokens: only an empty prediction is a perfect match.
        return 1.0 if hyp else 0.0
    return levenshtein_tokens(hyp, gold) / max(len(gold), 1)
|
| 55 |
+
|
| 56 |
+
|
| 57 |
+
def levenshtein_tokens(a: list, b: list) -> int:
    """Return the edit distance between two token sequences.

    Same rolling single-row dynamic program as the character version, but
    comparing whole tokens instead of characters.
    """
    if not a:
        return len(b)
    if not b:
        return len(a)

    row = list(range(len(b) + 1))
    for i, tok_a in enumerate(a, start=1):
        nxt = [i]
        for j, tok_b in enumerate(b, start=1):
            nxt.append(
                min(row[j] + 1, nxt[j - 1] + 1, row[j - 1] + (tok_a != tok_b))
            )
        row = nxt
    return row[-1]
|
| 70 |
+
|
| 71 |
+
|
| 72 |
+
def bleu_sentence(pred: str, ref: str, max_n: int = 4) -> float:
    """Sentence-level BLEU: geometric mean of clipped n-gram precisions.

    The n-gram order is capped by the length of the shorter sentence.
    No smoothing is applied, so a zero precision at any order yields 0.0.
    A brevity factor min(1, len(pred)/len(ref)) penalises short predictions.
    """
    hyp = pred.split()
    gold = ref.split()
    if not hyp or not gold:
        return 0.0

    order = min(max_n, len(hyp), len(gold))
    if order == 0:
        return 0.0

    def _ngrams(tokens: list, n: int) -> Counter:
        # Multiset of all contiguous n-grams in the token sequence.
        return Counter(tuple(tokens[i : i + n]) for i in range(len(tokens) - n + 1))

    log_precision_sum = 0.0
    for n in range(1, order + 1):
        hyp_counts = _ngrams(hyp, n)
        gold_counts = _ngrams(gold, n)
        # Clip each predicted n-gram count by its reference count.
        overlap = sum(min(c, gold_counts[gram]) for gram, c in hyp_counts.items())
        denom = max(sum(hyp_counts.values()), 1)
        p_n = overlap / denom
        if p_n == 0:
            return 0.0
        log_precision_sum += math.log(p_n) / order

    brevity_penalty = min(1.0, len(hyp) / len(gold))
    return brevity_penalty * math.exp(log_precision_sum)
|
| 100 |
+
|
| 101 |
+
|
| 102 |
+
def token_accuracy(pred: str, ref: str) -> float:
    """Fraction of reference tokens matched at the same position.

    Positional comparison via zip, so extra predicted tokens beyond the
    reference length neither help nor hurt.
    """
    hyp = pred.split()
    gold = ref.split()
    if not gold:
        return 0.0 if hyp else 1.0

    correct = sum(p == r for p, r in zip(hyp, gold))
    return correct / max(len(gold), 1)
|
| 110 |
+
|
| 111 |
+
|
| 112 |
+
def extract_english_tokens(text: str) -> List[str]:
    """Return every ASCII word (Latin-letter initial) found in *text*."""
    # ASCII_WORD_RE has no capture groups, so findall yields the full matches.
    return ASCII_WORD_RE.findall(text)
|
| 114 |
+
|
| 115 |
+
|
| 116 |
+
def code_mix_preservation(input_text: str, ref_text: str, pred_text: str) -> float:
    """Fraction of reference-side English tokens that survive in the prediction.

    English words are taken from the REFERENCE (not the raw input, which is
    entirely ASCII and cannot distinguish code-mixed words). Returns 1.0
    when the reference contains no English tokens at all.
    """
    english_in_ref = extract_english_tokens(ref_text)
    if not english_in_ref:
        return 1.0

    predicted_vocab = set(pred_text.split())
    kept = sum(word in predicted_vocab for word in english_in_ref)
    return kept / len(english_in_ref)
|
| 127 |
+
|
| 128 |
+
|
| 129 |
+
def load_dataset(csv_path: str) -> List[Tuple[str, str]]:
    """Load (input, reference) sentence pairs from a CSV file.

    Rows whose ``input`` is empty after stripping are skipped; the
    ``reference`` value may be empty.

    Args:
        csv_path: Path to a CSV with at least ``input`` and ``reference``
            columns.

    Returns:
        A list of (input, reference) tuples in file order.

    Raises:
        ValueError: If the file lacks a header row or the required columns.
    """
    rows: List[Tuple[str, str]] = []
    with open(csv_path, "r", encoding="utf-8", newline="") as f:
        reader = csv.DictReader(f)
        # DictReader.fieldnames is None for an empty file; guard so the
        # membership test raises a clear ValueError, not a TypeError.
        fields = reader.fieldnames or []
        if "input" not in fields or "reference" not in fields:
            raise ValueError("CSV must contain 'input' and 'reference' columns")

        for row in reader:
            src = (row.get("input") or "").strip()
            ref = (row.get("reference") or "").strip()
            if src:
                rows.append((src, ref))
    return rows
|
| 142 |
+
|
| 143 |
+
|
| 144 |
+
def evaluate(
    decoder: BeamSearchDecoder,
    dataset: List[Tuple[str, str]],
    mode: str = "greedy",
    beam_width: int = 5,
) -> Tuple[Dict[str, float], List[Dict[str, object]]]:
    """Decode every sentence in *dataset* and compute quality metrics.

    Args:
        decoder: Decoder exposing ``greedy_decode_with_diagnostics`` and
            ``decode_with_diagnostics`` (both return a 3-tuple whose first
            element is the predicted text).
        dataset: List of (input, reference) sentence pairs.
        mode: "greedy" for greedy decoding; anything else uses beam search.
        beam_width: Beam width passed through in beam mode.

    Returns:
        A ``(metrics, details)`` pair: an aggregate summary dict and a
        per-sentence record list (also used for CSV/JSON export).
    """
    details: List[Dict[str, object]] = []
    exact_hits = 0
    cer_sum = 0.0
    wer_sum = 0.0
    bleu_sum = 0.0
    token_acc_sum = 0.0
    code_mix_sum = 0.0
    elapsed_sum = 0.0

    for sample_id, (src, ref) in enumerate(dataset, start=1):
        started = time.perf_counter()
        if mode == "greedy":
            pred, _, _diag = decoder.greedy_decode_with_diagnostics(src)
        else:
            pred, _, _diag = decoder.decode_with_diagnostics(
                src, beam_width=beam_width
            )
        duration = time.perf_counter() - started
        elapsed_sum += duration

        hit = int(pred == ref)
        exact_hits += hit

        sample_cer = cer(pred, ref)
        sample_wer = wer(pred, ref)
        sample_bleu = bleu_sentence(pred, ref)
        sample_token_acc = token_accuracy(pred, ref)
        sample_code_mix = code_mix_preservation(src, ref, pred)

        cer_sum += sample_cer
        wer_sum += sample_wer
        bleu_sum += sample_bleu
        token_acc_sum += sample_token_acc
        code_mix_sum += sample_code_mix

        # Key order is preserved for the JSON diagnostics dump.
        details.append({
            "id": sample_id,
            "input": src,
            "reference": ref,
            "prediction": pred,
            "exact_match": bool(hit),
            "cer": round(sample_cer, 4),
            "wer": round(sample_wer, 4),
            "bleu": round(sample_bleu, 4),
            "token_accuracy": round(sample_token_acc, 4),
            "code_mix_preservation": round(sample_code_mix, 4),
            "time_s": round(duration, 3),
        })

    denom = max(len(dataset), 1)
    metrics = {
        "mode": mode,
        "samples": len(dataset),
        "exact_match": round(exact_hits / denom, 4),
        "exact_match_count": f"{exact_hits}/{len(dataset)}",
        "avg_cer": round(cer_sum / denom, 4),
        "avg_wer": round(wer_sum / denom, 4),
        "avg_bleu": round(bleu_sum / denom, 4),
        "avg_token_accuracy": round(token_acc_sum / denom, 4),
        "avg_code_mix_preservation": round(code_mix_sum / denom, 4),
        "total_time_s": round(elapsed_sum, 2),
        "avg_time_per_sentence_s": round(elapsed_sum / denom, 3),
    }
    return metrics, details
|
| 214 |
+
|
| 215 |
+
|
| 216 |
+
def write_predictions(path: str, rows: List[Dict[str, object]]) -> None:
    """Persist per-sample evaluation records to *path* as UTF-8 CSV."""
    columns = [
        "id",
        "input",
        "reference",
        "prediction",
        "exact_match",
        "cer",
        "wer",
        "bleu",
        "token_accuracy",
        "code_mix_preservation",
        "time_s",
    ]
    with open(path, "w", encoding="utf-8", newline="") as f:
        writer = csv.DictWriter(f, fieldnames=columns)
        writer.writeheader()
        # Project each record down to the known columns, dropping extras.
        writer.writerows({col: row[col] for col in columns} for row in rows)
|
| 237 |
+
|
| 238 |
+
|
| 239 |
+
def write_diagnostics(path: str, rows: List[Dict[str, object]]) -> None:
    """Dump the per-sample records to *path* as pretty-printed UTF-8 JSON."""
    with open(path, "w", encoding="utf-8") as handle:
        handle.write(json.dumps(rows, ensure_ascii=False, indent=2))
|
| 242 |
+
|
| 243 |
+
|
| 244 |
+
def parse_args() -> argparse.Namespace:
    """Define and parse the evaluation script's command-line arguments."""
    parser = argparse.ArgumentParser(
        description="Evaluate SinCode transliteration quality on a CSV dataset.",
    )
    parser.add_argument("--dataset", required=True,
                        help="Path to CSV with columns: input,reference")
    parser.add_argument("--mode", choices=["greedy", "beam"], default="greedy",
                        help="Decode mode (default: greedy)")
    parser.add_argument("--beam-width", type=int, default=5,
                        help="Beam width used during decoding (default: 5, only for beam mode)")
    parser.add_argument("--model", default=None,
                        help="Optional Hugging Face model name or local path to evaluate")
    parser.add_argument("--predictions-out", default="eval_predictions.csv",
                        help="Output CSV path for per-sample predictions")
    parser.add_argument("--diagnostics-out", default="eval_diagnostics.json",
                        help="Output JSON path with per-word diagnostics")
    return parser.parse_args()
|
| 281 |
+
|
| 282 |
+
|
| 283 |
+
def main() -> None:
    """CLI entry point: load data, decode, report metrics, save artefacts."""
    args = parse_args()
    dataset = load_dataset(args.dataset)
    if not dataset:
        raise ValueError("Dataset is empty. Add rows with input/reference values.")

    if args.model:
        decoder = BeamSearchDecoder(model_name=args.model)
    else:
        decoder = BeamSearchDecoder()

    metrics, details = evaluate(
        decoder, dataset, mode=args.mode, beam_width=args.beam_width
    )

    write_predictions(args.predictions_out, details)
    write_diagnostics(args.diagnostics_out, details)

    banner = "=" * 60
    print("\n" + banner)
    print(" SinCode Evaluation Results")
    print(banner)
    print(json.dumps(metrics, ensure_ascii=False, indent=2))
    print(f"\nPredictions saved to: {args.predictions_out}")
    print(f"Diagnostics saved to: {args.diagnostics_out}")
|
| 303 |
+
|
| 304 |
+
|
| 305 |
+
# Script entry point: run the evaluation when invoked directly.
if __name__ == "__main__":
    main()
|
feedback_schema.sql
ADDED
|
@@ -0,0 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
-- User-submitted transliteration corrections collected from the app,
-- awaiting manual review (see review_status workflow below).
create table if not exists public.feedback_submissions (
    id bigint generated by default as identity primary key,
    created_at timestamptz not null default timezone('utc', now()),
    input_sentence text not null,      -- original romanised input
    original_output text not null,     -- what the model produced
    corrected_output text not null,    -- the user's correction
    user_comment text not null default '',
    decode_mode text not null default '',
    -- Review workflow state; constrained to the three known states.
    review_status text not null default 'pending'
        check (review_status in ('pending', 'approved', 'rejected')),
    admin_notes text not null default '',
    source text not null default 'streamlit'
);

-- Newest-first listing of submissions.
create index if not exists feedback_submissions_created_at_idx
    on public.feedback_submissions (created_at desc);

-- Filtering by review state (e.g. the pending queue).
create index if not exists feedback_submissions_review_status_idx
    on public.feedback_submissions (review_status);
|
feedback_store.py
ADDED
|
@@ -0,0 +1,126 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import json
|
| 2 |
+
from typing import Any, Dict, List, Optional
|
| 3 |
+
|
| 4 |
+
import requests
|
| 5 |
+
|
| 6 |
+
class FeedbackStore:
    """Thin client for the ``feedback_submissions`` table via Supabase REST.

    All requests go through the PostgREST endpoint at
    ``<supabase_url>/rest/v1/<table_name>``. Read/update operations use the
    service-role key when available; inserts use the non-admin key path.
    """

    def __init__(
        self,
        supabase_url: str = "",
        supabase_anon_key: str = "",
        supabase_service_key: str = "",
        table_name: str = "feedback_submissions",
    ) -> None:
        # Trailing slash is stripped so URL joins below stay well-formed.
        self.supabase_url = supabase_url.rstrip("/")
        self.supabase_anon_key = supabase_anon_key
        self.supabase_service_key = supabase_service_key
        self.table_name = table_name

    @property
    def is_remote_enabled(self) -> bool:
        """True when a URL plus at least one API key are configured."""
        return bool(self.supabase_url and (self.supabase_service_key or self.supabase_anon_key))

    @property
    def backend_label(self) -> str:
        """Human-readable backend description for display in the UI."""
        return "Supabase" if self.is_remote_enabled else "Supabase (not configured)"

    def save_submission(
        self,
        input_sentence: str,
        original_output: str,
        corrected_output: str,
        user_comment: str = "",
        decode_mode: str = "",
    ) -> Dict[str, Any]:
        """Insert a new feedback record (status 'pending').

        Raises RuntimeError when Supabase is not configured; HTTP errors
        from the insert propagate as requests.HTTPError.
        """
        payload = {
            "input_sentence": input_sentence,
            "original_output": original_output,
            "corrected_output": corrected_output,
            "user_comment": user_comment.strip(),
            "decode_mode": decode_mode,
            "review_status": "pending",
            "admin_notes": "",
            "source": "streamlit",
        }

        self._require_remote()
        return self._insert_remote(payload)

    def list_submissions(self, review_status: Optional[str] = None, limit: int = 200) -> List[Dict[str, Any]]:
        """Fetch submissions, newest first, optionally filtered by status."""
        self._require_remote()
        return self._list_remote(review_status=review_status, limit=limit)

    def update_submission_status(self, submission_id: str, review_status: str, admin_notes: str = "") -> Dict[str, Any]:
        """Set the review status (and notes) of an existing submission."""
        self._require_remote()
        return self._update_remote(submission_id=submission_id, review_status=review_status, admin_notes=admin_notes)

    def _require_remote(self) -> None:
        # Fail fast with an actionable message instead of an opaque HTTP error.
        if not self.is_remote_enabled:
            raise RuntimeError(
                "Supabase is not configured. Set SUPABASE_URL and SUPABASE_SERVICE_ROLE_KEY in secrets."
            )

    def _insert_remote(self, payload: Dict[str, Any]) -> Dict[str, Any]:
        """POST the payload; return {'ok': True, 'record': <inserted row>}."""
        url = f"{self.supabase_url}/rest/v1/{self.table_name}"
        response = requests.post(
            url,
            # 'return=representation' asks PostgREST to echo the inserted row.
            headers=self._headers(admin=False, prefer="return=representation"),
            json=payload,
            timeout=15,
        )
        response.raise_for_status()
        rows = response.json()
        # Fall back to the request payload if the API returned no rows.
        row = rows[0] if rows else payload
        return {"ok": True, "record": row}

    def _list_remote(self, review_status: Optional[str], limit: int) -> List[Dict[str, Any]]:
        """GET submissions ordered by created_at descending."""
        url = f"{self.supabase_url}/rest/v1/{self.table_name}"
        params = {
            "select": "id,created_at,input_sentence,original_output,corrected_output,user_comment,decode_mode,review_status,admin_notes,source",
            "order": "created_at.desc",
            "limit": str(limit),
        }
        # 'all' (or None) means no status filter.
        if review_status and review_status != "all":
            params["review_status"] = f"eq.{review_status}"

        response = requests.get(url, headers=self._headers(admin=True), params=params, timeout=15)
        response.raise_for_status()
        return response.json()

    def _update_remote(self, submission_id: str, review_status: str, admin_notes: str) -> Dict[str, Any]:
        """PATCH the row matching submission_id; return the updated record."""
        url = f"{self.supabase_url}/rest/v1/{self.table_name}"
        response = requests.patch(
            url,
            headers=self._headers(admin=True, prefer="return=representation"),
            # PostgREST row filter: id=eq.<submission_id>.
            params={"id": f"eq.{submission_id}"},
            json={"review_status": review_status, "admin_notes": admin_notes.strip()},
            timeout=15,
        )
        response.raise_for_status()
        rows = response.json()
        # Synthesise a minimal record if the API did not echo the row back.
        row = rows[0] if rows else {"id": submission_id, "review_status": review_status, "admin_notes": admin_notes}
        return {"ok": True, "record": row}

    def _headers(self, admin: bool, prefer: str = "") -> Dict[str, str]:
        """Build auth headers, preferring the service key for admin calls."""
        # admin=True uses the service key when set; otherwise fall back to
        # whichever key is available (anon first, then service).
        key = self.supabase_service_key if admin and self.supabase_service_key else self.supabase_anon_key or self.supabase_service_key
        headers = {
            "apikey": key,
            "Authorization": f"Bearer {key}",
            "Content-Type": "application/json",
        }
        if prefer:
            headers["Prefer"] = prefer
        return headers
|
| 114 |
+
|
| 115 |
+
|
| 116 |
+
def format_feedback_error(exc: Exception) -> str:
    """Render an exception as a short, user-presentable message.

    HTTP errors carrying a response are summarised as ``<status>: <detail>``,
    preferring the JSON ``message``/``hint`` fields when the body parses to a
    dict; any other exception falls back to ``str(exc)``.
    """
    if not (isinstance(exc, requests.HTTPError) and exc.response is not None):
        return str(exc)

    response = exc.response
    try:
        body = response.json()
    except ValueError:
        body = None
    if isinstance(body, dict):
        detail = body.get("message") or body.get("hint") or json.dumps(body)
        return f"{response.status_code}: {detail}"
    return f"{response.status_code}: {response.text.strip()}"
|
fine_tuning/attempt_1_wikipedia/eval_diagnostics.json
ADDED
|
@@ -0,0 +1,522 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[
|
| 2 |
+
{
|
| 3 |
+
"id": 1,
|
| 4 |
+
"input": "api kalin katha kala",
|
| 5 |
+
"reference": "ΰΆ
ΰΆ΄ΰ· ΰΆΰΆ½ΰ·ΰΆ±ΰ· ΰΆΰΆΰ· ΰΆΰ·
ΰ·",
|
| 6 |
+
"prediction": "ΰΆ
ΰΆ΄ΰ· ΰΆΰΆ½ΰ·ΰΆ±ΰ· ΰΆΰΆΰ· ΰΆΰ·
ΰ·",
|
| 7 |
+
"exact_match": true,
|
| 8 |
+
"cer": 0.0,
|
| 9 |
+
"wer": 0.0,
|
| 10 |
+
"bleu": 1.0,
|
| 11 |
+
"token_accuracy": 1.0,
|
| 12 |
+
"code_mix_preservation": 1.0,
|
| 13 |
+
"time_s": 0.002
|
| 14 |
+
},
|
| 15 |
+
{
|
| 16 |
+
"id": 2,
|
| 17 |
+
"input": "eka honda wage thiyanawa",
|
| 18 |
+
"reference": "ΰΆΰΆ ΰ·ΰ·ΰΆ³ ΰ·ΰΆΰ· ΰΆΰ·ΰΆΊΰ·ΰΆ±ΰ·ΰ·",
|
| 19 |
+
"prediction": "ΰΆΰΆ ΰ·ΰ·ΰΆ³ ΰ·ΰΆΰ· ΰΆΰ·ΰΆΊΰ·ΰΆ±ΰ·ΰ·",
|
| 20 |
+
"exact_match": true,
|
| 21 |
+
"cer": 0.0,
|
| 22 |
+
"wer": 0.0,
|
| 23 |
+
"bleu": 1.0,
|
| 24 |
+
"token_accuracy": 1.0,
|
| 25 |
+
"code_mix_preservation": 1.0,
|
| 26 |
+
"time_s": 0.002
|
| 27 |
+
},
|
| 28 |
+
{
|
| 29 |
+
"id": 3,
|
| 30 |
+
"input": "pola nisa gedara thiyanawa",
|
| 31 |
+
"reference": "ΰΆ΄ΰ·ΰ·
ΰ·
ΰΆ ΰΆΰ·ΰΆ―ΰΆ» ΰΆΰ·ΰΆΊΰ·ΰΆ±ΰ·ΰ·",
|
| 32 |
+
"prediction": "ΰΆ΄ΰ·ΰΆ½ ΰΆ±ΰ·ΰ·ΰ· ΰΆΰ·ΰΆ―ΰΆ» ΰΆΰ·ΰΆΊΰ·ΰΆ±ΰ·ΰ·",
|
| 33 |
+
"exact_match": false,
|
| 34 |
+
"cer": 0.2632,
|
| 35 |
+
"wer": 0.5,
|
| 36 |
+
"bleu": 0.0,
|
| 37 |
+
"token_accuracy": 0.5,
|
| 38 |
+
"code_mix_preservation": 1.0,
|
| 39 |
+
"time_s": 0.204
|
| 40 |
+
},
|
| 41 |
+
{
|
| 42 |
+
"id": 4,
|
| 43 |
+
"input": "oya kiwwata mama giye",
|
| 44 |
+
"reference": "ΰΆΰΆΊΰ· ΰΆΰ·ΰ·ΰ·ΰ·ΰΆ§ ΰΆΈΰΆΈ ΰΆΰ·ΰΆΊΰ·",
|
| 45 |
+
"prediction": "ΰΆΰΆΊΰ· ΰΆΰ·ΰ·ΰ·ΰ·ΰΆ§ ΰΆΈΰΆΈ ΰΆΰ·ΰΆΊΰ·",
|
| 46 |
+
"exact_match": true,
|
| 47 |
+
"cer": 0.0,
|
| 48 |
+
"wer": 0.0,
|
| 49 |
+
"bleu": 1.0,
|
| 50 |
+
"token_accuracy": 1.0,
|
| 51 |
+
"code_mix_preservation": 1.0,
|
| 52 |
+
"time_s": 0.07
|
| 53 |
+
},
|
| 54 |
+
{
|
| 55 |
+
"id": 5,
|
| 56 |
+
"input": "mama danne na eka gena",
|
| 57 |
+
"reference": "ΰΆΈΰΆΈ ΰΆ―ΰΆ±ΰ·ΰΆ±ΰ· ΰΆ±ΰ· ΰΆ ΰΆΰ·ΰΆ±",
|
| 58 |
+
"prediction": "ΰΆΈΰΆΈ ΰΆ―ΰΆ±ΰ·ΰΆ±ΰ· ΰΆ±ΰ· ΰΆΰΆ ΰΆΰ·ΰΆ±",
|
| 59 |
+
"exact_match": false,
|
| 60 |
+
"cer": 0.0588,
|
| 61 |
+
"wer": 0.2,
|
| 62 |
+
"bleu": 0.0,
|
| 63 |
+
"token_accuracy": 0.8,
|
| 64 |
+
"code_mix_preservation": 1.0,
|
| 65 |
+
"time_s": 0.002
|
| 66 |
+
},
|
| 67 |
+
{
|
| 68 |
+
"id": 6,
|
| 69 |
+
"input": "oya awa wage na",
|
| 70 |
+
"reference": "ΰΆΰΆΊΰ· ΰΆΰ·ΰ· ΰ·ΰΆΰ· ΰΆ±ΰ·",
|
| 71 |
+
"prediction": "ΰΆΰΆΊΰ· ΰΆΰ·ΰ· ΰ·ΰΆΰ· ΰΆ±ΰ·",
|
| 72 |
+
"exact_match": true,
|
| 73 |
+
"cer": 0.0,
|
| 74 |
+
"wer": 0.0,
|
| 75 |
+
"bleu": 1.0,
|
| 76 |
+
"token_accuracy": 1.0,
|
| 77 |
+
"code_mix_preservation": 1.0,
|
| 78 |
+
"time_s": 0.001
|
| 79 |
+
},
|
| 80 |
+
{
|
| 81 |
+
"id": 7,
|
| 82 |
+
"input": "ekat ynna bri",
|
| 83 |
+
"reference": "ΰΆΰΆΰΆ§ ΰΆΊΰΆ±ΰ·ΰΆ± ΰΆΆΰ·ΰΆ»ΰ·",
|
| 84 |
+
"prediction": "ΰΆΰΆΰΆ§ ΰΆΊΰΆ±ΰ·ΰΆ± ΰΆΆΰ·ΰΆ»ΰ·",
|
| 85 |
+
"exact_match": true,
|
| 86 |
+
"cer": 0.0,
|
| 87 |
+
"wer": 0.0,
|
| 88 |
+
"bleu": 1.0,
|
| 89 |
+
"token_accuracy": 1.0,
|
| 90 |
+
"code_mix_preservation": 1.0,
|
| 91 |
+
"time_s": 0.023
|
| 92 |
+
},
|
| 93 |
+
{
|
| 94 |
+
"id": 8,
|
| 95 |
+
"input": "mama inne gedaradi",
|
| 96 |
+
"reference": "ΰΆΈΰΆΈ ΰΆΰΆ±ΰ·ΰΆ±ΰ· ΰΆΰ·ΰΆ―ΰΆ»ΰΆ―ΰ·",
|
| 97 |
+
"prediction": "ΰΆΈΰΆΈ ΰΆΰΆ±ΰ·ΰΆ±ΰ· ΰΆΰ·ΰΆ―ΰΆ»ΰΆ―ΰ·",
|
| 98 |
+
"exact_match": true,
|
| 99 |
+
"cer": 0.0,
|
| 100 |
+
"wer": 0.0,
|
| 101 |
+
"bleu": 1.0,
|
| 102 |
+
"token_accuracy": 1.0,
|
| 103 |
+
"code_mix_preservation": 1.0,
|
| 104 |
+
"time_s": 0.001
|
| 105 |
+
},
|
| 106 |
+
{
|
| 107 |
+
"id": 9,
|
| 108 |
+
"input": "eka heta balamu",
|
| 109 |
+
"reference": "ΰΆΰΆ ΰ·ΰ·ΰΆ§ ΰΆΆΰΆ½ΰΆΈΰ·",
|
| 110 |
+
"prediction": "ΰΆΰΆ ΰ·ΰ·ΰΆ§ ΰΆΆΰΆ½ΰΆΈΰ·",
|
| 111 |
+
"exact_match": true,
|
| 112 |
+
"cer": 0.0,
|
| 113 |
+
"wer": 0.0,
|
| 114 |
+
"bleu": 1.0,
|
| 115 |
+
"token_accuracy": 1.0,
|
| 116 |
+
"code_mix_preservation": 1.0,
|
| 117 |
+
"time_s": 0.001
|
| 118 |
+
},
|
| 119 |
+
{
|
| 120 |
+
"id": 10,
|
| 121 |
+
"input": "klya madi api passe yamu",
|
| 122 |
+
"reference": "ΰΆΰ·ΰΆ½ΰΆΊ ΰΆΈΰΆ―ΰ· ΰΆ
ΰΆ΄ΰ· ΰΆ΄ΰ·ΰ·ΰ·ΰ· ΰΆΊΰΆΈΰ·",
|
| 123 |
+
"prediction": "ΰΆΰ·ΰΆ½ΰΆΊ ΰΆΈΰΆ―ΰ· ΰΆ
ΰΆ΄ΰ· ΰΆ΄ΰ·ΰ·ΰ·ΰ· ΰΆΊΰΆΈΰ·",
|
| 124 |
+
"exact_match": true,
|
| 125 |
+
"cer": 0.0,
|
| 126 |
+
"wer": 0.0,
|
| 127 |
+
"bleu": 1.0,
|
| 128 |
+
"token_accuracy": 1.0,
|
| 129 |
+
"code_mix_preservation": 1.0,
|
| 130 |
+
"time_s": 0.048
|
| 131 |
+
},
|
| 132 |
+
{
|
| 133 |
+
"id": 11,
|
| 134 |
+
"input": "assignment eka ada submit karanna one",
|
| 135 |
+
"reference": "assignment ΰΆΰΆ ΰΆ
ΰΆ― submit ΰΆΰΆ»ΰΆ±ΰ·ΰΆ± ΰΆΰΆ±ΰ·",
|
| 136 |
+
"prediction": "assignment ΰΆΰΆ ΰΆ
ΰΆ― submit ΰΆΰΆ»ΰΆ±ΰ·ΰΆ± ΰΆΰΆ±ΰ·",
|
| 137 |
+
"exact_match": true,
|
| 138 |
+
"cer": 0.0,
|
| 139 |
+
"wer": 0.0,
|
| 140 |
+
"bleu": 1.0,
|
| 141 |
+
"token_accuracy": 1.0,
|
| 142 |
+
"code_mix_preservation": 1.0,
|
| 143 |
+
"time_s": 0.027
|
| 144 |
+
},
|
| 145 |
+
{
|
| 146 |
+
"id": 12,
|
| 147 |
+
"input": "exam hall eka nisa mama baya una",
|
| 148 |
+
"reference": "exam hall ΰΆΰΆ ΰΆ±ΰ·ΰ·ΰ· ΰΆΈΰΆΈ ΰΆΆΰΆΊ ΰΆΰΆ±ΰ·",
|
| 149 |
+
"prediction": "exam hall ΰΆΰΆ ΰΆ±ΰ·ΰ·ΰ· ΰΆΈΰΆΈ ΰΆΆΰΆΊ ΰΆΰΆ±ΰ·",
|
| 150 |
+
"exact_match": true,
|
| 151 |
+
"cer": 0.0,
|
| 152 |
+
"wer": 0.0,
|
| 153 |
+
"bleu": 1.0,
|
| 154 |
+
"token_accuracy": 1.0,
|
| 155 |
+
"code_mix_preservation": 1.0,
|
| 156 |
+
"time_s": 0.028
|
| 157 |
+
},
|
| 158 |
+
{
|
| 159 |
+
"id": 13,
|
| 160 |
+
"input": "results blnna one",
|
| 161 |
+
"reference": "results ΰΆΆΰΆ½ΰΆ±ΰ·ΰΆ± ΰΆΰΆ±ΰ·",
|
| 162 |
+
"prediction": "results ΰΆΆΰΆ½ΰΆ±ΰ·ΰΆ± ΰΆΰΆ±ΰ·",
|
| 163 |
+
"exact_match": true,
|
| 164 |
+
"cer": 0.0,
|
| 165 |
+
"wer": 0.0,
|
| 166 |
+
"bleu": 1.0,
|
| 167 |
+
"token_accuracy": 1.0,
|
| 168 |
+
"code_mix_preservation": 1.0,
|
| 169 |
+
"time_s": 0.001
|
| 170 |
+
},
|
| 171 |
+
{
|
| 172 |
+
"id": 14,
|
| 173 |
+
"input": "study group ekak hadamu",
|
| 174 |
+
"reference": "study group ΰΆΰΆΰΆΰ· ΰ·ΰΆ―ΰΆΈΰ·",
|
| 175 |
+
"prediction": "study group ΰΆΰΆΰΆΰ· ΰ·ΰΆ―ΰΆΈΰ·",
|
| 176 |
+
"exact_match": true,
|
| 177 |
+
"cer": 0.0,
|
| 178 |
+
"wer": 0.0,
|
| 179 |
+
"bleu": 1.0,
|
| 180 |
+
"token_accuracy": 1.0,
|
| 181 |
+
"code_mix_preservation": 1.0,
|
| 182 |
+
"time_s": 0.021
|
| 183 |
+
},
|
| 184 |
+
{
|
| 185 |
+
"id": 15,
|
| 186 |
+
"input": "viva ekta prepared wage na",
|
| 187 |
+
"reference": "viva ΰΆΰΆΰΆ§ prepared ΰ·ΰΆΰ· ΰΆ±ΰ·",
|
| 188 |
+
"prediction": "viva ΰΆΰΆΰΆ§ prepared ΰ·ΰΆΰ· ΰΆ±ΰ·",
|
| 189 |
+
"exact_match": true,
|
| 190 |
+
"cer": 0.0,
|
| 191 |
+
"wer": 0.0,
|
| 192 |
+
"bleu": 1.0,
|
| 193 |
+
"token_accuracy": 1.0,
|
| 194 |
+
"code_mix_preservation": 1.0,
|
| 195 |
+
"time_s": 0.002
|
| 196 |
+
},
|
| 197 |
+
{
|
| 198 |
+
"id": 16,
|
| 199 |
+
"input": "mta project ek submit krnna one",
|
| 200 |
+
"reference": "ΰΆΈΰΆ§ project ΰΆΰΆ submit ΰΆΰΆ»ΰΆ±ΰ·ΰΆ± ΰΆΰΆ±ΰ·",
|
| 201 |
+
"prediction": "ΰΆΈΰΆ§ project ΰΆΰΆ submit ΰΆΰΆ»ΰΆ±ΰ·ΰΆ± ΰΆΰΆ±ΰ·",
|
| 202 |
+
"exact_match": true,
|
| 203 |
+
"cer": 0.0,
|
| 204 |
+
"wer": 0.0,
|
| 205 |
+
"bleu": 1.0,
|
| 206 |
+
"token_accuracy": 1.0,
|
| 207 |
+
"code_mix_preservation": 1.0,
|
| 208 |
+
"time_s": 0.002
|
| 209 |
+
},
|
| 210 |
+
{
|
| 211 |
+
"id": 17,
|
| 212 |
+
"input": "hta parikshanaya thiyanawa",
|
| 213 |
+
"reference": "ΰ·ΰ·ΰΆ§ ΰΆ΄ΰΆ»ΰ·ΰΆΰ·ΰ·ΰΆ«ΰΆΊ ΰΆΰ·ΰΆΊΰ·ΰΆ±ΰ·ΰ·",
|
| 214 |
+
"prediction": "ΰ·ΰ·ΰΆ§ ΰΆ΄ΰΆ»ΰ·ΰΆΰ·βΰ·ΰΆ«ΰΆΊ ΰΆΰ·ΰΆΊΰ·ΰΆ±ΰ·ΰ·",
|
| 215 |
+
"exact_match": false,
|
| 216 |
+
"cer": 0.1,
|
| 217 |
+
"wer": 0.3333,
|
| 218 |
+
"bleu": 0.0,
|
| 219 |
+
"token_accuracy": 0.6667,
|
| 220 |
+
"code_mix_preservation": 1.0,
|
| 221 |
+
"time_s": 0.021
|
| 222 |
+
},
|
| 223 |
+
{
|
| 224 |
+
"id": 18,
|
| 225 |
+
"input": "mama poth kiyawala iwara kala",
|
| 226 |
+
"reference": "ΰΆΈΰΆΈ ΰΆ΄ΰ·ΰΆ ΰΆΰ·ΰΆΊΰ·ΰΆ½ΰ· ΰΆΰ·ΰΆ» ΰΆΰ·
ΰ·",
|
| 227 |
+
"prediction": "ΰΆΈΰΆΈ ΰΆ΄ΰ·ΰΆ ΰΆΰ·ΰΆΊΰ·ΰΆ½ΰ· ΰΆΰ·ΰΆ» ΰΆΰ·
ΰ·",
|
| 228 |
+
"exact_match": true,
|
| 229 |
+
"cer": 0.0,
|
| 230 |
+
"wer": 0.0,
|
| 231 |
+
"bleu": 1.0,
|
| 232 |
+
"token_accuracy": 1.0,
|
| 233 |
+
"code_mix_preservation": 1.0,
|
| 234 |
+
"time_s": 0.002
|
| 235 |
+
},
|
| 236 |
+
{
|
| 237 |
+
"id": 19,
|
| 238 |
+
"input": "guruwaraya nisa api kalin giya",
|
| 239 |
+
"reference": "ΰΆΰ·ΰΆ»ΰ·ΰ·ΰΆ»ΰΆΊΰ· ΰΆ±ΰ·ΰ·ΰ· ΰΆ
ΰΆ΄ΰ· ΰΆΰΆ½ΰ·ΰΆ±ΰ· ΰΆΰ·ΰΆΊΰ·",
|
| 240 |
+
"prediction": "ΰΆΰ·ΰΆ»ΰ·ΰ·ΰΆ»ΰΆΊ ΰΆ±ΰ·ΰ·ΰ· ΰΆ
ΰΆ΄ΰ· ΰΆΰΆ½ΰ·ΰΆ±ΰ· ΰΆΰ·ΰΆΊΰ·",
|
| 241 |
+
"exact_match": false,
|
| 242 |
+
"cer": 0.0357,
|
| 243 |
+
"wer": 0.2,
|
| 244 |
+
"bleu": 0.6687,
|
| 245 |
+
"token_accuracy": 0.8,
|
| 246 |
+
"code_mix_preservation": 1.0,
|
| 247 |
+
"time_s": 0.028
|
| 248 |
+
},
|
| 249 |
+
{
|
| 250 |
+
"id": 20,
|
| 251 |
+
"input": "prashnaya honda wage penenawa",
|
| 252 |
+
"reference": "ΰΆ΄ΰ·βΰΆ»ΰ·ΰ·ΰΆ±ΰΆΊ ΰ·ΰ·ΰΆ³ ΰ·ΰΆΰ· ΰΆ΄ΰ·ΰΆ±ΰ·ΰ·",
|
| 253 |
+
"prediction": "ΰΆ΄ΰ·βΰΆ»ΰ·ΰΆ±ΰΆΊ ΰ·ΰ·ΰΆ³ ΰ·ΰΆΰ· ΰΆ΄ΰ·ΰΆ±ΰ·ΰ·",
|
| 254 |
+
"exact_match": false,
|
| 255 |
+
"cer": 0.0455,
|
| 256 |
+
"wer": 0.25,
|
| 257 |
+
"bleu": 0.0,
|
| 258 |
+
"token_accuracy": 0.75,
|
| 259 |
+
"code_mix_preservation": 1.0,
|
| 260 |
+
"time_s": 0.024
|
| 261 |
+
},
|
| 262 |
+
{
|
| 263 |
+
"id": 21,
|
| 264 |
+
"input": "deploy nisa site down wuna",
|
| 265 |
+
"reference": "deploy ΰΆ±ΰ·ΰ·ΰ· site down ΰΆΰΆ±ΰ·",
|
| 266 |
+
"prediction": "deploy ΰΆ±ΰ·ΰ·ΰ· site down ΰΆΰΆ±ΰ·",
|
| 267 |
+
"exact_match": true,
|
| 268 |
+
"cer": 0.0,
|
| 269 |
+
"wer": 0.0,
|
| 270 |
+
"bleu": 1.0,
|
| 271 |
+
"token_accuracy": 1.0,
|
| 272 |
+
"code_mix_preservation": 1.0,
|
| 273 |
+
"time_s": 0.002
|
| 274 |
+
},
|
| 275 |
+
{
|
| 276 |
+
"id": 22,
|
| 277 |
+
"input": "PR eka merge karanna one",
|
| 278 |
+
"reference": "PR ΰΆΰΆ merge ΰΆΰΆ»ΰΆ±ΰ·ΰΆ± ΰΆΰΆ±ΰ·",
|
| 279 |
+
"prediction": "PR ΰΆΰΆ merge ΰΆΰΆ»ΰΆ±ΰ·ΰΆ± ΰΆΰΆ±ΰ·",
|
| 280 |
+
"exact_match": true,
|
| 281 |
+
"cer": 0.0,
|
| 282 |
+
"wer": 0.0,
|
| 283 |
+
"bleu": 1.0,
|
| 284 |
+
"token_accuracy": 1.0,
|
| 285 |
+
"code_mix_preservation": 1.0,
|
| 286 |
+
"time_s": 0.025
|
| 287 |
+
},
|
| 288 |
+
{
|
| 289 |
+
"id": 23,
|
| 290 |
+
"input": "backlog eka update kala",
|
| 291 |
+
"reference": "backlog ΰΆΰΆ update ΰΆΰ·
ΰ·",
|
| 292 |
+
"prediction": "backlog ΰΆΰΆ update ΰΆΰ·
ΰ·",
|
| 293 |
+
"exact_match": true,
|
| 294 |
+
"cer": 0.0,
|
| 295 |
+
"wer": 0.0,
|
| 296 |
+
"bleu": 1.0,
|
| 297 |
+
"token_accuracy": 1.0,
|
| 298 |
+
"code_mix_preservation": 1.0,
|
| 299 |
+
"time_s": 0.023
|
| 300 |
+
},
|
| 301 |
+
{
|
| 302 |
+
"id": 24,
|
| 303 |
+
"input": "server down nisa work karanna ba",
|
| 304 |
+
"reference": "server down ΰΆ±ΰ·ΰ·ΰ· work ΰΆΰΆ»ΰΆ±ΰ·ΰΆ± ΰΆΆΰ·",
|
| 305 |
+
"prediction": "server down ΰΆ±ΰ·ΰ·ΰ· work ΰΆΰΆ»ΰΆ±ΰ·ΰΆ± ΰΆΆΰ·",
|
| 306 |
+
"exact_match": true,
|
| 307 |
+
"cer": 0.0,
|
| 308 |
+
"wer": 0.0,
|
| 309 |
+
"bleu": 1.0,
|
| 310 |
+
"token_accuracy": 1.0,
|
| 311 |
+
"code_mix_preservation": 1.0,
|
| 312 |
+
"time_s": 0.002
|
| 313 |
+
},
|
| 314 |
+
{
|
| 315 |
+
"id": 25,
|
| 316 |
+
"input": "meeting eka tomorrow damu",
|
| 317 |
+
"reference": "meeting ΰΆΰΆ tomorrow ΰΆ―ΰ·ΰΆΈΰ·",
|
| 318 |
+
"prediction": "meeting ΰΆΰΆ tomorrow ΰΆ―ΰ·ΰΆΈΰ·",
|
| 319 |
+
"exact_match": true,
|
| 320 |
+
"cer": 0.0,
|
| 321 |
+
"wer": 0.0,
|
| 322 |
+
"bleu": 1.0,
|
| 323 |
+
"token_accuracy": 1.0,
|
| 324 |
+
"code_mix_preservation": 1.0,
|
| 325 |
+
"time_s": 0.02
|
| 326 |
+
},
|
| 327 |
+
{
|
| 328 |
+
"id": 26,
|
| 329 |
+
"input": "feedback nisa redo karanna una",
|
| 330 |
+
"reference": "feedback ΰΆ±ΰ·ΰ·ΰ· redo ΰΆΰΆ»ΰΆ±ΰ·ΰΆ± ΰΆΰΆ±ΰ·",
|
| 331 |
+
"prediction": "feedback ΰΆ±ΰ·ΰ·ΰ· redo ΰΆΰΆ»ΰΆ±ΰ·ΰΆ± ΰΆΰΆ±ΰ·",
|
| 332 |
+
"exact_match": true,
|
| 333 |
+
"cer": 0.0,
|
| 334 |
+
"wer": 0.0,
|
| 335 |
+
"bleu": 1.0,
|
| 336 |
+
"token_accuracy": 1.0,
|
| 337 |
+
"code_mix_preservation": 1.0,
|
| 338 |
+
"time_s": 0.002
|
| 339 |
+
},
|
| 340 |
+
{
|
| 341 |
+
"id": 27,
|
| 342 |
+
"input": "ape wada ada iwara wenawa",
|
| 343 |
+
"reference": "ΰΆ
ΰΆ΄ΰ· ΰ·ΰ·ΰΆ© ΰΆ
ΰΆ― ΰΆΰ·ΰΆ» ΰ·ΰ·ΰΆ±ΰ·ΰ·",
|
| 344 |
+
"prediction": "ΰΆ
ΰΆ΄ΰ· ΰ·ΰ·ΰΆ© ΰΆ
ΰΆ― ΰΆΰ·ΰΆ» ΰ·ΰ·ΰΆ±ΰ·ΰ·",
|
| 345 |
+
"exact_match": true,
|
| 346 |
+
"cer": 0.0,
|
| 347 |
+
"wer": 0.0,
|
| 348 |
+
"bleu": 1.0,
|
| 349 |
+
"token_accuracy": 1.0,
|
| 350 |
+
"code_mix_preservation": 1.0,
|
| 351 |
+
"time_s": 0.002
|
| 352 |
+
},
|
| 353 |
+
{
|
| 354 |
+
"id": 28,
|
| 355 |
+
"input": "kalamanakaruwa awa passe api katha kala",
|
| 356 |
+
"reference": "ΰΆΰ·
ΰΆΈΰΆ±ΰ·ΰΆΰΆ»ΰ· ΰΆΰ·ΰ· ΰΆ΄ΰ·ΰ·ΰ·ΰ· ΰΆ
ΰΆ΄ΰ· ΰΆΰΆΰ· ΰΆΰ·
ΰ·",
|
| 357 |
+
"prediction": "ΰΆΰΆ½ΰΆΈΰΆ±ΰΆΰΆ»ΰ·ΰ· ΰΆΰ·ΰ· ΰΆ΄ΰ·ΰ·ΰ·ΰ· ΰΆ
ΰΆ΄ΰ· ΰΆΰΆΰ· ΰΆΰ·
ΰ·",
|
| 358 |
+
"exact_match": false,
|
| 359 |
+
"cer": 0.1,
|
| 360 |
+
"wer": 0.1667,
|
| 361 |
+
"bleu": 0.7598,
|
| 362 |
+
"token_accuracy": 0.8333,
|
| 363 |
+
"code_mix_preservation": 1.0,
|
| 364 |
+
"time_s": 0.019
|
| 365 |
+
},
|
| 366 |
+
{
|
| 367 |
+
"id": 29,
|
| 368 |
+
"input": "me wada honda wage penenawa",
|
| 369 |
+
"reference": "ΰΆΈΰ· ΰ·ΰ·ΰΆ© ΰ·ΰ·ΰΆ³ ΰ·ΰΆΰ· ΰΆ΄ΰ·ΰΆ±ΰ·ΰ·",
|
| 370 |
+
"prediction": "ΰΆΈΰ· ΰ·ΰ·ΰΆ© ΰ·ΰ·ΰΆ³ ΰ·ΰΆΰ· ΰΆ΄ΰ·ΰΆ±ΰ·ΰ·",
|
| 371 |
+
"exact_match": true,
|
| 372 |
+
"cer": 0.0,
|
| 373 |
+
"wer": 0.0,
|
| 374 |
+
"bleu": 1.0,
|
| 375 |
+
"token_accuracy": 1.0,
|
| 376 |
+
"code_mix_preservation": 1.0,
|
| 377 |
+
"time_s": 0.002
|
| 378 |
+
},
|
| 379 |
+
{
|
| 380 |
+
"id": 30,
|
| 381 |
+
"input": "wada tika ada iwara karamu",
|
| 382 |
+
"reference": "ΰ·ΰ·ΰΆ© ΰΆ§ΰ·ΰΆ ΰΆ
ΰΆ― ΰΆΰ·ΰΆ» ΰΆΰΆ»ΰΆΈΰ·",
|
| 383 |
+
"prediction": "ΰ·ΰ·ΰΆ© ΰΆ§ΰ·ΰΆ ΰΆ
ΰΆ― ΰΆΰ·ΰΆ» ΰΆΰΆ»ΰΆΈΰ·",
|
| 384 |
+
"exact_match": true,
|
| 385 |
+
"cer": 0.0,
|
| 386 |
+
"wer": 0.0,
|
| 387 |
+
"bleu": 1.0,
|
| 388 |
+
"token_accuracy": 1.0,
|
| 389 |
+
"code_mix_preservation": 1.0,
|
| 390 |
+
"time_s": 0.019
|
| 391 |
+
},
|
| 392 |
+
{
|
| 393 |
+
"id": 31,
|
| 394 |
+
"input": "story eke poll ekak damma",
|
| 395 |
+
"reference": "story ΰΆΰΆΰ· poll ΰΆΰΆΰΆΰ· ΰΆ―ΰ·ΰΆΈΰ·ΰΆΈΰ·",
|
| 396 |
+
"prediction": "story ΰΆΰΆΰ· poll ΰΆΰΆΰΆΰ· ΰΆ―ΰ·ΰΆΈΰ·ΰΆΈΰ·",
|
| 397 |
+
"exact_match": true,
|
| 398 |
+
"cer": 0.0,
|
| 399 |
+
"wer": 0.0,
|
| 400 |
+
"bleu": 1.0,
|
| 401 |
+
"token_accuracy": 1.0,
|
| 402 |
+
"code_mix_preservation": 1.0,
|
| 403 |
+
"time_s": 0.025
|
| 404 |
+
},
|
| 405 |
+
{
|
| 406 |
+
"id": 32,
|
| 407 |
+
"input": "oyata DM ekak yewwa",
|
| 408 |
+
"reference": "ΰΆΰΆΊΰ·ΰΆ§ DM ΰΆΰΆΰΆΰ· ΰΆΊΰ·ΰ·ΰ·ΰ·ΰ·",
|
| 409 |
+
"prediction": "ΰΆΰΆΊΰ·ΰΆ§ DM ΰΆΰΆΰΆΰ· ΰΆΊΰ·ΰ·ΰ·ΰ·ΰ·",
|
| 410 |
+
"exact_match": true,
|
| 411 |
+
"cer": 0.0,
|
| 412 |
+
"wer": 0.0,
|
| 413 |
+
"bleu": 1.0,
|
| 414 |
+
"token_accuracy": 1.0,
|
| 415 |
+
"code_mix_preservation": 1.0,
|
| 416 |
+
"time_s": 0.028
|
| 417 |
+
},
|
| 418 |
+
{
|
| 419 |
+
"id": 33,
|
| 420 |
+
"input": "comment eka delete kala nisa mama danne na",
|
| 421 |
+
"reference": "comment ΰΆΰΆ delete ΰΆΰ·
ΰΆ±ΰ·ΰ·ΰ· ΰΆΈΰΆΈ ΰΆ―ΰΆ±ΰ·ΰΆ±ΰ· ΰΆ±ΰ·",
|
| 422 |
+
"prediction": "comment ΰΆΰΆ delete ΰΆΰ·
ΰ· ΰΆ±ΰ·ΰ·ΰ· ΰΆΈΰΆΈ ΰΆ―ΰΆ±ΰ·ΰΆ±ΰ· ΰΆ±ΰ·",
|
| 423 |
+
"exact_match": false,
|
| 424 |
+
"cer": 0.027,
|
| 425 |
+
"wer": 0.125,
|
| 426 |
+
"bleu": 0.5,
|
| 427 |
+
"token_accuracy": 0.875,
|
| 428 |
+
"code_mix_preservation": 1.0,
|
| 429 |
+
"time_s": 0.029
|
| 430 |
+
},
|
| 431 |
+
{
|
| 432 |
+
"id": 34,
|
| 433 |
+
"input": "selfie ekak gannako",
|
| 434 |
+
"reference": "selfie ΰΆΰΆΰΆΰ· ΰΆΰΆ±ΰ·ΰΆ±ΰΆΰ·",
|
| 435 |
+
"prediction": "selfie ΰΆΰΆΰΆΰ· ΰΆΰΆ±ΰ·ΰΆ±ΰΆΰ·",
|
| 436 |
+
"exact_match": true,
|
| 437 |
+
"cer": 0.0,
|
| 438 |
+
"wer": 0.0,
|
| 439 |
+
"bleu": 1.0,
|
| 440 |
+
"token_accuracy": 1.0,
|
| 441 |
+
"code_mix_preservation": 1.0,
|
| 442 |
+
"time_s": 0.025
|
| 443 |
+
},
|
| 444 |
+
{
|
| 445 |
+
"id": 35,
|
| 446 |
+
"input": "post eka private nisa share karanna epa",
|
| 447 |
+
"reference": "post ΰΆΰΆ private ΰΆ±ΰ·ΰ·ΰ· share ΰΆΰΆ»ΰΆ±ΰ·ΰΆ± ΰΆΰΆ΄ΰ·",
|
| 448 |
+
"prediction": "post ΰΆΰΆ private ΰΆ±ΰ·ΰ·ΰ· share ΰΆΰΆ»ΰΆ±ΰ·ΰΆ± ΰΆΰΆ΄ΰ·",
|
| 449 |
+
"exact_match": true,
|
| 450 |
+
"cer": 0.0,
|
| 451 |
+
"wer": 0.0,
|
| 452 |
+
"bleu": 1.0,
|
| 453 |
+
"token_accuracy": 1.0,
|
| 454 |
+
"code_mix_preservation": 1.0,
|
| 455 |
+
"time_s": 0.028
|
| 456 |
+
},
|
| 457 |
+
{
|
| 458 |
+
"id": 36,
|
| 459 |
+
"input": "oyta message krnna on",
|
| 460 |
+
"reference": "ΰΆΰΆΊΰ·ΰΆ§ message ΰΆΰΆ»ΰΆ±ΰ·ΰΆ± ΰΆΰΆ±ΰ·",
|
| 461 |
+
"prediction": "ΰΆΰΆΊΰ·ΰΆ§ message ΰΆΰΆ»ΰΆ±ΰ·ΰΆ± ΰΆΰΆ±ΰ·",
|
| 462 |
+
"exact_match": true,
|
| 463 |
+
"cer": 0.0,
|
| 464 |
+
"wer": 0.0,
|
| 465 |
+
"bleu": 1.0,
|
| 466 |
+
"token_accuracy": 1.0,
|
| 467 |
+
"code_mix_preservation": 1.0,
|
| 468 |
+
"time_s": 0.002
|
| 469 |
+
},
|
| 470 |
+
{
|
| 471 |
+
"id": 37,
|
| 472 |
+
"input": "oya passe katha karamu",
|
| 473 |
+
"reference": "ΰΆΰΆΊΰ· ΰΆ΄ΰ·ΰ·ΰ·ΰ· ΰΆΰΆΰ· ΰΆΰΆ»ΰΆΈΰ·",
|
| 474 |
+
"prediction": "ΰΆΰΆΊΰ· ΰΆ΄ΰ·ΰ·ΰ·ΰ· ΰΆΰΆΰ· ΰΆΰΆ»ΰΆΈΰ·",
|
| 475 |
+
"exact_match": true,
|
| 476 |
+
"cer": 0.0,
|
| 477 |
+
"wer": 0.0,
|
| 478 |
+
"bleu": 1.0,
|
| 479 |
+
"token_accuracy": 1.0,
|
| 480 |
+
"code_mix_preservation": 1.0,
|
| 481 |
+
"time_s": 0.002
|
| 482 |
+
},
|
| 483 |
+
{
|
| 484 |
+
"id": 38,
|
| 485 |
+
"input": "eya laga pinthurak thiyanawa",
|
| 486 |
+
"reference": "ΰΆΰΆΊΰ· ΰ·
ΰΆ ΰΆ΄ΰ·ΰΆ±ΰ·ΰΆΰ·ΰΆ»ΰΆΰ· ΰΆΰ·ΰΆΊΰ·ΰΆ±ΰ·ΰ·",
|
| 487 |
+
"prediction": "ΰΆΰΆΊΰ· ΰ·
ΰΆ ΰΆ΄ΰ·ΰΆ±ΰ·ΰΆΰ·ΰΆ»ΰΆΰ· ΰΆΰ·ΰΆΊΰ·ΰΆ±ΰ·ΰ·",
|
| 488 |
+
"exact_match": false,
|
| 489 |
+
"cer": 0.0417,
|
| 490 |
+
"wer": 0.25,
|
| 491 |
+
"bleu": 0.0,
|
| 492 |
+
"token_accuracy": 0.75,
|
| 493 |
+
"code_mix_preservation": 1.0,
|
| 494 |
+
"time_s": 0.019
|
| 495 |
+
},
|
| 496 |
+
{
|
| 497 |
+
"id": 39,
|
| 498 |
+
"input": "oya awa wage mata hithenawa",
|
| 499 |
+
"reference": "ΰΆΰΆΊΰ· ΰΆΰ·ΰ· ΰ·ΰΆΰ· ΰΆΈΰΆ§ ΰ·ΰ·ΰΆΰ·ΰΆ±ΰ·ΰ·",
|
| 500 |
+
"prediction": "ΰΆΰΆΊΰ· ΰΆΰ·ΰ· ΰ·ΰΆΰ· ΰΆΈΰΆ§ ΰ·ΰ·ΰΆΰ·ΰΆ±ΰ·ΰ·",
|
| 501 |
+
"exact_match": true,
|
| 502 |
+
"cer": 0.0,
|
| 503 |
+
"wer": 0.0,
|
| 504 |
+
"bleu": 1.0,
|
| 505 |
+
"token_accuracy": 1.0,
|
| 506 |
+
"code_mix_preservation": 1.0,
|
| 507 |
+
"time_s": 0.002
|
| 508 |
+
},
|
| 509 |
+
{
|
| 510 |
+
"id": 40,
|
| 511 |
+
"input": "api passe hambawemu",
|
| 512 |
+
"reference": "ΰΆ
ΰΆ΄ΰ· ΰΆ΄ΰ·ΰ·ΰ·ΰ· ΰ·ΰΆΈΰ·ΰΆΆΰ·ΰ·ΰΆΈΰ·",
|
| 513 |
+
"prediction": "ΰΆ
ΰΆ΄ΰ· ΰΆ΄ΰ·ΰ·ΰ·ΰ· ΰ·ΰΆΈΰ·ΰΆΆΰ·ΰ·ΰΆΈΰ·",
|
| 514 |
+
"exact_match": true,
|
| 515 |
+
"cer": 0.0,
|
| 516 |
+
"wer": 0.0,
|
| 517 |
+
"bleu": 1.0,
|
| 518 |
+
"token_accuracy": 1.0,
|
| 519 |
+
"code_mix_preservation": 1.0,
|
| 520 |
+
"time_s": 0.018
|
| 521 |
+
}
|
| 522 |
+
]
|
fine_tuning/attempt_1_wikipedia/eval_predictions.csv
ADDED
|
@@ -0,0 +1,41 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
id,input,reference,prediction,exact_match,cer,wer,bleu,token_accuracy,code_mix_preservation,time_s
|
| 2 |
+
1,api kalin katha kala,ΰΆ
ΰΆ΄ΰ· ΰΆΰΆ½ΰ·ΰΆ±ΰ· ΰΆΰΆΰ· ΰΆΰ·
ΰ·,ΰΆ
ΰΆ΄ΰ· ΰΆΰΆ½ΰ·ΰΆ±ΰ· ΰΆΰΆΰ· ΰΆΰ·
ΰ·,True,0.0,0.0,1.0,1.0,1.0,0.002
|
| 3 |
+
2,eka honda wage thiyanawa,ΰΆΰΆ ΰ·ΰ·ΰΆ³ ΰ·ΰΆΰ· ΰΆΰ·ΰΆΊΰ·ΰΆ±ΰ·ΰ·,ΰΆΰΆ ΰ·ΰ·ΰΆ³ ΰ·ΰΆΰ· ΰΆΰ·ΰΆΊΰ·ΰΆ±ΰ·ΰ·,True,0.0,0.0,1.0,1.0,1.0,0.002
|
| 4 |
+
3,pola nisa gedara thiyanawa,ΰΆ΄ΰ·ΰ·
ΰ·
ΰΆ ΰΆΰ·ΰΆ―ΰΆ» ΰΆΰ·ΰΆΊΰ·ΰΆ±ΰ·ΰ·,ΰΆ΄ΰ·ΰΆ½ ΰΆ±ΰ·ΰ·ΰ· ΰΆΰ·ΰΆ―ΰΆ» ΰΆΰ·ΰΆΊΰ·ΰΆ±ΰ·ΰ·,False,0.2632,0.5,0.0,0.5,1.0,0.204
|
| 5 |
+
4,oya kiwwata mama giye,ΰΆΰΆΊΰ· ΰΆΰ·ΰ·ΰ·ΰ·ΰΆ§ ΰΆΈΰΆΈ ΰΆΰ·ΰΆΊΰ·,ΰΆΰΆΊΰ· ΰΆΰ·ΰ·ΰ·ΰ·ΰΆ§ ΰΆΈΰΆΈ ΰΆΰ·ΰΆΊΰ·,True,0.0,0.0,1.0,1.0,1.0,0.07
|
| 6 |
+
5,mama danne na eka gena,ΰΆΈΰΆΈ ΰΆ―ΰΆ±ΰ·ΰΆ±ΰ· ΰΆ±ΰ· ΰΆ ΰΆΰ·ΰΆ±,ΰΆΈΰΆΈ ΰΆ―ΰΆ±ΰ·ΰΆ±ΰ· ΰΆ±ΰ· ΰΆΰΆ ΰΆΰ·ΰΆ±,False,0.0588,0.2,0.0,0.8,1.0,0.002
|
| 7 |
+
6,oya awa wage na,ΰΆΰΆΊΰ· ΰΆΰ·ΰ· ΰ·ΰΆΰ· ΰΆ±ΰ·,ΰΆΰΆΊΰ· ΰΆΰ·ΰ· ΰ·ΰΆΰ· ΰΆ±ΰ·,True,0.0,0.0,1.0,1.0,1.0,0.001
|
| 8 |
+
7,ekat ynna bri,ΰΆΰΆΰΆ§ ΰΆΊΰΆ±ΰ·ΰΆ± ΰΆΆΰ·ΰΆ»ΰ·,ΰΆΰΆΰΆ§ ΰΆΊΰΆ±ΰ·ΰΆ± ΰΆΆΰ·ΰΆ»ΰ·,True,0.0,0.0,1.0,1.0,1.0,0.023
|
| 9 |
+
8,mama inne gedaradi,ΰΆΈΰΆΈ ΰΆΰΆ±ΰ·ΰΆ±ΰ· ΰΆΰ·ΰΆ―ΰΆ»ΰΆ―ΰ·,ΰΆΈΰΆΈ ΰΆΰΆ±ΰ·ΰΆ±ΰ· ΰΆΰ·ΰΆ―ΰΆ»ΰΆ―ΰ·,True,0.0,0.0,1.0,1.0,1.0,0.001
|
| 10 |
+
9,eka heta balamu,ΰΆΰΆ ΰ·ΰ·ΰΆ§ ΰΆΆΰΆ½ΰΆΈΰ·,ΰΆΰΆ ΰ·ΰ·ΰΆ§ ΰΆΆΰΆ½ΰΆΈΰ·,True,0.0,0.0,1.0,1.0,1.0,0.001
|
| 11 |
+
10,klya madi api passe yamu,ΰΆΰ·ΰΆ½ΰΆΊ ΰΆΈΰΆ―ΰ· ΰΆ
ΰΆ΄ΰ· ΰΆ΄ΰ·ΰ·ΰ·ΰ· ΰΆΊΰΆΈΰ·,ΰΆΰ·ΰΆ½ΰΆΊ ΰΆΈΰΆ―ΰ· ΰΆ
ΰΆ΄ΰ· ΰΆ΄ΰ·ΰ·ΰ·ΰ· ΰΆΊΰΆΈΰ·,True,0.0,0.0,1.0,1.0,1.0,0.048
|
| 12 |
+
11,assignment eka ada submit karanna one,assignment ΰΆΰΆ ΰΆ
ΰΆ― submit ΰΆΰΆ»ΰΆ±ΰ·ΰΆ± ΰΆΰΆ±ΰ·,assignment ΰΆΰΆ ΰΆ
ΰΆ― submit ΰΆΰΆ»ΰΆ±ΰ·ΰΆ± ΰΆΰΆ±ΰ·,True,0.0,0.0,1.0,1.0,1.0,0.027
|
| 13 |
+
12,exam hall eka nisa mama baya una,exam hall ΰΆΰΆ ΰΆ±ΰ·ΰ·ΰ· ΰΆΈΰΆΈ ΰΆΆΰΆΊ ΰΆΰΆ±ΰ·,exam hall ΰΆΰΆ ΰΆ±ΰ·ΰ·ΰ· ΰΆΈΰΆΈ ΰΆΆΰΆΊ ΰΆΰΆ±ΰ·,True,0.0,0.0,1.0,1.0,1.0,0.028
|
| 14 |
+
13,results blnna one,results ΰΆΆΰΆ½ΰΆ±ΰ·ΰΆ± ΰΆΰΆ±ΰ·,results ΰΆΆΰΆ½ΰΆ±ΰ·ΰΆ± ΰΆΰΆ±ΰ·,True,0.0,0.0,1.0,1.0,1.0,0.001
|
| 15 |
+
14,study group ekak hadamu,study group ΰΆΰΆΰΆΰ· ΰ·ΰΆ―ΰΆΈΰ·,study group ΰΆΰΆΰΆΰ· ΰ·ΰΆ―ΰΆΈΰ·,True,0.0,0.0,1.0,1.0,1.0,0.021
|
| 16 |
+
15,viva ekta prepared wage na,viva ΰΆΰΆΰΆ§ prepared ΰ·ΰΆΰ· ΰΆ±ΰ·,viva ΰΆΰΆΰΆ§ prepared ΰ·ΰΆΰ· ΰΆ±ΰ·,True,0.0,0.0,1.0,1.0,1.0,0.002
|
| 17 |
+
16,mta project ek submit krnna one,ΰΆΈΰΆ§ project ΰΆΰΆ submit ΰΆΰΆ»ΰΆ±ΰ·ΰΆ± ΰΆΰΆ±ΰ·,ΰΆΈΰΆ§ project ΰΆΰΆ submit ΰΆΰΆ»ΰΆ±ΰ·ΰΆ± ΰΆΰΆ±ΰ·,True,0.0,0.0,1.0,1.0,1.0,0.002
|
| 18 |
+
17,hta parikshanaya thiyanawa,ΰ·ΰ·ΰΆ§ ΰΆ΄ΰΆ»ΰ·ΰΆΰ·ΰ·ΰΆ«ΰΆΊ ΰΆΰ·ΰΆΊΰ·ΰΆ±ΰ·ΰ·,ΰ·ΰ·ΰΆ§ ΰΆ΄ΰΆ»ΰ·ΰΆΰ·βΰ·ΰΆ«ΰΆΊ ΰΆΰ·ΰΆΊΰ·ΰΆ±ΰ·ΰ·,False,0.1,0.3333,0.0,0.6667,1.0,0.021
|
| 19 |
+
18,mama poth kiyawala iwara kala,ΰΆΈΰΆΈ ΰΆ΄ΰ·ΰΆ ΰΆΰ·ΰΆΊΰ·ΰΆ½ΰ· ΰΆΰ·ΰΆ» ΰΆΰ·
ΰ·,ΰΆΈΰΆΈ ΰΆ΄ΰ·ΰΆ ΰΆΰ·ΰΆΊΰ·ΰΆ½ΰ· ΰΆΰ·ΰΆ» ΰΆΰ·
ΰ·,True,0.0,0.0,1.0,1.0,1.0,0.002
|
| 20 |
+
19,guruwaraya nisa api kalin giya,ΰΆΰ·ΰΆ»ΰ·ΰ·ΰΆ»ΰΆΊΰ· ΰΆ±ΰ·ΰ·ΰ· ΰΆ
ΰΆ΄ΰ· ΰΆΰΆ½ΰ·ΰΆ±ΰ· ΰΆΰ·ΰΆΊΰ·,ΰΆΰ·ΰΆ»ΰ·ΰ·ΰΆ»ΰΆΊ ΰΆ±ΰ·ΰ·ΰ· ΰΆ
ΰΆ΄ΰ· ΰΆΰΆ½ΰ·ΰΆ±ΰ· ΰΆΰ·ΰΆΊΰ·,False,0.0357,0.2,0.6687,0.8,1.0,0.028
|
| 21 |
+
20,prashnaya honda wage penenawa,ΰΆ΄ΰ·βΰΆ»ΰ·ΰ·ΰΆ±ΰΆΊ ΰ·ΰ·ΰΆ³ ΰ·ΰΆΰ· ΰΆ΄ΰ·ΰΆ±ΰ·ΰ·,ΰΆ΄ΰ·βΰΆ»ΰ·ΰΆ±ΰΆΊ ΰ·ΰ·ΰΆ³ ΰ·ΰΆΰ· ΰΆ΄ΰ·ΰΆ±ΰ·ΰ·,False,0.0455,0.25,0.0,0.75,1.0,0.024
|
| 22 |
+
21,deploy nisa site down wuna,deploy ΰΆ±ΰ·ΰ·ΰ· site down ΰΆΰΆ±ΰ·,deploy ΰΆ±ΰ·ΰ·ΰ· site down ΰΆΰΆ±ΰ·,True,0.0,0.0,1.0,1.0,1.0,0.002
|
| 23 |
+
22,PR eka merge karanna one,PR ΰΆΰΆ merge ΰΆΰΆ»ΰΆ±ΰ·ΰΆ± ΰΆΰΆ±ΰ·,PR ΰΆΰΆ merge ΰΆΰΆ»ΰΆ±ΰ·ΰΆ± ΰΆΰΆ±ΰ·,True,0.0,0.0,1.0,1.0,1.0,0.025
|
| 24 |
+
23,backlog eka update kala,backlog ΰΆΰΆ update ΰΆΰ·
ΰ·,backlog ΰΆΰΆ update ΰΆΰ·
ΰ·,True,0.0,0.0,1.0,1.0,1.0,0.023
|
| 25 |
+
24,server down nisa work karanna ba,server down ΰΆ±ΰ·ΰ·ΰ· work ΰΆΰΆ»ΰΆ±ΰ·ΰΆ± ΰΆΆΰ·,server down ΰΆ±ΰ·ΰ·ΰ· work ΰΆΰΆ»ΰΆ±ΰ·ΰΆ± ΰΆΆΰ·,True,0.0,0.0,1.0,1.0,1.0,0.002
|
| 26 |
+
25,meeting eka tomorrow damu,meeting ΰΆΰΆ tomorrow ΰΆ―ΰ·ΰΆΈΰ·,meeting ΰΆΰΆ tomorrow ΰΆ―ΰ·ΰΆΈΰ·,True,0.0,0.0,1.0,1.0,1.0,0.02
|
| 27 |
+
26,feedback nisa redo karanna una,feedback ΰΆ±ΰ·ΰ·ΰ· redo ΰΆΰΆ»ΰΆ±ΰ·ΰΆ± ΰΆΰΆ±ΰ·,feedback ΰΆ±ΰ·ΰ·ΰ· redo ΰΆΰΆ»ΰΆ±ΰ·ΰΆ± ΰΆΰΆ±ΰ·,True,0.0,0.0,1.0,1.0,1.0,0.002
|
| 28 |
+
27,ape wada ada iwara wenawa,ΰΆ
ΰΆ΄ΰ· ΰ·ΰ·ΰΆ© ΰΆ
ΰΆ― ΰΆΰ·ΰΆ» ΰ·ΰ·ΰΆ±ΰ·ΰ·,ΰΆ
ΰΆ΄ΰ· ΰ·ΰ·ΰΆ© ΰΆ
ΰΆ― ΰΆΰ·ΰΆ» ΰ·ΰ·ΰΆ±ΰ·ΰ·,True,0.0,0.0,1.0,1.0,1.0,0.002
|
| 29 |
+
28,kalamanakaruwa awa passe api katha kala,ΰΆΰ·
ΰΆΈΰΆ±ΰ·ΰΆΰΆ»ΰ· ΰΆΰ·ΰ· ΰΆ΄ΰ·ΰ·ΰ·ΰ· ΰΆ
ΰΆ΄ΰ· ΰΆΰΆΰ· ΰΆΰ·
ΰ·,ΰΆΰΆ½ΰΆΈΰΆ±ΰΆΰΆ»ΰ·ΰ· ΰΆΰ·ΰ· ΰΆ΄ΰ·ΰ·ΰ·ΰ· ΰΆ
ΰΆ΄ΰ· ΰΆΰΆΰ· ΰΆΰ·
ΰ·,False,0.1,0.1667,0.7598,0.8333,1.0,0.019
|
| 30 |
+
29,me wada honda wage penenawa,ΰΆΈΰ· ΰ·ΰ·ΰΆ© ΰ·ΰ·ΰΆ³ ΰ·ΰΆΰ· ΰΆ΄ΰ·ΰΆ±ΰ·ΰ·,ΰΆΈΰ· ΰ·ΰ·ΰΆ© ΰ·ΰ·ΰΆ³ ΰ·ΰΆΰ· ΰΆ΄ΰ·ΰΆ±ΰ·ΰ·,True,0.0,0.0,1.0,1.0,1.0,0.002
|
| 31 |
+
30,wada tika ada iwara karamu,ΰ·ΰ·ΰΆ© ΰΆ§ΰ·ΰΆ ΰΆ
ΰΆ― ΰΆΰ·ΰΆ» ΰΆΰΆ»ΰΆΈΰ·,ΰ·ΰ·ΰΆ© ΰΆ§ΰ·ΰΆ ΰΆ
ΰΆ― ΰΆΰ·ΰΆ» ΰΆΰΆ»ΰΆΈΰ·,True,0.0,0.0,1.0,1.0,1.0,0.019
|
| 32 |
+
31,story eke poll ekak damma,story ΰΆΰΆΰ· poll ΰΆΰΆΰΆΰ· ΰΆ―ΰ·ΰΆΈΰ·ΰΆΈΰ·,story ΰΆΰΆΰ· poll ΰΆΰΆΰΆΰ· ΰΆ―ΰ·ΰΆΈΰ·ΰΆΈΰ·,True,0.0,0.0,1.0,1.0,1.0,0.025
|
| 33 |
+
32,oyata DM ekak yewwa,ΰΆΰΆΊΰ·ΰΆ§ DM ΰΆΰΆΰΆΰ· ΰΆΊΰ·ΰ·ΰ·ΰ·ΰ·,ΰΆΰΆΊΰ·ΰΆ§ DM ΰΆΰΆΰΆΰ· ΰΆΊΰ·ΰ·ΰ·ΰ·ΰ·,True,0.0,0.0,1.0,1.0,1.0,0.028
|
| 34 |
+
33,comment eka delete kala nisa mama danne na,comment ΰΆΰΆ delete ΰΆΰ·
ΰΆ±ΰ·ΰ·ΰ· ΰΆΈΰΆΈ ΰΆ―ΰΆ±ΰ·ΰΆ±ΰ· ΰΆ±ΰ·,comment ΰΆΰΆ delete ΰΆΰ·
ΰ· ΰΆ±ΰ·ΰ·ΰ· ΰΆΈΰΆΈ ΰΆ―ΰΆ±ΰ·ΰΆ±ΰ· ΰΆ±ΰ·,False,0.027,0.125,0.5,0.875,1.0,0.029
|
| 35 |
+
34,selfie ekak gannako,selfie ΰΆΰΆΰΆΰ· ΰΆΰΆ±ΰ·ΰΆ±ΰΆΰ·,selfie ΰΆΰΆΰΆΰ· ΰΆΰΆ±ΰ·ΰΆ±ΰΆΰ·,True,0.0,0.0,1.0,1.0,1.0,0.025
|
| 36 |
+
35,post eka private nisa share karanna epa,post ΰΆΰΆ private ΰΆ±ΰ·ΰ·ΰ· share ΰΆΰΆ»ΰΆ±ΰ·ΰΆ± ΰΆΰΆ΄ΰ·,post ΰΆΰΆ private ΰΆ±ΰ·ΰ·ΰ· share ΰΆΰΆ»ΰΆ±ΰ·ΰΆ± ΰΆΰΆ΄ΰ·,True,0.0,0.0,1.0,1.0,1.0,0.028
|
| 37 |
+
36,oyta message krnna on,ΰΆΰΆΊΰ·ΰΆ§ message ΰΆΰΆ»ΰΆ±ΰ·ΰΆ± ΰΆΰΆ±ΰ·,ΰΆΰΆΊΰ·ΰΆ§ message ΰΆΰΆ»ΰΆ±ΰ·ΰΆ± ΰΆΰΆ±ΰ·,True,0.0,0.0,1.0,1.0,1.0,0.002
|
| 38 |
+
37,oya passe katha karamu,ΰΆΰΆΊΰ· ΰΆ΄ΰ·ΰ·ΰ·ΰ· ΰΆΰΆΰ· ΰΆΰΆ»ΰΆΈΰ·,ΰΆΰΆΊΰ· ΰΆ΄ΰ·ΰ·ΰ·ΰ· ΰΆΰΆΰ· ΰΆΰΆ»ΰΆΈΰ·,True,0.0,0.0,1.0,1.0,1.0,0.002
|
| 39 |
+
38,eya laga pinthurak thiyanawa,ΰΆΰΆΊΰ· ΰ·
ΰΆ ΰΆ΄ΰ·ΰΆ±ΰ·ΰΆΰ·ΰΆ»ΰΆΰ· ΰΆΰ·ΰΆΊΰ·ΰΆ±ΰ·ΰ·,ΰΆΰΆΊΰ· ΰ·
ΰΆ ΰΆ΄ΰ·ΰΆ±ΰ·ΰΆΰ·ΰΆ»ΰΆΰ· ΰΆΰ·ΰΆΊΰ·ΰΆ±ΰ·ΰ·,False,0.0417,0.25,0.0,0.75,1.0,0.019
|
| 40 |
+
39,oya awa wage mata hithenawa,ΰΆΰΆΊΰ· ΰΆΰ·ΰ· ΰ·ΰΆΰ· ΰΆΈΰΆ§ ΰ·ΰ·ΰΆΰ·ΰΆ±ΰ·ΰ·,ΰΆΰΆΊΰ· ΰΆΰ·ΰ· ΰ·ΰΆΰ· ΰΆΈΰΆ§ ΰ·ΰ·ΰΆΰ·ΰΆ±ΰ·ΰ·,True,0.0,0.0,1.0,1.0,1.0,0.002
|
| 41 |
+
40,api passe hambawemu,ΰΆ
ΰΆ΄ΰ· ΰΆ΄ΰ·ΰ·ΰ·ΰ· ΰ·ΰΆΈΰ·ΰΆΆΰ·ΰ·ΰΆΈΰ·,ΰΆ
ΰΆ΄ΰ· ΰΆ΄ΰ·ΰ·ΰ·ΰ· ΰ·ΰΆΈΰ·ΰΆΆΰ·ΰ·ΰΆΈΰ·,True,0.0,0.0,1.0,1.0,1.0,0.018
|
fine_tuning/attempt_1_wikipedia/experiment_documentation.txt
ADDED
|
@@ -0,0 +1,624 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
================================================================================
|
| 2 |
+
SinCode β MLM Fine-Tuning Experiment Documentation (Thesis Evidence)
|
| 3 |
+
Date: 26 March 2026
|
| 4 |
+
Author: Kalana Chandrasekara
|
| 5 |
+
================================================================================
|
| 6 |
+
|
| 7 |
+
1. MOTIVATION
|
| 8 |
+
================================================================================
|
| 9 |
+
Problem: XLM-RoBERTa-base (FacebookAI/xlm-roberta-base) was trained on 100
|
| 10 |
+
languages but Sinhala is UNDER-REPRESENTED in its training corpus. This causes
|
| 11 |
+
incorrect contextual ranking of Sinhala candidates.
|
| 12 |
+
|
| 13 |
+
Evidence of the problem (probed on 26 March 2026):
|
| 14 |
+
Input: "api kalaya ithuru krgnna oni"
|
| 15 |
+
Expected: "ΰΆ
ΰΆ΄ΰ· ΰΆΰ·ΰΆ½ΰΆΊ ΰΆΰΆΰ·ΰΆ»ΰ· ΰΆΰΆ»ΰΆΰΆ±ΰ·ΰΆ± ΰΆΰΆ±ΰ·" (time)
|
| 16 |
+
Actual: "ΰΆ
ΰΆ΄ΰ· ΰΆΰΆ½ΰΆΊ ..." (pot β WRONG word)
|
| 17 |
+
|
| 18 |
+
Raw MLM log-probabilities at position 2 (masked):
|
| 19 |
+
ΰΆΰΆ½ΰΆΊ (pot) = -5.2182 β model's top pick (WRONG)
|
| 20 |
+
ΰΆΰ·ΰΆ½ΰΆΊ (time) = -6.3120 β correct answer ranked lower
|
| 21 |
+
|
| 22 |
+
Model probe (top-5 predictions for masked position):
|
| 23 |
+
1. ΰ·ΰΆ½ΰ·ΰΆ½ΰ· (money)
|
| 24 |
+
2. ΰΆΊΰΆΈΰΆΰ· (something)
|
| 25 |
+
3. ΰΆΆΰΆ©ΰ· (goods)
|
| 26 |
+
4. ΰΆΰ·ΰ·ΰΆ» (food)
|
| 27 |
+
5. ΰΆ»ΰ·ΰΆΰ·ΰΆΊΰ· (employment)
|
| 28 |
+
β Neither "ΰΆΰΆ½ΰΆΊ" nor "ΰΆΰ·ΰΆ½ΰΆΊ" appears in top-5 β model lacks Sinhala knowledge.
|
| 29 |
+
|
| 30 |
+
Conclusion: The base model's Sinhala vocabulary understanding is insufficient
|
| 31 |
+
for accurate contextual disambiguation. Continued MLM pre-training on a Sinhala
|
| 32 |
+
corpus is needed.
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
2. EXPERIMENTAL SETUP
|
| 36 |
+
================================================================================
|
| 37 |
+
|
| 38 |
+
2.1 Hardware
|
| 39 |
+
GPU: NVIDIA GeForce RTX 5060 Ti (16 GB VRAM)
|
| 40 |
+
CPU: AMD Ryzen 7 5800X (8-core / 16-thread)
|
| 41 |
+
RAM: [System RAM]
|
| 42 |
+
Driver: NVIDIA 595.97
|
| 43 |
+
CUDA: 13.2 (compute capability 12.0 β Blackwell)
|
| 44 |
+
|
| 45 |
+
2.2 Software
|
| 46 |
+
Python: 3.14
|
| 47 |
+
PyTorch: 2.11.0+cu128
|
| 48 |
+
Transformers: (latest, via pip)
|
| 49 |
+
Datasets: 4.8.4
|
| 50 |
+
Accelerate: 1.13.0
|
| 51 |
+
OS: Windows (UTF-8 mode via -X utf8)
|
| 52 |
+
|
| 53 |
+
2.3 Base Model
|
| 54 |
+
Name: FacebookAI/xlm-roberta-base
|
| 55 |
+
Parameters: ~270 million
|
| 56 |
+
Type: Masked Language Model (MLM)
|
| 57 |
+
Tokenizer: SentencePiece (250,002 vocab)
|
| 58 |
+
Pre-training: 100 languages, 2.5 TB CommonCrawl data
|
| 59 |
+
|
| 60 |
+
|
| 61 |
+
3. TRAINING CONFIGURATION
|
| 62 |
+
================================================================================
|
| 63 |
+
|
| 64 |
+
3.1 Dataset
|
| 65 |
+
Source: Sinhala Wikipedia (wikimedia/wikipedia, config: 20231101.si)
|
| 66 |
+
Raw Articles: 23,065
|
| 67 |
+
After Filtering: 21,267 (removed articles with < 20 tokens)
|
| 68 |
+
Train Split: 20,203 samples (95%)
|
| 69 |
+
Eval Split: 1,064 samples (5%)
|
| 70 |
+
Filter: Removed sequences with fewer than 20 tokens
|
| 71 |
+
|
| 72 |
+
3.2 Tokenization
|
| 73 |
+
Max Sequence Length: 256 tokens
|
| 74 |
+
Truncation: Yes
|
| 75 |
+
Padding: None (dynamic collation)
|
| 76 |
+
Workers: 4 parallel processes
|
| 77 |
+
|
| 78 |
+
3.3 Training Hyperparameters
|
| 79 |
+
βββββββββββββββββββββββββββββ¬ββββββββββββββββββ
|
| 80 |
+
β Parameter β Value β
|
| 81 |
+
βββββββββββββββββββββββββββββΌββββββββββββββββββ€
|
| 82 |
+
β Epochs β 3 β
|
| 83 |
+
β Per-device batch size β 8 β
|
| 84 |
+
β Gradient accumulation β 4 β
|
| 85 |
+
β Effective batch size β 32 β
|
| 86 |
+
β Learning rate β 2e-5 β
|
| 87 |
+
β LR scheduler β Cosine β
|
| 88 |
+
β Warmup steps β ~119 β
|
| 89 |
+
β Weight decay β 0.01 β
|
| 90 |
+
β MLM probability β 0.15 β
|
| 91 |
+
β FP16 (mixed precision) β Yes β
|
| 92 |
+
β Eval strategy β Every ~190 stepsβ
|
| 93 |
+
β Save strategy β Every ~190 stepsβ
|
| 94 |
+
β Max saved checkpoints β 2 β
|
| 95 |
+
β Best model selection β eval_loss (min) β
|
| 96 |
+
β Seed β 42 β
|
| 97 |
+
β Total training steps β 1,896 β
|
| 98 |
+
βββββββββββββββββββββββββββββ΄ββββββββββββββββββ
|
| 99 |
+
|
| 100 |
+
3.4 Method
|
| 101 |
+
Technique: Continued MLM Pre-Training (Domain-Adaptive Pre-Training / DAPT)
|
| 102 |
+
Objective: Same masked language modeling objective as original XLM-RoBERTa
|
| 103 |
+
β 15% of tokens randomly masked per sample
|
| 104 |
+
β Model predicts original token at each masked position
|
| 105 |
+
β Dynamic masking: different tokens masked each epoch
|
| 106 |
+
|
| 107 |
+
Rationale: This does NOT change the model architecture or task head. It
|
| 108 |
+
simply exposes XLM-RoBERTa to more Sinhala text so it builds better internal
|
| 109 |
+
representations of Sinhala vocabulary and grammar.
|
| 110 |
+
|
| 111 |
+
Reference: Gururangan et al. (2020) "Don't Stop Pretraining: Adapt
|
| 112 |
+
Pretrained Language Models to Domains and Tasks" (ACL 2020)
|
| 113 |
+
|
| 114 |
+
|
| 115 |
+
4. TRAINING RESULTS
|
| 116 |
+
================================================================================
|
| 117 |
+
|
| 118 |
+
4.1 Training Metrics
|
| 119 |
+
Total Training Time: ~26 minutes (1,896 steps)
|
| 120 |
+
Training Speed: ~1.38β1.44 iterations/second
|
| 121 |
+
GPU Utilization: 100% throughout, ~15,942/16,311 MiB VRAM, 68Β°C
|
| 122 |
+
Final Model Size: 1,061.6 MB (model.safetensors)
|
| 123 |
+
Total FLOPs: 2.586 Γ 10^16
|
| 124 |
+
|
| 125 |
+
Training Loss Progression:
|
| 126 |
+
ββββββββββββ¬βββββββββ¬βββββββββββββ¬βββββββββββββ
|
| 127 |
+
β Step β Epoch β Train Loss β Eval Loss β
|
| 128 |
+
ββββββββββββΌβββββββββΌβββββββββββββΌβββββββββββββ€
|
| 129 |
+
β 50 β 0.08 β 7.429 β β
|
| 130 |
+
β 100 β 0.16 β 7.296 β β
|
| 131 |
+
β 200 β 0.32 β 7.233 β β
|
| 132 |
+
β 300 β 0.48 β 6.953 β β
|
| 133 |
+
β 500 β 0.79 β 6.930 β 1.5840 β
|
| 134 |
+
β 650 β 1.03 β 6.753 β β
|
| 135 |
+
β 800 β 1.27 β 6.705 β β
|
| 136 |
+
β 1000 β 1.58 β 6.765 β 1.5576 β
β
|
| 137 |
+
β 1200 β 1.90 β 6.635 β β
|
| 138 |
+
β 1300 β 2.06 β 6.489 β β
|
| 139 |
+
β 1500 β 2.37 β 6.631 β 1.5642 β
|
| 140 |
+
β 1700 β 2.69 β 6.455 β β
|
| 141 |
+
β 1750 β 2.77 β 6.438 β β
|
| 142 |
+
β 1850 β 2.93 β 6.552 β β
|
| 143 |
+
ββββββββββββ΄βββββββββ΄βββββββββββββ΄βββββββββββββ
|
| 144 |
+
β
= Best eval loss (checkpoint-1000)
|
| 145 |
+
|
| 146 |
+
Best Eval Loss: 1.5576 (at step 1000, epoch 1.58)
|
| 147 |
+
Final Train Loss: 6.552 (at step 1850)
|
| 148 |
+
Loss Reduction: 7.429 β 6.438 = -13.3% (training), 1.584 β 1.558 = -1.6% (eval)
|
| 149 |
+
|
| 150 |
+
Note: Best model checkpoint was at step 1000. Eval loss slightly increased
|
| 151 |
+
after epoch 2, suggesting mild overfitting on the small Wikipedia corpus.
|
| 152 |
+
The final saved model is from step 1896 (end of training).
|
| 153 |
+
|
| 154 |
+
4.2 Smoke Test Results (100 samples, 1 epoch β verification run)
|
| 155 |
+
Training Steps: 3
|
| 156 |
+
Training Time: 21.51 seconds
|
| 157 |
+
Training Loss: 7.274
|
| 158 |
+
Eval Loss: 1.698
|
| 159 |
+
Eval Perplexity: 5.46
|
| 160 |
+
|
| 161 |
+
|
| 162 |
+
5. BASELINE METRICS (BEFORE FINE-TUNING)
|
| 163 |
+
================================================================================
|
| 164 |
+
Evaluated on 40-sentence gold-standard dataset (seed_pack_40.csv)
|
| 165 |
+
|
| 166 |
+
5.1 Greedy Decoder (with dynamic context)
|
| 167 |
+
βββββββββββββββββββββββββββ¬βββββββββββββββββββββββββββββββββββββββββββββββ
|
| 168 |
+
β Metric β Value β
|
| 169 |
+
βββββββββββββββββββββββββββΌβββββββββββββββββββββββββββββββββββββββββββββββ€
|
| 170 |
+
β Exact Match β 32/40 (80%) raw β 35/40 (87.5%) corrected * β
|
| 171 |
+
β Character Error Rate β 0.0168 β
|
| 172 |
+
β Word Error Rate β 0.0506 β
|
| 173 |
+
β BLEU Score β 0.8482 β
|
| 174 |
+
β Token Accuracy β 94.94% β
|
| 175 |
+
β Code-Mix Preservation β 100% β
|
| 176 |
+
βββββββββββββββββββββββββββ΄βββββββββββββββββββββββββββββββββββββββββββββββ
|
| 177 |
+
* 3 sentences (#3, #5, #17) were marked as failures due to incorrect
|
| 178 |
+
reference labels in the original dataset. After review, our output was
|
| 179 |
+
correct for all three. Corrected baseline is 35/40 (87.5%).
|
| 180 |
+
|
| 181 |
+
5.2 Beam Search Decoder (with fixed context, beam width = 5)
|
| 182 |
+
βββββββββββββββββββββββββββ¬ββββββββββββββ
|
| 183 |
+
β Metric β Value β
|
| 184 |
+
βββββββββββββββββββββββββββΌββββββββββββββ€
|
| 185 |
+
β Exact Match β 31/40 (78%) β
|
| 186 |
+
β Character Error Rate β 0.0206 β
|
| 187 |
+
β Word Error Rate β 0.0590 β
|
| 188 |
+
β BLEU Score β 0.8232 β
|
| 189 |
+
β Token Accuracy β 94.10% β
|
| 190 |
+
β Code-Mix Preservation β 100% β
|
| 191 |
+
βββββββββββββββββββββββββββ΄ββββββββββββββ
|
| 192 |
+
|
| 193 |
+
5.3 Key Finding: Greedy > Beam on ALL metrics
|
| 194 |
+
Reason: Greedy uses dynamic context (actual selected Sinhala outputs as left
|
| 195 |
+
context), while beam search uses fixed rule-engine outputs as context. The
|
| 196 |
+
MLM performs better when it sees real Sinhala discourse rather than
|
| 197 |
+
potentially incorrect rule-engine guesses.
|
| 198 |
+
|
| 199 |
+
|
| 200 |
+
6. POST FINE-TUNING METRICS (AFTER FINE-TUNING)
|
| 201 |
+
================================================================================
|
| 202 |
+
(To be filled after training completes and re-evaluation is run)
|
| 203 |
+
|
| 204 |
+
6.1 Greedy Decoder (fine-tuned model)
|
| 205 |
+
βββββββββββββββββββββββββββ¬βββββββββββββββββββββββββββββββββββββββββββββββ
|
| 206 |
+
β Metric β Value β
|
| 207 |
+
βββββββββββββββββββββββββββΌβββββββββββββββββββββββββββββββββββββββββββββββ€
|
| 208 |
+
β Exact Match β 32/40 (80%) raw β 35/40 (87.5%) corrected * β
|
| 209 |
+
β Character Error Rate β 0.0168 β
|
| 210 |
+
β Word Error Rate β 0.0506 β
|
| 211 |
+
β BLEU Score β 0.8482 β
|
| 212 |
+
β Token Accuracy β 94.94% β
|
| 213 |
+
β Code-Mix Preservation β 100% β
|
| 214 |
+
βββββββββββββββββββββββββββ΄βββββββββββββββββββββββββββββββββββββββββββββββ
|
| 215 |
+
* Same 3 dataset labeling corrections as Section 5.1 applied.
|
| 216 |
+
|
| 217 |
+
6.2 Target Test Case Validation
|
| 218 |
+
Input: "api kalaya ithuru krgnna oni"
|
| 219 |
+
Expected: "ΰΆ
ΰΆ΄ΰ· ΰΆΰ·ΰΆ½ΰΆΊ ΰΆΰΆΰ·ΰΆ»ΰ· ΰΆΰΆ»ΰΆΰΆ±ΰ·ΰΆ± ΰΆΰΆ±ΰ·"
|
| 220 |
+
Before FT: ΰΆΰΆ½ΰΆΊ β pot (WRONG)
|
| 221 |
+
After FT: ΰΆΰΆ½ΰΆΊ β pot (STILL WRONG β no change)
|
| 222 |
+
|
| 223 |
+
6.3 Improvement Summary (corrected dataset)
|
| 224 |
+
βββββββββββββββββββββββββββ¬βββββββββββ¬βββββββββββ¬βββββββββββ
|
| 225 |
+
β Metric β Before β After β Delta β
|
| 226 |
+
βββββββββββββββββββββββββββΌβββββββββββΌβββββββββββΌβββββββββββ€
|
| 227 |
+
β Exact Match β 35/40 β 35/40 β 0 β
|
| 228 |
+
β CER β 0.0168 β 0.0168 β 0 β
|
| 229 |
+
β WER β 0.0506 β 0.0506 β 0 β
|
| 230 |
+
β BLEU β 0.8482 β 0.8482 β 0 β
|
| 231 |
+
β Token Accuracy β 94.94% β 94.94% β 0 β
|
| 232 |
+
βββββββββββββββββββββββββββ΄βββββββββββ΄βββββββββββ΄βββββββββββ
|
| 233 |
+
Note: Both before/after are reported on the corrected dataset.
|
| 234 |
+
Fine-tuning produced zero downstream improvement regardless of
|
| 235 |
+
whether the raw (32/40) or corrected (35/40) dataset is used.
|
| 236 |
+
|
| 237 |
+
6.4 Analysis: Why Fine-Tuning Did Not Improve Metrics
|
| 238 |
+
1. INSUFFICIENT CORPUS SIZE: 23,065 Sinhala Wikipedia articles is very small
|
| 239 |
+
relative to the model's 270M parameters. XLM-RoBERTa was pre-trained on
|
| 240 |
+
2.5 TB of CommonCrawl data; 23K articles represent a tiny fraction.
|
| 241 |
+
2. EVAL LOSS PLATEAU: Eval loss improved only 1.6% (1.584 β 1.558), which is
|
| 242 |
+
too small a shift to change actual token-level ranking decisions.
|
| 243 |
+
3. MODEL CAPACITY: The base model's existing Sinhala representations are
|
| 244 |
+
deeply embedded across ~270M parameters. Shifting them meaningfully
|
| 245 |
+
requires orders of magnitude more Sinhala text.
|
| 246 |
+
4. TASK MISMATCH: MLM pre-training optimizes general masked prediction, not
|
| 247 |
+
specifically transliteration disambiguation. Task-specific fine-tuning
|
| 248 |
+
(e.g., training on transliteration pairs) would be more targeted.
|
| 249 |
+
|
| 250 |
+
CONCLUSION: Continued MLM pre-training on Sinhala Wikipedia alone is
|
| 251 |
+
INSUFFICIENT to improve SinCode's transliteration quality. The hybrid
|
| 252 |
+
architecture (dictionary + rules + MLM scoring) already compensates for
|
| 253 |
+
the base model's Sinhala limitations effectively at 80% exact match.
|
| 254 |
+
|
| 255 |
+
6.5 Future Work Recommendations
|
| 256 |
+
- Use larger Sinhala corpora (e.g., OSCAR Sinhala, Common Crawl si domain)
|
| 257 |
+
- Task-specific fine-tuning on (Singlish, Sinhala) translation pairs
|
| 258 |
+
- Explore smaller, Sinhala-specific models (e.g., SinhalaGPT) as MLM scorer
|
| 259 |
+
- Expand dictionary coverage for rare words instead of relying on MLM
|
| 260 |
+
|
| 261 |
+
|
| 262 |
+
7. SCORING FORMULA DOCUMENTATION
|
| 263 |
+
================================================================================
|
| 264 |
+
|
| 265 |
+
7.1 Combined Score
|
| 266 |
+
Score_combined = Ξ± Β· s_MLM + Ξ² Β· s_Fidelity + Ξ³ Β· s_Rank
|
| 267 |
+
|
| 268 |
+
Where:
|
| 269 |
+
Ξ± = 0.55 (Contextual language model weight)
|
| 270 |
+
Ξ² = 0.45 (Source-aware transliteration fidelity)
|
| 271 |
+
Ξ³ = 0.00 (Rank prior β disabled, dictionary is unordered)
|
| 272 |
+
|
| 273 |
+
7.2 MLM Score Normalization (Per Position)
|
| 274 |
+
For each word position, raw MLM log-probabilities are normalized using
|
| 275 |
+
softmax (via numerically stable log-sum-exp):
|
| 276 |
+
exp_i = exp(s_i - max(s))
|
| 277 |
+
s_MLM_i = exp_i / Ξ£ exp_j
|
| 278 |
+
This converts raw log-probs into a proper probability distribution [0,1]
|
| 279 |
+
that sums to 1, preserving relative model confidence between candidates.
|
| 280 |
+
|
| 281 |
+
Note: An earlier version used min-max normalization:
|
| 282 |
+
s_MLM_norm = (s - min(s)) / (max(s) - min(s))
|
| 283 |
+
This was replaced with softmax (28 March 2026) because min-max destroyed
|
| 284 |
+
relative confidence β small raw differences were amplified to 0.0 vs 1.0,
|
| 285 |
+
effectively discarding the model's nuanced scoring signal. The softmax fix
|
| 286 |
+
directly improved exact match by +1 sentence (Section 12).
|
| 287 |
+
|
| 288 |
+
7.3 Fidelity Score (5-Tier System)
|
| 289 |
+
Tier 1: English word matching input β 0.0 (preserve as-is)
|
| 290 |
+
Tier 2: Dictionary + rule match β +2.0 (strong bonus)
|
| 291 |
+
Tier 2b: Dictionary, different from rule β 1.0 - edit_dist_ratio Γ 2.0
|
| 292 |
+
Tier 3: Rule-only (no dict entry) β penalized by virama density
|
| 293 |
+
Tier 4: English word NOT matching input β -0.5
|
| 294 |
+
Tier 5: Non-dictionary Sinhala β -edit_dist_ratio Γ 10.0
|
| 295 |
+
|
| 296 |
+
|
| 297 |
+
8. ARCHITECTURE SUMMARY FOR THESIS DIAGRAM
|
| 298 |
+
================================================================================
|
| 299 |
+
|
| 300 |
+
Processing Pipeline (per sentence):
|
| 301 |
+
|
| 302 |
+
Singlish Input
|
| 303 |
+
β
|
| 304 |
+
βΌ
|
| 305 |
+
βββββββββββββββββββ
|
| 306 |
+
β Tokenize & Splitβ β Whitespace + punctuation extraction
|
| 307 |
+
βββββββββ¬ββββββββββ
|
| 308 |
+
β
|
| 309 |
+
βΌ (for each word position)
|
| 310 |
+
βββββββββββββββββββ ββββββββββββββββββββββββ
|
| 311 |
+
β Common-Word ββββββΊβ Direct Override (84) ββββΊ Output
|
| 312 |
+
β Table Lookup β ββββββββββββββββββββββββ
|
| 313 |
+
βββββββββ¬ββββββββββ
|
| 314 |
+
β (miss)
|
| 315 |
+
βΌ
|
| 316 |
+
βββββββββββββββββββ ββββββββββββββββββββββββ
|
| 317 |
+
β English Word ββββββΊβ Preserve as-is ββββΊ Output
|
| 318 |
+
β Detection (20k) β ββββββββββββββββββββββββ
|
| 319 |
+
βββββββββ¬ββββββββββ
|
| 320 |
+
β (not English)
|
| 321 |
+
βΌ
|
| 322 |
+
βββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 323 |
+
β Candidate Generation β
|
| 324 |
+
β 1. Dictionary lookup (5.9M-word Sinhala dict) β
|
| 325 |
+
β 2. Phonetic rule engine (49 consonants + 29 β
|
| 326 |
+
β vowels + special chars) β
|
| 327 |
+
β 3. Sort by Levenshtein distance β
|
| 328 |
+
β 4. Limit to top-8 candidates β
|
| 329 |
+
βββββββββ¬ββββββββββββββββββββββββββββββββββββββββββ
|
| 330 |
+
β
|
| 331 |
+
βΌ
|
| 332 |
+
βββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 333 |
+
β MLM Contextual Scoring (XLM-RoBERTa) β
|
| 334 |
+
β β’ Build context: [left_real] <mask> [right_rule]β
|
| 335 |
+
β β’ Score each candidate at mask position β
|
| 336 |
+
β β’ Multi-subword: average over N mask positions β
|
| 337 |
+
β β’ Softmax normalize (log-sum-exp trick) β
|
| 338 |
+
βββββββββ¬ββββββββββββββββββββββββββββββββββββββββββ
|
| 339 |
+
β
|
| 340 |
+
βΌ
|
| 341 |
+
βββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 342 |
+
β Combined Scoring β
|
| 343 |
+
β Score = 0.55 Γ MLM_norm + 0.45 Γ Fidelity β
|
| 344 |
+
β Select argmax candidate β
|
| 345 |
+
βββββββββ¬ββββββββββββββββββββββββββββββββββββββββββ
|
| 346 |
+
β
|
| 347 |
+
βΌ
|
| 348 |
+
βββββββββββββββββββ
|
| 349 |
+
β Update Context β β Dynamic: selected word becomes left context
|
| 350 |
+
βββββββββ¬ββββββββββ
|
| 351 |
+
β
|
| 352 |
+
βΌ (next word)
|
| 353 |
+
...
|
| 354 |
+
β
|
| 355 |
+
βΌ
|
| 356 |
+
Sinhala Unicode Output
|
| 357 |
+
|
| 358 |
+
|
| 359 |
+
9. THREE-STAGE EVOLUTION (Algorithmic Complexity)
|
| 360 |
+
================================================================================
|
| 361 |
+
|
| 362 |
+
Stage 1 β Brute Force Decoder
|
| 363 |
+
Complexity: O(K^N) where K=candidates, N=words
|
| 364 |
+
Problem: Combinatorial explosion β impractical for N > 3
|
| 365 |
+
|
| 366 |
+
Stage 2 β Beam Search Decoder
|
| 367 |
+
Complexity: O(N Γ K Γ B) where B=beam_width
|
| 368 |
+
Problem: Fixed context (rule-engine outputs) limits MLM effectiveness
|
| 369 |
+
|
| 370 |
+
Stage 3 β Greedy Decoder with Dynamic Context (CURRENT)
|
| 371 |
+
Complexity: O(N Γ K)
|
| 372 |
+
Advantage: Fastest AND most accurate β MLM sees real Sinhala discourse
|
| 373 |
+
Result: Greedy wins every metric vs beam search
|
| 374 |
+
|
| 375 |
+
|
| 376 |
+
10. EVALUATION FRAMEWORK DETAILS
|
| 377 |
+
================================================================================
|
| 378 |
+
|
| 379 |
+
10.1 Metrics Computed
|
| 380 |
+
1. Exact Match (EM) β Binary sentence-level correctness
|
| 381 |
+
2. Character Error Rate β Levenshtein distance / reference length
|
| 382 |
+
3. Word Error Rate β Token-level Levenshtein / reference token count
|
| 383 |
+
4. BLEU Score β Adaptive n-gram (min(4, sentence_length))
|
| 384 |
+
5. Token Accuracy β Position-wise token match ratio
|
| 385 |
+
6. Code-Mix Preservation β English tokens in reference preserved in output
|
| 386 |
+
|
| 387 |
+
10.2 Dataset
|
| 388 |
+
Size: 40 gold-standard sentences (seed_pack_40.csv)
|
| 389 |
+
Split: Train/Test annotated
|
| 390 |
+
Tags: has_code_mix, has_ambiguity, domain, notes
|
| 391 |
+
Domains: general, casual, formal
|
| 392 |
+
|
| 393 |
+
10.3 Evaluation Methodology
|
| 394 |
+
- Each sentence decoded independently
|
| 395 |
+
- Per-sentence timing recorded
|
| 396 |
+
- Predictions saved to CSV for inspection
|
| 397 |
+
- Both greedy and beam modes evaluated on same dataset
|
| 398 |
+
|
| 399 |
+
|
| 400 |
+
11. EXPERIMENT 2 β INFORMAL CORPUS FINE-TUNING (27 MARCH 2026)
|
| 401 |
+
================================================================================
|
| 402 |
+
|
| 403 |
+
11.1 Motivation for Experiment 2
|
| 404 |
+
The Wikipedia fine-tuning experiment (Experiment 1) produced NO downstream
|
| 405 |
+
improvement. Two root causes were identified:
|
| 406 |
+
1. DATA OVERLAP: XLM-RoBERTa was already pre-trained on CommonCrawl / web
|
| 407 |
+
text that overlaps strongly with Wikipedia-style content.
|
| 408 |
+
2. DOMAIN MISMATCH: Wikipedia is formal written Sinhala, whereas SinCode's
|
| 409 |
+
target use case is casual / conversational Sinhala derived from Singlish.
|
| 410 |
+
|
| 411 |
+
Therefore, a second MLM fine-tuning experiment was run on a larger and more
|
| 412 |
+
informal Sinhala corpus.
|
| 413 |
+
|
| 414 |
+
11.2 Dataset and Configuration
|
| 415 |
+
Source: 9wimu9/sinhala_dataset_59m (HuggingFace)
|
| 416 |
+
Corpus Type: Mixed-register Sinhala (blogs, dialogue, casual text, news)
|
| 417 |
+
Streamed Rows: 500,000
|
| 418 |
+
Collected Rows: 499,801 (after removing empty rows)
|
| 419 |
+
After Token Filter: 271,000 (kept sequences with >= 20 tokens)
|
| 420 |
+
Train Split: 257,450 samples (95%)
|
| 421 |
+
Eval Split: 13,550 samples (5%)
|
| 422 |
+
Epochs: 1
|
| 423 |
+
Batch Size: 8
|
| 424 |
+
Gradient Accum: 4
|
| 425 |
+
Effective Batch: 32
|
| 426 |
+
Learning Rate: 2e-5 (cosine decay)
|
| 427 |
+
Max Seq Length: 256
|
| 428 |
+
Output Dir: xlm-roberta-sinhala-v2/
|
| 429 |
+
|
| 430 |
+
11.3 Training Results
|
| 431 |
+
Total Training Steps: 8,046
|
| 432 |
+
Total Runtime: 5,417 s (~90.3 minutes)
|
| 433 |
+
Train Steps / Second: 1.485
|
| 434 |
+
Final Train Loss: 8.28
|
| 435 |
+
Best Eval Loss: 2.0621 (checkpoint-8040)
|
| 436 |
+
Final Eval Loss: 2.0621
|
| 437 |
+
Final Perplexity: 7.87
|
| 438 |
+
Best Checkpoint: checkpoint-8040
|
| 439 |
+
Final Model Saved: xlm-roberta-sinhala-v2/final/
|
| 440 |
+
Final Model Size: 1,113,205,064 bytes (~1061.6 MB)
|
| 441 |
+
|
| 442 |
+
Loss Trend Summary:
|
| 443 |
+
Start train loss: 9.556
|
| 444 |
+
End train loss: 8.776 (last logged step)
|
| 445 |
+
Relative drop: ~8.2%
|
| 446 |
+
|
| 447 |
+
Evidence artifact:
|
| 448 |
+
Loss chart saved as misc/training_loss_v2.png
|
| 449 |
+
|
| 450 |
+
11.4 Downstream Evaluation on SinCode (40-Sentence Gold Set)
|
| 451 |
+
Greedy Decoder:
|
| 452 |
+
Exact Match: 32/40 (80%) raw β 35/40 (87.5%) corrected *
|
| 453 |
+
Character Error Rate: 0.0168
|
| 454 |
+
Word Error Rate: 0.0506
|
| 455 |
+
BLEU: 0.8482
|
| 456 |
+
Token Accuracy: 94.94%
|
| 457 |
+
Code-Mix Preservation: 100%
|
| 458 |
+
|
| 459 |
+
Beam Decoder (beam width = 5):
|
| 460 |
+
Exact Match: 31/40 (77.5%)
|
| 461 |
+
Character Error Rate: 0.0206
|
| 462 |
+
Word Error Rate: 0.0590
|
| 463 |
+
BLEU: 0.8232
|
| 464 |
+
Token Accuracy: 94.10%
|
| 465 |
+
Code-Mix Preservation: 100%
|
| 466 |
+
|
| 467 |
+
* Dataset corrections applied (see Section 5.1 note).
|
| 468 |
+
|
| 469 |
+
Result: Metrics are IDENTICAL to the baseline and to Experiment 1.
|
| 470 |
+
|
| 471 |
+
11.5 Key Ambiguity Test Case
|
| 472 |
+
Input: "api kalaya ithuru krgnna oni"
|
| 473 |
+
Expected: "ΰΆΰΆ΄ΰ· ΰΆΰ·ΰΆ½ΰΆΊ ΰΆΰΆΰ·ΰΆ»ΰ· ΰΆΰΆ»ΰΆΰΆ±ΰ·ΰΆ± ΰΆΰΆ±ΰ·"
|
| 474 |
+
Output: "ΰΆΰΆ΄ΰ· ΰΆΰΆ½ΰΆΊ ΰΆΰΆΰ·ΰΆ»ΰ· ΰΆΰΆ»ΰΆΰΆ±ΰ·ΰΆ± ΰΆΰΆ±ΰ·"
|
| 475 |
+
|
| 476 |
+
Conclusion: Even after 500K informal Sinhala samples, the model still prefers
|
| 477 |
+
the wrong sense ("ΰΆΰΆ½ΰΆΊ" = pot) over the intended contextually correct word
|
| 478 |
+
("ΰΆΰ·ΰΆ½ΰΆΊ" = time).
|
| 479 |
+
|
| 480 |
+
11.6 Interpretation
|
| 481 |
+
This is an IMPORTANT negative result:
|
| 482 |
+
1. MLM loss improved substantially on the informal corpus.
|
| 483 |
+
2. Eval loss on held-out informal text also improved.
|
| 484 |
+
3. HOWEVER, these improvements did NOT transfer to the actual SinCode task.
|
| 485 |
+
|
| 486 |
+
Therefore, better MLM perplexity / eval loss does not automatically imply
|
| 487 |
+
better transliteration disambiguation performance.
|
| 488 |
+
|
| 489 |
+
Likely reasons:
|
| 490 |
+
1. TASK MISMATCH: Continued MLM pre-training is still an indirect objective;
|
| 491 |
+
the downstream task is candidate ranking for transliteration ambiguity.
|
| 492 |
+
2. HYBRID SYSTEM BOTTLENECK: Overall errors may now be dominated by
|
| 493 |
+
dictionary coverage, candidate generation, or the scoring blend rather
|
| 494 |
+
than raw MLM knowledge alone.
|
| 495 |
+
3. SEMANTIC SENSE CONFUSION REMAINS: The model learned more Sinhala surface
|
| 496 |
+
patterns, but not enough to reliably separate difficult near-homophone /
|
| 497 |
+
near-spelling ambiguities in transliterated user input.
|
| 498 |
+
|
| 499 |
+
11.7 Thesis-Ready Conclusion
|
| 500 |
+
Experiment 2 demonstrates that scaling MLM continued pre-training from a
|
| 501 |
+
small formal corpus (Wikipedia) to a much larger informal corpus (500K mixed-
|
| 502 |
+
register Sinhala samples) improves language-model loss but still yields NO
|
| 503 |
+
measurable improvement on SinCode's 40-sentence transliteration benchmark.
|
| 504 |
+
|
| 505 |
+
This supports the thesis argument that future gains are more likely to come
|
| 506 |
+
from TASK-SPECIFIC supervision (SinglishβSinhala pairs), better ambiguity-
|
| 507 |
+
focused ranking, and improved candidate generation rather than generic MLM
|
| 508 |
+
continued pre-training alone.
|
| 509 |
+
|
| 510 |
+
================================================================================
|
| 511 |
+
|
| 512 |
+
12. EXPERIMENT 3 β PIPELINE IMPROVEMENTS (28 MARCH 2026)
|
| 513 |
+
================================================================================
|
| 514 |
+
|
| 515 |
+
12.1 Dataset Corrections
|
| 516 |
+
After manual review of all 40 sentence results, 3 reference labels were
|
| 517 |
+
found to be incorrect. The system's output was actually correct:
|
| 518 |
+
|
| 519 |
+
#3 Input: pola nisa gedara thiyanawa
|
| 520 |
+
Old Ref: ΰΆ΄ΰ·ΰ·ΰ·ΰΆ ΰΆΰ·ΰΆ―ΰΆ» ΰΆΰ·ΰΆΊΰ·ΰΆ±ΰ·ΰ· ("near the fair")
|
| 521 |
+
Output: ΰΆ΄ΰ·ΰΆ½ ΰΆ±ΰ·ΰ·ΰ· ΰΆΰ·ΰΆ―ΰΆ» ΰΆΰ·ΰΆΊΰ·ΰΆ±ΰ·ΰ· β CORRECT (nisa = because)
|
| 522 |
+
Corrected Ref: ΰΆ΄ΰ·ΰΆ½ ΰΆ±ΰ·ΰ·ΰ· ΰΆΰ·ΰΆ―ΰΆ» ΰΆΰ·ΰΆΊΰ·ΰΆ±ΰ·ΰ·
|
| 523 |
+
|
| 524 |
+
#5 Input: mama danne na eka gena
|
| 525 |
+
Old Ref: ΰΆΈΰΆΈ ΰΆ―ΰΆ±ΰ·ΰΆ±ΰ· ΰΆ±ΰ· ΰΆ ΰΆΰ·ΰΆ± (pronoun Δ)
|
| 526 |
+
Output: ΰΆΈΰΆΈ ΰΆ―ΰΆ±ΰ·ΰΆ±ΰ· ΰΆ±ΰ· ΰΆΰΆ ΰΆΰ·ΰΆ± β CORRECT (eka = that one)
|
| 527 |
+
Corrected Ref: ΰΆΈΰΆΈ ΰΆ―ΰΆ±ΰ·ΰΆ±ΰ· ΰΆ±ΰ· ΰΆΰΆ ΰΆΰ·ΰΆ±
|
| 528 |
+
|
| 529 |
+
#17 Input: hta parikshanaya thiyanawa
|
| 530 |
+
Old Ref: ΰ·ΰ·ΰΆ§ ΰΆ΄ΰΆ»ΰ·ΰΆΰ·ΰ·ΰΆ«ΰΆΊ (long vowel ΰΆ»ΰ·)
|
| 531 |
+
Output: ΰ·ΰ·ΰΆ§ ΰΆ΄ΰΆ»ΰ·ΰΆΰ·βΰ·ΰΆ«ΰΆΊ β CORRECT standard orthography
|
| 532 |
+
Corrected Ref: ΰ·ΰ·ΰΆ§ ΰΆ΄ΰΆ»ΰ·ΰΆΰ·βΰ·ΰΆ«ΰΆΊ
|
| 533 |
+
|
| 534 |
+
Additional corrections to inputs/references: #28 (input changed), #33
|
| 535 |
+
(reference updated), #38 (input changed to ad-hoc pinthurayk).
|
| 536 |
+
|
| 537 |
+
Corrected dataset baseline (same code, corrected labels): 35/40 (87.5%)
|
| 538 |
+
|
| 539 |
+
12.2 MLM Normalization Fix: Min-Max β Softmax
|
| 540 |
+
Root-cause analysis of the "kalaya" failure revealed that min-max
|
| 541 |
+
normalization was amplifying tiny raw score differences into extreme
|
| 542 |
+
0.0 vs 1.0 values, destroying the model's confidence signal.
|
| 543 |
+
|
| 544 |
+
Example (kalaya, context: "api ___ ithuru krgnna oni"):
|
| 545 |
+
Raw log-probs: ΰΆΰΆ½ΰΆΊ=-5.2182, ΰΆΰ·ΰΆ½ΰΆΊ=-6.3120 (diff = 1.09)
|
| 546 |
+
Min-max: ΰΆΰΆ½ΰΆΊ=1.0, ΰΆΰ·ΰΆ½ΰΆΊ=0.0 (diff = 1.0 β exaggerated)
|
| 547 |
+
Softmax: ΰΆΰΆ½ΰΆΊβ0.75, ΰΆΰ·ΰΆ½ΰΆΊβ0.25 (preserves relative confidence)
|
| 548 |
+
|
| 549 |
+
The fidelity signal then competes fairly against the softmax scores
|
| 550 |
+
instead of being overwhelmed by a 0/1 binary from min-max.
|
| 551 |
+
|
| 552 |
+
Implementation (core/decoder.py):
|
| 553 |
+
@staticmethod
|
| 554 |
+
def _softmax_normalize(raw_scores):
|
| 555 |
+
max_s = max(raw_scores)
|
| 556 |
+
exps = [math.exp(s - max_s) for s in raw_scores]
|
| 557 |
+
total = sum(exps)
|
| 558 |
+
return [e / total for e in exps]
|
| 559 |
+
|
| 560 |
+
12.3 Context-Aware English Detection
|
| 561 |
+
Problem: Words like "game" exist in both English (video game) and Sinhala
|
| 562 |
+
dictionary (ΰΆΰΆΈΰ· = of the village). The English shortcut was preserving
|
| 563 |
+
"game" as English even in Sinhala-context sentences.
|
| 564 |
+
|
| 565 |
+
Fix: Added semantic ambiguity criterion:
|
| 566 |
+
if rule_output in dictionary AND len(dictionary[rule_output]) >= 3:
|
| 567 |
+
β skip English shortcut, let MLM decide
|
| 568 |
+
|
| 569 |
+
Also added is_ambiguous flag to scorer: reduces fidelity bonus from
|
| 570 |
+
2.0 to 0.5 for ambiguous words, so MLM has more influence.
|
| 571 |
+
|
| 572 |
+
12.4 Pseudo-Perplexity Comparison (MLM Quality)
|
| 573 |
+
Evaluated on 15 natural Sinhala sentences using leave-one-out masking:
|
| 574 |
+
|
| 575 |
+
ββββββββββββββββββββββββββββ¬βββββββββββββββββββ¬βββββββββββ
|
| 576 |
+
β Model β Pseudo-Perplexityβ Avg NLL β
|
| 577 |
+
ββββββββββββββββββββββββββββΌβββββββββββββββββββΌβββββββββββ€
|
| 578 |
+
β Base (xlm-roberta-base) β 35.35 β 3.5654 β
|
| 579 |
+
β Fine-tuned (v2) β 15.95 β 2.7692 β
|
| 580 |
+
ββββββββββββββββββββββββββββ΄βββββββββββββββββββ΄βββββββββββ
|
| 581 |
+
|
| 582 |
+
The fine-tuned model has 55% lower perplexity β confirming that
|
| 583 |
+
MLM fine-tuning genuinely improved Sinhala language understanding.
|
| 584 |
+
However, this did not translate to downstream task improvement,
|
| 585 |
+
demonstrating that the pipeline architecture is the primary bottleneck.
|
| 586 |
+
|
| 587 |
+
12.5 Final Evaluation Results (Corrected Dataset + All Improvements)
|
| 588 |
+
βββββββββββββββββββββββββββ¬βββββββββββββββββββ¬ββββββββββββββββββββββ
|
| 589 |
+
β Metric β Base + Softmax β Fine-tuned + Softmaxβ
|
| 590 |
+
βββββββββββββββββββββββββββΌβββββββββββββββββββΌββββββββββββββββββββββ€
|
| 591 |
+
β Exact Match β 37/40 (92.5%) β
β 36/40 (90.0%) β
|
| 592 |
+
β Character Error Rate β 0.0064 β 0.0076 β
|
| 593 |
+
β Word Error Rate β 0.0238 β 0.0300 β
|
| 594 |
+
β BLEU Score β 0.9417 β 0.9167 β
|
| 595 |
+
β Token Accuracy β 97.62% β 97.00% β
|
| 596 |
+
β Code-Mix Preservation β 100% β 100% β
|
| 597 |
+
βββββββββββββββββββββββββββ΄βββββββββββββββββββ΄ββββββββββββββββββββββ
|
| 598 |
+
β
= Best configuration (deployed to production)
|
| 599 |
+
|
| 600 |
+
Remaining 3 failures (all minor):
|
| 601 |
+
#19: ΰΆΰ·ΰΆ»ΰ·ΰ·ΰΆ»ΰΆΊ vs ΰΆΰ·ΰΆ»ΰ·ΰ·ΰΆ»ΰΆΊΰ· β missing trailing Δ vowel
|
| 602 |
+
#28: Multiple diffs in complex word ΰΆΰΆ½ΰΆΈΰΆ«ΰ·ΰΆΰΆ»ΰ·
|
| 603 |
+
#33: Subtle grammar distinction (ΰΆΰ·ΰ· vs ΰΆΰΆ½, ΰΆ―ΰΆ±ΰ·ΰΆ±ΰ· vs ΰΆ―ΰΆ±ΰ·ΰΆ±ΰ·)
|
| 604 |
+
|
| 605 |
+
12.6 Progression Summary (All Experiments)
|
| 606 |
+
βββββββββββββββββββββββββββββββββββββββββββββ¬ββββββββββββ¬βββββββββββββ
|
| 607 |
+
β Configuration β Raw Score β Corrected β
|
| 608 |
+
βββββββββββββββββββββββββββββββββββββββββββββΌββββββββββββΌβββββββββββββ€
|
| 609 |
+
β Baseline (min-max, original dataset) β 32/40 β 35/40 β
|
| 610 |
+
β + MLM fine-tune Exp 1 (Wikipedia) β 32/40 β 35/40 β
|
| 611 |
+
β + MLM fine-tune Exp 2 (500K informal) β 32/40 β 35/40 β
|
| 612 |
+
β + Softmax normalization β 33/40 β β β
|
| 613 |
+
β + Dataset corrections (final) β 37/40 β 37/40 β
|
| 614 |
+
βββββββββββββββββββββββββββββββββββββββββββββ΄ββββββββββββ΄βββββββββββββ
|
| 615 |
+
|
| 616 |
+
Key finding: Pipeline improvements (softmax normalization, ambiguity
|
| 617 |
+
handling, dataset correction) contributed +5 sentences over baseline,
|
| 618 |
+
while MLM fine-tuning contributed 0. This strongly supports the thesis
|
| 619 |
+
conclusion that scoring architecture matters more than model capacity
|
| 620 |
+
for this hybrid neuro-symbolic transliteration task.
|
| 621 |
+
|
| 622 |
+
================================================================================
|
| 623 |
+
END OF DOCUMENT β Experiments 1, 2, and 3 recorded
|
| 624 |
+
================================================================================
|
fine_tuning/attempt_2_informal_sinhala/compare_perplexity.py
ADDED
|
@@ -0,0 +1,86 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Compare raw MLM quality: base vs fine-tuned model on Sinhala sentences."""
|
| 2 |
+
import sys, os, math, torch
|
| 3 |
+
sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
|
| 4 |
+
|
| 5 |
+
from transformers import AutoTokenizer, AutoModelForMaskedLM
|
| 6 |
+
|
| 7 |
+
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
| 8 |
+
|
| 9 |
+
# Sinhala sentences for perplexity measurement (natural Sinhala, not transliterated)
|
| 10 |
+
# Probe set for pseudo-perplexity measurement: each entry is a natural
# (non-transliterated) Sinhala sentence, several of which mirror the
# 40-sentence gold set used in the downstream SinCode evaluation.
sinhala_sentences = [
    "ΰΆΈΰΆΈ ΰ·ΰ·ΰΆ§ ΰΆ΄ΰ·ΰ·ΰΆ½ΰΆ§ ΰΆΊΰΆ±ΰ·ΰ·",
    "ΰΆΰΆΊΰ· ΰΆΰ·ΰ·ΰ·ΰΆ― ΰΆΊΰΆ±ΰ·ΰΆ±ΰ·",
    "ΰΆΰΆ΄ΰ· ΰΆΰΆΰΆ§ ΰ·ΰ·ΰΆ© ΰΆΰΆ»ΰΆΈΰ·",
    "ΰΆΈΰΆ§ ΰΆΰΆ ΰΆΰ·ΰΆ»ΰ·ΰΆ«ΰ· ΰΆ±ΰ·",
    "ΰΆΰ·ΰΆ»ΰ·ΰ·ΰΆ»ΰΆΊΰ· ΰΆ΄ΰ·ΰΆ©ΰΆΈ ΰΆΰ·ΰ·ΰ·ΰ·ΰ·",
    "ΰΆΰ·ΰΆ½ΰΆΊ ΰΆΰΆΰ·ΰΆ»ΰ· ΰΆΰΆ»ΰΆΰΆ±ΰ·ΰΆ± ΰΆΰΆ±ΰ·",
    "ΰΆΈΰΆΈ ΰΆ΄ΰ·ΰΆ ΰΆΰ·ΰΆΊΰ·ΰΆ½ΰ· ΰΆΰ·ΰΆ» ΰΆΰ·ΰ·",
    "ΰΆΰΆΊΰ· ΰΆΰ·ΰ· ΰ·ΰΆΰ· ΰΆΈΰΆ§ ΰ·ΰ·ΰΆΰ·ΰΆ±ΰ·ΰ·",
    "ΰΆΰΆ΄ΰ· ΰ·ΰ·ΰΆ© ΰΆΰΆ― ΰΆΰ·ΰΆ» ΰ·ΰ·ΰΆ±ΰ·ΰ·",
    "ΰΆ΄ΰ·βΰΆ»ΰ·ΰ·ΰΆ±ΰΆΊ ΰ·ΰ·ΰΆ³ ΰ·ΰΆΰ· ΰΆ΄ΰ·ΰΆ±ΰ·ΰ·",
    "ΰΆΈΰΆΈ ΰΆ―ΰΆ±ΰ·ΰΆ±ΰ· ΰΆ±ΰ· ΰΆΰΆ ΰΆΰ·ΰΆ±",
    "ΰΆΰΆΊΰ· ΰΆΰ·ΰ·ΰ·ΰ·ΰΆ§ ΰΆΈΰΆΈ ΰΆΰ·ΰΆΊΰ·",
    "ΰ·ΰ·ΰΆ§ ΰΆ΄ΰΆ»ΰ·ΰΆΰ·ΰ·ΰΆ«ΰΆΊ ΰΆΰ·ΰΆΊΰ·ΰΆ±ΰ·ΰ·",
    "ΰΆΰΆ΄ΰ· ΰΆ΄ΰ·ΰ·ΰ·ΰ· ΰ·ΰΆΈΰ·ΰΆΆΰ·ΰ·ΰΆΈΰ·",
    "ΰΆΈΰ· ΰ·ΰ·ΰΆ© ΰ·ΰ·ΰΆ³ ΰ·ΰΆΰ· ΰΆ΄ΰ·ΰΆ±ΰ·ΰ·",
]
|
| 27 |
+
|
| 28 |
+
def compute_pseudo_perplexity(model, tokenizer, sentences):
    """Compute pseudo-perplexity via leave-one-out masked prediction.

    For every non-special token of every sentence, that token is replaced
    with the mask token and the model's log-probability of the original
    token at the masked position is accumulated.  Lower pseudo-perplexity
    means the model assigns higher likelihood to the text.

    Args:
        model: A masked-LM model (e.g. AutoModelForMaskedLM), already on
            the module-level `device`.
        tokenizer: The tokenizer matching `model`.
        sentences: Iterable of raw text strings to score.

    Returns:
        Tuple of (pseudo_perplexity, average_negative_log_likelihood,
        total_tokens_scored).

    Raises:
        ValueError: If no scorable tokens are found (e.g. `sentences` is
            empty or contains only special tokens).
    """
    model.eval()

    # Some tokenizers report None for unused special tokens; filter those
    # out so membership tests against real token ids stay correct.
    special_ids = {
        tid
        for tid in (
            tokenizer.bos_token_id,
            tokenizer.eos_token_id,
            tokenizer.pad_token_id,
            tokenizer.cls_token_id,
            tokenizer.sep_token_id,
        )
        if tid is not None
    }

    total_log_prob = 0.0
    total_tokens = 0

    with torch.no_grad():
        for sent in sentences:
            inputs = tokenizer(sent, return_tensors="pt", truncation=True, max_length=128).to(device)
            input_ids = inputs["input_ids"][0]

            # Positions eligible for leave-one-out masking.
            positions = [
                i for i, tid in enumerate(input_ids.tolist())
                if tid not in special_ids
            ]
            if not positions:
                continue

            # Batch all masked variants of this sentence into ONE forward
            # pass (one row per masked position) instead of a model call
            # per token -- identical math, far fewer forward passes.
            original_ids = [input_ids[pos].item() for pos in positions]
            batch = input_ids.unsqueeze(0).repeat(len(positions), 1)
            for row, pos in enumerate(positions):
                batch[row, pos] = tokenizer.mask_token_id
            attn = inputs["attention_mask"].repeat(len(positions), 1)

            logits = model(batch, attention_mask=attn).logits
            log_probs = torch.log_softmax(logits, dim=-1)
            for row, (pos, orig_id) in enumerate(zip(positions, original_ids)):
                total_log_prob += log_probs[row, pos, orig_id].item()
            total_tokens += len(positions)

    if total_tokens == 0:
        # Previously this fell through to a ZeroDivisionError; fail clearly.
        raise ValueError("No scorable tokens found in the given sentences.")

    avg_nll = -total_log_prob / total_tokens
    ppl = math.exp(avg_nll)
    return ppl, avg_nll, total_tokens
|
| 59 |
+
|
| 60 |
+
# The two checkpoints under comparison: the public XLM-R base model and the
# locally fine-tuned v2 model from the informal-corpus MLM run.
models = {
    "Base (xlm-roberta-base)": "FacebookAI/xlm-roberta-base",
    "Fine-tuned (v2)": os.path.join(os.path.dirname(__file__), "..", "xlm-roberta-sinhala-v2", "final"),
}

print("=" * 60)
print(" MLM Pseudo-Perplexity Comparison on Sinhala Text")
print("=" * 60)
print(f" Test sentences: {len(sinhala_sentences)}")
print()

# Score each model on the identical probe set so the perplexity numbers are
# directly comparable.
for name, path in models.items():
    print(f"Loading {name}...")
    tokenizer = AutoTokenizer.from_pretrained(path)
    model = AutoModelForMaskedLM.from_pretrained(path).to(device)

    ppl, avg_nll, n_tokens = compute_pseudo_perplexity(model, tokenizer, sinhala_sentences)
    print(f" {name}:")
    print(f" Pseudo-Perplexity : {ppl:.2f}")
    print(f" Avg NLL : {avg_nll:.4f}")
    print(f" Tokens evaluated : {n_tokens}")
    print()

    # Free GPU memory before loading the next (large) checkpoint; the call
    # is a harmless no-op when running on CPU.
    del model
    torch.cuda.empty_cache()

print("Lower perplexity = better Sinhala language understanding")
|
fine_tuning/attempt_2_informal_sinhala/eval_diagnostics.json
ADDED
|
@@ -0,0 +1,1432 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[
|
| 2 |
+
{
|
| 3 |
+
"id": 1,
|
| 4 |
+
"input": "api kalin katha kala",
|
| 5 |
+
    "reference": "ΰΆΰΆ΄ΰ· ΰΆΰΆ½ΰ·ΰΆ±ΰ· ΰΆΰΆΰ· ΰΆΰ·ΰ·",
|
| 6 |
+
    "prediction": "ΰΆΰΆ΄ΰ· ΰΆΰΆ½ΰ·ΰΆ±ΰ· ΰΆΰΆΰ· ΰΆΰ·ΰ·",
|
| 7 |
+
"exact_match": true,
|
| 8 |
+
"cer": 0.0,
|
| 9 |
+
"wer": 0.0,
|
| 10 |
+
"bleu": 1.0,
|
| 11 |
+
"token_accuracy": 1.0,
|
| 12 |
+
"code_mix_preservation": 1.0,
|
| 13 |
+
"time_s": 0.002
|
| 14 |
+
},
|
| 15 |
+
{
|
| 16 |
+
"id": 2,
|
| 17 |
+
"input": "eka honda wage thiyanawa",
|
| 18 |
+
"reference": "ΰΆΰΆ ΰ·ΰ·ΰΆ³ ΰ·ΰΆΰ· ΰΆΰ·ΰΆΊΰ·ΰΆ±ΰ·ΰ·",
|
| 19 |
+
"prediction": "ΰΆΰΆ ΰ·ΰ·ΰΆ³ ΰ·ΰΆΰ· ΰΆΰ·ΰΆΊΰ·ΰΆ±ΰ·ΰ·",
|
| 20 |
+
"exact_match": true,
|
| 21 |
+
"cer": 0.0,
|
| 22 |
+
"wer": 0.0,
|
| 23 |
+
"bleu": 1.0,
|
| 24 |
+
"token_accuracy": 1.0,
|
| 25 |
+
"code_mix_preservation": 1.0,
|
| 26 |
+
"time_s": 0.002
|
| 27 |
+
},
|
| 28 |
+
{
|
| 29 |
+
"id": 3,
|
| 30 |
+
"input": "meheta thadata wessa",
|
| 31 |
+
"reference": "ΰΆΈΰ·ΰ·ΰ·ΰΆ§ ΰΆΰΆ―ΰΆ§ ΰ·ΰ·ΰ·ΰ·ΰ·ΰ·",
|
| 32 |
+
"prediction": "ΰΆΈΰ·ΰ·ΰ·ΰΆ§ ΰΆΰΆ―ΰΆ§ ΰ·ΰ·ΰ·ΰ·ΰ·ΰ·",
|
| 33 |
+
"exact_match": true,
|
| 34 |
+
"cer": 0.0,
|
| 35 |
+
"wer": 0.0,
|
| 36 |
+
"bleu": 1.0,
|
| 37 |
+
"token_accuracy": 1.0,
|
| 38 |
+
"code_mix_preservation": 1.0,
|
| 39 |
+
"time_s": 0.217
|
| 40 |
+
},
|
| 41 |
+
{
|
| 42 |
+
"id": 4,
|
| 43 |
+
"input": "oya kiwwata mama giye",
|
| 44 |
+
"reference": "ΰΆΰΆΊΰ· ΰΆΰ·ΰ·ΰ·ΰ·ΰΆ§ ΰΆΈΰΆΈ ΰΆΰ·ΰΆΊΰ·",
|
| 45 |
+
"prediction": "ΰΆΰΆΊΰ· ΰΆΰ·ΰ·ΰ·ΰ·ΰΆ§ ΰΆΈΰΆΈ ΰΆΰ·ΰΆΊΰ·",
|
| 46 |
+
"exact_match": true,
|
| 47 |
+
"cer": 0.0,
|
| 48 |
+
"wer": 0.0,
|
| 49 |
+
"bleu": 1.0,
|
| 50 |
+
"token_accuracy": 1.0,
|
| 51 |
+
"code_mix_preservation": 1.0,
|
| 52 |
+
"time_s": 0.043
|
| 53 |
+
},
|
| 54 |
+
{
|
| 55 |
+
"id": 5,
|
| 56 |
+
"input": "mama danne na eka gena",
|
| 57 |
+
"reference": "ΰΆΈΰΆΈ ΰΆ―ΰΆ±ΰ·ΰΆ±ΰ· ΰΆ±ΰ· ΰΆΰΆ ΰΆΰ·ΰΆ±",
|
| 58 |
+
"prediction": "ΰΆΈΰΆΈ ΰΆ―ΰΆ±ΰ·ΰΆ±ΰ· ΰΆ±ΰ· ΰΆΰΆ ΰΆΰ·ΰΆ±",
|
| 59 |
+
"exact_match": true,
|
| 60 |
+
"cer": 0.0,
|
| 61 |
+
"wer": 0.0,
|
| 62 |
+
"bleu": 1.0,
|
| 63 |
+
"token_accuracy": 1.0,
|
| 64 |
+
"code_mix_preservation": 1.0,
|
| 65 |
+
"time_s": 0.002
|
| 66 |
+
},
|
| 67 |
+
{
|
| 68 |
+
"id": 6,
|
| 69 |
+
"input": "oya awa wage na",
|
| 70 |
+
"reference": "ΰΆΰΆΊΰ· ΰΆΰ·ΰ· ΰ·ΰΆΰ· ΰΆ±ΰ·",
|
| 71 |
+
"prediction": "ΰΆΰΆΊΰ· ΰΆΰ·ΰ· ΰ·ΰΆΰ· ΰΆ±ΰ·",
|
| 72 |
+
"exact_match": true,
|
| 73 |
+
"cer": 0.0,
|
| 74 |
+
"wer": 0.0,
|
| 75 |
+
"bleu": 1.0,
|
| 76 |
+
"token_accuracy": 1.0,
|
| 77 |
+
"code_mix_preservation": 1.0,
|
| 78 |
+
"time_s": 0.001
|
| 79 |
+
},
|
| 80 |
+
{
|
| 81 |
+
"id": 7,
|
| 82 |
+
"input": "ekat ynna bri",
|
| 83 |
+
"reference": "ΰΆΰΆΰΆ§ ΰΆΊΰΆ±ΰ·ΰΆ± ΰΆΆΰ·ΰΆ»ΰ·",
|
| 84 |
+
"prediction": "ΰΆΰΆΰΆ§ ΰΆΊΰΆ±ΰ·ΰΆ± ΰΆΆΰ·ΰΆ»ΰ·",
|
| 85 |
+
"exact_match": true,
|
| 86 |
+
"cer": 0.0,
|
| 87 |
+
"wer": 0.0,
|
| 88 |
+
"bleu": 1.0,
|
| 89 |
+
"token_accuracy": 1.0,
|
| 90 |
+
"code_mix_preservation": 1.0,
|
| 91 |
+
"time_s": 0.024
|
| 92 |
+
},
|
| 93 |
+
{
|
| 94 |
+
"id": 8,
|
| 95 |
+
"input": "mama inne gedaradi",
|
| 96 |
+
"reference": "ΰΆΈΰΆΈ ΰΆΰΆ±ΰ·ΰΆ±ΰ· ΰΆΰ·ΰΆ―ΰΆ»ΰΆ―ΰ·",
|
| 97 |
+
"prediction": "ΰΆΈΰΆΈ ΰΆΰΆ±ΰ·ΰΆ±ΰ· ΰΆΰ·ΰΆ―ΰΆ»ΰΆ―ΰ·",
|
| 98 |
+
"exact_match": true,
|
| 99 |
+
"cer": 0.0,
|
| 100 |
+
"wer": 0.0,
|
| 101 |
+
"bleu": 1.0,
|
| 102 |
+
"token_accuracy": 1.0,
|
| 103 |
+
"code_mix_preservation": 1.0,
|
| 104 |
+
"time_s": 0.001
|
| 105 |
+
},
|
| 106 |
+
{
|
| 107 |
+
"id": 9,
|
| 108 |
+
"input": "eka heta balamu",
|
| 109 |
+
"reference": "ΰΆΰΆ ΰ·ΰ·ΰΆ§ ΰΆΆΰΆ½ΰΆΈΰ·",
|
| 110 |
+
"prediction": "ΰΆΰΆ ΰ·ΰ·ΰΆ§ ΰΆΆΰΆ½ΰΆΈΰ·",
|
| 111 |
+
"exact_match": true,
|
| 112 |
+
"cer": 0.0,
|
| 113 |
+
"wer": 0.0,
|
| 114 |
+
"bleu": 1.0,
|
| 115 |
+
"token_accuracy": 1.0,
|
| 116 |
+
"code_mix_preservation": 1.0,
|
| 117 |
+
"time_s": 0.001
|
| 118 |
+
},
|
| 119 |
+
{
|
| 120 |
+
"id": 10,
|
| 121 |
+
"input": "klya madi api passe yamu",
|
| 122 |
+
"reference": "ΰΆΰ·ΰΆ½ΰΆΊ ΰΆΈΰΆ―ΰ· ΰΆ
ΰΆ΄ΰ· ΰΆ΄ΰ·ΰ·ΰ·ΰ· ΰΆΊΰΆΈΰ·",
|
| 123 |
+
"prediction": "ΰΆΰ·ΰΆ½ΰΆΊ ΰΆΈΰΆ―ΰ· ΰΆ
ΰΆ΄ΰ· ΰΆ΄ΰ·ΰ·ΰ·ΰ· ΰΆΊΰΆΈΰ·",
|
| 124 |
+
"exact_match": true,
|
| 125 |
+
"cer": 0.0,
|
| 126 |
+
"wer": 0.0,
|
| 127 |
+
"bleu": 1.0,
|
| 128 |
+
"token_accuracy": 1.0,
|
| 129 |
+
"code_mix_preservation": 1.0,
|
| 130 |
+
"time_s": 0.028
|
| 131 |
+
},
|
| 132 |
+
{
|
| 133 |
+
"id": 11,
|
| 134 |
+
"input": "assignment eka ada submit karanna one",
|
| 135 |
+
"reference": "assignment ΰΆΰΆ ΰΆ
ΰΆ― submit ΰΆΰΆ»ΰΆ±ΰ·ΰΆ± ΰΆΰΆ±ΰ·",
|
| 136 |
+
"prediction": "assignment ΰΆΰΆ ΰΆ
ΰΆ― submit ΰΆΰΆ»ΰΆ±ΰ·ΰΆ± ΰΆΰΆ±ΰ·",
|
| 137 |
+
"exact_match": true,
|
| 138 |
+
"cer": 0.0,
|
| 139 |
+
"wer": 0.0,
|
| 140 |
+
"bleu": 1.0,
|
| 141 |
+
"token_accuracy": 1.0,
|
| 142 |
+
"code_mix_preservation": 1.0,
|
| 143 |
+
"time_s": 0.027
|
| 144 |
+
},
|
| 145 |
+
{
|
| 146 |
+
"id": 12,
|
| 147 |
+
"input": "exam hall eka nisa mama baya una",
|
| 148 |
+
"reference": "exam hall ΰΆΰΆ ΰΆ±ΰ·ΰ·ΰ· ΰΆΈΰΆΈ ΰΆΆΰΆΊ ΰΆΰΆ±ΰ·",
|
| 149 |
+
"prediction": "exam hall ΰΆΰΆ ΰΆ±ΰ·ΰ·ΰ· ΰΆΈΰΆΈ ΰΆΆΰΆΊ ΰΆΰΆ±ΰ·",
|
| 150 |
+
"exact_match": true,
|
| 151 |
+
"cer": 0.0,
|
| 152 |
+
"wer": 0.0,
|
| 153 |
+
"bleu": 1.0,
|
| 154 |
+
"token_accuracy": 1.0,
|
| 155 |
+
"code_mix_preservation": 1.0,
|
| 156 |
+
"time_s": 0.027
|
| 157 |
+
},
|
| 158 |
+
{
|
| 159 |
+
"id": 13,
|
| 160 |
+
"input": "results blnna one",
|
| 161 |
+
"reference": "results ΰΆΆΰΆ½ΰΆ±ΰ·ΰΆ± ΰΆΰΆ±ΰ·",
|
| 162 |
+
"prediction": "results ΰΆΆΰΆ½ΰΆ±ΰ·ΰΆ± ΰΆΰΆ±ΰ·",
|
| 163 |
+
"exact_match": true,
|
| 164 |
+
"cer": 0.0,
|
| 165 |
+
"wer": 0.0,
|
| 166 |
+
"bleu": 1.0,
|
| 167 |
+
"token_accuracy": 1.0,
|
| 168 |
+
"code_mix_preservation": 1.0,
|
| 169 |
+
"time_s": 0.001
|
| 170 |
+
},
|
| 171 |
+
{
|
| 172 |
+
"id": 14,
|
| 173 |
+
"input": "study group ekak hadamu",
|
| 174 |
+
"reference": "study group ΰΆΰΆΰΆΰ· ΰ·ΰΆ―ΰΆΈΰ·",
|
| 175 |
+
"prediction": "study group ΰΆΰΆΰΆΰ· ΰ·ΰΆ―ΰΆΈΰ·",
|
| 176 |
+
"exact_match": true,
|
| 177 |
+
"cer": 0.0,
|
| 178 |
+
"wer": 0.0,
|
| 179 |
+
"bleu": 1.0,
|
| 180 |
+
"token_accuracy": 1.0,
|
| 181 |
+
"code_mix_preservation": 1.0,
|
| 182 |
+
"time_s": 0.021
|
| 183 |
+
},
|
| 184 |
+
{
|
| 185 |
+
"id": 15,
|
| 186 |
+
"input": "viva ekta prepared wage na",
|
| 187 |
+
"reference": "viva ΰΆΰΆΰΆ§ prepared ΰ·ΰΆΰ· ΰΆ±ΰ·",
|
| 188 |
+
"prediction": "viva ΰΆΰΆΰΆ§ prepared ΰ·ΰΆΰ· ΰΆ±ΰ·",
|
| 189 |
+
"exact_match": true,
|
| 190 |
+
"cer": 0.0,
|
| 191 |
+
"wer": 0.0,
|
| 192 |
+
"bleu": 1.0,
|
| 193 |
+
"token_accuracy": 1.0,
|
| 194 |
+
"code_mix_preservation": 1.0,
|
| 195 |
+
"time_s": 0.002
|
| 196 |
+
},
|
| 197 |
+
{
|
| 198 |
+
"id": 16,
|
| 199 |
+
"input": "mta project ek submit krnna one",
|
| 200 |
+
"reference": "ΰΆΈΰΆ§ project ΰΆΰΆ submit ΰΆΰΆ»ΰΆ±ΰ·ΰΆ± ΰΆΰΆ±ΰ·",
|
| 201 |
+
"prediction": "ΰΆΈΰΆ§ project ΰΆΰΆ submit ΰΆΰΆ»ΰΆ±ΰ·ΰΆ± ΰΆΰΆ±ΰ·",
|
| 202 |
+
"exact_match": true,
|
| 203 |
+
"cer": 0.0,
|
| 204 |
+
"wer": 0.0,
|
| 205 |
+
"bleu": 1.0,
|
| 206 |
+
"token_accuracy": 1.0,
|
| 207 |
+
"code_mix_preservation": 1.0,
|
| 208 |
+
"time_s": 0.002
|
| 209 |
+
},
|
| 210 |
+
{
|
| 211 |
+
"id": 17,
|
| 212 |
+
"input": "hta parikshanaya thiyanawa",
|
| 213 |
+
"reference": "ΰ·ΰ·ΰΆ§ ΰΆ΄ΰΆ»ΰ·ΰΆΰ·βΰ·ΰΆ«ΰΆΊ ΰΆΰ·ΰΆΊΰ·ΰΆ±ΰ·ΰ·",
|
| 214 |
+
"prediction": "ΰ·ΰ·ΰΆ§ ΰΆ΄ΰΆ»ΰ·ΰΆΰ·βΰ·ΰΆ«ΰΆΊ ΰΆΰ·ΰΆΊΰ·ΰΆ±ΰ·ΰ·",
|
| 215 |
+
"exact_match": true,
|
| 216 |
+
"cer": 0.0,
|
| 217 |
+
"wer": 0.0,
|
| 218 |
+
"bleu": 1.0,
|
| 219 |
+
"token_accuracy": 1.0,
|
| 220 |
+
"code_mix_preservation": 1.0,
|
| 221 |
+
"time_s": 0.02
|
| 222 |
+
},
|
| 223 |
+
{
|
| 224 |
+
"id": 18,
|
| 225 |
+
"input": "mama potha kiyawala iwara kala",
|
| 226 |
+
"reference": "ΰΆΈΰΆΈ ΰΆ΄ΰ·ΰΆ ΰΆΰ·ΰΆΊΰ·ΰΆ½ΰ· ΰΆΰ·ΰΆ» ΰΆΰ·
ΰ·",
|
| 227 |
+
"prediction": "ΰΆΈΰΆΈ ΰΆ΄ΰ·ΰΆ ΰΆΰ·ΰΆΊΰ·ΰΆ½ΰ· ΰΆΰ·ΰΆ» ΰΆΰ·
ΰ·",
|
| 228 |
+
"exact_match": true,
|
| 229 |
+
"cer": 0.0,
|
| 230 |
+
"wer": 0.0,
|
| 231 |
+
"bleu": 1.0,
|
| 232 |
+
"token_accuracy": 1.0,
|
| 233 |
+
"code_mix_preservation": 1.0,
|
| 234 |
+
"time_s": 0.027
|
| 235 |
+
},
|
| 236 |
+
{
|
| 237 |
+
"id": 19,
|
| 238 |
+
"input": "prkku nisa api kalin giya",
|
| 239 |
+
"reference": "ΰΆ΄ΰΆ»ΰΆΰ·ΰΆΰ· ΰΆ±ΰ·ΰ·ΰ· ΰΆ
ΰΆ΄ΰ· ΰΆΰΆ½ΰ·ΰΆ±ΰ· ΰΆΰ·ΰΆΊΰ·",
|
| 240 |
+
"prediction": "ΰΆ΄ΰΆ»ΰΆΰ·ΰΆΰ· ΰΆ±ΰ·ΰ·ΰ· ΰΆ
ΰΆ΄ΰ· ΰΆΰΆ½ΰ·ΰΆ±ΰ· ΰΆΰ·ΰΆΊΰ·",
|
| 241 |
+
"exact_match": true,
|
| 242 |
+
"cer": 0.0,
|
| 243 |
+
"wer": 0.0,
|
| 244 |
+
"bleu": 1.0,
|
| 245 |
+
"token_accuracy": 1.0,
|
| 246 |
+
"code_mix_preservation": 1.0,
|
| 247 |
+
"time_s": 0.019
|
| 248 |
+
},
|
| 249 |
+
{
|
| 250 |
+
"id": 20,
|
| 251 |
+
"input": "prashnaya hondai wage penenawa",
|
| 252 |
+
"reference": "ΰΆ΄ΰ·βΰΆ»ΰ·ΰ·ΰΆ±ΰΆΊ ΰ·ΰ·ΰΆ³ΰΆΊΰ· ΰ·ΰΆΰ· ΰΆ΄ΰ·ΰΆ±ΰ·ΰ·",
|
| 253 |
+
"prediction": "ΰΆ΄ΰ·βΰΆ»ΰ·ΰ·ΰΆ±ΰΆΊ ΰ·ΰ·ΰΆ³ΰΆΊΰ· ΰ·ΰΆΰ· ΰΆ΄ΰ·ΰΆ±ΰ·ΰ·",
|
| 254 |
+
"exact_match": true,
|
| 255 |
+
"cer": 0.0,
|
| 256 |
+
"wer": 0.0,
|
| 257 |
+
"bleu": 1.0,
|
| 258 |
+
"token_accuracy": 1.0,
|
| 259 |
+
"code_mix_preservation": 1.0,
|
| 260 |
+
"time_s": 0.046
|
| 261 |
+
},
|
| 262 |
+
{
|
| 263 |
+
"id": 21,
|
| 264 |
+
"input": "deployments nisa site down wuna",
|
| 265 |
+
"reference": "deployments ΰΆ±ΰ·ΰ·ΰ· site down ΰΆΰΆ±ΰ·",
|
| 266 |
+
"prediction": "deployments ΰΆ±ΰ·ΰ·ΰ· site down ΰΆΰΆ±ΰ·",
|
| 267 |
+
"exact_match": true,
|
| 268 |
+
"cer": 0.0,
|
| 269 |
+
"wer": 0.0,
|
| 270 |
+
"bleu": 1.0,
|
| 271 |
+
"token_accuracy": 1.0,
|
| 272 |
+
"code_mix_preservation": 1.0,
|
| 273 |
+
"time_s": 0.002
|
| 274 |
+
},
|
| 275 |
+
{
|
| 276 |
+
"id": 22,
|
| 277 |
+
"input": "PR eka merge karanna one",
|
| 278 |
+
"reference": "PR ΰΆΰΆ merge ΰΆΰΆ»ΰΆ±ΰ·ΰΆ± ΰΆΰΆ±ΰ·",
|
| 279 |
+
"prediction": "PR ΰΆΰΆ merge ΰΆΰΆ»ΰΆ±ΰ·ΰΆ± ΰΆΰΆ±ΰ·",
|
| 280 |
+
"exact_match": true,
|
| 281 |
+
"cer": 0.0,
|
| 282 |
+
"wer": 0.0,
|
| 283 |
+
"bleu": 1.0,
|
| 284 |
+
"token_accuracy": 1.0,
|
| 285 |
+
"code_mix_preservation": 1.0,
|
| 286 |
+
"time_s": 0.023
|
| 287 |
+
},
|
| 288 |
+
{
|
| 289 |
+
"id": 23,
|
| 290 |
+
"input": "backlog eka update kala",
|
| 291 |
+
"reference": "backlog ΰΆΰΆ update ΰΆΰ·
ΰ·",
|
| 292 |
+
"prediction": "backlog ΰΆΰΆ update ΰΆΰ·
ΰ·",
|
| 293 |
+
"exact_match": true,
|
| 294 |
+
"cer": 0.0,
|
| 295 |
+
"wer": 0.0,
|
| 296 |
+
"bleu": 1.0,
|
| 297 |
+
"token_accuracy": 1.0,
|
| 298 |
+
"code_mix_preservation": 1.0,
|
| 299 |
+
"time_s": 0.019
|
| 300 |
+
},
|
| 301 |
+
{
|
| 302 |
+
"id": 24,
|
| 303 |
+
"input": "server down nisa work karanna ba",
|
| 304 |
+
"reference": "server down ΰΆ±ΰ·ΰ·ΰ· work ΰΆΰΆ»ΰΆ±ΰ·ΰΆ± ΰΆΆΰ·",
|
| 305 |
+
"prediction": "server down ΰΆ±ΰ·ΰ·ΰ· work ΰΆΰΆ»ΰΆ±ΰ·ΰΆ± ΰΆΆΰ·",
|
| 306 |
+
"exact_match": true,
|
| 307 |
+
"cer": 0.0,
|
| 308 |
+
"wer": 0.0,
|
| 309 |
+
"bleu": 1.0,
|
| 310 |
+
"token_accuracy": 1.0,
|
| 311 |
+
"code_mix_preservation": 1.0,
|
| 312 |
+
"time_s": 0.002
|
| 313 |
+
},
|
| 314 |
+
{
|
| 315 |
+
"id": 25,
|
| 316 |
+
"input": "meeting eka tomorrow damu",
|
| 317 |
+
"reference": "meeting ΰΆΰΆ tomorrow ΰΆ―ΰ·ΰΆΈΰ·",
|
| 318 |
+
"prediction": "meeting ΰΆΰΆ tomorrow ΰΆ―ΰ·ΰΆΈΰ·",
|
| 319 |
+
"exact_match": true,
|
| 320 |
+
"cer": 0.0,
|
| 321 |
+
"wer": 0.0,
|
| 322 |
+
"bleu": 1.0,
|
| 323 |
+
"token_accuracy": 1.0,
|
| 324 |
+
"code_mix_preservation": 1.0,
|
| 325 |
+
"time_s": 0.022
|
| 326 |
+
},
|
| 327 |
+
{
|
| 328 |
+
"id": 26,
|
| 329 |
+
"input": "feedback nisa redo karanna una",
|
| 330 |
+
"reference": "feedback ΰΆ±ΰ·ΰ·ΰ· redo ΰΆΰΆ»ΰΆ±ΰ·ΰΆ± ΰΆΰΆ±ΰ·",
|
| 331 |
+
"prediction": "feedback ΰΆ±ΰ·ΰ·ΰ· redo ΰΆΰΆ»ΰΆ±ΰ·ΰΆ± ΰΆΰΆ±ΰ·",
|
| 332 |
+
"exact_match": true,
|
| 333 |
+
"cer": 0.0,
|
| 334 |
+
"wer": 0.0,
|
| 335 |
+
"bleu": 1.0,
|
| 336 |
+
"token_accuracy": 1.0,
|
| 337 |
+
"code_mix_preservation": 1.0,
|
| 338 |
+
"time_s": 0.002
|
| 339 |
+
},
|
| 340 |
+
{
|
| 341 |
+
"id": 27,
|
| 342 |
+
"input": "ape wada ada iwara wenawa",
|
| 343 |
+
"reference": "ΰΆ
ΰΆ΄ΰ· ΰ·ΰ·ΰΆ© ΰΆ
ΰΆ― ΰΆΰ·ΰΆ» ΰ·ΰ·ΰΆ±ΰ·ΰ·",
|
| 344 |
+
"prediction": "ΰΆ
ΰΆ΄ΰ· ΰ·ΰ·ΰΆ© ΰΆ
ΰΆ― ΰΆΰ·ΰΆ» ΰ·ΰ·ΰΆ±ΰ·ΰ·",
|
| 345 |
+
"exact_match": true,
|
| 346 |
+
"cer": 0.0,
|
| 347 |
+
"wer": 0.0,
|
| 348 |
+
"bleu": 1.0,
|
| 349 |
+
"token_accuracy": 1.0,
|
| 350 |
+
"code_mix_preservation": 1.0,
|
| 351 |
+
"time_s": 0.002
|
| 352 |
+
},
|
| 353 |
+
{
|
| 354 |
+
"id": 28,
|
| 355 |
+
"input": "kalamanakaru hitpu nisa api katha kala",
|
| 356 |
+
"reference": "ΰΆΰΆ½ΰΆΈΰΆ±ΰ·ΰΆΰΆ»ΰ· ΰ·ΰ·ΰΆ§ΰΆ΄ΰ· ΰΆ±ΰ·ΰ·ΰ· ΰΆ
ΰΆ΄ΰ· ΰΆΰΆΰ· ΰΆΰ·
ΰ·",
|
| 357 |
+
"prediction": "ΰΆΰΆ½ΰΆΈΰΆ±ΰ·ΰΆΰΆ»ΰ· ΰ·ΰ·ΰΆ§ΰΆ΄ΰ· ΰΆ±ΰ·ΰ·ΰ· ΰΆ
ΰΆ΄ΰ· ΰΆΰΆΰ· ΰΆΰ·
ΰ·",
|
| 358 |
+
"exact_match": true,
|
| 359 |
+
"cer": 0.0,
|
| 360 |
+
"wer": 0.0,
|
| 361 |
+
"bleu": 1.0,
|
| 362 |
+
"token_accuracy": 1.0,
|
| 363 |
+
"code_mix_preservation": 1.0,
|
| 364 |
+
"time_s": 0.049
|
| 365 |
+
},
|
| 366 |
+
{
|
| 367 |
+
"id": 29,
|
| 368 |
+
"input": "me wada hondai wage penawa",
|
| 369 |
+
"reference": "ΰΆΈΰ· ΰ·ΰ·ΰΆ© ΰ·ΰ·ΰΆ³ΰΆΊΰ· ΰ·ΰΆΰ· ΰΆ΄ΰ·ΰΆ±ΰ·ΰ·",
|
| 370 |
+
"prediction": "ΰΆΈΰ· ΰ·ΰ·ΰΆ© ΰ·ΰ·ΰΆ³ΰΆΊΰ· ΰ·ΰΆΰ· ΰΆ΄ΰ·ΰΆ±ΰ·ΰ·",
|
| 371 |
+
"exact_match": true,
|
| 372 |
+
"cer": 0.0,
|
| 373 |
+
"wer": 0.0,
|
| 374 |
+
"bleu": 1.0,
|
| 375 |
+
"token_accuracy": 1.0,
|
| 376 |
+
"code_mix_preservation": 1.0,
|
| 377 |
+
"time_s": 0.02
|
| 378 |
+
},
|
| 379 |
+
{
|
| 380 |
+
"id": 30,
|
| 381 |
+
"input": "wada tika ada iwara karamu",
|
| 382 |
+
"reference": "ΰ·ΰ·ΰΆ© ΰΆ§ΰ·ΰΆ ΰΆ
ΰΆ― ΰΆΰ·ΰΆ» ΰΆΰΆ»ΰΆΈΰ·",
|
| 383 |
+
"prediction": "ΰ·ΰ·ΰΆ© ΰΆ§ΰ·ΰΆ ΰΆ
ΰΆ― ΰΆΰ·ΰΆ» ΰΆΰΆ»ΰΆΈΰ·",
|
| 384 |
+
"exact_match": true,
|
| 385 |
+
"cer": 0.0,
|
| 386 |
+
"wer": 0.0,
|
| 387 |
+
"bleu": 1.0,
|
| 388 |
+
"token_accuracy": 1.0,
|
| 389 |
+
"code_mix_preservation": 1.0,
|
| 390 |
+
"time_s": 0.016
|
| 391 |
+
},
|
| 392 |
+
{
|
| 393 |
+
"id": 31,
|
| 394 |
+
"input": "story eke poll ekak damma",
|
| 395 |
+
"reference": "story ΰΆΰΆΰ· poll ΰΆΰΆΰΆΰ· ΰΆ―ΰ·ΰΆΈΰ·ΰΆΈΰ·",
|
| 396 |
+
"prediction": "story ΰΆΰΆΰ· poll ΰΆΰΆΰΆΰ· ΰΆ―ΰ·ΰΆΈΰ·ΰΆΈΰ·",
|
| 397 |
+
"exact_match": true,
|
| 398 |
+
"cer": 0.0,
|
| 399 |
+
"wer": 0.0,
|
| 400 |
+
"bleu": 1.0,
|
| 401 |
+
"token_accuracy": 1.0,
|
| 402 |
+
"code_mix_preservation": 1.0,
|
| 403 |
+
"time_s": 0.024
|
| 404 |
+
},
|
| 405 |
+
{
|
| 406 |
+
"id": 32,
|
| 407 |
+
"input": "oyata DM ekak yawwa",
|
| 408 |
+
"reference": "ΰΆΰΆΊΰ·ΰΆ§ DM ΰΆΰΆΰΆΰ· ΰΆΊΰ·ΰ·ΰ·ΰ·ΰ·",
|
| 409 |
+
"prediction": "ΰΆΰΆΊΰ·ΰΆ§ DM ΰΆΰΆΰΆΰ· ΰΆΊΰ·ΰ·ΰ·ΰ·ΰ·",
|
| 410 |
+
"exact_match": true,
|
| 411 |
+
"cer": 0.0,
|
| 412 |
+
"wer": 0.0,
|
| 413 |
+
"bleu": 1.0,
|
| 414 |
+
"token_accuracy": 1.0,
|
| 415 |
+
"code_mix_preservation": 1.0,
|
| 416 |
+
"time_s": 0.024
|
| 417 |
+
},
|
| 418 |
+
{
|
| 419 |
+
"id": 33,
|
| 420 |
+
"input": "comment eka delete kala nisa mama danne na",
|
| 421 |
+
"reference": "comment ΰΆΰΆ delete ΰΆΰ·
ΰ· ΰΆ±ΰ·ΰ·ΰ· ΰΆΈΰΆΈ ΰΆ―ΰΆ±ΰ·ΰΆ±ΰ· ΰΆ±ΰ·",
|
| 422 |
+
"prediction": "comment ΰΆΰΆ delete ΰΆΰ·
ΰ· ΰΆ±ΰ·ΰ·ΰ· ΰΆΈΰΆΈ ΰΆ―ΰΆ±ΰ·ΰΆ±ΰ· ΰΆ±ΰ·",
|
| 423 |
+
"exact_match": true,
|
| 424 |
+
"cer": 0.0,
|
| 425 |
+
"wer": 0.0,
|
| 426 |
+
"bleu": 1.0,
|
| 427 |
+
"token_accuracy": 1.0,
|
| 428 |
+
"code_mix_preservation": 1.0,
|
| 429 |
+
"time_s": 0.028
|
| 430 |
+
},
|
| 431 |
+
{
|
| 432 |
+
"id": 34,
|
| 433 |
+
"input": "selfie ekak gannako",
|
| 434 |
+
"reference": "selfie ΰΆΰΆΰΆΰ· ΰΆΰΆ±ΰ·ΰΆ±ΰΆΰ·",
|
| 435 |
+
"prediction": "selfie ΰΆΰΆΰΆΰ· ΰΆΰΆ±ΰ·ΰΆ±ΰΆΰ·",
|
| 436 |
+
"exact_match": true,
|
| 437 |
+
"cer": 0.0,
|
| 438 |
+
"wer": 0.0,
|
| 439 |
+
"bleu": 1.0,
|
| 440 |
+
"token_accuracy": 1.0,
|
| 441 |
+
"code_mix_preservation": 1.0,
|
| 442 |
+
"time_s": 0.023
|
| 443 |
+
},
|
| 444 |
+
{
|
| 445 |
+
"id": 35,
|
| 446 |
+
"input": "post eka private nisa share karanna epa",
|
| 447 |
+
"reference": "post ΰΆΰΆ private ΰΆ±ΰ·ΰ·ΰ· share ΰΆΰΆ»ΰΆ±ΰ·ΰΆ± ΰΆΰΆ΄ΰ·",
|
| 448 |
+
"prediction": "post ΰΆΰΆ private ΰΆ±ΰ·ΰ·ΰ· share ΰΆΰΆ»ΰΆ±ΰ·ΰΆ± ΰΆΰΆ΄ΰ·",
|
| 449 |
+
"exact_match": true,
|
| 450 |
+
"cer": 0.0,
|
| 451 |
+
"wer": 0.0,
|
| 452 |
+
"bleu": 1.0,
|
| 453 |
+
"token_accuracy": 1.0,
|
| 454 |
+
"code_mix_preservation": 1.0,
|
| 455 |
+
"time_s": 0.027
|
| 456 |
+
},
|
| 457 |
+
{
|
| 458 |
+
"id": 36,
|
| 459 |
+
"input": "oyta message krnna one",
|
| 460 |
+
"reference": "ΰΆΰΆΊΰ·ΰΆ§ message ΰΆΰΆ»ΰΆ±ΰ·ΰΆ± ΰΆΰΆ±ΰ·",
|
| 461 |
+
"prediction": "ΰΆΰΆΊΰ·ΰΆ§ message ΰΆΰΆ»ΰΆ±ΰ·ΰΆ± ΰΆΰΆ±ΰ·",
|
| 462 |
+
"exact_match": true,
|
| 463 |
+
"cer": 0.0,
|
| 464 |
+
"wer": 0.0,
|
| 465 |
+
"bleu": 1.0,
|
| 466 |
+
"token_accuracy": 1.0,
|
| 467 |
+
"code_mix_preservation": 1.0,
|
| 468 |
+
"time_s": 0.002
|
| 469 |
+
},
|
| 470 |
+
{
|
| 471 |
+
"id": 37,
|
| 472 |
+
"input": "api passe katha karamu",
|
| 473 |
+
"reference": "ΰΆ
ΰΆ΄ΰ· ΰΆ΄ΰ·ΰ·ΰ·ΰ· ΰΆΰΆΰ· ΰΆΰΆ»ΰΆΈΰ·",
|
| 474 |
+
"prediction": "ΰΆ
ΰΆ΄ΰ· ΰΆ΄ΰ·ΰ·ΰ·ΰ· ΰΆΰΆΰ· ΰΆΰΆ»ΰΆΈΰ·",
|
| 475 |
+
"exact_match": true,
|
| 476 |
+
"cer": 0.0,
|
| 477 |
+
"wer": 0.0,
|
| 478 |
+
"bleu": 1.0,
|
| 479 |
+
"token_accuracy": 1.0,
|
| 480 |
+
"code_mix_preservation": 1.0,
|
| 481 |
+
"time_s": 0.002
|
| 482 |
+
},
|
| 483 |
+
{
|
| 484 |
+
"id": 38,
|
| 485 |
+
"input": "eya laga pinthurayk thiyanawa",
|
| 486 |
+
"reference": "ΰΆΰΆΊΰ· ΰ·
ΰΆ ΰΆ΄ΰ·ΰΆ±ΰ·ΰΆΰ·ΰΆ»ΰΆΊΰΆΰ· ΰΆΰ·ΰΆΊΰ·ΰΆ±ΰ·ΰ·",
|
| 487 |
+
"prediction": "ΰΆΰΆΊΰ· ΰ·
ΰΆ ΰΆ΄ΰ·ΰΆ±ΰ·ΰΆΰ·ΰΆ»ΰΆΊΰΆΰ· ΰΆΰ·ΰΆΊΰ·ΰΆ±ΰ·ΰ·",
|
| 488 |
+
"exact_match": true,
|
| 489 |
+
"cer": 0.0,
|
| 490 |
+
"wer": 0.0,
|
| 491 |
+
"bleu": 1.0,
|
| 492 |
+
"token_accuracy": 1.0,
|
| 493 |
+
"code_mix_preservation": 1.0,
|
| 494 |
+
"time_s": 0.023
|
| 495 |
+
},
|
| 496 |
+
{
|
| 497 |
+
"id": 39,
|
| 498 |
+
"input": "oya awa wage mata hithenawa",
|
| 499 |
+
"reference": "ΰΆΰΆΊΰ· ΰΆΰ·ΰ· ΰ·ΰΆΰ· ΰΆΈΰΆ§ ΰ·ΰ·ΰΆΰ·ΰΆ±ΰ·ΰ·",
|
| 500 |
+
"prediction": "ΰΆΰΆΊΰ· ΰΆΰ·ΰ· ΰ·ΰΆΰ· ΰΆΈΰΆ§ ΰ·ΰ·ΰΆΰ·ΰΆ±ΰ·ΰ·",
|
| 501 |
+
"exact_match": true,
|
| 502 |
+
"cer": 0.0,
|
| 503 |
+
"wer": 0.0,
|
| 504 |
+
"bleu": 1.0,
|
| 505 |
+
"token_accuracy": 1.0,
|
| 506 |
+
"code_mix_preservation": 1.0,
|
| 507 |
+
"time_s": 0.002
|
| 508 |
+
},
|
| 509 |
+
{
|
| 510 |
+
"id": 40,
|
| 511 |
+
"input": "api passe hambawemu",
|
| 512 |
+
"reference": "ΰΆ
ΰΆ΄ΰ· ΰΆ΄ΰ·ΰ·ΰ·ΰ· ΰ·ΰΆΈΰ·ΰΆΆΰ·ΰ·ΰΆΈΰ·",
|
| 513 |
+
"prediction": "ΰΆ
ΰΆ΄ΰ· ΰΆ΄ΰ·ΰ·ΰ·ΰ· ΰ·ΰΆΈΰ·ΰΆΆΰ·ΰ·ΰΆΈΰ·",
|
| 514 |
+
"exact_match": true,
|
| 515 |
+
"cer": 0.0,
|
| 516 |
+
"wer": 0.0,
|
| 517 |
+
"bleu": 1.0,
|
| 518 |
+
"token_accuracy": 1.0,
|
| 519 |
+
"code_mix_preservation": 1.0,
|
| 520 |
+
"time_s": 0.015
|
| 521 |
+
},
|
| 522 |
+
{
|
| 523 |
+
"id": 41,
|
| 524 |
+
"input": "phone eka charge karanna one",
|
| 525 |
+
"reference": "phone ΰΆΰΆ charge ΰΆΰΆ»ΰΆ±ΰ·ΰΆ± ΰΆΰΆ±ΰ·",
|
| 526 |
+
"prediction": "phone ΰΆΰΆ charge ΰΆΰΆ»ΰΆ±ΰ·ΰΆ± ΰΆΰΆ±ΰ·",
|
| 527 |
+
"exact_match": true,
|
| 528 |
+
"cer": 0.0,
|
| 529 |
+
"wer": 0.0,
|
| 530 |
+
"bleu": 1.0,
|
| 531 |
+
"token_accuracy": 1.0,
|
| 532 |
+
"code_mix_preservation": 1.0,
|
| 533 |
+
"time_s": 0.022
|
| 534 |
+
},
|
| 535 |
+
{
|
| 536 |
+
"id": 42,
|
| 537 |
+
"input": "bus eka late una",
|
| 538 |
+
"reference": "bus ΰΆΰΆ late ΰΆΰΆ±ΰ·",
|
| 539 |
+
"prediction": "bus ΰΆΰΆ late ΰΆΰΆ±ΰ·",
|
| 540 |
+
"exact_match": true,
|
| 541 |
+
"cer": 0.0,
|
| 542 |
+
"wer": 0.0,
|
| 543 |
+
"bleu": 1.0,
|
| 544 |
+
"token_accuracy": 1.0,
|
| 545 |
+
"code_mix_preservation": 1.0,
|
| 546 |
+
"time_s": 0.018
|
| 547 |
+
},
|
| 548 |
+
{
|
| 549 |
+
"id": 43,
|
| 550 |
+
"input": "mama online inne",
|
| 551 |
+
"reference": "ΰΆΈΰΆΈ online ΰΆΰΆ±ΰ·ΰΆ±ΰ·",
|
| 552 |
+
"prediction": "ΰΆΈΰΆΈ online ΰΆΰΆ±ΰ·ΰΆ±ΰ·",
|
| 553 |
+
"exact_match": true,
|
| 554 |
+
"cer": 0.0,
|
| 555 |
+
"wer": 0.0,
|
| 556 |
+
"bleu": 1.0,
|
| 557 |
+
"token_accuracy": 1.0,
|
| 558 |
+
"code_mix_preservation": 1.0,
|
| 559 |
+
"time_s": 0.001
|
| 560 |
+
},
|
| 561 |
+
{
|
| 562 |
+
"id": 44,
|
| 563 |
+
"input": "time nathi nisa heta yamu",
|
| 564 |
+
"reference": "time ΰΆ±ΰ·ΰΆΰ· ΰΆ±ΰ·ΰ·ΰ· ΰ·ΰ·ΰΆ§ ΰΆΊΰΆΈΰ·",
|
| 565 |
+
"prediction": "time ΰΆ±ΰ·ΰΆΰ· ΰΆ±ΰ·ΰ·ΰ· ΰ·ΰ·ΰΆ§ ΰΆΊΰΆΈΰ·",
|
| 566 |
+
"exact_match": true,
|
| 567 |
+
"cer": 0.0,
|
| 568 |
+
"wer": 0.0,
|
| 569 |
+
"bleu": 1.0,
|
| 570 |
+
"token_accuracy": 1.0,
|
| 571 |
+
"code_mix_preservation": 1.0,
|
| 572 |
+
"time_s": 0.002
|
| 573 |
+
},
|
| 574 |
+
{
|
| 575 |
+
"id": 45,
|
| 576 |
+
"input": "oya call eka ganna",
|
| 577 |
+
"reference": "ΰΆΰΆΊΰ· call ΰΆΰΆ ΰΆΰΆ±ΰ·ΰΆ±",
|
| 578 |
+
"prediction": "ΰΆΰΆΊΰ· call ΰΆΰΆ ΰΆΰΆ±ΰ·ΰΆ±",
|
| 579 |
+
"exact_match": true,
|
| 580 |
+
"cer": 0.0,
|
| 581 |
+
"wer": 0.0,
|
| 582 |
+
"bleu": 1.0,
|
| 583 |
+
"token_accuracy": 1.0,
|
| 584 |
+
"code_mix_preservation": 1.0,
|
| 585 |
+
"time_s": 0.042
|
| 586 |
+
},
|
| 587 |
+
{
|
| 588 |
+
"id": 46,
|
| 589 |
+
"input": "api game yanawa heta",
|
| 590 |
+
"reference": "ΰΆ
ΰΆ΄ΰ· ΰΆΰΆΈΰ· ΰΆΊΰΆ±ΰ·ΰ· ΰ·ΰ·ΰΆ§",
|
| 591 |
+
"prediction": "ΰΆ
ΰΆ΄ΰ· ΰΆΰΆΈΰ· ΰΆΊΰΆ±ΰ·ΰ· ΰ·ΰ·ΰΆ§",
|
| 592 |
+
"exact_match": true,
|
| 593 |
+
"cer": 0.0,
|
| 594 |
+
"wer": 0.0,
|
| 595 |
+
"bleu": 1.0,
|
| 596 |
+
"token_accuracy": 1.0,
|
| 597 |
+
"code_mix_preservation": 1.0,
|
| 598 |
+
"time_s": 0.023
|
| 599 |
+
},
|
| 600 |
+
{
|
| 601 |
+
"id": 47,
|
| 602 |
+
"input": "man heta enne na",
|
| 603 |
+
"reference": "ΰΆΈΰΆ±ΰ· ΰ·ΰ·ΰΆ§ ΰΆΰΆ±ΰ·ΰΆ±ΰ· ΰΆ±ΰ·",
|
| 604 |
+
"prediction": "ΰΆΈΰ·ΰΆ±ΰ· ΰ·ΰ·ΰΆ§ ΰΆΰΆ±ΰ·ΰΆ±ΰ· ΰΆ±ΰ·",
|
| 605 |
+
"exact_match": false,
|
| 606 |
+
"cer": 0.0625,
|
| 607 |
+
"wer": 0.25,
|
| 608 |
+
"bleu": 0.0,
|
| 609 |
+
"token_accuracy": 0.75,
|
| 610 |
+
"code_mix_preservation": 1.0,
|
| 611 |
+
"time_s": 0.045
|
| 612 |
+
},
|
| 613 |
+
{
|
| 614 |
+
"id": 48,
|
| 615 |
+
"input": "eka hari lassanai",
|
| 616 |
+
"reference": "ΰΆΰΆ ΰ·ΰΆ»ΰ· ΰΆ½ΰ·ΰ·ΰ·ΰΆ±ΰΆΊΰ·",
|
| 617 |
+
"prediction": "ΰΆΰΆ ΰ·ΰΆ»ΰ· ΰΆ½ΰ·ΰ·ΰ·ΰΆ±ΰΆΊΰ·",
|
| 618 |
+
"exact_match": true,
|
| 619 |
+
"cer": 0.0,
|
| 620 |
+
"wer": 0.0,
|
| 621 |
+
"bleu": 1.0,
|
| 622 |
+
"token_accuracy": 1.0,
|
| 623 |
+
"code_mix_preservation": 1.0,
|
| 624 |
+
"time_s": 0.015
|
| 625 |
+
},
|
| 626 |
+
{
|
| 627 |
+
"id": 49,
|
| 628 |
+
"input": "oya kiwwa hari",
|
| 629 |
+
"reference": "ΰΆΰΆΊΰ· ΰΆΰ·ΰ·ΰ·ΰ·ΰ· ΰ·ΰΆ»ΰ·",
|
| 630 |
+
"prediction": "ΰΆΰΆΊΰ· ΰΆΰ·ΰ·ΰ·ΰ·ΰ· ΰ·ΰΆ»ΰ·",
|
| 631 |
+
"exact_match": true,
|
| 632 |
+
"cer": 0.0,
|
| 633 |
+
"wer": 0.0,
|
| 634 |
+
"bleu": 1.0,
|
| 635 |
+
"token_accuracy": 1.0,
|
| 636 |
+
"code_mix_preservation": 1.0,
|
| 637 |
+
"time_s": 0.001
|
| 638 |
+
},
|
| 639 |
+
{
|
| 640 |
+
"id": 50,
|
| 641 |
+
"input": "kalaya ithuru krganna one",
|
| 642 |
+
"reference": "ΰΆΰΆ½ΰΆΊ ΰΆΰΆΰ·ΰΆ»ΰ· ΰΆΰΆ»ΰΆΰΆ±ΰ·ΰΆ± ΰΆΰΆ±ΰ·",
|
| 643 |
+
"prediction": "ΰΆΰΆ½ΰΆΊ ΰΆΰΆΰ·ΰΆ»ΰ· ΰΆΰΆ»ΰΆΰΆ±ΰ·ΰΆ± ΰΆΰΆ±ΰ·",
|
| 644 |
+
"exact_match": true,
|
| 645 |
+
"cer": 0.0,
|
| 646 |
+
"wer": 0.0,
|
| 647 |
+
"bleu": 1.0,
|
| 648 |
+
"token_accuracy": 1.0,
|
| 649 |
+
"code_mix_preservation": 1.0,
|
| 650 |
+
"time_s": 0.046
|
| 651 |
+
},
|
| 652 |
+
{
|
| 653 |
+
"id": 51,
|
| 654 |
+
"input": "date eka fix karanna one",
|
| 655 |
+
"reference": "date ΰΆΰΆ fix ΰΆΰΆ»ΰΆ±ΰ·ΰΆ± ΰΆΰΆ±ΰ·",
|
| 656 |
+
"prediction": "date ΰΆΰΆ fix ΰΆΰΆ»ΰΆ±ΰ·ΰΆ± ΰΆΰΆ±ΰ·",
|
| 657 |
+
"exact_match": true,
|
| 658 |
+
"cer": 0.0,
|
| 659 |
+
"wer": 0.0,
|
| 660 |
+
"bleu": 1.0,
|
| 661 |
+
"token_accuracy": 1.0,
|
| 662 |
+
"code_mix_preservation": 1.0,
|
| 663 |
+
"time_s": 0.023
|
| 664 |
+
},
|
| 665 |
+
{
|
| 666 |
+
"id": 52,
|
| 667 |
+
"input": "rata yanna one",
|
| 668 |
+
"reference": "ΰΆ»ΰΆ§ ΰΆΊΰΆ±ΰ·ΰΆ± ΰΆΰΆ±ΰ·",
|
| 669 |
+
"prediction": "ΰΆ»ΰΆ§ ΰΆΊΰΆ±ΰ·ΰΆ± ΰΆΰΆ±ΰ·",
|
| 670 |
+
"exact_match": true,
|
| 671 |
+
"cer": 0.0,
|
| 672 |
+
"wer": 0.0,
|
| 673 |
+
"bleu": 1.0,
|
| 674 |
+
"token_accuracy": 1.0,
|
| 675 |
+
"code_mix_preservation": 1.0,
|
| 676 |
+
"time_s": 0.046
|
| 677 |
+
},
|
| 678 |
+
{
|
| 679 |
+
"id": 53,
|
| 680 |
+
"input": "game eke leaderboard eka balanna",
|
| 681 |
+
"reference": "game ΰΆΰΆΰ· leaderboard ΰΆΰΆ ΰΆΆΰΆ½ΰΆ±ΰ·ΰΆ±",
|
| 682 |
+
"prediction": "ΰΆΰΆΈΰ· ΰΆΰΆΰ· leaderboard ΰΆΰΆ ΰΆΆΰΆ½ΰΆ±ΰ·ΰΆ±",
|
| 683 |
+
"exact_match": false,
|
| 684 |
+
"cer": 0.1379,
|
| 685 |
+
"wer": 0.2,
|
| 686 |
+
"bleu": 0.6687,
|
| 687 |
+
"token_accuracy": 0.8,
|
| 688 |
+
"code_mix_preservation": 0.5,
|
| 689 |
+
"time_s": 0.072
|
| 690 |
+
},
|
| 691 |
+
{
|
| 692 |
+
"id": 54,
|
| 693 |
+
"input": "api thamai hodama",
|
| 694 |
+
"reference": "ΰΆ
ΰΆ΄ΰ· ΰΆΰΆΈΰΆΊΰ· ΰ·ΰ·ΰΆ³ΰΆΈ",
|
| 695 |
+
"prediction": "ΰΆ
ΰΆ΄ΰ· ΰΆΰΆΈΰΆΊΰ· ΰ·ΰ·ΰΆ³ΰΆΈ",
|
| 696 |
+
"exact_match": true,
|
| 697 |
+
"cer": 0.0,
|
| 698 |
+
"wer": 0.0,
|
| 699 |
+
"bleu": 1.0,
|
| 700 |
+
"token_accuracy": 1.0,
|
| 701 |
+
"code_mix_preservation": 1.0,
|
| 702 |
+
"time_s": 0.018
|
| 703 |
+
},
|
| 704 |
+
{
|
| 705 |
+
"id": 55,
|
| 706 |
+
"input": "mama heta udee enawa oya enakota message ekk dnna",
|
| 707 |
+
"reference": "ΰΆΈΰΆΈ ΰ·ΰ·ΰΆ§ ΰΆΰΆ―ΰ· ΰΆΰΆ±ΰ·ΰ· ΰΆΰΆΊΰ· ΰΆΰΆ±ΰΆΰ·ΰΆ§ message ΰΆΰΆΰΆΰ· ΰΆ―ΰ·ΰΆ±ΰ·ΰΆ±",
|
| 708 |
+
"prediction": "ΰΆΈΰΆΈ ΰ·ΰ·ΰΆ§ ΰΆΰΆ―ΰ· ΰΆΰΆ±ΰ·ΰ· ΰΆΰΆΊΰ· ΰΆΰΆ±ΰΆΰ·ΰΆ§ message ΰΆΰΆΰΆΰ· ΰΆ―ΰ·ΰΆ±ΰ·ΰΆ±",
|
| 709 |
+
"exact_match": true,
|
| 710 |
+
"cer": 0.0,
|
| 711 |
+
"wer": 0.0,
|
| 712 |
+
"bleu": 1.0,
|
| 713 |
+
"token_accuracy": 1.0,
|
| 714 |
+
"code_mix_preservation": 1.0,
|
| 715 |
+
"time_s": 0.061
|
| 716 |
+
},
|
| 717 |
+
{
|
| 718 |
+
"id": 56,
|
| 719 |
+
"input": "ape gedara langa thiyana kadeta yanna one",
|
| 720 |
+
"reference": "ΰΆ
ΰΆ΄ΰ· ΰΆΰ·ΰΆ―ΰΆ» ΰ·
ΰΆ ΰΆΰ·ΰΆΊΰ·ΰΆ± ΰΆΰΆ©ΰ·ΰΆ§ ΰΆΊΰΆ±ΰ·ΰΆ± ΰΆΰΆ±ΰ·",
|
| 721 |
+
"prediction": "ΰΆ
ΰΆ΄ΰ· ΰΆΰ·ΰΆ―ΰΆ» ΰ·
ΰΆ ΰΆΰ·ΰΆΊΰ·ΰΆ± ΰΆΰΆ©ΰ·ΰΆ§ ΰΆΊΰΆ±ΰ·ΰΆ± ΰΆΰΆ±ΰ·",
|
| 722 |
+
"exact_match": true,
|
| 723 |
+
"cer": 0.0,
|
| 724 |
+
"wer": 0.0,
|
| 725 |
+
"bleu": 1.0,
|
| 726 |
+
"token_accuracy": 1.0,
|
| 727 |
+
"code_mix_preservation": 1.0,
|
| 728 |
+
"time_s": 0.067
|
| 729 |
+
},
|
| 730 |
+
{
|
| 731 |
+
"id": 57,
|
| 732 |
+
"input": "mama assignment eka karala submit karanawa ada raa",
|
| 733 |
+
"reference": "ΰΆΈΰΆΈ assignment ΰΆΰΆ ΰΆΰΆ»ΰΆ½ΰ· submit ΰΆΰΆ»ΰΆ±ΰ·ΰ· ΰΆ
ΰΆ― ΰΆ»ΰ·",
|
| 734 |
+
"prediction": "ΰΆΈΰΆΈ assignment ΰΆΰΆ ΰΆΰΆ»ΰ·ΰΆ½ submit ΰΆΰΆ»ΰΆ±ΰ·ΰ· ΰΆ
ΰΆ― ΰΆ»ΰ·",
|
| 735 |
+
"exact_match": false,
|
| 736 |
+
"cer": 0.05,
|
| 737 |
+
"wer": 0.125,
|
| 738 |
+
"bleu": 0.5,
|
| 739 |
+
"token_accuracy": 0.875,
|
| 740 |
+
"code_mix_preservation": 1.0,
|
| 741 |
+
"time_s": 0.097
|
| 742 |
+
},
|
| 743 |
+
{
|
| 744 |
+
"id": 58,
|
| 745 |
+
"input": "oya enne naththe mokada kiyla mama danne na",
|
| 746 |
+
"reference": "ΰΆΰΆΊΰ· ΰΆΰΆ±ΰ·ΰΆ±ΰ· ΰΆ±ΰ·ΰΆΰ·ΰΆΰ· ΰΆΈΰ·ΰΆΰΆ― ΰΆΰ·ΰΆΊΰΆ½ΰ· ΰΆΈΰΆΈ ΰΆ―ΰΆ±ΰ·ΰΆ±ΰ· ΰΆ±ΰ·",
|
| 747 |
+
"prediction": "ΰΆΰΆΊΰ· ΰΆΰΆ±ΰ·ΰΆ±ΰ· ΰΆ±ΰ·ΰΆΰ·ΰΆΰ· ΰΆΈΰ·ΰΆΰΆ― ΰΆΰ·ΰΆΊΰΆ½ΰ· ΰΆΈΰΆΈ ΰΆ―ΰΆ±ΰ·ΰΆ±ΰ· ΰΆ±ΰ·",
|
| 748 |
+
"exact_match": true,
|
| 749 |
+
"cer": 0.0,
|
| 750 |
+
"wer": 0.0,
|
| 751 |
+
"bleu": 1.0,
|
| 752 |
+
"token_accuracy": 1.0,
|
| 753 |
+
"code_mix_preservation": 1.0,
|
| 754 |
+
"time_s": 0.045
|
| 755 |
+
},
|
| 756 |
+
{
|
| 757 |
+
"id": 59,
|
| 758 |
+
"input": "client ekka call karala feedback eka ahanna one",
|
| 759 |
+
"reference": "client ΰΆΰΆΰ·ΰΆ call ΰΆΰΆ»ΰΆ½ΰ· feedback ΰΆΰΆ ΰΆ
ΰ·ΰΆ±ΰ·ΰΆ± ΰΆΰΆ±ΰ·",
|
| 760 |
+
"prediction": "client ΰΆΰΆΰ·ΰΆ call ΰΆΰΆ»ΰΆ½ΰ· feedback ΰΆΰΆ ΰΆ
ΰ·ΰΆ±ΰ·ΰΆ± ΰΆΰΆ±ΰ·",
|
| 761 |
+
"exact_match": true,
|
| 762 |
+
"cer": 0.0,
|
| 763 |
+
"wer": 0.0,
|
| 764 |
+
"bleu": 1.0,
|
| 765 |
+
"token_accuracy": 1.0,
|
| 766 |
+
"code_mix_preservation": 1.0,
|
| 767 |
+
"time_s": 0.097
|
| 768 |
+
},
|
| 769 |
+
{
|
| 770 |
+
"id": 60,
|
| 771 |
+
"input": "mama gedara gihilla kewata passe call karannm",
|
| 772 |
+
"reference": "ΰΆΈΰΆΈ ΰΆΰ·ΰΆ―ΰΆ» ΰΆΰ·ΰ·ΰ·ΰΆ½ΰ·ΰΆ½ΰ· ΰΆΰ·ΰ·ΰΆ§ ΰΆ΄ΰ·ΰ·ΰ·ΰ· call ΰΆΰΆ»ΰΆ±ΰ·ΰΆ±ΰΆΈΰ·",
|
| 773 |
+
"prediction": "ΰΆΈΰΆΈ ΰΆΰ·ΰΆ―ΰΆ» ΰΆΰ·ΰ·ΰ·ΰΆ½ΰ·ΰΆ½ΰ· ΰΆΰ·ΰ·ΰΆ§ ΰΆ΄ΰ·ΰ·ΰ·ΰ· call ΰΆΰΆ»ΰΆ±ΰ·ΰΆ±ΰΆΈΰ·",
|
| 774 |
+
"exact_match": true,
|
| 775 |
+
"cer": 0.0,
|
| 776 |
+
"wer": 0.0,
|
| 777 |
+
"bleu": 1.0,
|
| 778 |
+
"token_accuracy": 1.0,
|
| 779 |
+
"code_mix_preservation": 1.0,
|
| 780 |
+
"time_s": 0.03
|
| 781 |
+
},
|
| 782 |
+
{
|
| 783 |
+
"id": 61,
|
| 784 |
+
"input": "laptop eke software update karanna one",
|
| 785 |
+
"reference": "laptop ΰΆΰΆΰ· software update ΰΆΰΆ»ΰΆ±ΰ·ΰΆ± ΰΆΰΆ±ΰ·",
|
| 786 |
+
"prediction": "laptop ΰΆΰΆΰ· software update ΰΆΰΆ»ΰΆ±ΰ·ΰΆ± ΰΆΰΆ±ΰ·",
|
| 787 |
+
"exact_match": true,
|
| 788 |
+
"cer": 0.0,
|
| 789 |
+
"wer": 0.0,
|
| 790 |
+
"bleu": 1.0,
|
| 791 |
+
"token_accuracy": 1.0,
|
| 792 |
+
"code_mix_preservation": 1.0,
|
| 793 |
+
"time_s": 0.002
|
| 794 |
+
},
|
| 795 |
+
{
|
| 796 |
+
"id": 62,
|
| 797 |
+
"input": "office eke wifi password eka mokakda",
|
| 798 |
+
"reference": "office ΰΆΰΆΰ· wifi password ΰΆΰΆ ΰΆΈΰ·ΰΆΰΆΰ·ΰΆ―",
|
| 799 |
+
"prediction": "office ΰΆΰΆΰ· wifi password ΰΆΰΆ ΰΆΈΰ·ΰΆΰΆΰ·ΰΆ―",
|
| 800 |
+
"exact_match": true,
|
| 801 |
+
"cer": 0.0,
|
| 802 |
+
"wer": 0.0,
|
| 803 |
+
"bleu": 1.0,
|
| 804 |
+
"token_accuracy": 1.0,
|
| 805 |
+
"code_mix_preservation": 1.0,
|
| 806 |
+
"time_s": 0.037
|
| 807 |
+
},
|
| 808 |
+
{
|
| 809 |
+
"id": 63,
|
| 810 |
+
"input": "online order eka track karanna ba",
|
| 811 |
+
"reference": "online order ΰΆΰΆ track ΰΆΰΆ»ΰΆ±ΰ·ΰΆ± ΰΆΆΰ·",
|
| 812 |
+
"prediction": "online order ΰΆΰΆ track ΰΆΰΆ»ΰΆ±ΰ·ΰΆ± ΰΆΆΰ·",
|
| 813 |
+
"exact_match": true,
|
| 814 |
+
"cer": 0.0,
|
| 815 |
+
"wer": 0.0,
|
| 816 |
+
"bleu": 1.0,
|
| 817 |
+
"token_accuracy": 1.0,
|
| 818 |
+
"code_mix_preservation": 1.0,
|
| 819 |
+
"time_s": 0.023
|
| 820 |
+
},
|
| 821 |
+
{
|
| 822 |
+
"id": 64,
|
| 823 |
+
"input": "email eke attachment eka download karanna",
|
| 824 |
+
"reference": "email ΰΆΰΆΰ· attachment ΰΆΰΆ download ΰΆΰΆ»ΰΆ±ΰ·ΰΆ±",
|
| 825 |
+
"prediction": "email ΰΆΰΆΰ· attachment ΰΆΰΆ download ΰΆΰΆ»ΰΆ±ΰ·ΰΆ±",
|
| 826 |
+
"exact_match": true,
|
| 827 |
+
"cer": 0.0,
|
| 828 |
+
"wer": 0.0,
|
| 829 |
+
"bleu": 1.0,
|
| 830 |
+
"token_accuracy": 1.0,
|
| 831 |
+
"code_mix_preservation": 1.0,
|
| 832 |
+
"time_s": 0.023
|
| 833 |
+
},
|
| 834 |
+
{
|
| 835 |
+
"id": 65,
|
| 836 |
+
"input": "Instagram story eke filter eka hadanna",
|
| 837 |
+
"reference": "Instagram story ΰΆΰΆΰ· filter ΰΆΰΆ ΰ·ΰΆ―ΰΆ±ΰ·ΰΆ±",
|
| 838 |
+
"prediction": "Instagram story ΰΆΰΆΰ· filter ΰΆΰΆ ΰ·ΰΆ―ΰΆ±ΰ·ΰΆ±",
|
| 839 |
+
"exact_match": true,
|
| 840 |
+
"cer": 0.0,
|
| 841 |
+
"wer": 0.0,
|
| 842 |
+
"bleu": 1.0,
|
| 843 |
+
"token_accuracy": 1.0,
|
| 844 |
+
"code_mix_preservation": 1.0,
|
| 845 |
+
"time_s": 0.023
|
| 846 |
+
},
|
| 847 |
+
{
|
| 848 |
+
"id": 66,
|
| 849 |
+
"input": "oyge wada iwra krd",
|
| 850 |
+
"reference": "ΰΆΰΆΊΰ·ΰΆΰ· ΰ·ΰ·ΰΆ© ΰΆΰ·ΰΆ» ΰΆΰΆ»ΰ·ΰΆ―",
|
| 851 |
+
"prediction": "ΰΆΰΆΊΰ·ΰΆΰ· ΰ·ΰ·ΰΆ© ΰΆΰ·ΰΆ» ΰΆΰΆ»ΰ·ΰΆ―",
|
| 852 |
+
"exact_match": true,
|
| 853 |
+
"cer": 0.0,
|
| 854 |
+
"wer": 0.0,
|
| 855 |
+
"bleu": 1.0,
|
| 856 |
+
"token_accuracy": 1.0,
|
| 857 |
+
"code_mix_preservation": 1.0,
|
| 858 |
+
"time_s": 0.002
|
| 859 |
+
},
|
| 860 |
+
{
|
| 861 |
+
"id": 67,
|
| 862 |
+
"input": "mge phone ek hack una",
|
| 863 |
+
"reference": "ΰΆΈΰΆΰ· phone ΰΆΰΆ hack ΰΆΰΆ±ΰ·",
|
| 864 |
+
"prediction": "ΰΆΈΰΆΰ· phone ΰΆΰΆ hack ΰΆΰΆ±ΰ·",
|
| 865 |
+
"exact_match": true,
|
| 866 |
+
"cer": 0.0,
|
| 867 |
+
"wer": 0.0,
|
| 868 |
+
"bleu": 1.0,
|
| 869 |
+
"token_accuracy": 1.0,
|
| 870 |
+
"code_mix_preservation": 1.0,
|
| 871 |
+
"time_s": 0.002
|
| 872 |
+
},
|
| 873 |
+
{
|
| 874 |
+
"id": 68,
|
| 875 |
+
"input": "handawata ynna wenwa",
|
| 876 |
+
"reference": "ΰ·ΰ·ΰΆ±ΰ·ΰΆ―ΰ·ΰ·ΰΆ§ ΰΆΊΰΆ±ΰ·ΰΆ± ΰ·ΰ·ΰΆ±ΰ·ΰ·",
|
| 877 |
+
"prediction": "ΰ·ΰ·ΰΆ±ΰ·ΰΆ―ΰ·ΰ·ΰΆ§ ΰΆΊΰΆ±ΰ·ΰΆ± ΰ·ΰ·ΰΆ±ΰ·ΰ·",
|
| 878 |
+
"exact_match": true,
|
| 879 |
+
"cer": 0.0,
|
| 880 |
+
"wer": 0.0,
|
| 881 |
+
"bleu": 1.0,
|
| 882 |
+
"token_accuracy": 1.0,
|
| 883 |
+
"code_mix_preservation": 1.0,
|
| 884 |
+
"time_s": 0.026
|
| 885 |
+
},
|
| 886 |
+
{
|
| 887 |
+
"id": 69,
|
| 888 |
+
"input": "prashnya krnna oni",
|
| 889 |
+
"reference": "ΰΆ΄ΰ·βΰΆ»ΰ·ΰ·βΰΆ±ΰΆΊ ΰΆΰΆ»ΰΆ±ΰ·ΰΆ± ΰΆΰΆ±ΰ·",
|
| 890 |
+
"prediction": "ΰΆ΄ΰ·βΰΆ»ΰ·ΰ·βΰΆ±ΰΆΊ ΰΆΰΆ»ΰΆ±ΰ·ΰΆ± ΰΆΰΆ±ΰ·",
|
| 891 |
+
"exact_match": true,
|
| 892 |
+
"cer": 0.0,
|
| 893 |
+
"wer": 0.0,
|
| 894 |
+
"bleu": 1.0,
|
| 895 |
+
"token_accuracy": 1.0,
|
| 896 |
+
"code_mix_preservation": 1.0,
|
| 897 |
+
"time_s": 0.001
|
| 898 |
+
},
|
| 899 |
+
{
|
| 900 |
+
"id": 70,
|
| 901 |
+
"input": "apita gdra ynna oni",
|
| 902 |
+
"reference": "ΰΆ
ΰΆ΄ΰ·ΰΆ§ ΰΆΰ·ΰΆ―ΰΆ» ΰΆΊΰΆ±ΰ·ΰΆ± ΰΆΰΆ±ΰ·",
|
| 903 |
+
"prediction": "ΰΆ
ΰΆ΄ΰ·ΰΆ§ ΰΆΰ·ΰΆ―ΰΆ» ΰΆΊΰΆ±ΰ·ΰΆ± ΰΆΰΆ±ΰ·",
|
| 904 |
+
"exact_match": true,
|
| 905 |
+
"cer": 0.0,
|
| 906 |
+
"wer": 0.0,
|
| 907 |
+
"bleu": 1.0,
|
| 908 |
+
"token_accuracy": 1.0,
|
| 909 |
+
"code_mix_preservation": 1.0,
|
| 910 |
+
"time_s": 0.072
|
| 911 |
+
},
|
| 912 |
+
{
|
| 913 |
+
"id": 71,
|
| 914 |
+
"input": "mama oyata kiwwa",
|
| 915 |
+
"reference": "ΰΆΈΰΆΈ ΰΆΰΆΊΰ·ΰΆ§ ΰΆΰ·ΰ·ΰ·ΰ·ΰ·",
|
| 916 |
+
"prediction": "ΰΆΈΰΆΈ ΰΆΰΆΊΰ·ΰΆ§ ΰΆΰ·ΰ·ΰ·ΰ·ΰ·",
|
| 917 |
+
"exact_match": true,
|
| 918 |
+
"cer": 0.0,
|
| 919 |
+
"wer": 0.0,
|
| 920 |
+
"bleu": 1.0,
|
| 921 |
+
"token_accuracy": 1.0,
|
| 922 |
+
"code_mix_preservation": 1.0,
|
| 923 |
+
"time_s": 0.001
|
| 924 |
+
},
|
| 925 |
+
{
|
| 926 |
+
"id": 72,
|
| 927 |
+
"input": "oya hari hondai",
|
| 928 |
+
"reference": "ΰΆΰΆΊΰ· ΰ·ΰΆ»ΰ· ΰ·ΰ·ΰΆ³ΰΆΊΰ·",
|
| 929 |
+
"prediction": "ΰΆΰΆΊΰ· ΰ·ΰΆ»ΰ· ΰ·ΰ·ΰΆ³ΰΆΊΰ·",
|
| 930 |
+
"exact_match": true,
|
| 931 |
+
"cer": 0.0,
|
| 932 |
+
"wer": 0.0,
|
| 933 |
+
"bleu": 1.0,
|
| 934 |
+
"token_accuracy": 1.0,
|
| 935 |
+
"code_mix_preservation": 1.0,
|
| 936 |
+
"time_s": 0.015
|
| 937 |
+
},
|
| 938 |
+
{
|
| 939 |
+
"id": 73,
|
| 940 |
+
"input": "api heta yamu",
|
| 941 |
+
"reference": "ΰΆ
ΰΆ΄ΰ· ΰ·ΰ·ΰΆ§ ΰΆΊΰΆΈΰ·",
|
| 942 |
+
"prediction": "ΰΆ
ΰΆ΄ΰ· ΰ·ΰ·ΰΆ§ ΰΆΊΰΆΈΰ·",
|
| 943 |
+
"exact_match": true,
|
| 944 |
+
"cer": 0.0,
|
| 945 |
+
"wer": 0.0,
|
| 946 |
+
"bleu": 1.0,
|
| 947 |
+
"token_accuracy": 1.0,
|
| 948 |
+
"code_mix_preservation": 1.0,
|
| 949 |
+
"time_s": 0.001
|
| 950 |
+
},
|
| 951 |
+
{
|
| 952 |
+
"id": 74,
|
| 953 |
+
"input": "app eka crash wenawa phone eke",
|
| 954 |
+
"reference": "app ΰΆΰΆ crash ΰ·ΰ·ΰΆ±ΰ·ΰ· phone ΰΆΰΆΰ·",
|
| 955 |
+
"prediction": "app ΰΆΰΆ crash ΰ·ΰ·ΰΆ±ΰ·ΰ· phone ΰΆΰΆΰ·",
|
| 956 |
+
"exact_match": true,
|
| 957 |
+
"cer": 0.0,
|
| 958 |
+
"wer": 0.0,
|
| 959 |
+
"bleu": 1.0,
|
| 960 |
+
"token_accuracy": 1.0,
|
| 961 |
+
"code_mix_preservation": 1.0,
|
| 962 |
+
"time_s": 0.028
|
| 963 |
+
},
|
| 964 |
+
{
|
| 965 |
+
"id": 75,
|
| 966 |
+
"input": "code eka push karanna github ekata",
|
| 967 |
+
"reference": "code ΰΆΰΆ push ΰΆΰΆ»ΰΆ±ΰ·ΰΆ± github ΰΆΰΆΰΆ§",
|
| 968 |
+
"prediction": "code ΰΆΰΆ push ΰΆΰΆ»ΰΆ±ΰ·ΰΆ± github ΰΆΰΆΰΆ§",
|
| 969 |
+
"exact_match": true,
|
| 970 |
+
"cer": 0.0,
|
| 971 |
+
"wer": 0.0,
|
| 972 |
+
"bleu": 1.0,
|
| 973 |
+
"token_accuracy": 1.0,
|
| 974 |
+
"code_mix_preservation": 1.0,
|
| 975 |
+
"time_s": 0.048
|
| 976 |
+
},
|
| 977 |
+
{
|
| 978 |
+
"id": 76,
|
| 979 |
+
"input": "database eka slow nisa query eka optimize karanna one",
|
| 980 |
+
"reference": "database ΰΆΰΆ slow ΰΆ±ΰ·ΰ·ΰ· query ΰΆΰΆ optimize ΰΆΰΆ»ΰΆ±ΰ·ΰΆ± ΰΆΰΆ±ΰ·",
|
| 981 |
+
"prediction": "database ΰΆΰΆ slow ΰΆ±ΰ·ΰ·ΰ· query ΰΆΰΆ optimize ΰΆΰΆ»ΰΆ±ΰ·ΰΆ± ΰΆΰΆ±ΰ·",
|
| 982 |
+
"exact_match": true,
|
| 983 |
+
"cer": 0.0,
|
| 984 |
+
"wer": 0.0,
|
| 985 |
+
"bleu": 1.0,
|
| 986 |
+
"token_accuracy": 1.0,
|
| 987 |
+
"code_mix_preservation": 1.0,
|
| 988 |
+
"time_s": 0.053
|
| 989 |
+
},
|
| 990 |
+
{
|
| 991 |
+
"id": 77,
|
| 992 |
+
"input": "bug eka fix kala merge karanna",
|
| 993 |
+
"reference": "bug ΰΆΰΆ fix ΰΆΰ·
ΰ· merge ΰΆΰΆ»ΰΆ±ΰ·ΰΆ±",
|
| 994 |
+
"prediction": "bug ΰΆΰΆ fix ΰΆΰ·
ΰ· merge ΰΆΰΆ»ΰΆ±ΰ·ΰΆ±",
|
| 995 |
+
"exact_match": true,
|
| 996 |
+
"cer": 0.0,
|
| 997 |
+
"wer": 0.0,
|
| 998 |
+
"bleu": 1.0,
|
| 999 |
+
"token_accuracy": 1.0,
|
| 1000 |
+
"code_mix_preservation": 1.0,
|
| 1001 |
+
"time_s": 0.046
|
| 1002 |
+
},
|
| 1003 |
+
{
|
| 1004 |
+
"id": 78,
|
| 1005 |
+
"input": "internet eka slow wage thiyanawa",
|
| 1006 |
+
"reference": "internet ΰΆΰΆ slow ΰ·ΰΆΰ· ΰΆΰ·ΰΆΊΰ·ΰΆ±ΰ·ΰ·",
|
| 1007 |
+
"prediction": "internet ΰΆΰΆ slow ΰ·ΰΆΰ· ΰΆΰ·ΰΆΊΰ·ΰΆ±ΰ·ΰ·",
|
| 1008 |
+
"exact_match": true,
|
| 1009 |
+
"cer": 0.0,
|
| 1010 |
+
"wer": 0.0,
|
| 1011 |
+
"bleu": 1.0,
|
| 1012 |
+
"token_accuracy": 1.0,
|
| 1013 |
+
"code_mix_preservation": 1.0,
|
| 1014 |
+
"time_s": 0.023
|
| 1015 |
+
},
|
| 1016 |
+
{
|
| 1017 |
+
"id": 79,
|
| 1018 |
+
"input": "kema hodai ada",
|
| 1019 |
+
"reference": "ΰΆΰ·ΰΆΈ ΰ·ΰ·ΰΆ³ΰΆΊΰ· ΰΆ
ΰΆ―",
|
| 1020 |
+
"prediction": "ΰΆΰ·ΰΆΈ ΰ·ΰ·ΰΆ³ΰΆΊΰ· ΰΆ
ΰΆ―",
|
| 1021 |
+
"exact_match": true,
|
| 1022 |
+
"cer": 0.0,
|
| 1023 |
+
"wer": 0.0,
|
| 1024 |
+
"bleu": 1.0,
|
| 1025 |
+
"token_accuracy": 1.0,
|
| 1026 |
+
"code_mix_preservation": 1.0,
|
| 1027 |
+
"time_s": 0.001
|
| 1028 |
+
},
|
| 1029 |
+
{
|
| 1030 |
+
"id": 80,
|
| 1031 |
+
"input": "mama bus eke enawa",
|
| 1032 |
+
"reference": "ΰΆΈΰΆΈ bus ΰΆΰΆΰ· ΰΆΰΆ±ΰ·ΰ·",
|
| 1033 |
+
"prediction": "ΰΆΈΰΆΈ bus ΰΆΰΆΰ· ΰΆΰΆ±ΰ·ΰ·",
|
| 1034 |
+
"exact_match": true,
|
| 1035 |
+
"cer": 0.0,
|
| 1036 |
+
"wer": 0.0,
|
| 1037 |
+
"bleu": 1.0,
|
| 1038 |
+
"token_accuracy": 1.0,
|
| 1039 |
+
"code_mix_preservation": 1.0,
|
| 1040 |
+
"time_s": 0.002
|
| 1041 |
+
},
|
| 1042 |
+
{
|
| 1043 |
+
"id": 81,
|
| 1044 |
+
"input": "ganu depala ekka market giya",
|
| 1045 |
+
"reference": "ΰΆΰ·ΰΆ±ΰ· ΰΆ―ΰ·ΰΆ΄ΰΆ½ ΰΆΰΆΰ·ΰΆ market ΰΆΰ·ΰΆΊΰ·",
|
| 1046 |
+
"prediction": "ΰΆΰΆ«ΰ· ΰΆ―ΰ·ΰΆ΄ΰ·
ΰΆΰΆΰ·ΰΆΰ· market ΰΆΰ·ΰΆΊΰ·",
|
| 1047 |
+
"exact_match": false,
|
| 1048 |
+
"cer": 0.1538,
|
| 1049 |
+
"wer": 0.6,
|
| 1050 |
+
"bleu": 0.0,
|
| 1051 |
+
"token_accuracy": 0.4,
|
| 1052 |
+
"code_mix_preservation": 1.0,
|
| 1053 |
+
"time_s": 0.07
|
| 1054 |
+
},
|
| 1055 |
+
{
|
| 1056 |
+
"id": 82,
|
| 1057 |
+
"input": "watura bonna one",
|
| 1058 |
+
"reference": "ΰ·ΰΆΰ·ΰΆ» ΰΆΆΰ·ΰΆ±ΰ·ΰΆ± ΰΆΰΆ±ΰ·",
|
| 1059 |
+
"prediction": "ΰ·ΰΆΰ·ΰΆ» ΰΆΆΰ·ΰΆ±ΰ·ΰΆ± ΰΆΰΆ±ΰ·",
|
| 1060 |
+
"exact_match": true,
|
| 1061 |
+
"cer": 0.0,
|
| 1062 |
+
"wer": 0.0,
|
| 1063 |
+
"bleu": 1.0,
|
| 1064 |
+
"token_accuracy": 1.0,
|
| 1065 |
+
"code_mix_preservation": 1.0,
|
| 1066 |
+
"time_s": 0.03
|
| 1067 |
+
},
|
| 1068 |
+
{
|
| 1069 |
+
"id": 83,
|
| 1070 |
+
"input": "shop eke sugar nati nisa mama giye na",
|
| 1071 |
+
"reference": "shop ΰΆΰΆΰ· sugar ΰΆ±ΰ·ΰΆΰ· ΰΆ±ΰ·ΰ·ΰ· ΰΆΈΰΆΈ ΰΆΰ·ΰΆΊΰ· ΰΆ±ΰ·",
|
| 1072 |
+
"prediction": "shop ΰΆΰΆΰ· sugar ΰΆ±ΰ·ΰΆΰ· ΰΆ±ΰ·ΰ·ΰ· ΰΆΈΰΆΈ ΰΆΰ·ΰΆΊΰ· ΰΆ±ΰ·",
|
| 1073 |
+
"exact_match": true,
|
| 1074 |
+
"cer": 0.0,
|
| 1075 |
+
"wer": 0.0,
|
| 1076 |
+
"bleu": 1.0,
|
| 1077 |
+
"token_accuracy": 1.0,
|
| 1078 |
+
"code_mix_preservation": 1.0,
|
| 1079 |
+
"time_s": 0.003
|
| 1080 |
+
},
|
| 1081 |
+
{
|
| 1082 |
+
"id": 84,
|
| 1083 |
+
"input": "hri hari",
|
| 1084 |
+
"reference": "ΰ·ΰΆ»ΰ· ΰ·ΰΆ»ΰ·",
|
| 1085 |
+
"prediction": "ΰ·ΰΆ»ΰ· ΰ·ΰΆ»ΰ·",
|
| 1086 |
+
"exact_match": true,
|
| 1087 |
+
"cer": 0.0,
|
| 1088 |
+
"wer": 0.0,
|
| 1089 |
+
"bleu": 1.0,
|
| 1090 |
+
"token_accuracy": 1.0,
|
| 1091 |
+
"code_mix_preservation": 1.0,
|
| 1092 |
+
"time_s": 0.001
|
| 1093 |
+
},
|
| 1094 |
+
{
|
| 1095 |
+
"id": 85,
|
| 1096 |
+
"input": "mta ep",
|
| 1097 |
+
"reference": "ΰΆΈΰΆ§ ΰΆΰΆ΄ΰ·",
|
| 1098 |
+
"prediction": "ΰΆΈΰΆ§ ΰΆΰΆ΄ΰ·",
|
| 1099 |
+
"exact_match": true,
|
| 1100 |
+
"cer": 0.0,
|
| 1101 |
+
"wer": 0.0,
|
| 1102 |
+
"bleu": 1.0,
|
| 1103 |
+
"token_accuracy": 1.0,
|
| 1104 |
+
"code_mix_preservation": 1.0,
|
| 1105 |
+
"time_s": 0.001
|
| 1106 |
+
},
|
| 1107 |
+
{
|
| 1108 |
+
"id": 86,
|
| 1109 |
+
"input": "ok hari",
|
| 1110 |
+
"reference": "ok ΰ·ΰΆ»ΰ·",
|
| 1111 |
+
"prediction": "ok ΰ·ΰΆ»ΰ·",
|
| 1112 |
+
"exact_match": true,
|
| 1113 |
+
"cer": 0.0,
|
| 1114 |
+
"wer": 0.0,
|
| 1115 |
+
"bleu": 1.0,
|
| 1116 |
+
"token_accuracy": 1.0,
|
| 1117 |
+
"code_mix_preservation": 1.0,
|
| 1118 |
+
"time_s": 0.001
|
| 1119 |
+
},
|
| 1120 |
+
{
|
| 1121 |
+
"id": 87,
|
| 1122 |
+
"input": "ape game hari dewal wenne",
|
| 1123 |
+
"reference": "ΰΆ
ΰΆ΄ΰ· ΰΆΰΆΈΰ· ΰ·ΰΆ»ΰ· ΰΆ―ΰ·ΰ·ΰΆ½ΰ· ΰ·ΰ·ΰΆ±ΰ·ΰΆ±ΰ·",
|
| 1124 |
+
"prediction": "ΰΆ
ΰΆ΄ΰ· ΰΆΰΆΈΰ· ΰ·ΰΆ»ΰ· ΰΆ―ΰ·ΰ·ΰΆ½ΰ· ΰ·ΰ·ΰΆ±ΰ·ΰΆ±ΰ·",
|
| 1125 |
+
"exact_match": false,
|
| 1126 |
+
"cer": 0.0417,
|
| 1127 |
+
"wer": 0.2,
|
| 1128 |
+
"bleu": 0.6687,
|
| 1129 |
+
"token_accuracy": 0.8,
|
| 1130 |
+
"code_mix_preservation": 1.0,
|
| 1131 |
+
"time_s": 0.082
|
| 1132 |
+
},
|
| 1133 |
+
{
|
| 1134 |
+
"id": 88,
|
| 1135 |
+
"input": "mta dan one na",
|
| 1136 |
+
"reference": "ΰΆΈΰΆ§ ΰΆ―ΰ·ΰΆ±ΰ· ΰΆΰΆ±ΰ· ΰΆ±ΰ·",
|
| 1137 |
+
"prediction": "ΰΆΈΰΆ§ ΰΆ―ΰ·ΰΆ±ΰ· ΰΆΰΆ±ΰ· ΰΆ±ΰ·",
|
| 1138 |
+
"exact_match": true,
|
| 1139 |
+
"cer": 0.0,
|
| 1140 |
+
"wer": 0.0,
|
| 1141 |
+
"bleu": 1.0,
|
| 1142 |
+
"token_accuracy": 1.0,
|
| 1143 |
+
"code_mix_preservation": 1.0,
|
| 1144 |
+
"time_s": 0.002
|
| 1145 |
+
},
|
| 1146 |
+
{
|
| 1147 |
+
"id": 89,
|
| 1148 |
+
"input": "eka hari hondai wage dnuna nisa mama giya",
|
| 1149 |
+
"reference": "ΰΆΰΆ ΰ·ΰΆ»ΰ· ΰ·ΰ·ΰΆ³ΰΆΊΰ· ΰ·ΰΆΰ· ΰΆ―ΰ·ΰΆ±ΰ·ΰΆ±ΰ· ΰΆ±ΰ·ΰ·ΰ· ΰΆΈΰΆΈ ΰΆΰ·ΰΆΊΰ·",
|
| 1150 |
+
"prediction": "ΰΆΰΆ ΰ·ΰΆ»ΰ· ΰ·ΰ·ΰΆ³ΰΆΊΰ· ΰ·ΰΆΰ· ΰΆ―ΰ·ΰΆ±ΰ·ΰΆ±ΰ· ΰΆ±ΰ·ΰ·ΰ· ΰΆΈΰΆΈ ΰΆΰ·ΰΆΊΰ·",
|
| 1151 |
+
"exact_match": true,
|
| 1152 |
+
"cer": 0.0,
|
| 1153 |
+
"wer": 0.0,
|
| 1154 |
+
"bleu": 1.0,
|
| 1155 |
+
"token_accuracy": 1.0,
|
| 1156 |
+
"code_mix_preservation": 1.0,
|
| 1157 |
+
"time_s": 0.044
|
| 1158 |
+
},
|
| 1159 |
+
{
|
| 1160 |
+
"id": 90,
|
| 1161 |
+
"input": "game eke mission hari amarui",
|
| 1162 |
+
"reference": "game ΰΆΰΆΰ· mission ΰ·ΰΆ»ΰ· ΰΆ
ΰΆΈΰ·ΰΆ»ΰ·ΰΆΊΰ·",
|
| 1163 |
+
"prediction": "ΰΆΰΆΈΰ· ΰΆΰΆΰ· mission ΰ·ΰΆ»ΰ· ΰΆ
ΰΆΈΰ·ΰΆ»ΰ·ΰΆΊΰ·",
|
| 1164 |
+
"exact_match": false,
|
| 1165 |
+
"cer": 0.1429,
|
| 1166 |
+
"wer": 0.2,
|
| 1167 |
+
"bleu": 0.6687,
|
| 1168 |
+
"token_accuracy": 0.8,
|
| 1169 |
+
"code_mix_preservation": 0.5,
|
| 1170 |
+
"time_s": 0.029
|
| 1171 |
+
},
|
| 1172 |
+
{
|
| 1173 |
+
"id": 91,
|
| 1174 |
+
"input": "mama heta yanawa",
|
| 1175 |
+
"reference": "ΰΆΈΰΆΈ ΰ·ΰ·ΰΆ§ ΰΆΊΰΆ±ΰ·ΰ·",
|
| 1176 |
+
"prediction": "ΰΆΈΰΆΈ ΰ·ΰ·ΰΆ§ ΰΆΊΰΆ±ΰ·ΰ·",
|
| 1177 |
+
"exact_match": true,
|
| 1178 |
+
"cer": 0.0,
|
| 1179 |
+
"wer": 0.0,
|
| 1180 |
+
"bleu": 1.0,
|
| 1181 |
+
"token_accuracy": 1.0,
|
| 1182 |
+
"code_mix_preservation": 1.0,
|
| 1183 |
+
"time_s": 0.001
|
| 1184 |
+
},
|
| 1185 |
+
{
|
| 1186 |
+
"id": 92,
|
| 1187 |
+
"input": "ey iye aawa",
|
| 1188 |
+
"reference": "ΰΆΰΆΊΰ· ΰΆΰΆΊΰ· ΰΆΰ·ΰ·",
|
| 1189 |
+
"prediction": "ΰΆΰΆΊΰ· ΰΆΰΆΊΰ· ΰΆΰ·ΰ·",
|
| 1190 |
+
"exact_match": true,
|
| 1191 |
+
"cer": 0.0,
|
| 1192 |
+
"wer": 0.0,
|
| 1193 |
+
"bleu": 1.0,
|
| 1194 |
+
"token_accuracy": 1.0,
|
| 1195 |
+
"code_mix_preservation": 1.0,
|
| 1196 |
+
"time_s": 0.024
|
| 1197 |
+
},
|
| 1198 |
+
{
|
| 1199 |
+
"id": 93,
|
| 1200 |
+
"input": "api dan yanawa",
|
| 1201 |
+
"reference": "ΰΆ
ΰΆ΄ΰ· ΰΆ―ΰ·ΰΆ±ΰ· ΰΆΊΰΆ±ΰ·ΰ·",
|
| 1202 |
+
"prediction": "ΰΆ
ΰΆ΄ΰ· ΰΆ―ΰ·ΰΆ±ΰ· ΰΆΊΰΆ±ΰ·ΰ·",
|
| 1203 |
+
"exact_match": true,
|
| 1204 |
+
"cer": 0.0,
|
| 1205 |
+
"wer": 0.0,
|
| 1206 |
+
"bleu": 1.0,
|
| 1207 |
+
"token_accuracy": 1.0,
|
| 1208 |
+
"code_mix_preservation": 1.0,
|
| 1209 |
+
"time_s": 0.001
|
| 1210 |
+
},
|
| 1211 |
+
{
|
| 1212 |
+
"id": 94,
|
| 1213 |
+
"input": "video eka balanna one",
|
| 1214 |
+
"reference": "video ΰΆΰΆ ΰΆΆΰΆ½ΰΆ±ΰ·ΰΆ± ΰΆΰΆ±ΰ·",
|
| 1215 |
+
"prediction": "video ΰΆΰΆ ΰΆΆΰΆ½ΰΆ±ΰ·ΰΆ± ΰΆΰΆ±ΰ·",
|
| 1216 |
+
"exact_match": true,
|
| 1217 |
+
"cer": 0.0,
|
| 1218 |
+
"wer": 0.0,
|
| 1219 |
+
"bleu": 1.0,
|
| 1220 |
+
"token_accuracy": 1.0,
|
| 1221 |
+
"code_mix_preservation": 1.0,
|
| 1222 |
+
"time_s": 0.042
|
| 1223 |
+
},
|
| 1224 |
+
{
|
| 1225 |
+
"id": 95,
|
| 1226 |
+
"input": "video ekak hadamu",
|
| 1227 |
+
"reference": "video ΰΆΰΆΰΆΰ· ΰ·ΰΆ―ΰΆΈΰ·",
|
| 1228 |
+
"prediction": "video ΰΆΰΆΰΆΰ· ΰ·ΰΆ―ΰΆΈΰ·",
|
| 1229 |
+
"exact_match": true,
|
| 1230 |
+
"cer": 0.0,
|
| 1231 |
+
"wer": 0.0,
|
| 1232 |
+
"bleu": 1.0,
|
| 1233 |
+
"token_accuracy": 1.0,
|
| 1234 |
+
"code_mix_preservation": 1.0,
|
| 1235 |
+
"time_s": 0.023
|
| 1236 |
+
},
|
| 1237 |
+
{
|
| 1238 |
+
"id": 96,
|
| 1239 |
+
"input": "video eke comment eka balanna",
|
| 1240 |
+
"reference": "video ΰΆΰΆΰ· comment ΰΆΰΆ ΰΆΆΰΆ½ΰΆ±ΰ·ΰΆ±",
|
| 1241 |
+
"prediction": "video ΰΆΰΆΰ· comment ΰΆΰΆ ΰΆΆΰΆ½ΰΆ±ΰ·οΏ½οΏ½",
|
| 1242 |
+
"exact_match": true,
|
| 1243 |
+
"cer": 0.0,
|
| 1244 |
+
"wer": 0.0,
|
| 1245 |
+
"bleu": 1.0,
|
| 1246 |
+
"token_accuracy": 1.0,
|
| 1247 |
+
"code_mix_preservation": 1.0,
|
| 1248 |
+
"time_s": 0.041
|
| 1249 |
+
},
|
| 1250 |
+
{
|
| 1251 |
+
"id": 97,
|
| 1252 |
+
"input": "video ekata like ekak danna",
|
| 1253 |
+
"reference": "video ΰΆΰΆΰΆ§ like ΰΆΰΆΰΆΰ· ΰΆ―ΰ·ΰΆ±ΰ·ΰΆ±",
|
| 1254 |
+
"prediction": "video ΰΆΰΆΰΆ§ like ΰΆΰΆΰΆΰ· ΰΆ―ΰ·ΰΆ±ΰ·ΰΆ±",
|
| 1255 |
+
"exact_match": true,
|
| 1256 |
+
"cer": 0.0,
|
| 1257 |
+
"wer": 0.0,
|
| 1258 |
+
"bleu": 1.0,
|
| 1259 |
+
"token_accuracy": 1.0,
|
| 1260 |
+
"code_mix_preservation": 1.0,
|
| 1261 |
+
"time_s": 0.059
|
| 1262 |
+
},
|
| 1263 |
+
{
|
| 1264 |
+
"id": 98,
|
| 1265 |
+
"input": "lecture eka record karala share karanna",
|
| 1266 |
+
"reference": "lecture ΰΆΰΆ record ΰΆΰΆ»ΰΆ½ΰ· share ΰΆΰΆ»ΰΆ±ΰ·ΰΆ±",
|
| 1267 |
+
"prediction": "lecture ΰΆΰΆ record ΰΆΰΆ»ΰΆ½ΰ· share ΰΆΰΆ»ΰΆ±ΰ·ΰΆ±",
|
| 1268 |
+
"exact_match": true,
|
| 1269 |
+
"cer": 0.0,
|
| 1270 |
+
"wer": 0.0,
|
| 1271 |
+
"bleu": 1.0,
|
| 1272 |
+
"token_accuracy": 1.0,
|
| 1273 |
+
"code_mix_preservation": 1.0,
|
| 1274 |
+
"time_s": 0.046
|
| 1275 |
+
},
|
| 1276 |
+
{
|
| 1277 |
+
"id": 99,
|
| 1278 |
+
"input": "research paper eka liyanna one heta wge",
|
| 1279 |
+
"reference": "research paper ΰΆΰΆ ΰΆ½ΰ·ΰΆΊΰΆ±ΰ·ΰΆ± ΰΆΰΆ±ΰ· ΰ·ΰ·ΰΆ§ ΰ·ΰΆΰ·",
|
| 1280 |
+
"prediction": "research paper ΰΆΰΆ ΰΆ½ΰ·ΰΆΊΰΆ±ΰ·ΰΆ± ΰΆΰΆ±ΰ· ΰ·ΰ·ΰΆ§ ΰ·ΰΆΰ·",
|
| 1281 |
+
"exact_match": true,
|
| 1282 |
+
"cer": 0.0,
|
| 1283 |
+
"wer": 0.0,
|
| 1284 |
+
"bleu": 1.0,
|
| 1285 |
+
"token_accuracy": 1.0,
|
| 1286 |
+
"code_mix_preservation": 1.0,
|
| 1287 |
+
"time_s": 0.074
|
| 1288 |
+
},
|
| 1289 |
+
{
|
| 1290 |
+
"id": 100,
|
| 1291 |
+
"input": "exam eka hari amarui",
|
| 1292 |
+
"reference": "exam ΰΆΰΆ ΰ·ΰΆ»ΰ· ΰΆ
ΰΆΈΰ·ΰΆ»ΰ·ΰΆΊΰ·",
|
| 1293 |
+
"prediction": "exam ΰΆΰΆ ΰ·ΰΆ»ΰ· ΰΆ
ΰΆΈΰ·ΰΆ»ΰ·ΰΆΊΰ·",
|
| 1294 |
+
"exact_match": true,
|
| 1295 |
+
"cer": 0.0,
|
| 1296 |
+
"wer": 0.0,
|
| 1297 |
+
"bleu": 1.0,
|
| 1298 |
+
"token_accuracy": 1.0,
|
| 1299 |
+
"code_mix_preservation": 1.0,
|
| 1300 |
+
"time_s": 0.02
|
| 1301 |
+
},
|
| 1302 |
+
{
|
| 1303 |
+
"id": 101,
|
| 1304 |
+
"input": "sprint eka plan karamu Monday",
|
| 1305 |
+
"reference": "sprint ΰΆΰΆ plan ΰΆΰΆ»ΰΆΈΰ· Monday",
|
| 1306 |
+
"prediction": "sprint ΰΆΰΆ plan ΰΆΰΆ»ΰΆΈΰ· Monday",
|
| 1307 |
+
"exact_match": true,
|
| 1308 |
+
"cer": 0.0,
|
| 1309 |
+
"wer": 0.0,
|
| 1310 |
+
"bleu": 1.0,
|
| 1311 |
+
"token_accuracy": 1.0,
|
| 1312 |
+
"code_mix_preservation": 1.0,
|
| 1313 |
+
"time_s": 0.02
|
| 1314 |
+
},
|
| 1315 |
+
{
|
| 1316 |
+
"id": 102,
|
| 1317 |
+
"input": "ape team eka deadline ekata kala",
|
| 1318 |
+
"reference": "ΰΆ
ΰΆ΄ΰ· team ΰΆΰΆ deadline ΰΆΰΆΰΆ§ ΰΆΰ·
ΰ·",
|
| 1319 |
+
"prediction": "ΰΆ
ΰΆ΄ΰ· team ΰΆΰΆ deadline ΰΆΰΆΰΆ§ ΰΆΰ·
ΰ·",
|
| 1320 |
+
"exact_match": true,
|
| 1321 |
+
"cer": 0.0,
|
| 1322 |
+
"wer": 0.0,
|
| 1323 |
+
"bleu": 1.0,
|
| 1324 |
+
"token_accuracy": 1.0,
|
| 1325 |
+
"code_mix_preservation": 1.0,
|
| 1326 |
+
"time_s": 0.044
|
| 1327 |
+
},
|
| 1328 |
+
{
|
| 1329 |
+
"id": 103,
|
| 1330 |
+
"input": "standup eke mokada kiwwe",
|
| 1331 |
+
"reference": "standup ΰΆΰΆΰ· ΰΆΈΰ·ΰΆΰΆ― ΰΆΰ·ΰ·ΰ·ΰ·ΰ·",
|
| 1332 |
+
"prediction": "standup ΰΆΰΆΰ· ΰΆΈΰ·ΰΆΰΆ― ΰΆΰ·ΰ·ΰ·ΰ·ΰ·",
|
| 1333 |
+
"exact_match": false,
|
| 1334 |
+
"cer": 0.0435,
|
| 1335 |
+
"wer": 0.25,
|
| 1336 |
+
"bleu": 0.0,
|
| 1337 |
+
"token_accuracy": 0.75,
|
| 1338 |
+
"code_mix_preservation": 1.0,
|
| 1339 |
+
"time_s": 0.048
|
| 1340 |
+
},
|
| 1341 |
+
{
|
| 1342 |
+
"id": 104,
|
| 1343 |
+
"input": "reel eka viral una",
|
| 1344 |
+
"reference": "reel ΰΆΰΆ viral ΰΆΰΆ±ΰ·",
|
| 1345 |
+
"prediction": "reel ΰΆΰΆ viral ΰΆΰΆ±ΰ·",
|
| 1346 |
+
"exact_match": true,
|
| 1347 |
+
"cer": 0.0,
|
| 1348 |
+
"wer": 0.0,
|
| 1349 |
+
"bleu": 1.0,
|
| 1350 |
+
"token_accuracy": 1.0,
|
| 1351 |
+
"code_mix_preservation": 1.0,
|
| 1352 |
+
"time_s": 0.022
|
| 1353 |
+
},
|
| 1354 |
+
{
|
| 1355 |
+
"id": 105,
|
| 1356 |
+
"input": "group chat eke mokada wenne",
|
| 1357 |
+
"reference": "group chat ΰΆΰΆΰ· ΰΆΈΰ·ΰΆΰΆ― ΰ·ΰ·ΰΆ±ΰ·ΰΆ±ΰ·",
|
| 1358 |
+
"prediction": "group chat ΰΆΰΆΰ· ΰΆΈΰ·ΰΆΰΆ― ΰ·ΰ·ΰΆ±ΰ·ΰΆ±ΰ·",
|
| 1359 |
+
"exact_match": false,
|
| 1360 |
+
"cer": 0.0385,
|
| 1361 |
+
"wer": 0.2,
|
| 1362 |
+
"bleu": 0.6687,
|
| 1363 |
+
"token_accuracy": 0.8,
|
| 1364 |
+
"code_mix_preservation": 1.0,
|
| 1365 |
+
"time_s": 0.047
|
| 1366 |
+
},
|
| 1367 |
+
{
|
| 1368 |
+
"id": 106,
|
| 1369 |
+
"input": "oyge profile picture eka lassanai",
|
| 1370 |
+
"reference": "ΰΆΰΆΊΰ·ΰΆΰ· profile picture ΰΆΰΆ ΰΆ½ΰ·ΰ·ΰ·ΰΆ±ΰΆΊΰ·",
|
| 1371 |
+
"prediction": "ΰΆΰΆΊΰ·ΰΆΰ· profile picture ΰΆΰΆ ΰΆ½ΰ·ΰ·ΰ·ΰΆ±ΰΆΊΰ·",
|
| 1372 |
+
"exact_match": true,
|
| 1373 |
+
"cer": 0.0,
|
| 1374 |
+
"wer": 0.0,
|
| 1375 |
+
"bleu": 1.0,
|
| 1376 |
+
"token_accuracy": 1.0,
|
| 1377 |
+
"code_mix_preservation": 1.0,
|
| 1378 |
+
"time_s": 0.048
|
| 1379 |
+
},
|
| 1380 |
+
{
|
| 1381 |
+
"id": 107,
|
| 1382 |
+
"input": "mama enne na heta",
|
| 1383 |
+
"reference": "ΰΆΈΰΆΈ ΰΆΰΆ±ΰ·ΰΆ±ΰ· ΰΆ±ΰ· ΰ·ΰ·ΰΆ§",
|
| 1384 |
+
"prediction": "ΰΆΈΰΆΈ ΰΆΰΆ±ΰ·ΰΆ±ΰ· ΰΆ±ΰ· ΰ·ΰ·ΰΆ§",
|
| 1385 |
+
"exact_match": true,
|
| 1386 |
+
"cer": 0.0,
|
| 1387 |
+
"wer": 0.0,
|
| 1388 |
+
"bleu": 1.0,
|
| 1389 |
+
"token_accuracy": 1.0,
|
| 1390 |
+
"code_mix_preservation": 1.0,
|
| 1391 |
+
"time_s": 0.024
|
| 1392 |
+
},
|
| 1393 |
+
{
|
| 1394 |
+
"id": 108,
|
| 1395 |
+
"input": "eka karanna epa",
|
| 1396 |
+
"reference": "ΰΆΰΆ ΰΆΰΆ»ΰΆ±ΰ·ΰΆ± ΰΆΰΆ΄ΰ·",
|
| 1397 |
+
"prediction": "ΰΆΰΆ ΰΆΰΆ»ΰΆ±ΰ·ΰΆ± ΰΆΰΆ΄ΰ·",
|
| 1398 |
+
"exact_match": true,
|
| 1399 |
+
"cer": 0.0,
|
| 1400 |
+
"wer": 0.0,
|
| 1401 |
+
"bleu": 1.0,
|
| 1402 |
+
"token_accuracy": 1.0,
|
| 1403 |
+
"code_mix_preservation": 1.0,
|
| 1404 |
+
"time_s": 0.001
|
| 1405 |
+
},
|
| 1406 |
+
{
|
| 1407 |
+
"id": 109,
|
| 1408 |
+
"input": "kawruwath enne na",
|
| 1409 |
+
"reference": "ΰΆΰ·ΰ·ΰΆ»ΰ·ΰ·ΰΆΰ· ΰΆΰΆ±ΰ·ΰΆ±ΰ· ΰΆ±ΰ·",
|
| 1410 |
+
"prediction": "ΰΆΰ·ΰ·ΰΆ»ΰ·ΰ·ΰΆΰ· ΰΆΰΆ±ΰ·ΰΆ±ΰ· ΰΆ±ΰ·",
|
| 1411 |
+
"exact_match": true,
|
| 1412 |
+
"cer": 0.0,
|
| 1413 |
+
"wer": 0.0,
|
| 1414 |
+
"bleu": 1.0,
|
| 1415 |
+
"token_accuracy": 1.0,
|
| 1416 |
+
"code_mix_preservation": 1.0,
|
| 1417 |
+
"time_s": 0.045
|
| 1418 |
+
},
|
| 1419 |
+
{
|
| 1420 |
+
"id": 110,
|
| 1421 |
+
"input": "oya koheda ynne",
|
| 1422 |
+
"reference": "ΰΆΰΆΊΰ· ΰΆΰ·ΰ·ΰ·ΰΆ― ΰΆΊοΏ½οΏ½οΏ½ΰ·ΰΆ±ΰ·",
|
| 1423 |
+
"prediction": "ΰΆΰΆΊΰ· ΰΆΰ·ΰ·ΰ·ΰΆ― ΰΆΊΰΆ±ΰ·ΰΆ±ΰ·",
|
| 1424 |
+
"exact_match": false,
|
| 1425 |
+
"cer": 0.1333,
|
| 1426 |
+
"wer": 0.6667,
|
| 1427 |
+
"bleu": 0.0,
|
| 1428 |
+
"token_accuracy": 0.3333,
|
| 1429 |
+
"code_mix_preservation": 1.0,
|
| 1430 |
+
"time_s": 0.047
|
| 1431 |
+
}
|
| 1432 |
+
]
|
fine_tuning/attempt_2_informal_sinhala/eval_predictions.csv
ADDED
|
@@ -0,0 +1,111 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
id,input,reference,prediction,exact_match,cer,wer,bleu,token_accuracy,code_mix_preservation,time_s
|
| 2 |
+
1,api kalin katha kala,ΰΆ
ΰΆ΄ΰ· ΰΆΰΆ½ΰ·ΰΆ±ΰ· ΰΆΰΆΰ· ΰΆΰ·
ΰ·,ΰΆ
ΰΆ΄ΰ· ΰΆΰΆ½ΰ·ΰΆ±ΰ· ΰΆΰΆΰ· ΰΆΰ·
ΰ·,True,0.0,0.0,1.0,1.0,1.0,0.002
|
| 3 |
+
2,eka honda wage thiyanawa,ΰΆΰΆ ΰ·ΰ·ΰΆ³ ΰ·ΰΆΰ· ΰΆΰ·ΰΆΊΰ·ΰΆ±ΰ·ΰ·,ΰΆΰΆ ΰ·ΰ·ΰΆ³ ΰ·ΰΆΰ· ΰΆΰ·ΰΆΊΰ·ΰΆ±ΰ·ΰ·,True,0.0,0.0,1.0,1.0,1.0,0.002
|
| 4 |
+
3,meheta thadata wessa,ΰΆΈΰ·ΰ·ΰ·ΰΆ§ ΰΆΰΆ―ΰΆ§ ΰ·ΰ·ΰ·ΰ·ΰ·ΰ·,ΰΆΈΰ·ΰ·ΰ·ΰΆ§ ΰΆΰΆ―ΰΆ§ ΰ·ΰ·ΰ·ΰ·ΰ·ΰ·,True,0.0,0.0,1.0,1.0,1.0,0.217
|
| 5 |
+
4,oya kiwwata mama giye,ΰΆΰΆΊΰ· ΰΆΰ·ΰ·ΰ·ΰ·ΰΆ§ ΰΆΈΰΆΈ ΰΆΰ·ΰΆΊΰ·,ΰΆΰΆΊΰ· ΰΆΰ·ΰ·ΰ·ΰ·ΰΆ§ ΰΆΈΰΆΈ ΰΆΰ·ΰΆΊΰ·,True,0.0,0.0,1.0,1.0,1.0,0.043
|
| 6 |
+
5,mama danne na eka gena,ΰΆΈΰΆΈ ΰΆ―ΰΆ±ΰ·ΰΆ±ΰ· ΰΆ±ΰ· ΰΆΰΆ ΰΆΰ·ΰΆ±,ΰΆΈΰΆΈ ΰΆ―ΰΆ±ΰ·ΰΆ±ΰ· ΰΆ±ΰ· ΰΆΰΆ ΰΆΰ·ΰΆ±,True,0.0,0.0,1.0,1.0,1.0,0.002
|
| 7 |
+
6,oya awa wage na,ΰΆΰΆΊΰ· ΰΆΰ·ΰ· ΰ·ΰΆΰ· ΰΆ±ΰ·,ΰΆΰΆΊΰ· ΰΆΰ·ΰ· ΰ·ΰΆΰ· ΰΆ±ΰ·,True,0.0,0.0,1.0,1.0,1.0,0.001
|
| 8 |
+
7,ekat ynna bri,ΰΆΰΆΰΆ§ ΰΆΊΰΆ±ΰ·ΰΆ± ΰΆΆΰ·ΰΆ»ΰ·,ΰΆΰΆΰΆ§ ΰΆΊΰΆ±ΰ·ΰΆ± ΰΆΆΰ·ΰΆ»ΰ·,True,0.0,0.0,1.0,1.0,1.0,0.024
|
| 9 |
+
8,mama inne gedaradi,ΰΆΈΰΆΈ ΰΆΰΆ±ΰ·ΰΆ±ΰ· ΰΆΰ·ΰΆ―ΰΆ»ΰΆ―ΰ·,ΰΆΈΰΆΈ ΰΆΰΆ±ΰ·ΰΆ±ΰ· ΰΆΰ·ΰΆ―ΰΆ»ΰΆ―ΰ·,True,0.0,0.0,1.0,1.0,1.0,0.001
|
| 10 |
+
9,eka heta balamu,ΰΆΰΆ ΰ·ΰ·ΰΆ§ ΰΆΆΰΆ½ΰΆΈΰ·,ΰΆΰΆ ΰ·ΰ·ΰΆ§ ΰΆΆΰΆ½ΰΆΈΰ·,True,0.0,0.0,1.0,1.0,1.0,0.001
|
| 11 |
+
10,klya madi api passe yamu,ΰΆΰ·ΰΆ½ΰΆΊ ΰΆΈΰΆ―ΰ· ΰΆ
ΰΆ΄ΰ· ΰΆ΄ΰ·ΰ·ΰ·ΰ· ΰΆΊΰΆΈΰ·,ΰΆΰ·ΰΆ½ΰΆΊ ΰΆΈΰΆ―ΰ· ΰΆ
ΰΆ΄ΰ· ΰΆ΄ΰ·ΰ·ΰ·ΰ· ΰΆΊΰΆΈΰ·,True,0.0,0.0,1.0,1.0,1.0,0.028
|
| 12 |
+
11,assignment eka ada submit karanna one,assignment ΰΆΰΆ ΰΆ
ΰΆ― submit ΰΆΰΆ»ΰΆ±ΰ·ΰΆ± ΰΆΰΆ±ΰ·,assignment ΰΆΰΆ ΰΆ
ΰΆ― submit ΰΆΰΆ»ΰΆ±ΰ·ΰΆ± ΰΆΰΆ±ΰ·,True,0.0,0.0,1.0,1.0,1.0,0.027
|
| 13 |
+
12,exam hall eka nisa mama baya una,exam hall ΰΆΰΆ ΰΆ±ΰ·ΰ·ΰ· ΰΆΈΰΆΈ ΰΆΆΰΆΊ ΰΆΰΆ±ΰ·,exam hall ΰΆΰΆ ΰΆ±ΰ·ΰ·ΰ· ΰΆΈΰΆΈ ΰΆΆΰΆΊ ΰΆΰΆ±ΰ·,True,0.0,0.0,1.0,1.0,1.0,0.027
|
| 14 |
+
13,results blnna one,results ΰΆΆΰΆ½ΰΆ±ΰ·ΰΆ± ΰΆΰΆ±ΰ·,results ΰΆΆΰΆ½ΰΆ±ΰ·ΰΆ± ΰΆΰΆ±ΰ·,True,0.0,0.0,1.0,1.0,1.0,0.001
|
| 15 |
+
14,study group ekak hadamu,study group ΰΆΰΆΰΆΰ· ΰ·ΰΆ―ΰΆΈΰ·,study group ΰΆΰΆΰΆΰ· ΰ·ΰΆ―ΰΆΈΰ·,True,0.0,0.0,1.0,1.0,1.0,0.021
|
| 16 |
+
15,viva ekta prepared wage na,viva ΰΆΰΆΰΆ§ prepared ΰ·ΰΆΰ· ΰΆ±ΰ·,viva ΰΆΰΆΰΆ§ prepared ΰ·ΰΆΰ· ΰΆ±ΰ·,True,0.0,0.0,1.0,1.0,1.0,0.002
|
| 17 |
+
16,mta project ek submit krnna one,ΰΆΈΰΆ§ project ΰΆΰΆ submit ΰΆΰΆ»ΰΆ±ΰ·ΰΆ± ΰΆΰΆ±ΰ·,ΰΆΈΰΆ§ project ΰΆΰΆ submit ΰΆΰΆ»ΰΆ±ΰ·ΰΆ± ΰΆΰΆ±ΰ·,True,0.0,0.0,1.0,1.0,1.0,0.002
|
| 18 |
+
17,hta parikshanaya thiyanawa,ΰ·ΰ·ΰΆ§ ΰΆ΄ΰΆ»ΰ·ΰΆΰ·βΰ·ΰΆ«ΰΆΊ ΰΆΰ·ΰΆΊΰ·ΰΆ±ΰ·ΰ·,ΰ·ΰ·ΰΆ§ ΰΆ΄ΰΆ»ΰ·ΰΆΰ·βΰ·ΰΆ«ΰΆΊ ΰΆΰ·ΰΆΊΰ·ΰΆ±ΰ·ΰ·,True,0.0,0.0,1.0,1.0,1.0,0.02
|
| 19 |
+
18,mama potha kiyawala iwara kala,ΰΆΈΰΆΈ ΰΆ΄ΰ·ΰΆ ΰΆΰ·ΰΆΊΰ·ΰΆ½ΰ· ΰΆΰ·ΰΆ» ΰΆΰ·
ΰ·,ΰΆΈΰΆΈ ΰΆ΄ΰ·ΰΆ ΰΆΰ·ΰΆΊΰ·ΰΆ½ΰ· ΰΆΰ·ΰΆ» ΰΆΰ·
ΰ·,True,0.0,0.0,1.0,1.0,1.0,0.027
|
| 20 |
+
19,prkku nisa api kalin giya,ΰΆ΄ΰΆ»ΰΆΰ·ΰΆΰ· ΰΆ±ΰ·ΰ·ΰ· ΰΆ
ΰΆ΄ΰ· ΰΆΰΆ½ΰ·ΰΆ±ΰ· ΰΆΰ·ΰΆΊΰ·,ΰΆ΄ΰΆ»ΰΆΰ·ΰΆΰ· ΰΆ±ΰ·ΰ·ΰ· ΰΆ
ΰΆ΄ΰ· ΰΆΰΆ½ΰ·ΰΆ±ΰ· ΰΆΰ·ΰΆΊΰ·,True,0.0,0.0,1.0,1.0,1.0,0.019
|
| 21 |
+
20,prashnaya hondai wage penenawa,ΰΆ΄ΰ·βΰΆ»ΰ·ΰ·ΰΆ±ΰΆΊ ΰ·ΰ·ΰΆ³ΰΆΊΰ· ΰ·ΰΆΰ· ΰΆ΄ΰ·ΰΆ±ΰ·ΰ·,ΰΆ΄ΰ·βΰΆ»ΰ·ΰ·ΰΆ±ΰΆΊ ΰ·ΰ·ΰΆ³ΰΆΊΰ· ΰ·ΰΆΰ· ΰΆ΄ΰ·ΰΆ±ΰ·ΰ·,True,0.0,0.0,1.0,1.0,1.0,0.046
|
| 22 |
+
21,deployments nisa site down wuna,deployments ΰΆ±ΰ·ΰ·ΰ· site down ΰΆΰΆ±ΰ·,deployments ΰΆ±ΰ·ΰ·ΰ· site down ΰΆΰΆ±ΰ·,True,0.0,0.0,1.0,1.0,1.0,0.002
|
| 23 |
+
22,PR eka merge karanna one,PR ΰΆΰΆ merge ΰΆΰΆ»ΰΆ±ΰ·ΰΆ± ΰΆΰΆ±ΰ·,PR ΰΆΰΆ merge ΰΆΰΆ»ΰΆ±ΰ·ΰΆ± ΰΆΰΆ±ΰ·,True,0.0,0.0,1.0,1.0,1.0,0.023
|
| 24 |
+
23,backlog eka update kala,backlog ΰΆΰΆ update ΰΆΰ·
ΰ·,backlog ΰΆΰΆ update ΰΆΰ·
ΰ·,True,0.0,0.0,1.0,1.0,1.0,0.019
|
| 25 |
+
24,server down nisa work karanna ba,server down ΰΆ±ΰ·ΰ·ΰ· work ΰΆΰΆ»ΰΆ±ΰ·ΰΆ± ΰΆΆΰ·,server down ΰΆ±ΰ·ΰ·ΰ· work ΰΆΰΆ»ΰΆ±ΰ·ΰΆ± ΰΆΆΰ·,True,0.0,0.0,1.0,1.0,1.0,0.002
|
| 26 |
+
25,meeting eka tomorrow damu,meeting ΰΆΰΆ tomorrow ΰΆ―ΰ·ΰΆΈΰ·,meeting ΰΆΰΆ tomorrow ΰΆ―ΰ·ΰΆΈΰ·,True,0.0,0.0,1.0,1.0,1.0,0.022
|
| 27 |
+
26,feedback nisa redo karanna una,feedback ΰΆ±ΰ·ΰ·ΰ· redo ΰΆΰΆ»ΰΆ±ΰ·ΰΆ± ΰΆΰΆ±ΰ·,feedback ΰΆ±ΰ·ΰ·ΰ· redo ΰΆΰΆ»ΰΆ±ΰ·ΰΆ± ΰΆΰΆ±ΰ·,True,0.0,0.0,1.0,1.0,1.0,0.002
|
| 28 |
+
27,ape wada ada iwara wenawa,ΰΆ
ΰΆ΄ΰ· ΰ·ΰ·ΰΆ© ΰΆ
ΰΆ― ΰΆΰ·ΰΆ» ΰ·ΰ·ΰΆ±ΰ·ΰ·,ΰΆ
ΰΆ΄ΰ· ΰ·ΰ·ΰΆ© ΰΆ
ΰΆ― ΰΆΰ·ΰΆ» ΰ·ΰ·ΰΆ±ΰ·ΰ·,True,0.0,0.0,1.0,1.0,1.0,0.002
|
| 29 |
+
28,kalamanakaru hitpu nisa api katha kala,ΰΆΰΆ½ΰΆΈΰΆ±ΰ·ΰΆΰΆ»ΰ· ΰ·ΰ·ΰΆ§ΰΆ΄ΰ· ΰΆ±ΰ·ΰ·ΰ· ΰΆ
ΰΆ΄ΰ· ΰΆΰΆΰ· ΰΆΰ·
ΰ·,ΰΆΰΆ½ΰΆΈΰΆ±ΰ·ΰΆΰΆ»ΰ· ΰ·ΰ·ΰΆ§ΰΆ΄ΰ· ΰΆ±ΰ·ΰ·ΰ· ΰΆ
ΰΆ΄ΰ· ΰΆΰΆΰ· ΰΆΰ·
ΰ·,True,0.0,0.0,1.0,1.0,1.0,0.049
|
| 30 |
+
29,me wada hondai wage penawa,ΰΆΈΰ· ΰ·ΰ·ΰΆ© ΰ·ΰ·ΰΆ³ΰΆΊΰ· ΰ·ΰΆΰ· ΰΆ΄ΰ·ΰΆ±ΰ·ΰ·,ΰΆΈΰ· ΰ·ΰ·ΰΆ© ΰ·ΰ·ΰΆ³ΰΆΊΰ· ΰ·ΰΆΰ· ΰΆ΄ΰ·ΰΆ±ΰ·ΰ·,True,0.0,0.0,1.0,1.0,1.0,0.02
|
| 31 |
+
30,wada tika ada iwara karamu,ΰ·ΰ·ΰΆ© ΰΆ§ΰ·ΰΆ ΰΆ
ΰΆ― ΰΆΰ·ΰΆ» ΰΆΰΆ»ΰΆΈΰ·,ΰ·ΰ·ΰΆ© ΰΆ§ΰ·ΰΆ ΰΆ
ΰΆ― ΰΆΰ·ΰΆ» ΰΆΰΆ»ΰΆΈΰ·,True,0.0,0.0,1.0,1.0,1.0,0.016
|
| 32 |
+
31,story eke poll ekak damma,story ΰΆΰΆΰ· poll ΰΆΰΆΰΆΰ· ΰΆ―ΰ·ΰΆΈΰ·ΰΆΈΰ·,story ΰΆΰΆΰ· poll ΰΆΰΆΰΆΰ· ΰΆ―ΰ·ΰΆΈΰ·ΰΆΈΰ·,True,0.0,0.0,1.0,1.0,1.0,0.024
|
| 33 |
+
32,oyata DM ekak yawwa,ΰΆΰΆΊΰ·ΰΆ§ DM ΰΆΰΆΰΆΰ· ΰΆΊΰ·ΰ·ΰ·ΰ·ΰ·,ΰΆΰΆΊΰ·ΰΆ§ DM ΰΆΰΆΰΆΰ· ΰΆΊΰ·ΰ·ΰ·ΰ·ΰ·,True,0.0,0.0,1.0,1.0,1.0,0.024
|
| 34 |
+
33,comment eka delete kala nisa mama danne na,comment ΰΆΰΆ delete ΰΆΰ·
ΰ· ΰΆ±ΰ·ΰ·ΰ· ΰΆΈΰΆΈ ΰΆ―ΰΆ±ΰ·ΰΆ±ΰ· ΰΆ±ΰ·,comment ΰΆΰΆ delete ΰΆΰ·
ΰ· ΰΆ±ΰ·ΰ·ΰ· ΰΆΈΰΆΈ ΰΆ―ΰΆ±ΰ·ΰΆ±ΰ· ΰΆ±ΰ·,True,0.0,0.0,1.0,1.0,1.0,0.028
|
| 35 |
+
34,selfie ekak gannako,selfie ΰΆΰΆΰΆΰ· ΰΆΰΆ±ΰ·ΰΆ±ΰΆΰ·,selfie ΰΆΰΆΰΆΰ· ΰΆΰΆ±ΰ·ΰΆ±ΰΆΰ·,True,0.0,0.0,1.0,1.0,1.0,0.023
|
| 36 |
+
35,post eka private nisa share karanna epa,post ΰΆΰΆ private ΰΆ±ΰ·ΰ·ΰ· share ΰΆΰΆ»ΰΆ±ΰ·ΰΆ± ΰΆΰΆ΄ΰ·,post ΰΆΰΆ private ΰΆ±ΰ·ΰ·ΰ· share ΰΆΰΆ»ΰΆ±ΰ·ΰΆ± ΰΆΰΆ΄ΰ·,True,0.0,0.0,1.0,1.0,1.0,0.027
|
| 37 |
+
36,oyta message krnna one,ΰΆΰΆΊΰ·ΰΆ§ message ΰΆΰΆ»ΰΆ±ΰ·ΰΆ± ΰΆΰΆ±ΰ·,ΰΆΰΆΊΰ·ΰΆ§ message ΰΆΰΆ»ΰΆ±ΰ·ΰΆ± ΰΆΰΆ±ΰ·,True,0.0,0.0,1.0,1.0,1.0,0.002
|
| 38 |
+
37,api passe katha karamu,ΰΆ
ΰΆ΄ΰ· ΰΆ΄ΰ·ΰ·ΰ·ΰ· ΰΆΰΆΰ· ΰΆΰΆ»ΰΆΈΰ·,ΰΆ
ΰΆ΄ΰ· ΰΆ΄ΰ·ΰ·ΰ·ΰ· ΰΆΰΆΰ· ΰΆΰΆ»ΰΆΈΰ·,True,0.0,0.0,1.0,1.0,1.0,0.002
|
| 39 |
+
38,eya laga pinthurayk thiyanawa,ΰΆΰΆΊΰ· ΰ·
ΰΆ ΰΆ΄ΰ·ΰΆ±ΰ·ΰΆΰ·ΰΆ»ΰΆΊΰΆΰ· ΰΆΰ·ΰΆΊΰ·ΰΆ±ΰ·ΰ·,ΰΆΰΆΊΰ· ΰ·
ΰΆ ΰΆ΄ΰ·ΰΆ±ΰ·ΰΆΰ·ΰΆ»ΰΆΊΰΆΰ· ΰΆΰ·ΰΆΊΰ·ΰΆ±ΰ·ΰ·,True,0.0,0.0,1.0,1.0,1.0,0.023
|
| 40 |
+
39,oya awa wage mata hithenawa,ΰΆΰΆΊΰ· ΰΆΰ·ΰ· ΰ·ΰΆΰ· ΰΆΈΰΆ§ ΰ·ΰ·ΰΆΰ·ΰΆ±ΰ·ΰ·,ΰΆΰΆΊΰ· ΰΆΰ·ΰ· ΰ·ΰΆΰ· ΰΆΈΰΆ§ ΰ·ΰ·ΰΆΰ·ΰΆ±ΰ·ΰ·,True,0.0,0.0,1.0,1.0,1.0,0.002
|
| 41 |
+
40,api passe hambawemu,ΰΆ
ΰΆ΄ΰ· ΰΆ΄ΰ·ΰ·ΰ·ΰ· ΰ·ΰΆΈΰ·ΰΆΆΰ·ΰ·ΰΆΈΰ·,ΰΆ
ΰΆ΄ΰ· ΰΆ΄ΰ·ΰ·ΰ·ΰ· ΰ·ΰΆΈΰ·ΰΆΆΰ·ΰ·ΰΆΈΰ·,True,0.0,0.0,1.0,1.0,1.0,0.015
|
| 42 |
+
41,phone eka charge karanna one,phone ΰΆΰΆ charge ΰΆΰΆ»ΰΆ±ΰ·ΰΆ± ΰΆΰΆ±ΰ·,phone ΰΆΰΆ charge ΰΆΰΆ»ΰΆ±ΰ·ΰΆ± ΰΆΰΆ±ΰ·,True,0.0,0.0,1.0,1.0,1.0,0.022
|
| 43 |
+
42,bus eka late una,bus ΰΆΰΆ late ΰΆΰΆ±ΰ·,bus ΰΆΰΆ late ΰΆΰΆ±ΰ·,True,0.0,0.0,1.0,1.0,1.0,0.018
|
| 44 |
+
43,mama online inne,ΰΆΈΰΆΈ online ΰΆΰΆ±ΰ·ΰΆ±ΰ·,ΰΆΈΰΆΈ online ΰΆΰΆ±ΰ·ΰΆ±ΰ·,True,0.0,0.0,1.0,1.0,1.0,0.001
|
| 45 |
+
44,time nathi nisa heta yamu,time ΰΆ±ΰ·ΰΆΰ· ΰΆ±ΰ·ΰ·ΰ· ΰ·ΰ·ΰΆ§ ΰΆΊΰΆΈΰ·,time ΰΆ±ΰ·ΰΆΰ· ΰΆ±ΰ·ΰ·ΰ· ΰ·ΰ·ΰΆ§ ΰΆΊΰΆΈΰ·,True,0.0,0.0,1.0,1.0,1.0,0.002
|
| 46 |
+
45,oya call eka ganna,ΰΆΰΆΊΰ· call ΰΆΰΆ ΰΆΰΆ±ΰ·ΰΆ±,ΰΆΰΆΊΰ· call ΰΆΰΆ ΰΆΰΆ±ΰ·ΰΆ±,True,0.0,0.0,1.0,1.0,1.0,0.042
|
| 47 |
+
46,api game yanawa heta,ΰΆ
ΰΆ΄ΰ· ΰΆΰΆΈΰ· ΰΆΊΰΆ±ΰ·ΰ· ΰ·ΰ·ΰΆ§,ΰΆ
ΰΆ΄ΰ· ΰΆΰΆΈΰ· ΰΆΊΰΆ±ΰ·ΰ· ΰ·ΰ·ΰΆ§,True,0.0,0.0,1.0,1.0,1.0,0.023
|
| 48 |
+
47,man heta enne na,ΰΆΈΰΆ±ΰ· ΰ·ΰ·ΰΆ§ ΰΆΰΆ±ΰ·ΰΆ±ΰ· ΰΆ±ΰ·,ΰΆΈΰ·ΰΆ±ΰ· ΰ·ΰ·ΰΆ§ ΰΆΰΆ±ΰ·ΰΆ±ΰ· ΰΆ±ΰ·,False,0.0625,0.25,0.0,0.75,1.0,0.045
|
| 49 |
+
48,eka hari lassanai,ΰΆΰΆ ΰ·ΰΆ»ΰ· ΰΆ½ΰ·ΰ·ΰ·ΰΆ±ΰΆΊΰ·,ΰΆΰΆ ΰ·ΰΆ»ΰ· ΰΆ½ΰ·ΰ·ΰ·ΰΆ±ΰΆΊΰ·,True,0.0,0.0,1.0,1.0,1.0,0.015
|
| 50 |
+
49,oya kiwwa hari,ΰΆΰΆΊΰ· ΰΆΰ·ΰ·ΰ·ΰ·ΰ· ΰ·ΰΆ»ΰ·,ΰΆΰΆΊΰ· ΰΆΰ·ΰ·ΰ·ΰ·ΰ· ΰ·ΰΆ»ΰ·,True,0.0,0.0,1.0,1.0,1.0,0.001
|
| 51 |
+
50,kalaya ithuru krganna one,ΰΆΰΆ½ΰΆΊ ΰΆΰΆΰ·ΰΆ»ΰ· ΰΆΰΆ»ΰΆΰΆ±ΰ·ΰΆ± ΰΆΰΆ±ΰ·,ΰΆΰΆ½ΰΆΊ ΰΆΰΆΰ·ΰΆ»ΰ· ΰΆΰΆ»ΰΆΰΆ±ΰ·ΰΆ± ΰΆΰΆ±ΰ·,True,0.0,0.0,1.0,1.0,1.0,0.046
|
| 52 |
+
51,date eka fix karanna one,date ΰΆΰΆ fix ΰΆΰΆ»ΰΆ±ΰ·ΰΆ± ΰΆΰΆ±ΰ·,date ΰΆΰΆ fix ΰΆΰΆ»ΰΆ±ΰ·ΰΆ± ΰΆΰΆ±ΰ·,True,0.0,0.0,1.0,1.0,1.0,0.023
|
| 53 |
+
52,rata yanna one,ΰΆ»ΰΆ§ ΰΆΊΰΆ±ΰ·ΰΆ± ΰΆΰΆ±ΰ·,ΰΆ»ΰΆ§ ΰΆΊΰΆ±ΰ·ΰΆ± ΰΆΰΆ±ΰ·,True,0.0,0.0,1.0,1.0,1.0,0.046
|
| 54 |
+
53,game eke leaderboard eka balanna,game ΰΆΰΆΰ· leaderboard ΰΆΰΆ ΰΆΆΰΆ½ΰΆ±ΰ·ΰΆ±,ΰΆΰΆΈΰ· ΰΆΰΆΰ· leaderboard ΰΆΰΆ ΰΆΆΰΆ½ΰΆ±ΰ·ΰΆ±,False,0.1379,0.2,0.6687,0.8,0.5,0.072
|
| 55 |
+
54,api thamai hodama,ΰΆ
ΰΆ΄ΰ· ΰΆΰΆΈΰΆΊΰ· ΰ·ΰ·ΰΆ³ΰΆΈ,ΰΆ
ΰΆ΄ΰ· ΰΆΰΆΈΰΆΊΰ· ΰ·ΰ·ΰΆ³ΰΆΈ,True,0.0,0.0,1.0,1.0,1.0,0.018
|
| 56 |
+
55,mama heta udee enawa oya enakota message ekk dnna,ΰΆΈΰΆΈ ΰ·ΰ·ΰΆ§ ΰΆΰΆ―ΰ· ΰΆΰΆ±ΰ·ΰ· ΰΆΰΆΊΰ· ΰΆΰΆ±ΰΆΰ·ΰΆ§ message ΰΆΰΆΰΆΰ· ΰΆ―ΰ·ΰΆ±ΰ·ΰΆ±,ΰΆΈΰΆΈ ΰ·ΰ·ΰΆ§ ΰΆΰΆ―ΰ· ΰΆΰΆ±ΰ·ΰ· ΰΆΰΆΊΰ· ΰΆΰΆ±ΰΆΰ·ΰΆ§ message ΰΆΰΆΰΆΰ· ΰΆ―ΰ·ΰΆ±ΰ·ΰΆ±,True,0.0,0.0,1.0,1.0,1.0,0.061
|
| 57 |
+
56,ape gedara langa thiyana kadeta yanna one,ΰΆ
ΰΆ΄ΰ· ΰΆΰ·ΰΆ―ΰΆ» ΰ·
ΰΆ ΰΆΰ·ΰΆΊΰ·ΰΆ± ΰΆΰΆ©ΰ·ΰΆ§ ΰΆΊΰΆ±ΰ·ΰΆ± ΰΆΰΆ±ΰ·,ΰΆ
ΰΆ΄ΰ· ΰΆΰ·ΰΆ―ΰΆ» ΰ·
ΰΆ ΰΆΰ·ΰΆΊΰ·ΰΆ± ΰΆΰΆ©ΰ·ΰΆ§ ΰΆΊΰΆ±ΰ·ΰΆ± ΰΆΰΆ±ΰ·,True,0.0,0.0,1.0,1.0,1.0,0.067
|
| 58 |
+
57,mama assignment eka karala submit karanawa ada raa,ΰΆΈΰΆΈ assignment ΰΆΰΆ ΰΆΰΆ»ΰΆ½ΰ· submit ΰΆΰΆ»ΰΆ±ΰ·ΰ· ΰΆ
ΰΆ― ΰΆ»ΰ·,ΰΆΈΰΆΈ assignment ΰΆΰΆ ΰΆΰΆ»ΰ·ΰΆ½ submit ΰΆΰΆ»ΰΆ±ΰ·ΰ· ΰΆ
ΰΆ― ΰΆ»ΰ·,False,0.05,0.125,0.5,0.875,1.0,0.097
|
| 59 |
+
58,oya enne naththe mokada kiyla mama danne na,ΰΆΰΆΊΰ· ΰΆΰΆ±ΰ·ΰΆ±ΰ· ΰΆ±ΰ·ΰΆΰ·ΰΆΰ· ΰΆΈΰ·ΰΆΰΆ― ΰΆΰ·ΰΆΊΰΆ½ΰ· ΰΆΈΰΆΈ ΰΆ―ΰΆ±ΰ·ΰΆ±ΰ· ΰΆ±ΰ·,ΰΆΰΆΊΰ· ΰΆΰΆ±ΰ·ΰΆ±ΰ· ΰΆ±ΰ·ΰΆΰ·ΰΆΰ· ΰΆΈΰ·ΰΆΰΆ― ΰΆΰ·ΰΆΊΰΆ½ΰ· ΰΆΈΰΆΈ ΰΆ―ΰΆ±ΰ·ΰΆ±ΰ· ΰΆ±ΰ·,True,0.0,0.0,1.0,1.0,1.0,0.045
|
| 60 |
+
59,client ekka call karala feedback eka ahanna one,client ΰΆΰΆΰ·ΰΆ call ΰΆΰΆ»ΰΆ½ΰ· feedback ΰΆΰΆ ΰΆ
ΰ·ΰΆ±ΰ·ΰΆ± ΰΆΰΆ±ΰ·,client ΰΆΰΆΰ·ΰΆ call ΰΆΰΆ»ΰΆ½ΰ· feedback ΰΆΰΆ ΰΆ
ΰ·ΰΆ±ΰ·ΰΆ± ΰΆΰΆ±ΰ·,True,0.0,0.0,1.0,1.0,1.0,0.097
|
| 61 |
+
60,mama gedara gihilla kewata passe call karannm,ΰΆΈΰΆΈ ΰΆΰ·ΰΆ―ΰΆ» ΰΆΰ·ΰ·ΰ·ΰΆ½ΰ·ΰΆ½ΰ· ΰΆΰ·ΰ·ΰΆ§ ΰΆ΄ΰ·ΰ·ΰ·ΰ· call ΰΆΰΆ»ΰΆ±ΰ·ΰΆ±ΰΆΈΰ·,ΰΆΈΰΆΈ ΰΆΰ·ΰΆ―ΰΆ» ΰΆΰ·ΰ·ΰ·ΰΆ½ΰ·ΰΆ½ΰ· ΰΆΰ·ΰ·ΰΆ§ ΰΆ΄ΰ·ΰ·ΰ·ΰ· call ΰΆΰΆ»ΰΆ±ΰ·ΰΆ±ΰΆΈΰ·,True,0.0,0.0,1.0,1.0,1.0,0.03
|
| 62 |
+
61,laptop eke software update karanna one,laptop ΰΆΰΆΰ· software update ΰΆΰΆ»ΰΆ±ΰ·ΰΆ± ΰΆΰΆ±ΰ·,laptop ΰΆΰΆΰ· software update ΰΆΰΆ»ΰΆ±ΰ·ΰΆ± ΰΆΰΆ±ΰ·,True,0.0,0.0,1.0,1.0,1.0,0.002
|
| 63 |
+
62,office eke wifi password eka mokakda,office ΰΆΰΆΰ· wifi password ΰΆΰΆ ΰΆΈΰ·ΰΆΰΆΰ·ΰΆ―,office ΰΆΰΆΰ· wifi password ΰΆΰΆ ΰΆΈΰ·ΰΆΰΆΰ·ΰΆ―,True,0.0,0.0,1.0,1.0,1.0,0.037
|
| 64 |
+
63,online order eka track karanna ba,online order ΰΆΰΆ track ΰΆΰΆ»ΰΆ±ΰ·ΰΆ± ΰΆΆΰ·,online order ΰΆΰΆ track ΰΆΰΆ»ΰΆ±ΰ·ΰΆ± ΰΆΆΰ·,True,0.0,0.0,1.0,1.0,1.0,0.023
|
| 65 |
+
64,email eke attachment eka download karanna,email ΰΆΰΆΰ· attachment ΰΆΰΆ download ΰΆΰΆ»ΰΆ±ΰ·ΰΆ±,email ΰΆΰΆΰ· attachment ΰΆΰΆ download ΰΆΰΆ»ΰΆ±ΰ·ΰΆ±,True,0.0,0.0,1.0,1.0,1.0,0.023
|
| 66 |
+
65,Instagram story eke filter eka hadanna,Instagram story ΰΆΰΆΰ· filter ΰΆΰΆ ΰ·ΰΆ―ΰΆ±ΰ·ΰΆ±,Instagram story ΰΆΰΆΰ· filter ΰΆΰΆ ΰ·ΰΆ―ΰΆ±ΰ·ΰΆ±,True,0.0,0.0,1.0,1.0,1.0,0.023
|
| 67 |
+
66,oyge wada iwra krd,ΰΆΰΆΊΰ·ΰΆΰ· ΰ·ΰ·ΰΆ© ΰΆΰ·ΰΆ» ΰΆΰΆ»ΰ·ΰΆ―,ΰΆΰΆΊΰ·ΰΆΰ· ΰ·ΰ·ΰΆ© ΰΆΰ·ΰΆ» ΰΆΰΆ»ΰ·ΰΆ―,True,0.0,0.0,1.0,1.0,1.0,0.002
|
| 68 |
+
67,mge phone ek hack una,ΰΆΈΰΆΰ· phone ΰΆΰΆ hack ΰΆΰΆ±ΰ·,ΰΆΈΰΆΰ· phone ΰΆΰΆ hack ΰΆΰΆ±ΰ·,True,0.0,0.0,1.0,1.0,1.0,0.002
|
| 69 |
+
68,handawata ynna wenwa,ΰ·ΰ·ΰΆ±ΰ·ΰΆ―ΰ·ΰ·ΰΆ§ ΰΆΊΰΆ±ΰ·ΰΆ± ΰ·ΰ·ΰΆ±ΰ·ΰ·,ΰ·ΰ·ΰΆ±ΰ·ΰΆ―ΰ·ΰ·ΰΆ§ ΰΆΊΰΆ±ΰ·ΰΆ± ΰ·ΰ·ΰΆ±ΰ·ΰ·,True,0.0,0.0,1.0,1.0,1.0,0.026
|
| 70 |
+
69,prashnya krnna oni,ΰΆ΄ΰ·βΰΆ»ΰ·ΰ·βΰΆ±ΰΆΊ ΰΆΰΆ»ΰΆ±ΰ·ΰΆ± ΰΆΰΆ±ΰ·,ΰΆ΄ΰ·βΰΆ»ΰ·ΰ·βΰΆ±ΰΆΊ ΰΆΰΆ»ΰΆ±ΰ·ΰΆ± ΰΆΰΆ±ΰ·,True,0.0,0.0,1.0,1.0,1.0,0.001
|
| 71 |
+
70,apita gdra ynna oni,ΰΆ
ΰΆ΄ΰ·ΰΆ§ ΰΆΰ·ΰΆ―ΰΆ» ΰΆΊΰΆ±ΰ·ΰΆ± ΰΆΰΆ±ΰ·,ΰΆ
ΰΆ΄ΰ·ΰΆ§ ΰΆΰ·ΰΆ―ΰΆ» ΰΆΊΰΆ±ΰ·ΰΆ± ΰΆΰΆ±ΰ·,True,0.0,0.0,1.0,1.0,1.0,0.072
|
| 72 |
+
71,mama oyata kiwwa,ΰΆΈΰΆΈ ΰΆΰΆΊΰ·ΰΆ§ ΰΆΰ·ΰ·ΰ·ΰ·ΰ·,ΰΆΈΰΆΈ ΰΆΰΆΊΰ·ΰΆ§ ΰΆΰ·ΰ·ΰ·ΰ·ΰ·,True,0.0,0.0,1.0,1.0,1.0,0.001
|
| 73 |
+
72,oya hari hondai,ΰΆΰΆΊΰ· ΰ·ΰΆ»ΰ· ΰ·ΰ·ΰΆ³ΰΆΊΰ·,ΰΆΰΆΊΰ· ΰ·ΰΆ»ΰ· ΰ·ΰ·ΰΆ³ΰΆΊΰ·,True,0.0,0.0,1.0,1.0,1.0,0.015
|
| 74 |
+
73,api heta yamu,ΰΆ
ΰΆ΄ΰ· ΰ·ΰ·ΰΆ§ ΰΆΊΰΆΈΰ·,ΰΆ
ΰΆ΄ΰ· ΰ·ΰ·ΰΆ§ ΰΆΊΰΆΈΰ·,True,0.0,0.0,1.0,1.0,1.0,0.001
|
| 75 |
+
74,app eka crash wenawa phone eke,app ΰΆΰΆ crash ΰ·ΰ·ΰΆ±ΰ·ΰ· phone ΰΆΰΆΰ·,app ΰΆΰΆ crash ΰ·ΰ·ΰΆ±ΰ·ΰ· phone ΰΆΰΆΰ·,True,0.0,0.0,1.0,1.0,1.0,0.028
|
| 76 |
+
75,code eka push karanna github ekata,code ΰΆΰΆ push ΰΆΰΆ»ΰΆ±ΰ·ΰΆ± github ΰΆΰΆΰΆ§,code ΰΆΰΆ push ΰΆΰΆ»ΰΆ±ΰ·ΰΆ± github ΰΆΰΆΰΆ§,True,0.0,0.0,1.0,1.0,1.0,0.048
|
| 77 |
+
76,database eka slow nisa query eka optimize karanna one,database ΰΆΰΆ slow ΰΆ±ΰ·ΰ·ΰ· query ΰΆΰΆ optimize ΰΆΰΆ»ΰΆ±ΰ·ΰΆ± ΰΆΰΆ±ΰ·,database ΰΆΰΆ slow ΰΆ±ΰ·ΰ·ΰ· query ΰΆΰΆ optimize ΰΆΰΆ»ΰΆ±ΰ·ΰΆ± ΰΆΰΆ±ΰ·,True,0.0,0.0,1.0,1.0,1.0,0.053
|
| 78 |
+
77,bug eka fix kala merge karanna,bug ΰΆΰΆ fix ΰΆΰ·
ΰ· merge ΰΆΰΆ»ΰΆ±ΰ·ΰΆ±,bug ΰΆΰΆ fix ΰΆΰ·
ΰ· merge ΰΆΰΆ»ΰΆ±ΰ·ΰΆ±,True,0.0,0.0,1.0,1.0,1.0,0.046
|
| 79 |
+
78,internet eka slow wage thiyanawa,internet ΰΆΰΆ slow ΰ·ΰΆΰ· ΰΆΰ·ΰΆΊΰ·ΰΆ±ΰ·ΰ·,internet ΰΆΰΆ slow ΰ·ΰΆΰ· ΰΆΰ·ΰΆΊΰ·ΰΆ±ΰ·ΰ·,True,0.0,0.0,1.0,1.0,1.0,0.023
|
| 80 |
+
79,kema hodai ada,ΰΆΰ·ΰΆΈ ΰ·ΰ·ΰΆ³ΰΆΊΰ· ΰΆ
ΰΆ―,ΰΆΰ·ΰΆΈ ΰ·ΰ·ΰΆ³ΰΆΊΰ· ΰΆ
ΰΆ―,True,0.0,0.0,1.0,1.0,1.0,0.001
|
| 81 |
+
80,mama bus eke enawa,ΰΆΈΰΆΈ bus ΰΆΰΆΰ· ΰΆΰΆ±ΰ·ΰ·,ΰΆΈΰΆΈ bus ΰΆΰΆΰ· ΰΆΰΆ±ΰ·ΰ·,True,0.0,0.0,1.0,1.0,1.0,0.002
|
| 82 |
+
81,ganu depala ekka market giya,ΰΆΰ·ΰΆ±ΰ· ΰΆ―ΰ·ΰΆ΄ΰΆ½ ΰΆΰΆΰ·ΰΆ market ΰΆΰ·ΰΆΊΰ·,ΰΆΰΆ«ΰ· ΰΆ―ΰ·ΰΆ΄ΰ·
ΰΆΰΆΰ·ΰΆΰ· market ΰΆΰ·ΰΆΊΰ·,False,0.1538,0.6,0.0,0.4,1.0,0.07
|
| 83 |
+
82,watura bonna one,ΰ·ΰΆΰ·ΰΆ» ΰΆΆΰ·ΰΆ±ΰ·ΰΆ± ΰΆΰΆ±ΰ·,ΰ·ΰΆΰ·ΰΆ» ΰΆΆΰ·ΰΆ±ΰ·ΰΆ± ΰΆΰΆ±ΰ·,True,0.0,0.0,1.0,1.0,1.0,0.03
|
| 84 |
+
83,shop eke sugar nati nisa mama giye na,shop ΰΆΰΆΰ· sugar ΰΆ±ΰ·ΰΆΰ· ΰΆ±ΰ·ΰ·ΰ· ΰΆΈΰΆΈ ΰΆΰ·ΰΆΊΰ· ΰΆ±ΰ·,shop ΰΆΰΆΰ· sugar ΰΆ±ΰ·ΰΆΰ· ΰΆ±ΰ·ΰ·ΰ· ΰΆΈΰΆΈ ΰΆΰ·ΰΆΊΰ· ΰΆ±ΰ·,True,0.0,0.0,1.0,1.0,1.0,0.003
|
| 85 |
+
84,hri hari,ΰ·ΰΆ»ΰ· ΰ·ΰΆ»ΰ·,ΰ·ΰΆ»ΰ· ΰ·ΰΆ»ΰ·,True,0.0,0.0,1.0,1.0,1.0,0.001
|
| 86 |
+
85,mta ep,ΰΆΈΰΆ§ ΰΆΰΆ΄ΰ·,ΰΆΈΰΆ§ ΰΆΰΆ΄ΰ·,True,0.0,0.0,1.0,1.0,1.0,0.001
|
| 87 |
+
86,ok hari,ok ΰ·ΰΆ»ΰ·,ok ΰ·ΰΆ»ΰ·,True,0.0,0.0,1.0,1.0,1.0,0.001
|
| 88 |
+
87,ape game hari dewal wenne,ΰΆ
ΰΆ΄ΰ· ΰΆΰΆΈΰ· ΰ·ΰΆ»ΰ· ΰΆ―ΰ·ΰ·ΰΆ½ΰ· ΰ·ΰ·ΰΆ±ΰ·ΰΆ±ΰ·,ΰΆ
ΰΆ΄ΰ· ΰΆΰΆΈΰ· ΰ·ΰΆ»ΰ· ΰΆ―ΰ·ΰ·ΰΆ½ΰ· ΰ·ΰ·ΰΆ±ΰ·ΰΆ±ΰ·,False,0.0417,0.2,0.6687,0.8,1.0,0.082
|
| 89 |
+
88,mta dan one na,ΰΆΈΰΆ§ ΰΆ―ΰ·ΰΆ±ΰ· ΰΆΰΆ±ΰ· ΰΆ±ΰ·,ΰΆΈΰΆ§ ΰΆ―ΰ·ΰΆ±ΰ· ΰΆΰΆ±ΰ· ΰΆ±ΰ·,True,0.0,0.0,1.0,1.0,1.0,0.002
|
| 90 |
+
89,eka hari hondai wage dnuna nisa mama giya,ΰΆΰΆ ΰ·ΰΆ»ΰ· ΰ·ΰ·ΰΆ³ΰΆΊΰ· ΰ·ΰΆΰ· ΰΆ―ΰ·ΰΆ±ΰ·ΰΆ±ΰ· ΰΆ±ΰ·ΰ·ΰ· ΰΆΈΰΆΈ ΰΆΰ·ΰΆΊΰ·,ΰΆΰΆ ΰ·ΰΆ»ΰ· ΰ·ΰ·ΰΆ³ΰΆΊΰ· ΰ·ΰΆΰ· ΰΆ―ΰ·ΰΆ±ΰ·ΰΆ±ΰ· ΰΆ±ΰ·ΰ·ΰ· ΰΆΈΰΆΈ ΰΆΰ·ΰΆΊΰ·,True,0.0,0.0,1.0,1.0,1.0,0.044
|
| 91 |
+
90,game eke mission hari amarui,game ΰΆΰΆΰ· mission ΰ·ΰΆ»ΰ· ΰΆ
ΰΆΈΰ·ΰΆ»ΰ·ΰΆΊΰ·,ΰΆΰΆΈΰ· ΰΆΰΆΰ· mission ΰ·ΰΆ»ΰ· ΰΆ
ΰΆΈΰ·ΰΆ»ΰ·ΰΆΊΰ·,False,0.1429,0.2,0.6687,0.8,0.5,0.029
|
| 92 |
+
91,mama heta yanawa,ΰΆΈΰΆΈ ΰ·ΰ·ΰΆ§ ΰΆΊΰΆ±ΰ·ΰ·,ΰΆΈΰΆΈ ΰ·ΰ·ΰΆ§ ΰΆΊΰΆ±ΰ·ΰ·,True,0.0,0.0,1.0,1.0,1.0,0.001
|
| 93 |
+
92,ey iye aawa,ΰΆΰΆΊΰ· ΰΆΰΆΊΰ· ΰΆΰ·ΰ·,ΰΆΰΆΊΰ· ΰΆΰΆΊΰ· ΰΆΰ·ΰ·,True,0.0,0.0,1.0,1.0,1.0,0.024
|
| 94 |
+
93,api dan yanawa,ΰΆ
ΰΆ΄ΰ· ΰΆ―ΰ·ΰΆ±ΰ· ΰΆΊΰΆ±ΰ·ΰ·,ΰΆ
ΰΆ΄ΰ· ΰΆ―ΰ·ΰΆ±ΰ· ΰΆΊΰΆ±ΰ·ΰ·,True,0.0,0.0,1.0,1.0,1.0,0.001
|
| 95 |
+
94,video eka balanna one,video ΰΆΰΆ ΰΆΆΰΆ½ΰΆ±ΰ·ΰΆ± ΰΆΰΆ±ΰ·,video ΰΆΰΆ ΰΆΆΰΆ½ΰΆ±ΰ·ΰΆ± ΰΆΰΆ±ΰ·,True,0.0,0.0,1.0,1.0,1.0,0.042
|
| 96 |
+
95,video ekak hadamu,video ΰΆΰΆΰΆΰ· ΰ·ΰΆ―ΰΆΈΰ·,video ΰΆΰΆΰΆΰ· ΰ·ΰΆ―ΰΆΈΰ·,True,0.0,0.0,1.0,1.0,1.0,0.023
|
| 97 |
+
96,video eke comment eka balanna,video ΰΆΰΆΰ· comment ΰΆΰΆ ΰΆΆΰΆ½ΰΆ±ΰ·ΰΆ±,video ΰΆΰΆΰ· comment ΰΆΰΆ ΰΆΆΰΆ½ΰΆ±ΰ·ΰΆ±,True,0.0,0.0,1.0,1.0,1.0,0.041
|
| 98 |
+
97,video ekata like ekak danna,video ΰΆΰΆΰΆ§ like ΰΆΰΆΰΆΰ· ΰΆ―ΰ·ΰΆ±ΰ·ΰΆ±,video ΰΆΰΆΰΆ§ like ΰΆΰΆΰΆΰ· ΰΆ―ΰ·ΰΆ±ΰ·ΰΆ±,True,0.0,0.0,1.0,1.0,1.0,0.059
|
| 99 |
+
98,lecture eka record karala share karanna,lecture ΰΆΰΆ record ΰΆΰΆ»ΰΆ½ΰ· share ΰΆΰΆ»ΰΆ±ΰ·ΰΆ±,lecture ΰΆΰΆ record ΰΆΰΆ»ΰΆ½ΰ· share ΰΆΰΆ»ΰΆ±ΰ·ΰΆ±,True,0.0,0.0,1.0,1.0,1.0,0.046
|
| 100 |
+
99,research paper eka liyanna one heta wge,research paper ΰΆΰΆ ΰΆ½ΰ·ΰΆΊΰΆ±ΰ·ΰΆ± ΰΆΰΆ±ΰ· ΰ·ΰ·ΰΆ§ ΰ·ΰΆΰ·,research paper ΰΆΰΆ ΰΆ½ΰ·ΰΆΊΰΆ±ΰ·ΰΆ± ΰΆΰΆ±ΰ· ΰ·ΰ·ΰΆ§ ΰ·ΰΆΰ·,True,0.0,0.0,1.0,1.0,1.0,0.074
|
| 101 |
+
100,exam eka hari amarui,exam ΰΆΰΆ ΰ·ΰΆ»ΰ· ΰΆ
ΰΆΈΰ·ΰΆ»ΰ·ΰΆΊΰ·,exam ΰΆΰΆ ΰ·ΰΆ»ΰ· ΰΆ
ΰΆΈΰ·ΰΆ»ΰ·ΰΆΊΰ·,True,0.0,0.0,1.0,1.0,1.0,0.02
|
| 102 |
+
101,sprint eka plan karamu Monday,sprint ΰΆΰΆ plan ΰΆΰΆ»ΰΆΈΰ· Monday,sprint ΰΆΰΆ plan ΰΆΰΆ»ΰΆΈΰ· Monday,True,0.0,0.0,1.0,1.0,1.0,0.02
|
| 103 |
+
102,ape team eka deadline ekata kala,ΰΆ
ΰΆ΄ΰ· team ΰΆΰΆ deadline ΰΆΰΆΰΆ§ ΰΆΰ·
ΰ·,ΰΆ
ΰΆ΄ΰ· team ΰΆΰΆ deadline ΰΆΰΆΰΆ§ ΰΆΰ·
ΰ·,True,0.0,0.0,1.0,1.0,1.0,0.044
|
| 104 |
+
103,standup eke mokada kiwwe,standup ΰΆΰΆΰ· ΰΆΈΰ·ΰΆΰΆ― ΰΆΰ·ΰ·ΰ·ΰ·ΰ·,standup ΰΆΰΆΰ· ΰΆΈΰ·ΰΆΰΆ― ΰΆΰ·ΰ·ΰ·ΰ·ΰ·,False,0.0435,0.25,0.0,0.75,1.0,0.048
|
| 105 |
+
104,reel eka viral una,reel ΰΆΰΆ viral ΰΆΰΆ±ΰ·,reel ΰΆΰΆ viral ΰΆΰΆ±ΰ·,True,0.0,0.0,1.0,1.0,1.0,0.022
|
| 106 |
+
105,group chat eke mokada wenne,group chat ΰΆΰΆΰ· ΰΆΈΰ·ΰΆΰΆ― ΰ·ΰ·ΰΆ±ΰ·ΰΆ±ΰ·,group chat ΰΆΰΆΰ· ΰΆΈΰ·ΰΆΰΆ― ΰ·ΰ·ΰΆ±ΰ·ΰΆ±ΰ·,False,0.0385,0.2,0.6687,0.8,1.0,0.047
|
| 107 |
+
106,oyge profile picture eka lassanai,ΰΆΰΆΊΰ·ΰΆΰ· profile picture ΰΆΰΆ ΰΆ½ΰ·ΰ·ΰ·ΰΆ±ΰΆΊΰ·,ΰΆΰΆΊΰ·ΰΆΰ· profile picture ΰΆΰΆ ΰΆ½ΰ·ΰ·ΰ·ΰΆ±ΰΆΊΰ·,True,0.0,0.0,1.0,1.0,1.0,0.048
|
| 108 |
+
107,mama enne na heta,ΰΆΈΰΆΈ ΰΆΰΆ±ΰ·ΰΆ±ΰ· ΰΆ±ΰ· ΰ·ΰ·ΰΆ§,ΰΆΈΰΆΈ ΰΆΰΆ±ΰ·ΰΆ±ΰ· ΰΆ±ΰ· ΰ·ΰ·ΰΆ§,True,0.0,0.0,1.0,1.0,1.0,0.024
|
| 109 |
+
108,eka karanna epa,ΰΆΰΆ ΰΆΰΆ»ΰΆ±ΰ·ΰΆ± ΰΆΰΆ΄ΰ·,ΰΆΰΆ ΰΆΰΆ»ΰΆ±ΰ·ΰΆ± ΰΆΰΆ΄ΰ·,True,0.0,0.0,1.0,1.0,1.0,0.001
|
| 110 |
+
109,kawruwath enne na,ΰΆΰ·ΰ·ΰΆ»ΰ·ΰ·ΰΆΰ· ΰΆΰΆ±ΰ·ΰΆ±ΰ· ΰΆ±ΰ·,ΰΆΰ·ΰ·ΰΆ»ΰ·ΰ·ΰΆΰ· ΰΆΰΆ±ΰ·ΰΆ±ΰ· ΰΆ±ΰ·,True,0.0,0.0,1.0,1.0,1.0,0.045
|
| 111 |
+
110,oya koheda ynne,ΰΆΰΆΊΰ· ΰΆΰ·ΰ·ΰ·ΰΆ― ΰΆΊΰΆ±ΰ·ΰΆ±ΰ·,ΰΆΰΆΊΰ· ΰΆΰ·ΰ·ΰ·ΰΆ― ΰΆΊΰΆ±ΰ·ΰΆ±ΰ·,False,0.1333,0.6667,0.0,0.3333,1.0,0.047
|
fine_tuning/attempt_2_informal_sinhala/experiment_notes.txt
ADDED
|
@@ -0,0 +1,102 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
================================================================================
|
| 2 |
+
SinCode β MLM Fine-Tuning Experiment 2 (Informal Sinhala Corpus)
|
| 3 |
+
Date: 29β30 March 2026
|
| 4 |
+
Author: Kalana Chandrasekara
|
| 5 |
+
================================================================================
|
| 6 |
+
|
| 7 |
+
MOTIVATION
|
| 8 |
+
--------------------------------------------------------------------------------
|
| 9 |
+
Experiment 1 (Wikipedia corpus) produced no measurable downstream improvement
|
| 10 |
+
in transliteration accuracy. The Wikipedia corpus is formal-register text,
|
| 11 |
+
which differs significantly from informal Singlish conversation patterns.
|
| 12 |
+
|
| 13 |
+
This experiment uses a large informal Sinhala dataset to better align the
|
| 14 |
+
language model with the colloquial register used in Singlish input.
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
DATASET
|
| 18 |
+
--------------------------------------------------------------------------------
|
| 19 |
+
Source: 9wimu9/sinhala_dataset_59m (Hugging Face Hub)
|
| 20 |
+
Description: 59M mixed-register Sinhala text samples, primarily informal
|
| 21 |
+
Subset used: 500,000 samples (full 59M would require ~15 days)
|
| 22 |
+
After filter: 499,801 samples (removed rows with < 10 characters)
|
| 23 |
+
Tokenized: 271,000 sequences (filtered sequences with < 20 tokens)
|
| 24 |
+
Train split: 257,450 samples (95%)
|
| 25 |
+
Eval split: 13,550 samples (5%)
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
TRAINING CONFIGURATION
|
| 29 |
+
--------------------------------------------------------------------------------
|
| 30 |
+
Base model: FacebookAI/xlm-roberta-base (~270M parameters)
|
| 31 |
+
Output directory: xlm-roberta-sinhala-v2/final/
|
| 32 |
+
Published on HF: Kalana001/xlm-roberta-sinhala-sincode
|
| 33 |
+
|
| 34 |
+
βββββββββββββββββββββββββββββ¬βββββββββββββββββββ
|
| 35 |
+
β Parameter β Value β
|
| 36 |
+
βββββββββββββββββββββββββββββΌβββββββββββββββββββ€
|
| 37 |
+
β Epochs β 1 β
|
| 38 |
+
β Per-device batch size β 8 β
|
| 39 |
+
β Gradient accumulation β 4 β
|
| 40 |
+
β Effective batch size β 32 β
|
| 41 |
+
β Learning rate β 2e-5 β
|
| 42 |
+
β LR scheduler β Cosine β
|
| 43 |
+
β Warmup steps β ~503 β
|
| 44 |
+
β Weight decay β 0.01 β
|
| 45 |
+
β MLM probability β 0.15 β
|
| 46 |
+
β Max sequence length β 256 tokens β
|
| 47 |
+
β FP16 (mixed precision) β Yes β
|
| 48 |
+
β Total training steps β ~8,046 β
|
| 49 |
+
β Seed β 42 β
|
| 50 |
+
βββββββββββββββββββββββββββββ΄βββββββββββββββββββ
|
| 51 |
+
|
| 52 |
+
Hardware:
|
| 53 |
+
GPU: NVIDIA GeForce RTX 5060 Ti (16 GB VRAM)
|
| 54 |
+
CPU: AMD Ryzen 7 5800X (8-core / 16-thread)
|
| 55 |
+
CUDA: 13.2 (compute capability 12.0 β Blackwell)
|
| 56 |
+
OS: Windows, Python 3.14
|
| 57 |
+
|
| 58 |
+
Estimated training time: ~1.5 hours
|
| 59 |
+
|
| 60 |
+
|
| 61 |
+
RESULTS
|
| 62 |
+
--------------------------------------------------------------------------------
|
| 63 |
+
Training loss: 9.556 β 8.776 (-8.2%)
|
| 64 |
+
Eval loss: 2.1877 β 2.0621
|
| 65 |
+
|
| 66 |
+
Perplexity comparison (15 Sinhala test sentences):
|
| 67 |
+
Base model (no fine-tuning): 35.35
|
| 68 |
+
Experiment 2 (this model): 15.95
|
| 69 |
+
Improvement: -54.9%
|
| 70 |
+
|
| 71 |
+
See training_loss.png for the full loss curve across 8,046 steps.
|
| 72 |
+
Run compare_perplexity.py to reproduce the perplexity figures.
|
| 73 |
+
|
| 74 |
+
|
| 75 |
+
DOWNSTREAM EVALUATION (110 sentences)
|
| 76 |
+
--------------------------------------------------------------------------------
|
| 77 |
+
Dataset: evaluation/dataset_110.csv
|
| 78 |
+
Predictions: eval_predictions.csv (this folder)
|
| 79 |
+
Diagnostics: eval_diagnostics.json (this folder)
|
| 80 |
+
|
| 81 |
+
βββββββββββββββββββββββββββ¬ββββββββββββββ
|
| 82 |
+
β Metric β Value β
|
| 83 |
+
βββββββββββββββββββββββββββΌββββββββββββββ€
|
| 84 |
+
β Exact Match β 101/110 β
|
| 85 |
+
β β (91.8%) β
|
| 86 |
+
β Character Error Rate β 0.0073 β
|
| 87 |
+
β Word Error Rate β 0.0245 β
|
| 88 |
+
β BLEU Score β 0.947 β
|
| 89 |
+
βββββββββββββββββββββββββββ΄ββββββββββββββ
|
| 90 |
+
|
| 91 |
+
|
| 92 |
+
FILES IN THIS FOLDER
|
| 93 |
+
--------------------------------------------------------------------------------
|
| 94 |
+
train_mlm.py β Training script (in parent fine_tuning/ folder)
|
| 95 |
+
experiment_notes.txt β This file
|
| 96 |
+
training_loss.png β Loss curve graph across all training steps
|
| 97 |
+
plot_training.py β Script used to generate training_loss.png
|
| 98 |
+
compare_perplexity.py β Script to measure perplexity before/after fine-tuning
|
| 99 |
+
eval_predictions.csv β 110-sentence evaluation predictions
|
| 100 |
+
eval_diagnostics.json β Per-sentence diagnostic breakdown
|
| 101 |
+
|
| 102 |
+
================================================================================
|
fine_tuning/attempt_2_informal_sinhala/plot_training.py
ADDED
|
@@ -0,0 +1,117 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Plot training loss curve from Hugging Face trainer state files."""
|
| 2 |
+
import json
|
| 3 |
+
import sys
|
| 4 |
+
from pathlib import Path
|
| 5 |
+
|
| 6 |
+
import matplotlib.pyplot as plt
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
def load_history(output_dir):
    """Collect de-duplicated train/eval log entries from checkpoint trainer_state.json files.

    Later checkpoints overwrite earlier ones for the same (step, is_eval) key,
    so resumed runs do not contribute duplicate points.

    Returns:
        (train_entries, eval_entries): lists of log records, ordered by step,
        containing "loss" and "eval_loss" respectively.

    Raises:
        FileNotFoundError: if *output_dir* holds no checkpoint-* directories.
    """
    root = Path(output_dir)
    checkpoints = [d for d in root.iterdir() if d.is_dir() and d.name.startswith("checkpoint-")]
    checkpoints.sort(key=lambda d: int(d.name.split("-")[1]))

    if not checkpoints:
        raise FileNotFoundError(f"No checkpoint directories found in {output_dir}")

    # Keyed on (step, is_eval) so a train record and an eval record at the
    # same step are both kept, while duplicates collapse to the latest one.
    by_key = {}
    for ckpt in checkpoints:
        state_file = ckpt / "trainer_state.json"
        if not state_file.exists():
            continue
        state = json.loads(state_file.read_text(encoding="utf-8"))
        for record in state.get("log_history", []):
            step = record.get("step")
            if step is not None:
                by_key[(step, "eval_loss" in record)] = record

    ordered = [by_key[key] for key in sorted(by_key)]
    train_entries = [r for r in ordered if "loss" in r]
    eval_entries = [r for r in ordered if "eval_loss" in r]
    return train_entries, eval_entries
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
def plot_loss(train_entries, eval_entries):
    """Plot raw + smoothed training loss with eval-loss markers and save a PNG.

    Args:
        train_entries: log records containing "step" and "loss" keys.
        eval_entries: log records containing "step" and "eval_loss" keys.

    Side effects: writes the chart to misc/training_loss_v2.png — creating
    the directory first, so a fresh checkout does not crash — then closes
    the figure. Assumes non-empty train_entries (guarded by the caller).
    """
    steps = [int(entry["step"]) for entry in train_entries]
    losses = [float(entry["loss"]) for entry in train_entries]
    eval_steps = [int(entry["step"]) for entry in eval_entries]
    eval_losses = [float(entry["eval_loss"]) for entry in eval_entries]

    fig, ax1 = plt.subplots(figsize=(12, 6))

    # Raw training loss (drawn faint; the smoothed curve is the readable one)
    ax1.plot(steps, losses, color='#2196F3', alpha=0.4, linewidth=0.8, label='Train Loss (raw)')

    # Smoothed training loss (trailing moving average)
    window = min(20, len(losses) // 5) if len(losses) > 10 else 1
    if window > 1:
        smoothed = []
        for i in range(len(losses)):
            start = max(0, i - window + 1)
            smoothed.append(sum(losses[start:i+1]) / (i - start + 1))
        ax1.plot(steps, smoothed, color='#1565C0', linewidth=2, label=f'Train Loss (smoothed, w={window})')

    # Eval loss points, each labelled with its value
    if eval_losses:
        ax1.scatter(eval_steps, eval_losses, color='#F44336', s=80, zorder=5,
                    marker='*', label='Eval Loss')
        for s, l in zip(eval_steps, eval_losses):
            ax1.annotate(f'{l:.4f}', (s, l), textcoords="offset points",
                         xytext=(10, 10), fontsize=8, color='#F44336')

    ax1.set_xlabel('Training Steps', fontsize=12)
    ax1.set_ylabel('Loss', fontsize=12)
    ax1.set_title('SinCode MLM Fine-Tuning β Experiment 2\n(9wimu9/sinhala_dataset_59m, 500K samples, 1 epoch)',
                  fontsize=13, fontweight='bold')
    ax1.legend(loc='upper right', fontsize=10)
    ax1.grid(True, alpha=0.3)

    # Mark the first and last training-loss values
    ax1.annotate(f'Start: {losses[0]:.3f}', (steps[0], losses[0]),
                 textcoords="offset points", xytext=(15, -10), fontsize=9,
                 arrowprops=dict(arrowstyle='->', color='gray'))
    ax1.annotate(f'End: {losses[-1]:.3f}', (steps[-1], losses[-1]),
                 textcoords="offset points", xytext=(-60, 15), fontsize=9,
                 arrowprops=dict(arrowstyle='->', color='gray'))

    # Overall loss-reduction summary box
    reduction = ((losses[0] - losses[-1]) / losses[0]) * 100
    ax1.text(0.02, 0.02, f'Loss reduction: {losses[0]:.3f} β {losses[-1]:.3f} ({reduction:+.1f}%)',
             transform=ax1.transAxes, fontsize=10,
             bbox=dict(boxstyle='round,pad=0.3', facecolor='lightyellow', alpha=0.8))

    plt.tight_layout()

    # Save — ensure the output directory exists (savefig raises otherwise)
    out_path = 'misc/training_loss_v2.png'
    Path(out_path).parent.mkdir(parents=True, exist_ok=True)
    plt.savefig(out_path, dpi=150, bbox_inches='tight')
    print(f"Chart saved to: {out_path}")
    plt.close(fig)
|
| 95 |
+
|
| 96 |
+
|
| 97 |
+
# Script entry point: summarise and plot the loss history of the given run
# (directory defaults to the Experiment-2 output dir).
if __name__ == "__main__":
    output_dir = sys.argv[1] if len(sys.argv) > 1 else "xlm-roberta-sinhala-v2"

    try:
        train_entries, eval_entries = load_history(output_dir)
    except FileNotFoundError as exc:
        # Plain string: the usage line has no placeholders (was a pointless f-string).
        print("Usage: python plot_training.py <output_dir>")
        print(f" {exc}")
        sys.exit(1)

    if not train_entries:
        print("No training loss entries found in checkpoint trainer_state.json files.")
        sys.exit(1)

    steps = [int(entry["step"]) for entry in train_entries]
    losses = [float(entry["loss"]) for entry in train_entries]
    print(f"Found {len(train_entries)} training loss entries, {len(eval_entries)} eval loss entries")
    print(f"Steps: {steps[0]} β {steps[-1]}")
    print(f"Loss: {losses[0]:.3f} β {losses[-1]:.3f} ({((losses[0]-losses[-1])/losses[0])*100:+.1f}%)")

    plot_loss(train_entries, eval_entries)
|
fine_tuning/attempt_2_informal_sinhala/training_loss.png
ADDED
|
Git LFS Details
|
fine_tuning/train_mlm.py
ADDED
|
@@ -0,0 +1,196 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Continued MLM pre-training of XLM-RoBERTa on Sinhala text.
|
| 3 |
+
|
| 4 |
+
Experiment 1 (completed): Sinhala Wikipedia (23K articles) β no improvement.
|
| 5 |
+
Experiment 2 (current): 9wimu9/sinhala_dataset_59m β 500K informal samples.
|
| 6 |
+
|
| 7 |
+
Usage:
|
| 8 |
+
python train_mlm.py # full training (500K, 1 epoch)
|
| 9 |
+
python train_mlm.py --samples 100 --test # quick smoke test
|
| 10 |
+
python train_mlm.py --samples 1000000 # 1M samples
|
| 11 |
+
"""
|
| 12 |
+
|
| 13 |
+
import argparse
|
| 14 |
+
import os
|
| 15 |
+
import math
|
| 16 |
+
import torch
|
| 17 |
+
from transformers import (
|
| 18 |
+
AutoTokenizer,
|
| 19 |
+
AutoModelForMaskedLM,
|
| 20 |
+
DataCollatorForLanguageModeling,
|
| 21 |
+
TrainingArguments,
|
| 22 |
+
Trainer,
|
| 23 |
+
)
|
| 24 |
+
from datasets import load_dataset
|
| 25 |
+
|
| 26 |
+
# βββ Defaults ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 27 |
+
|
| 28 |
+
BASE_MODEL = "FacebookAI/xlm-roberta-base"
|
| 29 |
+
OUTPUT_DIR = "xlm-roberta-sinhala-v2" # saved model directory (v2 = informal data)
|
| 30 |
+
DATASET = "9wimu9/sinhala_dataset_59m" # 59M mixed-register Sinhala samples
|
| 31 |
+
DEFAULT_SAMPLES = 500_000 # subset size (full 59M is ~15 days)
|
| 32 |
+
MAX_SEQ_LEN = 256 # token block size
|
| 33 |
+
MLM_PROB = 0.15 # mask probability (same as original)
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
def parse_args():
    """Define and parse the command-line interface for a training run.

    Returns an argparse.Namespace; module-level constants supply the defaults
    for model name, output directory, sequence length and sample count.
    """
    parser = argparse.ArgumentParser(description="Continue MLM pre-training on Sinhala text")
    parser.add_argument("--base_model", default=BASE_MODEL, help="Base HuggingFace model")
    parser.add_argument("--output_dir", default=OUTPUT_DIR, help="Output directory for fine-tuned model")
    parser.add_argument("--epochs", type=int, default=1, help="Number of training epochs (1 is enough for 500K)")
    parser.add_argument("--batch_size", type=int, default=8, help="Per-device train batch size")
    parser.add_argument("--grad_accum", type=int, default=4, help="Gradient accumulation steps")
    parser.add_argument("--lr", type=float, default=2e-5, help="Learning rate")
    parser.add_argument("--max_seq_len", type=int, default=MAX_SEQ_LEN, help="Max sequence length")
    parser.add_argument("--samples", type=int, default=DEFAULT_SAMPLES, help="Number of samples to use from dataset")
    parser.add_argument("--test", action="store_true", help="Quick smoke test with 100 samples")
    parser.add_argument("--resume", action="store_true", help="Resume from latest checkpoint")
    return parser.parse_args()
|
| 49 |
+
|
| 50 |
+
|
| 51 |
+
def load_and_prepare_dataset(tokenizer, max_seq_len, num_samples, test_mode=False):
    """Stream a subset of the Sinhala corpus and return it tokenized for MLM.

    Args:
        tokenizer: HuggingFace tokenizer used for tokenization.
        max_seq_len: truncation length in tokens.
        num_samples: how many rows to pull from the stream.
        test_mode: when True, override num_samples with 100 for a smoke test.
    """
    if test_mode:
        num_samples = 100

    print(f"π₯ Loading {DATASET} (streaming {num_samples:,} samples)...")
    stream = load_dataset(DATASET, split="train", streaming=True)

    # Pull rows off the stream until num_samples have been *seen*; rows with
    # fewer than 10 stripped characters are dropped (so the kept count may
    # come out slightly lower).
    texts = []
    for index, row in enumerate(stream):
        if index >= num_samples:
            break
        sample = row.get("text", "")
        if len(sample.strip()) >= 10:  # skip near-empty rows
            texts.append(sample)
        if (index + 1) % 50_000 == 0:
            print(f" ... loaded {index + 1:,} / {num_samples:,}")

    print(f"π Collected {len(texts):,} samples (after filtering empty rows)")

    # Materialise as an in-memory HF Dataset so .map()/.filter() are available.
    from datasets import Dataset
    raw = Dataset.from_dict({"text": texts})
    del texts  # free memory

    def tokenize_fn(examples):
        # padding=False: dynamic padding happens later in the data collator.
        return tokenizer(
            examples["text"],
            truncation=True,
            max_length=max_seq_len,
            padding=False,
            return_special_tokens_mask=True,
        )

    print("π€ Tokenizing...")
    tokenized = raw.map(
        tokenize_fn,
        batched=True,
        num_proc=1 if test_mode else 4,
        remove_columns=raw.column_names,
        desc="Tokenizing",
    )

    # Drop sequences shorter than 20 tokens — too little context for MLM.
    tokenized = tokenized.filter(lambda x: len(x["input_ids"]) >= 20)

    print(f"β {len(tokenized):,} tokenized samples ready")
    return tokenized
|
| 101 |
+
|
| 102 |
+
|
| 103 |
+
def main():
    """Run continued MLM pre-training end to end.

    Flow: parse CLI args, detect device, load base model + tokenizer,
    stream and tokenize the dataset, train with dynamic masking, save the
    final model, and report eval loss / perplexity.
    """
    args = parse_args()

    # --- Device check ----------------------------------------------------
    device = "cuda" if torch.cuda.is_available() else "cpu"
    if device == "cuda":
        gpu_name = torch.cuda.get_device_name(0)
        gpu_mem = torch.cuda.get_device_properties(0).total_memory / (1024 ** 3)
        print(f"π₯οΈ GPU: {gpu_name} ({gpu_mem:.1f} GB)")
    else:
        print("β οΈ No GPU detected β training will be slow!")

    # --- Load tokenizer & model ------------------------------------------
    print(f"π¦ Loading {args.base_model}...")
    tokenizer = AutoTokenizer.from_pretrained(args.base_model)
    model = AutoModelForMaskedLM.from_pretrained(args.base_model)

    # --- Dataset ----------------------------------------------------------
    dataset = load_and_prepare_dataset(tokenizer, args.max_seq_len, args.samples, args.test)

    # Split 95/5 for train/validation (fixed seed for reproducibility)
    split = dataset.train_test_split(test_size=0.05, seed=42)
    train_dataset = split["train"]
    eval_dataset = split["test"]
    print(f"π Train: {len(train_dataset):,} | Eval: {len(eval_dataset):,}")

    # --- Data collator (dynamic masking each epoch) -----------------------
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm=True,
        mlm_probability=MLM_PROB,
    )

    # --- Training arguments -----------------------------------------------
    # Effective batch = batch_size * grad_accum = 8 * 4 = 32
    total_steps = math.ceil(len(train_dataset) / (args.batch_size * args.grad_accum)) * args.epochs

    training_args = TrainingArguments(
        output_dir=args.output_dir,
        num_train_epochs=args.epochs,
        per_device_train_batch_size=args.batch_size,
        per_device_eval_batch_size=args.batch_size * 2,  # no grads at eval time
        gradient_accumulation_steps=args.grad_accum,
        learning_rate=args.lr,
        weight_decay=0.01,
        warmup_steps=max(100, total_steps // 16),  # ~6% warmup, floor of 100
        lr_scheduler_type="cosine",
        eval_strategy="steps",
        eval_steps=max(500, total_steps // 10),
        save_strategy="steps",
        save_steps=max(500, total_steps // 10),  # aligned with eval_steps for best-model tracking
        save_total_limit=2,
        logging_steps=50,
        fp16=device == "cuda",
        dataloader_num_workers=2,
        load_best_model_at_end=True,
        metric_for_best_model="eval_loss",
        greater_is_better=False,
        report_to="none",  # no wandb/tensorboard
        seed=42,
    )

    # --- Trainer -----------------------------------------------------------
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        data_collator=data_collator,
        processing_class=tokenizer,
    )

    # --- Train -------------------------------------------------------------
    print("π Starting training...")
    # Only resume when explicitly asked AND the output dir already exists.
    resume_checkpoint = args.resume and os.path.isdir(args.output_dir)
    trainer.train(resume_from_checkpoint=resume_checkpoint if resume_checkpoint else None)

    # --- Save final model --------------------------------------------------
    final_path = os.path.join(args.output_dir, "final")
    print(f"πΎ Saving fine-tuned model to {final_path}/")
    trainer.save_model(final_path)
    tokenizer.save_pretrained(final_path)

    # --- Final eval --------------------------------------------------------
    metrics = trainer.evaluate()
    print(f"\nπ Final eval loss: {metrics['eval_loss']:.4f}")
    print(f" Perplexity: {math.exp(metrics['eval_loss']):.2f}")
    print(f"\nβ Model saved to: {os.path.abspath(final_path)}")
    # Plain string: this line has no placeholders (was a pointless f-string).
    print(" To use in SinCode, update DEFAULT_MODEL_NAME in core/constants.py to:")
    print(f' DEFAULT_MODEL_NAME = r"{os.path.abspath(final_path)}"')
|
| 193 |
+
|
| 194 |
+
|
| 195 |
+
# Standard script-entry guard: run training only when executed directly,
# not when this module is imported.
if __name__ == "__main__":
    main()
|
images/SinCodeLogo.jpg
ADDED
|
images/background.png
ADDED
|
Git LFS Details
|
requirements.txt
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
streamlit
|
| 2 |
+
transformers
|
| 3 |
+
torch
|
| 4 |
+
requests
|
| 5 |
+
pillow
|
sincode_model.py
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
SinCode: Context-Aware Singlish-to-Sinhala Transliteration Engine
|
| 3 |
+
|
| 4 |
+
Backward-compatible entry point β all logic lives in the ``core/`` package.
|
| 5 |
+
This module re-exports the public API so that existing imports
|
| 6 |
+
(``from sincode_model import BeamSearchDecoder``) continue to work.
|
| 7 |
+
|
| 8 |
+
Author: Kalana Chandrasekara (2026)
|
| 9 |
+
"""
|
| 10 |
+
|
| 11 |
+
# ββ Re-exports (public API) βββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 12 |
+
|
| 13 |
+
from core.decoder import BeamSearchDecoder # noqa: F401
|
| 14 |
+
from core.scorer import CandidateScorer, ScoredCandidate, WordDiagnostic # noqa: F401
|
| 15 |
+
from core.dictionary import DictionaryAdapter # noqa: F401
|
| 16 |
+
from core.transliterate import rule_based_transliterate # noqa: F401
|
| 17 |
+
from core.english import ENGLISH_VOCAB, CORE_ENGLISH_WORDS, load_english_vocab # noqa: F401
|
| 18 |
+
from core.mappings import COMMON_WORDS, CONTEXT_WORDS_STANDALONE # noqa: F401
|
| 19 |
+
from core.constants import ( # noqa: F401
|
| 20 |
+
DEFAULT_MODEL_NAME, DEFAULT_DICTIONARY_PATH,
|
| 21 |
+
W_MLM, W_FIDELITY, W_RANK,
|
| 22 |
+
MAX_CANDIDATES, DEFAULT_BEAM_WIDTH,
|
| 23 |
+
FIDELITY_SCALE, DICT_FIDELITY_DAMP, MIN_ENGLISH_LEN,
|
| 24 |
+
)
|