"""Create a held-back Sinhala-English test set from a parallel dataset.

Loads the ``train`` split of ``DATASET_NAME`` with ``datasets.load_dataset``,
reserves its last ``NUM_TEST_LINES`` rows as a test set, and writes both
portions as line-aligned text files: one ``.si`` file with the ``Sinhala``
column and one ``.en`` file with the ``English`` column, where line *i* of
each file comes from the same dataset row.
"""

import os

from datasets import load_dataset

# Directory that receives the training pair (sinhala.si / sinhala.en).
DATA_DIR = "data/processed"
# Directory that receives the held-back pair (test.si / test.en).
TEST_DIR = "data/test_sets"
# Dataset identifier passed to ``load_dataset``.
DATASET_NAME = "Programmer-RD-AI/sinhala-english-singlish-translation"
# Number of rows taken from the end of the split for the test set.
NUM_TEST_LINES = 500


def _single_line(text):
    """Return *text* with embedded line breaks replaced by single spaces.

    Each row must occupy exactly one line in both output files; a stray
    line break inside a field would shift every following line and silently
    misalign the source and target files.
    """
    return text.replace("\r\n", " ").replace("\n", " ").replace("\r", " ")


def _write_parallel_files(rows, source_path, target_path):
    """Write the ``Sinhala`` and ``English`` fields of *rows* line by line.

    Args:
        rows: Iterable of mappings that contain ``'Sinhala'`` and
            ``'English'`` string values.
        source_path: Destination file for the Sinhala side.
        target_path: Destination file for the English side.
    """
    with open(source_path, "w", encoding="utf-8") as f_source, \
            open(target_path, "w", encoding="utf-8") as f_target:
        for example in rows:
            f_source.write(_single_line(example['Sinhala']) + "\n")
            f_target.write(_single_line(example['English']) + "\n")


print("--- Creating a held-back test set for Sinhalese ---")

# Both output directories are needed before any file is opened; creating
# only TEST_DIR makes the training-file write fail on a fresh checkout.
os.makedirs(DATA_DIR, exist_ok=True)
os.makedirs(TEST_DIR, exist_ok=True)

dataset = load_dataset(DATASET_NAME, split='train')

# Validate before touching any file: with too few rows the split point goes
# negative, which would leave the training set empty and hand negative
# indices to ``select`` for the test set.
total_rows = len(dataset)
if total_rows < NUM_TEST_LINES:
    raise ValueError(
        f"Dataset '{DATASET_NAME}' has only {total_rows} rows; "
        f"cannot hold back {NUM_TEST_LINES} rows for testing."
    )

# Everything before the split point is training data; the tail is the test set.
split_index = total_rows - NUM_TEST_LINES
train_dataset = dataset.select(range(split_index))
test_dataset = dataset.select(range(split_index, total_rows))

_write_parallel_files(
    train_dataset,
    os.path.join(DATA_DIR, "sinhala.si"),
    os.path.join(DATA_DIR, "sinhala.en"),
)

_write_parallel_files(
    test_dataset,
    os.path.join(TEST_DIR, "test.si"),
    os.path.join(TEST_DIR, "test.en"),
)

print(f"Successfully created a test set with {NUM_TEST_LINES} lines for Sinhalese.")
print(f"The original training files in '{DATA_DIR}' have been updated.")