File size: 1,441 Bytes
fd2f49a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
# scripts/create_sinhala_test_set.py
import os
from datasets import load_dataset

# --- Configuration ---
DATA_DIR = "data/processed"
TEST_DIR = "data/test_sets"
DATASET_NAME = "Programmer-RD-AI/sinhala-english-singlish-translation"
NUM_TEST_LINES = 500
# ---


def split_index(total_rows, num_test_rows):
    """Return the index of the first held-back (test) row.

    Rows ``[0, index)`` stay in the training set and rows
    ``[index, total_rows)`` become the test set.

    Args:
        total_rows: Number of rows available in the dataset.
        num_test_rows: Number of trailing rows to hold back for testing.

    Raises:
        ValueError: If ``num_test_rows`` is negative or larger than
            ``total_rows``; either would make the split ranges meaningless.
    """
    if not 0 <= num_test_rows <= total_rows:
        raise ValueError(
            f"Cannot hold back {num_test_rows} test lines from a dataset "
            f"with {total_rows} rows."
        )
    return total_rows - num_test_rows


def write_parallel_files(examples, source_path, target_path):
    """Write the Sinhala/English sides of ``examples`` to two aligned files.

    Each example contributes exactly one line to each file, so line *i* of
    ``source_path`` corresponds to line *i* of ``target_path``.

    Args:
        examples: Iterable of mappings with ``'Sinhala'`` and ``'English'``
            string entries.
        source_path: Destination file for the Sinhala text.
        target_path: Destination file for the English text.

    Returns:
        The number of examples written.
    """
    written = 0
    with open(source_path, "w", encoding="utf-8") as f_source, \
         open(target_path, "w", encoding="utf-8") as f_target:
        for example in examples:
            f_source.write(example['Sinhala'] + "\n")
            f_target.write(example['English'] + "\n")
            written += 1
    return written


def main():
    """Split the dataset into training files and a held-back test set."""
    print("--- Creating a held-back test set for Sinhalese ---")
    # Both output directories must exist before any file is opened for
    # writing; previously only TEST_DIR was created.
    os.makedirs(DATA_DIR, exist_ok=True)
    os.makedirs(TEST_DIR, exist_ok=True)

    # Load the dataset from Hugging Face
    dataset = load_dataset(DATASET_NAME, split='train')

    # Validate the split before touching any output file, so a too-small
    # dataset fails loudly instead of overwriting the training files.
    total_rows = len(dataset)
    boundary = split_index(total_rows, NUM_TEST_LINES)

    # Split the dataset: the last NUM_TEST_LINES rows are held back.
    train_dataset = dataset.select(range(boundary))
    test_dataset = dataset.select(range(boundary, total_rows))

    # Write the new training files
    write_parallel_files(
        train_dataset,
        os.path.join(DATA_DIR, "sinhala.si"),
        os.path.join(DATA_DIR, "sinhala.en"),
    )

    # Write the new test files
    num_test_written = write_parallel_files(
        test_dataset,
        os.path.join(TEST_DIR, "test.si"),
        os.path.join(TEST_DIR, "test.en"),
    )

    # Report the count that was actually written rather than the constant.
    print(f"Successfully created a test set with {num_test_written} lines for Sinhalese.")
    print(f"The original training files in '{DATA_DIR}' have been updated.")


if __name__ == "__main__":
    main()