|
|
|
|
|
import os

# --- Configuration ----------------------------------------------------------

# Directory holding the processed parallel corpus (will be rewritten in place).
DATA_DIR = "data/processed"

# Directory where the held-back test files are written.
TEST_DIR = "data/test_sets"

# Parallel files: line N of the .ne file is assumed to be aligned with
# line N of the .en file.
SOURCE_FILE = os.path.join(DATA_DIR, "nepali.ne")
TARGET_FILE = os.path.join(DATA_DIR, "nepali.en")

# Number of sentence pairs to hold back, taken from the END of the corpus.
NUM_TEST_LINES = 500

print("--- Creating a held-back test set for Nepali ---")

os.makedirs(TEST_DIR, exist_ok=True)

# Read both sides of the corpus fully into memory.
with open(SOURCE_FILE, "r", encoding="utf-8") as f:
    source_lines = f.readlines()

with open(TARGET_FILE, "r", encoding="utf-8") as f:
    target_lines = f.readlines()

# Validate with explicit raises (not `assert`) so the checks still run
# under `python -O`, and so we fail BEFORE destructively rewriting files.
if len(source_lines) != len(target_lines):
    raise ValueError("Source and target files have different lengths!")
if len(source_lines) <= NUM_TEST_LINES:
    raise ValueError(
        f"Corpus has only {len(source_lines)} lines; need more than "
        f"{NUM_TEST_LINES} to hold back a test set."
    )

# Split: everything except the last NUM_TEST_LINES pairs stays as training
# data; the tail becomes the test set.
train_source_lines = source_lines[:-NUM_TEST_LINES]
test_source_lines = source_lines[-NUM_TEST_LINES:]

train_target_lines = target_lines[:-NUM_TEST_LINES]
test_target_lines = target_lines[-NUM_TEST_LINES:]

# Overwrite the original corpus files in place with the training portion.
with open(SOURCE_FILE, "w", encoding="utf-8") as f:
    f.writelines(train_source_lines)

with open(TARGET_FILE, "w", encoding="utf-8") as f:
    f.writelines(train_target_lines)

# Write the held-back pairs to the test directory.
with open(os.path.join(TEST_DIR, "test.ne"), "w", encoding="utf-8") as f:
    f.writelines(test_source_lines)

with open(os.path.join(TEST_DIR, "test.en"), "w", encoding="utf-8") as f:
    f.writelines(test_target_lines)

print(f"Successfully created a test set with {NUM_TEST_LINES} lines for Nepali.")
print(f"The original training files in '{DATA_DIR}' have been updated.")
|
|
|