Spaces:
Sleeping
Sleeping
File size: 1,596 Bytes
d14e502 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 |
from datasets import load_dataset, Features, Value, concatenate_datasets
# Per-split JSON file lists for Bengali, Hindi and Tamil transliteration data.
# Paths follow the pattern <root>/<lang>/<lang>_<suffix>.json.
_DATA_ROOT = "/Users/apple/Documents/Projects/data"
_LANGS = ("ben", "hin", "tam")
_SPLIT_SUFFIX = {"train": "train", "validation": "valid", "test": "test"}
data_files = {
    split: [f"{_DATA_ROOT}/{lang}/{lang}_{suffix}.json" for lang in _LANGS]
    for split, suffix in _SPLIT_SUFFIX.items()
}
# Column schema for the raw JSON rows; column names contain spaces
# (e.g. "native word"), matching the keys used by detect_lang below.
_COLUMN_DTYPES = {
    "unique_identifier": "string",
    "native word": "string",
    "english word": "string",
    "source": "string",
    "score": "float32",
}
features = Features({name: Value(dtype) for name, dtype in _COLUMN_DTYPES.items()})
# Load all three splits as a DatasetDict, enforcing the schema above.
dataset = load_dataset("json", data_files=data_files, features=features)
# "source" and "score" are metadata only; drop them before preprocessing.
dataset = dataset.remove_columns(["source", "score"])
def detect_lang(example):
    """Tag a row with its language code and build seq2seq input/target text.

    The language is inferred from the prefix of ``unique_identifier``
    (case-insensitive). Unknown prefixes fall back to ``"hi"``.
    Adds ``lang``, ``input_text`` and ``target_text`` keys in place and
    returns the (mutated) example.
    """
    prefix_to_code = {"hin": "hi", "ben": "bn", "tam": "ta"}
    uid = example["unique_identifier"].lower()
    # First matching prefix wins; default to Hindi when none matches.
    lang = next(
        (code for prefix, code in prefix_to_code.items() if uid.startswith(prefix)),
        "hi",
    )
    example["lang"] = lang
    example["input_text"] = f"transliterate to {lang}: {example['english word']}"
    example["target_text"] = example["native word"]
    return example
# Apply the tagging/formatting row-by-row across every split.
dataset = dataset.map(detect_lang)
# Sanity check: show one processed training example.
print(dataset["train"][0])
|