File size: 1,596 Bytes
d14e502
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
from datasets import load_dataset, Features, Value, concatenate_datasets

# Per-split JSON files for the three languages (Bengali, Hindi, Tamil).
# Generated rather than hand-enumerated: every split follows the same
# "<base>/<lang>/<lang>_<suffix>.json" layout.
data_files = {
    split: [
        f"/Users/apple/Documents/Projects/data/{lang}/{lang}_{suffix}.json"
        for lang in ("ben", "hin", "tam")
    ]
    for split, suffix in (
        ("train", "train"),
        ("validation", "valid"),
        ("test", "test"),
    )
}

# Explicit schema for the JSON rows: every column is a string except the
# numeric "score". Column names (with spaces) match the raw JSON keys.
features = Features(
    {
        column: Value("float32" if column == "score" else "string")
        for column in (
            "unique_identifier",
            "native word",
            "english word",
            "source",
            "score",
        )
    }
)

# Load all three languages per split into one DatasetDict; `load_dataset`
# concatenates the files listed under each split key row-wise.
dataset = load_dataset("json", data_files=data_files, features=features)
# "source" and "score" are not used downstream — only the word pair and the
# identifier (for language detection) are needed.
dataset = dataset.remove_columns(["source", "score"])

def detect_lang(example):
    """Add `lang`, `input_text`, and `target_text` fields to one example.

    The ISO language code is derived from the prefix of the example's
    `unique_identifier` (case-insensitive); unknown prefixes fall back to
    Hindi ("hi"). The example dict is mutated in place and returned.
    """
    prefix_to_code = (("hin", "hi"), ("ben", "bn"), ("tam", "ta"))
    uid = example["unique_identifier"].lower()
    code = next(
        (iso for prefix, iso in prefix_to_code if uid.startswith(prefix)),
        "hi",  # default when the identifier matches no known prefix
    )
    example["lang"] = code
    example["input_text"] = f"transliterate to {code}: {example['english word']}"
    example["target_text"] = example["native word"]
    return example

# Apply the per-example transform to every split, adding the `lang`,
# `input_text`, and `target_text` columns used for training.
dataset = dataset.map(detect_lang)

# Sanity check: print the first training example with its new fields.
print(dataset["train"][0])