"""Build a multilingual transliteration dataset (Bengali, Hindi, Tamil).

Loads per-language JSON splits with an explicit schema, drops unused
columns, and attaches a T5-style "transliterate to <lang>: <word>"
input/target pair to every row.
"""
from datasets import load_dataset, Features, Value, concatenate_datasets  # NOTE: concatenate_datasets kept from original import; unused here

# Root directory holding one sub-directory per language.
DATA_ROOT = "/Users/apple/Documents/Projects/data"

# 3-letter prefix used in file names / unique identifiers -> ISO 639-1 code.
# Insertion order (ben, hin, tam) also fixes the per-split file order below,
# matching the original hand-written lists.
LANG_CODES = {"ben": "bn", "hin": "hi", "tam": "ta"}

# Dataset split name -> file-name suffix on disk ("validation" files are "*_valid.json").
SPLIT_SUFFIXES = {"train": "train", "validation": "valid", "test": "test"}

# One list of per-language files per split, e.g.
# {"train": [".../ben/ben_train.json", ".../hin/hin_train.json", ...], ...}
data_files = {
    split: [f"{DATA_ROOT}/{lang}/{lang}_{suffix}.json" for lang in LANG_CODES]
    for split, suffix in SPLIT_SUFFIXES.items()
}

# Explicit schema: column names with spaces come from the JSON files themselves.
features = Features({
    "unique_identifier": Value("string"),
    "native word": Value("string"),
    "english word": Value("string"),
    "source": Value("string"),
    "score": Value("float32"),
})

dataset = load_dataset("json", data_files=data_files, features=features)

# "source" and "score" are not needed for the transliteration task.
dataset = dataset.remove_columns(["source", "score"])


def detect_lang(example):
    """Attach a language code and T5-style input/target text to one row.

    The language is inferred from the first three characters of the row's
    ``unique_identifier`` (case-insensitive). Unknown prefixes fall back to
    Hindi ("hi"), preserving the original if/elif chain's default.

    Args:
        example: one dataset row (dict-like) with ``unique_identifier``,
            ``english word`` and ``native word`` fields.

    Returns:
        The same example, with ``lang``, ``input_text`` and ``target_text``
        fields added.
    """
    prefix = example["unique_identifier"].lower()[:3]
    lang = LANG_CODES.get(prefix, "hi")  # default "hi" kept for backward compatibility
    example["lang"] = lang
    example["input_text"] = f"transliterate to {lang}: {example['english word']}"
    example["target_text"] = example["native word"]
    return example


dataset = dataset.map(detect_lang)
print(dataset["train"][0])