# Spaces: Sleeping
import os

from datasets import Features, Value, concatenate_datasets, load_dataset
# Root directory holding one sub-directory per language (ben/, hin/, tam/).
# Overridable through the environment so the script is not tied to a single
# machine; the default is the original hard-coded location.
DATA_ROOT = os.environ.get(
    "TRANSLIT_DATA_ROOT", "/Users/apple/Documents/Projects/data"
)

# Language directory / file-name prefixes, in the order the files are listed.
LANGUAGES = ("ben", "hin", "tam")

# Split name handed to `load_dataset` -> suffix used in the file names
# (the validation files are named "<lang>_valid.json").
SPLIT_SUFFIXES = {"train": "train", "validation": "valid", "test": "test"}

# One list of per-language JSON files for each split.
data_files = {
    split: [f"{DATA_ROOT}/{lang}/{lang}_{suffix}.json" for lang in LANGUAGES]
    for split, suffix in SPLIT_SUFFIXES.items()
}

# Column schema passed to `load_dataset` for every file.
features = Features(
    {
        "unique_identifier": Value("string"),
        "native word": Value("string"),
        "english word": Value("string"),
        "source": Value("string"),
        "score": Value("float32"),
    }
)

dataset = load_dataset("json", data_files=data_files, features=features)

# "source" and "score" are not read by any later step in this script.
dataset = dataset.remove_columns(["source", "score"])
# Lowercase `unique_identifier` prefix -> language code used in the prompt.
_LANG_BY_PREFIX = {"hin": "hi", "ben": "bn", "tam": "ta"}

# Language code used when no known prefix matches (or the identifier is null).
_DEFAULT_LANG = "hi"


def detect_lang(example):
    """Add ``lang``, ``input_text`` and ``target_text`` to one record.

    The language is derived from the case-insensitive prefix of
    ``example["unique_identifier"]`` ("hin" -> "hi", "ben" -> "bn",
    "tam" -> "ta"). Any other identifier, including a missing/null one,
    falls back to "hi".

    Args:
        example: Mapping with the keys "unique_identifier", "native word"
            and "english word". It is modified in place.

    Returns:
        The same ``example`` object with three extra keys:
        "lang", "input_text" ("transliterate to <lang>: <english word>")
        and "target_text" (a copy of "native word").
    """
    # A null identifier would make `.lower()` raise and abort the whole
    # `map`; treat it as an empty string so it takes the default branch.
    uid = (example["unique_identifier"] or "").lower()

    lang = next(
        (
            code
            for prefix, code in _LANG_BY_PREFIX.items()
            if uid.startswith(prefix)
        ),
        _DEFAULT_LANG,
    )

    example["lang"] = lang
    example["input_text"] = f"transliterate to {lang}: {example['english word']}"
    example["target_text"] = example["native word"]
    return example
# Run detect_lang over the loaded records, adding the "lang",
# "input_text" and "target_text" fields to each one.
dataset = dataset.map(function=detect_lang)

# Quick sanity check: show the first processed record of the train split.
print(dataset["train"][0])