# multilingual-transliteration / prepare_data.py
# HF Deploy: deploy multilingual transliteration app to Hugging Face Spaces
# (commit d14e502)
from datasets import load_dataset, Features, Value, concatenate_datasets
# Root directory holding one sub-folder per language (ben/hin/tam), each with
# {lang}_{suffix}.json files. Change this single constant when the data moves.
DATA_ROOT = "/Users/apple/Documents/Projects/data"
LANGS = ("ben", "hin", "tam")
# Dataset split name -> filename suffix on disk (the "validation" split is
# stored as "*_valid.json").
_SPLIT_SUFFIX = {"train": "train", "validation": "valid", "test": "test"}

# Per-split lists of JSON files, one file per language, built from the table
# above instead of nine hand-written absolute paths.
data_files = {
    split: [f"{DATA_ROOT}/{lang}/{lang}_{suffix}.json" for lang in LANGS]
    for split, suffix in _SPLIT_SUFFIX.items()
}
# Declared schema for the raw JSON records; load_dataset validates every file
# against these column names and dtypes.
_COLUMN_DTYPES = {
    "unique_identifier": "string",
    "native word": "string",
    "english word": "string",
    "source": "string",
    "score": "float32",
}
features = Features({name: Value(dtype) for name, dtype in _COLUMN_DTYPES.items()})
# Load all per-language JSON files into one DatasetDict keyed by split,
# enforcing the declared schema, then drop the metadata columns ("source",
# "score") that the model never consumes.
dataset = load_dataset(
    "json",
    data_files=data_files,
    features=features,
).remove_columns(["source", "score"])
# Language code keyed by the prefix of the record's unique_identifier.
_PREFIX_TO_LANG = {"hin": "hi", "ben": "bn", "tam": "ta"}

def detect_lang(example):
    """Annotate one record with its language and seq2seq input/target text.

    Mutates *example* in place (and returns it, as Dataset.map expects):
    adds "lang", "input_text" (a "transliterate to <lang>: ..." prompt built
    from the "english word" column) and "target_text" (the "native word"
    column). Identifiers matching no known prefix fall back to Hindi ("hi").
    """
    uid = example["unique_identifier"].lower()
    lang = next(
        (code for prefix, code in _PREFIX_TO_LANG.items() if uid.startswith(prefix)),
        "hi",
    )
    example["lang"] = lang
    example["input_text"] = f"transliterate to {lang}: {example['english word']}"
    example["target_text"] = example["native word"]
    return example
# Add "lang", "input_text" and "target_text" to every record of every split.
dataset = dataset.map(detect_lang)
# Sanity check: print the first processed training example.
print(dataset["train"][0])