Spaces:
Runtime error
Runtime error
File size: 2,539 Bytes
724838e |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 |
import pandas as pd
import os
import random
def create_dummy_data(output_dir="data", seed=None, repeat=5):
    """Generate dummy transliteration data for Hindi, Bengali, and Tamil.

    Builds a small table of (romanized source, native-script target,
    language code) rows, repeats it ``repeat`` times, shuffles it, splits it
    80/10/10 into train/val/test, and writes ``train.csv``, ``val.csv`` and
    ``test.csv`` into ``output_dir``.

    Note: rows are repeated *before* shuffling and splitting, so identical
    pairs can end up in more than one split. The output is only suitable for
    smoke-testing a pipeline, not for measuring generalization.

    Args:
        output_dir: Directory the CSV files are written to; created if it
            does not exist. Defaults to ``"data"``.
        seed: Optional seed for the shuffle. When ``None`` (the default) the
            module-level ``random`` generator is used, so the split differs
            between runs; pass an int for a reproducible split.
        repeat: How many times the base rows are duplicated before the
            split. Defaults to 5.

    Returns:
        A ``(train_df, val_df, test_df)`` tuple of pandas DataFrames with
        columns ``source``, ``target`` and ``lang``.

    Raises:
        ValueError: If ``repeat`` is smaller than 1.
    """
    if repeat < 1:
        raise ValueError("repeat must be at least 1")

    # Minimal dummy dataset: 10 words per language.
    base_rows = [
        # Hindi
        ("namaste", "नमस्ते", "hi"),
        ("aap", "आप", "hi"),
        ("kya", "क्या", "hi"),
        ("kar", "कर", "hi"),
        ("rahe", "रहे", "hi"),
        ("ho", "हो", "hi"),
        ("mera", "मेरा", "hi"),
        ("naam", "नाम", "hi"),
        ("hai", "है", "hi"),
        ("bharat", "भारत", "hi"),
        # Bengali
        ("namoshkar", "নমস্কার", "bn"),
        ("apni", "আপনি", "bn"),
        ("kemon", "কেমন", "bn"),
        ("achen", "আছেন", "bn"),
        ("amar", "আমার", "bn"),
        ("nam", "নাম", "bn"),
        ("bangla", "বাংলা", "bn"),
        ("desh", "দেশ", "bn"),
        ("khabar", "খাবার", "bn"),
        ("jal", "জল", "bn"),
        # Tamil
        ("vanakkam", "வணக்கம்", "ta"),
        ("neengal", "நீங்கள்", "ta"),
        ("eppadi", "எப்படி", "ta"),
        ("irukkeenga", "இருக்கிறீர்கள்", "ta"),
        ("en", "என்", "ta"),
        ("peyar", "பெயர்", "ta"),
        ("tamil", "தமிழ்", "ta"),
        ("nadu", "நாடு", "ta"),
        ("sapadu", "சாப்பாடு", "ta"),
        ("thanni", "தண்ணீர்", "ta"),
    ]

    # Expand data slightly by duplicating to simulate a larger set for split.
    rows = base_rows * repeat

    # A dedicated generator keeps a seeded run from disturbing the global
    # random state; without a seed, fall back to the module-level generator.
    rng = random if seed is None else random.Random(seed)
    rng.shuffle(rows)

    df = pd.DataFrame(rows, columns=["source", "target", "lang"])

    # Split into train, val, test (80-10-10); test takes the remainder so
    # that no row is dropped by the integer truncation.
    train_size = int(0.8 * len(df))
    val_size = int(0.1 * len(df))
    train_df = df.iloc[:train_size]
    val_df = df.iloc[train_size:train_size + val_size]
    test_df = df.iloc[train_size + val_size:]

    os.makedirs(output_dir, exist_ok=True)
    splits = (
        ("train.csv", train_df),
        ("val.csv", val_df),
        ("test.csv", test_df),
    )
    for filename, frame in splits:
        # Explicit UTF-8 because the targets are non-ASCII scripts.
        frame.to_csv(
            os.path.join(output_dir, filename), index=False, encoding="utf-8"
        )

    print("Data generation complete.")
    print(f"Train size: {len(train_df)}")
    print(f"Val size: {len(val_df)}")
    print(f"Test size: {len(test_df)}")

    return train_df, val_df, test_df
# Script entry point: generate the CSV splits only when this file is run
# directly, not when it is imported as a module.
if __name__ == "__main__":
    create_dummy_data()
|