import os
import random
from typing import Optional

import pandas as pd

# Seed vocabulary: (romanized source, native-script target, language code).
_BASE_PAIRS = (
    # Hindi
    ("namaste", "नमस्ते", "hi"),
    ("aap", "आप", "hi"),
    ("kya", "क्या", "hi"),
    ("kar", "कर", "hi"),
    ("rahe", "रहे", "hi"),
    ("ho", "हो", "hi"),
    ("mera", "मेरा", "hi"),
    ("naam", "नाम", "hi"),
    ("hai", "है", "hi"),
    ("bharat", "भारत", "hi"),
    # Bengali
    ("namoshkar", "নমস্কার", "bn"),
    ("apni", "আপনি", "bn"),
    ("kemon", "কেমন", "bn"),
    ("achen", "আছেন", "bn"),
    ("amar", "আমার", "bn"),
    ("nam", "নাম", "bn"),
    ("bangla", "বাংলা", "bn"),
    ("desh", "দেশ", "bn"),
    ("khabar", "খাবার", "bn"),
    ("jal", "জল", "bn"),
    # Tamil
    ("vanakkam", "வணக்கம்", "ta"),
    ("neengal", "நீங்கள்", "ta"),
    ("eppadi", "எப்படி", "ta"),
    ("irukkeenga", "இருக்கிறீர்கள்", "ta"),
    ("en", "என்", "ta"),
    ("peyar", "பெயர்", "ta"),
    ("tamil", "தமிழ்", "ta"),
    ("nadu", "நாடு", "ta"),
    ("sapadu", "சாப்பாடு", "ta"),
    ("thanni", "தண்ணீர்", "ta"),
)


def create_dummy_data(output_dir: str = "data", seed: Optional[int] = None) -> None:
    """Generate dummy transliteration data for Hindi, Bengali, and Tamil.

    The seed vocabulary is repeated five times, shuffled, and split 80/10/10
    into ``train.csv``, ``val.csv`` and ``test.csv`` inside *output_dir*
    (created if missing). Each CSV has the columns ``source``, ``target``
    and ``lang``. The split sizes are printed when writing is done.

    Because rows are duplicated before shuffling, the same pair can land in
    more than one split; this is placeholder data, not an evaluation set.

    Args:
        output_dir: Directory that receives the three CSV files.
        seed: When given, shuffling uses a private ``random.Random(seed)``
            so the splits are reproducible. When ``None``, the module-level
            ``random`` state is used.
    """
    # Expand the data by duplication to simulate a larger set for splitting.
    rows = list(_BASE_PAIRS) * 5

    # A private generator keeps a caller-supplied seed from disturbing the
    # global random state; without a seed we defer to the global state.
    rng = random if seed is None else random.Random(seed)
    rng.shuffle(rows)

    df = pd.DataFrame(rows, columns=["source", "target", "lang"])

    # Split into train, val, test (80-10-10); test takes the remainder.
    train_size = int(0.8 * len(df))
    val_size = int(0.1 * len(df))
    val_end = train_size + val_size

    splits = {
        "train": df.iloc[:train_size],
        "val": df.iloc[train_size:val_end],
        "test": df.iloc[val_end:],
    }

    os.makedirs(output_dir, exist_ok=True)
    for split_name, split_df in splits.items():
        # UTF-8 is stated explicitly because targets are non-ASCII scripts.
        split_df.to_csv(
            os.path.join(output_dir, f"{split_name}.csv"),
            index=False,
            encoding="utf-8",
        )

    print("Data generation complete.")
    print(f"Train size: {len(splits['train'])}")
    print(f"Val size: {len(splits['val'])}")
    print(f"Test size: {len(splits['test'])}")


if __name__ == "__main__":
    create_dummy_data()