Spaces:
Runtime error
Runtime error
| import pandas as pd | |
| import os | |
| import random | |
def create_dummy_data(output_dir="data", seed=None):
    """Write a tiny dummy transliteration dataset as train/val/test CSV files.

    The dataset holds 30 hand-written (romanised, native-script) word pairs
    for Hindi ("hi"), Bengali ("bn") and Tamil ("ta"). The pairs are repeated
    five times, shuffled, and split 80/10/10 into ``train.csv``, ``val.csv``
    and ``test.csv`` with the columns ``source``, ``target`` and ``lang``.

    Because every pair is repeated *before* shuffling, the same pair can end
    up in more than one split; the files are only meant for smoke-testing a
    pipeline, not for measuring generalisation.

    Args:
        output_dir: Directory the CSV files are written to. It is created
            (including missing parents) when it does not exist.
        seed: Optional seed for a reproducible shuffle. When ``None`` the
            module-level ``random`` state is used, so the split differs from
            run to run unless the caller seeded ``random`` beforehand.

    Returns:
        A ``(train_df, val_df, test_df)`` tuple of the DataFrames that were
        written to disk.
    """
    # Minimal dummy dataset: (romanised source, native-script target, language).
    data = [
        # Hindi
        ("namaste", "नमस्ते", "hi"),
        ("aap", "आप", "hi"),
        ("kya", "क्या", "hi"),
        ("kar", "कर", "hi"),
        ("rahe", "रहे", "hi"),
        ("ho", "हो", "hi"),
        ("mera", "मेरा", "hi"),
        ("naam", "नाम", "hi"),
        ("hai", "है", "hi"),
        ("bharat", "भारत", "hi"),
        # Bengali
        ("namoshkar", "নমস্কার", "bn"),
        ("apni", "আপনি", "bn"),
        ("kemon", "কেমন", "bn"),
        ("achen", "আছেন", "bn"),
        ("amar", "আমার", "bn"),
        ("nam", "নাম", "bn"),
        ("bangla", "বাংলা", "bn"),
        ("desh", "দেশ", "bn"),
        ("khabar", "খাবার", "bn"),
        ("jal", "জল", "bn"),
        # Tamil
        ("vanakkam", "வணக்கம்", "ta"),
        ("neengal", "நீங்கள்", "ta"),
        ("eppadi", "எப்படி", "ta"),
        ("irukkeenga", "இருக்கிறீர்கள்", "ta"),
        ("en", "என்", "ta"),
        ("peyar", "பெயர்", "ta"),
        ("tamil", "தமிழ்", "ta"),
        ("nadu", "நாடு", "ta"),
        ("sapadu", "சாப்பாடு", "ta"),
        ("thanni", "தண்ணீர்", "ta"),
    ]

    # Expand data slightly by duplicating to simulate a larger set for split.
    data = data * 5

    if seed is None:
        # Keep the historical behaviour: shuffle with the shared global RNG.
        random.shuffle(data)
    else:
        # A private generator gives a reproducible order without disturbing
        # the global random state other code may rely on.
        random.Random(seed).shuffle(data)

    df = pd.DataFrame(data, columns=["source", "target", "lang"])

    # Split into train, val, test (80-10-10); test takes whatever remains so
    # no row is dropped by the integer truncation above it.
    train_size = int(0.8 * len(df))
    val_size = int(0.1 * len(df))

    train_df = df.iloc[:train_size]
    val_df = df.iloc[train_size:train_size + val_size]
    test_df = df.iloc[train_size + val_size:]

    os.makedirs(output_dir, exist_ok=True)

    splits = (("train.csv", train_df), ("val.csv", val_df), ("test.csv", test_df))
    for file_name, frame in splits:
        # UTF-8 is stated explicitly because the targets are non-ASCII scripts.
        frame.to_csv(
            os.path.join(output_dir, file_name), index=False, encoding="utf-8"
        )

    print("Data generation complete.")
    print(f"Train size: {len(train_df)}")
    print(f"Val size: {len(val_df)}")
    print(f"Test size: {len(test_df)}")

    return train_df, val_df, test_df
# Generate the CSV files only when this file is executed as a script, so that
# importing the module has no side effects.
if __name__ == "__main__":
    create_dummy_data()