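# Source path: Project-1/src/prepare_data.py
# Hosting-page residue kept for provenance (not part of the program):
# "Abhishek11k's picture" / "Upload 31 files" / commit e1d9ec2 (verified)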
import pandas as pd
import os
import random
def create_dummy_data(output_dir: str = "data", seed: "int | None" = None) -> None:
    """Generate dummy transliteration data for Hindi, Bengali, and Tamil.

    Builds a small table of (romanized source, native-script target,
    language code) rows, repeats it five times, shuffles it, and writes an
    80/10/10 train/val/test split as ``train.csv``, ``val.csv`` and
    ``test.csv`` with the columns ``source``, ``target`` and ``lang``.

    Note that the rows are duplicated *before* the split, so the same pair
    can end up in more than one split. That is acceptable for exercising a
    pipeline end to end, but the resulting val/test files are not a
    held-out set.

    Args:
        output_dir: Directory the CSV files are written to. It is created
            if it does not exist. Defaults to ``"data"`` (relative to the
            current working directory).
        seed: Optional seed for a reproducible shuffle. When ``None`` the
            module-level ``random`` generator is used, so the result
            depends on the global random state.
    """
    # Minimal dummy dataset: (source, target, lang)
    data = [
        # Hindi
        ("namaste", "नमस्ते", "hi"),
        ("aap", "आप", "hi"),
        ("kya", "क्या", "hi"),
        ("kar", "कर", "hi"),
        ("rahe", "रहे", "hi"),
        ("ho", "हो", "hi"),
        ("mera", "मेरा", "hi"),
        ("naam", "नाम", "hi"),
        ("hai", "है", "hi"),
        ("bharat", "भारत", "hi"),
        # Bengali
        ("namoshkar", "নমস্কার", "bn"),
        ("apni", "আপনি", "bn"),
        ("kemon", "কেমন", "bn"),
        ("achen", "আছেন", "bn"),
        ("amar", "আমার", "bn"),
        ("nam", "নাম", "bn"),
        ("bangla", "বাংলা", "bn"),
        ("desh", "দেশ", "bn"),
        ("khabar", "খাবার", "bn"),
        ("jal", "জল", "bn"),
        # Tamil
        ("vanakkam", "வணக்கம்", "ta"),
        ("neengal", "நீங்கள்", "ta"),
        ("eppadi", "எப்படி", "ta"),
        ("irukkeenga", "இருக்கிறீர்கள்", "ta"),
        ("en", "என்", "ta"),
        ("peyar", "பெயர்", "ta"),
        ("tamil", "தமிழ்", "ta"),
        ("nadu", "நாடு", "ta"),
        ("sapadu", "சாப்பாடு", "ta"),
        ("thanni", "தண்ணீர்", "ta"),
    ]

    # Expand data slightly by duplicating to simulate a larger set for split
    rows = data * 5
    if seed is None:
        # Use the global generator so a caller's random.seed() still applies.
        random.shuffle(rows)
    else:
        # A private generator gives a reproducible order without touching
        # the global random state.
        random.Random(seed).shuffle(rows)

    df = pd.DataFrame(rows, columns=["source", "target", "lang"])

    # Split into train, val, test (80-10-10); test takes whatever remains
    # after the two truncated sizes.
    train_size = int(0.8 * len(df))
    val_size = int(0.1 * len(df))
    val_end = train_size + val_size
    splits = {
        "train": df.iloc[:train_size],
        "val": df.iloc[train_size:val_end],
        "test": df.iloc[val_end:],
    }

    os.makedirs(output_dir, exist_ok=True)
    for split_name, split_df in splits.items():
        # Targets are non-ASCII scripts, so state the encoding explicitly.
        split_df.to_csv(
            os.path.join(output_dir, f"{split_name}.csv"),
            index=False,
            encoding="utf-8",
        )

    print("Data generation complete.")
    print(f"Train size: {len(splits['train'])}")
    print(f"Val size: {len(splits['val'])}")
    print(f"Test size: {len(splits['test'])}")
# Script entry point: write the CSV splits only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    create_dummy_data()