from typing import Iterable

import pandas as pd
from datasets import load_dataset
from sklearn.model_selection import train_test_split


def prepare_data(
    languages: Iterable[str] = ("hin", "ben", "tam"),
    *,
    test_size: float = 0.1,
    seed: int = 42,
) -> None:
    """Build a combined transliteration dataset and write train/val CSV files.

    For every language code, the ``train`` split of ``ai4bharat/Aksharantar``
    is loaded, each English word is prefixed with a ``<lang>`` tag to form the
    ``source`` column, and the native-script word becomes the ``target``
    column. All languages are concatenated, shuffled and split, and the two
    parts are written to ``train.csv`` and ``val.csv`` in the current working
    directory.

    Args:
        languages: Language codes passed as the dataset configuration name.
            A single string is treated as one language code.
        test_size: Fraction (or absolute count) of rows held out for
            validation; forwarded to ``train_test_split``.
        seed: Random seed used for both the shuffle and the split so that
            repeated runs produce identical files.

    Raises:
        ValueError: If ``languages`` is empty.
    """
    # A bare string would otherwise be iterated character by character.
    if isinstance(languages, str):
        languages = [languages]

    frames = []
    for lang in languages:
        # Load the full (non-streaming) train split from Hugging Face.
        dataset = load_dataset(
            "ai4bharat/Aksharantar", lang, split="train", streaming=False
        )
        df = pd.DataFrame(dataset)

        # Prepare: "<lang> english_word" -> native_word.
        # NOTE(review): the column names 'english' and 'native' are assumed;
        # confirm them against the dataset card -- a KeyError here means the
        # schema uses different names.
        df["source"] = f"<{lang}> " + df["english"]
        df["target"] = df["native"]
        frames.append(df[["source", "target"]])

    if not frames:
        # pd.concat([]) would raise an opaque "No objects to concatenate".
        raise ValueError("prepare_data() requires at least one language code")

    # Seed the shuffle as well: with an unseeded shuffle, the fixed
    # random_state on the split below does not make the output reproducible.
    full_df = (
        pd.concat(frames)
        .sample(frac=1, random_state=seed)
        .reset_index(drop=True)
    )

    # Split: by default 90% train, 10% validation.
    train, val = train_test_split(
        full_df, test_size=test_size, random_state=seed
    )

    train.to_csv("train.csv", index=False)
    val.to_csv("val.csv", index=False)
    print(f"Dataset prepared with {len(full_df)} samples.")


if __name__ == "__main__":
    prepare_data()