Spaces:
Runtime error
Runtime error
| import pandas as pd | |
| from datasets import load_dataset | |
| from sklearn.model_selection import train_test_split | |
def _pick_column(df, candidates):
    """Return the first name in *candidates* that is a column of *df*.

    Raises:
        KeyError: if none of the candidates is present; the message lists
            the columns that are actually available to ease debugging.
    """
    for name in candidates:
        if name in df.columns:
            return name
    raise KeyError(
        f"none of the columns {list(candidates)} found; "
        f"available columns: {list(df.columns)}"
    )


def prepare_data(languages=("hin", "ben", "tam"), *, val_size=0.1, seed=42):
    """Build ``train.csv`` and ``val.csv`` for multilingual transliteration.

    For every language code, the ``train`` split of the
    ``ai4bharat/Aksharantar`` dataset is loaded and turned into
    ``source``/``target`` pairs, where ``source`` is the romanized word
    prefixed with a ``<lang>`` tag and ``target`` is the native-script word.
    All languages are concatenated, shuffled, split and written to the
    current working directory.

    Args:
        languages: Iterable of dataset configuration names (language codes).
            A single string is treated as one language code.
        val_size: Value forwarded to ``train_test_split(test_size=...)``;
            the default keeps the 90% / 10% train/validation split.
        seed: Seed used for both the shuffle and the split so that repeated
            runs produce the same files.

    Raises:
        ValueError: if ``languages`` is empty.
        KeyError: if a loaded dataset lacks the expected word columns.
    """
    # A bare string would otherwise be iterated character by character.
    if isinstance(languages, str):
        languages = [languages]
    languages = list(languages)
    if not languages:
        # Fail early with a clear message instead of pandas'
        # "No objects to concatenate" after the loop.
        raise ValueError("prepare_data needs at least one language code")

    frames = []
    for lang in languages:
        # Load the dataset from Hugging Face.
        dataset = load_dataset(
            "ai4bharat/Aksharantar", lang, split="train", streaming=False
        )
        df = dataset.to_pandas()

        # NOTE(review): the dataset card appears to name these columns
        # "english word" / "native word" rather than "english" / "native";
        # both spellings are accepted -- confirm against the loaded schema.
        english_col = _pick_column(df, ("english", "english word"))
        native_col = _pick_column(df, ("native", "native word"))

        # Prepare: <lang> english_word -> native_word
        frames.append(
            pd.DataFrame(
                {
                    "source": f"<{lang}> " + df[english_col],
                    "target": df[native_col],
                }
            )
        )

    # Seed the shuffle as well; an unseeded shuffle made the output files
    # differ between runs even though the split itself was seeded.
    full_df = (
        pd.concat(frames, ignore_index=True)
        .sample(frac=1, random_state=seed)
        .reset_index(drop=True)
    )

    # Split: 90% Train, 10% Val (with the default val_size).
    train, val = train_test_split(full_df, test_size=val_size, random_state=seed)
    train.to_csv("train.csv", index=False)
    val.to_csv("val.csv", index=False)
    print(f"Dataset prepared with {len(full_df)} samples.")
# Script entry point: when executed directly (not imported), run the data
# preparation with its default arguments.
if __name__ == "__main__":
    prepare_data()