# Origin: Hugging Face repository file page (uploader avatar: Dinesh310).
# Commit message: "Create training/preprocess.py"
# Commit aee05e1 (verified)
import pandas as pd
from datasets import load_dataset
from sklearn.model_selection import train_test_split
def _pick_column(df, candidates):
    """Return the first name in *candidates* that is a column of *df*.

    Raises:
        KeyError: if none of the candidates is present; the message lists
            the columns that are actually available.
    """
    for name in candidates:
        if name in df.columns:
            return name
    raise KeyError(
        f"None of the expected columns {list(candidates)} were found; "
        f"available columns: {list(df.columns)}"
    )


def prepare_data(languages=("hin", "ben", "tam"), *, seed: int = 42) -> None:
    """Build shuffled train/validation CSVs for transliteration training.

    For every language code, the ``train`` split of ``ai4bharat/Aksharantar``
    is loaded and reduced to two columns:

    * ``source`` -- the romanized word prefixed with a ``<lang>`` tag,
    * ``target`` -- the native-script word.

    The per-language frames are concatenated, shuffled, split 90/10 and
    written to ``train.csv`` and ``val.csv`` in the current working directory.

    Args:
        languages: Iterable of language/config codes passed to
            ``load_dataset``. A single string is treated as one code rather
            than being iterated character by character.
        seed: Seed used for both the shuffle and the train/validation split,
            so repeated runs produce identical files.

    Raises:
        ValueError: if ``languages`` is empty.
        KeyError: if a loaded dataset has neither spelling of the expected
            romanized/native columns.
    """
    if isinstance(languages, str):
        languages = [languages]
    languages = list(languages)
    if not languages:
        # Fail before any download instead of deep inside pd.concat.
        raise ValueError("prepare_data() requires at least one language code.")

    frames = []
    for lang in languages:
        # Load the dataset from Hugging Face.
        dataset = load_dataset(
            "ai4bharat/Aksharantar", lang, split="train", streaming=False
        )
        df = pd.DataFrame(dataset)

        # NOTE(review): the column names are not visible from this file. The
        # original code assumed 'english'/'native'; the dataset card appears
        # to use 'english word'/'native word' -- confirm. Both are accepted.
        english_col = _pick_column(df, ("english", "english word"))
        native_col = _pick_column(df, ("native", "native word"))

        # Prepare: <lang> english_word -> native_word
        frames.append(
            pd.DataFrame(
                {
                    "source": f"<{lang}> " + df[english_col],
                    "target": df[native_col],
                }
            )
        )

    # Seed the shuffle as well as the split; an unseeded shuffle would make
    # the seeded split below non-reproducible.
    full_df = (
        pd.concat(frames)
        .sample(frac=1, random_state=seed)
        .reset_index(drop=True)
    )

    # Split: 90% Train, 10% Val
    train, val = train_test_split(full_df, test_size=0.1, random_state=seed)
    train.to_csv("train.csv", index=False)
    val.to_csv("val.csv", index=False)
    print(f"Dataset prepared with {len(full_df)} samples.")
# Script entry point: build the CSVs with the default language set when this
# file is executed directly; importing the module triggers nothing.
if __name__ == "__main__":
    prepare_data()