File size: 1,068 Bytes
aee05e1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
import pandas as pd
from datasets import load_dataset
from sklearn.model_selection import train_test_split

def prepare_data(languages=("hin", "ben", "tam")):
    """Build shuffled train/val CSVs for multilingual transliteration.

    For each language code, loads the Aksharantar train split from the
    Hugging Face Hub, prefixes every romanized (English-script) word with
    a ``<lang>`` tag so one model can be conditioned on the target script,
    pairs it with the native-script word, pools and shuffles all languages,
    then writes a 90/10 split to ``train.csv`` and ``val.csv``.

    Parameters
    ----------
    languages : sequence of str, default ("hin", "ben", "tam")
        Aksharantar config codes to combine. A tuple default avoids the
        mutable-default-argument pitfall; any iterable of codes works.

    Side effects
    ------------
    Writes ``train.csv`` and ``val.csv`` to the current directory and
    prints a summary line.
    """
    combined_data = []

    for lang in languages:
        # Aksharantar exposes 'english' (romanized) and 'native' columns.
        dataset = load_dataset("ai4bharat/Aksharantar", lang, split='train', streaming=False)
        df = pd.DataFrame(dataset)

        # Prepend a language tag so the model knows which script to emit,
        # e.g. source "<hin> namaste" -> target in Devanagari.
        df['source'] = f"<{lang}> " + df['english']
        df['target'] = df['native']

        combined_data.append(df[['source', 'target']])

    # ignore_index avoids duplicate indices from the per-language frames;
    # fixed random_state makes the shuffle reproducible, matching the
    # seeded train_test_split below.
    full_df = (
        pd.concat(combined_data, ignore_index=True)
        .sample(frac=1, random_state=42)
        .reset_index(drop=True)
    )

    # Split: 90% train, 10% validation.
    train, val = train_test_split(full_df, test_size=0.1, random_state=42)

    train.to_csv("train.csv", index=False)
    val.to_csv("val.csv", index=False)
    print(f"Dataset prepared with {len(full_df)} samples.")

if __name__ == "__main__":
    # Script entry point: download, combine, shuffle, split, and write CSVs
    # for the default language set.
    prepare_data()