Dinesh310 committed on
Commit
aee05e1
·
verified ·
1 Parent(s): d414f06

Create training/preprocess.py

Browse files
Files changed (1) hide show
  1. training/preprocess.py +30 -0
training/preprocess.py ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ from datasets import load_dataset
3
+ from sklearn.model_selection import train_test_split
4
+
5
def _pick_column(df, candidates, lang):
    """Return the first name in *candidates* that is a column of *df*.

    Raises:
        KeyError: if none of the candidate names is present; the message
            lists the columns that are available to ease debugging.
    """
    for name in candidates:
        if name in df.columns:
            return name
    raise KeyError(
        f"Dataset for language {lang!r} has none of the expected columns "
        f"{list(candidates)}; available columns: {list(df.columns)}"
    )


def prepare_data(languages=("hin", "ben", "tam"), *, seed=42):
    """Build shuffled train/validation CSV files from Aksharantar.

    For every language code the ``train`` split of ``ai4bharat/Aksharantar``
    is loaded and reduced to two columns:

    * ``source`` -- the English-side word prefixed with a ``<lang>`` tag
    * ``target`` -- the native-script word

    All languages are concatenated, shuffled, split 90% / 10% and written to
    ``train.csv`` and ``val.csv`` in the current working directory.

    Args:
        languages: Iterable of dataset configuration names (language codes).
            A single string is treated as one language rather than being
            iterated character by character.
        seed: Seed used for both the shuffle and the train/val split so that
            repeated runs produce identical files.

    Raises:
        ValueError: if ``languages`` is empty.
        KeyError: if a loaded dataset lacks the expected word columns.
    """
    if isinstance(languages, str):
        languages = [languages]
    languages = list(languages)
    if not languages:
        # Fail early with a clear message instead of an opaque pd.concat error.
        raise ValueError("prepare_data() needs at least one language code.")

    frames = []
    for lang in languages:
        # Load the dataset from Hugging Face.
        dataset = load_dataset(
            "ai4bharat/Aksharantar", lang, split="train", streaming=False
        )
        # Arrow-backed conversion; avoids building the frame row by row.
        df = dataset.to_pandas()

        # NOTE(review): the original code assumed 'english' / 'native'; the
        # dataset card appears to name them 'english word' / 'native word'.
        # Accept either spelling -- confirm against the actual dataset.
        english_col = _pick_column(df, ("english", "english word"), lang)
        native_col = _pick_column(df, ("native", "native word"), lang)

        # Prepare: <lang> english_word -> native_word
        frames.append(
            pd.DataFrame(
                {
                    "source": f"<{lang}> " + df[english_col],
                    "target": df[native_col],
                }
            )
        )

    # Seed the shuffle too; otherwise the split below is not reproducible
    # even though train_test_split itself is seeded.
    full_df = (
        pd.concat(frames)
        .sample(frac=1, random_state=seed)
        .reset_index(drop=True)
    )

    # Split: 90% Train, 10% Val
    train, val = train_test_split(full_df, test_size=0.1, random_state=seed)

    train.to_csv("train.csv", index=False)
    val.to_csv("val.csv", index=False)
    print(f"Dataset prepared with {len(full_df)} samples.")
28
+
29
# Script entry point: when run directly, build the CSVs with the defaults.
if __name__ == "__main__":
    prepare_data()