Spaces:
Runtime error
Runtime error
Create training/preprocess.py
Browse files- training/preprocess.py +30 -0
training/preprocess.py
ADDED
|
@@ -0,0 +1,30 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pandas as pd
|
| 2 |
+
from datasets import load_dataset
|
| 3 |
+
from sklearn.model_selection import train_test_split
|
| 4 |
+
|
| 5 |
+
def prepare_data(languages=("hin", "ben", "tam")):
    """Build shuffled train/validation CSV files for transliteration training.

    For every language code the ``train`` split of ``ai4bharat/Aksharantar``
    is loaded. Each English word is prefixed with a ``<lang>`` tag to form the
    ``source`` column and the native-script word becomes the ``target``
    column. The per-language frames are concatenated, shuffled, split 90/10
    and written to ``train.csv`` and ``val.csv`` in the current working
    directory.

    Args:
        languages: Iterable of dataset configuration names (language codes).
            A single string is treated as one language code rather than
            being iterated character by character.

    Raises:
        ValueError: If ``languages`` is empty.
        KeyError: If a loaded dataset has none of the expected English or
            native word columns.
    """
    if isinstance(languages, str):
        languages = [languages]
    languages = list(languages)
    if not languages:
        # Fail before any download instead of letting pd.concat raise its
        # less helpful "No objects to concatenate" ValueError later.
        raise ValueError("languages must contain at least one language code")

    # One seed for both the shuffle and the split so reruns produce the
    # same train.csv / val.csv.
    seed = 42
    frames = []

    for lang in languages:
        # Load the dataset from Hugging Face.
        dataset = load_dataset(
            "ai4bharat/Aksharantar", lang, split="train", streaming=False
        )
        # Use the library's own conversion rather than building the frame
        # from a Python-level iteration over every row.
        df = dataset.to_pandas()

        # NOTE(review): the original code read 'english' / 'native'; the
        # published dataset appears to name these columns 'english word' /
        # 'native word' -- confirm against the dataset card.
        english = df[_pick_column(df, ("english", "english word"))]
        native = df[_pick_column(df, ("native", "native word"))]

        # Prepare: "<lang> english_word" -> native_word. A fresh frame is
        # built so an existing 'source' column in the dataset is not mixed up
        # with the model input column.
        frames.append(
            pd.DataFrame({"source": f"<{lang}> " + english, "target": native})
        )

    full_df = (
        pd.concat(frames)
        .sample(frac=1, random_state=seed)
        .reset_index(drop=True)
    )

    # Split: 90% train, 10% validation.
    train, val = train_test_split(full_df, test_size=0.1, random_state=seed)

    train.to_csv("train.csv", index=False)
    val.to_csv("val.csv", index=False)
    print(f"Dataset prepared with {len(full_df)} samples.")


def _pick_column(df, candidates):
    """Return the first name in ``candidates`` that is a column of ``df``."""
    for name in candidates:
        if name in df.columns:
            return name
    raise KeyError(
        f"none of the columns {list(candidates)} found; "
        f"available columns: {list(df.columns)}"
    )
|
| 28 |
+
|
| 29 |
+
if __name__ == "__main__":
    # Script entry point: run the preprocessing with the default languages.
    prepare_data()
|