File size: 502 Bytes
eb7f075
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
from datasets import load_dataset
import pandas as pd

def prepare_dataset(
    *,
    lang_pair: str = "eng-spa",
    output_path: str = "eng_spa.parquet",
    seed: int = 42,
) -> None:
    """Build a shuffled translation-pair Parquet file from Tatoeba data.

    Loads the ``lang_pair`` configuration of the ``ageron/tatoeba_mt_train``
    dataset, stacks its ``validation`` and ``test`` splits, shuffles the rows
    deterministically, and writes only the ``source_text`` and
    ``target_text`` columns to ``output_path``.

    Called with no arguments, it behaves exactly as the original hard-coded
    version: ``eng-spa`` pairs, seed 42, output in ``eng_spa.parquet``.

    Args:
        lang_pair: Dataset configuration name passed to ``load_dataset``.
        output_path: Destination of the Parquet file.
        seed: ``random_state`` for the shuffle, so runs are reproducible.
    """
    dataset = load_dataset("ageron/tatoeba_mt_train", lang_pair)

    # Both held-out splits are merged into a single table.
    splits = [dataset[name].to_pandas() for name in ("validation", "test")]
    combined = pd.concat(splits, axis=0)

    # sample(frac=1) returns every row in random order; the index is then
    # rebuilt so it does not carry the pre-shuffle row positions.
    shuffled = combined.sample(frac=1, random_state=seed).reset_index(drop=True)

    shuffled[["source_text", "target_text"]].to_parquet(output_path)
    # Report the path actually written rather than a duplicated literal.
    print(f"Data saved to {output_path}")
    
# Run the export only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    prepare_dataset()