Spaces:
Sleeping
Sleeping
File size: 502 Bytes
from datasets import load_dataset
import pandas as pd
def prepare_dataset(output_path: str = "eng_spa.parquet", seed: int = 42) -> None:
    """Build a shuffled "eng-spa" parallel corpus and save it as Parquet.

    Loads the "eng-spa" configuration of the ``ageron/tatoeba_mt_train``
    dataset, stacks its "validation" and "test" splits, shuffles the rows
    reproducibly, and writes only the ``source_text`` and ``target_text``
    columns to *output_path*.

    Args:
        output_path: Destination Parquet file. Defaults to the previously
            hard-coded ``"eng_spa.parquet"``.
        seed: Random state used for the shuffle. Defaults to the previously
            hard-coded ``42``.
    """
    dataset = load_dataset("ageron/tatoeba_mt_train", "eng-spa")
    df = (
        pd.concat(
            [dataset["validation"].to_pandas(), dataset["test"].to_pandas()],
            axis=0,
        )
        # frac=1 samples every row, which amounts to a full shuffle.
        .sample(frac=1, random_state=seed)
        # Replace the shuffled row labels with a fresh 0..n-1 index.
        .reset_index(drop=True)
    )
    df[["source_text", "target_text"]].to_parquet(output_path)
    print(f"Data saved to {output_path}")
if __name__ == "__main__":
    # Run the export only when executed as a script, not when imported.
    prepare_dataset()