"""Build shuffled train/validation/test splits from per-split TSV files.

For each split, every matching ``*.tsv`` file under ``DATA_ROOT`` (searched
recursively) is read as two tab-separated columns, cleaned, shuffled and then
written twice into the current working directory:

* ``<split>.csv``  -- columns ``lang`` and ``text``;
* ``<split>.txt``  -- one ``__label__<lang> <text>`` line per row.
"""
from pathlib import Path

import pandas as pd

# Directory that is searched recursively for the per-split TSV files.
DATA_ROOT = Path("/nfsmounts/datastore/langid2")


def load_split(pattern, root=DATA_ROOT, random_state=None):
    """Load, clean and shuffle every TSV file matching *pattern* under *root*.

    Args:
        pattern: Glob pattern handed to ``Path.rglob``, e.g. ``"*_train.tsv"``.
        root: Directory to search recursively.
        random_state: Optional seed forwarded to ``DataFrame.sample``; the
            default ``None`` keeps the shuffle non-deterministic.

    Returns:
        A DataFrame with columns ``lang`` and ``text`` and a fresh 0..n-1
        index, rows in shuffled order.

    Raises:
        FileNotFoundError: If no file under *root* matches *pattern*.
    """
    paths = sorted(Path(root).rglob(pattern))
    if not paths:
        # pd.concat([]) would otherwise fail with an opaque
        # "No objects to concatenate" error.
        raise FileNotFoundError(f"no files matching {pattern!r} under {root}")

    raw = pd.concat(
        [pd.read_csv(p, sep="\t", names=["_text", "lang"]) for p in paths],
        ignore_index=True,
    )

    # The first six characters of the raw text column are discarded
    # (presumably a fixed-width prefix such as an ID -- TODO confirm).
    frame = raw.assign(text=raw["_text"].str[6:].str.strip()).drop(columns="_text")

    # Drop rows whose text contains the literal "xxx".
    # NOTE(review): this replaces ``.query("'xxx' not in text")``, which pandas
    # does not evaluate as a per-row substring test (``in``/``not in`` inside
    # ``query`` are membership tests, and a scalar on the left falls back to
    # Python's ``in`` against the Series index). Confirm that substring
    # removal is the intended behaviour.
    has_marker = frame["text"].str.contains("xxx", regex=False, na=False)

    return (
        frame[~has_marker]
        .sample(frac=1, random_state=random_state)
        .reset_index(drop=True)
    )


def fasttext_lines(frame):
    """Return one ``__label__<lang> <text>`` string per row of *frame*.

    Newlines inside a formatted row are replaced by spaces so that every
    example stays on a single line.
    """
    return [
        f"__label__{lang} {text}".replace("\n", " ")
        for lang, text in zip(frame["lang"], frame["text"])
    ]


def write_fasttext(frame, path):
    """Write *frame* to *path* as newline-joined ``__label__`` lines (UTF-8)."""
    # Explicit UTF-8: the default for write_text depends on the locale, and
    # the CSV outputs written by to_csv are UTF-8 already.
    Path(path).write_text("\n".join(fasttext_lines(frame)), encoding="utf-8")


if __name__ == "__main__":
    train = load_split("*_train.tsv")
    validation = load_split("*_dev.tsv")
    test = load_split("*_test.tsv")

    train.to_csv("train.csv", index=False)
    validation.to_csv("validation.csv", index=False)
    test.to_csv("test.csv", index=False)

    write_fasttext(train, "train.txt")
    write_fasttext(validation, "validation.txt")
    write_fasttext(test, "test.txt")