import argparse
import importlib.util
import json
import os
import sys

import numpy as np
import pandas as pd

try:
    from preprocessor import DataTransformer
    from baselines.ClavaDDPM.preprocess_utils import topological_sort
except (ModuleNotFoundError, ImportError):
    # Fall back to loading preprocessor.py directly from the repository root.
    # Note: topological_sort is only available when the package import succeeds.
    base_dir = os.path.dirname(__file__)
    full_path = os.path.abspath(os.path.join(base_dir, "..", "..", "preprocessor.py"))
    spec = importlib.util.spec_from_file_location("preprocessor", full_path)
    preprocessor = importlib.util.module_from_spec(spec)
    sys.modules["preprocessor"] = preprocessor
    spec.loader.exec_module(preprocessor)
    DataTransformer = preprocessor.DataTransformer


def parse_args() -> argparse.Namespace:
    parser = argparse.ArgumentParser()
    subparsers = parser.add_subparsers(dest="op")

    pre_parser = subparsers.add_parser("pre")
    pre_parser.add_argument("--dataset-dir", "-d", default="data")
    # pre_parser.add_argument("--n-games", "-n", type=int, default=None)
    pre_parser.add_argument("--out-dir", "-o", default=".")

    post_parser = subparsers.add_parser("desimplify")
    post_parser.add_argument("--dataset-dir", "-d", default="data")

    return parser.parse_args()


def main():
    args = parse_args()
    if args.op == "pre":
        table_names = [
            "players", "courses", "course_maker", "plays",
            "clears", "likes", "records", "course_meta",
        ]

        # Load the raw tables and drop free-text / media columns.
        players = pd.read_csv(os.path.join(args.dataset_dir, "players.csv"), sep="\t")
        players = players.drop(columns=["image", "name"])
        courses = pd.read_csv(os.path.join(args.dataset_dir, "courses.csv"), sep="\t")
        courses = courses.drop(columns=["title", "thumbnail", "image"])
        plays = pd.read_csv(os.path.join(args.dataset_dir, "plays.csv"), sep="\t")
        clears = pd.read_csv(os.path.join(args.dataset_dir, "clears.csv"), sep="\t")
        likes = pd.read_csv(os.path.join(args.dataset_dir, "likes.csv"), sep="\t")
        records = pd.read_csv(os.path.join(args.dataset_dir, "records.csv"), sep="\t")
        course_meta = pd.read_csv(os.path.join(args.dataset_dir, "course-meta.csv"), sep="\t")

        # Keep only events whose (course, player) pair also appears in plays,
        # so every clear/like/record references an existing play.
        all_plays = plays[["id", "player"]].apply(lambda row: "$$".join(row.tolist()), axis=1)
        clears = clears[
            clears[["id", "player"]].apply(lambda row: "$$".join(row.tolist()), axis=1).isin(all_plays)
        ].reset_index(drop=True)
        likes = likes[
            likes[["id", "player"]].apply(lambda row: "$$".join(row.tolist()), axis=1).isin(all_plays)
        ].reset_index(drop=True)
        records = records[
            records[["id", "player"]].apply(lambda row: "$$".join(row.tolist()), axis=1).isin(all_plays)
        ].reset_index(drop=True)

        # A course's firstClear must either be missing or point at a surviving clear.
        course_meta = course_meta[
            course_meta["firstClear"].isna()
            | course_meta[["id", "firstClear"]].astype(str).apply(
                lambda row: "$$".join(row.tolist()), axis=1
            ).isin(clears[["id", "player"]].apply(lambda row: "$$".join(row.tolist()), axis=1))
        ]

        # Split the maker foreign key out of courses into its own link table.
        courses = courses[courses.maker.isna() | courses.maker.isin(players["id"])].reset_index(drop=True)
        course_maker = courses[["id", "maker"]]
        courses = courses.drop(columns=["maker"])

        os.makedirs(args.out_dir, exist_ok=True)

        # Fit (or reload) one DataTransformer per table. Foreign-key columns are
        # fitted against the referenced table's already-fitted column.
        processors = {table: DataTransformer() for table in table_names}
        if os.path.exists(os.path.join(args.out_dir, "processor.json")):
            with open(os.path.join(args.out_dir, "processor.json"), "r") as f:
                loaded = json.load(f)
            for table in table_names:
                processors[table] = DataTransformer.from_dict(loaded[table])
        else:
            processors["players"].fit(players, ["id"])
            processors["courses"].fit(courses, ["id"])
            processors["course_maker"].fit(course_maker, ["id"], ref_cols={
                "maker": processors["players"].columns["id"],
                "id": processors["courses"].columns["id"],
            })
            processors["plays"].fit(plays, ref_cols={
                "id": processors["courses"].columns["id"],
                "player": processors["players"].columns["id"],
            })
            processors["clears"].fit(clears, ref_cols={
                "id": processors["courses"].columns["id"],
                "player": processors["players"].columns["id"],
            })
            processors["likes"].fit(likes, ref_cols={
                "id": processors["courses"].columns["id"],
                "player": processors["players"].columns["id"],
            })
            processors["records"].fit(records, ref_cols={
                "id": processors["courses"].columns["id"],
                "player": processors["players"].columns["id"],
            })
            processors["course_meta"].fit(course_meta, ref_cols={
                "id": processors["courses"].columns["id"],
                "firstClear": processors["players"].columns["id"],
            })
            with open(os.path.join(args.out_dir, "processor.json"), "w") as f:
                json.dump({t: p.to_dict() for t, p in processors.items()}, f, indent=2)

        players = processors["players"].transform(players)
        courses = processors["courses"].transform(courses)
        course_maker = processors["course_maker"].transform(course_maker)
        plays = processors["plays"].transform(plays)
        clears = processors["clears"].transform(clears)
        likes = processors["likes"].transform(likes)
        records = processors["records"].transform(records)
        course_meta = processors["course_meta"].transform(course_meta)

        # Keep name -> DataFrame in one place so the write loops below
        # don't have to reach into locals().
        tables = {
            "players": players,
            "courses": courses,
            "course_maker": course_maker,
            "plays": plays,
            "clears": clears,
            "likes": likes,
            "records": records,
            "course_meta": course_meta,
        }

        os.makedirs(os.path.join(args.out_dir, "preprocessed"), exist_ok=True)
        for name in table_names:
            tables[name].to_csv(os.path.join(args.out_dir, "preprocessed", f"{name}.csv"), index=False)

        os.makedirs(os.path.join(args.out_dir, "simplified"), exist_ok=True)
        plays, course_meta, (clears, likes, records) = simplify_dataset(
            plays, course_meta, clears, likes, records
        )
        tables.update(
            plays=plays, course_meta=course_meta,
            clears=clears, likes=likes, records=records,
        )
        for name in table_names:
            tables[name].to_csv(os.path.join(args.out_dir, "simplified", f"{name}.csv"), index=False)
    elif args.op == "desimplify":
        desimplify_dataset(args.dataset_dir)


def simplify_dataset(plays, course_meta, *other_tables):
    """Collapse (id, player) pairs into a single synthetic playID key."""
    plays["playID"] = plays[["id", "player"]].apply(lambda row: "$$".join(row.tolist()), axis=1)
    course_meta["firstClear"] = course_meta.apply(
        lambda row: np.nan if pd.isna(row["firstClear"]) else f"{row['id']}$${row['firstClear']}",
        axis=1,
    )
    new_other_tables = []
    for table in other_tables:
        table["playID"] = table[["id", "player"]].apply(lambda row: "$$".join(row.tolist()), axis=1)
        table = table.drop(columns=["id", "player"])
        new_other_tables.append(table)
    return plays, course_meta, new_other_tables


def desimplify_dataset(generated_dir):
    """Expand synthetic playID keys back into (id, player) columns, in place."""
    plays = pd.read_csv(os.path.join(generated_dir, "plays.csv"))
    course_meta = pd.read_csv(os.path.join(generated_dir, "course_meta.csv"))

    def _process_df(df_name):
        df = pd.read_csv(os.path.join(generated_dir, f"{df_name}.csv"))
        df["index"] = df.index
        # Recover id/player from plays via playID, preserving the row order.
        merged = df.merge(
            plays[["id", "player", "playID"]], on="playID", how="left"
        ).set_index("index").loc[df.index].drop(columns=["playID"])
        merged.to_csv(os.path.join(generated_dir, f"{df_name}.csv"), index=False)

    _process_df("clears")
    _process_df("likes")
    _process_df("records")

    # firstClear stores a playID; replace it with the clearing player's id,
    # mirroring the ref_cols fit ("firstClear" references players.id).
    course_meta["index"] = course_meta.index
    merged_course_meta = course_meta.merge(
        plays[["id", "player", "playID"]].rename(columns={"id": "play_course_id"}),
        right_on="playID",
        left_on="firstClear",
        how="left",
    ).set_index("index").loc[course_meta.index]
    merged_course_meta["firstClear"] = merged_course_meta["player"]
    merged_course_meta = merged_course_meta.drop(columns=["playID", "play_course_id", "player"])
    merged_course_meta.to_csv(os.path.join(generated_dir, "course_meta.csv"), index=False)


if __name__ == "__main__":
    main()