| import pandas as pd |
| import pyarrow as pa |
| import pyarrow.parquet as pq |
| import numpy as np |
| import tiktoken |
| import pickle |
| from sklearn.model_selection import train_test_split |
| import random |
| import os |
|
|
|
|
# Vocabulary switch read by the script entry point: True loads stoi/itos from
# the pickled meta file, False uses the hard-coded 28-symbol table.
move_num_in_gamestate = False
|
|
def tokenize_game(game, stoi):
    """Encode one game transcript as a ``uint8`` array of token ids.

    The transcript is cleaned before encoding:

    * everything up to and including the first blank line (``"\\n\\n"``) is
      discarded when such a separator exists (presumably a header block --
      confirm against the dataset);
    * every whitespace-separated word containing a dot is reduced to ``"."``
      plus the text after its last dot, so ``"12.Nf3"`` becomes ``".Nf3"``;
    * words are re-joined with single spaces.

    Args:
        game: Raw transcript text.
        stoi: Mapping from single characters to integer token ids. Ids must
            fit in ``uint8`` (0-255).

    Returns:
        A one-dimensional ``numpy.ndarray`` of dtype ``uint8`` with one id per
        character of the cleaned transcript.

    Raises:
        KeyError: If the cleaned transcript contains a character that is not
            a key of ``stoi``.
    """
    # Keep only the part after the first blank line, if there is one.
    _, separator, after_blank_line = game.partition('\n\n')
    moves_text = after_blank_line if separator else game

    # Strip whatever precedes the last dot of each word (e.g. a move number).
    words = [
        '.' + word.rsplit('.', 1)[-1] if '.' in word else word
        for word in moves_text.split()
    ]
    cleaned = ' '.join(words)

    # Encode with the vocabulary that was passed in rather than a module-level
    # ``encode`` global, so the function also works when imported. Dashes
    # (as in "O-O") are dropped only when the vocabulary has no '-' symbol.
    if '-' not in stoi:
        cleaned = cleaned.replace('-', '')
    return np.array([stoi[ch] for ch in cleaned], dtype=np.uint8)
|
|
if __name__ == "__main__":
    # Script entry point: choose a vocabulary, load the game CSV and add a
    # 'tokenized' column holding one encoded array per transcript.
    dataset_path = "/media/hailey/TVBox/csv_datasets/anneal.csv"
    meta_path = "data/chess/meta.pkl"

    if not move_num_in_gamestate:
        # Built-in vocabulary: separators, files a-h, ranks 1-8, piece
        # letters, the castling 'O' and the annotation symbols.
        stoi = {
            ' ': 0, '.': 1,
            'a': 2, 'b': 3, 'c': 4, 'd': 5, 'e': 6, 'f': 7, 'g': 8, 'h': 9,
            '1': 10, '2': 11, '3': 12, '4': 13,
            '5': 14, '6': 15, '7': 16, '8': 17,
            'B': 18, 'N': 19, 'R': 20, 'Q': 21, 'K': 22,
            'O': 23, 'x': 24, '+': 25, '#': 26, '=': 27,
        }
        # The reverse table is the exact inverse of stoi.
        itos = {index: symbol for symbol, index in stoi.items()}
        for symbol, index in stoi.items():
            assert itos[index] == symbol

        def encode(text):
            """Map text to token ids; dashes are removed before lookup."""
            return [stoi[ch] for ch in text.replace('-', '')]

        def decode(ids):
            """Map token ids back to text, re-inserting castling dashes."""
            joined = "".join([itos[i] for i in ids])
            return joined.replace("OOO", "O-O-O").replace("OO", "O-O")
    else:
        # Vocabulary comes from the pickled meta file.
        # NOTE: pickle.load can execute arbitrary code; only point this at a
        # meta.pkl you trust.
        meta_path = os.path.join('data', 'chess', 'meta.pkl')
        with open(meta_path, "rb") as f:
            meta = pickle.load(f)
        stoi = meta["stoi"]
        itos = meta["itos"]

        def encode(text):
            """Map every character of text to its token id."""
            return [stoi[ch] for ch in text]

        def decode(ids):
            """Map token ids back to the characters they stand for."""
            return "".join([itos[i] for i in ids])

    print("Opening csv...")
    df = pd.read_csv(dataset_path)

    total_games = df.shape[0]
    print(f"Total games: {total_games}. Tokenizing...")

    # One encoded array per row of the 'transcript' column.
    df['tokenized'] = df['transcript'].apply(tokenize_game, args=(stoi,))
    print("Tokenized. Writing parquet file...")

    # No train/validation split is made here: every game is kept for training.
    train_df = df
    val_df = None
|
|
| |
def write_parquet_with_row_groups(df, file_name, rows_per_group=100):
    """Write the ``tokenized`` column of *df* to a Parquet file in chunks.

    The column is converted to an Arrow table once and then written in
    consecutive slices of at most ``rows_per_group`` rows, one
    ``write_table`` call per slice.

    Args:
        df: DataFrame with a ``tokenized`` column; other columns are ignored.
        file_name: Destination path of the Parquet file.
        rows_per_group: Maximum number of rows per written slice. The last
            slice may be shorter.

    Raises:
        ValueError: If ``rows_per_group`` is smaller than 1.
    """
    # Reject a bad chunk size before the output file is created.
    if rows_per_group < 1:
        raise ValueError(f"rows_per_group must be >= 1, got {rows_per_group}")

    table = pa.Table.from_pandas(df[['tokenized']])
    total_rows = table.num_rows

    # The context manager closes the writer even when a write fails part-way,
    # so the file handle is never leaked.
    with pq.ParquetWriter(file_name, table.schema) as writer:
        for start in range(0, total_rows, rows_per_group):
            length = min(rows_per_group, total_rows - start)
            writer.write_table(table.slice(start, length))
|
|
if __name__ == "__main__":
    # Persist the tokenized games, then report completion.
    write_parquet_with_row_groups(
        df=train_df,
        file_name='/media/hailey/TVBox/NEW_anneal.parquet',
    )

    print("Done.")
|
|