HaileyStorm
/

chess-mamba-vs-xformer

Model card Files Files and versions

chess-mamba-vs-xformer / csv2pqt_windraw.py

HaileyStorm's picture

Upload 5 files

80bc2b3 verified about 2 years ago

history blame contribute delete

3.56 kB

	import pandas as pd
	import pyarrow as pa
	import pyarrow.parquet as pq
	import numpy as np
	import tiktoken
	import pickle
	from sklearn.model_selection import train_test_split
	import random
	import os


	# Selects which character vocabulary the __main__ script tokenizes with:
	#   True  -> load the stoi/itos tables from the pickled meta.pkl file.
	#   False -> use the built-in 28-symbol table; '-' is stripped before encoding.
	move_num_in_gamestate = False

	def tokenize_game(game, stoi):
	    """Convert one game transcript into an array of token ids.

	    Processing steps:
	      1. If the text contains a blank line, everything up to and including
	         the first blank line is discarded as a prefix.
	      2. Any whitespace-separated token containing a '.' is reduced to '.'
	         plus the text after its last '.', e.g. "12.Nf3" -> ".Nf3".
	      3. Tokens are re-joined with single spaces and each character is
	         mapped through ``stoi``.

	    Args:
	        game: Transcript text for a single game.
	        stoi: Mapping from single characters to integer token ids.  If it
	            has no entry for '-', hyphens are removed before encoding
	            (so "O-O" is encoded as "OO").

	    Returns:
	        1-D ``numpy.ndarray`` of dtype ``uint8`` holding the token ids, so
	        the ids in ``stoi`` must fit in the range 0..255.

	    Raises:
	        KeyError: if a character (other than a dropped '-') is not in ``stoi``.
	    """
	    # Drop the prefix that precedes the first blank line, when present.
	    _, blank_line, remainder = game.partition('\n\n')
	    moves_text = remainder if blank_line else game

	    # Strip move numbers but keep the '.' marker that introduces the move.
	    cleaned = ' '.join(
	        '.' + tok.rsplit('.', 1)[-1] if '.' in tok else tok
	        for tok in moves_text.split()
	    )

	    # Use the vocabulary that was passed in rather than a script-level
	    # global, so the function also works when imported on its own.
	    if '-' not in stoi:
	        cleaned = cleaned.replace('-', '')
	    return np.array([stoi[ch] for ch in cleaned], dtype=np.uint8)

	if __name__ == "__main__":
	    # Script entry point: read a CSV of game transcripts, tokenize the
	    # 'transcript' column, and write the token arrays to a Parquet file.
	    dataset_path = "/media/hailey/TVBox/csv_datasets/anneal.csv"
	    output_path = '/media/hailey/TVBox/NEW_anneal.parquet'
	    meta_path = os.path.join('data', 'chess', 'meta.pkl')

	    # Build the character <-> id tables used for tokenization.
	    if move_num_in_gamestate:
	        # NOTE(security): pickle.load can execute arbitrary code; only point
	        # meta_path at a file you trust.
	        with open(meta_path, "rb") as f:
	            meta = pickle.load(f)
	        stoi, itos = meta["stoi"], meta["itos"]

	        def encode(s):
	            """Map each character of ``s`` to its id."""
	            return [stoi[c] for c in s]

	        def decode(l):
	            """Map a sequence of ids back to text."""
	            return "".join([itos[i] for i in l])
	    else:
	        stoi = {' ': 0, '.': 1, 'a': 2, 'b': 3, 'c': 4, 'd': 5, 'e': 6, 'f': 7, 'g': 8, 'h': 9, '1': 10, '2': 11, '3': 12, '4': 13, '5': 14, '6': 15, '7': 16, '8': 17, 'B': 18, 'N': 19, 'R': 20, 'Q': 21, 'K': 22, 'O': 23, 'x': 24, '+': 25, '#': 26, '=': 27}
	        # Derive the inverse table so the two mappings cannot drift apart.
	        itos = {idx: ch for ch, idx in stoi.items()}
	        if len(itos) != len(stoi):
	            raise ValueError("stoi must map every character to a unique id")

	        def encode(s):
	            """Map each character of ``s`` to its id, dropping '-' first."""
	            return [stoi[c] for c in s.replace('-', '')]

	        def decode(l):
	            """Map ids back to text, re-inserting the hyphens of castling moves."""
	            return "".join([itos[i] for i in l]).replace("OOO", "O-O-O").replace("OO", "O-O")

	    # Read CSV with headers
	    print("Opening csv...")
	    df = pd.read_csv(dataset_path)

	    total_games = len(df)
	    print(f"Total games: {total_games}. Tokenizing...")

	    # NOTE: filtering on the 'Result' column (keeping '1-0' / '1/2-1/2') and
	    # the train/validation split are currently disabled; every row of the
	    # CSV is tokenized and written out as the training set.
	    df['tokenized'] = df['transcript'].apply(lambda x: tokenize_game(x, stoi))
	    print("Tokenized. Writing parquet file...")
	    train_df = df

	    def write_parquet_with_row_groups(df, file_name, rows_per_group=100):
	        """Write the 'tokenized' column of ``df`` to a Parquet file in chunks.

	        The table is written ``rows_per_group`` rows at a time, one
	        ``write_table`` call per chunk.

	        Args:
	            df: DataFrame containing a 'tokenized' column.
	            file_name: Destination path of the Parquet file.
	            rows_per_group: Number of rows written per chunk; must be >= 1.

	        Raises:
	            ValueError: if ``rows_per_group`` is smaller than 1.
	        """
	        if rows_per_group < 1:
	            raise ValueError("rows_per_group must be a positive integer")
	        table = pa.Table.from_pandas(df[['tokenized']])
	        # The context manager closes the writer even if a chunk fails to
	        # write, so the file handle is not leaked.
	        with pq.ParquetWriter(file_name, table.schema) as writer:
	            for start in range(0, table.num_rows, rows_per_group):
	                chunk_len = min(rows_per_group, table.num_rows - start)
	                writer.write_table(table.slice(start, chunk_len))

	    write_parquet_with_row_groups(train_df, output_path)
	    print("Done.")