Spaces:

3morrrrr
/

Handwriting_Model_Inf

Sleeping

App Files Files Community

Handwriting_Model_Inf / data_frame.py

3morrrrr

Upload 14 files

569596a verified about 1 year ago

raw

history blame contribute delete

3.75 kB

	import copy

	import numpy as np
	import pandas as pd
	from sklearn.model_selection import train_test_split


	class DataFrame(object):
	    """Minimal pd.DataFrame analog for n-dimensional numpy matrices.

	    Adds support for shuffling, batching, and train/test splitting on top of a
	    simple column-name -> matrix mapping.

	    Row order is tracked by ``self.idx``, an index array over the leading
	    dimension. ``shuffle`` permutes ``self.idx`` in place; integer indexing,
	    ``iterrows`` and ``batch_generator`` read rows through it, while string
	    indexing, ``mask`` and ``concat`` operate on the stored matrices directly.

	    Args:
	        columns: List of names corresponding to the matrices in data.
	        data: List of n-dimensional data matrices ordered in correspondence
	            with columns. All matrices must have the same leading dimension.
	            Data can also be fed a list of instances of np.memmap, in which
	            case RAM usage can be limited to the size of a single batch.
	    """

	    def __init__(self, columns, data):
	        assert len(columns) == len(data), 'columns length does not match data length'

	        lengths = [mat.shape[0] for mat in data]
	        assert len(set(lengths)) == 1, 'all matrices in data must have same first dimension'

	        self.length = lengths[0]
	        # columns and data are stored by reference (not copied); __setitem__
	        # mutates these lists in place.
	        self.columns = columns
	        self.data = data
	        self.dict = dict(zip(self.columns, self.data))
	        self.idx = np.arange(self.length)

	    def shapes(self):
	        """Return a pd.Series mapping each column name to its matrix shape."""
	        return pd.Series(dict(zip(self.columns, [mat.shape for mat in self.data])))

	    def dtypes(self):
	        """Return a pd.Series mapping each column name to its matrix dtype."""
	        return pd.Series(dict(zip(self.columns, [mat.dtype for mat in self.data])))

	    def shuffle(self):
	        """Shuffle the row order in place (only ``self.idx`` is permuted)."""
	        np.random.shuffle(self.idx)

	    def train_test_split(self, train_size, random_state=None, stratify=None):
	        """Split the rows into two new DataFrames.

	        Args:
	            train_size: Forwarded to sklearn's train_test_split.
	            random_state: Seed forwarded to sklearn's train_test_split. When
	                None, a fresh seed in [0, 1000) is drawn on every call.
	            stratify: Forwarded to sklearn's train_test_split.

	        Returns:
	            Tuple (train_df, test_df) of DataFrame instances.
	        """
	        # Draw the seed per call: a default of np.random.randint(1000) in the
	        # signature would be evaluated once, at definition time, and then be
	        # silently reused by every call.
	        if random_state is None:
	            random_state = np.random.randint(1000)

	        train_idx, test_idx = train_test_split(
	            self.idx,
	            train_size=train_size,
	            random_state=random_state,
	            stratify=stratify
	        )
	        train_df = DataFrame(copy.copy(self.columns), [mat[train_idx] for mat in self.data])
	        test_df = DataFrame(copy.copy(self.columns), [mat[test_idx] for mat in self.data])
	        return train_df, test_df

	    def batch_generator(self, batch_size, shuffle=True, num_epochs=10000, allow_smaller_final_batch=False):
	        """Yield batches of rows as new DataFrame instances.

	        Args:
	            batch_size: Number of rows per batch.
	            shuffle: If True, reshuffle the row order at the start of each epoch.
	            num_epochs: Number of passes over the data.
	            allow_smaller_final_batch: If True, the trailing rows of an epoch
	                that do not fill a whole batch are yielded as a smaller batch;
	                otherwise they are dropped for that epoch.

	        Yields:
	            DataFrame holding copies of the selected rows of every column.
	        """
	        epoch_num = 0
	        while epoch_num < num_epochs:
	            if shuffle:
	                self.shuffle()

	            # Stop at self.length (not self.length + 1): the extra step would
	            # produce an empty batch when length is a multiple of batch_size.
	            for start in range(0, self.length, batch_size):
	                batch_idx = self.idx[start: start + batch_size]
	                if not allow_smaller_final_batch and len(batch_idx) != batch_size:
	                    break
	                yield DataFrame(
	                    columns=copy.copy(self.columns),
	                    data=[mat[batch_idx].copy() for mat in self.data]
	                )

	            epoch_num += 1

	    def iterrows(self):
	        """Yield each row, in the current row order, as a pd.Series."""
	        # Iterate positions rather than the values of self.idx: __getitem__
	        # already maps a position through self.idx.
	        for position in range(self.length):
	            yield self[position]

	    def mask(self, mask):
	        """Return a new DataFrame with ``mask`` applied to every matrix."""
	        return DataFrame(copy.copy(self.columns), [mat[mask] for mat in self.data])

	    def concat(self, other_df):
	        """Return a new DataFrame with other_df's rows appended column-wise.

	        Only the columns of this DataFrame are used; other_df must provide
	        each of them.
	        """
	        mats = [
	            np.concatenate([self[column], other_df[column]], axis=0)
	            for column in self.columns
	        ]
	        return DataFrame(copy.copy(self.columns), mats)

	    def items(self):
	        """Return the (column name, matrix) pairs."""
	        return self.dict.items()

	    def __iter__(self):
	        """Iterate over (column name, matrix) pairs."""
	        return iter(self.dict.items())

	    def __len__(self):
	        """Return the number of rows (the shared leading dimension)."""
	        return self.length

	    def __getitem__(self, key):
	        """Look up a column by name or a row by position.

	        Args:
	            key: A str column name, or an integer row position (Python int or
	                NumPy integer) interpreted in the current row order.

	        Returns:
	            The stored matrix for a str key; a pd.Series mapping column names
	            to that row's values for an integer key.

	        Raises:
	            KeyError: If a str key is not a known column.
	            TypeError: If key is neither a str nor an integer.
	        """
	        if isinstance(key, str):
	            return self.dict[key]

	        # np.integer is accepted too: NumPy integer scalars are not instances
	        # of the built-in int on Python 3.
	        if isinstance(key, (int, np.integer)):
	            row = self.idx[key]
	            return pd.Series(dict(zip(self.columns, [mat[row] for mat in self.data])))

	        raise TypeError(
	            'DataFrame indices must be str or int, not {}'.format(type(key).__name__)
	        )

	    def __setitem__(self, key, value):
	        """Add a new column, or replace an existing one, with matrix ``value``."""
	        assert value.shape[0] == len(self), 'matrix first dimension does not match'
	        if key in self.columns:
	            # Keep self.data in sync with self.dict when overwriting a column.
	            self.data[self.columns.index(key)] = value
	        else:
	            self.columns.append(key)
	            self.data.append(value)
	        self.dict[key] = value