JayKimDevolved/deepseek

c011401 verified 11 months ago

7.28 kB

	"""
	A buffered iterator for big arrays.

	This module solves the problem of iterating over a big file-based array
	without having to read it into memory. The `Arrayterator` class wraps
	an array object, and when iterated it will return sub-arrays with at most
	a user-specified number of elements.

	"""
	from __future__ import division, absolute_import, print_function

	from operator import mul
	from functools import reduce

	from numpy.compat import long

	__all__ = ['Arrayterator']


	class Arrayterator(object):
	"""
	Buffered iterator for big arrays.

	`Arrayterator` creates a buffered iterator for reading big arrays in small
	contiguous blocks. The class is useful for objects stored in the
	file system. It allows iteration over the object without reading
	everything in memory; instead, small blocks are read and iterated over.

	`Arrayterator` can be used with any object that supports multidimensional
	slices. This includes NumPy arrays, but also variables from
	Scientific.IO.NetCDF or pynetcdf for example.

	Parameters
	----------
	var : array_like
	The object to iterate over.
	buf_size : int, optional
	The buffer size. If `buf_size` is supplied, the maximum amount of
	data that will be read into memory is `buf_size` elements.
	Default is None, which will read as many element as possible
	into memory.

	Attributes
	----------
	var
	buf_size
	start
	stop
	step
	shape
	flat

	See Also
	--------
	ndenumerate : Multidimensional array iterator.
	flatiter : Flat array iterator.
	memmap : Create a memory-map to an array stored in a binary file on disk.

	Notes
	-----
	The algorithm works by first finding a "running dimension", along which
	the blocks will be extracted. Given an array of dimensions
	``(d1, d2, ..., dn)``, e.g. if `buf_size` is smaller than ``d1``, the
	first dimension will be used. If, on the other hand,
	``d1 < buf_size < d1*d2`` the second dimension will be used, and so on.
	Blocks are extracted along this dimension, and when the last block is
	returned the process continues from the next dimension, until all
	elements have been read.

	Examples
	--------
	>>> import numpy as np
	>>> a = np.arange(3 * 4 * 5 * 6).reshape(3, 4, 5, 6)
	>>> a_itor = np.lib.arrayterator.Arrayterator(a, 2)
	>>> a_itor.shape
	(3, 4, 5, 6)

	Now we can iterate over ``a_itor``, and it will return arrays of size
	two. Since `buf_size` was smaller than any dimension, the first
	dimension will be iterated over first:

	>>> for subarr in a_itor:
	... if not subarr.all():
	... print subarr, subarr.shape
	...
	[[[[0 1]]]] (1, 1, 1, 2)

	"""

	def __init__(self, var, buf_size=None):
	self.var = var
	self.buf_size = buf_size

	self.start = [0 for dim in var.shape]
	self.stop = [dim for dim in var.shape]
	self.step = [1 for dim in var.shape]

	def __getattr__(self, attr):
	return getattr(self.var, attr)

	def __getitem__(self, index):
	"""
	Return a new arrayterator.

	"""
	# Fix index, handling ellipsis and incomplete slices.
	if not isinstance(index, tuple):
	index = (index,)
	fixed = []
	length, dims = len(index), len(self.shape)
	for slice_ in index:
	if slice_ is Ellipsis:
	fixed.extend([slice(None)] * (dims-length+1))
	length = len(fixed)
	elif isinstance(slice_, (int, long)):
	fixed.append(slice(slice_, slice_+1, 1))
	else:
	fixed.append(slice_)
	index = tuple(fixed)
	if len(index) < dims:
	index += (slice(None),) * (dims-len(index))

	# Return a new arrayterator object.
	out = self.__class__(self.var, self.buf_size)
	for i, (start, stop, step, slice_) in enumerate(
	zip(self.start, self.stop, self.step, index)):
	out.start[i] = start + (slice_.start or 0)
	out.step[i] = step * (slice_.step or 1)
	out.stop[i] = start + (slice_.stop or stop-start)
	out.stop[i] = min(stop, out.stop[i])
	return out

	def __array__(self):
	"""
	Return corresponding data.

	"""
	slice_ = tuple(slice(*t) for t in zip(
	self.start, self.stop, self.step))
	return self.var[slice_]

	@property
	def flat(self):
	"""
	A 1-D flat iterator for Arrayterator objects.

	This iterator returns elements of the array to be iterated over in
	`Arrayterator` one by one. It is similar to `flatiter`.

	See Also
	--------
	`Arrayterator`
	flatiter

	Examples
	--------
	>>> a = np.arange(3 * 4 * 5 * 6).reshape(3, 4, 5, 6)
	>>> a_itor = np.lib.arrayterator.Arrayterator(a, 2)

	>>> for subarr in a_itor.flat:
	... if not subarr:
	... print subarr, type(subarr)
	...
	0 <type 'numpy.int32'>

	"""
	for block in self:
	for value in block.flat:
	yield value

	@property
	def shape(self):
	"""
	The shape of the array to be iterated over.

	For an example, see `Arrayterator`.

	"""
	return tuple(((stop-start-1)//step+1) for start, stop, step in
	zip(self.start, self.stop, self.step))

	def __iter__(self):
	# Skip arrays with degenerate dimensions
	if [dim for dim in self.shape if dim <= 0]:
	raise StopIteration

	start = self.start[:]
	stop = self.stop[:]
	step = self.step[:]
	ndims = len(self.var.shape)

	while True:
	count = self.buf_size or reduce(mul, self.shape)

	# iterate over each dimension, looking for the
	# running dimension (ie, the dimension along which
	# the blocks will be built from)
	rundim = 0
	for i in range(ndims-1, -1, -1):
	# if count is zero we ran out of elements to read
	# along higher dimensions, so we read only a single position
	if count == 0:
	stop[i] = start[i]+1
	elif count <= self.shape[i]:
	# limit along this dimension
	stop[i] = start[i] + count*step[i]
	rundim = i
	else:
	# read everything along this dimension
	stop[i] = self.stop[i]
	stop[i] = min(self.stop[i], stop[i])
	count = count//self.shape[i]

	# yield a block
	slice_ = tuple(slice(*t) for t in zip(start, stop, step))
	yield self.var[slice_]

	# Update start position, taking care of overflow to
	# other dimensions
	start[rundim] = stop[rundim] # start where we stopped
	for i in range(ndims-1, 0, -1):
	if start[i] >= self.stop[i]:
	start[i] = self.start[i]
	start[i-1] += self.step[i-1]
	if start[0] >= self.stop[0]:
	raise StopIteration