# Source code for slycat.darray

# Copyright (c) 2013, 2018 National Technology and Engineering Solutions of Sandia, LLC . Under the terms of Contract
# DE-NA0003525 with National Technology and Engineering Solutions of Sandia, LLC, the U.S. Government
# retains certain rights in this software.

"""Slycat makes extensive use of `darray` objects - dense, multi-dimension,
multi-attribute arrays - as its fundamental unit of storage and organization.
In the abstract, a darray can be modeled as follows:

* A set of dimensions.  Each dimension has a name, index type, and a half-open range of valid index values.  Currently, the only supported index type is "int64", and indices are all zero-based (i.e. the range always begins at zero), but these may change in the future.  Collectively, the dimensions define the size and shape of the array.
* A set of attributes, each with a name and type.  Allowed attribute types include a full complement of signed and unsigned fixed-width integer types, plus floating-point and string types.  Collectively, attributes define *what* will be stored in the array.
* The array data.  Because darrays are dense, the data will include one value per attribute, for every location in the array.

This definition allows darrays to be flexible and efficient - for example, a
"table" data structure with heterogeneous column types can be stored as a 1D
darray with multiple attributes, while a "matrix" would be stored as a 2D darray
with a single floating-point attribute.

Note that darrays are an abstract concept with multiple concrete
representations.  This module defines an abstract interface for manipulating
Python darrays, and a concrete implementation with in-memory storage.  The
:py:mod:`slycat.hdf5` module defines functionality for manipulating darrays
stored in HDF5 files on disk, and the :ref:`rest-api` defines functionality
for working with darrays using HTTP.

Note that it is rare to manipulate entire darrays in memory at once, due to
their size - most applications will work with *slices* of a darray to keep
memory use manageable.
"""

import numpy
import cherrypy


class Prototype(object):
    """Abstract interface shared by every darray implementation.

    Every property and method here raises :class:`NotImplementedError`;
    concrete implementations supply the real behavior.
    """

    @property
    def ndim(self):
        """Number of dimensions in the array."""
        raise NotImplementedError

    @property
    def shape(self):
        """Shape of the array, i.e. its size along each dimension."""
        raise NotImplementedError

    @property
    def size(self):
        """Total number of elements in the array."""
        raise NotImplementedError

    @property
    def dimensions(self):
        """Description of the array dimensions."""
        raise NotImplementedError

    @property
    def attributes(self):
        """Description of the array attributes."""
        raise NotImplementedError

    def get_statistics(self, attribute=0):
        """Return statistics describing the attribute at index `attribute`."""
        raise NotImplementedError

    def get_data(self, attribute=0):
        """Return the data stored for the attribute at index `attribute`."""
        raise NotImplementedError

    def set_data(self, attribute, slice, data):
        """Write `data` into the region `slice` of the attribute at index `attribute`."""
        raise NotImplementedError
class Stub(Prototype):
    """darray implementation that only stores array metadata (dimensions and attributes).

    Parameters
    ----------
    dimensions : sequence of mappings
        One entry per dimension.  Each entry must provide ``"name"`` and
        ``"end"``; ``"type"`` defaults to ``"int64"`` and ``"begin"``
        defaults to ``0``.
    attributes : sequence of mappings
        One entry per attribute.  Each entry must provide ``"name"`` and
        ``"type"``.

    Raises
    ------
    ValueError
        If there are no dimensions, no attributes, any field fails the
        module's ``_require_*`` validation, or any dimension does not begin
        at zero.
    """

    def __init__(self, dimensions, attributes):
        if len(dimensions) < 1:
            cherrypy.log.error("darray.py Stub.__init__", "At least one dimension is required.")
            raise ValueError("At least one dimension is required.")
        if len(attributes) < 1:
            cherrypy.log.error("darray.py Stub.__init__", "At least one attribute is required.")
            raise ValueError("At least one attribute is required.")

        # Normalize the caller's descriptions into plain dicts holding only
        # validated fields, filling in the defaults for "type" and "begin".
        self._dimensions = [
            dict(
                name=_require_dimension_name(dimension["name"]),
                type=_require_dimension_type(dimension.get("type", "int64")),
                begin=_require_dimension_bound(dimension.get("begin", 0)),
                end=_require_dimension_bound(dimension["end"]),
            )
            for dimension in dimensions
        ]
        self._attributes = [
            dict(
                name=_require_attribute_name(attribute["name"]),
                type=_require_attribute_type(attribute["type"]),
            )
            for attribute in attributes
        ]

        # Only zero-based dimensions are supported.
        for dimension in self._dimensions:
            if dimension["begin"] != 0:
                cherrypy.log.error("darray.py Stub.__init__", "Dimension range must begin with 0.")
                raise ValueError("Dimension range must begin with 0.")

    @property
    def ndim(self):
        """Return the number of dimensions in the array."""
        return len(self._dimensions)

    @property
    def shape(self):
        """Return the shape (size along each dimension) of the array as a tuple."""
        return tuple(dimension["end"] - dimension["begin"] for dimension in self._dimensions)

    @property
    def size(self):
        """Return the size (total number of elements) of the array."""
        return numpy.prod(self.shape)

    @property
    def dimensions(self):
        """Return a description of the array dimensions.

        This is the internal list of validated dimension dicts, not a copy.
        """
        return self._dimensions

    @property
    def attributes(self):
        """Return a description of the array attributes.

        This is the internal list of validated attribute dicts, not a copy.
        """
        return self._attributes
class MemArray(Stub):
    """darray implementation that holds the full array contents in memory.

    Parameters
    ----------
    dimensions, attributes
        Array metadata, validated by :class:`Stub`.
    data : sequence
        One array-like per attribute, in attribute order.  Each entry is
        converted with :func:`numpy.array` and must have exactly the shape
        described by `dimensions`.

    Raises
    ------
    ValueError
        If the metadata is invalid, the number of data entries differs from
        the number of attributes, or any entry's shape differs from the
        array shape.
    """

    def __init__(self, dimensions, attributes, data):
        Stub.__init__(self, dimensions, attributes)

        if len(attributes) != len(data):
            cherrypy.log.error("darray.py MemArray.__init__", "Attribute and data counts must match.")
            raise ValueError("Attribute and data counts must match.")

        self._data = [numpy.array(attribute) for attribute in data]

        for attribute in self._data:
            if attribute.shape != self.shape:
                cherrypy.log.error("darray.py MemArray.__init__", "Attribute data must match array shape.")
                raise ValueError("Attribute data must match array shape.")

    def get_statistics(self, attribute=0):
        """Return statistics describing one attribute.

        Returns a dict with ``min`` and ``max`` keys.  For object / string
        attributes the built-in ordering of the elements is used; for all
        other attributes NaN values are ignored.  Both values are ``None``
        when there is nothing to summarize (empty attribute, or every value
        is NaN).
        """
        values = self._data[attribute]

        if values.dtype.char in ("O", "S", "U"):
            # Flatten first: iterating a multi-dimensional array yields rows,
            # and comparing rows with min()/max() is ambiguous.
            flat = values.ravel()
            if not len(flat):
                return dict(min=None, max=None)
            return dict(min=min(flat), max=max(flat))

        # Boolean-mask indexing yields a 1D array of the non-NaN values.
        values = values[numpy.invert(numpy.isnan(values))]
        if len(values):
            return dict(min=values.min(), max=values.max())
        return dict(min=None, max=None)

    def get_data(self, attribute=0):
        """Return the data for one attribute.

        The internal numpy array is returned directly, not a copy.
        """
        return self._data[attribute]

    def set_data(self, attribute, slice, data):
        """Write a data slice to one attribute.

        `slice` is used as a numpy index into the attribute's array and
        `data` is assigned to the selected region in place.
        """
        self._data[attribute][slice] = data
def _require_attribute_name(name):
    """Return `name` unchanged if it is a string, otherwise log and raise ValueError."""
    if not isinstance(name, str):
        cherrypy.log.error("darray.py _require_attribute_name", "Attribute name must be a string.")
        raise ValueError("Attribute name must be a string.")
    return name


def _require_attribute_type(type):
    """Return `type` unchanged if it is an allowed attribute type, otherwise log and raise ValueError.

    The allowed values are stored on the function as ``allowed_types``.
    """
    if type not in _require_attribute_type.allowed_types:
        # Sort so the message is deterministic; set iteration order is not.
        message = "Attribute type must be one of %s" % ",".join(sorted(_require_attribute_type.allowed_types))
        cherrypy.log.error("darray.py _require_attribute_type", message)
        raise ValueError(message)
    return type


_require_attribute_type.allowed_types = {
    "int8", "int16", "int32", "int64",
    "uint8", "uint16", "uint32", "uint64",
    "float32", "float64",
    "string", "bool",
}


def _require_dimension_name(name):
    """Return `name` unchanged if it is a string, otherwise log and raise ValueError."""
    if not isinstance(name, str):
        cherrypy.log.error("darray.py _require_dimension_name", "Dimension name must be a string.")
        raise ValueError("Dimension name must be a string.")
    return name


def _require_dimension_type(type):
    """Return `type` unchanged if it is an allowed dimension type, otherwise log and raise ValueError.

    The allowed values are stored on the function as ``allowed_types``.
    """
    if type not in _require_dimension_type.allowed_types:
        # Sort so the message is deterministic; set iteration order is not.
        message = "Dimension type must be one of %s" % ",".join(sorted(_require_dimension_type.allowed_types))
        cherrypy.log.error("darray.py _require_dimension_type", message)
        raise ValueError(message)
    return type


_require_dimension_type.allowed_types = {"int64"}


def _require_dimension_bound(bound):
    """Return `bound` unchanged if it is an integer, otherwise log and raise ValueError.

    Both Python ``int`` and any numpy integer scalar are accepted.
    """
    if not isinstance(bound, (int, numpy.integer)):
        cherrypy.log.error("darray.py _require_dimension_bound", "Dimension bound must be an integer.")
        raise ValueError("Dimension bound must be an integer.")
    return bound