Source code for nsdf.nsdfwriter
# nsdfwriter.py ---
#
# Filename: nsdfwriter.py
# Description:
# Author: Subhasis Ray [email: {lastname} dot {firstname} at gmail dot com]
# Maintainer:
# Created: Fri Apr 25 19:51:42 2014 (+0530)
# Version:
# Last-Updated:
# By:
# Update #: 0
# URL:
# Keywords:
# Compatibility:
#
#
# Commentary:
#
#
#
#
# Change log:
#
#
#
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License as
# published by the Free Software Foundation; either version 3, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; see the file COPYING. If not, write to
# the Free Software Foundation, Inc., 51 Franklin Street, Fifth
# Floor, Boston, MA 02110-1301, USA.
#
#
# Code:
"""
Writer for NSDF file format.
"""
import h5py as h5
import numpy as np
import os
from .model import ModelComponent, common_prefix
from .constants import *
from .util import *
from datetime import datetime
def match_datasets(hdfds, pydata):
"""Match entries in hdfds with those in pydata. Returns true if the
two sets are equal. False otherwise.
"""
src_set = set([item for item in hdfds])
dsrc_set = set(pydata)
return src_set == dsrc_set
def add_model_component(component, parentgroup):
"""Add a model component as a group under `parentgroup`.
This creates a group `component.name` under parent group if not
already present. The `uid` of the component is stored in the `uid`
attribute of the group. Key-value pairs in the `component.attrs`
dict are stored as attributes of the group.
Args:
component (ModelComponent): model component object to be
written to NSDF file.
parentgroup (HDF Group): group under which this
component's group should be created.
Returns:
HDF Group created for this model component.
Raises:
KeyError if the parentgroup is None and no group
corresponding to the component's parent exists.
"""
grp = parentgroup.require_group(component.name)
component.hdfgroup = grp
if component.uid is not None:
grp.attrs['uid'] = component.uid
else:
grp.attrs['uid'] = component.path
for key, value in component.attrs.items():
grp.attrs[key] = value
return grp
def write_ascii_file(group, name, fname, **compression_opts):
"""Add a dataset `name` under `group` and store the contents of text
file `fname` in it."""
with open(fname, 'rt') as fhandle:
data = fhandle.read()
if '\x00' in data:
raise ValueError('Cannot handle NULL byte in ascii data')
dataset = group.create_dataset(name, shape=(1,), data=data, dtype=VLENBYTE,
**compression_opts)
return dataset
def write_binary_file(group, name, fname, **compression_opts):
"""Add a dataset `name` under `group` and store the contents of binary
file `fname` in it."""
with open(fname, 'rb') as fhandle:
data = np.void(fhandle.read())
dataset = group.create_dataset(name, shape=(1,), data=data, dtype=np.void)
return dataset
def write_dir_contents(root_group, root_dir, ascii, **compression_opts):
"""Walk the directory tree rooted at `root_dir` and replicate it under
`root_group` in HDF5 file.
This is a helper function for copying model directory structure
and file contents into an HDF5 file. If `ascii` is True all files are
treated as ascii text, otherwise all files are treated as binary blobs.
Args:
root_group (h5py.Group): group under which the directory tree
is to be created.
root_dir (str): path of the directory from which to start
traversal.
ascii (bool): whether to treat each file as ascii text file.
"""
for root, dirs, files in os.walk(root_dir):
relative_root = root[root.find(os.path.basename(root_dir)):]
grp = root_group.require_group(relative_root)
for fname in files:
dset_name = os.path.basename(fname)
file_path = os.path.join(root, fname)
if ascii:
try:
dset = write_ascii_file(grp, dset_name, file_path, **compression_opts)
except ValueError:
print 'Skipping binary file', file_path
else:
dset = write_binary_file(grp, dset_name, file_path, **compression_opts)
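# Usage sketch (illustrative, not part of the original source): copying a
# model directory tree into an open HDF5 file as ascii text datasets. The
# directory path below is a placeholder.
#
#     with h5.File('model.h5', 'a') as fd:
#         grp = fd.require_group('/model/filecontents')
#         write_dir_contents(grp, '/path/to/model_dir', ascii=True)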
class NSDFWriter(object):
"""Writer for NSDF files.
An NSDF file has three main groups: `/model`, `/data` and `/map`.
Attributes:
mode (str): File open mode. Defaults to append
('a'). It can also be 'w' or 'w+'.
dialect (nsdf.dialect member): ONED for storing nonuniformly
sampled and event data in 1D arrays.
VLEN for storing such data in 2D VLEN datasets.
NANPADDED for storing such data in 2D homogeneous datasets
with NaN padding.
model (h5.Group): /model group
data (h5.Group): /data group
mapping (h5.Group): /map group
time_dim (h5.Group): /map/time group contains the sampling
time points as dimension scales of data. It is mainly used
for nonuniformly sampled data.
modeltree (h5.Group): /model/modeltree group can be used for
storing the model in a hierarchical manner. Each subgroup
under `modeltree` is a model component and can contain
other subgroups representing subcomponents. Each group
stores the unique identifier of the model component it
represents in the string attribute `uid`.
"""
def __init__(self, filename, dialect=dialect.ONED, mode='a', **h5args):
"""Initialize NSDF writer.
Args:
filename (str): path of the file to be written.
dialect (nsdf.dialect member): the dialect of NSDF to be
used. Default: ONED.
mode (str): file write mode. Default is 'a', which is also
the default of h5py.File.
**h5args: other keyword arguments are passed to h5py when
creating datasets. These can be `compression`
(='gzip'/'szip'/'lzf'), `compression_opts` (=0-9
with gzip), `fletcher32` (=True/False), `shuffle`
(=True/False).
"""
self._fd = h5.File(filename, mode)
self.timestamp = datetime.utcnow()
self._fd.attrs['created'] = self.timestamp.isoformat()
self._fd.attrs['nsdf_version'] = '0.1'
self._fd.attrs['dialect'] = dialect
self.mode = mode
self.dialect = dialect
self.data = self._fd.require_group('/data')
self.model = self._fd.require_group('/model')
self.mapping = self._fd.require_group('/map')
self.time_dim = self.mapping.require_group('time')
self.modeltree = self.model.require_group('modeltree')
for stype in SAMPLING_TYPES:
self.data.require_group(stype)
self.mapping.require_group(stype)
self.modelroot = ModelComponent('modeltree', uid='modeltree',
hdfgroup=self.modeltree)
self.h5args = h5args
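# Usage sketch (illustrative, not part of the original source): creating a
# writer that gzip-compresses its datasets; the extra keyword arguments are
# forwarded to h5py dataset creation as described in the docstring above.
#
#     writer = NSDFWriter('sim_output.h5', dialect=dialect.ONED, mode='w',
#                         compression='gzip', compression_opts=6,
#                         fletcher32=True, shuffle=True)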
def __del__(self):
self._fd.close()
def set_properties(self, properties):
"""Set the file attributes (environments).
Args:
properties (dict): mapping property names to values.
It must contain the following keys:
title (str)
creator (list of str)
software (list of str)
method (list of str)
description (str)
rights (str)
tstart (datetime.datetime)
tend (datetime.datetime)
contributor (list of str)
Raises:
KeyError if not all environment properties are specified in the dict.
"""
self._fd.attrs['title'] = properties['title']
attr = np.zeros((len(properties['creator']),), dtype=VLENSTR)
attr[:] = properties['creator']
self._fd.attrs['creator'] = attr
attr = np.zeros((len(properties['software']),), dtype=VLENSTR)
attr[:] = properties['software']
self._fd.attrs['software'] = attr
attr = np.zeros((len(properties['method']),), dtype=VLENSTR)
attr[:] = properties['method']
self._fd.attrs['method'] = attr
self._fd.attrs['description'] = properties['description']
self._fd.attrs['rights'] = properties['rights']
self._fd.attrs['tstart'] = properties['tstart'].isoformat()
self._fd.attrs['tend'] = properties['tend'].isoformat()
attr = np.zeros((len(properties['contributor']),), dtype=VLENSTR)
attr[:] = properties['contributor']
self._fd.attrs['contributor'] = attr
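# Usage sketch (illustrative, not part of the original source): a properties
# dict containing every required key listed in the docstring above.
#
#     writer.set_properties({
#         'title': 'Sample simulation',
#         'creator': ['A. Author'],
#         'software': ['MOOSE'],
#         'method': ['exponential Euler'],
#         'description': 'Example NSDF file for demonstration',
#         'rights': 'CC BY 4.0',
#         'tstart': datetime(2014, 4, 25, 10, 0, 0),
#         'tend': datetime(2014, 4, 25, 11, 0, 0),
#         'contributor': ['B. Contributor'],
#     })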
@property
def title(self):
"""Title of the file"""
try:
return self._fd.attrs['title']
except KeyError:
return None
@title.setter
def title(self, title):
"""Set the title of the file.
Args:
title (str): title text of the file.
"""
self._fd.attrs['title'] = title
@property
def creator(self):
"""Creator (one or more authors) of the file."""
return self._fd.attrs['creator']
@creator.setter
def creator(self, creator_list):
"""Set the creator (one or more authors) of the file.
Args:
creator_list (list of str): list of creators of the file.
"""
attr = np.zeros((len(creator_list),), dtype=VLENSTR)
attr[:] = creator_list
self._fd.attrs['creator'] = attr
@property
def license(self):
"""License information about the file. This is text string."""
return self._fd.attrs['license']
@license.setter
def license(self, license_text):
"""Set the license (a text string) describing the terms of use of the file contents."""
self._fd.attrs['license'] = license_text
@property
def software(self):
"""Software (one or more) used to generate the data in the file.
"""
return self._fd.attrs['software']
@software.setter
def software(self, software_list):
"""Set the software (one or more) used to generate the data in the
file.
Args:
software_list (list of str): list of software
involved in generating the data in the file.
"""
attr = np.zeros((len(software_list),), dtype=VLENSTR)
attr[:] = software_list
self._fd.attrs['software'] = attr
@property
def method(self):
"""(numerical) methods applied in generating the data."""
return self._fd.attrs['method']
@method.setter
def method(self, method_list):
"""Set the (numerical) methods applied in generating the data.
Args:
method_list (list of str): names of the methods employed
to generate the data.
"""
attr = np.zeros((len(method_list),), dtype=VLENSTR)
attr[:] = method_list
self._fd.attrs['method'] = attr
@property
def description(self):
"""Description of the file. A text string."""
return self._fd.attrs['description']
@description.setter
def description(self, description):
"""Set the description of the file.
Args:
description (str): a human readable description of the
file.
"""
self._fd.attrs['description'] = description
@property
def rights(self):
"""The rights of the file contents."""
return self._fd.attrs['rights']
@rights.setter
def rights(self, rights):
"""Set the rights of the file contents.
Args:
rights (str): text describing the rights of various
individuals/organizations/other entities on the file
contents.
"""
self._fd.attrs['rights'] = rights
@property
def tstart(self):
"""Start time of the simulation / data recording. A string
representation of the timestamp in ISO format
"""
return self._fd.attrs['tstart']
@tstart.setter
def tstart(self, tstart):
"""Set the start time of simulation/recording
Args:
tstart (datetime.datetime): start date-time of the data
recording/simulation.
Note:
We take datetime instance here because we want to ensure ISO format.
"""
self._fd.attrs['tstart'] = tstart.isoformat()
@property
def tend(self):
"""End time of the simulation/recording."""
return self._fd.attrs['tend']
@tend.setter
def tend(self, tend):
"""Set the end time of recording/simulation.
Args:
tend (datetime.datetime): end date-time of the data
recording or simulation.
Note:
We take datetime instance here because we want to ensure ISO format.
"""
self._fd.attrs['tend'] = tend.isoformat()
@property
def contributor(self):
"""List of contributors to the content of this file."""
return self._fd.attrs['contributor']
@contributor.setter
def contributor(self, contributor_list):
"""Set the list of contributors to the contents of the file.
Args:
contributor_list (list of str): list of
individuals/organizations/other entities who
contributed towards the data stored in the file.
"""
attr = np.zeros((len(contributor_list),), dtype=VLENSTR)
attr[:] = contributor_list
self._fd.attrs['contributor'] = attr
def _link_map_model(self, mapds):
"""Link the model to map dataset and vice versa.
The map dataset stores, in its `model` attribute, a list of
references to the closest common ancestor of all the source
components listed in it. The closest common ancestor in the
model tree also stores a reference to this map dataset in its
`map` attribute.
This is an internal optimization in NSDF: since every model
component has a unique id and the map datasets store these
unique ids, it is always possible to search the entire model
tree for them, but the stored references avoid that search.
Args:
mapds: The map dataset for which the linking should be done.
Returns:
None
"""
self.modelroot.update_id_path_dict()
id_path_dict = self.modelroot.get_id_path_dict()
if mapds.dtype.fields is None:
idlist = mapds
else:
idlist = mapds['source']
if len(id_path_dict) > 1:
# there are elements other than /model/modeltree
paths = [id_path_dict[uid] for uid in idlist]
prefix = common_prefix(paths)[len('/modeltree/'):]
try:
source = self.modeltree[prefix]
tmpattr = ([ref for ref in source.attrs.get('map', [])]
+ [mapds.ref])
attr = np.zeros((len(tmpattr),), dtype=REFTYPE)
attr[:] = tmpattr
source.attrs['map'] = attr
tmpattr = ([ref for ref in mapds.attrs.get('model', [])]
+ [source.ref])
attr = np.zeros((len(tmpattr),), dtype=REFTYPE)
attr[:] = tmpattr
mapds.attrs['model'] = attr
except KeyError, error:
print error.message
def add_modeltree(self, root, target='/'):
"""Add an entire model tree. This will cause the modeltree rooted at
`root` to be written to the NSDF file.
Args:
root (ModelComponent): root of the source tree.
target (str): target node path in NSDF file with respect
to '/model/modeltree'. `root` and its children are
added under this group.
"""
def write_absolute(node, rootgroup):
"""Write ModelComponent `node` at its path relative to `rootgroup`.
"""
if node.parent is None:
parentgroup = rootgroup
else:
parentpath = node.parent.path[1:]
parentgroup = rootgroup[parentpath]
add_model_component(node, parentgroup)
node = self.modelroot
# Get the node corresponding to `target`, traverse by
# splitting to avoid confusion between absolute and relative
# paths.
for name in target.split('/'):
if name:
node = node.children[name]
node.add_child(root)
self.modelroot.visit(write_absolute, self.model)
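# Usage sketch (illustrative, not part of the original source; assumes
# nsdf.ModelComponent accepts a `parent` keyword as in nsdf.model):
#
#     network = ModelComponent('network', uid='network')
#     cell0 = ModelComponent('cell0', uid='cell0', parent=network)
#     cell1 = ModelComponent('cell1', uid='cell1', parent=network)
#     writer.add_modeltree(network)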
def add_model_filecontents(self, filenames, ascii=True, recursive=True):
"""Add the files and directories listed in `filenames` to
``/model/filecontents``.
This function is for storing the contents of model files in
the NSDF file. It is useful for models specified in external
formats like NeuroML, NineML and SBML, or as NEURON/GENESIS
scripts. Each directory is stored as a group and each file is
stored as a dataset.
Args:
filenames (sequence): the paths of files and/or
directories which contain model information.
ascii (bool): whether the files are in ascii.
recursive (bool): whether to recursively store
subdirectories.
"""
filecontents = self.model.require_group('filecontents')
for fname in filenames:
if os.path.isfile(fname):
buf = bytearray(os.path.getsize(fname))
with open(fname, 'rb') as fhandle:
fhandle.readinto(buf)
components = []
path = fname
while True:
    head, tail = os.path.split(path)
    if tail:
        components.append(tail)
    if not head:
        break
    # continue splitting the parent path
    path = head
grp = filecontents
for name in components[:0:-1]:
    grp = grp.require_group(name)
if ascii:
    fdata = write_ascii_file(grp, components[0], fname, **self.h5args)
else:
    fdata = write_binary_file(grp, components[0], fname, **self.h5args)
elif os.path.isdir(fname):
write_dir_contents(filecontents, fname, ascii=ascii, **self.h5args)
def add_uniform_ds(self, name, idlist):
"""Add the sources listed in idlist under /map/uniform.
Args:
name (str): name with which the datasource list
should be stored. This will represent a population of
data sources.
idlist (list of str): list of unique identifiers of the
data sources.
Returns:
An HDF5 Dataset storing the source ids. This is
converted into a dimension scale when actual data is
added.
"""
if len(idlist) == 0:
raise ValueError('idlist must be nonempty')
base = None
try:
base = self.mapping[UNIFORM]
except KeyError:
base = self.mapping.create_group(UNIFORM)
src_ds = base.create_dataset(name, shape=(len(idlist),),
dtype=VLENSTR, data=idlist)
self._link_map_model(src_ds)
return src_ds
def add_nonuniform_ds(self, popname, idlist):
"""Add the sources listed in idlist under /map/nonuniform/{popname}.
Args:
popname (str): name with which the datasource list
should be stored. This will represent a population of
data sources.
idlist (list of str): list of unique identifiers of the
data sources.
Returns:
An HDF5 Dataset storing the source ids when dialect
is VLEN or NANPADDED. This is converted into a dimension
scale when actual data is added.
Raises:
AssertionError if idlist is empty or dialect is ONED.
"""
base = None
base = self.mapping.require_group(NONUNIFORM)
assert self.dialect != dialect.ONED
assert len(idlist) > 0
src_ds = base.create_dataset(popname, shape=(len(idlist),),
dtype=VLENSTR, data=idlist)
self._link_map_model(src_ds)
return src_ds
def add_nonuniform_ds_1d(self, popname, varname, idlist):
"""Add the sources listed in idlist under
/map/nonuniform/{popname}/{varname}.
In case of 1D datasets, for each variable we store the mapping
from source id to dataset ref in a two-column compound dataset
with dtype=[('source', VLENSTR), ('data', REFTYPE)]
Args:
popname (str): name with which the datasource list
should be stored. This will represent a population of
data sources.
varname (str): name of the variable being recorded. The
same name should be passed when actual data is being
added.
idlist (list of str): list of unique identifiers of the
data sources.
Returns:
An HDF5 Dataset storing the source ids in `source` column.
Raises:
AssertionError if idlist is empty or if dialect is not ONED.
"""
base = self.mapping.require_group(NONUNIFORM)
assert self.dialect == dialect.ONED, 'valid only for dialect=ONED'
assert len(idlist) > 0, 'idlist must be nonempty'
grp = base.require_group(popname)
src_ds = grp.create_dataset(varname, shape=(len(idlist),),
dtype=SRCDATAMAPTYPE)
for iii in range(len(idlist)):
src_ds[iii] = (idlist[iii], None)
self._link_map_model(src_ds)
return src_ds
def add_event_ds(self, name, idlist):
"""Create a dataset under `/map/event` with name `name` to store the
mapping between the datasources and event data.
Args:
name (str): name with which the datasource list
should be stored. This will represent a population of
data sources.
idlist (list): unique ids of the data sources.
Returns:
The HDF5 Dataset `/map/event/{name}` storing the source ids.
"""
base = self.mapping.require_group(EVENT)
assert len(idlist) > 0, 'idlist must be nonempty'
assert ((self.dialect != dialect.ONED) and
(self.dialect != dialect.NUREGULAR)), \
'only for VLEN or NANPADDED dialects'
src_ds = base.create_dataset(name, shape=(len(idlist),),
dtype=VLENSTR, data=idlist)
self._link_map_model(src_ds)
return src_ds
def add_event_ds_1d(self, popname, varname, idlist):
"""Create a dataset `varname` under the group `popname` under
`/map/event` to store the mapping between the datasources and
event data.
Args:
popname (str): name of the group under which the
datasource list should be stored. This will represent
a population of data sources.
varname (str): name of the dataset mapping source uid to
data. This should be the same as the name of the recorded
variable.
idlist (list of str): list of unique identifiers of the
data sources.
Returns:
The HDF5 Dataset `/map/event/{popname}/{varname}`.
"""
base = self.mapping.require_group(EVENT)
assert len(idlist) > 0, 'idlist must be nonempty'
assert ((self.dialect == dialect.ONED) or
(self.dialect == dialect.NUREGULAR)), \
'dialect must be ONED or NUREGULAR'
grp = base.require_group(popname)
src_ds = grp.create_dataset(varname, shape=(len(idlist),),
dtype=SRCDATAMAPTYPE)
for iii in range(len(idlist)):
src_ds[iii] = (idlist[iii], None)
self._link_map_model(src_ds)
return src_ds
def add_static_ds(self, popname, idlist):
"""Add the sources listed in idlist under /map/static.
Args:
popname (str): name with which the datasource list
should be stored. This will represent a population of
data sources.
idlist (list of str): list of unique identifiers of the
data sources.
Returns:
An HDF5 Dataset storing the source ids. This is
converted into a dimension scale when actual data is
added.
"""
if len(idlist) == 0:
raise ValueError('idlist must be nonempty')
base = self.mapping.require_group(STATIC)
src_ds = base.create_dataset(popname, shape=(len(idlist),),
dtype=VLENSTR, data=idlist)
self.modelroot.update_id_path_dict()
self._link_map_model(src_ds)
return src_ds
def add_uniform_data(self, source_ds, data_object, tstart=0.0,
fixed=False):
"""Append uniformly sampled `variable` values from `sources` to
`data`.
Args:
source_ds (HDF5 Dataset): the dataset storing the source
ids under map. This is attached to the stored data as
a dimension scale called `source` on the row
dimension.
data_object (nsdf.UniformData): Uniform dataset to be
added to file.
tstart (double): (optional) start time of this dataset
recording. Defaults to 0.
fixed (bool): if True, the data cannot grow. Default: False
Returns:
HDF5 dataset storing the data
Raises:
KeyError if the sources in `data_object` do not match
those in `source_ds`.
ValueError if dt is not specified or <= 0 when inserting
data for the first time.
"""
popname = source_ds.name.rpartition('/')[-1]
ugrp = self.data[UNIFORM].require_group(popname)
if not match_datasets(source_ds, data_object.get_sources()):
raise KeyError('members of `source_ds` must match sources in'
' `data_object`.')
ordered_data = [data_object.get_data(src) for src in source_ds]
data = np.vstack(ordered_data)
try:
dataset = ugrp[data_object.name]
oldcolcount = dataset.shape[1]
dataset.resize(oldcolcount + data.shape[1], axis=1)
dataset[:, oldcolcount:] = data
except KeyError:
if data_object.dt <= 0.0:
raise ValueError('`dt` must be > 0.0 for creating dataset.')
if data_object.unit is None:
raise ValueError('`unit` is required for creating dataset.')
if data_object.tunit is None:
raise ValueError('`tunit` is required for creating dataset.')
maxcol = None
if fixed:
maxcol = data.shape[1]
dataset = ugrp.create_dataset(
data_object.name,
shape=data.shape,
dtype=data_object.dtype,
data=data,
maxshape=(data.shape[0], maxcol),
**self.h5args)
dataset.dims.create_scale(source_ds, 'source')
dataset.dims[0].attach_scale(source_ds)
dataset.dims[0].label = 'source'
dataset.attrs['tstart'] = tstart
dataset.attrs['dt'] = data_object.dt
dataset.attrs['field'] = data_object.field
dataset.attrs['unit'] = data_object.unit
dataset.attrs['tunit'] = data_object.tunit
return dataset
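# Usage sketch (illustrative, not part of the original source; assumes
# nsdf.UniformData from nsdf.nsdfdata with a constructor taking unit,
# field, dt and tunit keyword arguments and a put_data(source_uid, array)
# method):
#
#     source_ds = writer.add_uniform_ds('granule', ['granule_0', 'granule_1'])
#     vm = UniformData('Vm', unit='mV', field='Vm', dt=0.1, tunit='ms')
#     vm.put_data('granule_0', np.full(100, -65.0))
#     vm.put_data('granule_1', np.full(100, -70.0))
#     writer.add_uniform_data(source_ds, vm)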
def add_nonuniform_regular(self, source_ds, data_object,
fixed=False):
"""Append nonuniformly sampled `variable` values from `sources` to
`data`. In this case sampling times of all the sources are
same and the data is stored in a 2D dataset.
Args:
source_ds (HDF5 Dataset): the dataset storing the source
ids under map. This is attached to the stored data as
a dimension scale called `source` on the row
dimension.
data_object (nsdf.NonuniformRegularData):
NonUniformRegular dataset to be added to file.
fixed (bool): if True, the data cannot grow. Default: False
Returns:
HDF5 dataset storing the data
Raises:
KeyError if the sources in `data_object` do not match
those in `source_ds`.
ValueError if the data arrays are not all equal in length.
ValueError if dt is not specified or <= 0 when inserting
data for the first time.
"""
popname = source_ds.name.rpartition('/')[-1]
ngrp = self.data[NONUNIFORM].require_group(popname)
if not match_datasets(source_ds, data_object.get_sources()):
raise KeyError('members of `source_ds` must match sources in'
' `data_object`.')
ordered_data = [data_object.get_data(src) for src in source_ds]
data = np.vstack(ordered_data)
if data.shape[1] != len(data_object.get_times()):
raise ValueError('the number of sampling times must be '
'the same as the number of data points')
try:
dataset = ngrp[data_object.name]
oldcolcount = dataset.shape[1]
dataset.resize(oldcolcount + data.shape[1], axis=1)
dataset[:, oldcolcount:] = data
except KeyError:
if data_object.unit is None:
raise ValueError('`unit` is required for creating dataset.')
if data_object.tunit is None:
raise ValueError('`tunit` is required for creating dataset.')
maxcol = None
if fixed:
maxcol = data.shape[1]
dataset = ngrp.create_dataset(
data_object.name, shape=data.shape,
dtype=data.dtype,
data=data,
maxshape=(data.shape[0], maxcol),
**self.h5args)
dataset.dims.create_scale(source_ds, 'source')
dataset.dims[0].attach_scale(source_ds)
dataset.dims[0].label = 'source'
dataset.attrs['field'] = data_object.field
dataset.attrs['unit'] = data_object.unit
tsname = '{}_{}'.format(popname, data_object.name)
tscale = self.time_dim.create_dataset(
tsname,
shape=(len(data_object.get_times()),),
dtype=np.float64,
data=data_object.get_times(),
**self.h5args)
dataset.dims.create_scale(tscale, 'time')
dataset.dims[1].attach_scale(tscale)
dataset.dims[1].label = 'time'
tscale.attrs['unit'] = data_object.tunit
return dataset
def add_nonuniform_1d(self, source_ds, data_object,
source_name_dict=None, fixed=False):
"""Add nonuniform data when data from each source is in a separate 1D
dataset.
For a population of sources called {population}, a mapping dataset
`/map/nonuniform/{population}/{variable}` must be created first
(using add_nonuniform_ds_1d). This is passed as the `source_ds` argument.
When adding the data, the uid of the sources and the names for
the corresponding datasets must be specified and this function
will create one dataset for each source under
`/data/nonuniform/{population}/{name}` where {name} is the
name of the data_object, preferably the name of the field
being recorded.
This function can be used when different sources in a
population are sampled at different time points for a field
value. Such a case may arise when each member of the population
is simulated using a variable timestep method like CVODE and
this timestep is not global.
Args:
source_ds (HDF5 dataset): the dataset
`/map/nonuniform/{population}/{variable}` created for
this population of sources (created by
add_nonuniform_ds_1d).
data_object (nsdf.NonuniformData): NSDFData object storing
the data for all sources in `source_ds`.
source_name_dict (dict): mapping from source id to dataset
name. If None (default), the uids of the sources will
be used as dataset names. If the uids are not
compatible with HDF5 names (contain '.' or '/'), then
the index of the source in source_ds will be used.
fixed (bool): if True, the data cannot grow. Default:
False
Returns:
dict mapping source ids to the tuple (dataset, time).
Raises:
AssertionError when dialect is not ONED.
"""
assert self.dialect == dialect.ONED, \
'add 1D dataset under nonuniform only for dialect=ONED'
if source_name_dict is None:
names = np.asarray(source_ds['source'], dtype=str)
if np.any((np.char.find(names, '/') >= 0) |
(np.char.find(names, '.') >= 0)):
names = [str(index) for index in range(len(names))]
source_name_dict = dict(zip(source_ds['source'], names))
assert len(set(source_name_dict.values())) == len(source_ds), \
'The names in `source_name_dict` must be unique'
popname = source_ds.name.split('/')[-2]
ngrp = self.data[NONUNIFORM].require_group(popname)
assert match_datasets(source_name_dict.keys(),
data_object.get_sources()), \
'sources in `source_name_dict`' \
' do not match those in `data_object`'
assert match_datasets(source_ds['source'],
source_name_dict.keys()), \
'sources in mapping dataset do not match those with data'
datagrp = ngrp.require_group(data_object.name)
datagrp.attrs['source'] = source_ds.ref
datagrp.attrs['unit'] = data_object.unit
datagrp.attrs['field'] = data_object.field
ret = {}
for iii, source in enumerate(source_ds['source']):
data, time = data_object.get_data(source)
dsetname = source_name_dict[source]
timescale = None
try:
dset = datagrp[dsetname]
oldlen = dset.shape[0]
timescale = dset.dims[0]['time']
dset.resize((oldlen + len(data),))
dset[oldlen:] = data
timescale.resize((oldlen + len(data),))
timescale[oldlen:] = time
except KeyError:
if data_object.unit is None:
raise ValueError('`unit` is required'
' for creating dataset.')
if data_object.tunit is None:
raise ValueError('`tunit` is required'
' for creating dataset.')
maxcol = len(data) if fixed else None
dset = datagrp.create_dataset(
dsetname,
shape=(len(data),),
dtype=data_object.dtype,
data=data,
maxshape=(maxcol,),
**self.h5args)
dset.attrs['unit'] = data_object.unit
dset.attrs['field'] = data_object.field
dset.attrs['source'] = source
source_ds[iii] = (source, dset.ref)
# Using {popname}_{variablename}_{dsetname} for
# simplicity. What about creating a hierarchy?
tsname = '{}_{}_{}'.format(popname, data_object.name, dsetname)
timescale = self.time_dim.create_dataset(
tsname,
shape=(len(data),),
dtype=data_object.ttype,
data=time,
maxshape=(maxcol,),
**self.h5args)
dset.dims.create_scale(timescale, 'time')
dset.dims[0].label = 'time'
dset.dims[0].attach_scale(timescale)
timescale.attrs['unit'] = data_object.tunit
ret[source] = (dset, timescale)
return ret
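# Usage sketch (illustrative, not part of the original source; assumes
# nsdf.NonuniformData with a put_data(source_uid, (values, times)) method
# matching the (data, time) tuples that get_data returns above):
#
#     src = writer.add_nonuniform_ds_1d('purkinje', 'Im', ['pk_0', 'pk_1'])
#     im = NonuniformData('Im', unit='pA', field='Im', tunit='ms')
#     im.put_data('pk_0', ([0.1, 0.2, 0.5], [1.0, 3.5, 7.2]))
#     im.put_data('pk_1', ([0.3, 0.1], [2.0, 8.1]))
#     writer.add_nonuniform_1d(src, im)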
def add_nonuniform_vlen(self, source_ds, data_object,
fixed=False):
"""Add nonuniform data when data from all sources in a population is
stored in a 2D ragged array.
When adding the data, the uid of the sources and the names for
the corresponding datasets must be specified and this function
will create the dataset `/data/nonuniform/{population}/{name}`
where {name} is the first argument, preferably the name of the
field being recorded.
This function can be used when different sources in a
population are sampled at different time points for a field
value. Such a case may arise when each member of the population
is simulated using a variable timestep method like CVODE and
this timestep is not global.
Args:
source_ds (HDF5 dataset): the dataset under
`/map/nonuniform` created for this population of
sources (created by add_nonuniform_ds).
data_object (nsdf.NonuniformData): NSDFData object storing
the data for all sources in `source_ds`.
fixed (bool): if True, this is a one-time write and the
data cannot grow. Default: False
Returns:
tuple containing HDF5 Datasets for the data and sampling
times.
TODO:
Concatenating old data with new data and reassigning is a poor
choice. waiting for response from h5py mailing list about
appending data to rows of vlen datasets. If that is not
possible, vlen dataset is a technically poor choice.
h5py does not support vlen datasets with float64
elements. Change dtype to np.float64 once that is
developed.
"""
if self.dialect != dialect.VLEN:
raise Exception('add 2D vlen dataset under nonuniform'
' only for dialect=VLEN')
popname = source_ds.name.rpartition('/')[-1]
ngrp = self.data[NONUNIFORM].require_group(popname)
if not match_datasets(source_ds, data_object.get_sources()):
raise KeyError('members of `source_ds` must match sources in'
' `data_object`.')
# Using {popname}_{variablename} for simplicity. What
# about creating a hierarchy?
tsname = '{}_{}'.format(popname, data_object.name)
try:
dataset = ngrp[data_object.name]
time_ds = self.time_dim[tsname]
except KeyError:
if data_object.unit is None:
raise ValueError('`unit` is required for creating dataset.')
if data_object.tunit is None:
raise ValueError('`tunit` is required for creating dataset.')
vlentype = h5.special_dtype(vlen=data_object.dtype)
maxrows = source_ds.shape[0] if fixed else None
# Fix me: is there any point of keeping the compression
# and shuffle options?
dataset = ngrp.create_dataset(
data_object.name,
shape=source_ds.shape,
dtype=vlentype,
**self.h5args)
dataset.attrs['field'] = data_object.field
dataset.attrs['unit'] = data_object.unit
dataset.dims.create_scale(source_ds, 'source')
dataset.dims[0].attach_scale(source_ds)
dataset.dims[0].label = 'source'
# FIXME: VLENFLOAT should be made VLENDOUBLE whenever h5py
# fixes it
time_ds = self.time_dim.create_dataset(
tsname,
shape=dataset.shape,
maxshape=(maxrows,),
dtype=VLENFLOAT,
**self.h5args)
dataset.dims.create_scale(time_ds, 'time')
dataset.dims[0].attach_scale(time_ds)
dataset.dims[0].label = 'time'
time_ds.attrs['unit'] = data_object.tunit
for iii, source in enumerate(source_ds):
data, time, = data_object.get_data(source)
dataset[iii] = np.concatenate((dataset[iii], data))
time_ds[iii] = np.concatenate((time_ds[iii], time))
return dataset, time_ds
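# Usage sketch (illustrative, not part of the original source): with a
# writer opened using dialect=dialect.VLEN, the same kind of NonuniformData
# object shown for add_nonuniform_1d can be stored as a 2D ragged dataset:
#
#     src = writer.add_nonuniform_ds('purkinje', ['pk_0', 'pk_1'])
#     dataset, times = writer.add_nonuniform_vlen(src, im)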
def add_nonuniform_nan(self, source_ds, data_object, fixed=False):
"""Add nonuniform data when data from all sources in a population is
stored in a 2D array with NaN padding.
Args:
source_ds (HDF5 Dataset): the dataset under
`/map/nonuniform` created for this population of
sources (created by add_nonuniform_ds).
data_object (nsdf.NonuniformData): NSDFData object storing
the data for all sources in `source_ds`.
fixed (bool): if True, this is a one-time write and the
data cannot grow. Default: False
Returns:
HDF5 Dataset containing the data.
Notes:
Concatenating old data with new data and reassigning is a
poor choice for saving data incrementally. HDF5 does not
seem to support appending data to VLEN datasets.
h5py does not support vlen datasets with float64
elements. Change dtype to np.float64 once that is
developed.
"""
assert self.dialect == dialect.NANPADDED, \
'add 2D dataset under `nonuniform` only for dialect=NANPADDED'
popname = source_ds.name.rpartition('/')[-1]
ngrp = self.data[NONUNIFORM].require_group(popname)
if not match_datasets(source_ds, data_object.get_sources()):
raise KeyError('members of `source_ds` must match sources '
'in `data_object`.')
# Using {popname}_{variablename} for simplicity. What
# about creating a hierarchy?
tsname = '{}_{}'.format(popname, data_object.name)
cols = [len(data_object.get_data(source)[0]) for source in
source_ds]
starts = np.zeros(source_ds.shape[0], dtype=int)
ends = np.asarray(cols, dtype=int)
try:
dataset = ngrp[data_object.name]
for iii in range(source_ds.shape[0]):
try:
starts[iii] = next(find(dataset[iii], np.isnan))[0][0]
except StopIteration:
starts[iii] = len(dataset[iii])
ends[iii] = starts[iii] + cols[iii]
dataset.resize(max(ends), 1)
time_ds = self.time_dim[tsname]
time_ds.resize(max(ends), 1)
except KeyError:
if data_object.unit is None:
raise ValueError('`unit` is required for creating dataset.')
if data_object.tunit is None:
raise ValueError('`tunit` is required for creating dataset.')
maxrows = len(source_ds) if fixed else None
maxcols = max(cols) if fixed else None
dataset = ngrp.create_dataset(
data_object.name,
shape=(source_ds.shape[0], max(ends)),
maxshape=(maxrows, maxcols),
fillvalue=np.nan,
dtype=data_object.dtype,
**self.h5args)
dataset.attrs['field'] = data_object.field
dataset.attrs['unit'] = data_object.unit
dataset.dims.create_scale(source_ds, 'source')
dataset.dims[0].attach_scale(source_ds)
dataset.dims[0].label = 'source'
time_ds = self.time_dim.create_dataset(
tsname,
shape=dataset.shape,
maxshape=(maxrows,maxcols),
dtype=data_object.ttype,
fillvalue=np.nan,
**self.h5args)
dataset.dims.create_scale(time_ds, 'time')
dataset.dims[1].attach_scale(time_ds)
dataset.dims[1].label = 'time'
time_ds.attrs['unit'] = data_object.tunit
for iii, source in enumerate(source_ds):
data, time = data_object.get_data(source)
dataset[iii, starts[iii]:ends[iii]] = data
time_ds[iii, starts[iii]:ends[iii]] = time
return dataset
def add_event_1d(self, source_ds, data_object, source_name_dict=None,
fixed=False):
"""Add event time data when data from each source is in a separate 1D
dataset.
For a population of sources called {population}, a group
`/map/event/{population}` must be first created (using
add_event_ds). This is passed as `source_ds` argument.
When adding the data, the uid of the sources and the names for
the corresponding datasets must be specified in
`source_name_dict` and this function will create one dataset
for each source under `/data/event/{population}/{name}` where
{name} is the name of the data_object, preferably the field
name.
Args:
source_ds (HDF5 Dataset): the dataset
`/map/event/{populationname}/{variablename}` created
for this population of sources (created by
add_event_ds_1d). The name of this group reflects
that of the group under `/data/event` which stores the
datasets.
data_object (nsdf.EventData): NSDFData object storing
the data for all sources in `source_ds`.
source_name_dict (dict): mapping from source id to dataset
name. If None (default) it tries to use the uids in
the source_ds. If the uids do not fit the hdf5 naming
convention, the index of the entries in source_ds will
be used.
fixed (bool): if True, the data cannot grow. Default:
False
Returns:
dict mapping source ids to datasets.
"""
assert ((self.dialect == dialect.ONED) or
self.dialect == dialect.NUREGULAR), \
'add 1D dataset under event only for dialect=ONED or NUREGULAR'
if source_name_dict is None:
names = np.asarray(source_ds['source'], dtype=str)
if np.any((np.char.find(names, '/') >= 0) |
(np.char.find(names, '.') >= 0)):
names = [str(index) for index in range(len(names))]
source_name_dict = dict(zip(source_ds['source'], names))
assert len(set(source_name_dict.values())) == len(source_ds), \
'The names in `source_name_dict` must be unique'
popname = source_ds.name.split('/')[-2]
ngrp = self.data[EVENT].require_group(popname)
assert match_datasets(source_name_dict.keys(),
data_object.get_sources()), \
'number of sources do not match number of datasets'
datagrp = ngrp.require_group(data_object.name)
datagrp.attrs['source'] = source_ds.ref
datagrp.attrs['unit'] = data_object.unit
datagrp.attrs['field'] = data_object.field
ret = {}
for iii, source in enumerate(source_ds['source']):
data = data_object.get_data(source)
dsetname = source_name_dict[source]
try:
dset = datagrp[dsetname]
oldlen = dset.shape[0]
dset.resize((oldlen + len(data),))
dset[oldlen:] = data
except KeyError:
if data_object.unit is None:
raise ValueError('`unit` is required for creating dataset.')
if data_object.field is None:
raise ValueError('`field` is required for creating dataset.')
maxrows = len(data) if fixed else None
dset = datagrp.create_dataset(
dsetname,
shape=(len(data),),
dtype=data_object.dtype, data=data,
maxshape=(maxrows,),
**self.h5args)
dset.attrs['unit'] = data_object.unit
dset.attrs['field'] = data_object.field
dset.attrs['source'] = source
source_ds[iii] = (source, dset.ref)
ret[source] = dset
return ret
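# Usage sketch (illustrative, not part of the original source; assumes
# nsdf.EventData with a put_data(source_uid, times) method as in
# nsdf.nsdfdata):
#
#     src = writer.add_event_ds_1d('granule', 'spike', ['granule_0', 'granule_1'])
#     spikes = EventData('spike', unit='ms', field='spiketime')
#     spikes.put_data('granule_0', [1.0, 5.2, 9.7])
#     spikes.put_data('granule_1', [2.3, 6.6])
#     writer.add_event_1d(src, spikes)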
def add_event_vlen(self, source_ds, data_object, fixed=False):
"""Add event data when data from all sources in a population is
stored in a 2D ragged array.
When adding the data, the uid of the sources and the names for
the corresponding datasets must be specified and this function
will create the dataset `/data/event/{population}/{name}`
where {name} is name of the data_object, preferably the name
of the field being recorded.
Args:
source_ds (HDF5 Dataset): the dataset under
`/map/event` created for this population of
sources (created by add_event_ds).
data_object (nsdf.EventData): NSDFData object storing
the data for all sources in `source_ds`.
fixed (bool): if True, this is a one-time write and the
data cannot grow. Default: False
Returns:
HDF5 Dataset containing the data.
Notes:
Concatenating old data with new data and reassigning is a
poor choice for saving data incrementally. HDF5 does not
seem to support appending data to VLEN datasets.
h5py does not support vlen datasets with float64
elements. Change dtype to np.float64 once that is
developed.
"""
if self.dialect != dialect.VLEN:
raise Exception('add 2D vlen dataset under event'
' only for dialect=VLEN')
popname = source_ds.name.rpartition('/')[-1]
ngrp = self.data[EVENT].require_group(popname)
if not match_datasets(source_ds, data_object.get_sources()):
raise KeyError('members of `source_ds` must match sources '
'in `data_object`.')
try:
dataset = ngrp[data_object.name]
except KeyError:
if data_object.unit is None:
raise ValueError('`unit` is required for creating dataset.')
vlentype = h5.special_dtype(vlen=data_object.dtype)
maxrows = len(source_ds) if fixed else None
# Fix me: is there any point of keeping the compression
# and shuffle options?
dataset = ngrp.create_dataset(
data_object.name, shape=source_ds.shape,
maxshape=(maxrows,),
dtype=vlentype,
**self.h5args)
dataset.attrs['field'] = data_object.field
dataset.attrs['unit'] = data_object.unit
dataset.dims.create_scale(source_ds, 'source')
dataset.dims[0].attach_scale(source_ds)
dataset.dims[0].label = 'source'
for iii, source in enumerate(source_ds):
data = data_object.get_data(source)
dataset[iii] = np.concatenate((dataset[iii], data))
return dataset
def add_event_nan(self, source_ds, data_object, fixed=False):
"""Add event data when data from all sources in a population is
stored in a 2D array with NaN padding.
Args:
source_ds (HDF5 Dataset): the dataset under
`/map/event` created for this population of
sources (created by add_event_ds).
data_object (nsdf.EventData): NSDFData object storing
the data for all sources in `source_ds`.
fixed (bool): if True, this is a one-time write and the
data cannot grow. Default: False
Returns:
HDF5 Dataset containing the data.
"""
assert self.dialect == dialect.NANPADDED, \
'add 2D vlen dataset under event only for dialect=NANPADDED'
popname = source_ds.name.rpartition('/')[-1]
ngrp = self.data[EVENT].require_group(popname)
if not match_datasets(source_ds, data_object.get_sources()):
raise KeyError('members of `source_ds` must match sources '
'in `data_object`.')
cols = [len(data_object.get_data(source)) for source in
source_ds]
starts = np.zeros(source_ds.shape[0], dtype=int)
ends = np.asarray(cols, dtype=int)
try:
dataset = ngrp[data_object.name]
for iii in range(dataset.shape[0]):
try:
starts[iii] = next(find(dataset[iii], np.isnan))[0][0]
except StopIteration:
starts[iii] = len(dataset[iii])
ends[iii] = starts[iii] + cols[iii]
dataset.resize(max(ends), 1)
except KeyError:
if data_object.unit is None:
raise ValueError('`unit` is required for creating dataset.')
maxrows = len(source_ds) if fixed else None
maxcols = max(ends) if fixed else None
dataset = ngrp.create_dataset(
data_object.name,
shape=(source_ds.shape[0], max(ends)),
maxshape=(maxrows, maxcols),
dtype=data_object.dtype,
fillvalue=np.nan,
**self.h5args)
dataset.attrs['field'] = data_object.field
dataset.attrs['unit'] = data_object.unit
dataset.dims.create_scale(source_ds, 'source')
dataset.dims[0].attach_scale(source_ds)
dataset.dims[0].label = 'source'
for iii, source in enumerate(source_ds):
data = data_object.get_data(source)
dataset[iii, starts[iii]:ends[iii]] = data
return dataset
def add_static_data(self, source_ds, data_object,
fixed=True):
"""Append static data `variable` values from `sources` to `data`.
Args:
source_ds (HDF5 Dataset): the dataset storing the source
ids under map. This is attached to the stored data as
a dimension scale called `source` on the row
dimension.
data_object (nsdf.StaticData): NSDFData object storing
the data for all sources in `source_ds`.
fixed (bool): if True, the data cannot grow. Default: True
Returns:
HDF5 dataset storing the data
Raises:
KeyError if the sources in `data_object` do not match
those in `source_ds`.
"""
popname = source_ds.name.rpartition('/')[-1]
ugrp = self.data[STATIC].require_group(popname)
if not match_datasets(source_ds, data_object.get_sources()):
raise KeyError('members of `source_ds` must match sources in'
' `data_object`.')
ordered_data = [data_object.get_data(src) for src in source_ds]
data = np.vstack(ordered_data)
try:
dataset = ugrp[data_object.name]
oldcolcount = dataset.shape[1]
dataset.resize(oldcolcount + data.shape[1], axis=1)
dataset[:, oldcolcount:] = data
except KeyError:
if data_object.unit is None:
raise ValueError('`unit` is required for creating dataset.')
maxcol = None
if fixed:
maxcol = data.shape[1]
dataset = ugrp.create_dataset(
data_object.name, shape=data.shape,
dtype=data_object.dtype,
data=data,
maxshape=(data.shape[0], maxcol),
**self.h5args)
dataset.dims.create_scale(source_ds, 'source')
dataset.dims[0].attach_scale(source_ds)
dataset.dims[0].label = 'source'
dataset.attrs['field'] = data_object.field
dataset.attrs['unit'] = data_object.unit
return dataset
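# Usage sketch (illustrative, not part of the original source; assumes
# nsdf.StaticData with a put_data(source_uid, values) method as in
# nsdf.nsdfdata):
#
#     src = writer.add_static_ds('compartments', ['comp_0', 'comp_1'])
#     ra = StaticData('Ra', unit='ohm', field='Ra')
#     ra.put_data('comp_0', [1.5e6])
#     ra.put_data('comp_1', [2.3e6])
#     writer.add_static_data(src, ra)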
#
# nsdfwriter.py ends here