"""
Proposed Extensions
-------------------
DataSet.split(scheme={devtraintest, crossvalidate, traintest}) returns DataSets
DataSets.join() returns DataSet (combines train or cross validation)
DataSet + DataSet returns DataSet
DataSets + DataSets returns DataSets
DataSets constructor from list of DataSet objects
DataSet for Online data
DataSet for Sequence data
Binary data formats for Streaming data
Loading, Saving, and Testing
----------------------------
:any:`save`
:any:`load`
:any:`is_one_hot`
:any:`read_data_sets`
:any:`untar`
:any:`maybe_download`
Classes
-------
:any:`DataSet`
:any:`DataSets`
:any:`IndexVector`
:any:`HotIndex`
Data Transforms
---------------
:any:`center`
:any:`l1normalize`
:any:`l2normalize`
:any:`tfidf`
:any:`toOnehot`
:any:`toIndex`
:any:`unit_variance`
Exceptions
----------
:any:`BadDirectoryStructureError`
:any:`MatFormatError`
:any:`SparseFormatError`
:any:`UnsupportedFormatError`
"""
import struct
import numpy as np
import scipy.io
import os
import scipy.sparse as sps
from antk.lib import termcolor as tc
import tarfile
import os.path
import urllib
import numbers
import repr
import antk.core
# Path separator used when file names are split or joined by hand:
# a backslash on Windows ('nt'), a forward slash everywhere else.
slash = '\\' if os.name == 'nt' else '/'
class BadDirectoryStructureError(Exception):
    """
    Signals that a data directory is missing one of the subfolders
    requested through the *folders* argument of :any:`read_data_sets`.
    """
# ==============================================================================================
# =============================DATA STRUCTURES==================================================
# ==============================================================================================
class DataSet(object):
    """
    Data structure for mini-batch gradient descent training involving non-sequential data.

    :param features: (dict) A dictionary of string label names to data matrices.
        Matrices may be of types :any:`IndexVector`, scipy sparse csr_matrix, or numpy array.
    :param labels: (dict) A dictionary of string label names to data matrices.
        Matrices may be of types :any:`IndexVector`, scipy sparse csr_matrix, or numpy array.
    :param mix: (boolean) Whether or not to shuffle per epoch.
    :raises ValueError: If **features** is empty (the number of examples cannot be determined).

    :examples:

    >>> import numpy as np
    >>> from antk.core.loader import DataSet
    >>> d = DataSet({'id': np.eye(5)}, labels={'ones':np.ones((5, 2))})
    >>> d #doctest: +NORMALIZE_WHITESPACE
    antk.core.DataSet object with fields:
    '_labels': {'ones': array([[ 1., 1.],
    [ 1., 1.],
    [ 1., 1.],
    [ 1., 1.],
    [ 1., 1.]])}
    'mix_after_epoch': False
    '_num_examples': 5
    '_index_in_epoch': 0
    '_last_batch_size': 5
    '_features': {'id': array([[ 1., 0., 0., 0., 0.],
    [ 0., 1., 0., 0., 0.],
    [ 0., 0., 1., 0., 0.],
    [ 0., 0., 0., 1., 0.],
    [ 0., 0., 0., 0., 1.]])}
    >>> d.show() #doctest: +NORMALIZE_WHITESPACE
    features:
    id: (5, 5) <type 'numpy.ndarray'>
    labels:
    ones: (5, 2) <type 'numpy.ndarray'>
    >>> d.next_batch(3) #doctest: +NORMALIZE_WHITESPACE
    antk.core.DataSet object with fields:
    '_labels': {'ones': array([[ 1., 1.],
    [ 1., 1.],
    [ 1., 1.]])}
    'mix_after_epoch': False
    '_num_examples': 3
    '_index_in_epoch': 0
    '_last_batch_size': 3
    '_features': {'id': array([[ 1., 0., 0., 0., 0.],
    [ 0., 1., 0., 0., 0.],
    [ 0., 0., 1., 0., 0.]])}
    """

    def __init__(self, features, labels=None, mix=False):
        if not features:
            raise ValueError('features must contain at least one matrix.')
        self._features = features  # hashmap of feature matrices
        # The row count of one (arbitrary) feature matrix is taken as the number
        # of examples. next(iter(...)) works on both Python 2 and Python 3 dicts.
        self._num_examples = next(iter(features.values())).shape[0]
        self._labels = labels if labels else {}  # hashmap of label matrices
        self._index_in_epoch = 0
        self.mix_after_epoch = mix
        self._last_batch_size = self._num_examples

    def __repr__(self):
        attrs = vars(self)
        return 'antk.core.DataSet object with fields:\n' + '\n'.join("\t%r: %r" % item for item in attrs.items())

    # ======================================================================================
    # =============================PROPERTIES===============================================
    # ======================================================================================
    @property
    def features(self):
        """
        :attribute: (dict) A dictionary with string keys and feature matrix values.
        """
        return self._features

    @property
    def index_in_epoch(self):
        """
        :attribute: (int) The number of data points that have been trained on in a particular epoch.
        """
        return self._index_in_epoch

    @property
    def labels(self):
        """
        :attribute: (dict) A dictionary with string keys and label matrix values.
        """
        return self._labels

    @property
    def num_examples(self):
        """
        :attribute: (int) Number of rows (data points) of the matrices in this :any:`DataSet`.
        """
        return self._num_examples

    # ======================================================================================
    # =============================PUBLIC METHODS===========================================
    # ======================================================================================
    def reset_index_to_zero(self):
        """
        :method: Sets :any:`index_in_epoch` to 0.
        """
        self._index_in_epoch = 0

    def next_batch(self, batch_size):
        """
        :method:
            Return a sub DataSet of the next `batch_size` examples.

            Asking for a different batch size than the previous call restarts the
            epoch (:any:`index_in_epoch` is reset to 0) before the batch is taken.

            If no shuffling (mix=False):
                If `batch_size` is greater than the number of examples left in
                the epoch, the batch wraps around: the rows from
                :any:`index_in_epoch` to the end are followed by rows from the
                beginning until `batch_size` rows are collected.
            If shuffling enabled (mix=True):
                If `batch_size` is greater than the number of examples left in the epoch,
                points are shuffled and a `batch_size` DataSet is returned starting from index 0.
                If the batch ends exactly at the end of the epoch, the batch is
                returned first and the data is shuffled afterwards.

        :param batch_size: (int) The number of rows in the matrices of the sub DataSet.
            Must not exceed :any:`num_examples`.
        :return: :any:`DataSet`
        """
        if batch_size != self._last_batch_size and self._index_in_epoch != 0:
            self.reset_index_to_zero()
        self._last_batch_size = batch_size
        assert batch_size <= self._num_examples
        start = self._index_in_epoch
        if start + batch_size > self._num_examples:
            if self.mix_after_epoch:
                # Not enough rows left: start a fresh, reshuffled epoch.
                self.shuffle()
                start, end = 0, batch_size
                self._index_in_epoch = batch_size
            else:
                # end <= start here, which tells _next_batch_ to wrap around.
                end = (start + batch_size) % self._num_examples
                self._index_in_epoch = end
            return DataSet(self._next_batch_(self._features, start, end),
                           self._next_batch_(self._labels, start, end))
        end = start + batch_size
        # Slice BEFORE any end-of-epoch shuffle so the final batch of an epoch
        # holds the remaining unseen rows rather than rows of the next ordering.
        batch = DataSet(self._next_batch_(self._features, start, end),
                        self._next_batch_(self._labels, start, end))
        self._index_in_epoch = end % self._num_examples
        if self._index_in_epoch == 0 and self.mix_after_epoch:
            self.shuffle()
        return batch

    def show(self):
        """
        :method: Prints the data specs (dimensions, keys, type) in the :any:`DataSet` object
        """
        print('features:')
        for name, feature in self.features.items():
            if isinstance(feature, IndexVector):
                print('\t %s: vec.shape: %s dim: %s %s' % (name, feature.vec.shape,
                                                           feature.dim, type(feature)))
            else:
                print('\t %s: %s %s' % (name, feature.shape, type(feature)))
        print('labels:')
        for name, label in self.labels.items():
            if isinstance(label, IndexVector):
                print('\t %s: vec.shape: %s dim: %s %s' % (name, (label.vec.shape),
                                                           label.dim, type(label)))
            else:
                print('\t %s: %s %s' % (name, label.shape, type(label)))

    def showmore(self):
        """
        :method: Prints the keys of the matrices in the :any:`DataSet` object,
            along with a sample of up to the first five rows of each matrix.
        """
        print('features:')
        for name, feature in self.features.items():
            row = min(5, feature.shape[0])
            print('\t %s: \nFirst %s rows:\n%s\n' % (name, row, feature[0:row]))
        print('labels:')
        for name, label in self.labels.items():
            row = min(5, label.shape[0])
            print('\t %s: \nFirst %s rows:\n%s\n' % (name, row, label[0:row]))

    def shuffle(self):
        """
        :method: The same random permutation is applied to the
            rows of all the matrices in :any:`features` and :any:`labels` .
        """
        perm = np.arange(self._num_examples)
        np.random.shuffle(perm)
        self._shuffle_(perm, self._features)
        self._shuffle_(perm, self._labels)

    def _shuffle_(self, order, datamap):
        """
        Reorders the rows of every matrix in **datamap**; the dictionary entries
        are rebound to the reordered matrices.

        :param order: A list of the indices for the row permutation
        :param datamap: A hash map of matrices
        :return: void
        """
        for name in datamap:
            matrix = datamap[name]
            # isinstance (not an exact type check) so subclasses such as
            # HotIndex are rebuilt from their index vector as well.
            if isinstance(matrix, IndexVector):
                datamap[name] = IndexVector(matrix.vec[order], matrix.dim)
            else:
                datamap[name] = matrix[order]

    def _next_batch_(self, datamap, start, end=None):
        """
        :param datamap: A hash map of matrices
        :param start: starting row
        :param end: ending row (defaults to the number of examples). When
            ``end <= start`` the slice wraps: rows ``start`` to the last row are
            followed by rows ``0`` to ``end``.
        :return: A hash map of slices of matrices from row start to row end.
            :any:`IndexVector` entries are returned as slices of their index vector.
        """
        if end is None:
            end = self._num_examples
        wraps = end <= start
        batch_data_map = {}
        for name, matrix in datamap.items():
            rows = matrix.vec if isinstance(matrix, IndexVector) else matrix
            if not wraps:
                batch_data_map[name] = rows[start:end]
                continue
            tail = rows[start:self._num_examples]
            head = rows[0:end]
            if sps.issparse(tail):
                batch_data_map[name] = sps.vstack([tail, head])
            else:
                batch_data_map[name] = np.concatenate([tail, head], axis=0)
        return batch_data_map
class DataSets(object):
    """
    A record of :any:`DataSet` objects.

    :param datasets_map: (dict) A dictionary with string keys and `DataSet` objects as values,
        or with dictionaries holding a ``'features'`` map (and optionally a ``'labels'`` map)
        from which `DataSet` objects are built.
    :param mix: (boolean) Whether or not to enable shuffling for mini-batching. Only used
        when the `DataSet` objects are built here.
    :attributes: (:any:`DataSet`) There is an attribute for each key value pair in
        **datasets_map** argument.

    :examples:

    >>> import numpy as np
    >>> from antk.core.loader import DataSets
    >>> from antk.core.loader import DataSet
    >>> d = DataSets({'train': DataSet({'id': np.eye(5)}, labels={'one': np.ones((5,6))}),
    ...               'dev': DataSet({'id': 5*np.eye(2)}, labels={'one': 5*np.ones((2,6))})})
    >>> d.show() #doctest: +SKIP
    dev:
    features:
    id: (2, 2) <type 'numpy.ndarray'>
    labels:
    one: (2, 6) <type 'numpy.ndarray'>
    train:
    features:
    id: (5, 5) <type 'numpy.ndarray'>
    labels:
    one: (5, 6) <type 'numpy.ndarray'>
    >>> d.showmore() #doctest: +SKIP
    dev:
    features:
    id:
    First 2 rows:
    [[ 5. 0.]
    [ 0. 5.]]
    <BLANKLINE>
    labels:
    one:
    First 2 rows:
    [[ 5. 5. 5. 5. 5. 5.]
    [ 5. 5. 5. 5. 5. 5.]]
    <BLANKLINE>
    train:
    features:
    id:
    First 5 rows:
    [[ 1. 0. 0. 0. 0.]
    [ 0. 1. 0. 0. 0.]
    [ 0. 0. 1. 0. 0.]
    [ 0. 0. 0. 1. 0.]
    [ 0. 0. 0. 0. 1.]]
    <BLANKLINE>
    labels:
    one:
    First 5 rows:
    [[ 1. 1. 1. 1. 1. 1.]
    [ 1. 1. 1. 1. 1. 1.]
    [ 1. 1. 1. 1. 1. 1.]
    [ 1. 1. 1. 1. 1. 1.]
    [ 1. 1. 1. 1. 1. 1.]]
    <BLANKLINE>
    """

    def __init__(self, datasets_map=None, mix=False):
        # None instead of a shared mutable {} default; an empty or missing map
        # simply yields a record without attributes.
        if not datasets_map:
            return
        # The type of the first value decides how the whole map is interpreted.
        first = next(iter(datasets_map.values()))
        if isinstance(first, DataSet):
            for name, dataset in datasets_map.items():
                setattr(self, name, dataset)
        else:
            for name, spec in datasets_map.items():
                setattr(self, name, DataSet(spec['features'], spec.get('labels'), mix=mix))

    def __repr__(self):
        attrs = vars(self)
        return 'antk.core.DataSets object with fields:\n' + '\n'.join("\t%s: %s" % item for item in attrs.items())

    def show(self):
        """
        :method: Pretty print data attributes.
        """
        # vars(self) holds exactly the attributes set in __init__ (the data sets);
        # sorted to print them in alphabetical order.
        for dataset in sorted(vars(self)):
            print(tc.colored(dataset + ':', 'yellow'))
            getattr(self, dataset).show()

    def showmore(self):
        """
        :method: Pretty print data attributes, and data.
        """
        for dataset in sorted(vars(self)):
            print(tc.colored(dataset + ':', 'yellow'))
            getattr(self, dataset).showmore()
class IndexVector(object):
    """
    Index vector representation of one hot matrix.

    :param matrix: (scipy.sparse.csr_matrix or numpy array)
        A one hot matrix or vector of *on* indices of a one hot matrix.
        If **matrix** is a vector of indices and no dimension argument is supplied
        then **dimension** is set to the maximum index value + 1.
    :param dimension: (int) The number of columns in the one hot matrix to be represented.
    :raises ValueError: If **matrix** is not one hot and does not have an integer dtype.

    .. note::
        IndexVector objects implement the python sequence protocol, so slicing,
        indexing and iteration behave as you might expect.
        Slices of an IndexVector return another IndexVector.
        Indexing returns an integer. Iteration will loop over all the elements
        in the :any:`vec` attribute.

    :examples:

    >>> import numpy as np
    >>> from antk.core import loader
    >>> xhot = np.array([[1,0,0], [1,0,0], [0,1,0], [0,0,1]])
    >>> xindex = loader.IndexVector(xhot)
    >>> xindex.vec
    array([0, 0, 1, 2])
    >>> xindex.dim
    3
    >>> xindex.hot() #doctest: +NORMALIZE_WHITESPACE
    <4x3 sparse matrix of type '<type 'numpy.float64'>'
    with 4 stored elements in Compressed Sparse Row format>
    >>> xindex.hot().toarray() #doctest: +NORMALIZE_WHITESPACE
    array([[ 1., 0., 0.],
    [ 1., 0., 0.],
    [ 0., 1., 0.],
    [ 0., 0., 1.]])
    >>> xindex.shape
    (4, 3)
    >>> xindex
    <class 'antk.core.loader.IndexVector'>(shape=(4, 3))
    vec=[0, 0, 1, 2]
    dim=3
    >>> xindex[0]
    0
    >>> xindex[1:3]
    <class 'antk.core.loader.IndexVector'>(shape=(2, 3))
    vec=[0, 1]
    dim=3
    >>> [index+2 for index in xindex]
    [2, 2, 3, 4]
    """

    def __init__(self, matrix, dimension=None):
        if is_one_hot(matrix):
            self._dim = matrix.shape[1]
            self._vec = toIndex(matrix).flatten()
        else:
            indices = np.asarray(matrix)
            # Accept every integer dtype (e.g. int32, the default integer on
            # Windows), not only int64.
            if not np.issubdtype(indices.dtype, np.integer):
                raise ValueError('Indices must be integers.')
            self._dim = dimension
            if self._dim is None:
                self._dim = np.amax(indices) + 1
            self._vec = indices.flatten()

    def __repr__(self):
        # `repr` here is the module imported at the top of the file (not the
        # builtin); its repr() is applied to the index list.
        vector = repr.repr(self._vec.tolist())
        return '%s(shape=%s)\nvec=%s\ndim=%s' % (type(self), self.shape, vector, self._dim)

    def __str__(self):
        return '%s(shape=%s)\nvec=%s\ndim=%s' % (type(self), self.shape, self._vec, self._dim)

    def __iter__(self):
        return iter(self._vec)

    def __eq__(self, other):
        # Defer to the other operand instead of failing with AttributeError
        # when compared against something that is not an IndexVector.
        if not isinstance(other, IndexVector):
            return NotImplemented
        return np.array_equal(self._vec, other._vec) and self._dim == other._dim

    def __ne__(self, other):
        # Python 2 does not derive != from ==, so define it explicitly.
        result = self.__eq__(other)
        if result is NotImplemented:
            return result
        return not result

    def __len__(self):
        return self._vec.shape[0]

    def __getitem__(self, index):
        cls = type(self)
        if isinstance(index, numbers.Integral):
            return self._vec[index]
        elif isinstance(index, slice):
            # Slices keep the dimension so the result encodes the same columns.
            return cls(self._vec[index], self.dim)
        else:
            raise TypeError('Indices must be integers.')

    @property
    def dim(self):
        """
        :attribute: (int) The feature dimension (number of columns) of the one hot matrix.
        """
        return self._dim

    @property
    def vec(self):
        """
        :attribute: (numpy 1d array) The vector of hot indices.
        """
        return self._vec

    @property
    def shape(self):
        """
        :attribute: (tuple) The shape of the one hot matrix encoded.
        """
        return (self.vec.shape[0], self.dim)

    def hot(self):
        """
        :method:
        :return: A one hot scipy sparse csr_matrix
        """
        return toOnehot(self)
class HotIndex(IndexVector):
    """
    Legacy name for :any:`IndexVector`; the data structure is the same.
    """
# ===================================================================================
# ====================== I/O ========================================================
# ===================================================================================
def load(filename):
    """
    Convenience alias that forwards to :any:`import_data`.

    The file extension decides how the data is read into a python matrix.
    :any:`UnsupportedFormatError` is raised when the extension is not one of the
    supported ones (mat, sparse, binary, dense, sparsetxt, densetxt, index).

    :param filename: (str) A file of an accepted format representing a matrix.
    :return: A numpy matrix, scipy sparse csr_matrix, or :any:`IndexVector`.
    """
    matrix = import_data(filename)
    return matrix
def import_data(filename):
    """
    Decides how to load data into python matrices by file extension.

    :param filename: (str) A file of an accepted format representing a matrix.
    :return: A numpy matrix, scipy sparse csr_matrix, or :any:`IndexVector`.
    :raises UnsupportedFormatError: If the extension is not one of the supported
        extensions (mat, sparse, binary, dense, sparsetxt, densetxt, index).
    :raises MatFormatError: If a .mat file does not contain a matrix named *data*.
    :raises SparseFormatError: If a .sparsetxt file does not have three columns.
    """
    # os.path.basename understands the platform's path separators, so no
    # hand-rolled separator handling is needed to isolate the file name.
    extension = os.path.basename(filename).split('.')[-1].strip()
    if extension == 'mat':
        mat_file_map = scipy.io.loadmat(filename)
        if 'data' not in mat_file_map:
            raise MatFormatError('Matrix in .mat file ' +
                                 filename + ' must be named "data"')
        # A single-row matrix is returned as a column.
        if mat_file_map['data'].shape[0] == 1:
            return np.transpose(mat_file_map['data'])
        return mat_file_map['data']
    elif extension == 'index':
        return _imatload(filename)
    elif extension == 'sparse':
        return _smatload(filename)
    elif extension == 'binary' or extension == 'dense':
        return _matload(filename)
    elif extension == 'sparsetxt':
        # ndmin=2 keeps a file with a single "row col val" line two-dimensional.
        X = np.loadtxt(filename, ndmin=2)
        if X.shape[1] != 3:
            raise SparseFormatError('Sparse Format: row col val')
        # loadtxt yields floats; sparse coordinates must be integers.
        rows = X[:, 0].astype(int)
        cols = X[:, 1].astype(int)
        return sps.csr_matrix((X[:, 2], (rows, cols)))
    elif extension == 'densetxt':
        return np.loadtxt(filename)
    else:
        raise UnsupportedFormatError('Supported extensions: '
                                     'mat, sparse, binary, dense, sparsetxt, densetxt, index')
def save(filename, data):
    """
    Convenience alias that forwards to :any:`export_data`.

    The file extension decides how the data is written.
    :any:`UnsupportedFormatError` is raised when the extension is not one of the
    supported ones (mat, sparse, binary, dense, index).
    Data contained in .mat files should be saved in a matrix named *data*.

    :param filename: (str) A filename with extension of an
        accepted format for representing a matrix.
    :param data: numpy array, scipy sparse matrix, or :any:`IndexVector` object.
    """
    export_data(filename=filename, data=data)
def export_data(filename, data):
    """
    Decides how to save data by file extension.

    Data written to .mat files is stored in a matrix named *data*.

    :param filename: A file of an accepted format representing a matrix.
    :param data: A numpy array, scipy sparse matrix, or :any:`IndexVector` object.
    :raises UnsupportedFormatError: If the extension is not one of the supported
        extensions (mat, sparse, binary, dense, index, sparsetxt, densetxt), or if
        **data** is of a type that cannot be stored in the requested format.
    """
    # os.path.basename understands the platform's path separators, so no
    # hand-rolled separator handling is needed to isolate the file name.
    extension = os.path.basename(filename).split('.')[-1].strip()
    if extension == 'mat':
        scipy.io.savemat(filename, {'data': data})
    elif extension == 'index':
        if not isinstance(data, IndexVector):
            raise UnsupportedFormatError('Only IndexVector objects may be saved in .index format.')
        _imatsave(filename, data)
    elif extension == 'sparse':
        if not sps.issparse(data):
            raise UnsupportedFormatError('Only scipy sparse matrices may be saved in .sparse format.')
        _smatsave(filename, data)
    elif extension == 'binary' or extension == 'dense':
        if sps.issparse(data):
            raise UnsupportedFormatError('Only numpy 2d arrays may be saved in .binary or .dense format.')
        _matsave(filename, data)
    elif extension == 'densetxt':
        if sps.issparse(data):
            raise UnsupportedFormatError('Only numpy 2d arrays may be saved in .densetxt format')
        np.savetxt(filename, data)
    elif extension == 'sparsetxt':
        if not sps.issparse(data):
            raise UnsupportedFormatError('Only scipy sparse matrices may be saved in .sparsetxt format.')
        # Take coordinates and values from the same COO view so they always line
        # up, and drop explicitly stored zeros: one "row col val" line per entry.
        coo = sps.coo_matrix(data)
        keep = coo.data != 0
        triples = np.column_stack((coo.row[keep], coo.col[keep], coo.data[keep]))
        np.savetxt(filename, triples)
    else:
        raise UnsupportedFormatError('Supported extensions: '
                                     'mat, sparse, binary, dense, index, sparsetxt, densetxt')
def _write_int64(file_obj, num):
"""
Writes an 8 byte integer to a file in binary format. From David Palzer.
:param file_obj: The open file object to write to.
:param num: The integer to write, will be converted to a long int.
"""
file_obj.write(struct.pack('q', long(num)))
def _read_int64(file_obj):
"""
Reads an 8 byte binary integer from a file. From David Palzer.
:param file_obj: The open file object from which to read.
:return: The eight bytes read from the file object interpreted as a long int.
"""
return struct.unpack('q', file_obj.read(8))[0]
def _matload(filename):
    """
    Reads in a dense matrix from binary (dense) file filename. From `David Palzer`_.

    The file holds two 8 byte integers (rows, columns) followed by the
    float64 entries in column-major order.

    :param filename: (str) file from which to read.
    :return: The matrix which has been read.
    """
    # Binary mode: the content is raw bytes, not text. The context manager
    # closes the file even if reading fails.
    with open(filename, 'rb') as f:
        m = _read_int64(f)
        n = _read_int64(f)
        x = np.fromfile(f, dtype=np.float64)
    # 'F' is the supported spelling of column-major ("FORTRAN") order.
    return np.array(x.reshape((m, n), order='F'))
def _matsave(filename, x):
    """
    Saves the input matrix to the input file in dense format. From `David Palzer`_.

    The file holds two 8 byte integers (rows, columns) followed by the
    float64 entries in column-major order. A 1d array is stored as a single column.

    :param filename: file to write to
    :param x: matrix to write
    """
    if len(x.shape) == 1:
        x = np.reshape(x, (x.shape[0], 1))
    # The context manager closes the file even if writing fails.
    with open(filename, 'wb') as f:
        _write_int64(f, x.shape[0])
        _write_int64(f, x.shape[1])
        # 'F' is the supported spelling of column-major ("FORTRAN") order.
        x.astype(np.float64, copy=False).ravel('F').tofile(f)
def _smatload(filename):
    """
    Reads in a sparse matrix from file. From `David Palzer`_.

    The file holds three 8 byte integers (rows, columns, number of stored
    entries) followed by one (row, column, value) triple of doubles per entry,
    with 1-based row and column numbers.

    :param filename: the file from which to read
    :return: a sparse csr_matrix created from the sparse data
    """
    # Binary mode: the content is raw bytes, not text. The context manager
    # closes the file even if reading fails.
    with open(filename, 'rb') as f:
        row = _read_int64(f)
        col = _read_int64(f)
        nnz = _read_int64(f)
        S = np.fromfile(f, 'd', 3 * nnz)
    S = S.reshape((nnz, 3))
    # Stored coordinates are 1-based; shift to 0-based indices.
    rows = S[:, 0].astype(int) - 1
    cols = S[:, 1].astype(int) - 1
    vals = S[:, 2]
    return sps.csr_matrix((vals, (rows, cols)), shape=(row, col))
def _smatsave(filename, t):
    """
    Saves the input matrix to the input file in sparse format. From `David Palzer`_.

    The file holds three 8 byte integers (rows, columns, number of stored
    entries) followed by one (row, column, value) triple of doubles per nonzero
    entry, with 1-based row and column numbers.

    :param filename: file to write to
    :param t: matrix to write (anything accepted by ``scipy.sparse.csr_matrix``)
    """
    # Coordinates and values are read from one COO view so they always line up;
    # this also avoids np.mat, which newer numpy versions no longer provide.
    coo = sps.csr_matrix(t, copy=False).tocoo()
    keep = coo.data != 0  # explicitly stored zeros are not written
    # One row per entry, laid out as (row + 1, col + 1, value).
    triples = np.column_stack((coo.row[keep] + 1,
                               coo.col[keep] + 1,
                               coo.data[keep])).astype(np.float64)
    # The context manager closes the file even if writing fails.
    with open(filename, 'wb') as f:
        _write_int64(f, coo.shape[0])
        _write_int64(f, coo.shape[1])
        _write_int64(f, triples.shape[0])
        triples.tofile(f)
def _imatload(filename):
    """
    Reads in a :any:`IndexVector` matrix from file.

    The file holds two 8 byte integers (vector length, dimension) followed by
    the indices stored as doubles, shifted up by one.

    :param filename: the file from which to read where a :any:`IndexVector` object was stored.
    :return: A :any:`IndexVector` object.
    """
    # Binary mode: the content is raw bytes, not text. The context manager
    # closes the file even if reading fails.
    with open(filename, 'rb') as f:
        vec_length = _read_int64(f)
        dim = _read_int64(f)
        vec = np.fromfile(f, 'd', vec_length)
    # Stored indices are 1-based; shift back to 0-based integers.
    vec = vec.astype(int) - 1
    return IndexVector(vec, dim)
def _imatsave(filename, index_vec):
    """
    Saves an :any:`IndexVector` to the input file in index format.

    The file holds two 8 byte integers (vector length, dimension) followed by
    the indices stored as doubles, shifted up by one.

    :param filename: Filename to save to.
    :param index_vec: A :any:`IndexVector` object.
    """
    # Indices are written 1-based; _imatload shifts them back.
    vector = index_vec.vec + 1
    # The context manager closes the file even if writing fails.
    with open(filename, 'wb') as f:
        _write_int64(f, vector.shape[0])
        _write_int64(f, index_vec.dim)
        vector.astype(float, copy=False).tofile(f)
def makedirs(datadirectory, sub_directory_list=('train', 'dev', 'test')):
    """
    Creates a data directory together with its subdirectories.

    :param datadirectory: Name of the directory you want to create containing the subdirectory folders.
        If the directory already exists it will be populated with the subdirectory folders.
    :param sub_directory_list: The list of subdirectories you want to create
    :return: void
    """
    # os.makedirs instead of shelling out to `mkdir`: works for paths containing
    # spaces or shell metacharacters and behaves the same on every platform.
    targets = [datadirectory]
    targets.extend(os.path.join(datadirectory, sub) for sub in sub_directory_list)
    for path in targets:
        # Existing directories are left untouched.
        if not os.path.isdir(path):
            os.makedirs(path)
def read_data_sets(directory, folders=('train', 'dev', 'test'), hashlist=(), mix=False):
    """
    Loads the matrices found in the subfolders of a data directory.

    Files named ``features_<descriptor>.<ext>`` and ``labels_<descriptor>.<ext>`` are
    loaded with :any:`import_data` and stored under ``<descriptor>``.

    :param directory: (str) Root directory containing data to load.
    :param folders: (list or tuple of str) The subfolders of *directory* to read data from.
        By default there are train, dev, and test folders.
        If you want others you have to make an explicit list.
    :param hashlist: (list or tuple of str) If you provide a hashlist these descriptors and
        only these descriptors will be added to your :any:`DataSet` objects.
        If you do not provide a hashlist then anything with
        the privileged prefixes labels_ or features_ will be loaded.
    :param mix: (boolean) Whether to shuffle during mini-batching.
    :return: A :any:`DataSets` object.
    :raises BadDirectoryStructureError: If one of **folders** is missing from **directory**.

    :examples:

    >>> import antk.core.loader as loader
    >>> import numpy as np
    >>> loader.makedirs('/tmp/test_data/')
    >>> loader.save('/tmp/test_data/test/features_id.dense', np.eye(5))
    >>> loader.save('/tmp/test_data/test/features_ones.dense', np.ones((5, 2)))
    >>> loader.save('/tmp/test_data/test/labels_id.dense', np.eye(5))
    >>> loader.save('/tmp/test_data/dev/features_id.dense', np.eye(5))
    >>> loader.save('/tmp/test_data/dev/features_ones.dense', np.ones((5, 2)))
    >>> loader.save('/tmp/test_data/dev/labels_id.dense', np.eye(5))
    >>> loader.save('/tmp/test_data/train/features_id.dense', np.eye(5))
    >>> loader.save('/tmp/test_data/train/features_ones.dense', np.ones((5, 2)))
    >>> loader.save('/tmp/test_data/train/labels_id.dense', np.eye(5))
    >>> loader.read_data_sets('/tmp/test_data').show() #doctest: +SKIP
    reading train...
    reading dev...
    reading test...
    dev:
    features:
    ones: (5, 2) <type 'numpy.ndarray'>
    id: (5, 5) <type 'numpy.ndarray'>
    labels:
    id: (5, 5) <type 'numpy.ndarray'>
    test:
    features:
    ones: (5, 2) <type 'numpy.ndarray'>
    id: (5, 5) <type 'numpy.ndarray'>
    labels:
    id: (5, 5) <type 'numpy.ndarray'>
    train:
    features:
    ones: (5, 2) <type 'numpy.ndarray'>
    id: (5, 5) <type 'numpy.ndarray'>
    labels:
    id: (5, 5) <type 'numpy.ndarray'>
    >>> loader.read_data_sets('/tmp/test_data',
    ...                       folders=['train', 'dev'],
    ...                       hashlist=['ones']).show() #doctest: +SKIP
    reading train...
    reading dev...
    dev:
    features:
    ones: (5, 2) <type 'numpy.ndarray'>
    labels:
    train:
    features:
    ones: (5, 2) <type 'numpy.ndarray'>
    labels:
    """
    if not directory.endswith(slash):
        directory += slash
    dir_files = os.listdir(directory)
    datasets_map = {}
    for folder in folders:
        dataset_map = {'features': {}, 'labels': {}}
        print('reading ' + folder + '...')
        if folder not in dir_files:
            raise BadDirectoryStructureError('Need ' + folder + ' folder in ' + directory + ' directory.')
        for filename in os.listdir(directory + folder):
            for prefix in ('features', 'labels'):
                prefix_ = prefix + '_'
                if not filename.startswith(prefix_):
                    continue
                # Strip the leading prefix only (and everything from the first
                # dot), so a descriptor that itself contains "features_" or
                # "labels_" is kept intact.
                descriptor = filename.split('.')[0][len(prefix_):]
                if (not hashlist) or (descriptor in hashlist):
                    dataset_map[prefix][descriptor] = import_data(directory + folder + slash + filename)
        datasets_map[folder] = dataset_map
    return DataSets(datasets_map, mix=mix)
# ===================================================================================
# ====================== DATA MANIPULATION===========================================
# ===================================================================================
def toOnehot(X, dim=None):
    """
    Expands a vector of hot indices into a one hot matrix.

    :param X: (numpy array) Vector of indices or :any:`IndexVector` object
    :param dim: (int) Dimension of indexing (number of columns). Ignored for
        :any:`IndexVector` input, which carries its own dimension; otherwise
        defaults to the largest index + 1.
    :return: A sparse csr_matrix of one hots.

    :examples:

    >>> import numpy as np
    >>> from antk.core import loader
    >>> x = np.array([0, 1, 2, 3])
    >>> loader.toOnehot(x) #doctest: +ELLIPSIS
    <4x4 sparse matrix of type '<type 'numpy.float64'>'...
    >>> loader.toOnehot(x).toarray() #doctest: +NORMALIZE_WHITESPACE
    array([[ 1., 0., 0., 0.],
    [ 0., 1., 0., 0.],
    [ 0., 0., 1., 0.],
    [ 0., 0., 0., 1.]])
    >>> x = loader.IndexVector(x, dimension=8)
    >>> loader.toOnehot(x).toarray() #doctest: +NORMALIZE_WHITESPACE
    array([[ 1., 0., 0., 0., 0., 0., 0., 0.],
    [ 0., 1., 0., 0., 0., 0., 0., 0.],
    [ 0., 0., 1., 0., 0., 0., 0., 0.],
    [ 0., 0., 0., 1., 0., 0., 0., 0.]])
    """
    if isinstance(X, IndexVector):
        dim, X = X.dim, X.vec
    elif dim is None:
        dim = np.amax(X) + 1
    num_rows = X.shape[0]
    # One entry of 1.0 per row, placed in the column named by the index.
    ones = np.ones(num_rows)
    row_ids = np.arange(num_rows)
    col_ids = X.astype(int)
    return sps.csr_matrix((ones, (row_ids, col_ids)), shape=(num_rows, dim))
def is_one_hot(A):
    """
    Tests whether every row of a matrix holds exactly one entry, equal to 1.

    :param A: A 2-d numpy array or scipy sparse matrix
    :return: (bool) True if matrix is a matrix of one hot row vectors, False otherwise
        (including when **A** is not 2-dimensional or has no rows).

    :examples:

    >>> import numpy as np
    >>> from antk.core import loader
    >>> x = np.eye(3)
    >>> loader.is_one_hot(x)
    True
    >>> x *= 5
    >>> loader.is_one_hot(x)
    False
    >>> x = np.array([[1, 0, 0], [1, 0, 0], [1, 0, 0]])
    >>> loader.is_one_hot(x)
    True
    >>> x[0,1] = 2
    >>> loader.is_one_hot(x)
    False
    """
    # Reject anything that is not a matrix before converting it.
    if np.ndim(A) != 2:
        return False
    B = sps.csr_matrix(A)
    num_rows = B.shape[0]
    hot_rows, _, values = sps.find(B)
    # One hot means: as many nonzero entries as rows, all of them equal to 1,
    # and every row owning one of them. bool() gives a plain Python bool.
    return bool(num_rows > 0 and
                values.shape[0] == num_rows and
                np.all(values == 1) and
                np.unique(hot_rows).shape[0] == num_rows)
def toIndex(A):
    """
    Extracts the hot column index of every row of a one hot matrix.

    :param A: (numpy array or scipy.sparse.csr_matrix) A matrix of one hot row vectors.
    :return: The hot indices, one per row, in row order.
    :raises ValueError: If **A** is not a one hot matrix.

    :examples:

    >>> import numpy as np
    >>> from antk.core import loader
    >>> x = np.array([[1,0,0], [0,0,1], [1,0,0]])
    >>> loader.toIndex(x)
    array([0, 2, 0])
    """
    if not is_one_hot(A):
        raise ValueError('Argument to function must be a one hot matrix.')
    if sps.issparse(A):
        rows, cols, _ = sps.find(A)
        # Order the column indices by row explicitly rather than relying on
        # the order in which find() happens to list the entries.
        return cols[np.argsort(rows, kind='mergesort')]
    # np.nonzero walks a dense array row by row, so this is already row order.
    return np.nonzero(A)[1]
def center(X, axis=None):
    """
    Subtracts the mean from the entries of a matrix.

    :param X: (numpy array or scipy.sparse.csr_matrix) A matrix to center about the
        mean (over columns axis=0, over rows axis=1, over all entries axis=None)
    :param axis: The axis along which the mean is computed.
    :return: A matrix with entries centered along the specified axis. Sparse
        input is converted to a dense numpy matrix first.
    """
    if sps.issparse(X):
        X = X.todense()
    if isinstance(X, np.matrix):
        # numpy matrices keep reduced axes, so the mean broadcasts as is.
        return X - np.mean(X, axis=axis)
    # keepdims makes the mean broadcast against X for every axis choice,
    # and keeps the dtype handling the same for axis=0 and axis=1.
    return X - np.mean(X, axis=axis, keepdims=True)
def unit_variance(X, axis=None):
    """
    Divides the entries of a matrix by their standard deviation.

    :param X: (numpy array or scipy.sparse.csr_matrix) A matrix to transform to have
        unit variance (over columns axis=0, over rows axis=1, over all entries axis=None)
    :param axis: The axis to perform the transform.
    :return: A matrix with unit variance along the specified axis. Sparse input
        is converted to a dense numpy matrix first. No guard is applied for a
        standard deviation of zero.
    """
    if sps.issparse(X):
        X = X.todense()
    if isinstance(X, np.matrix):
        # numpy matrices keep reduced axes, so the deviation broadcasts as is.
        return X / np.std(X, axis=axis)
    # keepdims lines the deviations up with the axis they were computed over;
    # without it axis=1 would divide columns by the per-row deviations.
    return X / np.std(X, axis=axis, keepdims=True)
# ===================================================
# Normalizations for tfidf or whatever
# ====================================================
def l2normalize(X, axis=1):
    r"""
    axis=1 normalizes each row of X by norm of said row. :math:`l2normalize(X)_{ij} = \frac{X_{ij}}{\sqrt{\sum_k X_{ik}^2}}`

    axis=0 normalizes each column of X by norm of said column. :math:`l2normalize(X)_{ij} = \frac{X_{ij}}{\sqrt{\sum_k X_{kj}^2}}`

    Rows (or columns) whose norm is zero are left as zeros.

    :param X: A scipy sparse csr_matrix or numpy array.
    :param axis: The dimension to normalize over.
    :return: A normalized matrix: a csr_matrix for sparse input, a dense array otherwise.
    :raise: ValueError if **axis** is None.
    """
    if axis is None:
        raise ValueError('axis must be 0 or 1. Normalization is over rows or columns.')
    # Remember the input kind before densifying so sparse input comes back sparse.
    was_sparse = sps.issparse(X)
    dense = X.toarray() if was_sparse else X
    norms = np.linalg.norm(dense, ord=2, axis=axis, keepdims=True)
    norms[norms == 0] = 1.0  # avoid 0/0: all-zero vectors stay all-zero
    normalized_matrix = dense / norms
    if was_sparse:
        normalized_matrix = sps.csr_matrix(normalized_matrix)
    return normalized_matrix
def l1normalize(X, axis=1):
    r"""
    axis=1 normalizes each row of X by norm of said row. :math:`l1normalize(X)_{ij} = \frac{X_{ij}}{\sum_k |X_{ik}|}`

    axis=0 normalizes each column of X by norm of said column. :math:`l1normalize(X)_{ij} = \frac{X_{ij}}{\sum_k |X_{kj}|}`

    Rows (or columns) whose norm is zero are left as zeros.

    :param X: A scipy sparse csr_matrix or numpy array.
    :param axis: The dimension to normalize over.
    :return: A normalized matrix: a csr_matrix for sparse input, a dense array otherwise.
    :raise: ValueError if **axis** is None.
    """
    if axis is None:
        raise ValueError('axis must be 0 or 1. Normalization is over rows or columns.')
    # Remember the input kind before densifying so sparse input comes back sparse.
    was_sparse = sps.issparse(X)
    dense = X.toarray() if was_sparse else X
    norms = np.linalg.norm(dense, ord=1, axis=axis, keepdims=True)
    norms[norms == 0] = 1.0  # avoid 0/0: all-zero vectors stay all-zero
    normalized_matrix = dense / norms
    if was_sparse:
        normalized_matrix = sps.csr_matrix(normalized_matrix)
    return normalized_matrix
def maxnormalize(X, axis=1):
    r"""
    axis=1 normalizes each row of X by norm of said row. :math:`maxnormalize(X)_{ij} = \frac{X_{ij}}{max_k |X_{ik}|}`

    axis=0 normalizes each column of X by norm of said column. :math:`maxnormalize(X)_{ij} = \frac{X_{ij}}{max_k |X_{kj}|}`

    The norm used is the largest absolute value, which equals the plain maximum
    for non-negative data. Rows (or columns) whose norm is zero are left as zeros.

    :param X: A scipy sparse csr_matrix or numpy array.
    :param axis: The dimension to normalize over.
    :return: A normalized matrix: a csr_matrix for sparse input, a dense array otherwise.
    :raise: ValueError if **axis** is None.
    """
    if axis is None:
        raise ValueError('Axis must be 0 or 1. Normalization is over rows or columns.')
    # Remember the input kind before densifying so sparse input comes back sparse.
    was_sparse = sps.issparse(X)
    dense = X.toarray() if was_sparse else X
    norms = np.linalg.norm(dense, ord=np.inf, axis=axis, keepdims=True)
    norms[norms == 0] = 1.0  # avoid 0/0: all-zero vectors stay all-zero
    normalized_matrix = dense / norms
    if was_sparse:
        normalized_matrix = sps.csr_matrix(normalized_matrix)
    return normalized_matrix
def tfidf(X, norm='l2'):
    """
    Computes tf-idf scores for a document-term matrix.

    :param X: (numpy array or scipy.sparse.csr_matrix) A document-term matrix with term counts.
    :param norm: Normalization strategy: `l2`: normalizes the scores of rows by
        length of rows after basic tfidf (each document vector is a unit vector),
        `count`: normalizes the scores of rows by the the total word count of a document.
        `max` normalizes the scores of rows by the maximum count for a single word in a document.
        Any other value applies no normalization.
    :return: Returns tfidf of document-term matrix X with optional normalization,
        as a scipy sparse csr_matrix.
    """
    X = sps.csr_matrix(X)
    num_terms = X.shape[1]
    # float() forces true division for integer counts on Python 2 as well.
    num_docs = float(X.shape[0])
    doc_freq = np.asarray(X.sign().sum(0), dtype=float).ravel()
    # Terms that occur in no document get an idf of 0 instead of log(N/0);
    # their columns are empty, so the scores are unaffected.
    idf = np.zeros(num_terms)
    seen = doc_freq > 0
    idf[seen] = np.log(num_docs / doc_freq[seen])
    # make a diagonal matrix of idf values to matrix multiply with tf.
    diagonal = np.arange(num_terms)
    IDF = sps.csr_matrix((idf, (diagonal, diagonal)), shape=(num_terms, num_terms))
    if norm == 'count':
        # Only normalizing tf
        return sps.csr_matrix(sps.csr_matrix(l1normalize(X)).dot(IDF))
    if norm == 'max':
        # Only normalizing tf
        return sps.csr_matrix(sps.csr_matrix(maxnormalize(X)).dot(IDF))
    weighted = X.dot(IDF)
    if norm == 'l2':
        # normalizing tfidf
        return sps.csr_matrix(l2normalize(weighted))
    # no normalization
    return sps.csr_matrix(weighted)
def untar(fname):
    """
    Untar and ungzip a file in the current directory.

    :param fname: (str) Name of the .tar.gz file. Names with any other ending
        are reported and left alone.
    """
    if fname.endswith("tar.gz"):
        # The context manager closes the archive even if extraction fails.
        # NOTE(review): extractall() writes wherever the archive's member paths
        # point; only use this on archives from a trusted source.
        with tarfile.open(fname) as tar:
            tar.extractall()
        print("Extracted " + fname + " in Current Directory")
    else:
        print("Not a tar.gz file: '%s '" % fname)
def maybe_download(filename, directory, source_url):
    """
    Download the data from source url, unless it's already here.
    From https://github.com/tensorflow/tensorflow/blob/master/tensorflow/contrib/learn/python/learn/datasets/base.py

    :param filename: string, name of the file in the directory.
    :param directory: string, path to working directory. Created if it does not exist
        and a download is needed.
    :param source_url: url to download from if file doesn't exist.
    :return: Path to resulting file.
    """
    filepath = os.path.join(directory, filename)
    if not os.path.isfile(filepath):
        if directory and not os.path.isdir(directory):
            os.makedirs(directory)
        # Pick whichever download helper this interpreter provides.
        try:
            from urllib.request import urlretrieve  # Python 3
        except ImportError:
            urlretrieve = urllib.URLopener().retrieve  # Python 2
        urlretrieve(source_url, filepath)
    return filepath