Source code for util

"""
Python and numpy functions.
"""
from tf_ops import softmax_dist_loss, diag_mvn_loss, full_mvn_loss
import numpy as np
import argparse


[docs]def make_feature_spec(dataspec): """ Makes lists of all the continuous and categorical features to be used as input features of a neural network. :param dataspec: (dict) From a json specification of the purpose of fields in the csv input file (See docs for formatting) :return: (dict) features {'categorical': [categorical_feature_1, ..., categorical_feature_j], 'continuous': [continuous_feature_1, ..., continuous_feature_k]} """ spec = {k: v for k, v in dataspec.iteritems() if k != 'num_features'} feature_spec = {'categorical': [], 'continuous': []} for key, field in spec.iteritems(): if field['num_classes'] == 0 and field['feature']: feature_spec['continuous'].append(key) if field['num_classes'] > 0 and field['feature']: feature_spec['categorical'].append(key) return feature_spec
[docs]def make_loss_spec(dataspec, mvn): """ Makes a list of tuples for each target to be used in training a multiple output neural network modeling a mixed joint distribution of discrete and continuous variables. :param dataspec: (dict) From a json specification of the purpose of fields in the csv input file (See docs for formatting) :param mvn: Tensorflow function for calculating type of multivariate loss for continuous target vectors. Can be tf_ops.diag_mvn_loss, tf_ops.full_mvn_loss, tf_ops.eyed_mvn_loss :return: A list of tuples of the form: (target_name, loss_function, dimension) where dimension is the dimension of the target vector (for categorical features this is the number of classes, for continuous targets this is the size of the continuous target vector) """ spec = {k:v for k,v in dataspec.iteritems() if k != 'num_features'} loss_spec = [] for key, field in spec.iteritems(): if field['num_classes'] == 0 and field['target']: loss_spec.append((key, mvn, len(field['index']))) if field['num_classes'] > 0 and field['target']: loss_spec.append((key, softmax_dist_loss, field['num_classes'])) return loss_spec
[docs]def get_multivariate_loss_names(loss_spec): """ For use in conjunction with `tf_ops.multivariate_loss`. Gives the names of all contributors (columns) of the loss matrix. :param loss_spec: A list of 3-tuples of the form (input_name, loss_function, dimension) where input_name is the same as a target in datadict, loss_function takes two parameters, a target and prediction, and dimension is the dimension of the target. :return: loss_names is a list concatenated_feature_size long with names of all loss contributors. """ loss_names, log_det_names = [], [] for i, (input_name, loss_func, dimension) in enumerate(loss_spec): if loss_func == softmax_dist_loss: # discrete loss_names.append("loss_%s" % input_name) else: # continuous if loss_func == diag_mvn_loss or loss_func == full_mvn_loss: log_det_names.append("loss_%s.logdet" % input_name) for k in range(dimension): loss_names.append("loss_%s.%d" % (input_name, k)) loss_names.extend(log_det_names) return loss_names
[docs]def get_mask(lens, num_tokens): """ For masking output of lm_rnn for jagged sequences for correct gradient update. Sequence length of 0 will output nan for that row of mask so don't do this. :param lens: Numpy vector of sequence lengths :param num_tokens: (int) Number of predicted tokens in sentence. :return: A numpy array mask MB X num_tokens For each row there are: lens[i] values of 1/lens[i] followed by num_tokens - lens[i] zeros """ mask_template = np.repeat(np.arange(num_tokens).reshape(1, -1), lens.shape[0], axis=0) return (mask_template < lens.reshape([-1, 1])).astype(float) / lens.reshape([-1, 1]).astype(float)
[docs]class RunningMean: """ Calculates the batchwise running mean from rows, columns, or values of a matrix. """ def __init__(self, axis=0): """ :param axis: The axis to calculate the running mean over. If axis==None then the running mean for the entire array is taken. """ self.n = 0.0 # total number of samples self.avg = 0.0 self.axis = axis
[docs] def __call__(self, samples): """ :param samples: a matrix of samples to incorporate into running mean :return: running average over axis """ if self.axis is not None: m = float(samples.shape[self.axis]) # num_new_samples else: m = np.prod(np.array(samples.shape)) self.n += m self.avg = ((self.n - m) / self.n) * self.avg + np.sum(samples, axis=self.axis) / self.n # second term = (new_avg*m)/n return self.avg
[docs]class ExponentialRunningMean: """ Calculates the running mean of row vectors batchwise given a sequence of matrices. """ def __init__(self, alpha=1.0): """ :param alpha: (float) Higher alpha discounts older observations faster. The smaller the alpha, the further you take into consideration the past. """ self.mean = None self.alpha = alpha
[docs] def __call__(self, samples): """ :param samples: a matrix of samples to incorporate into running mean :return: running average over axis """ if self.mean is None: self.mean = np.mean(samples, axis=0).reshape([1, -1]) else: old_mean = self.mean[-1, :] self.mean = np.empty((0, samples.shape[1])) for i in range(samples.shape[0]): new_mean = (1 - self.alpha)*old_mean + self.alpha*samples[i, :] self.mean = np.vstack([self.mean, new_mean]) old_mean = new_mean return self.mean
[docs]class Parser(argparse.ArgumentParser): """ Hack for Sphinx documentation of scripts to work correctly. """ def _get_option_tuples(self, option_string): result = [] # option strings starting with two prefix characters are only # split at the '=' chars = self.prefix_chars if option_string[0] in chars and option_string[1] in chars: if '=' in option_string: option_prefix, explicit_arg = option_string.split('=', 1) else: option_prefix = option_string explicit_arg = None for option_string in self._option_string_actions: if option_string == option_prefix: action = self._option_string_actions[option_string] tup = action, option_string, explicit_arg result.append(tup) # single character options can be concatenated with their arguments # but multiple character options always have to have their argument # separate elif option_string[0] in chars and option_string[1] not in chars: option_prefix = option_string explicit_arg = None short_option_prefix = option_string[:2] short_explicit_arg = option_string[2:] for option_string in self._option_string_actions: if option_string == short_option_prefix: action = self._option_string_actions[option_string] tup = action, option_string, short_explicit_arg result.append(tup) elif option_string == option_prefix: action = self._option_string_actions[option_string] tup = action, option_string, explicit_arg result.append(tup) # shouldn't ever get here else: self.error(_('unexpected option string: %s') % option_string) # return the collected option tuples return result