Source code for simple_lm

"""
Simple language model code for network modeling.

File name convention:
---------------------

- lr: learnrate for gradient descent
- nl: number of stacked lstm layers
- hs: size of hidden layers (presumes all layers have the same number of hidden units)
- mb: size of mini-batch
- bc: max bad count for early stopping
- em: size of token embeddings
- rs: random seed for reproducible results
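
These abbreviations are filled in from the command-line arguments when the result
file name is built in ``__main__`` below. As a rough illustration, a hypothetical
helper (not part of safekit) could recover them from a result file name like so ::

    # Example name with the default arguments (timestamp elided):
    #   simple_fwd_None_lstm_<ctime>_lr_0.001_nl_1_hs_10_mb_128_bc_20_em_20_rs_5
    def parse_result_name(name):
        parts = name.split('_')
        return {key: parts[parts.index(key) + 1]
                for key in ('lr', 'nl', 'hs', 'mb', 'bc', 'em', 'rs')}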

stdout
------

For each mini-batch the following is printed to standard output ::

    batchsize line_number second status filename index current_loss

Where:

- batchsize: The size of the mini-batch
- line_number: Line number from original auth.txt file (may be off by 1)
- second: The second of the first event in the mini-batch
- status: Whether the model is updating or merely forward propagating
- filename: The current file being processed
- index: The number of samples processed to date
- current_loss: The average loss over the mini-batch
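
A minimal sketch of parsing one such stdout line (``parse_stdout_line`` is a
hypothetical helper, not part of safekit; numeric metadata is printed from numpy
values, so it is parsed as float here) ::

    def parse_stdout_line(line):
        batchsize, line_number, second, status, filename, index, current_loss = line.split()
        return {'batchsize': int(batchsize), 'line_number': float(line_number),
                'second': float(second), 'status': status, 'filename': filename,
                'index': int(index), 'current_loss': float(current_loss)}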

File output:
------------
::

    batch_num line second day user red loss

Where:

- batch_num: The mini-batch this event was a part of
- line: Line number from original auth.txt file (may be off by 1)
- second: The second on which the event occurred
- day: The day on which the event occurred
- user: The user who performed the event
- red: Whether this event was labeled red team activity (1 for red team activity, 0 otherwise)
- loss: The anomaly score for this event
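
As a hedged example of consuming this output, the sketch below loads a result
file (the path is hypothetical) and compares the mean anomaly score of red team
events against everything else ::

    import numpy as np

    result_path = 'results/simple_fwd_example'    # hypothetical result file
    scores = np.loadtxt(result_path, skiprows=1)  # skip the header line
    # column order: batch line second day user red loss
    red = scores[:, 5] == 1
    print('mean loss, red team events:', scores[red, 6].mean())
    print('mean loss, other events:', scores[~red, 6].mean())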


Example calls
-------------

**Simple character based language model.** ::

    python safekit/models/simple_lm.py results/ safekit/features/specs/lm/lanl_char_config.json data_examples/lanl/lm_feats/raw_day_split/ -test -skipsos -jagged

.. note:: The output results will be printed to /tmp/lanl_result/ and then moved to results/ upon completion,
      to avoid slowing the experiment with constant network traffic when using a distributed file system.
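
**Bidirectional variant.** The same call can be run with the bidirectional lower
tier by adding the ``-bidir`` flag defined in the argument parser below (shown
here as a sketch of a second configuration) ::

    python safekit/models/simple_lm.py results/ safekit/features/specs/lm/lanl_char_config.json data_examples/lanl/lm_feats/raw_day_split/ -test -skipsos -jagged -bidir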

Input Data
----------

The format of the input makes the following assumptions:

- Input files are together in datafolder, one file for each day.
- Input files are plain text files with one line of integers per log line, representing metadata and the tokens from the log text.
- Input format for fixed length sequences ::

    line_nums second day user red logtokenid1 .... logtokenid_SentenceLen

- Zero-padded input format for jagged sequences ::

    line_nums second day user red SentenceLen logtokenid1 .... logtokenid_SentenceLen 0 0 .... 0
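
A minimal sketch of splitting one input line into metadata and token fields,
assuming the space-delimited layouts above (``split_input_line`` is a
hypothetical helper, not part of safekit) ::

    def split_input_line(line, jagged=False):
        fields = [int(v) for v in line.split()]
        meta = dict(zip(('line_num', 'second', 'day', 'user', 'red'), fields[:5]))
        if jagged:
            length = fields[5]
            tokens = fields[6:6 + length]  # trailing zeros are padding
        else:
            tokens = fields[5:]
        return meta, tokens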
"""

import os
import sys
# So we can run this code in an arbitrary environment that has tensorflow but not safekit installed
cyberpath = '/'.join(os.path.realpath(__file__).split('/')[:-3])
sys.path.insert(0, cyberpath)
import tensorflow as tf
import json
from safekit.batch import OnlineBatcher
from safekit.graph_training_utils import EarlyStop, ModelRunner
from safekit.tf_ops import lm_rnn, bidir_lm_rnn
from safekit.util import get_mask, Parser
import time
import numpy as np


def return_parser():
    """
    Defines and returns argparse ArgumentParser object.

    :return: ArgumentParser
    """
    parser = Parser("Simple token based rnn for network modeling.")
    parser.add_argument('results_folder', type=str,
                        help='The folder to print results to.')
    parser.add_argument('config', type=str,
                        help='The data spec.')
    parser.add_argument("datafolder", type=str,
                        help='The folder where the data is stored.')
    parser.add_argument('-learnrate', type=float, default=0.001,
                        help='Step size for gradient descent.')
    parser.add_argument("-lm_layers", nargs='+', type=int, default=[10],
                        help="A list of hidden layer sizes.")
    parser.add_argument("-context_layers", nargs='+', type=int, default=[10],
                        help="decoy arg.")
    parser.add_argument("-numsteps", type=int, default=10,
                        help="decoy arg.")
    parser.add_argument('-mb', type=int, default=128,
                        help='The mini batch size for stochastic gradient descent.')
    parser.add_argument('-debug', action='store_true',
                        help='Use this flag to print feed dictionary contents and dimensions.')
    parser.add_argument('-maxbadcount', type=int, default=20,
                        help='Threshold for early stopping.')
    parser.add_argument('-em', type=int, default=20,
                        help='Size of embeddings for categorical features.')
    parser.add_argument('-encoding', type=str, default=None,
                        help='Can be "oct", "raw" or "word".')
    parser.add_argument('-random_seed', type=int, default=5,
                        help='Random seed for reproducible experiments.')
    parser.add_argument('-jagged', action='store_true',
                        help='Whether using sequences of variable length (input should '
                             'be zero-padded to max_sequence_length).')
    parser.add_argument('-skipsos', action='store_true',
                        help='Whether to skip a start of sentence token.')
    parser.add_argument('-bidir', action='store_true',
                        help='Whether to use bidirectional lstm for lower tier.')
    parser.add_argument('-test', action='store_true',
                        help='Whether to run on a subset of the data (5000 lines from days 1, 2, 3) or the entire set.')
    parser.add_argument('-verbose', type=int, default=1,
                        help='Whether to print loss during training.')
    parser.add_argument('-delimiter', type=str, default=',',
                        help="Delimiter for input text file. You should be using ' ' for the dayshuffled cert.")
    parser.add_argument('-cell_type', type=str, default='lstm',
                        help='Can be either "lstm", "ident_ran", or "ran".')
    return parser


def write_results(datadict, pointloss, outfile, batch):
    """
    Writes loss for each datapoint, along with meta-data to file.

    :param datadict: Dictionary of data names (str) keys to numpy matrix values for this mini-batch.
    :param pointloss: MB X 1 numpy array
    :param outfile: Where to write results.
    :param batch: The mini-batch number for these events.
    :return:
    """
    for line, sec, day, usr, red, loss in zip(datadict['line'].flatten().tolist(),
                                              datadict['second'].flatten().tolist(),
                                              datadict['day'].flatten().tolist(),
                                              datadict['user'].flatten().tolist(),
                                              datadict['red'].flatten().tolist(),
                                              pointloss.flatten().tolist()):
        outfile.write('%s %s %s %s %s %s %r\n' % (batch, line, sec, day, usr, red, loss))


CELL = {'lstm': tf.nn.rnn_cell.BasicLSTMCell}

if __name__ == '__main__':

    args = return_parser().parse_args()

    direction = ('fwd', 'bidir')[args.bidir]
    outfile_name = "simple_%s_%s_%s_%s_lr_%s_nl_%s_hs_%s_mb_%s_bc_%s_em_%s_rs_%s" % (
        direction, args.encoding, args.cell_type,
        time.ctime(time.time()).replace(' ', '-'),
        args.learnrate, len(args.lm_layers), args.lm_layers[0],
        args.mb, args.maxbadcount, args.em, args.random_seed)

    conf = json.load(open(args.config, 'r'))

    if not args.results_folder.endswith('/'):
        args.results_folder += '/'
    if not args.datafolder.endswith('/'):
        args.datafolder += '/'

    tf.set_random_seed(args.random_seed)
    np.random.seed(args.random_seed)

    # Results are written locally first and moved to results_folder at the end of the run.
    if "lanl_result" not in os.listdir("/tmp"):
        os.system("mkdir /tmp/lanl_result; chmod o+rwx /tmp/lanl_result")

    if not args.bidir:
        language_model = lm_rnn
    else:
        language_model = bidir_lm_rnn

    outfile = open('/tmp/lanl_result/' + outfile_name, 'w')

    sentence_length = conf["sentence_length"] - 1 - int(args.skipsos) + int(args.bidir)
    token_set_size = conf["token_set_size"]

    # x holds input token ids, t holds the target token ids the model must predict.
    x = tf.placeholder(tf.int32, [None, sentence_length])
    t = tf.placeholder(tf.int32, [None, sentence_length - 2 * args.bidir])
    ph_dict = {'x': x, 't': t}
    if args.jagged:
        seq_len = tf.placeholder(tf.int32, [None])
        ph_dict['lengths'] = seq_len
    else:
        seq_len = None
    token_embed = tf.Variable(tf.truncated_normal([token_set_size, args.em]))  # Initial embeddings vocab X embedding size
    token_losses, hidden_states, final_hidden = language_model(x, t, token_embed, args.lm_layers,
                                                               seq_len=seq_len, cell=CELL[args.cell_type])
    if args.jagged:
        # Mask out the loss contribution of zero-padded positions.
        ph_dict['mask'] = tf.placeholder(tf.float32, [None, sentence_length - 2 * args.bidir])
        token_losses *= ph_dict['mask']
        line_losses = tf.reduce_sum(token_losses, axis=1)  # batch_size X 1
    else:
        line_losses = tf.reduce_mean(token_losses, axis=1)  # batch_size X 1
    avgloss = tf.reduce_mean(line_losses)  # scalar

    model = ModelRunner(avgloss, ph_dict, learnrate=args.learnrate, debug=args.debug,
                        decay=True, decay_rate=0.99, decay_steps=20)

    # training loop
    start_time = time.time()
    jag = int(args.jagged)
    skipsos = int(args.skipsos)

    def trainday(is_training, f):
        batch_num = 0
        data = OnlineBatcher(args.datafolder + f, args.mb, delimiter=' ')
        raw_batch = data.next_batch()
        current_loss = sys.float_info.max
        not_early_stop = EarlyStop(args.maxbadcount)
        endx = raw_batch.shape[1] - int(not args.bidir)
        endt = raw_batch.shape[1] - int(args.bidir)
        continue_training = not_early_stop(raw_batch, current_loss)
        while continue_training:  # mat is not None and self.badcount < self.badlimit and loss != inf, nan
            datadict = {'line': raw_batch[:, 0],
                        'second': raw_batch[:, 1],
                        'day': raw_batch[:, 2],
                        'user': raw_batch[:, 3],
                        'red': raw_batch[:, 4],
                        'x': raw_batch[:, (5 + jag + skipsos):endx],
                        't': raw_batch[:, (6 + jag + skipsos):endt]}
            if args.jagged:
                datadict['lengths'] = raw_batch[:, 5]
                datadict['mask'] = get_mask(datadict['lengths'] - 2 * args.bidir - args.skipsos,
                                            sentence_length - 2 * args.bidir)
                assert np.all(datadict['lengths'] <= x.get_shape().as_list()[1]), \
                    'Sequence found greater than num_tokens_predicted'
                assert np.nonzero(datadict['lengths'])[0].shape[0] == datadict['lengths'].shape[0], \
                    'Sequence lengths must be greater than zero. ' \
                    'Found zero length sequence in datadict["lengths"]: %s' % datadict['lengths']
            eval_tensors = [avgloss, line_losses]
            _, current_loss, pointloss = model.train_step(datadict, eval_tensors, update=is_training)
            if not is_training:
                write_results(datadict, pointloss, outfile, batch_num)
            batch_num += 1
            if args.verbose:
                print('%s %s %s %s %s %s %r' % (raw_batch.shape[0],
                                                datadict['line'][0],
                                                datadict['second'][0],
                                                ('fixed', 'update')[is_training],
                                                f,
                                                data.index,
                                                current_loss))
            raw_batch = data.next_batch()
            continue_training = not_early_stop(raw_batch, current_loss)
            if continue_training < 0:
                exit(0)

    weekend_days = conf["weekend_days"]
    if args.test:
        files = conf["test_files"]  # 5000 lines from each of day 0, day 1 and day 2
    else:
        files = [str(i) + '.txt' for i in range(conf["num_days"]) if i not in weekend_days]

    outfile.write("batch line second day user red loss\n")
    # Train (update weights) on day i, then score day i + 1 with weights fixed.
    for idx, f in enumerate(files[:-1]):
        trainday(True, f)
        trainday(False, files[idx + 1])

    outfile.close()
    total_time = time.time() - start_time
    print('elapsed time: %s' % total_time)
    os.system("mv /tmp/lanl_result/%s %s" % (outfile_name, args.results_folder + outfile_name))