#!/usr/bin/python
# -*- coding: utf-8 -*-
'''Defines the MalConv architecture.

Adapted from https://arxiv.org/pdf/1710.09435.pdf

Things different about our implementation and that of the original paper:
 * The paper uses batch_size = 256 and
   SGD(lr=0.01, momentum=0.9, decay=UNDISCLOSED, nesterov=True )
 * The paper didn't have a special EOF symbol
 * The paper allowed for up to 2MB malware sizes, we use 1.0MB because of
   memory on a Titan X
'''


def main():
    '''Train (or continue training) MalConv, then score the EMBER test set.

    Steps, in order:
      1. Parse ``--gpus`` from the command line.
      2. Restore ``malconv.h5`` if it exists, otherwise build a fresh model.
      3. Compile with SGD and binary cross-entropy.
      4. Read ``ember_training.csv.gz``, drop rows whose ``y`` is -1 and
         hold out 200 samples for validation.
      5. Fit for 10 epochs, dividing the learning rate by 10 every 2 epochs.
      6. Save the base model to ``malconv.h5`` and run prediction over the
         hashes listed in ``ember_test.csv.gz``.

    Raises:
        NotImplementedError: as shipped, because no URL is configured for
            fetching file contents by sha256 (see ``getfile_service``).
        RuntimeError: if a full pass over a hash list retrieves no file.
    '''
    # Imports are kept inside main() so that the module can be imported (and
    # the banner below printed) without these packages being installed.
    import argparse
    import math
    import os
    import random

    import numpy as np
    import pandas as pd
    import requests
    from keras import backend as K
    from keras import metrics
    from keras.callbacks import LearningRateScheduler
    from keras.layers import (Conv1D, Dense, Embedding, GlobalMaxPooling1D,
                              Input, Multiply)
    from keras.models import Model, load_model
    from keras.optimizers import SGD
    from sklearn.model_selection import train_test_split

    import multi_gpu

    batch_size = 100
    input_dim = 257  # every byte plus a special padding symbol
    padding_char = 256

    parser = argparse.ArgumentParser()
    # type=int makes argparse reject non-numeric values with a usage error.
    parser.add_argument('--gpus', help='number of GPUs', type=int, default=1)
    args = parser.parse_args()
    ngpus = args.gpus

    if os.path.exists('malconv.h5'):
        print("restoring malconv.h5 from disk for continuation training...")
        basemodel = load_model('malconv.h5')
        # For a model built by the else-branch below, layers[1] is the
        # Embedding layer, whose output shape is (batch, maxlen, embedding).
        _, maxlen, embedding_size = basemodel.layers[1].output_shape
    else:
        maxlen = 2**20  # 1MB
        embedding_size = 8

        # define model structure
        inp = Input(shape=(maxlen,))
        emb = Embedding(input_dim, embedding_size)(inp)
        filt = Conv1D(filters=128, kernel_size=500, strides=500,
                      use_bias=True, activation='relu',
                      padding='valid')(emb)
        attn = Conv1D(filters=128, kernel_size=500, strides=500,
                      use_bias=True, activation='sigmoid',
                      padding='valid')(emb)
        # Gate the relu features with the sigmoid branch, element-wise.
        gated = Multiply()([filt, attn])
        feat = GlobalMaxPooling1D()(gated)
        dense = Dense(128, activation='relu')(feat)
        outp = Dense(1, activation='sigmoid')(dense)

        basemodel = Model(inp, outp)

    basemodel.summary()

    print("Using %i GPUs" %ngpus)

    if ngpus > 1:
        model = multi_gpu.make_parallel(basemodel, ngpus)
    else:
        model = basemodel

    model.compile(
        loss='binary_crossentropy',
        optimizer=SGD(lr=0.01, momentum=0.9, nesterov=True, decay=1e-3),
        metrics=[metrics.binary_accuracy],
    )

    def bytez_to_numpy(bytez, maxlen):
        '''Return a length-``maxlen`` uint16 vector holding the file bytes.

        Contents longer than ``maxlen`` are truncated; shorter contents are
        right-padded with ``padding_char``.
        '''
        b = np.full((maxlen,), padding_char, dtype=np.uint16)
        bytez = np.frombuffer(bytez[:maxlen], dtype=np.uint8)
        b[:len(bytez)] = bytez
        return b

    def getfile_service(sha256, url=None, maxlen=maxlen):
        '''Fetch file contents by sha256 from ``url`` as a padded vector.

        Returns ``None`` when the HTTP response is not OK.

        Raises:
            NotImplementedError: if ``url`` is None.
        '''
        if url is None:
            raise NotImplementedError("You must provide your own url for getting file bytez by sha256")
        r = requests.get(url, params={'sha256': sha256})
        if not r.ok:
            return None
        return bytez_to_numpy(r.content, maxlen)

    def generator(hashes, labels, batch_size, shuffle=True):
        '''Yield ``(X, y)`` batches of exactly ``batch_size`` samples forever.

        Hashes whose contents cannot be retrieved are skipped. A partially
        filled batch at the end of a pass is carried over into the next pass.

        Raises:
            RuntimeError: if a complete pass retrieves no file at all, which
                would otherwise loop forever without ever yielding.
        '''
        X = []
        y = []
        zipped = list(zip(hashes, labels))
        while True:
            if shuffle:
                random.shuffle(zipped)
            fetched = 0
            for sha256, label in zipped:
                x = getfile_service(sha256)
                if x is None:
                    continue  # not retrievable: skip this sample
                fetched += 1
                X.append(x)
                y.append(label)
                if len(X) == batch_size:
                    yield np.asarray(X, dtype=np.uint16), np.asarray(y)
                    X = []
                    y = []
            if not fetched:
                raise RuntimeError(
                    "could not retrieve file contents for any of the "
                    "%d hashes" % len(zipped))

    train_labels = pd.read_csv('ember_training.csv.gz')
    train_labels = train_labels[train_labels['y'] != -1]  # get only labeled samples
    labels = train_labels['y'].tolist()
    hashes = train_labels['sha256'].tolist()

    hashes_train, hashes_val, labels_train, labels_val = train_test_split(
        hashes, labels, test_size=200)

    train_gen = generator(hashes_train, labels_train, batch_size)
    val_gen = generator(hashes_val, labels_val, batch_size)

    base = K.get_value(model.optimizer.lr)

    def schedule(epoch):
        '''Return the learning rate for ``epoch``: base rate / 10 every 2 epochs.'''
        return base / 10.0**(epoch//2)

    model.fit_generator(
        train_gen,
        steps_per_epoch=len(hashes_train)//batch_size,
        epochs=10,
        validation_data=val_gen,
        callbacks=[LearningRateScheduler(schedule)],
        # float() forces true division so ceil() also rounds up on Python 2.
        validation_steps=int(math.ceil(len(hashes_val) / float(batch_size))),
    )

    # Save the single-device model, not the multi-GPU wrapper.
    basemodel.save('malconv.h5')

    test_labels = pd.read_csv('ember_test.csv.gz')
    labels_test = test_labels['y'].tolist()
    hashes_test = test_labels['sha256'].tolist()

    # One sample per step, in file order. The scores line up with labels_test
    # only if every test file is retrievable, because the generator skips
    # missing files and wraps around to the start of the list.
    test_generator = generator(hashes_test, labels_test, batch_size=1, shuffle=False)
    test_p = basemodel.predict_generator(test_generator, steps=len(test_labels), verbose=1)


if __name__ == '__main__':
    print('*'*80)
    print('''
This is nonfunctional demonstration code that is provided for convenience.

It shows
- The MalConv structure used in our paper
- Training procedure used in the paper
- How to load the weights for the MalConv model that we used.

It may be made functional by modifying the code to retrieve file contents by sha256 from a user-defined URL.

You may use the provided weights under the Ember AGPL-3.0 license included in the parent directory. We also ask that you cite the original MalConv paper and refer to the Ember paper as the implementation.

(1) E. Raff, J. Barker, J. Sylvester, R. Brandon, B. Catanzaro, C. Nicholas, "Malware Detection by Eating a Whole EXE", in ArXiv e-prints. Oct. 2017.

@ARTICLE{raff2017malware,
  title={Malware detection by eating a whole exe},
  author={Raff, Edward and Barker, Jon and Sylvester, Jared and Brandon, Robert and Catanzaro, Bryan and Nicholas, Charles},
  journal={arXiv preprint arXiv:1710.09435},
  year={2017}
}

(2) H. Anderson and P. Roth, "EMBER: An Open Dataset for Training Static PE Malware Machine Learning Models”, in ArXiv e-prints. Apr. 2018.

@ARTICLE{2018arXiv180404637A,
  author = {{Anderson}, H.~S. and {Roth}, P.},
  title = "{EMBER: An Open Dataset for Training Static PE Malware Machine Learning Models}",
  journal = {ArXiv e-prints},
  archivePrefix = "arXiv",
  eprint = {1804.04637},
  primaryClass = "cs.CR",
  keywords = {Computer Science - Cryptography and Security},
  year = 2018,
  month = apr,
  adsurl = {http://adsabs.harvard.edu/abs/2018arXiv180404637A},
}
''')
    print('*'*80)

    #main() # uncomment this line after fixing the URL NotImplementedError above