File size: 6,591 Bytes
b92918a |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 |
#!/usr/bin/python
'''defines the MalConv architecture.
Adapted from https://arxiv.org/pdf/1710.09435.pdf
Differences between our implementation and that of the original paper:
* The paper uses batch_size = 256 and SGD(lr=0.01, momentum=0.9, decay=UNDISCLOSED, nesterov=True )
* The paper didn't have a special EOF symbol
* The paper allowed for up to 2MB malware sizes, we use 1.0MB because of memory on a Titan X
'''
def main():
    """Build (or restore) the MalConv model and train it on the EMBER dataset.

    Trains a gated-convolution byte classifier (MalConv) from raw file bytes
    fetched by sha256 from a user-supplied URL. Saves weights to
    ``malconv.h5`` and runs prediction over the EMBER test split.

    Raises:
        NotImplementedError: from ``getfile_service`` until a download URL
            is provided (this script is a non-functional demonstration).
    """
    # Heavy third-party imports are kept function-local so importing this
    # module (e.g. for the __main__ banner) stays cheap.
    from keras.layers import Dense, Conv1D, GlobalMaxPooling1D, Input, Embedding, Multiply
    from keras.models import Model
    from keras import backend as K
    from keras import metrics
    import multi_gpu
    import os
    import math
    import random
    import argparse
    import numpy as np
    import requests

    batch_size = 100
    input_dim = 257      # every byte value (0-255) plus a special padding symbol
    padding_char = 256   # the padding symbol's embedding index

    parser = argparse.ArgumentParser()
    parser.add_argument('--gpus', help='number of GPUs', default=1)
    args = parser.parse_args()
    ngpus = int(args.gpus)

    if os.path.exists('malconv.h5'):
        # Continuation training: restore weights and recover the geometry
        # (maxlen, embedding_size) from the stored Embedding layer's output.
        print("restoring malconv.h5 from disk for continuation training...")
        from keras.models import load_model
        basemodel = load_model('malconv.h5')
        _, maxlen, embedding_size = basemodel.layers[1].output_shape
    else:
        maxlen = 2**20  # 1 MB input window (paper used 2 MB; reduced for GPU memory)
        embedding_size = 8

        # Define the MalConv structure: an embedding feeding two parallel
        # strided convolutions whose elementwise product forms a gate
        # (relu branch = features, sigmoid branch = attention).
        inp = Input(shape=(maxlen,))
        emb = Embedding(input_dim, embedding_size)(inp)
        filt = Conv1D(filters=128, kernel_size=500, strides=500, use_bias=True, activation='relu', padding='valid')(emb)
        attn = Conv1D(filters=128, kernel_size=500, strides=500, use_bias=True, activation='sigmoid', padding='valid')(emb)
        gated = Multiply()([filt, attn])
        feat = GlobalMaxPooling1D()(gated)
        dense = Dense(128, activation='relu')(feat)
        outp = Dense(1, activation='sigmoid')(dense)  # binary malware score
        basemodel = Model(inp, outp)

    basemodel.summary()

    print("Using %i GPUs" % ngpus)
    if ngpus > 1:
        model = multi_gpu.make_parallel(basemodel, ngpus)
    else:
        model = basemodel

    from keras.optimizers import SGD
    # Optimizer per the paper: SGD with Nesterov momentum (decay value was
    # undisclosed in the paper; 1e-3 is our choice).
    model.compile(
        loss='binary_crossentropy',
        optimizer=SGD(lr=0.01, momentum=0.9, nesterov=True, decay=1e-3),
        metrics=[metrics.binary_accuracy],
    )

    def bytez_to_numpy(bytez, maxlen):
        """Truncate/pad raw bytes into a fixed-length uint16 vector of symbol ids."""
        b = np.ones((maxlen,), dtype=np.uint16) * padding_char
        bytez = np.frombuffer(bytez[:maxlen], dtype=np.uint8)
        b[:len(bytez)] = bytez
        return b

    def getfile_service(sha256, url=None, maxlen=maxlen):
        """Fetch file contents by sha256 from `url` and return a model-ready vector.

        Returns None when the HTTP request fails, so callers can skip the sample.
        """
        if url is None:
            raise NotImplementedError("You must provide your own url for getting file bytez by sha256")
        r = requests.get(url, params={'sha256': sha256})
        if not r.ok:
            return None
        return bytez_to_numpy(r.content, maxlen)

    def generator(hashes, labels, batch_size, shuffle=True):
        """Yield (X, y) minibatches forever, skipping samples that fail to download."""
        X = []
        y = []
        zipped = list(zip(hashes, labels))
        while True:
            if shuffle:
                random.shuffle(zipped)
            for sha256, l in zipped:
                x = getfile_service(sha256)
                if x is None:
                    continue  # download failed; drop this sample for the epoch
                X.append(x)
                y.append(l)
                if len(X) == batch_size:
                    yield np.asarray(X, dtype=np.uint16), np.asarray(y)
                    X = []
                    y = []

    import pandas as pd
    train_labels = pd.read_csv('ember_training.csv.gz')
    train_labels = train_labels[train_labels['y'] != -1]  # get only labeled samples
    labels = train_labels['y'].tolist()
    hashes = train_labels['sha256'].tolist()

    from sklearn.model_selection import train_test_split
    hashes_train, hashes_val, labels_train, labels_val = train_test_split(hashes, labels, test_size=200)

    train_gen = generator(hashes_train, labels_train, batch_size)
    val_gen = generator(hashes_val, labels_val, batch_size)

    from keras.callbacks import LearningRateScheduler
    base = K.get_value(model.optimizer.lr)

    def schedule(epoch):
        """Step-decay schedule: divide the base learning rate by 10 every 2 epochs."""
        return base / 10.0**(epoch // 2)

    model.fit_generator(
        train_gen,
        steps_per_epoch=len(hashes_train) // batch_size,
        epochs=10,
        validation_data=val_gen,
        callbacks=[LearningRateScheduler(schedule)],
        validation_steps=int(math.ceil(len(hashes_val) / batch_size)),
    )
    # Save the single-GPU base model (not the multi-GPU wrapper) so it can
    # be reloaded on any device configuration.
    basemodel.save('malconv.h5')

    test_labels = pd.read_csv('ember_test.csv.gz')
    labels_test = test_labels['y'].tolist()
    hashes_test = test_labels['sha256'].tolist()
    test_generator = generator(hashes_test, labels_test, batch_size=1, shuffle=False)
    # Predicted malware probabilities for the test set, in file order.
    test_p = basemodel.predict_generator(test_generator, steps=len(test_labels), verbose=1)
if __name__ == '__main__':
    # Print an explanatory banner instead of running main(): the download
    # URL in getfile_service must be filled in before training can work.
    banner = '*' * 80
    notice = '''
This is nonfunctional demonstration code that is provided for convenience. It shows
- The MalConv structure used in our paper
- Training procedure used in the paper
- How to load the weights for the MalConv model that we used.
It may be made functional by modifying the code to retrieve file contents by sha256
from a user-defined URL.
You may use the provided weights under the Ember AGPL-3.0 license included in the parent directory.
We also ask that you cite the original MalConv paper and refer to the Ember paper as the implementation.
(1) E. Raff, J. Barker, J. Sylvester, R. Brandon, B. Catanzaro, C. Nicholas, "Malware Detection by Eating a Whole EXE", in ArXiv e-prints. Oct. 2017.
@ARTICLE{raff2017malware,
title={Malware detection by eating a whole exe},
author={Raff, Edward and Barker, Jon and Sylvester, Jared and Brandon, Robert and Catanzaro, Bryan and Nicholas, Charles},
journal={arXiv preprint arXiv:1710.09435},
year={2017}
}
(2) H. Anderson and P. Roth, "EMBER: An Open Dataset for Training Static PE Malware Machine Learning Models”, in ArXiv e-prints. Apr. 2018.
@ARTICLE{2018arXiv180404637A,
author = {{Anderson}, H.~S. and {Roth}, P.},
title = "{EMBER: An Open Dataset for Training Static PE Malware Machine Learning Models}",
journal = {ArXiv e-prints},
archivePrefix = "arXiv",
eprint = {1804.04637},
primaryClass = "cs.CR",
keywords = {Computer Science - Cryptography and Security},
year = 2018,
month = apr,
adsurl = {http://adsabs.harvard.edu/abs/2018arXiv180404637A},
}
'''
    print(banner)
    print(notice)
    print(banner)
    # main()  # uncomment after providing a URL in getfile_service (fixes the NotImplementedError)