Upload 6 files
Browse files
- .gitattributes +1 -0
- labelData.csv +10 -0
- malware_classifier_lime.h5 +3 -0
- model_training.py +131 -0
- predict.py +61 -0
- spectrum.tar +3 -0
- validate.py +50 -0
.gitattributes
CHANGED
|
@@ -32,3 +32,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
| 32 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 33 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
| 32 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 33 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 35 |
+
spectrum.tar filter=lfs diff=lfs merge=lfs -text
|
labelData.csv
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Login successful for user admin,0
|
| 2 |
+
User admin added a new user john_doe,0
|
| 3 |
+
Failed login attempt from IP 192.168.1.55,1
|
| 4 |
+
User john_doe deleted file important_document.txt,0
|
| 5 |
+
New user jane_doe registered,0
|
| 6 |
+
Unauthorized access from IP 10.0.0.27,1
|
| 7 |
+
User admin updated file financial_report.xlsx,0
|
| 8 |
+
Failed login attempt from IP 192.168.1.56,1
|
| 9 |
+
User jane_doe uploaded file meeting_notes.docx,0
|
| 10 |
+
Unauthorized access from IP 10.0.0.28,1
|
malware_classifier_lime.h5
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:b23d9ceac1ec5ac69287c9a597d51586a017dcc525aede8a6b0953ba0597ca56
|
| 3 |
+
size 53156780
|
model_training.py
ADDED
|
@@ -0,0 +1,131 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# The goal of this script is to train the model and then save it.
|
| 2 |
+
import os
|
| 3 |
+
import tensorflow as tf
|
| 4 |
+
from tensorflow.keras.preprocessing.image import ImageDataGenerator
|
| 5 |
+
from tensorflow.keras.models import Sequential
|
| 6 |
+
from tensorflow.keras.layers import Conv2D, Activation, MaxPooling2D, Flatten, Dense, Dropout
|
| 7 |
+
from tensorflow.keras.optimizers import Adam
|
| 8 |
+
from tensorflow.keras.models import load_model
|
| 9 |
+
from tensorflow.keras.preprocessing.image import load_img, img_to_array
|
| 10 |
+
import shutil
|
| 11 |
+
from PIL import Image
|
| 12 |
+
from tensorflow.keras.preprocessing.image import load_img, img_to_array
|
| 13 |
+
import matplotlib.pyplot as plt
|
| 14 |
+
import cv2
|
| 15 |
+
#import seaborn as sns
|
| 16 |
+
import numpy as np
|
| 17 |
+
import pickle
|
| 18 |
+
|
| 19 |
+
def clean_directory(directory, cache_file="cache.pkl"):
    """Delete unreadable images and empty leaf folders below ``directory``.

    Every folder that has no sub-folders is treated as one class. Each file
    in such a folder is opened with PIL and verified; files that fail are
    deleted. A leaf folder that ends up without a single valid file is
    removed and is not counted.

    The resulting count is pickled to ``cache_file``. When ``cache_file``
    already exists its content is returned immediately and nothing on disk
    is inspected or modified.

    Args:
        directory: Root folder of the image data set.
        cache_file: Path of the pickle file that stores the class count.

    Returns:
        The number of leaf folders that still hold at least one valid image
        (or whatever value the existing cache file contains).
    """
    # NOTE(review): the cache is not keyed by ``directory``; a stale cache
    # file written for another data set is returned as-is. Delete the file
    # to force a rescan.
    if os.path.exists(cache_file):
        with open(cache_file, "rb") as f:
            num_classes = pickle.load(f)
        print("Loaded cached results.")
        return num_classes

    num_classes = 0
    for subdir, dirs, files in os.walk(directory):
        # Only leaf folders are inspected; folders with children are skipped.
        if dirs:
            continue

        has_valid_file = False
        for file in files:
            file_path = os.path.join(subdir, file)
            try:
                # The context manager closes the file handle before the
                # handler runs, so the os.remove() below cannot be blocked
                # by our own open handle.
                with Image.open(file_path) as img:
                    img.verify()  # Verify if the image is not corrupted
            except (OSError, SyntaxError):
                print(f"Removing corrupted file: {file_path}")
                os.remove(file_path)
            else:
                has_valid_file = True

        if has_valid_file:
            num_classes += 1
        else:
            # Remove empty directories
            print(f"Removing empty directory: {subdir}")
            shutil.rmtree(subdir)

    # Save the results in cache
    with open(cache_file, "wb") as f:
        pickle.dump(num_classes, f)
    print("Saved results to cache.")

    return num_classes
|
| 53 |
+
|
| 54 |
+
# Root folder of the image data set that is cleaned and then fed to Keras.
data_dir = 'Malign/extract'

# Prune unreadable images and empty folders before the tree is indexed.
num_classes = clean_directory(data_dir)

# Training hyper-parameters
batch_size, epochs = 32, 50
image_size = (200, 200)  # Every image is resized to this size for the model

# Data preprocessing: multiply pixel values by 1/255 and hold back 20% of
# the files as the validation subset.
train_datagen = ImageDataGenerator(validation_split=0.2, rescale=1./255)

# Settings shared by the training and the validation iterator.
_flow_options = dict(
    target_size=image_size,
    batch_size=batch_size,
    class_mode='categorical',
)

train_generator = train_datagen.flow_from_directory(
    data_dir, subset='training', **_flow_options
)

validation_generator = train_datagen.flow_from_directory(
    data_dir, subset='validation', **_flow_options
)
|
| 83 |
+
|
| 84 |
+
# Model creation
# Size the softmax layer from the classes Keras actually found on disk
# instead of a hard-coded 119, so the script keeps working when the data
# set gains or loses a class folder.
output_units = len(train_generator.class_indices)

model = Sequential()

# First convolution layer
model.add(Conv2D(64, (3, 3), input_shape=(*image_size, 3)))
model.add(Activation('relu'))
model.add(MaxPooling2D(pool_size=(2, 2)))

# Second convolution layer
model.add(Conv2D(64, (3, 3)))
model.add(Activation('relu'))
model.add(MaxPooling2D(pool_size=(2, 2)))

# Third convolution layer
model.add(Conv2D(64, (3, 3)))
model.add(Activation('relu'))
model.add(MaxPooling2D(pool_size=(2, 2)))

# Fully connected layers
model.add(Flatten())
model.add(Dense(128))
model.add(Dropout(0.5))
model.add(Activation('relu'))

# Output layer: one unit per class, softmax over the classes
model.add(Dense(output_units))
model.add(Activation('softmax'))

model.summary()

model.compile(
    optimizer=Adam(learning_rate=0.001),
    loss='categorical_crossentropy',  # matches class_mode='categorical'
    metrics=['accuracy']
)

# Model training
history = model.fit(
    train_generator,
    epochs=epochs,
    validation_data=validation_generator
)

# Save the trained model
model.save("malware_classifier_lime.h5")
|
| 129 |
+
|
| 130 |
+
|
| 131 |
+
|
predict.py
ADDED
|
@@ -0,0 +1,61 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import sys
|
| 2 |
+
import numpy as np
|
| 3 |
+
import tensorflow as tf
|
| 4 |
+
from tensorflow.keras.models import load_model
|
| 5 |
+
from tensorflow.keras.preprocessing.image import load_img, img_to_array
|
| 6 |
+
import matplotlib.pyplot as plt
|
| 7 |
+
from lime import lime_image
|
| 8 |
+
from skimage.segmentation import mark_boundaries
|
| 9 |
+
|
| 10 |
+
def explain_instance(image_path, model, num_features=5, num_samples=1000,
                     explainer=None, target_size=None, top_labels=None):
    """Build a LIME explanation for the image stored at ``image_path``.

    The image is loaded, resized, divided by 255 and passed to LIME together
    with ``model.predict`` as the classifier function.

    Args:
        image_path: Path of the image file to explain.
        model: Loaded Keras model; its ``predict`` method is used by LIME.
        num_features: Forwarded to LIME as ``num_features``.
        num_samples: Forwarded to LIME as ``num_samples``.
        explainer: Optional ``LimeImageExplainer``; a new one is created
            when omitted.
        target_size: Optional size the image is resized to. Defaults to the
            spatial dimensions of ``model.input_shape``.
        top_labels: Optional number of labels LIME should explain. Defaults
            to the width of the model's output layer.

    Returns:
        The explanation object returned by ``explainer.explain_instance``.
    """
    # Derive the defaults from the model itself so the function no longer
    # depends on globals that only exist when the file runs as a script.
    if target_size is None:
        target_size = tuple(model.input_shape[1:3])
    if top_labels is None:
        top_labels = model.output_shape[-1]
    if explainer is None:
        explainer = lime_image.LimeImageExplainer()

    img = load_img(image_path, target_size=target_size)
    img_array = img_to_array(img) / 255
    explanation = explainer.explain_instance(
        img_array,
        model.predict,
        top_labels=top_labels,
        hide_color=0,
        num_samples=num_samples,
        num_features=num_features,
    )
    return explanation
|
| 16 |
+
|
| 17 |
+
if __name__ == "__main__":
    # Script entry point: explain one image with LIME, show the plots, then
    # print the predicted class.
    import os  # only needed here, to rebuild the class-name list

    if len(sys.argv) != 2:
        print("Usage: predict.py image_path")
        sys.exit(1)

    image_path = sys.argv[1]
    image_size = (200, 200)
    model_path = "malware_classifier_lime.h5"
    model = load_model(model_path)
    num_classes = 119

    explainer = lime_image.LimeImageExplainer()
    explanation = explain_instance(image_path, model)

    temp, mask = explanation.get_image_and_mask(explanation.top_labels[0], positive_only=True, num_features=5, hide_rest=False)
    img = load_img(image_path, target_size=image_size)
    img_array = img_to_array(img) / 255

    # Display the original image
    plt.figure(figsize=(10, 5))
    plt.subplot(1, 2, 1)
    plt.imshow(img_array)
    plt.title("Original Image")
    plt.axis("off")

    # Display the LIME explanation
    plt.subplot(1, 2, 2)
    plt.imshow(mark_boundaries(temp, mask))
    plt.title("LIME Explanation")
    plt.axis("off")

    plt.show()

    # Make a prediction (the array loaded above is reused; only the batch
    # axis is added)
    prediction = model.predict(np.expand_dims(img_array, axis=0))
    predicted_class = np.argmax(prediction)

    # Get the class name. The training generator does not exist in this
    # script, so the index -> name mapping is rebuilt the way Keras'
    # flow_from_directory builds it: sub-folders of the data folder in
    # sorted order.
    # NOTE(review): assumes the folder is in the same state as at training
    # time - confirm, or persist class_indices next to the model instead.
    data_dir = 'Malign/extract'
    class_name = "unknown"
    if os.path.isdir(data_dir):
        class_names = sorted(
            entry for entry in os.listdir(data_dir)
            if os.path.isdir(os.path.join(data_dir, entry))
        )
        if predicted_class < len(class_names):
            class_name = class_names[predicted_class]

    print(f"Predicted class: {predicted_class}, Class name: {class_name}")
|
spectrum.tar
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:485d96d9d396c57cde7433181d3645ebe5f87155972921a50ee101f9882a515d
|
| 3 |
+
size 542136320
|
validate.py
ADDED
|
@@ -0,0 +1,50 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import numpy as np
|
| 3 |
+
import tensorflow as tf
|
| 4 |
+
from tensorflow.keras.preprocessing.image import ImageDataGenerator
|
| 5 |
+
from tensorflow.keras.models import load_model
|
| 6 |
+
from sklearn.metrics import classification_report, confusion_matrix
|
| 7 |
+
import pickle
|
| 8 |
+
|
| 9 |
+
# Load the saved model
model = load_model("malware_classifier_lime.h5")

data_dir = 'Malign/extract'

# Parameters
batch_size = 32
image_size = (200, 200)

# Data preprocessing
# Use the same 20% hold-out settings as model_training.py so the metrics
# below are computed on files the model was not fitted on; scoring the
# whole folder would mostly re-score the training data.
test_datagen = ImageDataGenerator(rescale=1./255, validation_split=0.2)

test_generator = test_datagen.flow_from_directory(
    data_dir,
    target_size=image_size,
    batch_size=batch_size,
    class_mode='categorical',
    subset='validation',
    shuffle=False  # keep prediction order aligned with test_generator.classes
)

# Class names and their integer ids; the ids are passed explicitly so a
# class without any held-out sample does not break the report.
class_names = list(test_generator.class_indices.keys())
class_ids = list(range(len(class_names)))

# Evaluate the model
print("Evaluating the model...")
score = model.evaluate(test_generator)
print("Loss: ", score[0])
print("Accuracy: ", score[1])

# Predict the class labels
print("Predicting the class labels...")
y_pred = model.predict(test_generator)
y_pred_classes = np.argmax(y_pred, axis=1)

# Classification report
print("Classification report:")
print(classification_report(
    test_generator.classes,
    y_pred_classes,
    labels=class_ids,
    target_names=class_names,
    zero_division=0,
))

# Confusion matrix
print("Confusion matrix:")
print(confusion_matrix(test_generator.classes, y_pred_classes, labels=class_ids))
|