mulasagg committed on
Commit
a9640f8
·
1 Parent(s): d476005
Files changed (22) hide show
  1. src/config/config.py +18 -0
  2. src/data/images/000449f94c6e689a227209669911783303c5157257d65a42b3d58182e1943376.jpg +3 -0
  3. src/data/images/0005e5022cd608e05426c717720cab930b17de32f9afde7af7db5bff68db21ea.jpg +3 -0
  4. src/data/images/000a04b60f05b748c8716f9bb32fdd88b06f782e0e3f2e8228c77fe1bf39de52.jpg +3 -0
  5. src/data/images/00121fcfdd4dd59e2cc603ddaae415fd17d782f0e89bf663beece329f7c168bd.jpg +3 -0
  6. src/data/images/0016d7e725a7387d3a3992bc27c13a9fe30fffe737808580619ec1e7b7237125.jpg +3 -0
  7. src/data/images/002501f44f86764349341bbd1b50c3a694ae9acd16bdc0a9e1a7655dae6e8ff5.jpg +3 -0
  8. src/data/images/002ce0d28ec990aadbbc89df457189de37d8adaadc9c084b78eb7be9a9820c81.jpg +3 -0
  9. src/data/images/0037ef6aea2b179208cd379210224fb863e12100e921a9e3c036ffbdea7e63d2.jpg +3 -0
  10. src/data/images/006bb451c7207fa375e67a1684a97136a46beea1ff74e193eb4bbf6665a0ec9b.jpg +3 -0
  11. src/data/images/00cf133ba8da1fd1e73a1aa41693334c4d288ec71ced6c331e40d1de09a0c0df.jpg +3 -0
  12. src/data/images/00f3810a4b6c7f552e0bff91fe48694b7a4a7bf750fb03ea846aa3de97a41ba7.jpg +3 -0
  13. src/data/images/05e64dbd41d8dc2baf23d43fa0fcad946d04856691fc17728c1a4d480926e375.jpg +3 -0
  14. src/data/images/05ef80081391bbd33e0f7fa89d9b1b3eca8be6265c3728e223282e1f61739ec2.jpg +3 -0
  15. src/data/images/95a956b0e45c41a80fbc6b479226a9c6780da71e223ca1643cc2e060feea5977.jpg +3 -0
  16. src/data/subset_dataset.csv +0 -0
  17. src/datasets/datasets.py +41 -0
  18. src/main/make_predictions.py +55 -0
  19. src/models/bigru.py +32 -0
  20. src/models/cnn.py +41 -0
  21. src/models/multimodal.py +36 -0
  22. src/utils/get_features.py +31 -0
src/config/config.py ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from transformers import BertTokenizer
2
+ import torch
3
+
4
+
5
# Shared hyper-parameters and runtime settings read by the models and the
# prediction script.
configuration = {
    "sequence_length": 100,
    "tokenizer_name": "bert-base-uncased",
    # Vocabulary size of the tokenizer. NOTE: the tokenizer is instantiated
    # when this module is imported, so importing the config is not free.
    "vocab_size": len(BertTokenizer.from_pretrained("bert-base-uncased")),
    "embedding_dim": 10,
    "input_length": 100,
    "num_filters": 128,
    "kernel_size": 4,
    "num_gated_units": 64,
    "hidden_neurons": 128,
    "dropout_cnn": 0.2,
    "dropout_fc": 0.5,
    # Use the GPU when one is visible to PyTorch, otherwise fall back to the
    # CPU so that `.to(configuration["device"])` does not fail on CPU-only
    # hosts.
    "device": "cuda" if torch.cuda.is_available() else "cpu",
}
src/data/images/000449f94c6e689a227209669911783303c5157257d65a42b3d58182e1943376.jpg ADDED

Git LFS Details

  • SHA256: 360cbf854015a1976a2f075ec6047ed77d659f167610a35b315f1e450ace32a2
  • Pointer size: 132 Bytes
  • Size of remote file: 2 MB
src/data/images/0005e5022cd608e05426c717720cab930b17de32f9afde7af7db5bff68db21ea.jpg ADDED

Git LFS Details

  • SHA256: ca00d8cfddf50b4a47afd0f914ad835c365a91905bb21edcd1ed7ca0d10c5ed2
  • Pointer size: 130 Bytes
  • Size of remote file: 86 kB
src/data/images/000a04b60f05b748c8716f9bb32fdd88b06f782e0e3f2e8228c77fe1bf39de52.jpg ADDED

Git LFS Details

  • SHA256: 3ad5daedabf18f5170233ad0cc2b441776af0f940d6413e819aa6bbafa54b429
  • Pointer size: 131 Bytes
  • Size of remote file: 571 kB
src/data/images/00121fcfdd4dd59e2cc603ddaae415fd17d782f0e89bf663beece329f7c168bd.jpg ADDED

Git LFS Details

  • SHA256: d3678f8f227bb7725b37fcbed07531c656cfa1e3f188832c3fad1ea359c6ebc0
  • Pointer size: 131 Bytes
  • Size of remote file: 184 kB
src/data/images/0016d7e725a7387d3a3992bc27c13a9fe30fffe737808580619ec1e7b7237125.jpg ADDED

Git LFS Details

  • SHA256: 1f6b69428367f22c8580681397ae26e8f7ac58b01388903f2478f7e6bf6b9596
  • Pointer size: 132 Bytes
  • Size of remote file: 2.72 MB
src/data/images/002501f44f86764349341bbd1b50c3a694ae9acd16bdc0a9e1a7655dae6e8ff5.jpg ADDED

Git LFS Details

  • SHA256: 061468dcdc4f88800770a7e4fe8f9da6017b8872c1910b36b49d736b8b95f14a
  • Pointer size: 130 Bytes
  • Size of remote file: 10.6 kB
src/data/images/002ce0d28ec990aadbbc89df457189de37d8adaadc9c084b78eb7be9a9820c81.jpg ADDED

Git LFS Details

  • SHA256: fab0901e12291504a8d881833be73bd30c420b1989e5dc8b8ac865b9edd31549
  • Pointer size: 131 Bytes
  • Size of remote file: 224 kB
src/data/images/0037ef6aea2b179208cd379210224fb863e12100e921a9e3c036ffbdea7e63d2.jpg ADDED

Git LFS Details

  • SHA256: 70ee117602e7dc7a72b549b073253ca2655019d9b155be7d704bf9faef96f2f9
  • Pointer size: 131 Bytes
  • Size of remote file: 117 kB
src/data/images/006bb451c7207fa375e67a1684a97136a46beea1ff74e193eb4bbf6665a0ec9b.jpg ADDED

Git LFS Details

  • SHA256: ba7892954330ff2e19f16ff7c8d9165e5685c4c16b48400682d2cff368b16bae
  • Pointer size: 130 Bytes
  • Size of remote file: 61.5 kB
src/data/images/00cf133ba8da1fd1e73a1aa41693334c4d288ec71ced6c331e40d1de09a0c0df.jpg ADDED

Git LFS Details

  • SHA256: 3041ffe6341df13056711e00241912e00821e1e3e07f06571715eafb7f061e33
  • Pointer size: 131 Bytes
  • Size of remote file: 368 kB
src/data/images/00f3810a4b6c7f552e0bff91fe48694b7a4a7bf750fb03ea846aa3de97a41ba7.jpg ADDED

Git LFS Details

  • SHA256: b0c4a11cd0a985f0e96ff4e011135937545ac5b461b1cb5860faa1170c0ef97d
  • Pointer size: 130 Bytes
  • Size of remote file: 52.6 kB
src/data/images/05e64dbd41d8dc2baf23d43fa0fcad946d04856691fc17728c1a4d480926e375.jpg ADDED

Git LFS Details

  • SHA256: 6d4b6007de18afcb01213dd9e37e95b5d6326f816b878e8bfd7685f3cb1845a6
  • Pointer size: 131 Bytes
  • Size of remote file: 604 kB
src/data/images/05ef80081391bbd33e0f7fa89d9b1b3eca8be6265c3728e223282e1f61739ec2.jpg ADDED

Git LFS Details

  • SHA256: d8397b21580eb342c540c219105539a7792523e21ac90b75ec425d24cd160e9a
  • Pointer size: 129 Bytes
  • Size of remote file: 6.79 kB
src/data/images/95a956b0e45c41a80fbc6b479226a9c6780da71e223ca1643cc2e060feea5977.jpg ADDED

Git LFS Details

  • SHA256: ed48add871d87db735c3b9656257a03504d53840255312562630bb31bd7a295b
  • Pointer size: 130 Bytes
  • Size of remote file: 12.1 kB
src/data/subset_dataset.csv ADDED
The diff for this file is too large to render. See raw diff
 
src/datasets/datasets.py ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from torch.utils.data import Dataset
3
+
4
+ from transformers import BertTokenizer
5
+ from PIL import Image
6
+ import numpy as np
7
+
8
+ from typing import List
9
+
10
+
11
class CombinedDataset(Dataset):
    """Tokenized API-call sequences paired with one image.

    Each entry of ``api_call_list`` is a list of API names; the names are
    joined with spaces and encoded with a BERT tokenizer to exactly
    ``max_len`` ids. ``img_path`` is a single image path shared by all items.

    NOTE(review): ``sequence_length`` is stored but not used by this class.
    NOTE(review): ``__getitem__`` does not use ``idx``; it always returns the
    whole token matrix, which callers use as a ready-made batch.
    """

    def __init__(self, api_call_list, img_path, sequence_length, max_len=128, transforms=None, tokenizer_name='bert-base-uncased'):
        self.image_path = img_path

        self.transforms = transforms
        self.max_len = max_len
        self.sequence_length = sequence_length

        self.tokenizer = BertTokenizer.from_pretrained(tokenizer_name)
        self.api_calls = api_call_list

        # Encode every API-call list as one space-separated sentence.
        encoded = []
        for call in self.api_calls:
            token_ids = self.tokenizer.encode(
                " ".join(call),
                add_special_tokens=True,
                max_length=self.max_len,
                padding='max_length',
                truncation=True,
            )
            encoded.append(token_ids)
        self.encoded_calls = encoded

        # Force every sequence to exactly max_len ids before stacking.
        self.padded_calls = np.array([self._fit_length(ids) for ids in self.encoded_calls])
        print("Dataset initialized")

    def _fit_length(self, token_ids):
        """Right-pad with zeros, or cut, so the result has ``max_len`` ids."""
        missing = self.max_len - len(token_ids)
        if missing > 0:
            return token_ids + [0] * missing
        return token_ids[:self.max_len]

    def __len__(self):
        """Return the number of encoded API-call sequences."""
        return len(self.padded_calls)

    def __getitem__(self, idx):
        """Return ``(all token ids as a long tensor, image)``; ``idx`` is ignored."""
        picture = Image.open(self.image_path)

        if self.transforms:
            picture = self.transforms(picture)

        return torch.tensor(self.padded_calls, dtype=torch.long), picture
src/main/make_predictions.py ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from src.config import config
2
+ from src.datasets.datasets import CombinedDataset
3
+ from src.models.multimodal import CombinedMalwareDetectionModel
4
+ from src.models.bigru import CNNBiGRU
5
+ from src.models.cnn import ImprovedCNN
6
+ from src.utils.get_features import get_img_api
7
+ from torchvision import transforms
8
+
9
+ import pickle
10
+ import torch
11
+
12
+
13
# CSV holding one row per sample (hash, type, then one column per API call).
data_path = 'src/data/subset_dataset.csv'

# Deterministic preprocessing for inference. Random augmentation (such as a
# random horizontal flip) is deliberately left out: applying it at prediction
# time would make the result for the same image vary from run to run.
simple_transform = transforms.Compose([
    transforms.Resize((128, 128)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.5], std=[0.5]),
])
21
+
22
def load_model(model_path, device='cpu'):
    """Unpickle a model from ``model_path`` and return it moved to ``device``.

    Args:
        model_path: Path of the pickle file containing the model object.
        device: Target passed to the model's ``.to()`` method.

    Returns:
        Whatever ``.to(device)`` returns for the unpickled object.
    """
    with open(model_path, 'rb') as handle:
        # SECURITY: pickle.load can execute arbitrary code stored in the
        # file. Only load model dumps that come from a trusted source.
        restored = pickle.load(handle)
    on_device = restored.to(device)
    return on_device
27
+
28
+
29
def get_prediction(model, padded_sequences, img_x, device='cuda'):
    """Classify one sample and return the name of the predicted class.

    Args:
        model: Callable module taking ``(padded_sequences, img_x)`` and
            returning one row of class scores.
        padded_sequences: Token-id tensor for the sample.
        img_x: Image tensor for the sample.
        device: Device the inputs are moved to before the forward pass.

    Returns:
        The entry of the class-name list selected by the highest score.

    Raises:
        An error from ``Tensor.item()`` if the model returns scores for more
        than one sample; this helper handles a single sample per call.
    """
    malware_classes = ["Benign", "RedLine Stealer", "Downloader", "RAT",
                       "Banking Trojan", "Snake Keylogger", "Spyware"]

    # Move inputs to the device
    padded_sequences, img_x = padded_sequences.to(device), img_x.to(device)

    # Run the forward pass in evaluation mode and without autograd so that
    # dropout is disabled and batch-norm uses its running statistics. The
    # previous train/eval mode is restored afterwards.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            outputs = model(padded_sequences, img_x)
    finally:
        model.train(was_training)

    _, predicted = torch.max(outputs, 1)

    # Convert the one-element index tensor to a plain int before indexing.
    return malware_classes[int(predicted.item())]
41
+
42
+
43
model_path = "model_dump/model_malware_lstm (1).pkl"

# Sample to classify, identified by its SHA256 in the dataset CSV.
sample_hash = '002ce0d28ec990aadbbc89df457189de37d8adaadc9c084b78eb7be9a9820c81'
image_path, api_call_list = get_img_api(sample_hash, data_path)

# get_img_api returns (None, None) when the hash is not in the CSV; stop with
# a clear message instead of failing later inside the dataset.
if image_path is None or api_call_list is None:
    raise SystemExit(f"No row with SHA256 {sample_hash} found in {data_path}")

dataset = CombinedDataset(
    api_call_list,
    image_path,
    transforms=simple_transform,
    sequence_length=config.configuration["sequence_length"],
)

# The dataset yields the token matrix already shaped as a batch; only the
# image needs a leading batch dimension.
padded_sequences, img_x = next(iter(dataset))
img_x = img_x.unsqueeze(0)  # type: ignore

model = load_model(model_path, device=config.configuration["device"])
predicted_class = get_prediction(model, padded_sequences, img_x, config.configuration["device"])
print(f"Predicted class: {predicted_class}")
src/models/bigru.py ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn as nn
3
+ import torch.nn.functional as F
4
+
5
class CNNBiGRU(nn.Module):
    """Embedding -> 1-D convolution -> max-pool -> bidirectional recurrent encoder.

    The forward pass maps a batch of token-id sequences to 128 features per
    sequence.

    NOTE(review): ``self.bigru`` is an ``nn.LSTM`` despite its name.
    NOTE(review): ``fc1``, ``fc2`` and ``dropout_fc`` are created but never
    applied in ``forward``; ``input_length`` is accepted but unused.
    NOTE(review): ``self.output`` takes ``hidden_neurons`` input features but
    is fed ``2 * num_gated_units`` features, so the two must be equal.
    """

    def __init__(self, vocab_size, embedding_dim, input_length, num_filters, kernel_size,
                 num_gated_units, hidden_neurons, dropout_cnn, dropout_fc):
        super().__init__()

        # Token embedding; index 0 is the padding id.
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)

        # Convolutional feature extractor over the sequence axis.
        self.conv1d = nn.Conv1d(
            in_channels=embedding_dim,
            out_channels=num_filters,
            kernel_size=kernel_size,
        )
        self.dropout_cnn = nn.Dropout(dropout_cnn)
        self.maxpool = nn.MaxPool1d(kernel_size=kernel_size, stride=1)

        # Bidirectional recurrent layer run over the pooled feature sequence.
        self.bigru = nn.LSTM(
            input_size=num_filters,
            hidden_size=num_gated_units,
            num_layers=1,
            batch_first=True,
            bidirectional=True,
        )

        self.fc1 = nn.Linear(num_gated_units * 2, hidden_neurons)
        self.fc2 = nn.Linear(hidden_neurons, hidden_neurons)
        self.dropout_fc = nn.Dropout(dropout_fc)
        self.output = nn.Linear(hidden_neurons, 128)

    def forward(self, x):
        """Encode token ids ``x`` (batch, length) into (batch, 128) features."""
        embedded = self.embedding(x)

        # Conv1d expects channels before length, hence the permutes.
        channels_first = embedded.permute(0, 2, 1)
        conv_features = F.relu(self.conv1d(channels_first))
        pooled = self.maxpool(self.dropout_cnn(conv_features))
        steps_first = pooled.permute(0, 2, 1)

        recurrent_out, _ = self.bigru(steps_first)
        last_step = recurrent_out[:, -1, :]
        return self.output(last_step)
src/models/cnn.py ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn as nn
3
+
4
class ImprovedCNN(nn.Module):
    """Three conv/batch-norm/ReLU stages followed by a small MLP head.

    The channel width doubles at every stage (``hidden_units``, ``x2``,
    ``x4``). The last stage pools to a 4x4 map and the head maps the flattened
    map to 128 output features.

    NOTE(review): ``num_classes`` is accepted but not used; the output width
    is fixed at 128.
    """

    def __init__(self, input_channels, hidden_units, num_classes=4):
        super().__init__()
        narrow, medium, wide = hidden_units, hidden_units * 2, hidden_units * 4

        self.block1 = self._stage(input_channels, narrow, nn.MaxPool2d(kernel_size=2))
        self.block2 = self._stage(narrow, medium, nn.MaxPool2d(kernel_size=2))
        self.block3 = self._stage(medium, wide, nn.AdaptiveAvgPool2d(output_size=(4, 4)))

        self.classifier = nn.Sequential(
            nn.Flatten(),
            nn.Linear(wide * 4 * 4, 256),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(256, 128),
        )

    @staticmethod
    def _stage(in_channels, out_channels, pool):
        """Build one 3x3 conv -> batch-norm -> ReLU -> ``pool`` stage."""
        return nn.Sequential(
            nn.Conv2d(in_channels=in_channels, out_channels=out_channels, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(out_channels),
            nn.ReLU(),
            pool,
        )

    def forward(self, x):
        """Run the three stages and the head in order."""
        for layer in (self.block1, self.block2, self.block3, self.classifier):
            x = layer(x)
        return x
src/models/multimodal.py ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn as nn
3
+ import torch.nn.functional as F
4
+ from src.models.bigru import CNNBiGRU
5
+ from src.models.cnn import ImprovedCNN
6
+ from src.config import config
7
+
8
+
9
class CombinedMalwareDetectionModel(nn.Module):
    """Fuse API-call sequence features and image features into 7 class scores.

    The sequence branch is a ``CNNBiGRU`` and the image branch an
    ``ImprovedCNN`` with one input channel. Their outputs are concatenated
    (image first) and passed through a three-layer fully connected head.
    Remaining sequence-branch settings are read from ``config.configuration``.
    """

    def __init__(self, vocab_size, embedding_dim, num_filters, kernel_size):
        super().__init__()
        settings = config.configuration

        # Sequence branch over tokenized API calls.
        self.malware_detection_model = CNNBiGRU(
            vocab_size,
            embedding_dim,
            settings["input_length"],
            num_filters,
            kernel_size,
            settings["num_gated_units"],
            settings["hidden_neurons"],
            settings["dropout_cnn"],
            settings["dropout_fc"],
        )

        # Image branch.
        self.improved_cnn = ImprovedCNN(input_channels=1, hidden_units=32)

        # Fusion head: 256 concatenated features in, 7 scores out.
        self.fc1 = nn.Linear(256, 64)
        self.fc2 = nn.Linear(64, 32)
        self.fc3 = nn.Linear(32, 7)

        self.dropout = nn.Dropout(0.2)

    def forward(self, padded_sequences, img_x):
        """Return the raw class scores for the given sequences and images."""
        api_features = self.malware_detection_model(padded_sequences)
        image_features = self.improved_cnn(img_x)

        fused = torch.cat([image_features, api_features], dim=-1).to(torch.float32)

        hidden = self.dropout(F.relu(self.fc1(fused)))
        hidden = self.dropout(F.relu(self.fc2(hidden)))
        return self.fc3(hidden)
src/utils/get_features.py ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+
3
def get_img_api(hash, data_path, image_dir='src/data/images/'):
    """Look up a sample by SHA256 and return its image path and API calls.

    Args:
        hash: SHA256 string identifying the sample. The name shadows the
            ``hash`` builtin and is kept only for caller compatibility.
        data_path: Path of the CSV file. It must have a ``SHA256`` column;
            every column from the third onward is treated as an API-call
            flag.
        image_dir: Directory containing ``<SHA256>.jpg`` images. Defaults to
            the previously hard-coded location.

    Returns:
        ``(img_path, api_call_list)``, where ``api_call_list`` is a list
        holding one list with the names of all API columns whose value is 1
        for the sample, or ``(None, None)`` if no row matches ``hash``.
    """
    import os

    df = pd.read_csv(data_path)

    row = df[df['SHA256'] == hash]
    if row.empty:
        return None, None

    # Use the first matching row if the hash appears more than once.
    sample = row.iloc[0]

    img_path = os.path.join(image_dir, f"{sample['SHA256']}.jpg")

    # Skip the first two columns (SHA256 and Type); the rest are API flags.
    api_columns = df.columns[2:]

    # Keep only the API calls that are present (value == 1). The outer list
    # makes the result a one-element batch of call lists.
    api_call_list = [[api for api in api_columns if sample[api] == 1]]

    return img_path, api_call_list
25
+
26
+
27
+ # hash_value = '002ce0d28ec990aadbbc89df457189de37d8adaadc9c084b78eb7be9a9820c81'
28
+ # img_path, api_call_list = get_img_api(hash_value, 'src/data/subset_dataset.csv')
29
+
30
+ # print("Image Path:", img_path)
31
+ # print("API Call List:", api_call_list)