ChenPingAn74 commited on
Commit
082b1e3
·
verified ·
1 Parent(s): 0a7c80e

Upload 7 files

Browse files
README.md CHANGED
@@ -1,4 +1,91 @@
1
  ---
2
- tags:
3
- - code
4
- ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
+ license: mit
3
+ ---
4
+
5
+ <h1 align="center">OpTrans: Enhancing Binary Code Similarity Detection with Function Inlining Re-Optimization</h1>
6
+
7
+ <h4 align="center">
8
+ <p>
9
+ <a href=#about>About</a> |
10
+ <a href=#intuition>Intuition</a> |
11
+ <a href=#quickstart>QuickStart</a> |
12
+ </p>
13
+ </h4>
14
+
15
+ ## About
16
+
17
+ OpTrans (Re-Optimization Transformer) is an innovative framework that fuses binary code optimization techniques with the transformer model for binary code similarity detection (BCSD). OpTrans employs an algorithm based on binary program analysis to determine which functions should be inlined, followed by binary rewriting techniques to effectuate re-optimization on binaries. Our goal is to provide an effective tool for researchers and practitioners in binary code similarity detection, with our models accessible on the Hugging Face Model Hub.
18
+
19
+
20
+ ## Intuition
21
+
22
+ This document will present how function inlining optimization improves binary code similarity detection.
23
+
24
+ Function Faust_next in sc3-plugins-HOAEncLebedev501.so compiled with -O0 (sc3-plugins-HOAEncLebedev501.so-O0.i64)
25
+ ![O0](./Intuition/O0.jpg)
26
+
27
+ Function Faust_next in sc3-plugins-HOAEncLebedev501.so compiled with -O3 (sc3-plugins-HOAEncLebedev501.so-O3.i64)
28
+ ![O3](./Intuition/O3.jpg)
29
+
30
+ Function Faust_next in sc3-plugins-HOAEncLebedev501.so compiled with -O0 and processed by function inlining optimization (sc3-plugins-HOAEncLebedev501.so-O0-inline.i64)
31
+ ![O0-inline](./Intuition/O0-inline.jpg)
32
+
33
+
34
+ ## QuickStart
35
+
36
+ This document will help you set up and start using the OpTrans model for embedding generation.
37
+
38
+
39
+ ### Requirements
40
+ - Python 3.6 or higher
41
+ - [PyTorch](https://pytorch.org/get-started/locally/)
42
+ - [Transformers library](https://huggingface.co/docs/transformers/installation)
43
+ - A CUDA-enabled GPU is highly recommended for faster processing.
44
+
45
+ Ensure you have Python and PyTorch installed on your system. Then, install the Transformers library using pip:
46
+ ```bash
47
+ pip install transformers
48
+ ```
49
+
50
+ ### Preparing Tokenizers and Models
51
+ Import necessary libraries and initialize the model and tokenizers:
52
+ ```python
53
+ import torch
54
+ from transformers import AutoModel, AutoTokenizer
55
+
56
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
57
+
58
+ tokenizer = AutoTokenizer.from_pretrained("sandspeare/optrans", trust_remote_code=True)
59
+ encoder = AutoModel.from_pretrained("sandspeare/optrans", trust_remote_code=True).to(device)
60
+ tokenizer.pad_token = tokenizer.unk_token
61
+ ```
62
+
63
+ ### Example Use Cases
64
+ **Function inlining optimization for BCSD**
65
+
66
+ 1. Load your binary code dataset. For demonstration, we use a JSON file containing binary code snippets for similarity comparison.
67
+
68
+ ```python
69
+ import json
+
+ with open("./CaseStudy/casestudy.json") as fp:
70
+ data = json.load(fp)
71
+ ```
72
+
73
+ 2. Encode the binary code.
74
+
75
+ ```python
76
+ asm_O0 = tokenizer([data["O0"]], padding=True, return_tensors="pt").to(device)
77
+ asm_embedding_O0 = encoder(**asm_O0)
78
+
79
+ asm_O0_inline = tokenizer([data["O0_inline"]], padding=True, return_tensors="pt").to(device)
80
+ asm_embedding_O0_inline = encoder(**asm_O0_inline)
81
+
82
+ asm_O3 = tokenizer([data["O3"]], padding=True, return_tensors="pt").to(device)
83
+ asm_embedding_O3 = encoder(**asm_O3)
84
+ ```
85
+
86
+ 3. Perform similarity comparison:
87
+
88
+ ```python
89
+ sim_O0vsO3 = torch.mm(asm_embedding_O0, asm_embedding_O3.T).squeeze() / 0.07
90
+ sim_O0_inlinevsO3 = torch.mm(asm_embedding_O0_inline, asm_embedding_O3.T).squeeze() / 0.07
91
+ ```
config.json ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "AsmEncoder"
4
+ ],
5
+ "auto_map": {
6
+ "AutoModel": "optrans_modeling.AsmEncoder"
7
+ },
8
+ "attention_probs_dropout_prob": 0.1,
9
+ "bla_dim": 1024,
10
+ "embedding_size": 768,
11
+ "hidden_act": "gelu",
12
+ "hidden_dropout_prob": 0.1,
13
+ "hidden_size": 768,
14
+ "initializer_range": 0.02,
15
+ "intermediate_size": 3072,
16
+ "layer_norm_eps": 1e-12,
17
+ "max_position_embeddings": 2048,
18
+ "model_type": "roformer",
19
+ "num_attention_heads": 12,
20
+ "num_hidden_layers": 12,
21
+ "pad_token_id": 1,
22
+ "rotary_value": false,
23
+ "token_type_ids": "share",
24
+ "torch_dtype": "bfloat16",
25
+ "transformers_version": "4.35.0",
26
+ "type_vocab_size": 2,
27
+ "use_bias": false,
28
+ "use_cache": true,
29
+ "vocab_size": 33494
30
+ }
mytrans_modeling.py ADDED
@@ -0,0 +1,138 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ import torch
3
+ import torch.utils.checkpoint
4
+ from torch import nn
5
+ from typing import Optional, Tuple
6
+ import torch.nn.functional as F
7
+ from transformers import BatchEncoding
8
+ from transformers import MPNetTokenizerFast
9
+
10
+
11
+ from transformers.models.roformer.modeling_roformer import (
12
+ RoFormerEmbeddings,
13
+ RoFormerModel,
14
+ RoFormerEncoder,
15
+ RoFormerLayer,
16
+ RoFormerAttention,
17
+ RoFormerIntermediate,
18
+ RoFormerOutput,
19
+ RoFormerSelfAttention,
20
+ RoFormerPreTrainedModel
21
+ )
22
+
23
+ from transformers.models.mpnet.modeling_mpnet import MPNetModel
24
+
25
+
26
class JRoFormerEmbeddings(RoFormerEmbeddings):
    """Embeddings in which the token-type table is tied to the word table.

    A single ``nn.Embedding`` (sized by ``config.vocab_size`` /
    ``config.embedding_size``, padded at ``config.pad_token_id``) serves
    both roles, so word and token-type lookups share parameters.
    """

    def __init__(self, config):
        super().__init__(config)
        shared_table = nn.Embedding(
            config.vocab_size,
            config.embedding_size,
            padding_idx=config.pad_token_id,
        )
        self.word_embeddings = shared_table
        # Weight tying: token-type ids index the same table as word ids.
        self.token_type_embeddings = shared_table
35
+
36
+
37
class JRoFormerSelfAttention(RoFormerSelfAttention):
    """Self-attention whose Q/K/V projections honour ``config.use_bias``."""

    def __init__(self, config):
        super().__init__(config)

        def _projection():
            # All three projections share the same shape and bias policy.
            return nn.Linear(
                config.hidden_size, self.all_head_size, bias=config.use_bias
            )

        # Created in query/key/value order, matching the base class layout.
        self.query = _projection()
        self.key = _projection()
        self.value = _projection()
49
+
50
+
51
class JRoFormerAttention(RoFormerAttention):
    """RoFormer attention that swaps in the bias-configurable self-attention."""

    def __init__(self, config):
        super().__init__(config)
        # Replace only the inner self-attention; output projection is inherited.
        self.self = JRoFormerSelfAttention(config)
55
+
56
+
57
class JRoFormerLayer(RoFormerLayer):
    """Transformer layer built on :class:`JRoFormerAttention`.

    Mirrors ``RoFormerLayer`` but uses the customized attention module;
    cross-attention is only valid when the layer acts as a decoder.
    """

    def __init__(self, config):
        super().__init__(config)
        self.attention = JRoFormerAttention(config)
        self.is_decoder = config.is_decoder
        self.add_cross_attention = config.add_cross_attention
        if self.add_cross_attention:
            # Cross-attention without decoder semantics is a config error.
            if not self.is_decoder:
                raise ValueError(
                    f"{self} should be used as a decoder model if cross attention is added"
                )
            self.crossattention = RoFormerAttention(config)
        self.intermediate = RoFormerIntermediate(config)
        self.output = RoFormerOutput(config)
71
+
72
+
73
class JRoFormerEncoder(RoFormerEncoder):
    """Encoder stack of ``config.num_hidden_layers`` JRoFormer layers."""

    def __init__(self, config):
        super().__init__(config)
        self.layer = nn.ModuleList(
            JRoFormerLayer(config) for _ in range(config.num_hidden_layers)
        )
79
+
80
+
81
class JRoFormerModel(RoFormerModel):
    """RoFormer backbone wired with tied embeddings and custom layers.

    Identical to ``RoFormerModel`` except that it uses
    :class:`JRoFormerEmbeddings` and :class:`JRoFormerEncoder`.
    """

    def __init__(self, config):
        super().__init__(config)
        self.config = config
        self.embeddings = JRoFormerEmbeddings(config)

        # A projection is only needed when the embedding width differs
        # from the transformer hidden width.
        needs_projection = config.embedding_size != config.hidden_size
        if needs_projection:
            self.embeddings_project = nn.Linear(
                config.embedding_size, config.hidden_size
            )

        self.encoder = JRoFormerEncoder(config)

        # Initialize weights and apply final processing.
        self.post_init()
96
+
97
class AsmEncoder(RoFormerPreTrainedModel):
    """Encode assembly token sequences into L2-normalized embeddings.

    Runs the JRoFormer backbone, mean-pools token embeddings under the
    attention mask, projects to ``config.bla_dim``, and L2-normalizes, so
    that similarity between two functions reduces to a dot product.
    """

    def __init__(self, config):
        super().__init__(config)
        self.config = config
        self.roformer = JRoFormerModel(config)
        # Projection head mapping pooled hidden states to the embedding space.
        self.projection = nn.Linear(config.hidden_size, config.bla_dim)

    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        encoder_hidden_states: Optional[torch.FloatTensor] = None,
        encoder_attention_mask: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ):
        """Return one L2-normalized embedding per sequence.

        Returns:
            torch.FloatTensor of shape ``(batch, config.bla_dim)`` with
            unit L2 norm along dim 1.
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.roformer(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            encoder_hidden_states=encoder_hidden_states,
            encoder_attention_mask=encoder_attention_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        token_embeddings = outputs[0]

        # Bug fix: the original unconditionally called
        # attention_mask.unsqueeze(-1), so the documented default
        # attention_mask=None raised AttributeError. Fall back to attending
        # to every position, matching the backbone's own default behavior.
        if attention_mask is None:
            attention_mask = torch.ones(
                token_embeddings.shape[:-1], device=token_embeddings.device
            )

        # Masked mean pooling; the clamp guards against division by zero
        # for rows whose mask is entirely zero.
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).to(token_embeddings.dtype)
        asm_embedding = torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)
        asm_embedding = self.projection(asm_embedding)
        asm_embedding = F.normalize(asm_embedding, p=2, dim=1)

        return asm_embedding
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4add8e8eb42e5e0a6e6864837fb0da7b6fe547a2b171b99046efd09478e65a05
3
+ size 223364289
special_tokens_map.json ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<s>",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "</s>",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "unk_token": {
17
+ "content": "<unk>",
18
+ "lstrip": false,
19
+ "normalized": false,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ }
23
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
The diff for this file is too large to render. See raw diff