Upload 7 files

- README.md (+90, -3)
- config.json (+30, -0)
- mytrans_modeling.py (+138, -0)
- pytorch_model.bin (+3, -0)
- special_tokens_map.json (+23, -0)
- tokenizer.json
- tokenizer_config.json
README.md
CHANGED

@@ -1,4 +1,91 @@

---
license: mit
---

<h1 align="center">OpTrans: Enhancing Binary Code Similarity Detection with Function Inlining Re-Optimization</h1>

<h4 align="center">
  <p>
    <a href=#about>About</a> |
    <a href=#intuition>Intuition</a> |
    <a href=#quickstart>QuickStart</a>
  </p>
</h4>

## About

OpTrans (Re-Optimization Transformer) is a framework that fuses binary code optimization techniques with a transformer model for binary code similarity detection (BCSD). OpTrans employs an algorithm based on binary program analysis to determine which functions should be inlined, then applies binary rewriting techniques to re-optimize the binaries. Our goal is to provide an effective tool for researchers and practitioners in binary code similarity detection, with our models accessible on the Hugging Face Model Hub.

## Intuition

This section shows how function inlining optimization improves binary code similarity detection.

Function Faust_next in sc3-plugins-HOAEncLebedev501.so compiled with -O0 (sc3-plugins-HOAEncLebedev501.so-O0.i64)

![image/png](...)

Function Faust_next in sc3-plugins-HOAEncLebedev501.so compiled with -O3 (sc3-plugins-HOAEncLebedev501.so-O3.i64)

![image/png](...)

Function Faust_next in sc3-plugins-HOAEncLebedev501.so compiled with -O0 and processed by function inlining optimization (sc3-plugins-HOAEncLebedev501.so-O0-inline.i64)

![image/png](...)

## QuickStart

This document will help you set up and start using the OpTrans model for embedding generation.

### Requirements

- Python 3.6 or higher
- [PyTorch](https://pytorch.org/get-started/locally/)
- [Transformers library](https://huggingface.co/docs/transformers/installation)
- A CUDA-enabled GPU is highly recommended for faster processing.

Ensure you have Python and PyTorch installed on your system. Then, install the Transformers library using pip:

```bash
pip install transformers
```

### Preparing Tokenizers and Models

Import the necessary libraries and initialize the model and tokenizer:

```python
import torch
from transformers import AutoModel, AutoTokenizer

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

tokenizer = AutoTokenizer.from_pretrained("sandspeare/optrans", trust_remote_code=True)
encoder = AutoModel.from_pretrained("sandspeare/optrans", trust_remote_code=True).to(device)
tokenizer.pad_token = tokenizer.unk_token
```

### Example Use Cases

**Function inlining optimization for BCSD**

1. Load your binary code dataset. For demonstration, we use a JSON file containing binary code snippets for similarity comparison.

```python
import json

with open("./CaseStudy/casestudy.json") as fp:
    data = json.load(fp)
```

2. Encode the binary code.

```python
asm_O0 = tokenizer([data["O0"]], padding=True, return_tensors="pt").to(device)
asm_embedding_O0 = encoder(**asm_O0)

asm_O0_inline = tokenizer([data["O0_inline"]], padding=True, return_tensors="pt").to(device)
asm_embedding_O0_inline = encoder(**asm_O0_inline)

asm_O3 = tokenizer([data["O3"]], padding=True, return_tensors="pt").to(device)
asm_embedding_O3 = encoder(**asm_O3)
```

3. Perform the similarity comparison:

```python
sim_O0vsO3 = torch.mm(asm_embedding_O0, asm_embedding_O3.T).squeeze() / 0.07
sim_O0_inlinevsO3 = torch.mm(asm_embedding_O0_inline, asm_embedding_O3.T).squeeze() / 0.07
```
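Because `AsmEncoder` L2-normalizes its output embeddings (see `mytrans_modeling.py` below), each product above is a cosine similarity divided by a temperature of 0.07. A minimal sketch for inspecting the two scores, reusing the variables from the steps above:

```python
# Minimal sketch, reusing sim_O0vsO3 and sim_O0_inlinevsO3 from step 3.
# Each score is a cosine similarity scaled by 1/0.07.
print(f"O0        vs O3: {sim_O0vsO3.item():.4f}")
print(f"O0 inline vs O3: {sim_O0_inlinevsO3.item():.4f}")
# A higher second score indicates that inlining re-optimization moves the
# -O0 function's embedding closer to its -O3 counterpart.
```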
config.json
ADDED

@@ -0,0 +1,30 @@

```json
{
  "architectures": [
    "AsmEncoder"
  ],
  "auto_map": {
    "AutoModel": "optrans_modeling.AsmEncoder"
  },
  "attention_probs_dropout_prob": 0.1,
  "bla_dim": 1024,
  "embedding_size": 768,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 2048,
  "model_type": "roformer",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "rotary_value": false,
  "token_type_ids": "share",
  "torch_dtype": "bfloat16",
  "transformers_version": "4.35.0",
  "type_vocab_size": 2,
  "use_bias": false,
  "use_cache": true,
  "vocab_size": 33494
}
```
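The `auto_map` entry routes `AutoModel` to the custom `AsmEncoder` class, which is why the QuickStart passes `trust_remote_code=True`. As a minimal sketch (assuming standard `transformers` behavior: `"model_type": "roformer"` resolves to `RoFormerConfig`, and repo-specific keys such as `bla_dim` are kept as plain attributes), the configuration can be inspected without loading the weights:

```python
from transformers import AutoConfig

# Sketch: fetch and parse config.json only.
config = AutoConfig.from_pretrained("sandspeare/optrans", trust_remote_code=True)
print(config.hidden_size)  # 768, the transformer width
print(config.bla_dim)      # 1024, the dimension of the output embeddings
```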
mytrans_modeling.py
ADDED

@@ -0,0 +1,138 @@

```python
import numpy as np
import torch
import torch.utils.checkpoint
from torch import nn
from typing import Optional, Tuple
import torch.nn.functional as F
from transformers import BatchEncoding
from transformers import MPNetTokenizerFast


from transformers.models.roformer.modeling_roformer import (
    RoFormerEmbeddings,
    RoFormerModel,
    RoFormerEncoder,
    RoFormerLayer,
    RoFormerAttention,
    RoFormerIntermediate,
    RoFormerOutput,
    RoFormerSelfAttention,
    RoFormerPreTrainedModel,
)

from transformers.models.mpnet.modeling_mpnet import MPNetModel


class JRoFormerEmbeddings(RoFormerEmbeddings):
    """Construct the embeddings from word and token_type embeddings."""

    def __init__(self, config):
        super().__init__(config)
        self.word_embeddings = nn.Embedding(
            config.vocab_size, config.embedding_size, padding_idx=config.pad_token_id
        )
        # Token-type embeddings share weights with the word embeddings
        # (cf. "token_type_ids": "share" in config.json).
        self.token_type_embeddings = self.word_embeddings


class JRoFormerSelfAttention(RoFormerSelfAttention):
    def __init__(self, config):
        super().__init__(config)
        # Q/K/V projections with the bias controlled by config.use_bias.
        self.query = nn.Linear(
            config.hidden_size, self.all_head_size, bias=config.use_bias
        )
        self.key = nn.Linear(
            config.hidden_size, self.all_head_size, bias=config.use_bias
        )
        self.value = nn.Linear(
            config.hidden_size, self.all_head_size, bias=config.use_bias
        )


class JRoFormerAttention(RoFormerAttention):
    def __init__(self, config):
        super().__init__(config)
        self.self = JRoFormerSelfAttention(config)


class JRoFormerLayer(RoFormerLayer):
    def __init__(self, config):
        super().__init__(config)
        self.attention = JRoFormerAttention(config)
        self.is_decoder = config.is_decoder
        self.add_cross_attention = config.add_cross_attention
        if self.add_cross_attention:
            if not self.is_decoder:
                raise ValueError(
                    f"{self} should be used as a decoder model if cross attention is added"
                )
            self.crossattention = RoFormerAttention(config)
        self.intermediate = RoFormerIntermediate(config)
        self.output = RoFormerOutput(config)


class JRoFormerEncoder(RoFormerEncoder):
    def __init__(self, config):
        super().__init__(config)
        self.layer = nn.ModuleList(
            [JRoFormerLayer(config) for _ in range(config.num_hidden_layers)]
        )


class JRoFormerModel(RoFormerModel):
    def __init__(self, config):
        super().__init__(config)
        self.config = config
        self.embeddings = JRoFormerEmbeddings(config)

        if config.embedding_size != config.hidden_size:
            self.embeddings_project = nn.Linear(
                config.embedding_size, config.hidden_size
            )

        self.encoder = JRoFormerEncoder(config)

        # Initialize weights and apply final processing
        self.post_init()


class AsmEncoder(RoFormerPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.config = config
        self.roformer = JRoFormerModel(config)
        # Projects the pooled hidden state to the final embedding space
        # (config.bla_dim, 1024 in config.json).
        self.projection = nn.Linear(config.hidden_size, config.bla_dim)

    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        encoder_hidden_states: Optional[torch.FloatTensor] = None,
        encoder_attention_mask: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ):
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.roformer(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            encoder_hidden_states=encoder_hidden_states,
            encoder_attention_mask=encoder_attention_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        # Mean-pool the token embeddings over non-padding positions,
        # project to bla_dim, and L2-normalize.
        token_embeddings = outputs[0]
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).to(token_embeddings.dtype)
        asm_embedding = torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)
        asm_embedding = self.projection(asm_embedding)
        asm_embedding = F.normalize(asm_embedding, p=2, dim=1)

        return asm_embedding
```
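The forward pass therefore returns one unit-length vector per input function: token embeddings are mean-pooled over non-padding positions, projected to `bla_dim`, and L2-normalized. A minimal sketch, assuming the `tokenizer`, `encoder`, and `device` from the QuickStart (the assembly string is a hypothetical placeholder):

```python
import torch

# Hypothetical input; real inputs are disassembled functions.
inputs = tokenizer(["mov eax, 1 ; ret"], padding=True, return_tensors="pt").to(device)
with torch.no_grad():
    emb = encoder(**inputs)

print(emb.shape)        # torch.Size([1, 1024]): one bla_dim-sized vector per input
print(emb.norm(dim=1))  # ~1.0: L2-normalized, so dot products are cosine similarities
```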
pytorch_model.bin
ADDED

@@ -0,0 +1,3 @@

```
version https://git-lfs.github.com/spec/v1
oid sha256:4add8e8eb42e5e0a6e6864837fb0da7b6fe547a2b171b99046efd09478e65a05
size 223364289
```
special_tokens_map.json
ADDED

@@ -0,0 +1,23 @@

```json
{
  "bos_token": {
    "content": "<s>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  },
  "eos_token": {
    "content": "</s>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  },
  "unk_token": {
    "content": "<unk>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  }
}
```
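Note that the map defines `<s>`, `</s>`, and `<unk>` but no pad token, which is why the QuickStart assigns `tokenizer.pad_token = tokenizer.unk_token` before batched encoding. A quick check, assuming the tokenizer from the QuickStart:

```python
# After tokenizer.pad_token = tokenizer.unk_token, padding uses <unk>.
print(tokenizer.special_tokens_map)
# e.g. {'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'pad_token': '<unk>'}
```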
tokenizer.json
ADDED

The diff for this file is too large to render. See the raw diff.

tokenizer_config.json
ADDED

The diff for this file is too large to render. See the raw diff.