File size: 907 Bytes
b144cb7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
import torch
from transformers import AutoTokenizer, AutoModel

# HuggingFace model ID for UniXcoder, a code-understanding encoder.
MODEL_NAME = "microsoft/unixcoder-base"

# Load the tokenizer and encoder weights once at import time (downloads on
# first run, then served from the local HF cache).
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
unix_model = AutoModel.from_pretrained(MODEL_NAME)

# Prefer GPU when available; all inputs must be moved to this same device.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
unix_model.to(device)
# Inference only: disable dropout / put norm layers in eval mode.
unix_model.eval()


def get_unixcoder_embedding(code, max_length=512):
    """Return a combined UniXcoder embedding for *code*.

    The embedding is the concatenation of the [CLS] token vector and the
    attention-mask-weighted mean of all token vectors, flattened to 1-D.

    Args:
        code: Source-code string (or list of strings) to embed.
        max_length: Maximum token count before truncation (default 512,
            the model's context limit).

    Returns:
        numpy.ndarray: Flattened float array of length
        batch_size * 2 * hidden_size (2 * 768 for a single input).
    """
    inputs = tokenizer(
        code,
        padding=True,
        truncation=True,
        max_length=max_length,
        return_tensors="pt",
    )

    # Move every input tensor to the model's device.
    inputs = {k: v.to(device) for k, v in inputs.items()}

    with torch.no_grad():
        outputs = unix_model(**inputs)
        last_hidden = outputs.last_hidden_state  # (batch, seq_len, hidden)

        cls_embedding = last_hidden[:, 0, :]

        # Fix: a plain last_hidden.mean(dim=1) averages over *padding*
        # positions when the tokenizer pads a batch, diluting the embedding.
        # Use a masked mean so only real tokens contribute. Identical to the
        # old result for a single (unpadded) input.
        mask = inputs["attention_mask"].unsqueeze(-1).to(last_hidden.dtype)
        token_sum = (last_hidden * mask).sum(dim=1)
        token_count = mask.sum(dim=1).clamp(min=1)  # guard against div-by-zero
        mean_embedding = token_sum / token_count

        combined = torch.cat((cls_embedding, mean_embedding), dim=1)

    return combined.cpu().numpy().flatten()