File size: 5,070 Bytes
# Self-contained loader for the CVE exploitability model.
# Usage:
# from huggingface_hub import hf_hub_download
# import importlib.util, sys
# spec = importlib.util.spec_from_file_location("hf_model", hf_hub_download("sumitp76/cve-exploitability","hf_model.py"))
# m = importlib.util.module_from_spec(spec); sys.modules["hf_model"]=m; spec.loader.exec_module(m)
# pp, net = m.load_model("sumitp76/cve-exploitability")
import re, json
import numpy as np
import torch, torch.nn as nn
_WORD = re.compile(r"[A-Za-z0-9_.\-]+")
CAT_COLS = ["severity","AV","AC","PR","UI","S","C","I","A"]
NUM_COLS = ["base_score","v2_base_score","has_v3","year","desc_len"]
def tokenize(t):
    """Return the list of ``_WORD`` matches found in the lower-cased ``str(t)``.

    ``t`` is passed through ``str()`` first, so non-string inputs are
    tokenised by their string representation.
    """
    lowered = str(t).lower()
    return _WORD.findall(lowered)
def _isnan(x):
try: return isinstance(x, float) and np.isnan(x)
except Exception: return False
class Preprocessor:
    """Apply fitted lookup tables and statistics to raw records.

    The instance only *applies* state; nothing in this class fits it. State is
    normally restored with :meth:`from_dict`.

    Attributes are documented where they are assigned in ``__init__``.
    """

    def __init__(self, max_len=200, max_vocab=20000):
        """Create an empty preprocessor.

        Args:
            max_len: Number of token positions per description in
                :meth:`transform_text`.
            max_vocab: Stored on the instance; not read by any method of this
                class.
        """
        self.max_len = max_len
        self.max_vocab = max_vocab
        self.word2idx = {}   # token -> integer id
        self.cat_maps = {}   # column name -> {category value -> one-hot slot}
        self.cwe_vocab = {}  # cwe value -> one-hot slot
        self.num_mean = {}   # numeric column name -> mean
        self.num_std = {}    # numeric column name -> standard deviation

    def transform_text(self, descriptions):
        """Encode descriptions as a ``(len(descriptions), max_len)`` int64 array.

        Each description is tokenised with ``tokenize`` and truncated to
        ``max_len`` tokens. Tokens missing from ``word2idx`` get id 1, and
        positions past the last token stay 0.
        """
        X = np.zeros((len(descriptions), self.max_len), dtype=np.int64)
        for i, t in enumerate(descriptions):
            for j, w in enumerate(tokenize(t)[:self.max_len]):
                X[i, j] = self.word2idx.get(w, 1)
        return X

    def transform_struct(self, df):
        """Build the structured feature matrix for ``df``.

        The result is a float32 array with ``len(df)`` rows whose columns are,
        in order: the standardised ``NUM_COLS``, one one-hot block per entry
        of ``CAT_COLS``, and one one-hot block for ``cwe``.

        Args:
            df: Object exposing ``len()``, ``.columns`` and ``df[name].values``
                (presumably a pandas DataFrame -- confirm against callers).
                Columns that are absent fall back to a per-column default.

        Returns:
            ``np.ndarray`` with ``struct_dim`` columns.
        """
        n = len(df)

        def col(name, default):
            # Use the column when present, otherwise a constant column.
            return df[name].values if name in df.columns else np.array([default] * n)

        num = np.zeros((n, len(NUM_COLS)), dtype=np.float32)
        # desc_len is derived here: character length of str(description).
        desc_len = np.array([len(str(x)) for x in col("description", "")], dtype=float)
        srcs = {
            "base_score": col("base_score", np.nan).astype(float),
            "v2_base_score": col("v2_base_score", np.nan).astype(float),
            "has_v3": col("has_v3", 0).astype(float),
            "year": col("year", 2020).astype(float),
            "desc_len": desc_len,
        }
        for k, name in enumerate(NUM_COLS):
            mean = self.num_mean[name]
            std = self.num_std[name]
            if std == 0:
                # A zero standard deviation would turn the whole column into
                # inf/NaN; scale by 1 instead so the feature is just centred.
                std = 1.0
            v = srcs[name]
            # Missing values are imputed with the mean, i.e. become 0 after
            # standardisation.
            v = np.where(np.isnan(v), mean, v)
            num[:, k] = (v - mean) / std

        blocks = [num]
        for c in CAT_COLS:
            m = self.cat_maps[c]
            # Two extra slots: 0 = missing (None/NaN), 1 = value not in map.
            # Assumes the stored map assigns slots starting at 2 -- TODO confirm.
            width = len(m) + 2
            b = np.zeros((n, width), dtype=np.float32)
            for i, val in enumerate(col(c, None)):
                idx = m.get(val, 1) if (val is not None and not _isnan(val)) else 0
                b[i, idx] = 1.0
            blocks.append(b)

        # One extra slot for cwe: values absent from cwe_vocab land in slot 0.
        cwe_w = len(self.cwe_vocab) + 1
        cb = np.zeros((n, cwe_w), dtype=np.float32)
        for i, val in enumerate(col("cwe", "UNKNOWN")):
            cb[i, self.cwe_vocab.get(val, 0)] = 1.0
        blocks.append(cb)
        return np.concatenate(blocks, axis=1)

    @property
    def struct_dim(self):
        """Number of columns produced by :meth:`transform_struct`."""
        return (
            len(NUM_COLS)
            + sum(len(self.cat_maps[c]) + 2 for c in CAT_COLS)
            + len(self.cwe_vocab)
            + 1
        )

    @property
    def vocab_size(self):
        """Number of entries in ``word2idx``."""
        return len(self.word2idx)

    @classmethod
    def from_dict(cls, d):
        """Rebuild a preprocessor from a mapping.

        ``d`` must provide the keys ``max_len``, ``max_vocab``, ``word2idx``,
        ``cat_maps``, ``cwe_vocab``, ``num_mean`` and ``num_std``. The values
        are stored by reference, not copied.

        Raises:
            KeyError: If any of those keys is missing.
        """
        p = cls(d["max_len"], d["max_vocab"])
        p.word2idx = d["word2idx"]
        p.cat_maps = d["cat_maps"]
        p.cwe_vocab = d["cwe_vocab"]
        p.num_mean = d["num_mean"]
        p.num_std = d["num_std"]
        return p
class TextCNN(nn.Module):
    """Embed token ids, run parallel 1-D convolutions, max-pool and project.

    One ``Conv1d`` branch is built per entry of ``kernels``. Each branch output
    goes through ReLU and a max over the last dimension, the pooled branches
    are concatenated, and dropout plus a linear layer map the result to
    ``out_dim`` features.
    """

    def __init__(self, vocab_size, emb_dim=128, kernels=(2, 3, 4, 5), n_filters=96, dropout=0.3, out_dim=192):
        super().__init__()
        # Id 0 is the padding id for the embedding table.
        self.emb = nn.Embedding(vocab_size, emb_dim, padding_idx=0)
        branches = [
            nn.Conv1d(emb_dim, n_filters, width, padding=width // 2)
            for width in kernels
        ]
        self.convs = nn.ModuleList(branches)
        self.act = nn.ReLU()
        self.drop = nn.Dropout(dropout)
        self.proj = nn.Linear(n_filters * len(kernels), out_dim)
        self.out_dim = out_dim

    def forward(self, x):
        """Map token ids ``x`` to a tensor with ``out_dim`` features per row.

        Assumes ``x`` is an integer tensor shaped (batch, sequence) -- confirm
        against callers.
        """
        # Swap the last two axes so the embedding dimension becomes the
        # channel axis expected by Conv1d.
        embedded = self.emb(x).transpose(1, 2)
        pooled = []
        for conv in self.convs:
            activated = self.act(conv(embedded))
            pooled.append(activated.max(dim=2).values)
        merged = torch.cat(pooled, dim=1)
        return self.proj(self.drop(merged))
class StructuredEncoder(nn.Module):
    """Two-layer perceptron for the structured feature vector.

    Layers, in order: Linear(in_dim, hidden), ReLU, Dropout,
    Linear(hidden, out_dim), ReLU.
    """

    def __init__(self, in_dim, hidden=128, out_dim=96, dropout=0.3):
        super().__init__()
        layers = [
            nn.Linear(in_dim, hidden),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden, out_dim),
            nn.ReLU(),
        ]
        self.net = nn.Sequential(*layers)
        self.out_dim = out_dim

    def forward(self, x):
        """Return ``self.net(x)``; the final ReLU makes the output non-negative."""
        encoded = self.net(x)
        return encoded
class ExploitabilityNet(nn.Module):
    """Combine a text branch and a structured branch into one scalar per row.

    ``TextCNN`` encodes the token ids and ``StructuredEncoder`` encodes the
    structured features. Their outputs are concatenated and passed through a
    small head ending in ``Linear(128, 1)``.
    """

    def __init__(self, vocab_size, struct_dim, dropout=0.3):
        super().__init__()
        self.text = TextCNN(vocab_size, dropout=dropout)
        self.struct = StructuredEncoder(struct_dim, dropout=dropout)
        fused_dim = self.text.out_dim + self.struct.out_dim
        self.head = nn.Sequential(
            nn.Linear(fused_dim, 128),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(128, 1),
        )

    def forward(self, text_ids, struct):
        """Return one value per row, with the trailing size-1 axis squeezed away.

        No activation follows the last linear layer, so the value is
        presumably a raw logit -- confirm against the training loss.
        """
        text_feat = self.text(text_ids)
        struct_feat = self.struct(struct)
        fused = torch.cat([text_feat, struct_feat], dim=1)
        return self.head(fused).squeeze(-1)
def load_model(repo_id, device="cpu"):
    """Download the preprocessor and weights from the Hub and build the model.

    Fetches ``preprocessor.json`` and ``model.pt`` from ``repo_id`` with
    ``hf_hub_download``, rebuilds the ``Preprocessor``, sizes an
    ``ExploitabilityNet`` from it, loads the weights and switches the network
    to eval mode.

    Args:
        repo_id: Hugging Face Hub repository id holding both files.
        device: Device the network is moved to and the checkpoint is mapped to.

    Returns:
        ``(pp, net)``: the ``Preprocessor`` and the ``ExploitabilityNet`` in
        eval mode.
    """
    # Imported lazily so the module can be imported without huggingface_hub.
    from huggingface_hub import hf_hub_download

    # Context manager closes the file handle; JSON is read as UTF-8 rather
    # than the platform's locale encoding.
    with open(hf_hub_download(repo_id, "preprocessor.json"), encoding="utf-8") as fh:
        pp = Preprocessor.from_dict(json.load(fh))

    net = ExploitabilityNet(pp.vocab_size, pp.struct_dim).to(device)

    # SECURITY NOTE(review): torch.load unpickles the downloaded file. Only
    # use this with repositories you trust, and consider passing
    # weights_only=True if the installed PyTorch supports it.
    state = torch.load(hf_hub_download(repo_id, "model.pt"), map_location=device)
    net.load_state_dict(state)
    net.eval()
    return pp, net
|