| |
| |
| |
| |
| |
| |
| |
| import re, json |
| import numpy as np |
| import torch, torch.nn as nn |
|
|
# Token pattern: maximal runs of ASCII letters, digits, underscore, dot or
# hyphen (keeps things like version strings and identifiers in one piece).
_WORD = re.compile(r"[A-Za-z0-9_.\-]+")

# Categorical columns; transform_struct emits one one-hot block per entry,
# in exactly this order.
CAT_COLS = [
    "severity",
    "AV", "AC", "PR", "UI",
    "S", "C", "I", "A",
]

# Numeric columns; they occupy the leading feature slots in this order.
NUM_COLS = [
    "base_score",
    "v2_base_score",
    "has_v3",
    "year",
    "desc_len",
]
|
|
def tokenize(t):
    """Return the list of ``_WORD`` matches found in the lowercased ``str(t)``.

    The argument is passed through ``str`` first, so non-string inputs are
    tokenized by their string representation rather than rejected.
    """
    lowered = str(t).lower()
    return _WORD.findall(lowered)
def _isnan(x):
    """Tell whether ``x`` is a ``float`` instance holding NaN.

    Anything that is not a ``float`` instance (strings, ``None``, ints, ...)
    yields ``False``; any exception raised while checking also yields
    ``False``.
    """
    try:
        if not isinstance(x, float):
            return False
        return np.isnan(x)
    except Exception:
        return False
|
|
class Preprocessor:
    """Convert raw records into the two arrays the model consumes.

    The instance only holds lookup state (token vocabulary, per-column
    category maps, CWE vocabulary, numeric mean/std). Nothing in this class
    fits that state; it is expected to be filled in from outside, for
    example through ``from_dict``.
    """

    def __init__(self, max_len=200, max_vocab=20000):
        """Create an empty preprocessor with the given size limits."""
        self.max_len = max_len
        self.max_vocab = max_vocab
        self.word2idx = {}
        self.cat_maps = {}
        self.cwe_vocab = {}
        self.num_mean = {}
        self.num_std = {}

    def transform_text(self, descriptions):
        """Encode each description as a fixed-width row of token ids.

        Returns an int64 array of shape ``(len(descriptions), max_len)``.
        Tokens beyond ``max_len`` are dropped, tokens missing from
        ``word2idx`` become id 1, and unused trailing positions stay 0.
        """
        ids = np.zeros((len(descriptions), self.max_len), dtype=np.int64)
        lookup = self.word2idx.get
        for row, text in enumerate(descriptions):
            tokens = tokenize(text)[:self.max_len]
            if tokens:
                ids[row, :len(tokens)] = [lookup(tok, 1) for tok in tokens]
        return ids

    def transform_struct(self, df):
        """Build the float32 structured-feature matrix for ``df``.

        Layout along axis 1: the standardized ``NUM_COLS`` values, then one
        one-hot block per ``CAT_COLS`` entry, then a one-hot CWE block.
        Columns absent from ``df`` are replaced by a constant default.
        """
        n_rows = len(df)

        def column_or(name, fallback):
            # Use the frame's column when present, else a constant column.
            if name in df.columns:
                return df[name].values
            return np.array([fallback] * n_rows)

        numeric = np.zeros((n_rows, len(NUM_COLS)), dtype=np.float32)
        lengths = [len(str(item)) for item in column_or("description", "")]
        raw = {
            "base_score": column_or("base_score", np.nan).astype(float),
            "v2_base_score": column_or("v2_base_score", np.nan).astype(float),
            "has_v3": column_or("has_v3", 0).astype(float),
            "year": column_or("year", 2020).astype(float),
            "desc_len": np.array(lengths, dtype=float),
        }
        for slot, name in enumerate(NUM_COLS):
            mean = self.num_mean[name]
            values = raw[name]
            # Missing values are imputed with the mean, so they standardize to 0.
            filled = np.where(np.isnan(values), mean, values)
            numeric[:, slot] = (filled - mean) / self.num_std[name]

        parts = [numeric]
        for cat in CAT_COLS:
            mapping = self.cat_maps[cat]
            # Two extra slots: 0 for missing values, 1 for values not in the map.
            # Presumably the mapped indices start at 2 — confirm against the
            # code that builds ``cat_maps``.
            onehot = np.zeros((n_rows, len(mapping) + 2), dtype=np.float32)
            for row, value in enumerate(column_or(cat, None)):
                if value is None or _isnan(value):
                    hot = 0
                else:
                    hot = mapping.get(value, 1)
                onehot[row, hot] = 1.0
            parts.append(onehot)

        # One extra slot: index 0 receives every CWE value not in the vocabulary.
        cwe_onehot = np.zeros((n_rows, len(self.cwe_vocab) + 1), dtype=np.float32)
        for row, value in enumerate(column_or("cwe", "UNKNOWN")):
            cwe_onehot[row, self.cwe_vocab.get(value, 0)] = 1.0
        parts.append(cwe_onehot)

        return np.concatenate(parts, axis=1)

    @property
    def struct_dim(self):
        """Width of the matrix produced by ``transform_struct``."""
        numeric_width = len(NUM_COLS)
        categorical_width = sum(len(self.cat_maps[cat]) + 2 for cat in CAT_COLS)
        cwe_width = len(self.cwe_vocab) + 1
        return numeric_width + categorical_width + cwe_width

    @property
    def vocab_size(self):
        """Number of entries in ``word2idx``."""
        return len(self.word2idx)

    @classmethod
    def from_dict(cls, d):
        """Rebuild a preprocessor from a plain dict of its saved state.

        ``d`` must provide ``max_len``, ``max_vocab``, ``word2idx``,
        ``cat_maps``, ``cwe_vocab``, ``num_mean`` and ``num_std``; the
        values are stored by reference, not copied.
        """
        restored = cls(d["max_len"], d["max_vocab"])
        for field in ("word2idx", "cat_maps", "cwe_vocab", "num_mean", "num_std"):
            setattr(restored, field, d[field])
        return restored
|
|
class TextCNN(nn.Module):
    """Text encoder: embedding, parallel 1-D convolutions, max pooling, projection.

    One ``Conv1d`` branch is built per entry of ``kernels``. Each branch's
    ReLU output is max-pooled over the sequence axis, the pooled vectors
    are concatenated, passed through dropout, and projected to ``out_dim``.
    Token id 0 is the embedding's padding index.
    """

    def __init__(self, vocab_size, emb_dim=128, kernels=(2,3,4,5), n_filters=96, dropout=0.3, out_dim=192):
        super().__init__()
        self.emb = nn.Embedding(vocab_size, emb_dim, padding_idx=0)
        branches = []
        for width in kernels:
            # padding=width//2 lets inputs shorter than the kernel still
            # produce at least one output position.
            branches.append(nn.Conv1d(emb_dim, n_filters, width, padding=width // 2))
        self.convs = nn.ModuleList(branches)
        self.act = nn.ReLU()
        self.drop = nn.Dropout(dropout)
        self.proj = nn.Linear(n_filters * len(kernels), out_dim)
        self.out_dim = out_dim

    def forward(self, x):
        """Map integer token ids ``x`` of shape (batch, seq) to (batch, out_dim)."""
        # Conv1d expects channels before length, so swap the last two axes.
        embedded = self.emb(x).transpose(1, 2)
        pooled = []
        for conv in self.convs:
            activated = self.act(conv(embedded))
            peak, _ = activated.max(dim=2)
            pooled.append(peak)
        merged = torch.cat(pooled, dim=1)
        return self.proj(self.drop(merged))
|
|
class StructuredEncoder(nn.Module):
    """Two-layer MLP over the structured feature vector.

    Linear -> ReLU -> Dropout -> Linear -> ReLU, mapping ``in_dim`` inputs
    to ``out_dim`` non-negative outputs.
    """

    def __init__(self, in_dim, hidden=128, out_dim=96, dropout=0.3):
        super().__init__()
        layers = [
            nn.Linear(in_dim, hidden),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden, out_dim),
            nn.ReLU(),
        ]
        self.net = nn.Sequential(*layers)
        self.out_dim = out_dim

    def forward(self, x):
        """Apply the MLP to ``x`` and return the result."""
        return self.net(x)
|
|
class ExploitabilityNet(nn.Module):
    """Two-branch model that fuses a text encoding with a structured encoding.

    The ``TextCNN`` and ``StructuredEncoder`` outputs are concatenated and
    fed to a small MLP head ending in a single linear unit. No activation
    is applied after that unit, so the output is an unnormalised score.
    """

    def __init__(self, vocab_size, struct_dim, dropout=0.3):
        super().__init__()
        self.text = TextCNN(vocab_size, dropout=dropout)
        self.struct = StructuredEncoder(struct_dim, dropout=dropout)
        fused_dim = self.text.out_dim + self.struct.out_dim
        self.head = nn.Sequential(
            nn.Linear(fused_dim, 128),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(128, 1),
        )

    def forward(self, text_ids, struct):
        """Return one score per row; the trailing size-1 axis is squeezed away."""
        text_vec = self.text(text_ids)
        struct_vec = self.struct(struct)
        fused = torch.cat([text_vec, struct_vec], dim=1)
        scores = self.head(fused)
        return scores.squeeze(-1)
|
|
def load_model(repo_id, device="cpu"):
    """Download and rebuild the preprocessor and network from a Hugging Face repo.

    Args:
        repo_id: Hub repository that holds ``preprocessor.json`` and
            ``model.pt``.
        device: Device the network is moved to and onto which the
            checkpoint tensors are mapped.

    Returns:
        A ``(preprocessor, network)`` tuple; the network is in eval mode.
    """
    from huggingface_hub import hf_hub_download

    # Open through a context manager so the handle is closed deterministically
    # (the bare ``json.load(open(...))`` form leaked it), and decode as UTF-8
    # explicitly so the result does not depend on the platform locale.
    config_path = hf_hub_download(repo_id, "preprocessor.json")
    with open(config_path, encoding="utf-8") as fh:
        pp = Preprocessor.from_dict(json.load(fh))

    net = ExploitabilityNet(pp.vocab_size, pp.struct_dim).to(device)

    # NOTE(security): torch.load unpickles the checkpoint. On PyTorch versions
    # where ``weights_only`` does not default to True, a malicious ``model.pt``
    # can execute arbitrary code; only load from repositories you trust, or
    # pass ``weights_only=True`` once the minimum supported PyTorch allows it.
    weights_path = hf_hub_download(repo_id, "model.pt")
    state = torch.load(weights_path, map_location=device)
    net.load_state_dict(state)
    net.eval()
    return pp, net
|
|