# Self-contained loader for the CVE exploitability model.
# Usage:
#   from huggingface_hub import hf_hub_download
#   import importlib.util, sys
#   spec = importlib.util.spec_from_file_location("hf_model", hf_hub_download("sumitp76/cve-exploitability","hf_model.py"))
#   m = importlib.util.module_from_spec(spec); sys.modules["hf_model"]=m; spec.loader.exec_module(m)
#   pp, net = m.load_model("sumitp76/cve-exploitability")
import json
import re

import numpy as np
import torch
import torch.nn as nn

# A token is a run of letters, digits, underscore, dot or hyphen.
_WORD = re.compile(r"[A-Za-z0-9_.\-]+")

# Categorical columns, one-hot encoded in this order by transform_struct.
CAT_COLS = ["severity", "AV", "AC", "PR", "UI", "S", "C", "I", "A"]
# Numeric columns, standardised in this order by transform_struct.
NUM_COLS = ["base_score", "v2_base_score", "has_v3", "year", "desc_len"]


def tokenize(t):
    """Return the lower-cased tokens of ``str(t)`` matched by ``_WORD``."""
    return _WORD.findall(str(t).lower())


def _isnan(x):
    """Return True only when *x* is a ``float`` (or subclass) holding NaN."""
    # The isinstance guard means np.isnan is only ever handed a float, so no
    # exception handling is needed around it.
    return isinstance(x, float) and bool(np.isnan(x))


class Preprocessor:
    """Turn raw CVE records into the two arrays the network consumes.

    State (normally filled in by :meth:`from_dict`):

    * ``word2idx``  - token -> embedding index. Index 0 is used for padding
      and index 1 for tokens missing from the mapping.
    * ``cat_maps``  - per-column mapping ``value -> one-hot slot`` for each
      name in ``CAT_COLS``.
    * ``cwe_vocab`` - mapping ``cwe value -> one-hot slot``.
    * ``num_mean`` / ``num_std`` - per-column statistics for ``NUM_COLS``.

    ``max_vocab`` is stored but not used by the transforms in this file.
    """

    def __init__(self, max_len=200, max_vocab=20000):
        self.max_len = max_len
        self.max_vocab = max_vocab
        self.word2idx = {}
        self.cat_maps = {}
        self.cwe_vocab = {}
        self.num_mean = {}
        self.num_std = {}

    def transform_text(self, descriptions):
        """Encode descriptions as an ``(n, max_len)`` int64 array of token ids.

        Each text is tokenised, truncated to ``max_len`` tokens and
        right-padded with 0. Tokens absent from ``word2idx`` map to 1.
        """
        ids = np.zeros((len(descriptions), self.max_len), dtype=np.int64)
        for row, text in enumerate(descriptions):
            for pos, word in enumerate(tokenize(text)[:self.max_len]):
                ids[row, pos] = self.word2idx.get(word, 1)
        return ids

    def transform_struct(self, df):
        """Build the ``(n, struct_dim)`` float32 structured-feature matrix.

        *df* is indexed like a pandas DataFrame (``df.columns``,
        ``df[name].values``). Any column that is absent is replaced by a
        constant default. The output is the concatenation of:

        1. ``NUM_COLS`` standardised with ``num_mean`` / ``num_std``
           (NaN is imputed with the column mean first);
        2. one one-hot block per ``CAT_COLS`` entry, of width
           ``len(cat_maps[c]) + 2``: slot 0 = missing value (None/NaN),
           slot 1 = value not present in the mapping;
        3. one one-hot block for ``cwe`` of width ``len(cwe_vocab) + 1``:
           slot 0 = value not present in the vocabulary.
        """
        n = len(df)

        def col(name, default):
            # Fall back to a constant column when the frame lacks `name`.
            if name in df.columns:
                return df[name].values
            return np.array([default] * n)

        # desc_len is derived from the description text, not read from df.
        desc_len = np.array(
            [len(str(x)) for x in col("description", "")], dtype=float
        )
        sources = {
            "base_score": col("base_score", np.nan).astype(float),
            "v2_base_score": col("v2_base_score", np.nan).astype(float),
            "has_v3": col("has_v3", 0).astype(float),
            "year": col("year", 2020).astype(float),
            "desc_len": desc_len,
        }

        num = np.zeros((n, len(NUM_COLS)), dtype=np.float32)
        for k, name in enumerate(NUM_COLS):
            mean = self.num_mean[name]
            # A zero std (constant column at fit time) would otherwise yield
            # NaN/inf features that poison the forward pass; scale by 1.
            std = self.num_std[name]
            if not std:
                std = 1.0
            values = sources[name]
            values = np.where(np.isnan(values), mean, values)
            num[:, k] = (values - mean) / std

        blocks = [num]
        for c in CAT_COLS:
            mapping = self.cat_maps[c]
            # NOTE(review): mapped slots are presumably >= 2 so they do not
            # collide with the reserved slots 0/1 - confirm against training.
            onehot = np.zeros((n, len(mapping) + 2), dtype=np.float32)
            for i, val in enumerate(col(c, None)):
                if val is None or _isnan(val):
                    idx = 0
                else:
                    idx = mapping.get(val, 1)
                onehot[i, idx] = 1.0
            blocks.append(onehot)

        cwe_block = np.zeros((n, len(self.cwe_vocab) + 1), dtype=np.float32)
        for i, val in enumerate(col("cwe", "UNKNOWN")):
            cwe_block[i, self.cwe_vocab.get(val, 0)] = 1.0
        blocks.append(cwe_block)

        return np.concatenate(blocks, axis=1)

    @property
    def struct_dim(self):
        """Number of columns produced by :meth:`transform_struct`."""
        cat_width = sum(len(self.cat_maps[c]) + 2 for c in CAT_COLS)
        return len(NUM_COLS) + cat_width + len(self.cwe_vocab) + 1

    @property
    def vocab_size(self):
        """Number of entries in ``word2idx``."""
        return len(self.word2idx)

    @classmethod
    def from_dict(cls, d):
        """Rebuild a preprocessor from a dict.

        Required keys: ``max_len``, ``max_vocab``, ``word2idx``,
        ``cat_maps``, ``cwe_vocab``, ``num_mean``, ``num_std``.
        """
        p = cls(d["max_len"], d["max_vocab"])
        p.word2idx = d["word2idx"]
        p.cat_maps = d["cat_maps"]
        p.cwe_vocab = d["cwe_vocab"]
        p.num_mean = d["num_mean"]
        p.num_std = d["num_std"]
        return p


class TextCNN(nn.Module):
    """Text encoder: embedding, parallel 1-D convolutions, max-over-time.

    One ``Conv1d`` per kernel size is applied to the embedded sequence; each
    result is passed through ReLU and max-pooled over the sequence axis. The
    pooled features are concatenated, passed through dropout and projected
    to ``out_dim``.
    """

    def __init__(self, vocab_size, emb_dim=128, kernels=(2, 3, 4, 5),
                 n_filters=96, dropout=0.3, out_dim=192):
        super().__init__()
        self.emb = nn.Embedding(vocab_size, emb_dim, padding_idx=0)
        self.convs = nn.ModuleList(
            nn.Conv1d(emb_dim, n_filters, k, padding=k // 2) for k in kernels
        )
        self.act = nn.ReLU()
        self.drop = nn.Dropout(dropout)
        self.proj = nn.Linear(n_filters * len(kernels), out_dim)
        self.out_dim = out_dim

    def forward(self, x):
        """Map integer token ids ``(batch, seq)`` to ``(batch, out_dim)``."""
        # Conv1d expects channels first: (batch, emb_dim, seq).
        e = self.emb(x).transpose(1, 2)
        feats = [self.act(conv(e)).max(dim=2).values for conv in self.convs]
        return self.proj(self.drop(torch.cat(feats, dim=1)))


class StructuredEncoder(nn.Module):
    """MLP over structured features: Linear-ReLU-Dropout-Linear-ReLU."""

    def __init__(self, in_dim, hidden=128, out_dim=96, dropout=0.3):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(in_dim, hidden),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden, out_dim),
            nn.ReLU(),
        )
        self.out_dim = out_dim

    def forward(self, x):
        """Map ``(batch, in_dim)`` features to ``(batch, out_dim)``."""
        return self.net(x)


class ExploitabilityNet(nn.Module):
    """Two-branch model: text CNN + structured MLP feeding a shared head.

    The head ends in ``Linear(128, 1)`` with no activation, so ``forward``
    returns one unnormalised score per example.
    """

    def __init__(self, vocab_size, struct_dim, dropout=0.3):
        super().__init__()
        self.text = TextCNN(vocab_size, dropout=dropout)
        self.struct = StructuredEncoder(struct_dim, dropout=dropout)
        self.head = nn.Sequential(
            nn.Linear(self.text.out_dim + self.struct.out_dim, 128),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(128, 1),
        )

    def forward(self, text_ids, struct):
        """Return a ``(batch,)`` tensor of scores for the two inputs."""
        fused = torch.cat([self.text(text_ids), self.struct(struct)], dim=1)
        return self.head(fused).squeeze(-1)


def load_model(repo_id, device="cpu"):
    """Download and build the preprocessor and network from a Hub repo.

    Fetches ``preprocessor.json`` and ``model.pt`` from *repo_id* via
    ``huggingface_hub.hf_hub_download``, loads the weights onto *device* and
    switches the network to eval mode.

    Returns:
        ``(preprocessor, net)``.
    """
    from huggingface_hub import hf_hub_download

    # Close the handle deterministically and read JSON as UTF-8 regardless of
    # the platform's default locale encoding.
    pp_path = hf_hub_download(repo_id, "preprocessor.json")
    with open(pp_path, encoding="utf-8") as fh:
        pp = Preprocessor.from_dict(json.load(fh))

    net = ExploitabilityNet(pp.vocab_size, pp.struct_dim).to(device)
    # NOTE(security): torch.load unpickles the checkpoint. Only use repos you
    # trust; on torch versions that support it, consider passing
    # weights_only=True to restrict what can be deserialised.
    state = torch.load(hf_hub_download(repo_id, "model.pt"), map_location=device)
    net.load_state_dict(state)
    net.eval()
    return pp, net