CVE exploitability model: weights + preprocessor + loader + card

929e325 verified 15 days ago

5.07 kB

	# Self-contained loader for the CVE exploitability model.
	# Usage:
	# from huggingface_hub import hf_hub_download
	# import importlib.util, sys
	# spec = importlib.util.spec_from_file_location("hf_model", hf_hub_download("sumitp76/cve-exploitability","hf_model.py"))
	# m = importlib.util.module_from_spec(spec); sys.modules["hf_model"]=m; spec.loader.exec_module(m)
	# pp, net = m.load_model("sumitp76/cve-exploitability")
	import re, json
	import numpy as np
	import torch, torch.nn as nn

	# Token pattern: maximal runs of ASCII letters, digits, "_", "." and "-".
	_WORD = re.compile(r"[A-Za-z0-9_.\-]+")

	# Categorical input columns, in the order their one-hot blocks are emitted.
	CAT_COLS = [
	    "severity",
	    "AV",
	    "AC",
	    "PR",
	    "UI",
	    "S",
	    "C",
	    "I",
	    "A",
	]

	# Numeric input columns, in the order they appear in the feature matrix.
	NUM_COLS = [
	    "base_score",
	    "v2_base_score",
	    "has_v3",
	    "year",
	    "desc_len",
	]

	def tokenize(t):
	    """Return the lower-cased tokens of ``t``.

	    ``t`` is converted with ``str()`` first, so non-string inputs are
	    tokenized by their string form. Tokens are the successive ``_WORD``
	    matches, in order of appearance.
	    """
	    lowered = str(t).lower()
	    return _WORD.findall(lowered)
	def _isnan(x):
	    """Tell whether ``x`` is a ``float`` holding NaN.

	    Anything that is not a ``float`` instance (strings, ``None``, ints, ...)
	    yields ``False``; any error raised by the NaN check is treated as
	    "not NaN" as well.
	    """
	    if not isinstance(x, float):
	        return False
	    try:
	        return np.isnan(x)
	    except Exception:
	        return False

	class Preprocessor:
	    """Apply stored lookup tables and statistics to raw CVE records.

	    The class only *applies* state; nothing here fits it. The tables are
	    expected to be filled in by ``from_dict``:

	    * ``word2idx``  - token -> integer id used by ``transform_text``
	    * ``cat_maps``  - per categorical column, value -> one-hot slot
	    * ``cwe_vocab`` - CWE identifier -> one-hot slot
	    * ``num_mean`` / ``num_std`` - per numeric column standardisation stats
	    """

	    def __init__(self, max_len=200, max_vocab=20000):
	        """Create an empty preprocessor.

	        Args:
	            max_len: number of token positions kept per description.
	            max_vocab: stored as-is; not read by any method of this class.
	        """
	        self.max_len = max_len
	        self.max_vocab = max_vocab
	        self.word2idx = {}
	        self.cat_maps = {}
	        self.cwe_vocab = {}
	        self.num_mean = {}
	        self.num_std = {}

	    def transform_text(self, descriptions):
	        """Encode descriptions as a fixed-width matrix of token ids.

	        Args:
	            descriptions: sized iterable of texts (anything ``str()`` accepts).

	        Returns:
	            ``int64`` array of shape ``(len(descriptions), max_len)``. Tokens
	            beyond ``max_len`` are dropped, tokens missing from ``word2idx``
	            become id 1, and unused trailing positions stay 0.
	        """
	        ids = np.zeros((len(descriptions), self.max_len), dtype=np.int64)
	        for row, text in enumerate(descriptions):
	            for pos, word in enumerate(tokenize(text)[: self.max_len]):
	                ids[row, pos] = self.word2idx.get(word, 1)
	        return ids

	    def transform_struct(self, df):
	        """Build the structured feature matrix for a frame of CVE records.

	        Columns of the result, left to right:

	        1. one standardised value per name in ``NUM_COLS`` (NaN is replaced
	           by the stored mean before standardising);
	        2. for each name in ``CAT_COLS`` a one-hot block of width
	           ``len(cat_maps[name]) + 2``: slot 0 marks a missing value
	           (``None``/NaN), slot 1 a value absent from the map;
	        3. a CWE one-hot block of width ``len(cwe_vocab) + 1`` where slot 0
	           marks a CWE absent from the vocabulary.

	        Columns missing from ``df`` fall back to: NaN for both scores, 0 for
	        ``has_v3``, 2020 for ``year``, ``""`` for ``description``, ``None``
	        for every categorical column and ``"UNKNOWN"`` for ``cwe``.

	        Args:
	            df: object exposing ``len()``, ``.columns`` and ``df[name].values``
	                (presumably a pandas DataFrame - confirm against callers).

	        Returns:
	            ``float32`` array of shape ``(len(df), struct_dim)``.

	        Raises:
	            KeyError: if a required entry is missing from ``num_mean``,
	                ``num_std`` or ``cat_maps``.
	        """
	        n = len(df)

	        def col(name, default):
	            # Use the frame's column when present, else a constant column.
	            if name in df.columns:
	                return df[name].values
	            return np.array([default] * n)

	        sources = {
	            "base_score": col("base_score", np.nan).astype(float),
	            "v2_base_score": col("v2_base_score", np.nan).astype(float),
	            "has_v3": col("has_v3", 0).astype(float),
	            "year": col("year", 2020).astype(float),
	            # Length of the string form, so a missing description counts the
	            # characters of whatever str() yields for it.
	            "desc_len": np.array(
	                [len(str(x)) for x in col("description", "")], dtype=float
	            ),
	        }

	        num = np.zeros((n, len(NUM_COLS)), dtype=np.float32)
	        for k, name in enumerate(NUM_COLS):
	            mean = self.num_mean[name]
	            std = self.num_std[name]
	            values = sources[name]
	            values = np.where(np.isnan(values), mean, values)
	            # A zero std in the stored statistics would turn the whole column
	            # into inf/NaN and poison the network input; fall back to 1.
	            num[:, k] = (values - mean) / (std if std else 1.0)

	        blocks = [num]
	        for c in CAT_COLS:
	            mapping = self.cat_maps[c]
	            onehot = np.zeros((n, len(mapping) + 2), dtype=np.float32)
	            for i, val in enumerate(col(c, None)):
	                missing = val is None or _isnan(val)
	                # NOTE(review): mapped slots presumably start at 2 so they do
	                # not collide with the missing/unknown slots - confirm.
	                idx = 0 if missing else mapping.get(val, 1)
	                onehot[i, idx] = 1.0
	            blocks.append(onehot)

	        cwe = np.zeros((n, len(self.cwe_vocab) + 1), dtype=np.float32)
	        for i, val in enumerate(col("cwe", "UNKNOWN")):
	            cwe[i, self.cwe_vocab.get(val, 0)] = 1.0
	        blocks.append(cwe)

	        return np.concatenate(blocks, axis=1)

	    @property
	    def struct_dim(self):
	        """Number of columns produced by ``transform_struct``."""
	        cat_width = sum(len(self.cat_maps[c]) + 2 for c in CAT_COLS)
	        return len(NUM_COLS) + cat_width + len(self.cwe_vocab) + 1

	    @property
	    def vocab_size(self):
	        """Number of entries in ``word2idx``."""
	        return len(self.word2idx)

	    @classmethod
	    def from_dict(cls, d):
	        """Rebuild a preprocessor from its serialised dictionary form.

	        Args:
	            d: mapping with keys ``max_len``, ``max_vocab``, ``word2idx``,
	               ``cat_maps``, ``cwe_vocab``, ``num_mean`` and ``num_std``.

	        Returns:
	            A new instance that references (does not copy) the tables in ``d``.

	        Raises:
	            KeyError: if any of the keys above is missing.
	        """
	        p = cls(d["max_len"], d["max_vocab"])
	        p.word2idx = d["word2idx"]
	        p.cat_maps = d["cat_maps"]
	        p.cwe_vocab = d["cwe_vocab"]
	        p.num_mean = d["num_mean"]
	        p.num_std = d["num_std"]
	        return p

	class TextCNN(nn.Module):
	    """Convolutional encoder over token ids.

	    Ids are embedded (index 0 is the padding index), passed through one
	    ``Conv1d`` per kernel width, max-pooled over the sequence axis,
	    concatenated, and projected to ``out_dim`` features.
	    """

	    def __init__(
	        self,
	        vocab_size,
	        emb_dim=128,
	        kernels=(2, 3, 4, 5),
	        n_filters=96,
	        dropout=0.3,
	        out_dim=192,
	    ):
	        super().__init__()
	        self.emb = nn.Embedding(vocab_size, emb_dim, padding_idx=0)
	        # One convolution branch per kernel width, each padded by width // 2.
	        branches = [
	            nn.Conv1d(emb_dim, n_filters, width, padding=width // 2)
	            for width in kernels
	        ]
	        self.convs = nn.ModuleList(branches)
	        self.act = nn.ReLU()
	        self.drop = nn.Dropout(dropout)
	        self.proj = nn.Linear(n_filters * len(kernels), out_dim)
	        self.out_dim = out_dim

	    def forward(self, x):
	        """Map a ``(batch, seq)`` tensor of token ids to ``(batch, out_dim)``."""
	        # Conv1d wants channels before length: (batch, emb_dim, seq).
	        embedded = self.emb(x).transpose(1, 2)
	        pooled = []
	        for conv in self.convs:
	            activated = self.act(conv(embedded))
	            # Max over the sequence axis keeps one value per filter.
	            pooled.append(activated.max(dim=2).values)
	        merged = torch.cat(pooled, dim=1)
	        return self.proj(self.drop(merged))

	class StructuredEncoder(nn.Module):
	    """Two-layer MLP over the structured feature vector.

	    Layout: Linear -> ReLU -> Dropout -> Linear -> ReLU, mapping ``in_dim``
	    inputs to ``out_dim`` non-negative features.
	    """

	    def __init__(self, in_dim, hidden=128, out_dim=96, dropout=0.3):
	        super().__init__()
	        layers = [
	            nn.Linear(in_dim, hidden),
	            nn.ReLU(),
	            nn.Dropout(dropout),
	            nn.Linear(hidden, out_dim),
	            nn.ReLU(),
	        ]
	        self.net = nn.Sequential(*layers)
	        self.out_dim = out_dim

	    def forward(self, x):
	        """Run ``x`` (last dimension ``in_dim``) through the MLP."""
	        return self.net(x)

	class ExploitabilityNet(nn.Module):
	    """Two-branch network: text CNN plus structured MLP, fused by an MLP head.

	    The head ends in ``Linear(128, 1)``; no activation is applied to its
	    output inside this module.
	    """

	    def __init__(self, vocab_size, struct_dim, dropout=0.3):
	        super().__init__()
	        self.text = TextCNN(vocab_size, dropout=dropout)
	        self.struct = StructuredEncoder(struct_dim, dropout=dropout)
	        fused_dim = self.text.out_dim + self.struct.out_dim
	        self.head = nn.Sequential(
	            nn.Linear(fused_dim, 128),
	            nn.ReLU(),
	            nn.Dropout(dropout),
	            nn.Linear(128, 1),
	        )

	    def forward(self, text_ids, struct):
	        """Return one raw score per row, with the trailing unit axis squeezed.

	        Args:
	            text_ids: token-id tensor handed to the text branch.
	            struct: structured feature tensor handed to the struct branch.
	        """
	        text_feat = self.text(text_ids)
	        struct_feat = self.struct(struct)
	        fused = torch.cat([text_feat, struct_feat], dim=1)
	        return self.head(fused).squeeze(-1)

	def load_model(repo_id, device="cpu"):
	    """Download the preprocessor and weights from the Hub and build the model.

	    Fetches ``preprocessor.json`` and ``model.pt`` from ``repo_id`` via
	    ``hf_hub_download``, rebuilds the ``Preprocessor``, sizes an
	    ``ExploitabilityNet`` from it, loads the weights and switches the
	    network to eval mode.

	    Args:
	        repo_id: Hugging Face Hub repository id, e.g. ``"user/repo"``.
	        device: device the network is moved to and the weights are mapped to.

	    Returns:
	        ``(preprocessor, network)``.
	    """
	    # Imported lazily so the module can be imported without huggingface_hub.
	    from huggingface_hub import hf_hub_download

	    pp_path = hf_hub_download(repo_id, "preprocessor.json")
	    # Close the handle deterministically and decode as UTF-8 instead of the
	    # platform's locale encoding.
	    with open(pp_path, encoding="utf-8") as fh:
	        pp = Preprocessor.from_dict(json.load(fh))

	    net = ExploitabilityNet(pp.vocab_size, pp.struct_dim).to(device)

	    weights_path = hf_hub_download(repo_id, "model.pt")
	    # NOTE(review): torch.load unpickles the downloaded file, and unpickling
	    # can execute arbitrary code. Only use trusted repositories; consider
	    # passing weights_only=True where the installed torch supports it.
	    state = torch.load(weights_path, map_location=device)
	    net.load_state_dict(state)
	    net.eval()
	    return pp, net