File size: 5,070 Bytes

929e325

# Self-contained loader for the CVE exploitability model.
# Usage:
#   from huggingface_hub import hf_hub_download
#   import importlib.util, sys
#   spec = importlib.util.spec_from_file_location("hf_model", hf_hub_download("sumitp76/cve-exploitability","hf_model.py"))
#   m = importlib.util.module_from_spec(spec); sys.modules["hf_model"]=m; spec.loader.exec_module(m)
#   pp, net = m.load_model("sumitp76/cve-exploitability")
import re, json
import numpy as np
import torch, torch.nn as nn

# A token is a maximal run of ASCII letters, digits, underscore, dot or hyphen.
_WORD = re.compile(r"[A-Za-z0-9_.\-]+")

# Categorical columns that are one-hot encoded, in output order.
CAT_COLS = [
    "severity",
    "AV",
    "AC",
    "PR",
    "UI",
    "S",
    "C",
    "I",
    "A",
]

# Numeric columns that are standardized, in output order.
NUM_COLS = [
    "base_score",
    "v2_base_score",
    "has_v3",
    "year",
    "desc_len",
]

def tokenize(t):
    """Return the lower-cased word tokens found in *t*.

    *t* is converted with ``str()`` first, so non-string values are
    accepted. Tokens are the non-overlapping matches of ``_WORD``, in
    order of appearance.
    """
    lowered = str(t).lower()
    return _WORD.findall(lowered)
def _isnan(x):
    """Return True when *x* is a floating-point NaN, False otherwise.

    Both Python floats and NumPy floating scalars are recognised. The
    NumPy check matters because scalars such as ``np.float32`` are not
    subclasses of ``float``, so a plain ``isinstance(x, float)`` test
    would report their NaN as a regular value. Any non-float input
    (strings, ``None``, ints, ...) yields False.

    Args:
        x: Any object.

    Returns:
        A plain ``bool``.
    """
    # The isinstance guard ensures np.isnan only ever sees a float scalar,
    # so no exception handling is needed around it.
    return isinstance(x, (float, np.floating)) and bool(np.isnan(x))

class Preprocessor:
    """Turns raw CVE records into the arrays consumed by the network.

    The fitted state (vocabulary, category maps, CWE vocabulary and
    per-column mean/std) is held in plain dicts and is normally restored
    with :meth:`from_dict`.
    """

    def __init__(self, max_len=200, max_vocab=20000):
        """Create an unfitted preprocessor.

        Args:
            max_len: Number of token positions kept per description.
            max_vocab: Stored on the instance; not read by any method in
                this class.
        """
        self.max_len = max_len
        self.max_vocab = max_vocab
        self.word2idx = {}
        self.cat_maps = {}
        self.cwe_vocab = {}
        self.num_mean = {}
        self.num_std = {}

    def transform_text(self, descriptions):
        """Encode descriptions as a ``(len(descriptions), max_len)`` int64 array.

        Each description is tokenized and truncated to ``max_len`` tokens.
        Words missing from ``word2idx`` are encoded as 1; positions past
        the end of a description stay 0.
        """
        ids = np.zeros((len(descriptions), self.max_len), dtype=np.int64)
        lookup = self.word2idx.get
        for row, text in enumerate(descriptions):
            tokens = tokenize(text)[:self.max_len]
            if not tokens:
                continue
            ids[row, :len(tokens)] = [lookup(tok, 1) for tok in tokens]
        return ids

    def transform_struct(self, df):
        """Build the structured feature matrix for *df*.

        *df* must expose ``.columns`` and ``df[name].values`` (e.g. a
        pandas DataFrame -- assumed, confirm against callers). Columns
        missing from *df* are replaced by a constant default.

        The float32 result is the column-wise concatenation of:
          * the standardized ``NUM_COLS`` (NaN replaced by the stored mean),
          * one one-hot block per ``CAT_COLS`` entry,
          * one one-hot block for the ``cwe`` column.
        """
        n_rows = len(df)

        def column(name, fallback):
            # Use the real column when present, otherwise a constant array.
            if name in df.columns:
                return df[name].values
            return np.array([fallback] * n_rows)

        # Description length is measured on the str() form of each value.
        desc_len = np.array(
            [len(str(text)) for text in column("description", "")],
            dtype=float,
        )
        raw = {
            "base_score": column("base_score", np.nan).astype(float),
            "v2_base_score": column("v2_base_score", np.nan).astype(float),
            "has_v3": column("has_v3", 0).astype(float),
            "year": column("year", 2020).astype(float),
            "desc_len": desc_len,
        }

        numeric = np.zeros((n_rows, len(NUM_COLS)), dtype=np.float32)
        for k, name in enumerate(NUM_COLS):
            values = raw[name].astype(float)
            mean = self.num_mean[name]
            # Missing values are imputed with the mean, i.e. become 0 after scaling.
            values = np.where(np.isnan(values), mean, values)
            numeric[:, k] = (values - mean) / self.num_std[name]
        parts = [numeric]

        for cat in CAT_COLS:
            mapping = self.cat_maps[cat]
            # Slot 0 = missing value, slot 1 = value absent from the map;
            # presumably mapped values use indices >= 2 -- confirm against
            # the fitting code.
            onehot = np.zeros((n_rows, len(mapping) + 2), dtype=np.float32)
            for row, value in enumerate(column(cat, None)):
                if value is None or _isnan(value):
                    slot = 0
                else:
                    slot = mapping.get(value, 1)
                onehot[row, slot] = 1.0
            parts.append(onehot)

        # CWE ids not present in the vocabulary fall into slot 0.
        cwe_onehot = np.zeros((n_rows, len(self.cwe_vocab) + 1), dtype=np.float32)
        for row, value in enumerate(column("cwe", "UNKNOWN")):
            cwe_onehot[row, self.cwe_vocab.get(value, 0)] = 1.0
        parts.append(cwe_onehot)

        return np.concatenate(parts, axis=1)

    @property
    def struct_dim(self):
        """Number of columns produced by :meth:`transform_struct`."""
        cat_width = sum(len(self.cat_maps[cat]) + 2 for cat in CAT_COLS)
        cwe_width = len(self.cwe_vocab) + 1
        return len(NUM_COLS) + cat_width + cwe_width

    @property
    def vocab_size(self):
        """Number of entries in ``word2idx``."""
        return len(self.word2idx)

    @classmethod
    def from_dict(cls, d):
        """Rebuild a preprocessor from its dict form.

        *d* must contain ``max_len``, ``max_vocab``, ``word2idx``,
        ``cat_maps``, ``cwe_vocab``, ``num_mean`` and ``num_std``. The
        dict values are stored by reference, not copied.
        """
        inst = cls(d["max_len"], d["max_vocab"])
        for attr in ("word2idx", "cat_maps", "cwe_vocab", "num_mean", "num_std"):
            setattr(inst, attr, d[attr])
        return inst

class TextCNN(nn.Module):
    """Convolutional text encoder with max-over-time pooling.

    Token ids are embedded (index 0 is the padding index), passed through
    one ``Conv1d`` branch per kernel width, and each branch is reduced to
    its maximum ReLU activation along the sequence axis. The pooled
    branches are concatenated, passed through dropout and projected to
    ``out_dim`` features.
    """

    def __init__(self, vocab_size, emb_dim=128, kernels=(2,3,4,5), n_filters=96, dropout=0.3, out_dim=192):
        super().__init__()
        self.emb = nn.Embedding(vocab_size, emb_dim, padding_idx=0)
        # One branch per kernel width, each padded by half the width.
        branches = [
            nn.Conv1d(emb_dim, n_filters, width, padding=width // 2)
            for width in kernels
        ]
        self.convs = nn.ModuleList(branches)
        self.act = nn.ReLU()
        self.drop = nn.Dropout(dropout)
        self.proj = nn.Linear(n_filters * len(kernels), out_dim)
        self.out_dim = out_dim

    def forward(self, x):
        """Encode a ``(batch, seq_len)`` tensor of token ids to ``(batch, out_dim)``."""
        # Conv1d expects channels before length: (batch, emb_dim, seq_len).
        embedded = self.emb(x).transpose(1, 2)
        pooled = []
        for conv in self.convs:
            activated = self.act(conv(embedded))
            pooled.append(activated.max(dim=2).values)
        merged = torch.cat(pooled, dim=1)
        return self.proj(self.drop(merged))

class StructuredEncoder(nn.Module):
    """Two-layer MLP applied to the structured feature vector.

    Layout: Linear -> ReLU -> Dropout -> Linear -> ReLU, so the output
    is non-negative and has ``out_dim`` features.
    """

    def __init__(self, in_dim, hidden=128, out_dim=96, dropout=0.3):
        super().__init__()
        layers = [
            nn.Linear(in_dim, hidden),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden, out_dim),
            nn.ReLU(),
        ]
        self.net = nn.Sequential(*layers)
        self.out_dim = out_dim

    def forward(self, x):
        """Map features whose last dimension is ``in_dim`` to ``out_dim``."""
        encoded = self.net(x)
        return encoded

class ExploitabilityNet(nn.Module):
    """Two-branch model combining description text and structured features.

    The text branch (``TextCNN``) and the structured branch
    (``StructuredEncoder``) are concatenated and fed to a small MLP head
    that emits one value per example. No sigmoid is applied here, so the
    output is the raw score of the final linear layer.
    """

    def __init__(self, vocab_size, struct_dim, dropout=0.3):
        super().__init__()
        self.text = TextCNN(vocab_size, dropout=dropout)
        self.struct = StructuredEncoder(struct_dim, dropout=dropout)
        fused_dim = self.text.out_dim + self.struct.out_dim
        self.head = nn.Sequential(
            nn.Linear(fused_dim, 128),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(128, 1),
        )

    def forward(self, text_ids, struct):
        """Return one score per row for the given token ids and features."""
        text_feat = self.text(text_ids)
        struct_feat = self.struct(struct)
        fused = torch.cat([text_feat, struct_feat], dim=1)
        # The head ends in a width-1 layer; drop that trailing dimension.
        return self.head(fused).squeeze(-1)

def load_model(repo_id, device="cpu"):
    """Download and assemble the preprocessor and trained network.

    Fetches ``preprocessor.json`` and ``model.pt`` from the Hugging Face
    Hub repository *repo_id*, rebuilds the ``Preprocessor``, sizes an
    ``ExploitabilityNet`` from it, loads the weights and switches the
    network to eval mode.

    Args:
        repo_id: Hub repository id, e.g. ``"sumitp76/cve-exploitability"``.
        device: Device the network is moved to and the weights are mapped
            onto (default ``"cpu"``).

    Returns:
        A ``(preprocessor, network)`` tuple.
    """
    # Imported lazily so the module can be imported without huggingface_hub.
    from huggingface_hub import hf_hub_download

    config_path = hf_hub_download(repo_id, "preprocessor.json")
    # Close the handle deterministically and decode as UTF-8 rather than
    # relying on the platform's locale encoding.
    with open(config_path, encoding="utf-8") as fh:
        pp = Preprocessor.from_dict(json.load(fh))

    net = ExploitabilityNet(pp.vocab_size, pp.struct_dim).to(device)

    weights_path = hf_hub_download(repo_id, "model.pt")
    # NOTE(security): torch.load is pickle-based; on PyTorch versions where
    # ``weights_only`` defaults to False a malicious checkpoint can execute
    # code. Only load repositories you trust, or pass ``weights_only=True``
    # where the installed PyTorch supports it.
    state = torch.load(weights_path, map_location=device)
    net.load_state_dict(state)
    net.eval()
    return pp, net