"""
DGA Benchmark Loader — use this in Colab to load any model from HuggingFace.

Usage:
    from dga_loader import load_dga_model, predict_domains

    model, mod = load_dga_model("cnn")
    results = predict_domains(mod, model, ["google.com", "xkr3f9mq.ru"])

Available models:
    "cnn"         -> Reynier/dga-cnn
    "bilbo"       -> Reynier/dga-bilbo
    "bilstm"      -> Reynier/dga-bilstm
    "labin"       -> Reynier/dga-labin
    "logit"       -> Reynier/dga-logit
    "fanci"       -> Reynier/dga-fanci
    "modernbert"  -> Reynier/modernbert-dga-detector (HF pipeline)
    "domurlsbert" -> Reynier/dga-domurlsbert (PEFT/LoRA)
"""

import importlib.util
import sys
from typing import Optional

from huggingface_hub import hf_hub_download

# model name -> (HuggingFace repo id, weights filename, model-module filename)
REGISTRY = {
    "cnn": ("Reynier/dga-cnn", "dga_cnn_model_1M.pth", "model.py"),
    "bilbo": ("Reynier/dga-bilbo", "bilbo_best.pth", "model.py"),
    "bilstm": ("Reynier/dga-bilstm", "bilstm_best.pth", "model.py"),
    "labin": ("Reynier/dga-labin", "LABin_best_model.keras", "model.py"),
    "logit": ("Reynier/dga-logit", "artifacts.joblib", "model.py"),
    "fanci": ("Reynier/dga-fanci", "fanci_dga_detector.joblib", "model.py"),
}


def _import_module(path: str, name: str):
    """Dynamically import a Python file as a module.

    Parameters
    ----------
    path : str
        Filesystem path of the Python source file.
    name : str
        Name under which the module is registered in ``sys.modules``.

    Returns
    -------
    The executed module object.

    Raises
    ------
    ImportError
        If no import spec/loader can be built for ``path``.
    """
    spec = importlib.util.spec_from_file_location(name, path)
    if spec is None or spec.loader is None:
        raise ImportError(f"Cannot load module '{name}' from '{path}'")

    mod = importlib.util.module_from_spec(spec)

    # Register *before* executing, as the importlib recipe does: code that
    # looks itself up in sys.modules while it runs needs the entry to exist.
    previous = sys.modules.get(name)
    sys.modules[name] = mod
    try:
        spec.loader.exec_module(mod)
    except BaseException:
        # Do not leave a half-initialised module behind; restore prior state.
        if previous is None:
            sys.modules.pop(name, None)
        else:
            sys.modules[name] = previous
        raise
    return mod


def load_dga_model(model_name: str, device: Optional[str] = None):
    """
    Download and load a DGA model from HuggingFace.

    Parameters
    ----------
    model_name : str
        One of: cnn, bilbo, bilstm, labin, logit, fanci, modernbert, domurlsbert
        (matched case-insensitively).
    device : str, optional
        'cpu' or 'cuda'. Auto-detected if None.

    Returns
    -------
    model : loaded model object
    mod : the model module (call mod.predict(model, domains) to get predictions)
        For modernbert/domurlsbert, mod=None (use the pipeline/model directly).

    Raises
    ------
    ValueError
        If ``model_name`` is not a known model.
    """
    model_name = model_name.lower()

    # ── Transformer models (special handling) ─────────────────────────────
    if model_name == "modernbert":
        from transformers import pipeline

        print("Loading Reynier/modernbert-dga-detector ...")
        # Honour an explicit device request; otherwise auto-detect
        # (0 = first CUDA device, -1 = CPU in the pipeline convention).
        if device is None:
            pipe_device = 0 if _cuda_available() else -1
        else:
            pipe_device = device
        pipe = pipeline(
            "text-classification",
            model="Reynier/modernbert-dga-detector",
            device=pipe_device,
        )
        return pipe, None

    if model_name == "domurlsbert":
        from transformers import BertTokenizer, BertForSequenceClassification
        from peft import PeftModel

        print("Loading Reynier/dga-domurlsbert ...")
        tok = BertTokenizer.from_pretrained("Reynier/dga-domurlsbert")
        base = BertForSequenceClassification.from_pretrained(
            "bert-base-uncased", num_labels=2
        )
        model = PeftModel.from_pretrained(base, "Reynier/dga-domurlsbert").eval()
        dev = device or ("cuda" if _cuda_available() else "cpu")
        model.to(dev)
        # Stash tokenizer and device on the model; predict_domains() reads
        # these attributes to recognise and drive the PEFT model.
        model._tokenizer = tok
        model._device = dev
        return model, None

    # ── Standard models ───────────────────────────────────────────────────
    if model_name not in REGISTRY:
        raise ValueError(
            f"Unknown model '{model_name}'. "
            f"Choose from: {list(REGISTRY.keys()) + ['modernbert', 'domurlsbert']}"
        )

    repo_id, weights_file, module_file = REGISTRY[model_name]
    print(f"Downloading {model_name} from {repo_id} ...")

    weights_path = hf_hub_download(repo_id, weights_file)
    module_path = hf_hub_download(repo_id, module_file)

    # NOTE(security): this executes Python code downloaded from the Hub repo.
    # Only use repositories you trust.
    mod = _import_module(module_path, f"dga_{model_name}")
    if device is None:
        model = mod.load_model(weights_path)
    else:
        model = mod.load_model(weights_path, device)

    print(f" {model_name} ready.")
    return model, mod


def _predict_with_pipeline(pipe, domains):
    """Run a HF text-classification pipeline and normalise its labels."""
    if not domains:
        return []
    raw = pipe(domains)
    return [
        {
            "domain": d,
            "label": r["label"].lower().replace("label_1", "dga").replace("label_0", "legit"),
            "score": round(r["score"], 4),
        }
        for d, r in zip(domains, raw)
    ]


def _predict_with_peft(model, domains):
    """Run the PEFT/LoRA model one domain at a time using its stashed tokenizer."""
    import torch

    tok = model._tokenizer
    dev = model._device
    id2label = {0: "legit", 1: "dga"}

    results = []
    for domain in domains:
        inputs = tok(domain, return_tensors="pt", truncation=True).to(dev)
        with torch.no_grad():
            logits = model(**inputs).logits
        pred = torch.argmax(logits, dim=1).item()
        # Score is the softmax probability of class 1 ("dga"), independent of
        # which label was predicted.
        score = torch.softmax(logits, dim=1)[0, 1].item()
        results.append(
            {"domain": domain, "label": id2label[pred], "score": round(score, 4)}
        )
    return results


def predict_domains(mod, model, domains):
    """
    Unified prediction interface.
    Works with both standard models (mod + model) and transformer pipelines.

    Parameters
    ----------
    mod : module returned by load_dga_model, or None for transformers
    model : loaded model
    domains : str or list of str

    Returns
    -------
    list of dicts: [{"domain": ..., "label": "dga"/"legit", "score": float}]
        For standard models the result is whatever ``mod.predict`` returns.
        For the HF pipeline, "score" is the score the pipeline reports for the
        returned label; for the PEFT model it is the probability of class 1
        ("dga").

    Raises
    ------
    TypeError
        If ``mod`` is None and ``model`` is neither a PEFT model prepared by
        load_dga_model nor a callable pipeline.
    """
    if isinstance(domains, str):
        domains = [domains]

    # Standard models: delegate to the model's own module.
    if mod is not None:
        return mod.predict(model, domains)

    # Materialise once so a generator/tuple is not consumed before results
    # are paired back with their domains.
    domains = list(domains)

    # PEFT/LoRA model (domurlsbert) — recognised by the stashed tokenizer.
    if hasattr(model, "_tokenizer"):
        return _predict_with_peft(model, domains)

    # HF pipeline (modernbert)
    if callable(model):
        return _predict_with_pipeline(model, domains)

    raise TypeError(
        "mod is None but model is neither a callable pipeline nor a model "
        "with a '_tokenizer' attribute"
    )


def _cuda_available():
    """Return True if torch is importable and reports an available CUDA device."""
    try:
        import torch
    except ImportError:
        return False
    return torch.cuda.is_available()