# dga-labin / model.py
# Reynier's picture
# Upload model.py with huggingface_hub
# 095f796 verified
"""
DGA-LABin: BiLSTM + Self-Attention (Keras) for DGA detection.
Based on Labin architecture, trained on 54 DGA families.
Requires: numpy, keras, keras_self_attention
"""
import numpy as np
# Character vocabulary in index order: '*' first (index 0), then the digits
# 0-9, the lowercase letters a-z, and finally '-', '_' and '.'.
charset = list("*" + "0123456789" + "abcdefghijklmnopqrstuvwxyz" + "-_.")

# Lookup table mapping each vocabulary character to its position in `charset`.
stoi = {symbol: position for position, symbol in enumerate(charset)}

VOCAB_SIZE = len(charset)  # 40
MAX_LEN = 64  # left-padded
def encode_domain(domain: str) -> list:
    """Turn a domain name into a list of exactly ``MAX_LEN`` charset indices.

    The input is coerced to ``str``, lowercased and stripped of surrounding
    whitespace. Every character is looked up in ``stoi``; characters that are
    not in the table receive the index of ``'*'``. Sequences with ``MAX_LEN``
    or more entries keep only their first ``MAX_LEN`` entries, shorter ones
    are padded on the left with ``0``.
    """
    text = str(domain).lower().strip()
    fallback = stoi['*']
    indices = [stoi.get(symbol, fallback) for symbol in text]

    missing = MAX_LEN - len(indices)
    if missing > 0:
        # Too short: left-pad with zeros up to MAX_LEN.
        return [0] * missing + indices
    # Long enough: keep the leading MAX_LEN entries.
    return indices[:MAX_LEN]
def load_model(weights_path: str):
    """Load the trained Keras model stored at ``weights_path`` (a .keras file).

    Keras and keras_self_attention are imported inside the function, so they
    are only needed once this function is actually called.
    """
    from keras.models import load_model as keras_load
    from keras_self_attention import SeqSelfAttention, SeqWeightedAttention

    # The attention layers are handed to Keras by name via ``custom_objects``
    # so the saved model can reference them when it is loaded.
    return keras_load(
        weights_path,
        custom_objects={
            'SeqSelfAttention': SeqSelfAttention,
            'SeqWeightedAttention': SeqWeightedAttention,
        },
    )
def predict(model, domains, batch_size: int = 256):
    """Predict DGA vs legit for one or more domain strings.

    Args:
        model: Object with a Keras-style ``predict(X, batch_size=..., verbose=...)``
            method, e.g. the value returned by ``load_model``. Its output is
            flattened and read as one score per input domain.
        domains: A single domain string, or an iterable of domain strings.
            Iterables other than ``list``/``tuple`` (generators, sets, ...)
            are materialized into a list first.
        batch_size: Number of domains encoded and scored per ``model.predict``
            call. Must be at least 1.

    Returns:
        A list of dicts in input order:
        ``[{"domain": ..., "label": "dga"/"legit", "score": float}]``.
        The label is ``"dga"`` when the score is strictly greater than 0.5,
        and the reported score is rounded to 4 decimal places.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    # A negative step would make range() empty and silently return no
    # results, so reject invalid sizes up front.
    if batch_size < 1:
        raise ValueError(
            f"batch_size must be a positive integer, got {batch_size!r}"
        )

    if isinstance(domains, str):
        domains = [domains]
    elif not isinstance(domains, (list, tuple)):
        # len() and slicing below need a real sequence.
        domains = list(domains)

    results = []
    for start in range(0, len(domains), batch_size):
        batch = domains[start:start + batch_size]
        X = np.array([encode_domain(d) for d in batch])
        # Forward batch_size so the model processes the chunk in one batch
        # instead of re-splitting it with Keras' own default batch size.
        scores = model.predict(X, batch_size=batch_size, verbose=0).flatten()
        results.extend(
            {
                "domain": domain,
                "label": "dga" if score > 0.5 else "legit",
                "score": round(float(score), 4),
            }
            for domain, score in zip(batch, scores)
        )
    return results