| """ |
| DGA-LABin: BiLSTM + Self-Attention (Keras) for DGA detection. |
| Based on Labin architecture, trained on 54 DGA families. |
| Requires: keras, keras_self_attention |
| """ |
| import numpy as np |
|
|
| |
| charset = ( |
| ['*'] |
| + [chr(x) for x in range(0x30, 0x30 + 10)] |
| + [chr(x) for x in range(0x61, 0x61 + 26)] |
| + ['-', '_', '.'] |
| ) |
| stoi = {k: charset.index(k) for k in charset} |
| VOCAB_SIZE = len(charset) |
| MAX_LEN = 64 |
|
|
|
|
| def encode_domain(domain: str) -> list: |
| domain = str(domain).lower().strip() |
| encoded = [] |
| for ch in domain: |
| encoded.append(stoi.get(ch, stoi['*'])) |
| |
| if len(encoded) >= MAX_LEN: |
| return encoded[:MAX_LEN] |
| return [0] * (MAX_LEN - len(encoded)) + encoded |
|
|
|
|
def load_model(weights_path: str):
    """Load a trained Keras model (with self-attention layers) from disk.

    The attention layers come from ``keras_self_attention`` and must be
    registered as custom objects so deserialization can resolve them.
    """
    from keras.models import load_model as keras_load
    from keras_self_attention import SeqSelfAttention, SeqWeightedAttention

    # Register each attention class under its own class name.
    attention_layers = {
        layer.__name__: layer
        for layer in (SeqSelfAttention, SeqWeightedAttention)
    }
    return keras_load(weights_path, custom_objects=attention_layers)
|
|
|
|
def predict(model, domains, batch_size: int = 256):
    """
    Predict DGA vs legit for a list of domain strings.
    Returns list of dicts: [{"domain": ..., "label": "dga"/"legit", "score": float}]
    """
    # Allow a single domain string to be passed directly.
    if isinstance(domains, str):
        domains = [domains]

    out = []
    start = 0
    while start < len(domains):
        chunk = domains[start : start + batch_size]
        start += batch_size
        features = np.array([encode_domain(d) for d in chunk])
        probs = model.predict(features, verbose=0).flatten()
        for name, prob in zip(chunk, probs):
            verdict = "dga" if prob > 0.5 else "legit"
            out.append({
                "domain": name,
                "label": verdict,
                "score": round(float(prob), 4),
            })
    return out
|
|