File size: 2,084 Bytes
7500cab
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
import sys
from pathlib import Path

sys.path.append(str(Path(__file__).resolve().parent.parent))

"""training the mini-transformer on UCI URL data."""

import torch 
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
import pandas as pd
from data.tokenizer import url_to_ids
from model.model import MiniTransformer


def load_csv(path : str):
    """Return (token_ds, labels) tensors."""
    df = pd.read_csv(path)
    urls, labels = df["URL"].values, df["label"].values
    ids = torch.tensor([url_to_ids(u) for u in urls], dtype= torch.long)
    lbl = torch.tensor(labels, dtype=torch.float32)

    return ids, lbl

def train(model, train_loader, val_x, val_y, epochs: int = 5, lr: float = 3e-4):
    """Train with early-stop on best validation acc."""

    device = torch.device("mps") if torch.backends.mps.is_available() else torch.device("cpu")
    model.to(device)

    opt = torch.optim.AdamW(model.parameters(), lr =lr)
    loss_fn = nn.BCELoss()
    best_acc = 0.0

    for epoch in range(epochs):
        model.train() # activating training mode
        for xb, yb in train_loader:
            xb, yb = xb.to(device), yb.to(device)
            opt.zero_grad()
            out = model(xb)

            loss = loss_fn(out, yb)
            loss.backward()
            opt.step()
        
        # validation
        model.eval()
        with torch.no_grad():
            preds = (model(val_x.to(device)) > 0.5).cpu()
            acc = (preds == val_y).float().mean().item()

            print(f"Epoch {epoch} : val-acc {acc:3f}")
            if acc > best_acc:
                best_acc = acc
                torch.save(model.state_dict(), "./models/phish_model.pt")

    print("Saved best model to phish_model.pt")      

if __name__ == "__main__":
    train_x, train_y = load_csv("./datasets/train.csv")
    val_x, val_y = load_csv("./datasets/val.csv")
    train_ds = TensorDataset(train_x, train_y)
    train_dl = DataLoader(train_ds, batch_size=256, shuffle=True)
    model = MiniTransformer()
    train(model, train_dl, val_x, val_y, epochs=5)