In [None]:
%pip install onnx



In [None]:
import ast
import torch
import pandas as pd
import torch.nn as nn
from tqdm import tqdm
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import accuracy_score
from transformers import BertTokenizer, AdamW, BertForSequenceClassification

In [None]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
df = pd.read_csv('dataset.csv')
df['classes'] = df['classes'].apply(ast.literal_eval)

In [None]:
classes_count = [0 for i in range(630)]

for classes in df['classes']:
  for c in classes:
    classes_count[c] +=1

classes_min = min(classes_count)
classes_max = max(classes_count)

pos_weights = torch.tensor([0.3 + 0.7 * (1 - (c - classes_min)  / (classes_max - classes_min)) for c in classes_count]).to(device)  # Adjust weights for each class

In [None]:
mlb = MultiLabelBinarizer()
mlb.fit([range(len(classes_count))])
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def tokenize_data(data, max_length=16):
    input_ids = []
    attention_masks = []
    labels = []

    for _, row in data.iterrows():
        text = row['domain'].replace('.', ' ')
        classes = row['classes']

        encoding = tokenizer.encode_plus(
            text,
            max_length=max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        input_ids.append(encoding['input_ids'])
        attention_masks.append(encoding['attention_mask'])
        labels.append(torch.tensor(mlb.transform([classes])))

    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)
    labels = torch.cat(labels, dim=0)

    return TensorDataset(input_ids, attention_masks, labels)


train_dataset = tokenize_data(train_df)
val_dataset = tokenize_data(val_df)

train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=32, shuffle=False)

tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [None]:
class SmallBERT(nn.Module):
    def __init__(self, hidden_size=256, num_layers=2, num_attention_heads=4, num_classes=2, vocab_size=30522):
        super(SmallBERT, self).__init__()

        self.embedding = nn.Embedding(vocab_size, hidden_size)

        self.transformer = nn.TransformerEncoder(
            encoder_layer=nn.TransformerEncoderLayer(
                d_model=hidden_size,
                nhead=num_attention_heads,
                dim_feedforward=hidden_size * 4
            ),
            num_layers=num_layers
        )

        self.dropout = nn.Dropout(0.1)
        self.classifier = nn.Linear(hidden_size, num_classes)

    def forward(self, input_ids, attention_mask):
        embedded = self.embedding(input_ids)
        transformer_output = self.transformer(embedded)
        pooled_output = transformer_output.mean(dim=1)
        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)

        return logits

In [None]:
model = SmallBERT(hidden_size=64, num_layers=1, num_attention_heads=2, num_classes=len(mlb.classes_), vocab_size=30522)
model.to(device)



SmallBERT(
  (embedding): Embedding(30522, 64)
  (transformer): TransformerEncoder(
    (layers): ModuleList(
      (0): TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=64, out_features=64, bias=True)
        )
        (linear1): Linear(in_features=64, out_features=256, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
        (linear2): Linear(in_features=256, out_features=64, bias=True)
        (norm1): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
        (dropout1): Dropout(p=0.1, inplace=False)
        (dropout2): Dropout(p=0.1, inplace=False)
      )
    )
  )
  (dropout): Dropout(p=0.1, inplace=False)
  (classifier): Linear(in_features=64, out_features=630, bias=True)
)

In [None]:
model.train()

optimizer = AdamW(model.parameters(), lr=2e-4)
criterion = nn.BCEWithLogitsLoss(pos_weight=pos_weights, reduction='mean')

epochs = 50
for epoch in range(epochs):
  total_loss = 0

  for batch in tqdm(train_dataloader, desc=f'Epoch {epoch + 1}'):
    input_ids = batch[0].to(device)
    attention_mask = batch[1].to(device)
    labels = batch[2].to(device, dtype=torch.float)

    optimizer.zero_grad()
    logits = model(input_ids=input_ids, attention_mask=attention_mask)

    loss = criterion(logits, labels)
    total_loss += loss.item()

    loss.backward()
    optimizer.step()

  average_loss = total_loss / len(train_dataloader)
  print(f'Training Loss: {average_loss}')

Epoch 1: 100%|██████████| 17333/17333 [01:34<00:00, 183.05it/s]


Training Loss: 0.01455639958662022


Epoch 2: 100%|██████████| 17333/17333 [01:32<00:00, 187.41it/s]


Training Loss: 0.009372596806357445


Epoch 3: 100%|██████████| 17333/17333 [01:33<00:00, 185.43it/s]


Training Loss: 0.00895946430422158


Epoch 4: 100%|██████████| 17333/17333 [01:33<00:00, 184.48it/s]


Training Loss: 0.008654748138614369


Epoch 5: 100%|██████████| 17333/17333 [01:37<00:00, 178.17it/s]


Training Loss: 0.008430107505195222


Epoch 6: 100%|██████████| 17333/17333 [01:35<00:00, 181.19it/s]


Training Loss: 0.00825132777584388


Epoch 7: 100%|██████████| 17333/17333 [01:35<00:00, 181.75it/s]


Training Loss: 0.008098585383865228


Epoch 8: 100%|██████████| 17333/17333 [01:36<00:00, 178.89it/s]


Training Loss: 0.007973489129822018


Epoch 9: 100%|██████████| 17333/17333 [01:34<00:00, 182.66it/s]


Training Loss: 0.007862447003308942


Epoch 10: 100%|██████████| 17333/17333 [01:34<00:00, 184.37it/s]


Training Loss: 0.007767102444360781


Epoch 11: 100%|██████████| 17333/17333 [01:32<00:00, 187.08it/s]


Training Loss: 0.007681683442265059


Epoch 12: 100%|██████████| 17333/17333 [01:34<00:00, 183.56it/s]


Training Loss: 0.0076040456217698555


Epoch 13: 100%|██████████| 17333/17333 [01:33<00:00, 185.58it/s]


Training Loss: 0.007536577820711911


Epoch 14: 100%|██████████| 17333/17333 [01:33<00:00, 185.95it/s]


Training Loss: 0.007473697705334162


Epoch 15: 100%|██████████| 17333/17333 [01:35<00:00, 182.32it/s]


Training Loss: 0.0074216005095400365


Epoch 16: 100%|██████████| 17333/17333 [01:34<00:00, 183.65it/s]


Training Loss: 0.007366839836092643


Epoch 17: 100%|██████████| 17333/17333 [01:35<00:00, 182.22it/s]


Training Loss: 0.007323595992545943


Epoch 18: 100%|██████████| 17333/17333 [01:35<00:00, 182.39it/s]


Training Loss: 0.007281322365534597


Epoch 19: 100%|██████████| 17333/17333 [01:34<00:00, 184.08it/s]


Training Loss: 0.007237244195990475


Epoch 20: 100%|██████████| 17333/17333 [01:32<00:00, 186.76it/s]


Training Loss: 0.007204567027232412


Epoch 21: 100%|██████████| 17333/17333 [01:34<00:00, 182.81it/s]


Training Loss: 0.007169707122033225


Epoch 22: 100%|██████████| 17333/17333 [01:34<00:00, 183.90it/s]


Training Loss: 0.0071396396113856736


Epoch 23: 100%|██████████| 17333/17333 [01:34<00:00, 184.27it/s]


Training Loss: 0.0071061778510986525


Epoch 24: 100%|██████████| 17333/17333 [01:32<00:00, 187.58it/s]


Training Loss: 0.007079116968562793


Epoch 25: 100%|██████████| 17333/17333 [01:35<00:00, 182.40it/s]


Training Loss: 0.007053613749052303


Epoch 26: 100%|██████████| 17333/17333 [01:33<00:00, 184.62it/s]


Training Loss: 0.007027668537672933


Epoch 27: 100%|██████████| 17333/17333 [01:32<00:00, 187.06it/s]


Training Loss: 0.007002266663781792


Epoch 28: 100%|██████████| 17333/17333 [01:33<00:00, 185.74it/s]


Training Loss: 0.006983264061160072


Epoch 29: 100%|██████████| 17333/17333 [01:32<00:00, 186.62it/s]


Training Loss: 0.006960326002187834


Epoch 30: 100%|██████████| 17333/17333 [01:33<00:00, 185.09it/s]


Training Loss: 0.006937417333392953


Epoch 31: 100%|██████████| 17333/17333 [01:34<00:00, 183.75it/s]


Training Loss: 0.006922033323461952


Epoch 32: 100%|██████████| 17333/17333 [01:33<00:00, 184.46it/s]


Training Loss: 0.006898388598138539


Epoch 33: 100%|██████████| 17333/17333 [01:33<00:00, 185.13it/s]


Training Loss: 0.006879616245666734


Epoch 34: 100%|██████████| 17333/17333 [01:33<00:00, 184.60it/s]


Training Loss: 0.006862940901992164


Epoch 35: 100%|██████████| 17333/17333 [01:42<00:00, 169.91it/s]


Training Loss: 0.006847698244776712


Epoch 36: 100%|██████████| 17333/17333 [01:34<00:00, 183.32it/s]


Training Loss: 0.006830427775150129


Epoch 37: 100%|██████████| 17333/17333 [01:34<00:00, 183.19it/s]


Training Loss: 0.006812137309707921


Epoch 38: 100%|██████████| 17333/17333 [01:33<00:00, 184.78it/s]


Training Loss: 0.0067945668246664325


Epoch 39: 100%|██████████| 17333/17333 [01:35<00:00, 181.25it/s]


Training Loss: 0.0067820949417340725


Epoch 40: 100%|██████████| 17333/17333 [01:35<00:00, 182.22it/s]


Training Loss: 0.006767237709435089


Epoch 41: 100%|██████████| 17333/17333 [01:33<00:00, 186.12it/s]


Training Loss: 0.006754826359595461


Epoch 42: 100%|██████████| 17333/17333 [01:34<00:00, 184.04it/s]


Training Loss: 0.006737771247406829


Epoch 43: 100%|██████████| 17333/17333 [01:33<00:00, 185.51it/s]


Training Loss: 0.006729339890418221


Epoch 44: 100%|██████████| 17333/17333 [01:34<00:00, 183.86it/s]


Training Loss: 0.006713867076464513


Epoch 45: 100%|██████████| 17333/17333 [01:36<00:00, 180.05it/s]


Training Loss: 0.006700394152881958


Epoch 46: 100%|██████████| 17333/17333 [01:34<00:00, 183.63it/s]


Training Loss: 0.006688476914265044


Epoch 47: 100%|██████████| 17333/17333 [01:33<00:00, 185.67it/s]


Training Loss: 0.006676017345109116


Epoch 48: 100%|██████████| 17333/17333 [01:33<00:00, 184.72it/s]


Training Loss: 0.006663939434730227


Epoch 49: 100%|██████████| 17333/17333 [01:33<00:00, 185.32it/s]


Training Loss: 0.006649965298220924


Epoch 50: 100%|██████████| 17333/17333 [01:33<00:00, 185.48it/s]

Training Loss: 0.006640829532034061





In [None]:
model.eval()

predicted_logits = []
with torch.no_grad():
    for batch in tqdm(val_dataloader, desc='Validation'):
      input_ids = batch[0].to(device)
      attention_mask = batch[1].to(device)
      labels = batch[2].to(device, dtype=torch.float)

      logits = model(input_ids=input_ids, attention_mask=attention_mask)
      probabilities = torch.sigmoid(logits)

      preds = (probabilities > 0.2).cpu().numpy().astype(int)
      predicted_logits.extend(preds)

predicted_labels = mlb.inverse_transform(torch.tensor(predicted_logits))
print('Predicted labels', predicted_labels[1000:1100])

true_inputs = val_df['domain'].tolist()
print('True inputs', true_inputs[1000:1100])

true_labels = val_df['classes'].tolist()
print('True labels', true_labels[1000:1100])

true_logits = mlb.transform(true_labels)
accuracy = accuracy_score(true_logits, predicted_logits)
print(f'Validation Accuracy: {accuracy}')

Validation: 100%|██████████| 4334/4334 [00:07<00:00, 563.03it/s]
  predicted_labels = mlb.inverse_transform(torch.tensor(predicted_logits))


Predicted labels [(), (), (332,), (183, 194, 215), (215,), (23,), (534,), (), (140,), (183,), (229,), (239,), (), (227, 542), (536,), (140,), (1,), (140, 299), (), (250,), (173,), (227, 542), (239,), (363, 365), (), (215,), (243,), (1, 183, 363), (332, 340), (215,), (289,), (), (96, 289, 572), (), (), (), (), (160,), (215,), (1, 104, 183), (140, 227, 439), (), (289,), (408, 412, 601), (), (140, 215, 250, 528), (332,), (), (183,), (289,), (183, 194), (183,), (), (48, 363), (), (215,), (104,), (), (239, 423), (215, 250), (), (183,), (1,), (444,), (215,), (243, 245), (299, 325, 363, 364, 365), (104, 533), (239,), (534,), (183, 533), (400,), (332,), (), (250,), (215,), (103, 215, 439), (), (), (1, 183, 186, 215), (), (), (103, 126, 289), (363,), (172,), (215,), (140, 215, 439), (183,), (), (23,), (351,), (227, 229, 542), (1, 23), (), (), (), (183,), (215,), (183, 351), (325,)]
True inputs ['usw1.green.ops.kargo.com', '6002359.global.siteimproveanalytics.io', 'didiglobal.com.dob.sibl.suppor

In [None]:
torch.save(model, "bert_domain_classifier")
torch.save(model.state_dict(), "bert_domain_classifier.pth")

In [None]:
dummy_input_ids = torch.zeros((1, 16), dtype=torch.long).to(device)
dummy_attention_mask = torch.zeros((1, 16), dtype=torch.long).to(device)
input_names = ['input_ids', 'attention_mask']
output_names = ['logits']
dynamic_axes = {'input_ids': {0: 'batch_size'}, 'attention_mask': {0: 'batch_size'},
                'logits': {0: 'batch_size'}}

torch.onnx.export(model, (dummy_input_ids, dummy_attention_mask),
                  "bert_domain_classifier.onnx", opset_version=14,
                  input_names=input_names,
                  output_names=output_names,
                  dynamic_axes=dynamic_axes)