Spaces:
Runtime error
Runtime error
Commit ·
3f90381
1
Parent(s): 9a2df03
gitignore
Browse files- .gitignore +9 -0
- bert_pytorch/model/attention/multi_head.py +37 -0
- bert_pytorch/model/attention/single.py +25 -0
- bert_pytorch/model/utils/feed_forward.py +16 -0
- bert_pytorch/model/utils/gelu.py +12 -0
- bert_pytorch/model/utils/layer_norm.py +17 -0
- bert_pytorch/model/utils/sublayer.py +18 -0
- logparser/Drain.py +345 -0
- logparser/Spell.py +356 -0
- model/bert/parameters.txt +40 -0
.gitignore
CHANGED
|
@@ -1,3 +1,12 @@
|
|
| 1 |
model/bert/best_center.pt
|
| 2 |
model/bert/best_total_dist.pt
|
| 3 |
model/bert/best_bert.pth
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
model/bert/best_center.pt
|
| 2 |
model/bert/best_total_dist.pt
|
| 3 |
model/bert/best_bert.pth
|
| 4 |
+
model/bert/train_log2.csv
|
| 5 |
+
model/bert/train_valid_loss.png
|
| 6 |
+
model/bert/valid_log2.csv
|
| 7 |
+
bert_pytorch/trainer/optim_schedule.py
|
| 8 |
+
bert_pytorch/trainer/pretrain.py
|
| 9 |
+
model/bert/test_abnormal_errors.pkl
|
| 10 |
+
model/bert/test_abnormal_results
|
| 11 |
+
model/bert/test_normal_errors.pkl
|
| 12 |
+
model/bert/test_normal_results
|
bert_pytorch/model/attention/multi_head.py
ADDED
|
@@ -0,0 +1,37 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import torch.nn as nn
|
| 2 |
+
from .single import Attention
|
| 3 |
+
|
| 4 |
+
|
| 5 |
+
class MultiHeadedAttention(nn.Module):
    """Multi-head attention: project q/k/v into ``h`` subspaces, attend in each, recombine.

    Takes in the model size and the number of heads; assumes d_v == d_k == d_model // h.
    """

    def __init__(self, h, d_model, dropout=0.1):
        super().__init__()
        assert d_model % h == 0

        # We assume d_v always equals d_k.
        self.d_k = d_model // h
        self.h = h

        # One projection each for query, key, value (creation order preserved).
        self.linear_layers = nn.ModuleList([nn.Linear(d_model, d_model) for _ in range(3)])
        self.output_linear = nn.Linear(d_model, d_model)
        self.attention = Attention()

        self.dropout = nn.Dropout(p=dropout)

    def _split_heads(self, tensor, n_batch):
        # (batch, seq, d_model) -> (batch, h, seq, d_k)
        return tensor.view(n_batch, -1, self.h, self.d_k).transpose(1, 2)

    def forward(self, query, key, value, mask=None):
        n_batch = query.size(0)

        # 1) Linear projections, then split each result into heads.
        query = self._split_heads(self.linear_layers[0](query), n_batch)
        key = self._split_heads(self.linear_layers[1](key), n_batch)
        value = self._split_heads(self.linear_layers[2](value), n_batch)

        # 2) Scaled dot-product attention over every head in parallel.
        x, attn = self.attention(query, key, value, mask=mask, dropout=self.dropout)

        # 3) Merge the heads back together and apply the final projection.
        x = x.transpose(1, 2).contiguous().view(n_batch, -1, self.h * self.d_k)

        return self.output_linear(x)
|
bert_pytorch/model/attention/single.py
ADDED
|
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import torch.nn as nn
|
| 2 |
+
import torch.nn.functional as F
|
| 3 |
+
import torch
|
| 4 |
+
|
| 5 |
+
import math
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
class Attention(nn.Module):
    """Compute 'Scaled Dot Product Attention'."""

    def forward(self, query, key, value, mask=None, dropout=None):
        # Similarity of every query against every key, scaled by sqrt(d_k).
        d_k = query.size(-1)
        scores = torch.matmul(query, key.transpose(-2, -1)) / math.sqrt(d_k)

        # Masked-out positions (mask == 0) get -1e9 so softmax sends them to ~0.
        if mask is not None:
            scores = scores.masked_fill(mask == 0, -1e9)

        p_attn = F.softmax(scores, dim=-1)

        if dropout is not None:
            p_attn = dropout(p_attn)

        # Weighted sum of values, plus the attention map itself.
        return torch.matmul(p_attn, value), p_attn
|
bert_pytorch/model/utils/feed_forward.py
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import torch.nn as nn
|
| 2 |
+
from .gelu import GELU
|
| 3 |
+
|
| 4 |
+
|
| 5 |
+
class PositionwiseFeedForward(nn.Module):
    """Implements the position-wise FFN equation: w_2(dropout(GELU(w_1(x))))."""

    def __init__(self, d_model, d_ff, dropout=0.1):
        super(PositionwiseFeedForward, self).__init__()
        self.w_1 = nn.Linear(d_model, d_ff)
        self.w_2 = nn.Linear(d_ff, d_model)
        self.dropout = nn.Dropout(dropout)
        self.activation = GELU()

    def forward(self, x):
        # expand -> GELU -> dropout -> project back down
        hidden = self.w_1(x)
        hidden = self.activation(hidden)
        hidden = self.dropout(hidden)
        return self.w_2(hidden)
|
bert_pytorch/model/utils/gelu.py
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import torch.nn as nn
|
| 2 |
+
import torch
|
| 3 |
+
import math
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
class GELU(nn.Module):
    """Gaussian Error Linear Unit (tanh approximation).

    Paper Section 3.4, last paragraph: BERT uses GELU instead of RELU.
    """

    def forward(self, x):
        inner = math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))
        return 0.5 * x * (1 + torch.tanh(inner))
|
bert_pytorch/model/utils/layer_norm.py
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import torch.nn as nn
|
| 2 |
+
import torch
|
| 3 |
+
|
| 4 |
+
|
| 5 |
+
class LayerNorm(nn.Module):
    """Construct a layernorm module (see citation for details).

    NOTE(review): normalizes by (std + eps) using the sample std, matching the
    annotated-transformer reference rather than sqrt(var + eps).
    """

    def __init__(self, features, eps=1e-6):
        super(LayerNorm, self).__init__()
        self.a_2 = nn.Parameter(torch.ones(features))   # learnable gain
        self.b_2 = nn.Parameter(torch.zeros(features))  # learnable bias
        self.eps = eps

    def forward(self, x):
        mu = x.mean(-1, keepdim=True)
        sigma = x.std(-1, keepdim=True)
        normalized = (x - mu) / (sigma + self.eps)
        return self.a_2 * normalized + self.b_2
|
bert_pytorch/model/utils/sublayer.py
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import torch.nn as nn
|
| 2 |
+
from .layer_norm import LayerNorm
|
| 3 |
+
|
| 4 |
+
|
| 5 |
+
class SublayerConnection(nn.Module):
    """A residual connection followed by a layer norm.

    Note for code simplicity the norm is applied first (pre-LN) as opposed to last.
    """

    def __init__(self, size, dropout):
        super(SublayerConnection, self).__init__()
        self.norm = LayerNorm(size)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, sublayer):
        """Apply residual connection to any sublayer with the same size."""
        residual = x
        return residual + self.dropout(sublayer(self.norm(x)))
|
logparser/Drain.py
ADDED
|
@@ -0,0 +1,345 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Description : This file implements the Drain algorithm for log parsing
|
| 3 |
+
Author : LogPAI team
|
| 4 |
+
License : MIT
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
import re
|
| 8 |
+
import os
|
| 9 |
+
import numpy as np
|
| 10 |
+
import pandas as pd
|
| 11 |
+
import hashlib
|
| 12 |
+
from datetime import datetime
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
class Logcluster:
    """A group of log lines that all share one parsed template."""

    def __init__(self, logTemplate='', logIDL=None):
        self.logTemplate = logTemplate
        # Each cluster owns its own id list; a fresh one is created per instance.
        self.logIDL = [] if logIDL is None else logIDL
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
class Node:
    """A node of Drain's fixed-depth prefix tree."""

    def __init__(self, childD=None, depth=0, digitOrtoken=None):
        # childD maps token -> child Node (at a leaf it holds a list of clusters).
        self.childD = dict() if childD is None else childD
        self.depth = depth
        self.digitOrtoken = digitOrtoken
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
class LogParser:
    """Drain log parser: clusters raw log lines into templates via a fixed-depth prefix tree."""

    def __init__(self, log_format, indir='./', outdir='./result/', depth=4, st=0.4,
                 maxChild=100, rex=None, keep_para=True):
        """
        Attributes
        ----------
        rex : regular expressions used in preprocessing (step1)
        path : the input path stores the input log file name
        depth : depth of all leaf nodes
        st : similarity threshold
        maxChild : max number of children of an internal node
        logName : the name of the input file containing raw log messages
        savePath : the output path stores the file containing structured logs
        """
        self.path = indir
        # Root and the length layer occupy two levels, hence the -2.
        self.depth = depth - 2
        self.st = st
        self.maxChild = maxChild
        self.logName = None
        self.savePath = outdir
        self.df_log = None
        self.log_format = log_format
        # BUGFIX: rex used to default to a mutable [] shared by every instance;
        # use a None sentinel so each parser gets its own list.
        self.rex = [] if rex is None else rex
        self.keep_para = keep_para

    def hasNumbers(self, s):
        """Return True if *s* contains at least one digit character."""
        return any(char.isdigit() for char in s)

    def treeSearch(self, rn, seq):
        """Search the tree rooted at *rn* for a cluster matching token list *seq*.

        Returns the matched Logcluster or None.
        """
        retLogClust = None

        seqLen = len(seq)
        # First tree layer is keyed by message length.
        if seqLen not in rn.childD:
            return retLogClust

        parentn = rn.childD[seqLen]

        currentDepth = 1
        for token in seq:
            if currentDepth >= self.depth or currentDepth > seqLen:
                break

            if token in parentn.childD:
                parentn = parentn.childD[token]
            elif '<*>' in parentn.childD:
                # Fall back to the wildcard branch.
                parentn = parentn.childD['<*>']
            else:
                return retLogClust
            currentDepth += 1

        # At a leaf, childD holds the list of candidate clusters.
        logClustL = parentn.childD

        retLogClust = self.fastMatch(logClustL, seq)

        return retLogClust

    def addSeqToPrefixTree(self, rn, logClust):
        """Insert *logClust*'s template into the prefix tree rooted at *rn*."""
        seqLen = len(logClust.logTemplate)
        if seqLen not in rn.childD:
            firtLayerNode = Node(depth=1, digitOrtoken=seqLen)
            rn.childD[seqLen] = firtLayerNode
        else:
            firtLayerNode = rn.childD[seqLen]

        parentn = firtLayerNode

        currentDepth = 1
        for token in logClust.logTemplate:

            # Add current log cluster to the leaf node.
            if currentDepth >= self.depth or currentDepth > seqLen:
                if len(parentn.childD) == 0:
                    parentn.childD = [logClust]
                else:
                    parentn.childD.append(logClust)
                break

            # If token not matched in this layer of existing tree.
            if token not in parentn.childD:
                if not self.hasNumbers(token):
                    if '<*>' in parentn.childD:
                        if len(parentn.childD) < self.maxChild:
                            newNode = Node(depth=currentDepth + 1, digitOrtoken=token)
                            parentn.childD[token] = newNode
                            parentn = newNode
                        else:
                            parentn = parentn.childD['<*>']
                    else:
                        if len(parentn.childD) + 1 < self.maxChild:
                            newNode = Node(depth=currentDepth + 1, digitOrtoken=token)
                            parentn.childD[token] = newNode
                            parentn = newNode
                        elif len(parentn.childD) + 1 == self.maxChild:
                            # Last slot is reserved for the wildcard branch.
                            newNode = Node(depth=currentDepth + 1, digitOrtoken='<*>')
                            parentn.childD['<*>'] = newNode
                            parentn = newNode
                        else:
                            parentn = parentn.childD['<*>']

                else:
                    # Tokens containing digits are always routed to the wildcard.
                    if '<*>' not in parentn.childD:
                        newNode = Node(depth=currentDepth + 1, digitOrtoken='<*>')
                        parentn.childD['<*>'] = newNode
                        parentn = newNode
                    else:
                        parentn = parentn.childD['<*>']

            # If the token is matched.
            else:
                parentn = parentn.childD[token]

            currentDepth += 1

    # seq1 is template
    def seqDist(self, seq1, seq2):
        """Return (similarity, numOfPar) between template *seq1* and token list *seq2*."""
        assert len(seq1) == len(seq2)
        simTokens = 0
        numOfPar = 0

        for token1, token2 in zip(seq1, seq2):
            if token1 == '<*>':
                numOfPar += 1
                continue  # comment@haixuanguo: <*> == <*> are similar pairs
            if token1 == token2:
                simTokens += 1

        retVal = float(simTokens) / len(seq1)

        return retVal, numOfPar

    def fastMatch(self, logClustL, seq):
        """Pick the cluster in *logClustL* most similar to *seq*; None if below threshold."""
        retLogClust = None

        maxSim = -1
        maxNumOfPara = -1
        maxClust = None

        for logClust in logClustL:
            curSim, curNumOfPara = self.seqDist(logClust.logTemplate, seq)
            # Ties are broken in favour of the template with more parameters.
            if curSim > maxSim or (curSim == maxSim and curNumOfPara > maxNumOfPara):
                maxSim = curSim
                maxNumOfPara = curNumOfPara
                maxClust = logClust

        if maxSim >= self.st:
            retLogClust = maxClust

        return retLogClust

    def getTemplate(self, seq1, seq2):
        """Merge two equal-length token lists, replacing mismatches with '<*>'."""
        assert len(seq1) == len(seq2)
        retVal = []

        i = 0
        for word in seq1:
            if word == seq2[i]:
                retVal.append(word)
            else:
                retVal.append('<*>')

            i += 1

        return retVal

    def outputResult(self, logClustL):
        """Write <logName>_structured.csv and <logName>_templates.csv into savePath."""
        log_templates = [0] * self.df_log.shape[0]
        log_templateids = [0] * self.df_log.shape[0]
        # NOTE: a df_events accumulator that built an immediately-discarded
        # DataFrame here was removed as dead code.
        for logClust in logClustL:
            template_str = ' '.join(logClust.logTemplate)
            template_id = hashlib.md5(template_str.encode('utf-8')).hexdigest()[0:8]
            for logID in logClust.logIDL:
                logID -= 1  # LineId is 1-based; lists are 0-based.
                log_templates[logID] = template_str
                log_templateids[logID] = template_id

        self.df_log['EventId'] = log_templateids
        self.df_log['EventTemplate'] = log_templates

        if self.keep_para:
            self.df_log["ParameterList"] = self.df_log.apply(self.get_parameter_list, axis=1)
        self.df_log.to_csv(os.path.join(self.savePath, self.logName + '_structured.csv'), index=False)

        occ_dict = dict(self.df_log['EventTemplate'].value_counts())
        df_event = pd.DataFrame()
        df_event['EventTemplate'] = self.df_log['EventTemplate'].unique()
        df_event['EventId'] = df_event['EventTemplate'].map(lambda x: hashlib.md5(str(x).encode('utf-8')).hexdigest()[0:8])
        df_event['Occurrences'] = df_event['EventTemplate'].map(occ_dict)
        df_event.to_csv(os.path.join(self.savePath, self.logName + '_templates.csv'), index=False,
                        columns=["EventId", "EventTemplate", "Occurrences"])

    def printTree(self, node, dep):
        """Debug helper: print the prefix tree below *node*, indented by depth *dep*."""
        pStr = ''
        for i in range(dep):
            pStr += '\t'

        if node.depth == 0:
            pStr += 'Root'
        elif node.depth == 1:
            pStr += '<' + str(node.digitOrtoken) + '>'
        else:
            pStr += node.digitOrtoken

        print(pStr)

        if node.depth == self.depth:
            return 1
        for child in node.childD:
            self.printTree(node.childD[child], dep + 1)

    def parse(self, logName):
        """Parse the log file *logName* under self.path and write the results."""
        print('Parsing file: ' + os.path.join(self.path, logName))
        start_time = datetime.now()
        self.logName = logName
        rootNode = Node()
        logCluL = []

        self.load_data()

        count = 0
        for idx, line in self.df_log.iterrows():

            logID = line['LineId']
            logmessageL = self.preprocess(line['Content']).strip().split()
            # logmessageL = filter(lambda x: x != '', re.split('[\s=:,]', self.preprocess(line['Content'])))
            matchCluster = self.treeSearch(rootNode, logmessageL)

            # Match no existing log cluster
            if matchCluster is None:
                newCluster = Logcluster(logTemplate=logmessageL, logIDL=[logID])
                logCluL.append(newCluster)
                self.addSeqToPrefixTree(rootNode, newCluster)

            # Add the new log message to the existing cluster
            else:
                newTemplate = self.getTemplate(logmessageL, matchCluster.logTemplate)
                matchCluster.logIDL.append(logID)
                if ' '.join(newTemplate) != ' '.join(matchCluster.logTemplate):
                    matchCluster.logTemplate = newTemplate

            count += 1
            if count % 1000 == 0 or count == len(self.df_log):
                print('Processed {0:.1f}% of log lines.'.format(count * 100.0 / len(self.df_log)), end='\r')

        if not os.path.exists(self.savePath):
            os.makedirs(self.savePath)

        self.outputResult(logCluL)

        print('Parsing done. [Time taken: {!s}]'.format(datetime.now() - start_time))

    def load_data(self):
        """Read the raw log file into self.df_log using the configured log_format."""
        headers, regex = self.generate_logformat_regex(self.log_format)
        self.df_log = self.log_to_dataframe(os.path.join(self.path, self.logName), regex, headers, self.log_format)

    def preprocess(self, line):
        """Replace every configured regex match in *line* with the '<*>' wildcard."""
        for currentRex in self.rex:
            line = re.sub(currentRex, '<*>', line)
        return line

    def log_to_dataframe(self, log_file, regex, headers, logformat):
        """ Function to transform log file to dataframe
        """
        log_messages = []
        linecount = 0
        cnt = 0
        with open(log_file, 'r') as fin:
            for line in fin.readlines():
                cnt += 1
                try:
                    match = regex.search(line.strip())
                    message = [match.group(header) for header in headers]
                    log_messages.append(message)
                    linecount += 1
                except Exception as e:
                    # Deliberate best-effort: lines that do not match the
                    # log format are skipped rather than aborting the parse.
                    pass
        print("Total size after encoding is", linecount, cnt)
        logdf = pd.DataFrame(log_messages, columns=headers)
        logdf.insert(0, 'LineId', None)
        logdf['LineId'] = [i + 1 for i in range(linecount)]
        return logdf

    def generate_logformat_regex(self, logformat):
        """ Function to generate regular expression to split log messages
        """
        headers = []
        splitters = re.split(r'(<[^<>]+>)', logformat)
        regex = ''
        for k in range(len(splitters)):
            if k % 2 == 0:
                # Literal text between fields; space runs become \s+.
                splitter = re.sub(' +', r'\\s+', splitters[k])
                regex += splitter
            else:
                # A <Header> placeholder becomes a named capture group.
                header = splitters[k].strip('<').strip('>')
                regex += '(?P<%s>.*?)' % header
                headers.append(header)
        regex = re.compile('^' + regex + '$')
        return headers, regex

    def get_parameter_list(self, row):
        """Extract the parameter values of a structured row from its template."""
        template_regex = re.sub(r"<.{1,5}>", "<*>", str(row["EventTemplate"]))
        if "<*>" not in template_regex: return []
        template_regex = re.sub(r'([^A-Za-z0-9])', r'\\\1', template_regex)
        # BUGFIX: after the escaping above every space is preceded by a
        # backslash. The old pattern r' +' replaced only the space, leaving a
        # stray backslash that produced the regex '\\s+' (a literal backslash)
        # so multi-token templates never matched. Match the escaped-space
        # sequence instead, as upstream logpai does.
        template_regex = re.sub(r'\\ +', r'\\s+', template_regex)
        template_regex = "^" + template_regex.replace(r"\<\*\>", "(.*?)") + "$"
        parameter_list = re.findall(template_regex, row["Content"])
        parameter_list = parameter_list[0] if parameter_list else ()
        parameter_list = list(parameter_list) if isinstance(parameter_list, tuple) else [parameter_list]
        return parameter_list
|
logparser/Spell.py
ADDED
|
@@ -0,0 +1,356 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Description : This file implements the Spell algorithm for log parsing
|
| 3 |
+
Author : LogPAI team
|
| 4 |
+
License : MIT
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
import sys
|
| 8 |
+
# import re
|
| 9 |
+
import regex as re
|
| 10 |
+
import os
|
| 11 |
+
import numpy as np
|
| 12 |
+
import pandas as pd
|
| 13 |
+
import hashlib
|
| 14 |
+
from datetime import datetime
|
| 15 |
+
import string
|
| 16 |
+
from tqdm import tqdm
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
class LCSObject:
    """ Class object to store a log group with the same template
    """

    def __init__(self, logTemplate='', logIDL=None):
        self.logTemplate = logTemplate
        # BUGFIX: logIDL used to default to a mutable [] — every cluster created
        # without an explicit list shared ONE list, so appending a log id to any
        # such cluster appended it to all of them. Use a None sentinel instead.
        self.logIDL = [] if logIDL is None else logIDL
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
class Node:
    """A node in the Spell prefix tree data structure."""

    def __init__(self, token='', templateNo=0):
        self.logClust = None          # cluster ending at this node, if any
        self.token = token            # token labelling this node
        self.templateNo = templateNo  # number of templates passing through here
        self.childD = dict()          # token -> child Node
|
| 37 |
+
|
| 38 |
+
|
| 39 |
+
class LogParser:
|
| 40 |
+
""" LogParser class
|
| 41 |
+
Attributes
|
| 42 |
+
----------
|
| 43 |
+
path : the path of the input file
|
| 44 |
+
logName : the file name of the input file
|
| 45 |
+
savePath : the path of the output file
|
| 46 |
+
tau : how much percentage of tokens matched to merge a log message
|
| 47 |
+
"""
|
| 48 |
+
|
| 49 |
+
def __init__(self, indir='./', outdir='./result/', log_format=None, tau=0.5, rex=[], keep_para=True):
|
| 50 |
+
self.path = indir
|
| 51 |
+
self.logName = None
|
| 52 |
+
self.savePath = outdir
|
| 53 |
+
self.tau = tau
|
| 54 |
+
self.logformat = log_format
|
| 55 |
+
self.df_log = None
|
| 56 |
+
self.rex = rex
|
| 57 |
+
self.keep_para = keep_para
|
| 58 |
+
|
| 59 |
+
def LCS(self, seq1, seq2):
|
| 60 |
+
lengths = [[0 for j in range(len(seq2) + 1)] for i in range(len(seq1) + 1)]
|
| 61 |
+
# row 0 and column 0 are initialized to 0 already
|
| 62 |
+
for i in range(len(seq1)):
|
| 63 |
+
for j in range(len(seq2)):
|
| 64 |
+
if seq1[i] == seq2[j]:
|
| 65 |
+
lengths[i + 1][j + 1] = lengths[i][j] + 1
|
| 66 |
+
else:
|
| 67 |
+
lengths[i + 1][j + 1] = max(lengths[i + 1][j], lengths[i][j + 1])
|
| 68 |
+
|
| 69 |
+
# read the substring out from the matrix
|
| 70 |
+
result = []
|
| 71 |
+
lenOfSeq1, lenOfSeq2 = len(seq1), len(seq2)
|
| 72 |
+
while lenOfSeq1 != 0 and lenOfSeq2 != 0:
|
| 73 |
+
if lengths[lenOfSeq1][lenOfSeq2] == lengths[lenOfSeq1 - 1][lenOfSeq2]:
|
| 74 |
+
lenOfSeq1 -= 1
|
| 75 |
+
elif lengths[lenOfSeq1][lenOfSeq2] == lengths[lenOfSeq1][lenOfSeq2 - 1]:
|
| 76 |
+
lenOfSeq2 -= 1
|
| 77 |
+
else:
|
| 78 |
+
assert seq1[lenOfSeq1 - 1] == seq2[lenOfSeq2 - 1]
|
| 79 |
+
result.insert(0, seq1[lenOfSeq1 - 1])
|
| 80 |
+
lenOfSeq1 -= 1
|
| 81 |
+
lenOfSeq2 -= 1
|
| 82 |
+
return result
|
| 83 |
+
#for each seq, find the corresponding log key(template)
|
| 84 |
+
def SimpleLoopMatch(self, logClustL, seq):
|
| 85 |
+
for logClust in logClustL:
|
| 86 |
+
if float(len(logClust.logTemplate)) < 0.5 * len(seq):
|
| 87 |
+
continue
|
| 88 |
+
# Check the template is a subsequence of seq (we use set checking as a proxy here for speedup since
|
| 89 |
+
# incorrect-ordering bad cases rarely occur in logs)
|
| 90 |
+
token_set = set(seq)
|
| 91 |
+
if all(token in token_set or token == '<*>' for token in logClust.logTemplate):
|
| 92 |
+
return logClust
|
| 93 |
+
return None
|
| 94 |
+
|
| 95 |
+
def PrefixTreeMatch(self, parentn, seq, idx):
|
| 96 |
+
retLogClust = None
|
| 97 |
+
length = len(seq)
|
| 98 |
+
for i in range(idx, length):
|
| 99 |
+
if seq[i] in parentn.childD:
|
| 100 |
+
childn = parentn.childD[seq[i]]
|
| 101 |
+
if (childn.logClust is not None):
|
| 102 |
+
constLM = [w for w in childn.logClust.logTemplate if w != '<*>']
|
| 103 |
+
if float(len(constLM)) >= self.tau * length:
|
| 104 |
+
return childn.logClust
|
| 105 |
+
else:
|
| 106 |
+
return self.PrefixTreeMatch(childn, seq, i + 1)
|
| 107 |
+
|
| 108 |
+
return retLogClust
|
| 109 |
+
|
| 110 |
+
#for each seq, find the corresponding log template using LCS
|
| 111 |
+
def LCSMatch(self, logClustL, seq):
|
| 112 |
+
retLogClust = None
|
| 113 |
+
|
| 114 |
+
maxLen = -1
|
| 115 |
+
maxlcs = []
|
| 116 |
+
maxClust = None
|
| 117 |
+
set_seq = set(seq)
|
| 118 |
+
size_seq = len(seq)
|
| 119 |
+
for logClust in logClustL:
|
| 120 |
+
set_template = set(logClust.logTemplate)
|
| 121 |
+
if len(set_seq & set_template) < 0.5 * size_seq:
|
| 122 |
+
continue
|
| 123 |
+
lcs = self.LCS(seq, logClust.logTemplate)
|
| 124 |
+
if len(lcs) > maxLen or (len(lcs) == maxLen and len(logClust.logTemplate) < len(maxClust.logTemplate)):
|
| 125 |
+
maxLen = len(lcs)
|
| 126 |
+
maxlcs = lcs
|
| 127 |
+
maxClust = logClust
|
| 128 |
+
|
| 129 |
+
# LCS should be large then tau * len(itself)
|
| 130 |
+
if float(maxLen) >= self.tau * size_seq:
|
| 131 |
+
retLogClust = maxClust
|
| 132 |
+
|
| 133 |
+
return retLogClust
|
| 134 |
+
|
| 135 |
+
def getTemplate(self, lcs, seq):
|
| 136 |
+
retVal = []
|
| 137 |
+
if not lcs:
|
| 138 |
+
return retVal
|
| 139 |
+
|
| 140 |
+
lcs = lcs[::-1]
|
| 141 |
+
i = 0
|
| 142 |
+
for token in seq:
|
| 143 |
+
i += 1
|
| 144 |
+
if token == lcs[-1]:
|
| 145 |
+
retVal.append(token)
|
| 146 |
+
lcs.pop()
|
| 147 |
+
else:
|
| 148 |
+
retVal.append('<*>')
|
| 149 |
+
if not lcs:
|
| 150 |
+
break
|
| 151 |
+
if i < len(seq):
|
| 152 |
+
retVal.append('<*>')
|
| 153 |
+
return retVal
|
| 154 |
+
|
| 155 |
+
def addSeqToPrefixTree(self, rootn, newCluster):
|
| 156 |
+
parentn = rootn
|
| 157 |
+
seq = newCluster.logTemplate
|
| 158 |
+
seq = [w for w in seq if w != '<*>']
|
| 159 |
+
|
| 160 |
+
for i in range(len(seq)):
|
| 161 |
+
tokenInSeq = seq[i]
|
| 162 |
+
# Match
|
| 163 |
+
if tokenInSeq in parentn.childD:
|
| 164 |
+
parentn.childD[tokenInSeq].templateNo += 1
|
| 165 |
+
# Do not Match
|
| 166 |
+
else:
|
| 167 |
+
parentn.childD[tokenInSeq] = Node(token=tokenInSeq, templateNo=1)
|
| 168 |
+
parentn = parentn.childD[tokenInSeq]
|
| 169 |
+
|
| 170 |
+
if parentn.logClust is None:
|
| 171 |
+
parentn.logClust = newCluster
|
| 172 |
+
|
| 173 |
+
def removeSeqFromPrefixTree(self, rootn, newCluster):
|
| 174 |
+
parentn = rootn
|
| 175 |
+
seq = newCluster.logTemplate
|
| 176 |
+
seq = [w for w in seq if w != '<*>']
|
| 177 |
+
|
| 178 |
+
for tokenInSeq in seq:
|
| 179 |
+
if tokenInSeq in parentn.childD:
|
| 180 |
+
matchedNode = parentn.childD[tokenInSeq]
|
| 181 |
+
if matchedNode.templateNo == 1:
|
| 182 |
+
del parentn.childD[tokenInSeq]
|
| 183 |
+
break
|
| 184 |
+
else:
|
| 185 |
+
matchedNode.templateNo -= 1
|
| 186 |
+
parentn = matchedNode
|
| 187 |
+
|
| 188 |
+
def outputResult(self, logClustL):
|
| 189 |
+
print("output result", self.savePath)
|
| 190 |
+
templates = [0] * self.df_log.shape[0]
|
| 191 |
+
ids = [0] * self.df_log.shape[0]
|
| 192 |
+
df_event = []
|
| 193 |
+
|
| 194 |
+
for logclust in tqdm(logClustL):
|
| 195 |
+
template_str = ' '.join(logclust.logTemplate)
|
| 196 |
+
eid = hashlib.md5(template_str.encode('utf-8')).hexdigest()[0:8]
|
| 197 |
+
for logid in logclust.logIDL:
|
| 198 |
+
templates[logid - 1] = template_str
|
| 199 |
+
ids[logid - 1] = eid
|
| 200 |
+
df_event.append([eid, template_str, len(logclust.logIDL)])
|
| 201 |
+
|
| 202 |
+
df_event = pd.DataFrame(df_event, columns=['EventId', 'EventTemplate', 'Occurrences'])
|
| 203 |
+
|
| 204 |
+
self.df_log['EventId'] = ids
|
| 205 |
+
self.df_log['EventTemplate'] = templates
|
| 206 |
+
if self.keep_para:
|
| 207 |
+
self.df_log["ParameterList"] = self.df_log.apply(self.get_parameter_list, axis=1)
|
| 208 |
+
self.df_log.to_csv(os.path.join(self.savePath, self.logname + '_structured.csv'), index=False)
|
| 209 |
+
df_event.to_csv(os.path.join(self.savePath, self.logname + '_templates.csv'), index=False)
|
| 210 |
+
|
| 211 |
+
def printTree(self, node, dep):
|
| 212 |
+
pStr = ''
|
| 213 |
+
for i in range(len(dep)):
|
| 214 |
+
pStr += '\t'
|
| 215 |
+
|
| 216 |
+
if node.token == '':
|
| 217 |
+
pStr += 'Root'
|
| 218 |
+
else:
|
| 219 |
+
pStr += node.token
|
| 220 |
+
if node.logClust is not None:
|
| 221 |
+
pStr += '-->' + ' '.join(node.logClust.logTemplate)
|
| 222 |
+
print(pStr + ' (' + str(node.templateNo) + ')')
|
| 223 |
+
|
| 224 |
+
for child in node.childD:
|
| 225 |
+
self.printTree(node.childD[child], dep + 1)
|
| 226 |
+
|
| 227 |
+
def parse(self, logname):
    """Parse one log file into event clusters and write the CSV results.

    Pipeline per line: preprocess (regex masking), split on punctuation,
    then try three matchers of increasing cost — prefix-tree lookup,
    simple loop match, LCS match. Unmatched lines start a new cluster;
    matched lines may refine their cluster's template via LCS.
    """
    starttime = datetime.now()
    print('Parsing file: ' + os.path.join(self.path, logname))
    self.logname = logname
    self.load_data()
    rootNode = Node()
    logCluL = []
    # Punctuation used as token separators, minus '<', '*', '>' so the
    # '<*>' wildcard inserted by preprocess() survives the split.
    punc = re.sub('[<*>]', '', string.punctuation)
    count = 0
    for idx, line in self.df_log.iterrows():
        logID = line['LineId']
        # Tokenize the masked message, dropping empty/whitespace tokens.
        logmessageL = list(filter(lambda x: x.strip() != '', re.split(f'[{punc}]', self.preprocess(line['Content']))))
        # Constant tokens only (wildcards removed) for the cheap matchers.
        constLogMessL = [w for w in logmessageL if w != '<*>']
        #constLogMessL = [w for w in logmessageL]

        # Find an existing matched log cluster
        matchCluster = self.PrefixTreeMatch(rootNode, constLogMessL, 0)

        if matchCluster is None:
            matchCluster = self.SimpleLoopMatch(logCluL, constLogMessL)

            if matchCluster is None:
                # Most expensive fallback: LCS over the full token list.
                matchCluster = self.LCSMatch(logCluL, logmessageL)

                # Match no existing log cluster
                if matchCluster is None:
                    newCluster = LCSObject(logTemplate=logmessageL, logIDL=[logID])
                    logCluL.append(newCluster)
                    self.addSeqToPrefixTree(rootNode, newCluster)
                # Add the new log message to the existing cluster
                else:
                    # Refine the template to the LCS of old template and new
                    # message; re-index the prefix tree only if it changed.
                    newTemplate = self.getTemplate(self.LCS(logmessageL, matchCluster.logTemplate),
                                                   matchCluster.logTemplate)
                    if ' '.join(newTemplate) != ' '.join(matchCluster.logTemplate):
                        self.removeSeqFromPrefixTree(rootNode, matchCluster)
                        matchCluster.logTemplate = newTemplate
                        self.addSeqToPrefixTree(rootNode, matchCluster)
        if matchCluster:
            matchCluster.logIDL.append(logID)
        count += 1
        if count % 1000 == 0 or count == len(self.df_log):
            print('Processed {0:.1f}% of log lines.'.format(count * 100.0 / len(self.df_log)), end='\r')

    if not os.path.exists(self.savePath):
        os.makedirs(self.savePath)

    self.outputResult(logCluL)
    print('Parsing done. [Time taken: {!s}]'.format(datetime.now() - starttime))
def load_data(self):
    """Build ``self.df_log`` by parsing the raw log file with the configured format."""
    headers, regex = self.generate_logformat_regex(self.logformat)
    log_path = os.path.join(self.path, self.logname)
    self.df_log = self.log_to_dataframe(log_path, regex, headers, self.logformat)
def preprocess(self, line):
    """Return *line* with every match of the patterns in ``self.rex`` masked as '<*>'."""
    masked = line
    for pattern in self.rex:
        masked = re.sub(pattern, '<*>', masked)
    return masked
def log_to_dataframe(self, log_file, regex, headers, logformat):
    """Function to transform log file to dataframe
    """
    rows = []
    parsed = 0
    total = 0
    with open(log_file, 'r') as fin:
        for raw in fin.readlines():
            # Progress report for large files.
            total += 1
            if total % 10000 == 0:
                print("extracted {0} log lines from {1}".format(total, log_file))
            # Replace runs of non-ASCII (\x00-\x7F) characters with <NASCII>
            # so the header regex sees clean text.
            cleaned = re.sub(r'[^\x00-\x7F]+', '<NASCII>', raw)
            try:
                hit = regex.search(cleaned.strip())
                rows.append([hit.group(h) for h in headers])
                parsed += 1
            except Exception:
                # Lines not matching the log format are silently skipped.
                pass
    frame = pd.DataFrame(rows, columns=headers)
    frame.insert(0, 'LineId', None)
    frame['LineId'] = [i + 1 for i in range(parsed)]
    return frame
def generate_logformat_regex(self, logformat):
    """Function to generate regular expression to split log messages
    """
    headers = []
    pattern = ''
    # Odd-indexed pieces are '<Field>' placeholders, even-indexed are literals.
    pieces = re.split(r'(<[^<>]+>)', logformat)
    for idx, piece in enumerate(pieces):
        if idx % 2 == 1:
            # Placeholder: capture it lazily under its field name.
            name = piece.strip('<').strip('>')
            headers.append(name)
            pattern += '(?P<%s>.*?)' % name
        else:
            # Literal text: let any run of blanks match arbitrary whitespace.
            pattern += re.sub(' +', r'\\s+', piece)
    compiled = re.compile('^' + pattern + '$')
    return headers, compiled
def get_parameter_list(self, row):
    """Extract the concrete values that filled a row's template wildcards."""
    # Collapse blank-delimited short '<..>' placeholders into bare wildcards.
    tpl = re.sub(r"\s<.{1,5}>\s", "<*>", row["EventTemplate"])
    if "<*>" not in tpl:
        return []
    # Escape every non-alphanumeric char, then let escaped-blank runs
    # match any separator characters in the raw content.
    tpl = re.sub(r'([^A-Za-z0-9])', r'\\\1', tpl)
    tpl = re.sub(r'\\ +', r'[^A-Za-z0-9]+', tpl)
    matcher = "^" + tpl.replace("\\<\\*\\>", "(.*?)") + "$"
    found = re.findall(matcher, row["Content"])
    first = found[0] if found else ()
    values = list(first) if isinstance(first, tuple) else [first]
    # Trim surrounding punctuation/spaces from each captured value.
    return [value.strip(string.punctuation).strip(' ') for value in values]
if __name__ == "__main__":
    # Ad-hoc smoke test: re-run parameter extraction over a structured CSV
    # produced by a previous Spell parsing run.
    import os
    import pandas as pd
    print(os.getcwd())
    # NOTE(review): chdir assumes the script is launched from one directory
    # below the project root — confirm against how this module is invoked.
    os.chdir("../")
    print(os.getcwd())

    lp = LogParser()
    # print(lp.LCS(seq1="abcbb", seq2="bc"))
    output_dir = 'demo/Spell_result/'  # The output directory of parsing results
    log_file = 'HDFS.log'  # The input log file name
    log_structured_file = output_dir + log_file + "_structured.csv"
    df = pd.read_csv(log_structured_file)
    # Exercise get_parameter_list on every parsed row; results are discarded.
    for _, row in df.iterrows():
        lp.get_parameter_list(row)
model/bert/parameters.txt
ADDED
|
@@ -0,0 +1,40 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
device: cuda
|
| 2 |
+
output_dir: AI_MODELS/trained_models/Hadoop_logbert/
|
| 3 |
+
model_dir: AI_MODELS/trained_models/Hadoop_logbert/bert/
|
| 4 |
+
model_path: AI_MODELS/trained_models/Hadoop_logbert/bert/best_bert.pth
|
| 5 |
+
train_vocab: AI_MODELS/trained_models/Hadoop_logbert/train
|
| 6 |
+
vocab_path: AI_MODELS/trained_models/Hadoop_logbert/vocab.pkl
|
| 7 |
+
window_size: 64
|
| 8 |
+
adaptive_window: True
|
| 9 |
+
seq_len: 256
|
| 10 |
+
max_len: 256
|
| 11 |
+
min_len: 5
|
| 12 |
+
mask_ratio: 0.7
|
| 13 |
+
train_ratio: 1
|
| 14 |
+
valid_ratio: 0.25
|
| 15 |
+
test_ratio: 1
|
| 16 |
+
is_logkey: True
|
| 17 |
+
is_time: False
|
| 18 |
+
hypersphere_loss: True
|
| 19 |
+
hypersphere_loss_test: True
|
| 20 |
+
scale: standard
|
| 21 |
+
scale_path: AI_MODELS/trained_models/Hadoop_logbert/bert/scale.pkl
|
| 22 |
+
hidden: 512
|
| 23 |
+
layers: 6
|
| 24 |
+
attn_heads: 8
|
| 25 |
+
epochs: 2000
|
| 26 |
+
n_epochs_stop: 50
|
| 27 |
+
batch_size: 32
|
| 28 |
+
corpus_lines: None
|
| 29 |
+
on_memory: True
|
| 30 |
+
num_workers: 5
|
| 31 |
+
lr: 0.0003
|
| 32 |
+
adam_beta1: 0.9
|
| 33 |
+
adam_beta2: 0.999
|
| 34 |
+
adam_weight_decay: 0.0
|
| 35 |
+
with_cuda: True
|
| 36 |
+
cuda_devices: None
|
| 37 |
+
log_freq: None
|
| 38 |
+
num_candidates: 6
|
| 39 |
+
gaussian_mean: 0
|
| 40 |
+
gaussian_std: 1
|