MukeshKapoor25 commited on
Commit
3f90381
·
1 Parent(s): 9a2df03

gitignore

Browse files
.gitignore CHANGED
@@ -1,3 +1,12 @@
1
  model/bert/best_center.pt
2
  model/bert/best_total_dist.pt
3
  model/bert/best_bert.pth
 
 
 
 
 
 
 
 
 
 
1
  model/bert/best_center.pt
2
  model/bert/best_total_dist.pt
3
  model/bert/best_bert.pth
4
+ model/bert/train_log2.csv
5
+ model/bert/train_valid_loss.png
6
+ model/bert/valid_log2.csv
7
+ bert_pytorch/trainer/optim_schedule.py
8
+ bert_pytorch/trainer/pretrain.py
9
+ model/bert/test_abnormal_errors.pkl
10
+ model/bert/test_abnormal_results
11
+ model/bert/test_normal_errors.pkl
12
+ model/bert/test_normal_results
bert_pytorch/model/attention/multi_head.py ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch.nn as nn
2
+ from .single import Attention
3
+
4
+
5
class MultiHeadedAttention(nn.Module):
    """Multi-head scaled dot-product attention.

    Splits the model dimension into ``h`` independent heads, runs scaled
    dot-product attention on every head in parallel, then recombines the
    heads through a final linear projection.
    """

    def __init__(self, h, d_model, dropout=0.1):
        super().__init__()
        # The model dimension must divide evenly across the heads.
        assert d_model % h == 0

        # We assume d_v always equals d_k.
        self.d_k = d_model // h
        self.h = h

        # Three projections: one each for query, key and value.
        self.linear_layers = nn.ModuleList(nn.Linear(d_model, d_model) for _ in range(3))
        self.output_linear = nn.Linear(d_model, d_model)
        self.attention = Attention()

        self.dropout = nn.Dropout(p=dropout)

    def forward(self, query, key, value, mask=None):
        n_batches = query.size(0)

        # 1) Project each input and split the last dimension into heads:
        #    (batch, seq, d_model) -> (batch, h, seq, d_k).
        projected = []
        for proj, tensor in zip(self.linear_layers, (query, key, value)):
            heads = proj(tensor).view(n_batches, -1, self.h, self.d_k)
            projected.append(heads.transpose(1, 2))
        query, key, value = projected

        # 2) Attend over all heads at once.
        context, _ = self.attention(query, key, value, mask=mask, dropout=self.dropout)

        # 3) Merge the heads back: (batch, h, seq, d_k) -> (batch, seq, h*d_k).
        context = context.transpose(1, 2).contiguous().view(n_batches, -1, self.h * self.d_k)

        return self.output_linear(context)
bert_pytorch/model/attention/single.py ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch.nn as nn
2
+ import torch.nn.functional as F
3
+ import torch
4
+
5
+ import math
6
+
7
+
8
class Attention(nn.Module):
    """Scaled dot-product attention.

    Computes ``softmax(Q K^T / sqrt(d_k)) V`` and also returns the
    attention weight matrix.
    """

    def forward(self, query, key, value, mask=None, dropout=None):
        d_k = query.size(-1)
        # Similarity of every query against every key, scaled by sqrt(d_k).
        scores = query.matmul(key.transpose(-2, -1)) / math.sqrt(d_k)

        if mask is not None:
            # Push masked-out positions to effectively zero probability.
            scores = scores.masked_fill(mask == 0, -1e9)

        weights = F.softmax(scores, dim=-1)

        if dropout is not None:
            weights = dropout(weights)

        return weights.matmul(value), weights
bert_pytorch/model/utils/feed_forward.py ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch.nn as nn
2
+ from .gelu import GELU
3
+
4
+
5
class PositionwiseFeedForward(nn.Module):
    """Position-wise feed-forward network: ``W2(dropout(GELU(W1 x)))``."""

    def __init__(self, d_model, d_ff, dropout=0.1):
        super().__init__()
        self.w_1 = nn.Linear(d_model, d_ff)   # expand to the inner dimension
        self.w_2 = nn.Linear(d_ff, d_model)   # project back to the model dimension
        self.dropout = nn.Dropout(dropout)
        self.activation = GELU()

    def forward(self, x):
        hidden = self.activation(self.w_1(x))
        return self.w_2(self.dropout(hidden))
bert_pytorch/model/utils/gelu.py ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch.nn as nn
2
+ import torch
3
+ import math
4
+
5
+
6
class GELU(nn.Module):
    """Gaussian Error Linear Unit, tanh approximation.

    BERT uses GELU instead of ReLU (paper section 3.4, last paragraph).
    """

    def forward(self, x):
        # 0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x^3)))
        inner = math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))
        return 0.5 * x * (1 + torch.tanh(inner))
bert_pytorch/model/utils/layer_norm.py ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch.nn as nn
2
+ import torch
3
+
4
+
5
class LayerNorm(nn.Module):
    """Layer normalisation over the last dimension with learnable gain/bias.

    NOTE(review): ``torch.std`` defaults to the unbiased (Bessel-corrected)
    estimator and eps is added to the std (not the variance), so this is not
    numerically identical to ``nn.LayerNorm`` — confirm this is intended.
    """

    def __init__(self, features, eps=1e-6):
        super().__init__()
        self.a_2 = nn.Parameter(torch.ones(features))   # learnable gain
        self.b_2 = nn.Parameter(torch.zeros(features))  # learnable bias
        self.eps = eps  # guards against division by zero

    def forward(self, x):
        mu = x.mean(-1, keepdim=True)
        sigma = x.std(-1, keepdim=True)
        # Normalise over the last dimension, then rescale and shift.
        return self.a_2 * (x - mu) / (sigma + self.eps) + self.b_2
bert_pytorch/model/utils/sublayer.py ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch.nn as nn
2
+ from .layer_norm import LayerNorm
3
+
4
+
5
class SublayerConnection(nn.Module):
    """Pre-norm residual wrapper: ``x + dropout(sublayer(norm(x)))``.

    Note that for code simplicity the norm is applied first rather than last.
    """

    def __init__(self, size, dropout):
        super().__init__()
        self.norm = LayerNorm(size)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, sublayer):
        """Apply the residual connection to any sublayer of the same size."""
        residual = self.dropout(sublayer(self.norm(x)))
        return x + residual
logparser/Drain.py ADDED
@@ -0,0 +1,345 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Description : This file implements the Drain algorithm for log parsing
3
+ Author : LogPAI team
4
+ License : MIT
5
+ """
6
+
7
+ import re
8
+ import os
9
+ import numpy as np
10
+ import pandas as pd
11
+ import hashlib
12
+ from datetime import datetime
13
+
14
+
15
class Logcluster:
    """A group of log lines that share one parsed template.

    Attributes:
        logTemplate: token sequence describing the shared template.
        logIDL: LineIds of all log messages assigned to this cluster.
    """

    def __init__(self, logTemplate='', logIDL=None):
        self.logTemplate = logTemplate
        # Build a fresh list per instance; a shared default would leak state.
        self.logIDL = [] if logIDL is None else logIDL
21
+
22
+
23
class Node:
    """One node of Drain's fixed-depth prefix tree.

    Attributes:
        childD: children keyed by token (at the leaf layer it becomes a
            list of Logcluster objects instead).
        depth: depth of this node within the tree (root is 0).
        digitOrtoken: sequence length at depth 1, token text below that.
    """

    def __init__(self, childD=None, depth=0, digitOrtoken=None):
        # Build a fresh dict per instance; a shared default would leak state.
        self.childD = {} if childD is None else childD
        self.depth = depth
        self.digitOrtoken = digitOrtoken
30
+
31
+
32
class LogParser:
    """Drain: fixed-depth prefix-tree log parser (LogPAI implementation)."""

    def __init__(self, log_format, indir='./', outdir='./result/', depth=4, st=0.4,
                 maxChild=100, rex=[], keep_para=True):
        """
        Attributes
        ----------
        rex : regular expressions used in preprocessing (step1)
        path : the input path stores the input log file name
        depth : depth of all leaf nodes
        st : similarity threshold
        maxChild : max number of children of an internal node
        logName : the name of the input file containing raw log messages
        savePath : the output path stores the file containing structured logs
        """
        self.path = indir
        # Internal depth excludes the root and the sequence-length layer.
        self.depth = depth - 2
        self.st = st
        self.maxChild = maxChild
        self.logName = None
        self.savePath = outdir
        self.df_log = None
        self.log_format = log_format
        # NOTE(review): mutable default rex=[] is shared across instances;
        # safe only while it is never mutated in place — confirm.
        self.rex = rex
        self.keep_para = keep_para

    def hasNumbers(self, s):
        # True if any character of s is a digit.
        return any(char.isdigit() for char in s)

    def treeSearch(self, rn, seq):
        """Descend the prefix tree from root ``rn`` and return the best
        matching cluster for token sequence ``seq`` (None if no match)."""
        retLogClust = None

        seqLen = len(seq)
        # The first tree layer is keyed by sequence length.
        if seqLen not in rn.childD:
            return retLogClust

        parentn = rn.childD[seqLen]

        currentDepth = 1
        for token in seq:
            if currentDepth >= self.depth or currentDepth > seqLen:
                break

            if token in parentn.childD:
                parentn = parentn.childD[token]
            elif '<*>' in parentn.childD:
                # Fall back to the wildcard branch.
                parentn = parentn.childD['<*>']
            else:
                return retLogClust
            currentDepth += 1

        # Leaf reached: childD now holds a list of candidate clusters.
        logClustL = parentn.childD

        retLogClust = self.fastMatch(logClustL, seq)

        return retLogClust

    def addSeqToPrefixTree(self, rn, logClust):
        """Insert ``logClust``'s template into the prefix tree rooted at ``rn``."""
        seqLen = len(logClust.logTemplate)
        if seqLen not in rn.childD:
            firtLayerNode = Node(depth=1, digitOrtoken=seqLen)
            rn.childD[seqLen] = firtLayerNode
        else:
            firtLayerNode = rn.childD[seqLen]

        parentn = firtLayerNode

        currentDepth = 1
        for token in logClust.logTemplate:

            # Add current log cluster to the leaf node
            if currentDepth >= self.depth or currentDepth > seqLen:
                if len(parentn.childD) == 0:
                    # Leaf storage switches from dict to list of clusters.
                    parentn.childD = [logClust]
                else:
                    parentn.childD.append(logClust)
                break

            # If token not matched in this layer of existing tree.
            if token not in parentn.childD:
                if not self.hasNumbers(token):
                    if '<*>' in parentn.childD:
                        if len(parentn.childD) < self.maxChild:
                            newNode = Node(depth=currentDepth + 1, digitOrtoken=token)
                            parentn.childD[token] = newNode
                            parentn = newNode
                        else:
                            parentn = parentn.childD['<*>']
                    else:
                        if len(parentn.childD) + 1 < self.maxChild:
                            newNode = Node(depth=currentDepth + 1, digitOrtoken=token)
                            parentn.childD[token] = newNode
                            parentn = newNode
                        elif len(parentn.childD) + 1 == self.maxChild:
                            # Last free slot becomes the wildcard branch.
                            newNode = Node(depth=currentDepth + 1, digitOrtoken='<*>')
                            parentn.childD['<*>'] = newNode
                            parentn = newNode
                        else:
                            parentn = parentn.childD['<*>']

                else:
                    # Tokens containing digits are routed to the wildcard branch.
                    if '<*>' not in parentn.childD:
                        newNode = Node(depth=currentDepth + 1, digitOrtoken='<*>')
                        parentn.childD['<*>'] = newNode
                        parentn = newNode
                    else:
                        parentn = parentn.childD['<*>']

            # If the token is matched
            else:
                parentn = parentn.childD[token]

            currentDepth += 1

    # seq1 is template
    def seqDist(self, seq1, seq2):
        """Return (similarity ratio, number of wildcards) between template
        ``seq1`` and message token sequence ``seq2`` (equal lengths)."""
        assert len(seq1) == len(seq2)
        simTokens = 0
        numOfPar = 0

        for token1, token2 in zip(seq1, seq2):
            if token1 == '<*>':
                numOfPar += 1
                continue  # comment@haixuanguo: <*> == <*> are similar pairs
            if token1 == token2:
                simTokens += 1

        retVal = float(simTokens) / len(seq1)

        return retVal, numOfPar

    def fastMatch(self, logClustL, seq):
        """Pick the cluster whose template is most similar to ``seq``;
        return None unless the similarity reaches the threshold ``st``."""
        retLogClust = None

        maxSim = -1
        maxNumOfPara = -1
        maxClust = None

        for logClust in logClustL:
            curSim, curNumOfPara = self.seqDist(logClust.logTemplate, seq)
            # Ties are broken by preferring the template with more wildcards.
            if curSim > maxSim or (curSim == maxSim and curNumOfPara > maxNumOfPara):
                maxSim = curSim
                maxNumOfPara = curNumOfPara
                maxClust = logClust

        if maxSim >= self.st:
            retLogClust = maxClust

        return retLogClust

    def getTemplate(self, seq1, seq2):
        """Merge two equal-length sequences: keep agreeing tokens, replace
        disagreements with the '<*>' wildcard."""
        assert len(seq1) == len(seq2)
        retVal = []

        i = 0
        for word in seq1:
            if word == seq2[i]:
                retVal.append(word)
            else:
                retVal.append('<*>')

            i += 1

        return retVal

    def outputResult(self, logClustL):
        """Write <log>_structured.csv and <log>_templates.csv to savePath."""
        log_templates = [0] * self.df_log.shape[0]
        log_templateids = [0] * self.df_log.shape[0]
        df_events = []
        for logClust in logClustL:
            template_str = ' '.join(logClust.logTemplate)
            occurrence = len(logClust.logIDL)
            # Event id is the first 8 hex chars of the template's MD5.
            template_id = hashlib.md5(template_str.encode('utf-8')).hexdigest()[0:8]
            for logID in logClust.logIDL:
                logID -= 1  # LineId is 1-based; list index is 0-based.
                log_templates[logID] = template_str
                log_templateids[logID] = template_id
            df_events.append([template_id, template_str, occurrence])

        # NOTE(review): df_event is rebuilt from scratch below, so this frame
        # (and the df_events accumulation) is effectively discarded — confirm.
        df_event = pd.DataFrame(df_events, columns=['EventId', 'EventTemplate', 'Occurrences'])
        self.df_log['EventId'] = log_templateids
        self.df_log['EventTemplate'] = log_templates

        if self.keep_para:
            self.df_log["ParameterList"] = self.df_log.apply(self.get_parameter_list, axis=1)
        self.df_log.to_csv(os.path.join(self.savePath, self.logName + '_structured.csv'), index=False)

        occ_dict = dict(self.df_log['EventTemplate'].value_counts())
        df_event = pd.DataFrame()
        df_event['EventTemplate'] = self.df_log['EventTemplate'].unique()
        df_event['EventId'] = df_event['EventTemplate'].map(lambda x: hashlib.md5(str(x).encode('utf-8')).hexdigest()[0:8])
        df_event['Occurrences'] = df_event['EventTemplate'].map(occ_dict)
        df_event.to_csv(os.path.join(self.savePath, self.logName + '_templates.csv'), index=False,
                        columns=["EventId", "EventTemplate", "Occurrences"])

    def printTree(self, node, dep):
        """Debug helper: print the subtree rooted at ``node`` indented ``dep`` tabs."""
        pStr = ''
        for i in range(dep):
            pStr += '\t'

        if node.depth == 0:
            pStr += 'Root'
        elif node.depth == 1:
            # Depth-1 nodes are keyed by sequence length.
            pStr += '<' + str(node.digitOrtoken) + '>'
        else:
            pStr += node.digitOrtoken

        print(pStr)

        if node.depth == self.depth:
            return 1
        for child in node.childD:
            self.printTree(node.childD[child], dep + 1)

    def parse(self, logName):
        """Parse ``logName`` from ``self.path`` and write structured output."""
        print('Parsing file: ' + os.path.join(self.path, logName))
        start_time = datetime.now()
        self.logName = logName
        rootNode = Node()
        logCluL = []

        self.load_data()

        count = 0
        for idx, line in self.df_log.iterrows():

            logID = line['LineId']
            # Mask known variable parts, then tokenise on whitespace.
            logmessageL = self.preprocess(line['Content']).strip().split()
            # logmessageL = filter(lambda x: x != '', re.split('[\s=:,]', self.preprocess(line['Content'])))
            matchCluster = self.treeSearch(rootNode, logmessageL)

            # Match no existing log cluster
            if matchCluster is None:
                newCluster = Logcluster(logTemplate=logmessageL, logIDL=[logID])
                logCluL.append(newCluster)
                self.addSeqToPrefixTree(rootNode, newCluster)

            # Add the new log message to the existing cluster
            else:
                newTemplate = self.getTemplate(logmessageL, matchCluster.logTemplate)
                matchCluster.logIDL.append(logID)
                if ' '.join(newTemplate) != ' '.join(matchCluster.logTemplate):
                    matchCluster.logTemplate = newTemplate

            count += 1
            if count % 1000 == 0 or count == len(self.df_log):
                print('Processed {0:.1f}% of log lines.'.format(count * 100.0 / len(self.df_log)), end='\r')

        if not os.path.exists(self.savePath):
            os.makedirs(self.savePath)

        self.outputResult(logCluL)

        print('Parsing done. [Time taken: {!s}]'.format(datetime.now() - start_time))

    def load_data(self):
        # Build the per-format regex, then materialise the log as a DataFrame.
        headers, regex = self.generate_logformat_regex(self.log_format)
        self.df_log = self.log_to_dataframe(os.path.join(self.path, self.logName), regex, headers, self.log_format)

    def preprocess(self, line):
        # Replace every configured regex match with the '<*>' wildcard.
        for currentRex in self.rex:
            line = re.sub(currentRex, '<*>', line)
        return line

    def log_to_dataframe(self, log_file, regex, headers, logformat):
        """ Function to transform log file to dataframe
        """
        log_messages = []
        linecount = 0
        cnt = 0
        with open(log_file, 'r') as fin:
            for line in fin.readlines():
                cnt += 1
                try:
                    match = regex.search(line.strip())
                    message = [match.group(header) for header in headers]
                    log_messages.append(message)
                    linecount += 1
                except Exception as e:
                    # NOTE(review): lines that fail the format regex are
                    # silently dropped (linecount vs cnt shows how many).
                    # print("\n", line)
                    # print(e)
                    pass
        print("Total size after encoding is", linecount, cnt)
        logdf = pd.DataFrame(log_messages, columns=headers)
        logdf.insert(0, 'LineId', None)
        logdf['LineId'] = [i + 1 for i in range(linecount)]
        return logdf

    def generate_logformat_regex(self, logformat):
        """ Function to generate regular expression to split log messages
        """
        headers = []
        # Split "<Date> <Level> <Content>"-style formats into literal text
        # (even indices) and header names (odd indices).
        splitters = re.split(r'(<[^<>]+>)', logformat)
        regex = ''
        for k in range(len(splitters)):
            if k % 2 == 0:
                splitter = re.sub(' +', '\\\s+', splitters[k])
                regex += splitter
            else:
                header = splitters[k].strip('<').strip('>')
                regex += '(?P<%s>.*?)' % header
                headers.append(header)
        regex = re.compile('^' + regex + '$')
        return headers, regex

    def get_parameter_list(self, row):
        """Extract the concrete values a log line substituted for the '<*>'
        wildcards of its event template."""
        template_regex = re.sub(r"<.{1,5}>", "<*>", str(row["EventTemplate"]))
        if "<*>" not in template_regex: return []
        # Escape everything, then turn wildcards back into capture groups.
        template_regex = re.sub(r'([^A-Za-z0-9])', r'\\\1', template_regex)
        template_regex = re.sub(r' +', r'\\s+', template_regex)
        template_regex = "^" + template_regex.replace("\<\*\>", "(.*?)") + "$"
        parameter_list = re.findall(template_regex, row["Content"])
        parameter_list = parameter_list[0] if parameter_list else ()
        parameter_list = list(parameter_list) if isinstance(parameter_list, tuple) else [parameter_list]
        return parameter_list
logparser/Spell.py ADDED
@@ -0,0 +1,356 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Description : This file implements the Spell algorithm for log parsing
3
+ Author : LogPAI team
4
+ License : MIT
5
+ """
6
+
7
+ import sys
8
+ # import re
9
+ import regex as re
10
+ import os
11
+ import numpy as np
12
+ import pandas as pd
13
+ import hashlib
14
+ from datetime import datetime
15
+ import string
16
+ from tqdm import tqdm
17
+
18
+
19
class LCSObject:
    """A log group (cluster) whose member lines share one template.

    Attributes:
        logTemplate: token sequence representing the parsed template.
        logIDL: LineIds of the log messages assigned to this group.
    """

    def __init__(self, logTemplate='', logIDL=None):
        self.logTemplate = logTemplate
        # Bug fix: the original default ``logIDL=[]`` was a mutable default
        # argument — one list shared by every instance constructed without
        # an explicit logIDL, so appends leaked across clusters. Use a None
        # sentinel and build a fresh list per instance instead.
        self.logIDL = [] if logIDL is None else logIDL
26
+
27
+
28
class Node:
    """A node in Spell's prefix tree.

    Attributes:
        logClust: the LCSObject whose constant-token path ends here, if any.
        token: the token this node represents ('' for the root).
        templateNo: number of templates whose path passes through this node.
        childD: child nodes keyed by token.
    """

    def __init__(self, token='', templateNo=0):
        self.logClust = None
        self.token = token
        self.templateNo = templateNo
        self.childD = {}
37
+
38
+
39
class LogParser:
    """ LogParser class
    Attributes
    ----------
    path : the path of the input file
    logName : the file name of the input file
    savePath : the path of the output file
    tau : how much percentage of tokens matched to merge a log message
    """

    def __init__(self, indir='./', outdir='./result/', log_format=None, tau=0.5, rex=[], keep_para=True):
        self.path = indir
        self.logName = None
        self.savePath = outdir
        self.tau = tau
        self.logformat = log_format
        self.df_log = None
        # NOTE(review): mutable default rex=[] is shared across instances;
        # safe only while it is never mutated in place — confirm.
        self.rex = rex
        self.keep_para = keep_para

    def LCS(self, seq1, seq2):
        """Longest common subsequence of two token sequences (classic DP)."""
        lengths = [[0 for j in range(len(seq2) + 1)] for i in range(len(seq1) + 1)]
        # row 0 and column 0 are initialized to 0 already
        for i in range(len(seq1)):
            for j in range(len(seq2)):
                if seq1[i] == seq2[j]:
                    lengths[i + 1][j + 1] = lengths[i][j] + 1
                else:
                    lengths[i + 1][j + 1] = max(lengths[i + 1][j], lengths[i][j + 1])

        # read the substring out from the matrix
        result = []
        lenOfSeq1, lenOfSeq2 = len(seq1), len(seq2)
        while lenOfSeq1 != 0 and lenOfSeq2 != 0:
            if lengths[lenOfSeq1][lenOfSeq2] == lengths[lenOfSeq1 - 1][lenOfSeq2]:
                lenOfSeq1 -= 1
            elif lengths[lenOfSeq1][lenOfSeq2] == lengths[lenOfSeq1][lenOfSeq2 - 1]:
                lenOfSeq2 -= 1
            else:
                assert seq1[lenOfSeq1 - 1] == seq2[lenOfSeq2 - 1]
                result.insert(0, seq1[lenOfSeq1 - 1])
                lenOfSeq1 -= 1
                lenOfSeq2 -= 1
        return result

    # for each seq, find the corresponding log key(template)
    def SimpleLoopMatch(self, logClustL, seq):
        """Linear scan: return the first cluster whose tokens all occur in
        ``seq`` (or are wildcards) and whose template is at least half as long."""
        for logClust in logClustL:
            if float(len(logClust.logTemplate)) < 0.5 * len(seq):
                continue
            # Check the template is a subsequence of seq (we use set checking as a proxy here for speedup since
            # incorrect-ordering bad cases rarely occur in logs)
            token_set = set(seq)
            if all(token in token_set or token == '<*>' for token in logClust.logTemplate):
                return logClust
        return None

    def PrefixTreeMatch(self, parentn, seq, idx):
        """Walk the prefix tree along ``seq`` (starting at token ``idx``) and
        return the first stored cluster that satisfies the tau threshold."""
        retLogClust = None
        length = len(seq)
        for i in range(idx, length):
            if seq[i] in parentn.childD:
                childn = parentn.childD[seq[i]]
                if (childn.logClust is not None):
                    constLM = [w for w in childn.logClust.logTemplate if w != '<*>']
                    # Accept only if enough constant tokens are matched.
                    if float(len(constLM)) >= self.tau * length:
                        return childn.logClust
                else:
                    return self.PrefixTreeMatch(childn, seq, i + 1)

        return retLogClust

    # for each seq, find the corresponding log template using LCS
    def LCSMatch(self, logClustL, seq):
        retLogClust = None

        maxLen = -1
        maxlcs = []
        maxClust = None
        set_seq = set(seq)
        size_seq = len(seq)
        for logClust in logClustL:
            set_template = set(logClust.logTemplate)
            # Cheap pre-filter: skip clusters sharing < half of seq's tokens.
            if len(set_seq & set_template) < 0.5 * size_seq:
                continue
            lcs = self.LCS(seq, logClust.logTemplate)
            # Prefer the longest LCS; break ties with the shorter template.
            if len(lcs) > maxLen or (len(lcs) == maxLen and len(logClust.logTemplate) < len(maxClust.logTemplate)):
                maxLen = len(lcs)
                maxlcs = lcs
                maxClust = logClust

        # LCS should be large then tau * len(itself)
        if float(maxLen) >= self.tau * size_seq:
            retLogClust = maxClust

        return retLogClust

    def getTemplate(self, lcs, seq):
        """Rebuild a template from ``seq``: tokens in the LCS are kept,
        everything else becomes the '<*>' wildcard."""
        retVal = []
        if not lcs:
            return retVal

        # Reverse so the next expected LCS token is always at the end (pop()).
        lcs = lcs[::-1]
        i = 0
        for token in seq:
            i += 1
            if token == lcs[-1]:
                retVal.append(token)
                lcs.pop()
            else:
                retVal.append('<*>')
            if not lcs:
                break
        # Trailing tokens beyond the LCS collapse into a single wildcard.
        if i < len(seq):
            retVal.append('<*>')
        return retVal

    def addSeqToPrefixTree(self, rootn, newCluster):
        """Insert the constant tokens of ``newCluster`` into the prefix tree."""
        parentn = rootn
        seq = newCluster.logTemplate
        seq = [w for w in seq if w != '<*>']

        for i in range(len(seq)):
            tokenInSeq = seq[i]
            # Match
            if tokenInSeq in parentn.childD:
                parentn.childD[tokenInSeq].templateNo += 1
            # Do not Match
            else:
                parentn.childD[tokenInSeq] = Node(token=tokenInSeq, templateNo=1)
            parentn = parentn.childD[tokenInSeq]

        if parentn.logClust is None:
            parentn.logClust = newCluster

    def removeSeqFromPrefixTree(self, rootn, newCluster):
        """Remove one reference to ``newCluster``'s constant-token path,
        pruning the subtree once a node's reference count drops to zero."""
        parentn = rootn
        seq = newCluster.logTemplate
        seq = [w for w in seq if w != '<*>']

        for tokenInSeq in seq:
            if tokenInSeq in parentn.childD:
                matchedNode = parentn.childD[tokenInSeq]
                if matchedNode.templateNo == 1:
                    del parentn.childD[tokenInSeq]
                    break
                else:
                    matchedNode.templateNo -= 1
                    parentn = matchedNode

    def outputResult(self, logClustL):
        """Write <log>_structured.csv and <log>_templates.csv to savePath."""
        print("output result", self.savePath)
        templates = [0] * self.df_log.shape[0]
        ids = [0] * self.df_log.shape[0]
        df_event = []

        for logclust in tqdm(logClustL):
            template_str = ' '.join(logclust.logTemplate)
            # Event id is the first 8 hex chars of the template's MD5.
            eid = hashlib.md5(template_str.encode('utf-8')).hexdigest()[0:8]
            for logid in logclust.logIDL:
                # LineId is 1-based; list index is 0-based.
                templates[logid - 1] = template_str
                ids[logid - 1] = eid
            df_event.append([eid, template_str, len(logclust.logIDL)])

        df_event = pd.DataFrame(df_event, columns=['EventId', 'EventTemplate', 'Occurrences'])

        self.df_log['EventId'] = ids
        self.df_log['EventTemplate'] = templates
        if self.keep_para:
            self.df_log["ParameterList"] = self.df_log.apply(self.get_parameter_list, axis=1)
        self.df_log.to_csv(os.path.join(self.savePath, self.logname + '_structured.csv'), index=False)
        df_event.to_csv(os.path.join(self.savePath, self.logname + '_templates.csv'), index=False)

    def printTree(self, node, dep):
        """Debug helper: print the subtree rooted at ``node``.

        NOTE(review): ``dep`` is an int (the recursive call passes ``dep + 1``),
        so ``range(len(dep))`` raises TypeError — this method appears broken
        or unused; confirm before relying on it.
        """
        pStr = ''
        for i in range(len(dep)):
            pStr += '\t'

        if node.token == '':
            pStr += 'Root'
        else:
            pStr += node.token
            if node.logClust is not None:
                pStr += '-->' + ' '.join(node.logClust.logTemplate)
        print(pStr + ' (' + str(node.templateNo) + ')')

        for child in node.childD:
            self.printTree(node.childD[child], dep + 1)

    def parse(self, logname):
        """Parse ``logname`` from ``self.path`` and write structured output."""
        starttime = datetime.now()
        print('Parsing file: ' + os.path.join(self.path, logname))
        self.logname = logname
        self.load_data()
        rootNode = Node()
        logCluL = []
        # Punctuation used for tokenising, minus the wildcard chars <, *, >.
        punc = re.sub('[<*>]', '', string.punctuation)
        count = 0
        for idx, line in self.df_log.iterrows():
            logID = line['LineId']
            logmessageL = list(filter(lambda x: x.strip() != '', re.split(f'[{punc}]', self.preprocess(line['Content']))))
            constLogMessL = [w for w in logmessageL if w != '<*>']
            #constLogMessL = [w for w in logmessageL]

            # Find an existing matched log cluster
            matchCluster = self.PrefixTreeMatch(rootNode, constLogMessL, 0)

            if matchCluster is None:
                matchCluster = self.SimpleLoopMatch(logCluL, constLogMessL)

                if matchCluster is None:
                    matchCluster = self.LCSMatch(logCluL, logmessageL)

                    # Match no existing log cluster
                    if matchCluster is None:
                        newCluster = LCSObject(logTemplate=logmessageL, logIDL=[logID])
                        logCluL.append(newCluster)
                        self.addSeqToPrefixTree(rootNode, newCluster)
                    # Add the new log message to the existing cluster
                    else:
                        newTemplate = self.getTemplate(self.LCS(logmessageL, matchCluster.logTemplate),
                                                       matchCluster.logTemplate)
                        if ' '.join(newTemplate) != ' '.join(matchCluster.logTemplate):
                            # Template changed: re-index it in the prefix tree.
                            self.removeSeqFromPrefixTree(rootNode, matchCluster)
                            matchCluster.logTemplate = newTemplate
                            self.addSeqToPrefixTree(rootNode, matchCluster)
            if matchCluster:
                matchCluster.logIDL.append(logID)
            count += 1
            if count % 1000 == 0 or count == len(self.df_log):
                print('Processed {0:.1f}% of log lines.'.format(count * 100.0 / len(self.df_log)), end='\r')

        if not os.path.exists(self.savePath):
            os.makedirs(self.savePath)

        self.outputResult(logCluL)
        print('Parsing done. [Time taken: {!s}]'.format(datetime.now() - starttime))

    def load_data(self):
        # Build the per-format regex, then materialise the log as a DataFrame.
        headers, regex = self.generate_logformat_regex(self.logformat)
        self.df_log = self.log_to_dataframe(os.path.join(self.path, self.logname), regex, headers, self.logformat)

    def preprocess(self, line):
        # Replace every configured regex match with the '<*>' wildcard.
        for currentRex in self.rex:
            line = re.sub(currentRex, '<*>', line)
        return line

    def log_to_dataframe(self, log_file, regex, headers, logformat):
        """ Function to transform log file to dataframe
        """
        log_messages = []
        linecount = 0
        k = 0
        with open(log_file, 'r') as fin:
            for line in fin.readlines():
                #extract small size data
                k += 1
                if k%10000 == 0:
                    print("extracted {0} log lines from {1}".format(k, log_file))
                line = re.sub(r'[^\x00-\x7F]+', '<NASCII>', line) #replace non ASCII (\x00-\x7F) character with <NASCII>
                try:
                    match = regex.search(line.strip())
                    message = [match.group(header) for header in headers]
                    log_messages.append(message)
                    linecount += 1
                except Exception as e:
                    # NOTE(review): lines that fail the format regex are
                    # silently dropped — confirm this is acceptable.
                    pass
        logdf = pd.DataFrame(log_messages, columns=headers)
        logdf.insert(0, 'LineId', None)
        logdf['LineId'] = [i + 1 for i in range(linecount)]
        return logdf

    def generate_logformat_regex(self, logformat):
        """ Function to generate regular expression to split log messages
        """
        headers = []
        # Split "<Date> <Level> <Content>"-style formats into literal text
        # (even indices) and header names (odd indices).
        splitters = re.split(r'(<[^<>]+>)', logformat)
        regex = ''
        for k in range(len(splitters)):
            if k % 2 == 0:
                splitter = re.sub(' +', '\\\s+', splitters[k]) #re.sub(' +', '\s+', splitters[k])

                regex += splitter
            else:
                header = splitters[k].strip('<').strip('>')
                regex += '(?P<%s>.*?)' % header
                headers.append(header)
        regex = re.compile('^' + regex + '$')
        return headers, regex

    def get_parameter_list(self, row):
        """Extract the concrete values a log line substituted for the '<*>'
        wildcards of its event template."""
        template_regex = re.sub(r"\s<.{1,5}>\s", "<*>", row["EventTemplate"])
        if "<*>" not in template_regex: return []
        # Escape everything, then turn wildcards back into capture groups.
        template_regex = re.sub(r'([^A-Za-z0-9])', r'\\\1', template_regex)
        template_regex = re.sub(r'\\ +', r'[^A-Za-z0-9]+', template_regex)
        template_regex = "^" + template_regex.replace("\<\*\>", "(.*?)") + "$"
        parameter_list = re.findall(template_regex, row["Content"])
        parameter_list = parameter_list[0] if parameter_list else ()
        parameter_list = list(parameter_list) if isinstance(parameter_list, tuple) else [parameter_list]
        parameter_list = [para.strip(string.punctuation).strip(' ') for para in parameter_list]
        return parameter_list
339
+
340
+
341
# Ad-hoc demo: re-extract parameter lists from a previously parsed HDFS run.
if __name__ == "__main__":
    import os
    import pandas as pd
    print(os.getcwd())
    os.chdir("../")  # run relative to the repository root
    print(os.getcwd())

    # NOTE(review): LogParser() is constructed without a log_format
    # (log_format defaults to None), so only get_parameter_list — which does
    # not need it — can safely be called on this instance; confirm.
    lp = LogParser()
    # print(lp.LCS(seq1="abcbb", seq2="bc"))
    output_dir = 'demo/Spell_result/' # The output directory of parsing results
    log_file = 'HDFS.log' # The input log file name
    log_structured_file = output_dir + log_file + "_structured.csv"
    df = pd.read_csv(log_structured_file)
    for _, row in df.iterrows():
        lp.get_parameter_list(row)
356
+
model/bert/parameters.txt ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ device: cuda
2
+ output_dir: AI_MODELS/trained_models/Hadoop_logbert/
3
+ model_dir: AI_MODELS/trained_models/Hadoop_logbert/bert/
4
+ model_path: AI_MODELS/trained_models/Hadoop_logbert/bert/best_bert.pth
5
+ train_vocab: AI_MODELS/trained_models/Hadoop_logbert/train
6
+ vocab_path: AI_MODELS/trained_models/Hadoop_logbert/vocab.pkl
7
+ window_size: 64
8
+ adaptive_window: True
9
+ seq_len: 256
10
+ max_len: 256
11
+ min_len: 5
12
+ mask_ratio: 0.7
13
+ train_ratio: 1
14
+ valid_ratio: 0.25
15
+ test_ratio: 1
16
+ is_logkey: True
17
+ is_time: False
18
+ hypersphere_loss: True
19
+ hypersphere_loss_test: True
20
+ scale: standard
21
+ scale_path: AI_MODELS/trained_models/Hadoop_logbert/bert/scale.pkl
22
+ hidden: 512
23
+ layers: 6
24
+ attn_heads: 8
25
+ epochs: 2000
26
+ n_epochs_stop: 50
27
+ batch_size: 32
28
+ corpus_lines: None
29
+ on_memory: True
30
+ num_workers: 5
31
+ lr: 0.0003
32
+ adam_beta1: 0.9
33
+ adam_beta2: 0.999
34
+ adam_weight_decay: 0.0
35
+ with_cuda: True
36
+ cuda_devices: None
37
+ log_freq: None
38
+ num_candidates: 6
39
+ gaussian_mean: 0
40
+ gaussian_std: 1