# MukeshKapoor25's picture
# gitignore
# 3f90381
"""
Description : This file implements the Drain algorithm for log parsing
Author : LogPAI team
License : MIT
"""
import re
import os
import numpy as np
import pandas as pd
import hashlib
from datetime import datetime
class Logcluster:
    """A cluster of log messages that share one parsed template.

    Attributes
    ----------
    logTemplate : list of str
        Token sequence of the template; '<*>' marks a parameter position.
    logIDL : list
        LineIds of every raw log message assigned to this cluster.
    """

    def __init__(self, logTemplate='', logIDL=None):
        self.logTemplate = logTemplate
        # Build a fresh list per instance (avoids the shared mutable default).
        self.logIDL = [] if logIDL is None else logIDL
class Node:
    """One node of the Drain prefix tree.

    Attributes
    ----------
    childD : dict or list
        Children keyed by token; at the leaf layer this becomes a plain
        list of Logcluster objects.
    depth : int
        Depth of this node in the tree (root is 0).
    digitOrtoken :
        The token this node represents (or the sequence length at layer 1).
    """

    def __init__(self, childD=None, depth=0, digitOrtoken=None):
        self.childD = {} if childD is None else childD
        self.depth = depth
        self.digitOrtoken = digitOrtoken
class LogParser:
def __init__(self, log_format, indir='./', outdir='./result/', depth=4, st=0.4,
maxChild=100, rex=[], keep_para=True):
"""
Attributes
----------
rex : regular expressions used in preprocessing (step1)
path : the input path stores the input log file name
depth : depth of all leaf nodes
st : similarity threshold
maxChild : max number of children of an internal node
logName : the name of the input file containing raw log messages
savePath : the output path stores the file containing structured logs
"""
self.path = indir
self.depth = depth - 2
self.st = st
self.maxChild = maxChild
self.logName = None
self.savePath = outdir
self.df_log = None
self.log_format = log_format
self.rex = rex
self.keep_para = keep_para
def hasNumbers(self, s):
return any(char.isdigit() for char in s)
def treeSearch(self, rn, seq):
retLogClust = None
seqLen = len(seq)
if seqLen not in rn.childD:
return retLogClust
parentn = rn.childD[seqLen]
currentDepth = 1
for token in seq:
if currentDepth >= self.depth or currentDepth > seqLen:
break
if token in parentn.childD:
parentn = parentn.childD[token]
elif '<*>' in parentn.childD:
parentn = parentn.childD['<*>']
else:
return retLogClust
currentDepth += 1
logClustL = parentn.childD
retLogClust = self.fastMatch(logClustL, seq)
return retLogClust
def addSeqToPrefixTree(self, rn, logClust):
seqLen = len(logClust.logTemplate)
if seqLen not in rn.childD:
firtLayerNode = Node(depth=1, digitOrtoken=seqLen)
rn.childD[seqLen] = firtLayerNode
else:
firtLayerNode = rn.childD[seqLen]
parentn = firtLayerNode
currentDepth = 1
for token in logClust.logTemplate:
# Add current log cluster to the leaf node
if currentDepth >= self.depth or currentDepth > seqLen:
if len(parentn.childD) == 0:
parentn.childD = [logClust]
else:
parentn.childD.append(logClust)
break
# If token not matched in this layer of existing tree.
if token not in parentn.childD:
if not self.hasNumbers(token):
if '<*>' in parentn.childD:
if len(parentn.childD) < self.maxChild:
newNode = Node(depth=currentDepth + 1, digitOrtoken=token)
parentn.childD[token] = newNode
parentn = newNode
else:
parentn = parentn.childD['<*>']
else:
if len(parentn.childD) + 1 < self.maxChild:
newNode = Node(depth=currentDepth + 1, digitOrtoken=token)
parentn.childD[token] = newNode
parentn = newNode
elif len(parentn.childD) + 1 == self.maxChild:
newNode = Node(depth=currentDepth + 1, digitOrtoken='<*>')
parentn.childD['<*>'] = newNode
parentn = newNode
else:
parentn = parentn.childD['<*>']
else:
if '<*>' not in parentn.childD:
newNode = Node(depth=currentDepth + 1, digitOrtoken='<*>')
parentn.childD['<*>'] = newNode
parentn = newNode
else:
parentn = parentn.childD['<*>']
# If the token is matched
else:
parentn = parentn.childD[token]
currentDepth += 1
# seq1 is template
def seqDist(self, seq1, seq2):
assert len(seq1) == len(seq2)
simTokens = 0
numOfPar = 0
for token1, token2 in zip(seq1, seq2):
if token1 == '<*>':
numOfPar += 1
continue #comment@haixuanguo: <*> == <*> are similar pairs
if token1 == token2:
simTokens += 1
retVal = float(simTokens) / len(seq1)
return retVal, numOfPar
def fastMatch(self, logClustL, seq):
retLogClust = None
maxSim = -1
maxNumOfPara = -1
maxClust = None
for logClust in logClustL:
curSim, curNumOfPara = self.seqDist(logClust.logTemplate, seq)
if curSim > maxSim or (curSim == maxSim and curNumOfPara > maxNumOfPara):
maxSim = curSim
maxNumOfPara = curNumOfPara
maxClust = logClust
if maxSim >= self.st:
retLogClust = maxClust
return retLogClust
def getTemplate(self, seq1, seq2):
assert len(seq1) == len(seq2)
retVal = []
i = 0
for word in seq1:
if word == seq2[i]:
retVal.append(word)
else:
retVal.append('<*>')
i += 1
return retVal
def outputResult(self, logClustL):
log_templates = [0] * self.df_log.shape[0]
log_templateids = [0] * self.df_log.shape[0]
df_events = []
for logClust in logClustL:
template_str = ' '.join(logClust.logTemplate)
occurrence = len(logClust.logIDL)
template_id = hashlib.md5(template_str.encode('utf-8')).hexdigest()[0:8]
for logID in logClust.logIDL:
logID -= 1
log_templates[logID] = template_str
log_templateids[logID] = template_id
df_events.append([template_id, template_str, occurrence])
df_event = pd.DataFrame(df_events, columns=['EventId', 'EventTemplate', 'Occurrences'])
self.df_log['EventId'] = log_templateids
self.df_log['EventTemplate'] = log_templates
if self.keep_para:
self.df_log["ParameterList"] = self.df_log.apply(self.get_parameter_list, axis=1)
self.df_log.to_csv(os.path.join(self.savePath, self.logName + '_structured.csv'), index=False)
occ_dict = dict(self.df_log['EventTemplate'].value_counts())
df_event = pd.DataFrame()
df_event['EventTemplate'] = self.df_log['EventTemplate'].unique()
df_event['EventId'] = df_event['EventTemplate'].map(lambda x: hashlib.md5(str(x).encode('utf-8')).hexdigest()[0:8])
df_event['Occurrences'] = df_event['EventTemplate'].map(occ_dict)
df_event.to_csv(os.path.join(self.savePath, self.logName + '_templates.csv'), index=False,
columns=["EventId", "EventTemplate", "Occurrences"])
def printTree(self, node, dep):
pStr = ''
for i in range(dep):
pStr += '\t'
if node.depth == 0:
pStr += 'Root'
elif node.depth == 1:
pStr += '<' + str(node.digitOrtoken) + '>'
else:
pStr += node.digitOrtoken
print(pStr)
if node.depth == self.depth:
return 1
for child in node.childD:
self.printTree(node.childD[child], dep + 1)
def parse(self, logName):
print('Parsing file: ' + os.path.join(self.path, logName))
start_time = datetime.now()
self.logName = logName
rootNode = Node()
logCluL = []
self.load_data()
count = 0
for idx, line in self.df_log.iterrows():
logID = line['LineId']
logmessageL = self.preprocess(line['Content']).strip().split()
# logmessageL = filter(lambda x: x != '', re.split('[\s=:,]', self.preprocess(line['Content'])))
matchCluster = self.treeSearch(rootNode, logmessageL)
# Match no existing log cluster
if matchCluster is None:
newCluster = Logcluster(logTemplate=logmessageL, logIDL=[logID])
logCluL.append(newCluster)
self.addSeqToPrefixTree(rootNode, newCluster)
# Add the new log message to the existing cluster
else:
newTemplate = self.getTemplate(logmessageL, matchCluster.logTemplate)
matchCluster.logIDL.append(logID)
if ' '.join(newTemplate) != ' '.join(matchCluster.logTemplate):
matchCluster.logTemplate = newTemplate
count += 1
if count % 1000 == 0 or count == len(self.df_log):
print('Processed {0:.1f}% of log lines.'.format(count * 100.0 / len(self.df_log)), end='\r')
if not os.path.exists(self.savePath):
os.makedirs(self.savePath)
self.outputResult(logCluL)
print('Parsing done. [Time taken: {!s}]'.format(datetime.now() - start_time))
def load_data(self):
headers, regex = self.generate_logformat_regex(self.log_format)
self.df_log = self.log_to_dataframe(os.path.join(self.path, self.logName), regex, headers, self.log_format)
def preprocess(self, line):
for currentRex in self.rex:
line = re.sub(currentRex, '<*>', line)
return line
def log_to_dataframe(self, log_file, regex, headers, logformat):
""" Function to transform log file to dataframe
"""
log_messages = []
linecount = 0
cnt = 0
with open(log_file, 'r') as fin:
for line in fin.readlines():
cnt += 1
try:
match = regex.search(line.strip())
message = [match.group(header) for header in headers]
log_messages.append(message)
linecount += 1
except Exception as e:
# print("\n", line)
# print(e)
pass
print("Total size after encoding is", linecount, cnt)
logdf = pd.DataFrame(log_messages, columns=headers)
logdf.insert(0, 'LineId', None)
logdf['LineId'] = [i + 1 for i in range(linecount)]
return logdf
def generate_logformat_regex(self, logformat):
""" Function to generate regular expression to split log messages
"""
headers = []
splitters = re.split(r'(<[^<>]+>)', logformat)
regex = ''
for k in range(len(splitters)):
if k % 2 == 0:
splitter = re.sub(' +', '\\\s+', splitters[k])
regex += splitter
else:
header = splitters[k].strip('<').strip('>')
regex += '(?P<%s>.*?)' % header
headers.append(header)
regex = re.compile('^' + regex + '$')
return headers, regex
def get_parameter_list(self, row):
template_regex = re.sub(r"<.{1,5}>", "<*>", str(row["EventTemplate"]))
if "<*>" not in template_regex: return []
template_regex = re.sub(r'([^A-Za-z0-9])', r'\\\1', template_regex)
template_regex = re.sub(r' +', r'\\s+', template_regex)
template_regex = "^" + template_regex.replace("\<\*\>", "(.*?)") + "$"
parameter_list = re.findall(template_regex, row["Content"])
parameter_list = parameter_list[0] if parameter_list else ()
parameter_list = list(parameter_list) if isinstance(parameter_list, tuple) else [parameter_list]
return parameter_list