# Vendored from the LogPAI "logparser" project (Spell implementation).
# Upload residue preserved for provenance: user MukeshKapoor25,
# commit message "gitignore", commit 3f90381.
"""
Description : This file implements the Spell algorithm for log parsing
Author : LogPAI team
License : MIT
"""
import sys
# import re
import regex as re
import os
import numpy as np
import pandas as pd
import hashlib
from datetime import datetime
import string
from tqdm import tqdm
class LCSObject:
    """A group of log messages that share one template.

    Attributes
    ----------
    logTemplate : list
        Token list of the template; '<*>' marks a wildcard position.
    logIDL : list
        LineIds of all log messages assigned to this group.
    """
    def __init__(self, logTemplate='', logIDL=None):
        self.logTemplate = logTemplate
        # None sentinel instead of a mutable default: with ``logIDL=[]``
        # every LCSObject created without an explicit list shared ONE list,
        # so appending ids to one cluster polluted all of them.
        self.logIDL = [] if logIDL is None else logIDL
class Node:
    """A single node of the prefix tree used for fast template lookup."""

    def __init__(self, token='', templateNo=0):
        # Cluster whose template ends at this node, if any.
        self.logClust = None
        # Token labelling the edge into this node ('' for the root).
        self.token = token
        # How many templates' constant-token paths pass through this node.
        self.templateNo = templateNo
        # Children keyed by their token.
        self.childD = {}
class LogParser:
    """Spell log parser.

    Streams each raw log message through three matchers of increasing cost
    (prefix tree, simple subsequence loop, full LCS) and either merges it
    into an existing template cluster or starts a new one, then writes the
    structured result as CSV.

    Attributes
    ----------
    path : str
        Directory of the input file.
    logname : str or None
        File name of the input file (set by :meth:`parse`).
    savePath : str
        Directory the output CSVs are written to.
    tau : float
        Fraction of a message's tokens that must match a template for the
        message to be merged into that template's cluster.
    rex : list of str
        Regexes applied by :meth:`preprocess` to mask known variable fields.
    keep_para : bool
        If True, extract a per-line parameter list into the output.
    """

    def __init__(self, indir='./', outdir='./result/', log_format=None, tau=0.5,
                 rex=None, keep_para=True):
        self.path = indir
        # parse() records the file name in self.logname (lower case); the
        # original also set self.logName, kept here for backward compatibility.
        self.logName = None
        self.logname = None
        self.savePath = outdir
        self.tau = tau
        self.logformat = log_format
        self.df_log = None
        # None sentinel avoids the shared-mutable-default-argument pitfall
        # of the original ``rex=[]``.
        self.rex = [] if rex is None else rex
        self.keep_para = keep_para

    def LCS(self, seq1, seq2):
        """Return the longest common subsequence of *seq1* and *seq2* as a list.

        Standard O(len(seq1) * len(seq2)) dynamic programming, then a
        backtrack through the length matrix to read one LCS out.
        """
        lengths = [[0 for j in range(len(seq2) + 1)] for i in range(len(seq1) + 1)]
        # row 0 and column 0 are initialized to 0 already
        for i in range(len(seq1)):
            for j in range(len(seq2)):
                if seq1[i] == seq2[j]:
                    lengths[i + 1][j + 1] = lengths[i][j] + 1
                else:
                    lengths[i + 1][j + 1] = max(lengths[i + 1][j], lengths[i][j + 1])
        # read the subsequence out from the matrix, back to front
        result = []
        lenOfSeq1, lenOfSeq2 = len(seq1), len(seq2)
        while lenOfSeq1 != 0 and lenOfSeq2 != 0:
            if lengths[lenOfSeq1][lenOfSeq2] == lengths[lenOfSeq1 - 1][lenOfSeq2]:
                lenOfSeq1 -= 1
            elif lengths[lenOfSeq1][lenOfSeq2] == lengths[lenOfSeq1][lenOfSeq2 - 1]:
                lenOfSeq2 -= 1
            else:
                # Both neighbours are smaller: this position is part of the LCS.
                assert seq1[lenOfSeq1 - 1] == seq2[lenOfSeq2 - 1]
                result.insert(0, seq1[lenOfSeq1 - 1])
                lenOfSeq1 -= 1
                lenOfSeq2 -= 1
        return result

    def SimpleLoopMatch(self, logClustL, seq):
        """Linear scan: return the first cluster whose template is (loosely)
        a subsequence of *seq*, or None.
        """
        for logClust in logClustL:
            # A template shorter than half the message can never reach tau.
            if float(len(logClust.logTemplate)) < 0.5 * len(seq):
                continue
            # Check the template is a subsequence of seq (we use set checking
            # as a proxy here for speedup since incorrect-ordering bad cases
            # rarely occur in logs)
            token_set = set(seq)
            if all(token in token_set or token == '<*>' for token in logClust.logTemplate):
                return logClust
        return None

    def PrefixTreeMatch(self, parentn, seq, idx):
        """Walk the prefix tree along *seq* (starting at *idx*) and return the
        first cluster whose constant tokens satisfy the tau threshold, or None.
        """
        retLogClust = None
        length = len(seq)
        for i in range(idx, length):
            if seq[i] in parentn.childD:
                childn = parentn.childD[seq[i]]
                if (childn.logClust is not None):
                    constLM = [w for w in childn.logClust.logTemplate if w != '<*>']
                    if float(len(constLM)) >= self.tau * length:
                        return childn.logClust
                else:
                    # No cluster here; descend and keep consuming tokens.
                    return self.PrefixTreeMatch(childn, seq, i + 1)
        return retLogClust

    def LCSMatch(self, logClustL, seq):
        """Return the cluster with the largest LCS against *seq* (ties broken
        by the shorter template), provided it reaches the tau threshold.
        """
        retLogClust = None
        maxLen = -1
        maxlcs = []
        maxClust = None
        set_seq = set(seq)
        size_seq = len(seq)
        for logClust in logClustL:
            set_template = set(logClust.logTemplate)
            # Cheap set-overlap prefilter before running the O(n*m) LCS.
            if len(set_seq & set_template) < 0.5 * size_seq:
                continue
            lcs = self.LCS(seq, logClust.logTemplate)
            if len(lcs) > maxLen or (len(lcs) == maxLen and len(logClust.logTemplate) < len(maxClust.logTemplate)):
                maxLen = len(lcs)
                maxlcs = lcs
                maxClust = logClust
        # The LCS must cover at least tau * len(seq) tokens to count as a match.
        if float(maxLen) >= self.tau * size_seq:
            retLogClust = maxClust
        return retLogClust

    def getTemplate(self, lcs, seq):
        """Project *lcs* onto *seq*: keep tokens that are part of the LCS and
        replace every other position with the '<*>' wildcard.
        """
        retVal = []
        if not lcs:
            return retVal
        # Reverse so the next expected token is always lcs[-1] (O(1) pop).
        lcs = lcs[::-1]
        i = 0
        for token in seq:
            i += 1
            if token == lcs[-1]:
                retVal.append(token)
                lcs.pop()
            else:
                retVal.append('<*>')
            if not lcs:
                break
        # Anything left in seq after the LCS is exhausted becomes one wildcard.
        if i < len(seq):
            retVal.append('<*>')
        return retVal

    def addSeqToPrefixTree(self, rootn, newCluster):
        """Insert the constant tokens of *newCluster*'s template into the tree,
        bumping reference counts along the path.
        """
        parentn = rootn
        seq = newCluster.logTemplate
        seq = [w for w in seq if w != '<*>']
        for i in range(len(seq)):
            tokenInSeq = seq[i]
            if tokenInSeq in parentn.childD:
                # Token already present on this level: just count the reference.
                parentn.childD[tokenInSeq].templateNo += 1
            else:
                parentn.childD[tokenInSeq] = Node(token=tokenInSeq, templateNo=1)
            parentn = parentn.childD[tokenInSeq]
        # Attach the cluster at the path's end if the slot is free.
        if parentn.logClust is None:
            parentn.logClust = newCluster

    def removeSeqFromPrefixTree(self, rootn, newCluster):
        """Undo addSeqToPrefixTree: walk the constant-token path, decrementing
        counts and pruning the subtree once a count drops to zero.
        """
        parentn = rootn
        seq = newCluster.logTemplate
        seq = [w for w in seq if w != '<*>']
        for tokenInSeq in seq:
            if tokenInSeq in parentn.childD:
                matchedNode = parentn.childD[tokenInSeq]
                if matchedNode.templateNo == 1:
                    # Last template using this branch: drop it entirely.
                    del parentn.childD[tokenInSeq]
                    break
                else:
                    matchedNode.templateNo -= 1
                    parentn = matchedNode

    def outputResult(self, logClustL):
        """Write <log>_structured.csv (per line) and <log>_templates.csv
        (per event) into self.savePath.
        """
        print("output result", self.savePath)
        templates = [0] * self.df_log.shape[0]
        ids = [0] * self.df_log.shape[0]
        df_event = []
        for logclust in tqdm(logClustL):
            template_str = ' '.join(logclust.logTemplate)
            # Event id: first 8 hex digits of the template's MD5 (stable across runs).
            eid = hashlib.md5(template_str.encode('utf-8')).hexdigest()[0:8]
            for logid in logclust.logIDL:
                # LineIds are 1-based, the result lists are 0-based.
                templates[logid - 1] = template_str
                ids[logid - 1] = eid
            df_event.append([eid, template_str, len(logclust.logIDL)])
        df_event = pd.DataFrame(df_event, columns=['EventId', 'EventTemplate', 'Occurrences'])
        self.df_log['EventId'] = ids
        self.df_log['EventTemplate'] = templates
        if self.keep_para:
            self.df_log["ParameterList"] = self.df_log.apply(self.get_parameter_list, axis=1)
        self.df_log.to_csv(os.path.join(self.savePath, self.logname + '_structured.csv'), index=False)
        df_event.to_csv(os.path.join(self.savePath, self.logname + '_templates.csv'), index=False)

    def printTree(self, node, dep):
        """Debug helper: print the prefix (sub)tree rooted at *node*,
        indented by *dep* tab stops.
        """
        # BUG FIX: *dep* is an int (recursed as dep + 1), so the original
        # ``for i in range(len(dep))`` raised TypeError on every call.
        pStr = '\t' * dep
        if node.token == '':
            pStr += 'Root'
        else:
            pStr += node.token
            if node.logClust is not None:
                pStr += '-->' + ' '.join(node.logClust.logTemplate)
        print(pStr + ' (' + str(node.templateNo) + ')')
        for child in node.childD:
            self.printTree(node.childD[child], dep + 1)

    def parse(self, logname):
        """Parse one log file end to end and write the structured results."""
        starttime = datetime.now()
        print('Parsing file: ' + os.path.join(self.path, logname))
        self.logname = logname
        self.load_data()
        rootNode = Node()
        logCluL = []
        # Tokenize on punctuation, but keep '<' '*' '>' so masked wildcards survive.
        punc = re.sub('[<*>]', '', string.punctuation)
        count = 0
        for idx, line in self.df_log.iterrows():
            logID = line['LineId']
            logmessageL = list(filter(lambda x: x.strip() != '', re.split(f'[{punc}]', self.preprocess(line['Content']))))
            constLogMessL = [w for w in logmessageL if w != '<*>']
            # Matchers in order of increasing cost:
            # 1) exact constant-token lookup in the prefix tree
            matchCluster = self.PrefixTreeMatch(rootNode, constLogMessL, 0)
            if matchCluster is None:
                # 2) cheap subsequence check over all clusters
                matchCluster = self.SimpleLoopMatch(logCluL, constLogMessL)
                if matchCluster is None:
                    # 3) full LCS comparison (most expensive)
                    matchCluster = self.LCSMatch(logCluL, logmessageL)
                    # Match no existing log cluster: start a new one.
                    if matchCluster is None:
                        newCluster = LCSObject(logTemplate=logmessageL, logIDL=[logID])
                        logCluL.append(newCluster)
                        self.addSeqToPrefixTree(rootNode, newCluster)
                    # Merge into the LCS match; the shared LCS may generalize
                    # the existing template with more wildcards.
                    else:
                        newTemplate = self.getTemplate(self.LCS(logmessageL, matchCluster.logTemplate),
                                                       matchCluster.logTemplate)
                        if ' '.join(newTemplate) != ' '.join(matchCluster.logTemplate):
                            self.removeSeqFromPrefixTree(rootNode, matchCluster)
                            matchCluster.logTemplate = newTemplate
                            self.addSeqToPrefixTree(rootNode, matchCluster)
            if matchCluster:
                matchCluster.logIDL.append(logID)
            count += 1
            if count % 1000 == 0 or count == len(self.df_log):
                print('Processed {0:.1f}% of log lines.'.format(count * 100.0 / len(self.df_log)), end='\r')
        if not os.path.exists(self.savePath):
            os.makedirs(self.savePath)
        self.outputResult(logCluL)
        print('Parsing done. [Time taken: {!s}]'.format(datetime.now() - starttime))

    def load_data(self):
        """Read self.logname with the configured log_format into self.df_log."""
        headers, regex = self.generate_logformat_regex(self.logformat)
        self.df_log = self.log_to_dataframe(os.path.join(self.path, self.logname), regex, headers, self.logformat)

    def preprocess(self, line):
        """Mask every configured variable-field regex in *line* with '<*>'."""
        for currentRex in self.rex:
            line = re.sub(currentRex, '<*>', line)
        return line

    def log_to_dataframe(self, log_file, regex, headers, logformat):
        """Function to transform log file to dataframe.

        Lines that do not match *regex* are skipped (best-effort parsing);
        non-ASCII characters are replaced with the literal token <NASCII>.
        """
        log_messages = []
        linecount = 0
        k = 0
        with open(log_file, 'r') as fin:
            for line in fin.readlines():
                # progress report while extracting
                k += 1
                if k % 10000 == 0:
                    print("extracted {0} log lines from {1}".format(k, log_file))
                line = re.sub(r'[^\x00-\x7F]+', '<NASCII>', line)  # replace non ASCII (\x00-\x7F) characters with <NASCII>
                try:
                    match = regex.search(line.strip())
                    message = [match.group(header) for header in headers]
                    log_messages.append(message)
                    linecount += 1
                except Exception:
                    # Deliberate best-effort: malformed lines are dropped.
                    pass
        logdf = pd.DataFrame(log_messages, columns=headers)
        logdf.insert(0, 'LineId', None)
        logdf['LineId'] = [i + 1 for i in range(linecount)]
        return logdf

    def generate_logformat_regex(self, logformat):
        """Function to generate regular expression to split log messages.

        '<Header>' placeholders become named lazy groups; literal runs of
        spaces between them match any whitespace run.
        """
        headers = []
        splitters = re.split(r'(<[^<>]+>)', logformat)
        regex = ''
        for k in range(len(splitters)):
            if k % 2 == 0:
                # Literal text between placeholders; the replacement string
                # r'\\s+' yields the regex token \s+ (raw string fixes the
                # invalid '\s' escape of the original without changing bytes).
                splitter = re.sub(' +', r'\\s+', splitters[k])
                regex += splitter
            else:
                header = splitters[k].strip('<').strip('>')
                regex += '(?P<%s>.*?)' % header
                headers.append(header)
        regex = re.compile('^' + regex + '$')
        return headers, regex

    def get_parameter_list(self, row):
        """Recover the concrete parameter values of one structured row by
        turning its EventTemplate into a regex and matching the raw Content.
        """
        template_regex = re.sub(r"\s<.{1,5}>\s", "<*>", row["EventTemplate"])
        if "<*>" not in template_regex: return []
        # Escape every non-alphanumeric char, then relax escaped spaces.
        template_regex = re.sub(r'([^A-Za-z0-9])', r'\\\1', template_regex)
        template_regex = re.sub(r'\\ +', r'[^A-Za-z0-9]+', template_regex)
        # Raw string fixes the invalid '\<' escapes of the original
        # without changing the matched bytes.
        template_regex = "^" + template_regex.replace(r"\<\*\>", "(.*?)") + "$"
        parameter_list = re.findall(template_regex, row["Content"])
        parameter_list = parameter_list[0] if parameter_list else ()
        parameter_list = list(parameter_list) if isinstance(parameter_list, tuple) else [parameter_list]
        parameter_list = [para.strip(string.punctuation).strip(' ') for para in parameter_list]
        return parameter_list
if __name__ == "__main__":
    # Ad-hoc debug harness: re-runs parameter extraction over a previously
    # parsed result file. It does NOT run the full parser (LogParser.parse).
    import os
    import pandas as pd
    print(os.getcwd())
    # Assumes the script is started one level below the project root.
    os.chdir("../")
    print(os.getcwd())
    lp = LogParser()
    # print(lp.LCS(seq1="abcbb", seq2="bc"))
    output_dir = 'demo/Spell_result/'  # The output directory of parsing results
    log_file = 'HDFS.log'  # The input log file name
    log_structured_file = output_dir + log_file + "_structured.csv"
    df = pd.read_csv(log_structured_file)
    for _, row in df.iterrows():
        # Return values are discarded; this only exercises get_parameter_list.
        lp.get_parameter_list(row)