""" Description : This file implements the Spell algorithm for log parsing Author : LogPAI team License : MIT """ import sys # import re import regex as re import os import numpy as np import pandas as pd import hashlib from datetime import datetime import string from tqdm import tqdm class LCSObject: """ Class object to store a log group with the same template """ def __init__(self, logTemplate='', logIDL=[]): self.logTemplate = logTemplate self.logIDL = logIDL class Node: """ A node in prefix tree data structure """ def __init__(self, token='', templateNo=0): self.logClust = None self.token = token self.templateNo = templateNo self.childD = dict() class LogParser: """ LogParser class Attributes ---------- path : the path of the input file logName : the file name of the input file savePath : the path of the output file tau : how much percentage of tokens matched to merge a log message """ def __init__(self, indir='./', outdir='./result/', log_format=None, tau=0.5, rex=[], keep_para=True): self.path = indir self.logName = None self.savePath = outdir self.tau = tau self.logformat = log_format self.df_log = None self.rex = rex self.keep_para = keep_para def LCS(self, seq1, seq2): lengths = [[0 for j in range(len(seq2) + 1)] for i in range(len(seq1) + 1)] # row 0 and column 0 are initialized to 0 already for i in range(len(seq1)): for j in range(len(seq2)): if seq1[i] == seq2[j]: lengths[i + 1][j + 1] = lengths[i][j] + 1 else: lengths[i + 1][j + 1] = max(lengths[i + 1][j], lengths[i][j + 1]) # read the substring out from the matrix result = [] lenOfSeq1, lenOfSeq2 = len(seq1), len(seq2) while lenOfSeq1 != 0 and lenOfSeq2 != 0: if lengths[lenOfSeq1][lenOfSeq2] == lengths[lenOfSeq1 - 1][lenOfSeq2]: lenOfSeq1 -= 1 elif lengths[lenOfSeq1][lenOfSeq2] == lengths[lenOfSeq1][lenOfSeq2 - 1]: lenOfSeq2 -= 1 else: assert seq1[lenOfSeq1 - 1] == seq2[lenOfSeq2 - 1] result.insert(0, seq1[lenOfSeq1 - 1]) lenOfSeq1 -= 1 lenOfSeq2 -= 1 return result #for each seq, find the corresponding log key(template) def SimpleLoopMatch(self, logClustL, seq): for logClust in logClustL: if float(len(logClust.logTemplate)) < 0.5 * len(seq): continue # Check the template is a subsequence of seq (we use set checking as a proxy here for speedup since # incorrect-ordering bad cases rarely occur in logs) token_set = set(seq) if all(token in token_set or token == '<*>' for token in logClust.logTemplate): return logClust return None def PrefixTreeMatch(self, parentn, seq, idx): retLogClust = None length = len(seq) for i in range(idx, length): if seq[i] in parentn.childD: childn = parentn.childD[seq[i]] if (childn.logClust is not None): constLM = [w for w in childn.logClust.logTemplate if w != '<*>'] if float(len(constLM)) >= self.tau * length: return childn.logClust else: return self.PrefixTreeMatch(childn, seq, i + 1) return retLogClust #for each seq, find the corresponding log template using LCS def LCSMatch(self, logClustL, seq): retLogClust = None maxLen = -1 maxlcs = [] maxClust = None set_seq = set(seq) size_seq = len(seq) for logClust in logClustL: set_template = set(logClust.logTemplate) if len(set_seq & set_template) < 0.5 * size_seq: continue lcs = self.LCS(seq, logClust.logTemplate) if len(lcs) > maxLen or (len(lcs) == maxLen and len(logClust.logTemplate) < len(maxClust.logTemplate)): maxLen = len(lcs) maxlcs = lcs maxClust = logClust # LCS should be large then tau * len(itself) if float(maxLen) >= self.tau * size_seq: retLogClust = maxClust return retLogClust def getTemplate(self, lcs, seq): retVal = [] if not lcs: 
    def getTemplate(self, lcs, seq):
        retVal = []
        if not lcs:
            return retVal

        lcs = lcs[::-1]
        i = 0
        for token in seq:
            i += 1
            if token == lcs[-1]:
                retVal.append(token)
                lcs.pop()
            else:
                retVal.append('<*>')
            if not lcs:
                break
        if i < len(seq):
            retVal.append('<*>')
        return retVal

    def addSeqToPrefixTree(self, rootn, newCluster):
        parentn = rootn
        seq = newCluster.logTemplate
        seq = [w for w in seq if w != '<*>']
        for i in range(len(seq)):
            tokenInSeq = seq[i]
            # Match
            if tokenInSeq in parentn.childD:
                parentn.childD[tokenInSeq].templateNo += 1
            # Do not match
            else:
                parentn.childD[tokenInSeq] = Node(token=tokenInSeq, templateNo=1)
            parentn = parentn.childD[tokenInSeq]
        if parentn.logClust is None:
            parentn.logClust = newCluster

    def removeSeqFromPrefixTree(self, rootn, newCluster):
        parentn = rootn
        seq = newCluster.logTemplate
        seq = [w for w in seq if w != '<*>']
        for tokenInSeq in seq:
            if tokenInSeq in parentn.childD:
                matchedNode = parentn.childD[tokenInSeq]
                if matchedNode.templateNo == 1:
                    del parentn.childD[tokenInSeq]
                    break
                else:
                    matchedNode.templateNo -= 1
                    parentn = matchedNode

    def outputResult(self, logClustL):
        print("output result", self.savePath)
        templates = [0] * self.df_log.shape[0]
        ids = [0] * self.df_log.shape[0]
        df_event = []

        for logclust in tqdm(logClustL):
            template_str = ' '.join(logclust.logTemplate)
            eid = hashlib.md5(template_str.encode('utf-8')).hexdigest()[0:8]
            for logid in logclust.logIDL:
                templates[logid - 1] = template_str
                ids[logid - 1] = eid
            df_event.append([eid, template_str, len(logclust.logIDL)])

        df_event = pd.DataFrame(df_event, columns=['EventId', 'EventTemplate', 'Occurrences'])

        self.df_log['EventId'] = ids
        self.df_log['EventTemplate'] = templates
        if self.keep_para:
            self.df_log["ParameterList"] = self.df_log.apply(self.get_parameter_list, axis=1)
        self.df_log.to_csv(os.path.join(self.savePath, self.logName + '_structured.csv'), index=False)
        df_event.to_csv(os.path.join(self.savePath, self.logName + '_templates.csv'), index=False)

    def printTree(self, node, dep):
        pStr = ''
        for i in range(dep):  # 'dep' is the depth (an int); range(len(dep)) was a bug
            pStr += '\t'

        if node.token == '':
            pStr += 'Root'
        else:
            pStr += node.token
            if node.logClust is not None:
                pStr += '-->' + ' '.join(node.logClust.logTemplate)
        print(pStr + ' (' + str(node.templateNo) + ')')

        for child in node.childD:
            self.printTree(node.childD[child], dep + 1)
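    # Illustrative sketch (hypothetical example, not part of the original code):
    # how getTemplate folds an LCS back into a cluster's sequence to produce a
    # template, replacing every token outside the LCS with the '<*>' wildcard:
    #
    #   p = LogParser()
    #   p.getTemplate(['Send', 'block'], ['Send', 'block', 'blk_2'])
    #   # -> ['Send', 'block', '<*>']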
    def parse(self, logname):
        starttime = datetime.now()
        print('Parsing file: ' + os.path.join(self.path, logname))
        self.logName = logname
        self.load_data()

        rootNode = Node()
        logCluL = []

        # Split on punctuation, but keep the characters of the '<*>' wildcard.
        punc = re.sub('[<*>]', '', string.punctuation)
        count = 0
        for idx, line in self.df_log.iterrows():
            logID = line['LineId']
            logmessageL = list(filter(lambda x: x.strip() != '',
                                      re.split(f'[{punc}]', self.preprocess(line['Content']))))
            constLogMessL = [w for w in logmessageL if w != '<*>']

            # Find an existing matched log cluster
            matchCluster = self.PrefixTreeMatch(rootNode, constLogMessL, 0)

            if matchCluster is None:
                matchCluster = self.SimpleLoopMatch(logCluL, constLogMessL)

                if matchCluster is None:
                    matchCluster = self.LCSMatch(logCluL, logmessageL)

                    # Match no existing log cluster
                    if matchCluster is None:
                        newCluster = LCSObject(logTemplate=logmessageL, logIDL=[logID])
                        logCluL.append(newCluster)
                        self.addSeqToPrefixTree(rootNode, newCluster)
                    # Add the new log message to the existing cluster
                    else:
                        newTemplate = self.getTemplate(self.LCS(logmessageL, matchCluster.logTemplate),
                                                       matchCluster.logTemplate)
                        if ' '.join(newTemplate) != ' '.join(matchCluster.logTemplate):
                            self.removeSeqFromPrefixTree(rootNode, matchCluster)
                            matchCluster.logTemplate = newTemplate
                            self.addSeqToPrefixTree(rootNode, matchCluster)
            if matchCluster:
                matchCluster.logIDL.append(logID)
            count += 1
            if count % 1000 == 0 or count == len(self.df_log):
                print('Processed {0:.1f}% of log lines.'.format(count * 100.0 / len(self.df_log)), end='\r')

        if not os.path.exists(self.savePath):
            os.makedirs(self.savePath)
        self.outputResult(logCluL)
        print('Parsing done. [Time taken: {!s}]'.format(datetime.now() - starttime))

    def load_data(self):
        headers, regex = self.generate_logformat_regex(self.logformat)
        self.df_log = self.log_to_dataframe(os.path.join(self.path, self.logName), regex, headers, self.logformat)

    def preprocess(self, line):
        for currentRex in self.rex:
            line = re.sub(currentRex, '<*>', line)
        return line

    def log_to_dataframe(self, log_file, regex, headers, logformat):
        """ Function to transform a log file into a dataframe """
        log_messages = []
        linecount = 0
        k = 0
        with open(log_file, 'r') as fin:
            for line in fin.readlines():
                k += 1
                if k % 10000 == 0:
                    # Progress report every 10000 lines.
                    print("extracted {0} log lines from {1}".format(k, log_file))
                # Strip non-ASCII (outside \x00-\x7F) characters from the line.
                line = re.sub(r'[^\x00-\x7F]+', '', line)
                try:
                    match = regex.search(line.strip())
                    message = [match.group(header) for header in headers]
                    log_messages.append(message)
                    linecount += 1
                except Exception:
                    # Skip lines that do not match the log format.
                    pass
        logdf = pd.DataFrame(log_messages, columns=headers)
        logdf.insert(0, 'LineId', None)
        logdf['LineId'] = [i + 1 for i in range(linecount)]
        return logdf

    def generate_logformat_regex(self, logformat):
        """ Function to generate a regular expression to split log messages """
        headers = []
        splitters = re.split(r'(<[^<>]+>)', logformat)
        regex = ''
        for k in range(len(splitters)):
            if k % 2 == 0:
                # Literal text between fields; collapse runs of spaces into \s+.
                splitter = re.sub(' +', r'\\s+', splitters[k])
                regex += splitter
            else:
                # A <Header> placeholder becomes a named capture group.
                header = splitters[k].strip('<').strip('>')
                regex += '(?P<%s>.*?)' % header
                headers.append(header)
        regex = re.compile('^' + regex + '$')
        return headers, regex

    def get_parameter_list(self, row):
        template_regex = re.sub(r"\s<.{1,5}>\s", "<*>", row["EventTemplate"])
        if "<*>" not in template_regex:
            return []
        template_regex = re.sub(r'([^A-Za-z0-9])', r'\\\1', template_regex)
        template_regex = re.sub(r'\\ +', r'[^A-Za-z0-9]+', template_regex)
        template_regex = "^" + template_regex.replace(r"\<\*\>", "(.*?)") + "$"
        parameter_list = re.findall(template_regex, row["Content"])
        parameter_list = parameter_list[0] if parameter_list else ()
        parameter_list = list(parameter_list) if isinstance(parameter_list, tuple) else [parameter_list]
        parameter_list = [para.strip(string.punctuation).strip(' ') for para in parameter_list]
        return parameter_list


if __name__ == "__main__":
    print(os.getcwd())
    os.chdir("../")
    print(os.getcwd())
    lp = LogParser()
    # print(lp.LCS(seq1="abcbb", seq2="bc"))
    output_dir = 'demo/Spell_result/'  # The output directory of parsing results
    log_file = 'HDFS.log'  # The input log file name
    log_structured_file = output_dir + log_file + "_structured.csv"
    df = pd.read_csv(log_structured_file)
    for _, row in df.iterrows():
        lp.get_parameter_list(row)
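# A minimal end-to-end usage sketch (hypothetical: the directory layout, the
# HDFS-style log_format string, and the rex patterns below are illustrative
# values, not constants shipped with this file):
#
#   parser = LogParser(indir='demo/', outdir='demo/Spell_result/',
#                      log_format='<Date> <Time> <Pid> <Level> <Component>: <Content>',
#                      tau=0.5,
#                      rex=[r'blk_-?\d+', r'(\d+\.)+\d+(:\d+)?'])
#   parser.parse('HDFS.log')
#
# parse() writes '<logname>_structured.csv' (one row per log line, with EventId
# and EventTemplate columns) and '<logname>_templates.csv' (one row per
# discovered template) into the output directory.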