""" Description : This file implements the Drain algorithm for log parsing Author : LogPAI team License : MIT """ import re import os import numpy as np import pandas as pd import hashlib from datetime import datetime class Logcluster: def __init__(self, logTemplate='', logIDL=None): self.logTemplate = logTemplate if logIDL is None: logIDL = [] self.logIDL = logIDL class Node: def __init__(self, childD=None, depth=0, digitOrtoken=None): if childD is None: childD = dict() self.childD = childD self.depth = depth self.digitOrtoken = digitOrtoken class LogParser: def __init__(self, log_format, indir='./', outdir='./result/', depth=4, st=0.4, maxChild=100, rex=[], keep_para=True): """ Attributes ---------- rex : regular expressions used in preprocessing (step1) path : the input path stores the input log file name depth : depth of all leaf nodes st : similarity threshold maxChild : max number of children of an internal node logName : the name of the input file containing raw log messages savePath : the output path stores the file containing structured logs """ self.path = indir self.depth = depth - 2 self.st = st self.maxChild = maxChild self.logName = None self.savePath = outdir self.df_log = None self.log_format = log_format self.rex = rex self.keep_para = keep_para def hasNumbers(self, s): return any(char.isdigit() for char in s) def treeSearch(self, rn, seq): retLogClust = None seqLen = len(seq) if seqLen not in rn.childD: return retLogClust parentn = rn.childD[seqLen] currentDepth = 1 for token in seq: if currentDepth >= self.depth or currentDepth > seqLen: break if token in parentn.childD: parentn = parentn.childD[token] elif '<*>' in parentn.childD: parentn = parentn.childD['<*>'] else: return retLogClust currentDepth += 1 logClustL = parentn.childD retLogClust = self.fastMatch(logClustL, seq) return retLogClust def addSeqToPrefixTree(self, rn, logClust): seqLen = len(logClust.logTemplate) if seqLen not in rn.childD: firtLayerNode = Node(depth=1, digitOrtoken=seqLen) rn.childD[seqLen] = firtLayerNode else: firtLayerNode = rn.childD[seqLen] parentn = firtLayerNode currentDepth = 1 for token in logClust.logTemplate: # Add current log cluster to the leaf node if currentDepth >= self.depth or currentDepth > seqLen: if len(parentn.childD) == 0: parentn.childD = [logClust] else: parentn.childD.append(logClust) break # If token not matched in this layer of existing tree. if token not in parentn.childD: if not self.hasNumbers(token): if '<*>' in parentn.childD: if len(parentn.childD) < self.maxChild: newNode = Node(depth=currentDepth + 1, digitOrtoken=token) parentn.childD[token] = newNode parentn = newNode else: parentn = parentn.childD['<*>'] else: if len(parentn.childD) + 1 < self.maxChild: newNode = Node(depth=currentDepth + 1, digitOrtoken=token) parentn.childD[token] = newNode parentn = newNode elif len(parentn.childD) + 1 == self.maxChild: newNode = Node(depth=currentDepth + 1, digitOrtoken='<*>') parentn.childD['<*>'] = newNode parentn = newNode else: parentn = parentn.childD['<*>'] else: if '<*>' not in parentn.childD: newNode = Node(depth=currentDepth + 1, digitOrtoken='<*>') parentn.childD['<*>'] = newNode parentn = newNode else: parentn = parentn.childD['<*>'] # If the token is matched else: parentn = parentn.childD[token] currentDepth += 1 # seq1 is template def seqDist(self, seq1, seq2): assert len(seq1) == len(seq2) simTokens = 0 numOfPar = 0 for token1, token2 in zip(seq1, seq2): if token1 == '<*>': numOfPar += 1 continue #comment@haixuanguo: <*> == <*> are similar pairs if token1 == token2: simTokens += 1 retVal = float(simTokens) / len(seq1) return retVal, numOfPar def fastMatch(self, logClustL, seq): retLogClust = None maxSim = -1 maxNumOfPara = -1 maxClust = None for logClust in logClustL: curSim, curNumOfPara = self.seqDist(logClust.logTemplate, seq) if curSim > maxSim or (curSim == maxSim and curNumOfPara > maxNumOfPara): maxSim = curSim maxNumOfPara = curNumOfPara maxClust = logClust if maxSim >= self.st: retLogClust = maxClust return retLogClust def getTemplate(self, seq1, seq2): assert len(seq1) == len(seq2) retVal = [] i = 0 for word in seq1: if word == seq2[i]: retVal.append(word) else: retVal.append('<*>') i += 1 return retVal def outputResult(self, logClustL): log_templates = [0] * self.df_log.shape[0] log_templateids = [0] * self.df_log.shape[0] df_events = [] for logClust in logClustL: template_str = ' '.join(logClust.logTemplate) occurrence = len(logClust.logIDL) template_id = hashlib.md5(template_str.encode('utf-8')).hexdigest()[0:8] for logID in logClust.logIDL: logID -= 1 log_templates[logID] = template_str log_templateids[logID] = template_id df_events.append([template_id, template_str, occurrence]) df_event = pd.DataFrame(df_events, columns=['EventId', 'EventTemplate', 'Occurrences']) self.df_log['EventId'] = log_templateids self.df_log['EventTemplate'] = log_templates if self.keep_para: self.df_log["ParameterList"] = self.df_log.apply(self.get_parameter_list, axis=1) self.df_log.to_csv(os.path.join(self.savePath, self.logName + '_structured.csv'), index=False) occ_dict = dict(self.df_log['EventTemplate'].value_counts()) df_event = pd.DataFrame() df_event['EventTemplate'] = self.df_log['EventTemplate'].unique() df_event['EventId'] = df_event['EventTemplate'].map(lambda x: hashlib.md5(str(x).encode('utf-8')).hexdigest()[0:8]) df_event['Occurrences'] = df_event['EventTemplate'].map(occ_dict) df_event.to_csv(os.path.join(self.savePath, self.logName + '_templates.csv'), index=False, columns=["EventId", "EventTemplate", "Occurrences"]) def printTree(self, node, dep): pStr = '' for i in range(dep): pStr += '\t' if node.depth == 0: pStr += 'Root' elif node.depth == 1: pStr += '<' + str(node.digitOrtoken) + '>' else: pStr += node.digitOrtoken print(pStr) if node.depth == self.depth: return 1 for child in node.childD: self.printTree(node.childD[child], dep + 1) def parse(self, logName): print('Parsing file: ' + os.path.join(self.path, logName)) start_time = datetime.now() self.logName = logName rootNode = Node() logCluL = [] self.load_data() count = 0 for idx, line in self.df_log.iterrows(): logID = line['LineId'] logmessageL = self.preprocess(line['Content']).strip().split() # logmessageL = filter(lambda x: x != '', re.split('[\s=:,]', self.preprocess(line['Content']))) matchCluster = self.treeSearch(rootNode, logmessageL) # Match no existing log cluster if matchCluster is None: newCluster = Logcluster(logTemplate=logmessageL, logIDL=[logID]) logCluL.append(newCluster) self.addSeqToPrefixTree(rootNode, newCluster) # Add the new log message to the existing cluster else: newTemplate = self.getTemplate(logmessageL, matchCluster.logTemplate) matchCluster.logIDL.append(logID) if ' '.join(newTemplate) != ' '.join(matchCluster.logTemplate): matchCluster.logTemplate = newTemplate count += 1 if count % 1000 == 0 or count == len(self.df_log): print('Processed {0:.1f}% of log lines.'.format(count * 100.0 / len(self.df_log)), end='\r') if not os.path.exists(self.savePath): os.makedirs(self.savePath) self.outputResult(logCluL) print('Parsing done. [Time taken: {!s}]'.format(datetime.now() - start_time)) def load_data(self): headers, regex = self.generate_logformat_regex(self.log_format) self.df_log = self.log_to_dataframe(os.path.join(self.path, self.logName), regex, headers, self.log_format) def preprocess(self, line): for currentRex in self.rex: line = re.sub(currentRex, '<*>', line) return line def log_to_dataframe(self, log_file, regex, headers, logformat): """ Function to transform log file to dataframe """ log_messages = [] linecount = 0 cnt = 0 with open(log_file, 'r') as fin: for line in fin.readlines(): cnt += 1 try: match = regex.search(line.strip()) message = [match.group(header) for header in headers] log_messages.append(message) linecount += 1 except Exception as e: # print("\n", line) # print(e) pass print("Total size after encoding is", linecount, cnt) logdf = pd.DataFrame(log_messages, columns=headers) logdf.insert(0, 'LineId', None) logdf['LineId'] = [i + 1 for i in range(linecount)] return logdf def generate_logformat_regex(self, logformat): """ Function to generate regular expression to split log messages """ headers = [] splitters = re.split(r'(<[^<>]+>)', logformat) regex = '' for k in range(len(splitters)): if k % 2 == 0: splitter = re.sub(' +', '\\\s+', splitters[k]) regex += splitter else: header = splitters[k].strip('<').strip('>') regex += '(?P<%s>.*?)' % header headers.append(header) regex = re.compile('^' + regex + '$') return headers, regex def get_parameter_list(self, row): template_regex = re.sub(r"<.{1,5}>", "<*>", str(row["EventTemplate"])) if "<*>" not in template_regex: return [] template_regex = re.sub(r'([^A-Za-z0-9])', r'\\\1', template_regex) template_regex = re.sub(r' +', r'\\s+', template_regex) template_regex = "^" + template_regex.replace("\<\*\>", "(.*?)") + "$" parameter_list = re.findall(template_regex, row["Content"]) parameter_list = parameter_list[0] if parameter_list else () parameter_list = list(parameter_list) if isinstance(parameter_list, tuple) else [parameter_list] return parameter_list