| """ | |
| Description : This file implements the Spell algorithm for log parsing | |
| Author : LogPAI team | |
| License : MIT | |
| """ | |
| import sys | |
| # import re | |
| import regex as re | |
| import os | |
| import numpy as np | |
| import pandas as pd | |
| import hashlib | |
| from datetime import datetime | |
| import string | |
| from tqdm import tqdm | |


class LCSObject:
    """ Class object to store a log group with the same template
    """

    def __init__(self, logTemplate='', logIDL=None):
        self.logTemplate = logTemplate
        # Default to None instead of [] so instances never share one mutable list
        self.logIDL = logIDL if logIDL is not None else []


class Node:
    """ A node in the prefix tree data structure
    """

    def __init__(self, token='', templateNo=0):
        self.logClust = None
        self.token = token
        self.templateNo = templateNo
        self.childD = dict()


class LogParser:
    """ LogParser class

    Attributes
    ----------
    path : the path of the input directory
    logname : the file name of the input log file
    savePath : the path of the output directory
    tau : the fraction of tokens that must match for a log message to be
          merged into an existing log cluster
    """

    def __init__(self, indir='./', outdir='./result/', log_format=None, tau=0.5, rex=[], keep_para=True):
        self.path = indir
        self.logname = None
        self.savePath = outdir
        self.tau = tau
        self.logformat = log_format
        self.df_log = None
        self.rex = rex
        self.keep_para = keep_para

    def LCS(self, seq1, seq2):
        lengths = [[0 for j in range(len(seq2) + 1)] for i in range(len(seq1) + 1)]
        # row 0 and column 0 are initialized to 0 already
        for i in range(len(seq1)):
            for j in range(len(seq2)):
                if seq1[i] == seq2[j]:
                    lengths[i + 1][j + 1] = lengths[i][j] + 1
                else:
                    lengths[i + 1][j + 1] = max(lengths[i + 1][j], lengths[i][j + 1])

        # read a longest common subsequence out from the matrix
        result = []
        lenOfSeq1, lenOfSeq2 = len(seq1), len(seq2)
        while lenOfSeq1 != 0 and lenOfSeq2 != 0:
            if lengths[lenOfSeq1][lenOfSeq2] == lengths[lenOfSeq1 - 1][lenOfSeq2]:
                lenOfSeq1 -= 1
            elif lengths[lenOfSeq1][lenOfSeq2] == lengths[lenOfSeq1][lenOfSeq2 - 1]:
                lenOfSeq2 -= 1
            else:
                assert seq1[lenOfSeq1 - 1] == seq2[lenOfSeq2 - 1]
                result.insert(0, seq1[lenOfSeq1 - 1])
                lenOfSeq1 -= 1
                lenOfSeq2 -= 1
        return result
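
    # Example (illustrative call, not part of the original file):
    #   self.LCS(['Send', '100', 'bytes'], ['Send', '200', 'bytes'])
    # returns ['Send', 'bytes'], a longest common subsequence of the two
    # token sequences.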

    # For each sequence, find a matching log key (template), if any
    def SimpleLoopMatch(self, logClustL, seq):
        for logClust in logClustL:
            if float(len(logClust.logTemplate)) < 0.5 * len(seq):
                continue
            # Check whether the template is a subsequence of seq. Set membership
            # is used as a proxy for speed, since incorrectly ordered matches
            # rarely occur in logs.
            token_set = set(seq)
            if all(token in token_set or token == '<*>' for token in logClust.logTemplate):
                return logClust
        return None
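
    # Example (illustrative): the template ['Send', '<*>', 'bytes'] matches
    # seq ['Send', '500', 'bytes'], since every constant token of the template
    # appears in seq's token set; token order is not re-checked here.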

    def PrefixTreeMatch(self, parentn, seq, idx):
        retLogClust = None
        length = len(seq)
        for i in range(idx, length):
            if seq[i] in parentn.childD:
                childn = parentn.childD[seq[i]]
                if childn.logClust is not None:
                    constLM = [w for w in childn.logClust.logTemplate if w != '<*>']
                    if float(len(constLM)) >= self.tau * length:
                        return childn.logClust
                else:
                    return self.PrefixTreeMatch(childn, seq, i + 1)
        return retLogClust
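
    # Example (illustrative): with tau=0.5 and a 4-token constant sequence, a
    # cluster found along the prefix-tree path is returned only if its template
    # contains at least 2 constant tokens (tau * 4).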

    # For each sequence, find the best matching log template using the LCS
    def LCSMatch(self, logClustL, seq):
        retLogClust = None
        maxLen = -1
        maxlcs = []
        maxClust = None
        set_seq = set(seq)
        size_seq = len(seq)
        for logClust in logClustL:
            set_template = set(logClust.logTemplate)
            if len(set_seq & set_template) < 0.5 * size_seq:
                continue
            lcs = self.LCS(seq, logClust.logTemplate)
            if len(lcs) > maxLen or (len(lcs) == maxLen and len(logClust.logTemplate) < len(maxClust.logTemplate)):
                maxLen = len(lcs)
                maxlcs = lcs
                maxClust = logClust

        # The LCS must be at least tau * len(seq) tokens long to accept a match
        if float(maxLen) >= self.tau * size_seq:
            retLogClust = maxClust
        return retLogClust
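
    # Example (illustrative): with tau=0.5 and a 6-token message, a cluster is
    # accepted only if its LCS with the message is at least 3 tokens long; ties
    # on LCS length prefer the cluster with the shorter template.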

    def getTemplate(self, lcs, seq):
        retVal = []
        if not lcs:
            return retVal

        lcs = lcs[::-1]
        i = 0
        for token in seq:
            i += 1
            if token == lcs[-1]:
                retVal.append(token)
                lcs.pop()
            else:
                retVal.append('<*>')
            if not lcs:
                break
        if i < len(seq):
            retVal.append('<*>')
        return retVal
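
    # Example (illustrative call, not part of the original file):
    #   self.getTemplate(['Send', 'bytes'], ['Send', '100', 'bytes'])
    # returns ['Send', '<*>', 'bytes']: tokens of seq outside the LCS are
    # replaced by the '<*>' wildcard.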

    def addSeqToPrefixTree(self, rootn, newCluster):
        parentn = rootn
        seq = newCluster.logTemplate
        seq = [w for w in seq if w != '<*>']
        for i in range(len(seq)):
            tokenInSeq = seq[i]
            # Match
            if tokenInSeq in parentn.childD:
                parentn.childD[tokenInSeq].templateNo += 1
            # Do not match
            else:
                parentn.childD[tokenInSeq] = Node(token=tokenInSeq, templateNo=1)
            parentn = parentn.childD[tokenInSeq]
        if parentn.logClust is None:
            parentn.logClust = newCluster

    def removeSeqFromPrefixTree(self, rootn, newCluster):
        parentn = rootn
        seq = newCluster.logTemplate
        seq = [w for w in seq if w != '<*>']
        for tokenInSeq in seq:
            if tokenInSeq in parentn.childD:
                matchedNode = parentn.childD[tokenInSeq]
                if matchedNode.templateNo == 1:
                    del parentn.childD[tokenInSeq]
                    break
                else:
                    matchedNode.templateNo -= 1
                    parentn = matchedNode

    def outputResult(self, logClustL):
        print("output result", self.savePath)
        templates = [0] * self.df_log.shape[0]
        ids = [0] * self.df_log.shape[0]
        df_event = []

        for logclust in tqdm(logClustL):
            template_str = ' '.join(logclust.logTemplate)
            eid = hashlib.md5(template_str.encode('utf-8')).hexdigest()[0:8]
            for logid in logclust.logIDL:
                templates[logid - 1] = template_str
                ids[logid - 1] = eid
            df_event.append([eid, template_str, len(logclust.logIDL)])

        df_event = pd.DataFrame(df_event, columns=['EventId', 'EventTemplate', 'Occurrences'])

        self.df_log['EventId'] = ids
        self.df_log['EventTemplate'] = templates
        if self.keep_para:
            self.df_log["ParameterList"] = self.df_log.apply(self.get_parameter_list, axis=1)
        self.df_log.to_csv(os.path.join(self.savePath, self.logname + '_structured.csv'), index=False)
        df_event.to_csv(os.path.join(self.savePath, self.logname + '_templates.csv'), index=False)

    def printTree(self, node, dep):
        pStr = ''
        # dep is the depth of the node, so indent by dep tab stops
        # (range(dep), not range(len(dep)): dep is an int)
        for i in range(dep):
            pStr += '\t'

        if node.token == '':
            pStr += 'Root'
        else:
            pStr += node.token
            if node.logClust is not None:
                pStr += '-->' + ' '.join(node.logClust.logTemplate)
        print(pStr + ' (' + str(node.templateNo) + ')')

        for child in node.childD:
            self.printTree(node.childD[child], dep + 1)

    def parse(self, logname):
        starttime = datetime.now()
        print('Parsing file: ' + os.path.join(self.path, logname))
        self.logname = logname
        self.load_data()
        rootNode = Node()
        logCluL = []

        # Split log content on punctuation, excluding the '<', '*', '>' used by
        # the wildcard token; re.escape keeps the character class well-formed
        punc = re.sub('[<*>]', '', string.punctuation)
        count = 0
        for idx, line in self.df_log.iterrows():
            logID = line['LineId']
            logmessageL = list(filter(lambda x: x.strip() != '',
                                      re.split('[' + re.escape(punc) + ']', self.preprocess(line['Content']))))
            constLogMessL = [w for w in logmessageL if w != '<*>']

            # Find an existing matched log cluster
            matchCluster = self.PrefixTreeMatch(rootNode, constLogMessL, 0)
            if matchCluster is None:
                matchCluster = self.SimpleLoopMatch(logCluL, constLogMessL)
                if matchCluster is None:
                    matchCluster = self.LCSMatch(logCluL, logmessageL)
                    # No existing log cluster matches: create a new one
                    if matchCluster is None:
                        newCluster = LCSObject(logTemplate=logmessageL, logIDL=[logID])
                        logCluL.append(newCluster)
                        self.addSeqToPrefixTree(rootNode, newCluster)
                    # Add the new log message to the existing cluster
                    else:
                        newTemplate = self.getTemplate(self.LCS(logmessageL, matchCluster.logTemplate),
                                                       matchCluster.logTemplate)
                        if ' '.join(newTemplate) != ' '.join(matchCluster.logTemplate):
                            self.removeSeqFromPrefixTree(rootNode, matchCluster)
                            matchCluster.logTemplate = newTemplate
                            self.addSeqToPrefixTree(rootNode, matchCluster)
            if matchCluster:
                matchCluster.logIDL.append(logID)
            count += 1
            if count % 1000 == 0 or count == len(self.df_log):
                print('Processed {0:.1f}% of log lines.'.format(count * 100.0 / len(self.df_log)), end='\r')

        if not os.path.exists(self.savePath):
            os.makedirs(self.savePath)
        self.outputResult(logCluL)
        print('Parsing done. [Time taken: {!s}]'.format(datetime.now() - starttime))

    def load_data(self):
        headers, regex = self.generate_logformat_regex(self.logformat)
        self.df_log = self.log_to_dataframe(os.path.join(self.path, self.logname), regex, headers, self.logformat)

    def preprocess(self, line):
        for currentRex in self.rex:
            line = re.sub(currentRex, '<*>', line)
        return line
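
    # Example (illustrative): with rex=[r'blk_-?\d+'], the line
    # 'Received block blk_3587508140051953248' is rewritten to
    # 'Received block <*>' before tokenization. The pattern is an assumption
    # for illustration, not shipped with this file.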

    def log_to_dataframe(self, log_file, regex, headers, logformat):
        """ Function to transform a log file into a dataframe
        """
        log_messages = []
        linecount = 0
        k = 0
        with open(log_file, 'r') as fin:
            for line in fin.readlines():
                # Print progress every 10000 lines
                k += 1
                if k % 10000 == 0:
                    print("extracted {0} log lines from {1}".format(k, log_file))
                # Replace non-ASCII (outside \x00-\x7F) characters with <NASCII>
                line = re.sub(r'[^\x00-\x7F]+', '<NASCII>', line)
                try:
                    match = regex.search(line.strip())
                    message = [match.group(header) for header in headers]
                    log_messages.append(message)
                    linecount += 1
                except Exception:
                    # Skip lines that do not match the log format
                    pass
        logdf = pd.DataFrame(log_messages, columns=headers)
        logdf.insert(0, 'LineId', None)
        logdf['LineId'] = [i + 1 for i in range(linecount)]
        return logdf

    def generate_logformat_regex(self, logformat):
        """ Function to generate a regular expression to split log messages
        """
        headers = []
        splitters = re.split(r'(<[^<>]+>)', logformat)
        regex = ''
        for k in range(len(splitters)):
            if k % 2 == 0:
                # Literal text between fields: collapse runs of spaces into \s+
                splitter = re.sub(' +', r'\\s+', splitters[k])
                regex += splitter
            else:
                # A <Field> placeholder becomes a named capture group
                header = splitters[k].strip('<').strip('>')
                regex += '(?P<%s>.*?)' % header
                headers.append(header)
        regex = re.compile('^' + regex + '$')
        return headers, regex
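
    # Example (illustrative): logformat '<Date> <Time> <Content>' yields
    # headers ['Date', 'Time', 'Content'] and the compiled pattern
    #   ^(?P<Date>.*?)\s+(?P<Time>.*?)\s+(?P<Content>.*?)$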

    def get_parameter_list(self, row):
        template_regex = re.sub(r"\s<.{1,5}>\s", "<*>", row["EventTemplate"])
        if "<*>" not in template_regex:
            return []
        template_regex = re.sub(r'([^A-Za-z0-9])', r'\\\1', template_regex)
        template_regex = re.sub(r'\\ +', r'[^A-Za-z0-9]+', template_regex)
        template_regex = "^" + template_regex.replace(r"\<\*\>", "(.*?)") + "$"
        parameter_list = re.findall(template_regex, row["Content"])
        parameter_list = parameter_list[0] if parameter_list else ()
        parameter_list = list(parameter_list) if isinstance(parameter_list, tuple) else [parameter_list]
        parameter_list = [para.strip(string.punctuation).strip(' ') for para in parameter_list]
        return parameter_list


if __name__ == "__main__":
    print(os.getcwd())
    os.chdir("../")
    print(os.getcwd())
    lp = LogParser()
    # print(lp.LCS(seq1="abcbb", seq2="bc"))
    output_dir = 'demo/Spell_result/'  # The output directory of parsing results
    log_file = 'HDFS.log'  # The input log file name
    log_structured_file = output_dir + log_file + "_structured.csv"
    df = pd.read_csv(log_structured_file)
    for _, row in df.iterrows():
        lp.get_parameter_list(row)
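
    # A fuller end-to-end sketch (illustrative only; the directory layout, log
    # format, and rex patterns below are assumptions, not guaranteed by this
    # file):
    #
    #   parser = LogParser(indir='demo/', outdir=output_dir,
    #                      log_format='<Date> <Time> <Pid> <Level> <Component>: <Content>',
    #                      tau=0.5, rex=[r'blk_-?\d+', r'(\d+\.){3}\d+(:\d+)?'])
    #   parser.parse(log_file)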