# MukeshKapoor25's picture
# gitignore
# 3f90381
"""
Description : This file implements the Drain algorithm for log parsing
Author : LogPAI team
License : MIT
"""
import re
import os
import numpy as np
import pandas as pd
import hashlib
from datetime import datetime
class Logcluster:
    """A cluster of log messages that share one parsed template.

    Attributes
    ----------
    logTemplate : list of str
        Token sequence of the template; '<*>' marks a parameter position.
    logIDL : list
        LineIds of every raw log message assigned to this cluster.
    """

    def __init__(self, logTemplate='', logIDL=None):
        self.logTemplate = logTemplate
        # Build a fresh list per instance (avoids the shared mutable default).
        self.logIDL = [] if logIDL is None else logIDL
class Node:
    """One node of the Drain prefix tree.

    Attributes
    ----------
    childD : dict or list
        Children keyed by token; at the leaf layer this becomes a plain
        list of Logcluster objects.
    depth : int
        Depth of this node in the tree (root is 0).
    digitOrtoken :
        The token this node represents (or the sequence length at layer 1).
    """

    def __init__(self, childD=None, depth=0, digitOrtoken=None):
        self.childD = {} if childD is None else childD
        self.depth = depth
        self.digitOrtoken = digitOrtoken
class LogParser:
def __init__(self, log_format, indir='./', outdir='./result/', depth=4, st=0.4,
maxChild=100, rex=[], keep_para=True):
"""
Attributes
----------
rex : regular expressions used in preprocessing (step1)
path : the input path stores the input log file name
depth : depth of all leaf nodes
st : similarity threshold
maxChild : max number of children of an internal node
logName : the name of the input file containing raw log messages
savePath : the output path stores the file containing structured logs
"""
self.path = indir
self.depth = depth - 2
self.st = st
self.maxChild = maxChild
self.logName = None
self.savePath = outdir
self.df_log = None
self.log_format = log_format
self.rex = rex
self.keep_para = keep_para
def hasNumbers(self, s):
return any(char.isdigit() for char in s)
def treeSearch(self, rn, seq):
retLogClust = None
seqLen = len(seq)
if seqLen not in rn.childD:
return retLogClust
parentn = rn.childD[seqLen]
currentDepth = 1
for token in seq:
if currentDepth >= self.depth or currentDepth > seqLen:
break
if token in parentn.childD:
parentn = parentn.childD[token]
elif '<*>' in parentn.childD:
parentn = parentn.childD['<*>']
else:
return retLogClust
currentDepth += 1
logClustL = parentn.childD
retLogClust = self.fastMatch(logClustL, seq)
return retLogClust
def addSeqToPrefixTree(self, rn, logClust):
seqLen = len(logClust.logTemplate)
if seqLen not in rn.childD:
firtLayerNode = Node(depth=1, digitOrtoken=seqLen)
rn.childD[seqLen] = firtLayerNode
else:
firtLayerNode = rn.childD[seqLen]
parentn = firtLayerNode
currentDepth = 1
for token in logClust.logTemplate:
# Add current log cluster to the leaf node
if currentDepth >= self.depth or currentDepth > seqLen:
if len(parentn.childD) == 0:
parentn.childD = [logClust]
else:
parentn.childD.append(logClust)
break
# If token not matched in this layer of existing tree.
if token not in parentn.childD:
if not self.hasNumbers(token):
if '<*>' in parentn.childD:
if len(parentn.childD) < self.maxChild:
newNode = Node(depth=currentDepth + 1, digitOrtoken=token)
parentn.childD[token] = newNode
parentn = newNode
else:
parentn = parentn.childD['<*>']
else:
if len(parentn.childD) + 1 < self.maxChild:
newNode = Node(depth=currentDepth + 1, digitOrtoken=token)
parentn.childD[token] = newNode
parentn = newNode
elif len(parentn.childD) + 1 == self.maxChild:
newNode = Node(depth=currentDepth + 1, digitOrtoken='<*>')
parentn.childD['<*>'] = newNode
parentn = newNode
else:
parentn = parentn.childD['<*>']
else:
if '<*>' not in parentn.childD:
newNode = Node(depth=currentDepth + 1, digitOrtoken='<*>')
parentn.childD['<*>'] = newNode
parentn = newNode
else:
parentn = parentn.childD['<*>']
# If the token is matched
else:
parentn = parentn.childD[token]
currentDepth += 1
# seq1 is template
def seqDist(self, seq1, seq2):
assert len(seq1) == len(seq2)
simTokens = 0
numOfPar = 0
for token1, token2 in zip(seq1, seq2):
if token1 == '<*>':
numOfPar += 1
continue #comment@haixuanguo: <*> == <*> are similar pairs
if token1 == token2:
simTokens += 1
retVal = float(simTokens) / len(seq1)
return retVal, numOfPar
def fastMatch(self, logClustL, seq):
retLogClust = None
maxSim = -1
maxNumOfPara = -1
maxClust = None
for logClust in logClustL:
curSim, curNumOfPara = self.seqDist(logClust.logTemplate, seq)
if curSim > maxSim or (curSim == maxSim and curNumOfPara > maxNumOfPara):
maxSim = curSim
maxNumOfPara = curNumOfPara
maxClust = logClust
if maxSim >= self.st:
retLogClust = maxClust
return retLogClust
def getTemplate(self, seq1, seq2):
assert len(seq1) == len(seq2)
retVal = []
i = 0
for word in seq1:
if word == seq2[i]:
retVal.append(word)
else:
retVal.append('<*>')
i += 1
return retVal
def outputResult(self, logClustL):
log_templates = [0] * self.df_log.shape[0]
log_templateids = [0] * self.df_log.shape[0]
df_events = []
for logClust in logClustL:
template_str = ' '.join(logClust.logTemplate)
occurrence = len(logClust.logIDL)
template_id = hashlib.md5(template_str.encode('utf-8')).hexdigest()[0:8]
for logID in logClust.logIDL:
logID -= 1
log_templates[logID] = template_str
log_templateids[logID] = template_id
df_events.append([template_id, template_str, occurrence])
df_event = pd.DataFrame(df_events, columns=['EventId', 'EventTemplate', 'Occurrences'])
self.df_log['EventId'] = log_templateids
self.df_log['EventTemplate'] = log_templates
if self.keep_para:
self.df_log["ParameterList"] = self.df_log.apply(self.get_parameter_list, axis=1)
self.df_log.to_csv(os.path.join(self.savePath, self.logName + '_structured.csv'), index=False)
occ_dict = dict(self.df_log['EventTemplate'].value_counts())
df_event = pd.DataFrame()
df_event['EventTemplate'] = self.df_log['EventTemplate'].unique()
df_event['EventId'] = df_event['EventTemplate'].map(lambda x: hashlib.md5(str(x).encode('utf-8')).hexdigest()[0:8])
df_event['Occurrences'] = df_event['EventTemplate'].map(occ_dict)
df_event.to_csv(os.path.join(self.savePath, self.logName + '_templates.csv'), index=False,
columns=["EventId", "EventTemplate", "Occurrences"])
def printTree(self, node, dep):
pStr = ''
for i in range(dep):
pStr += '\t'
if node.depth == 0:
pStr += 'Root'
elif node.depth == 1:
pStr += '<' + str(node.digitOrtoken) + '>'
else:
pStr += node.digitOrtoken
print(pStr)
if node.depth == self.depth:
return 1
for child in node.childD:
self.printTree(node.childD[child], dep + 1)
def parse(self, logName):
print('Parsing file: ' + os.path.join(self.path, logName))
start_time = datetime.now()
self.logName = logName
rootNode = Node()
logCluL = []
self.load_data()
count = 0
for idx, line in self.df_log.iterrows():
logID = line['LineId']
logmessageL = self.preprocess(line['Content']).strip().split()
# logmessageL = filter(lambda x: x != '', re.split('[\s=:,]', self.preprocess(line['Content'])))
matchCluster = self.treeSearch(rootNode, logmessageL)
# Match no existing log cluster
if matchCluster is None:
newCluster = Logcluster(logTemplate=logmessageL, logIDL=[logID])
logCluL.append(newCluster)
self.addSeqToPrefixTree(rootNode, newCluster)
# Add the new log message to the existing cluster
else:
newTemplate = self.getTemplate(logmessageL, matchCluster.logTemplate)
matchCluster.logIDL.append(logID)
if ' '.join(newTemplate) != ' '.join(matchCluster.logTemplate):
matchCluster.logTemplate = newTemplate
count += 1
if count % 1000 == 0 or count == len(self.df_log):
print('Processed {0:.1f}% of log lines.'.format(count * 100.0 / len(self.df_log)), end='\r')
if not os.path.exists(self.savePath):
os.makedirs(self.savePath)
self.outputResult(logCluL)
print('Parsing done. [Time taken: {!s}]'.format(datetime.now() - start_time))
def load_data(self):
headers, regex = self.generate_logformat_regex(self.log_format)
self.df_log = self.log_to_dataframe(os.path.join(self.path, self.logName), regex, headers, self.log_format)
def preprocess(self, line):
for currentRex in self.rex:
line = re.sub(currentRex, '<*>', line)
return line
def log_to_dataframe(self, log_file, regex, headers, logformat):
""" Function to transform log file to dataframe
"""
log_messages = []
linecount = 0
cnt = 0
with open(log_file, 'r') as fin:
for line in fin.readlines():
cnt += 1
try:
match = regex.search(line.strip())
message = [match.group(header) for header in headers]
log_messages.append(message)
linecount += 1
except Exception as e:
# print("\n", line)
# print(e)
pass
print("Total size after encoding is", linecount, cnt)
logdf = pd.DataFrame(log_messages, columns=headers)
logdf.insert(0, 'LineId', None)
logdf['LineId'] = [i + 1 for i in range(linecount)]
return logdf
def generate_logformat_regex(self, logformat):
""" Function to generate regular expression to split log messages
"""
headers = []
splitters = re.split(r'(<[^<>]+>)', logformat)
regex = ''
for k in range(len(splitters)):
if k % 2 == 0:
splitter = re.sub(' +', '\\\s+', splitters[k])
regex += splitter
else:
header = splitters[k].strip('<').strip('>')
regex += '(?P<%s>.*?)' % header
headers.append(header)
regex = re.compile('^' + regex + '$')
return headers, regex
def get_parameter_list(self, row):
template_regex = re.sub(r"<.{1,5}>", "<*>", str(row["EventTemplate"]))
if "<*>" not in template_regex: return []
template_regex = re.sub(r'([^A-Za-z0-9])', r'\\\1', template_regex)
template_regex = re.sub(r' +', r'\\s+', template_regex)
template_regex = "^" + template_regex.replace("\<\*\>", "(.*?)") + "$"
parameter_list = re.findall(template_regex, row["Content"])
parameter_list = parameter_list[0] if parameter_list else ()
parameter_list = list(parameter_list) if isinstance(parameter_list, tuple) else [parameter_list]
return parameter_list