MukeshKapoor25 commited on
Commit
3f90381
·
1 Parent(s): 9a2df03

gitignore

Browse files
.gitignore CHANGED
@@ -1,3 +1,12 @@
1
  model/bert/best_center.pt
2
  model/bert/best_total_dist.pt
3
  model/bert/best_bert.pth
 
 
 
 
 
 
 
 
 
 
1
  model/bert/best_center.pt
2
  model/bert/best_total_dist.pt
3
  model/bert/best_bert.pth
4
+ model/bert/train_log2.csv
5
+ model/bert/train_valid_loss.png
6
+ model/bert/valid_log2.csv
7
+ bert_pytorch/trainer/optim_schedule.py
8
+ bert_pytorch/trainer/pretrain.py
9
+ model/bert/test_abnormal_errors.pkl
10
+ model/bert/test_abnormal_results
11
+ model/bert/test_normal_errors.pkl
12
+ model/bert/test_normal_results
bert_pytorch/model/attention/multi_head.py ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch.nn as nn
2
+ from .single import Attention
3
+
4
+
5
class MultiHeadedAttention(nn.Module):
    """Multi-head scaled dot-product attention.

    Splits the model dimension into ``h`` independent heads, runs scaled
    dot-product attention on every head in parallel, then recombines the
    heads through a final linear projection.
    """

    def __init__(self, h, d_model, dropout=0.1):
        super().__init__()
        # The model dimension must divide evenly across the heads.
        assert d_model % h == 0

        # We assume d_v always equals d_k.
        self.d_k = d_model // h
        self.h = h

        # Three projections: one each for query, key and value.
        self.linear_layers = nn.ModuleList(nn.Linear(d_model, d_model) for _ in range(3))
        self.output_linear = nn.Linear(d_model, d_model)
        self.attention = Attention()

        self.dropout = nn.Dropout(p=dropout)

    def forward(self, query, key, value, mask=None):
        n_batches = query.size(0)

        # 1) Project each input and split the last dimension into heads:
        #    (batch, seq, d_model) -> (batch, h, seq, d_k).
        projected = []
        for proj, tensor in zip(self.linear_layers, (query, key, value)):
            heads = proj(tensor).view(n_batches, -1, self.h, self.d_k)
            projected.append(heads.transpose(1, 2))
        query, key, value = projected

        # 2) Attend over all heads at once.
        context, _ = self.attention(query, key, value, mask=mask, dropout=self.dropout)

        # 3) Merge the heads back: (batch, h, seq, d_k) -> (batch, seq, h*d_k).
        context = context.transpose(1, 2).contiguous().view(n_batches, -1, self.h * self.d_k)

        return self.output_linear(context)
bert_pytorch/model/attention/single.py ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch.nn as nn
2
+ import torch.nn.functional as F
3
+ import torch
4
+
5
+ import math
6
+
7
+
8
class Attention(nn.Module):
    """Scaled dot-product attention.

    Computes ``softmax(Q K^T / sqrt(d_k)) V`` and also returns the
    attention weight matrix.
    """

    def forward(self, query, key, value, mask=None, dropout=None):
        d_k = query.size(-1)
        # Similarity of every query against every key, scaled by sqrt(d_k).
        scores = query.matmul(key.transpose(-2, -1)) / math.sqrt(d_k)

        if mask is not None:
            # Push masked-out positions to effectively zero probability.
            scores = scores.masked_fill(mask == 0, -1e9)

        weights = F.softmax(scores, dim=-1)

        if dropout is not None:
            weights = dropout(weights)

        return weights.matmul(value), weights
bert_pytorch/model/utils/feed_forward.py ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch.nn as nn
2
+ from .gelu import GELU
3
+
4
+
5
class PositionwiseFeedForward(nn.Module):
    """Position-wise feed-forward network: ``W2(dropout(GELU(W1 x)))``."""

    def __init__(self, d_model, d_ff, dropout=0.1):
        super().__init__()
        self.w_1 = nn.Linear(d_model, d_ff)   # expand to the inner dimension
        self.w_2 = nn.Linear(d_ff, d_model)   # project back to the model dimension
        self.dropout = nn.Dropout(dropout)
        self.activation = GELU()

    def forward(self, x):
        hidden = self.activation(self.w_1(x))
        return self.w_2(self.dropout(hidden))
bert_pytorch/model/utils/gelu.py ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch.nn as nn
2
+ import torch
3
+ import math
4
+
5
+
6
class GELU(nn.Module):
    """Gaussian Error Linear Unit, tanh approximation.

    BERT uses GELU instead of ReLU (paper section 3.4, last paragraph).
    """

    def forward(self, x):
        # 0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x^3)))
        inner = math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))
        return 0.5 * x * (1 + torch.tanh(inner))
bert_pytorch/model/utils/layer_norm.py ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch.nn as nn
2
+ import torch
3
+
4
+
5
class LayerNorm(nn.Module):
    """Layer normalisation over the last dimension with learnable gain/bias.

    NOTE(review): ``torch.std`` defaults to the unbiased (Bessel-corrected)
    estimator and eps is added to the std (not the variance), so this is not
    numerically identical to ``nn.LayerNorm`` — confirm this is intended.
    """

    def __init__(self, features, eps=1e-6):
        super().__init__()
        self.a_2 = nn.Parameter(torch.ones(features))   # learnable gain
        self.b_2 = nn.Parameter(torch.zeros(features))  # learnable bias
        self.eps = eps  # guards against division by zero

    def forward(self, x):
        mu = x.mean(-1, keepdim=True)
        sigma = x.std(-1, keepdim=True)
        # Normalise over the last dimension, then rescale and shift.
        return self.a_2 * (x - mu) / (sigma + self.eps) + self.b_2
bert_pytorch/model/utils/sublayer.py ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch.nn as nn
2
+ from .layer_norm import LayerNorm
3
+
4
+
5
class SublayerConnection(nn.Module):
    """Pre-norm residual wrapper: ``x + dropout(sublayer(norm(x)))``.

    Note that for code simplicity the norm is applied first rather than last.
    """

    def __init__(self, size, dropout):
        super().__init__()
        self.norm = LayerNorm(size)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, sublayer):
        """Apply the residual connection to any sublayer of the same size."""
        residual = self.dropout(sublayer(self.norm(x)))
        return x + residual
logparser/Drain.py ADDED
@@ -0,0 +1,345 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Description : This file implements the Drain algorithm for log parsing
3
+ Author : LogPAI team
4
+ License : MIT
5
+ """
6
+
7
+ import re
8
+ import os
9
+ import numpy as np
10
+ import pandas as pd
11
+ import hashlib
12
+ from datetime import datetime
13
+
14
+
15
class Logcluster:
    """A group of log lines that share one parsed template.

    Attributes:
        logTemplate: token sequence describing the shared template.
        logIDL: LineIds of all log messages assigned to this cluster.
    """

    def __init__(self, logTemplate='', logIDL=None):
        self.logTemplate = logTemplate
        # Build a fresh list per instance; a shared default would leak state.
        self.logIDL = [] if logIDL is None else logIDL
21
+
22
+
23
class Node:
    """One node of Drain's fixed-depth prefix tree.

    Attributes:
        childD: children keyed by token (at the leaf layer it becomes a
            list of Logcluster objects instead).
        depth: depth of this node within the tree (root is 0).
        digitOrtoken: sequence length at depth 1, token text below that.
    """

    def __init__(self, childD=None, depth=0, digitOrtoken=None):
        # Build a fresh dict per instance; a shared default would leak state.
        self.childD = {} if childD is None else childD
        self.depth = depth
        self.digitOrtoken = digitOrtoken
30
+
31
+
32
class LogParser:
    """Drain: fixed-depth prefix-tree log parser (LogPAI implementation)."""

    def __init__(self, log_format, indir='./', outdir='./result/', depth=4, st=0.4,
                 maxChild=100, rex=[], keep_para=True):
        """
        Attributes
        ----------
        rex : regular expressions used in preprocessing (step1)
        path : the input path stores the input log file name
        depth : depth of all leaf nodes
        st : similarity threshold
        maxChild : max number of children of an internal node
        logName : the name of the input file containing raw log messages
        savePath : the output path stores the file containing structured logs
        """
        self.path = indir
        # Internal depth excludes the root and the sequence-length layer.
        self.depth = depth - 2
        self.st = st
        self.maxChild = maxChild
        self.logName = None
        self.savePath = outdir
        self.df_log = None
        self.log_format = log_format
        # NOTE(review): mutable default rex=[] is shared across instances;
        # safe only while it is never mutated in place — confirm.
        self.rex = rex
        self.keep_para = keep_para

    def hasNumbers(self, s):
        # True if any character of s is a digit.
        return any(char.isdigit() for char in s)

    def treeSearch(self, rn, seq):
        """Descend the prefix tree from root ``rn`` and return the best
        matching cluster for token sequence ``seq`` (None if no match)."""
        retLogClust = None

        seqLen = len(seq)
        # The first tree layer is keyed by sequence length.
        if seqLen not in rn.childD:
            return retLogClust

        parentn = rn.childD[seqLen]

        currentDepth = 1
        for token in seq:
            if currentDepth >= self.depth or currentDepth > seqLen:
                break

            if token in parentn.childD:
                parentn = parentn.childD[token]
            elif '<*>' in parentn.childD:
                # Fall back to the wildcard branch.
                parentn = parentn.childD['<*>']
            else:
                return retLogClust
            currentDepth += 1

        # Leaf reached: childD now holds a list of candidate clusters.
        logClustL = parentn.childD

        retLogClust = self.fastMatch(logClustL, seq)

        return retLogClust

    def addSeqToPrefixTree(self, rn, logClust):
        """Insert ``logClust``'s template into the prefix tree rooted at ``rn``."""
        seqLen = len(logClust.logTemplate)
        if seqLen not in rn.childD:
            firtLayerNode = Node(depth=1, digitOrtoken=seqLen)
            rn.childD[seqLen] = firtLayerNode
        else:
            firtLayerNode = rn.childD[seqLen]

        parentn = firtLayerNode

        currentDepth = 1
        for token in logClust.logTemplate:

            # Add current log cluster to the leaf node
            if currentDepth >= self.depth or currentDepth > seqLen:
                if len(parentn.childD) == 0:
                    # Leaf storage switches from dict to list of clusters.
                    parentn.childD = [logClust]
                else:
                    parentn.childD.append(logClust)
                break

            # If token not matched in this layer of existing tree.
            if token not in parentn.childD:
                if not self.hasNumbers(token):
                    if '<*>' in parentn.childD:
                        if len(parentn.childD) < self.maxChild:
                            newNode = Node(depth=currentDepth + 1, digitOrtoken=token)
                            parentn.childD[token] = newNode
                            parentn = newNode
                        else:
                            parentn = parentn.childD['<*>']
                    else:
                        if len(parentn.childD) + 1 < self.maxChild:
                            newNode = Node(depth=currentDepth + 1, digitOrtoken=token)
                            parentn.childD[token] = newNode
                            parentn = newNode
                        elif len(parentn.childD) + 1 == self.maxChild:
                            # Last free slot becomes the wildcard branch.
                            newNode = Node(depth=currentDepth + 1, digitOrtoken='<*>')
                            parentn.childD['<*>'] = newNode
                            parentn = newNode
                        else:
                            parentn = parentn.childD['<*>']

                else:
                    # Tokens containing digits are routed to the wildcard branch.
                    if '<*>' not in parentn.childD:
                        newNode = Node(depth=currentDepth + 1, digitOrtoken='<*>')
                        parentn.childD['<*>'] = newNode
                        parentn = newNode
                    else:
                        parentn = parentn.childD['<*>']

            # If the token is matched
            else:
                parentn = parentn.childD[token]

            currentDepth += 1

    # seq1 is template
    def seqDist(self, seq1, seq2):
        """Return (similarity ratio, number of wildcards) between template
        ``seq1`` and message token sequence ``seq2`` (equal lengths)."""
        assert len(seq1) == len(seq2)
        simTokens = 0
        numOfPar = 0

        for token1, token2 in zip(seq1, seq2):
            if token1 == '<*>':
                numOfPar += 1
                continue  # comment@haixuanguo: <*> == <*> are similar pairs
            if token1 == token2:
                simTokens += 1

        retVal = float(simTokens) / len(seq1)

        return retVal, numOfPar

    def fastMatch(self, logClustL, seq):
        """Pick the cluster whose template is most similar to ``seq``;
        return None unless the similarity reaches the threshold ``st``."""
        retLogClust = None

        maxSim = -1
        maxNumOfPara = -1
        maxClust = None

        for logClust in logClustL:
            curSim, curNumOfPara = self.seqDist(logClust.logTemplate, seq)
            # Ties are broken by preferring the template with more wildcards.
            if curSim > maxSim or (curSim == maxSim and curNumOfPara > maxNumOfPara):
                maxSim = curSim
                maxNumOfPara = curNumOfPara
                maxClust = logClust

        if maxSim >= self.st:
            retLogClust = maxClust

        return retLogClust

    def getTemplate(self, seq1, seq2):
        """Merge two equal-length sequences: keep agreeing tokens, replace
        disagreements with the '<*>' wildcard."""
        assert len(seq1) == len(seq2)
        retVal = []

        i = 0
        for word in seq1:
            if word == seq2[i]:
                retVal.append(word)
            else:
                retVal.append('<*>')

            i += 1

        return retVal

    def outputResult(self, logClustL):
        """Write <log>_structured.csv and <log>_templates.csv to savePath."""
        log_templates = [0] * self.df_log.shape[0]
        log_templateids = [0] * self.df_log.shape[0]
        df_events = []
        for logClust in logClustL:
            template_str = ' '.join(logClust.logTemplate)
            occurrence = len(logClust.logIDL)
            # Event id is the first 8 hex chars of the template's MD5.
            template_id = hashlib.md5(template_str.encode('utf-8')).hexdigest()[0:8]
            for logID in logClust.logIDL:
                logID -= 1  # LineId is 1-based; list index is 0-based.
                log_templates[logID] = template_str
                log_templateids[logID] = template_id
            df_events.append([template_id, template_str, occurrence])

        # NOTE(review): df_event is rebuilt from scratch below, so this frame
        # (and the df_events accumulation) is effectively discarded — confirm.
        df_event = pd.DataFrame(df_events, columns=['EventId', 'EventTemplate', 'Occurrences'])
        self.df_log['EventId'] = log_templateids
        self.df_log['EventTemplate'] = log_templates

        if self.keep_para:
            self.df_log["ParameterList"] = self.df_log.apply(self.get_parameter_list, axis=1)
        self.df_log.to_csv(os.path.join(self.savePath, self.logName + '_structured.csv'), index=False)

        occ_dict = dict(self.df_log['EventTemplate'].value_counts())
        df_event = pd.DataFrame()
        df_event['EventTemplate'] = self.df_log['EventTemplate'].unique()
        df_event['EventId'] = df_event['EventTemplate'].map(lambda x: hashlib.md5(str(x).encode('utf-8')).hexdigest()[0:8])
        df_event['Occurrences'] = df_event['EventTemplate'].map(occ_dict)
        df_event.to_csv(os.path.join(self.savePath, self.logName + '_templates.csv'), index=False,
                        columns=["EventId", "EventTemplate", "Occurrences"])

    def printTree(self, node, dep):
        """Debug helper: print the subtree rooted at ``node`` indented ``dep`` tabs."""
        pStr = ''
        for i in range(dep):
            pStr += '\t'

        if node.depth == 0:
            pStr += 'Root'
        elif node.depth == 1:
            # Depth-1 nodes are keyed by sequence length.
            pStr += '<' + str(node.digitOrtoken) + '>'
        else:
            pStr += node.digitOrtoken

        print(pStr)

        if node.depth == self.depth:
            return 1
        for child in node.childD:
            self.printTree(node.childD[child], dep + 1)

    def parse(self, logName):
        """Parse ``logName`` from ``self.path`` and write structured output."""
        print('Parsing file: ' + os.path.join(self.path, logName))
        start_time = datetime.now()
        self.logName = logName
        rootNode = Node()
        logCluL = []

        self.load_data()

        count = 0
        for idx, line in self.df_log.iterrows():

            logID = line['LineId']
            # Mask known variable parts, then tokenise on whitespace.
            logmessageL = self.preprocess(line['Content']).strip().split()
            # logmessageL = filter(lambda x: x != '', re.split('[\s=:,]', self.preprocess(line['Content'])))
            matchCluster = self.treeSearch(rootNode, logmessageL)

            # Match no existing log cluster
            if matchCluster is None:
                newCluster = Logcluster(logTemplate=logmessageL, logIDL=[logID])
                logCluL.append(newCluster)
                self.addSeqToPrefixTree(rootNode, newCluster)

            # Add the new log message to the existing cluster
            else:
                newTemplate = self.getTemplate(logmessageL, matchCluster.logTemplate)
                matchCluster.logIDL.append(logID)
                if ' '.join(newTemplate) != ' '.join(matchCluster.logTemplate):
                    matchCluster.logTemplate = newTemplate

            count += 1
            if count % 1000 == 0 or count == len(self.df_log):
                print('Processed {0:.1f}% of log lines.'.format(count * 100.0 / len(self.df_log)), end='\r')

        if not os.path.exists(self.savePath):
            os.makedirs(self.savePath)

        self.outputResult(logCluL)

        print('Parsing done. [Time taken: {!s}]'.format(datetime.now() - start_time))

    def load_data(self):
        # Build the per-format regex, then materialise the log as a DataFrame.
        headers, regex = self.generate_logformat_regex(self.log_format)
        self.df_log = self.log_to_dataframe(os.path.join(self.path, self.logName), regex, headers, self.log_format)

    def preprocess(self, line):
        # Replace every configured regex match with the '<*>' wildcard.
        for currentRex in self.rex:
            line = re.sub(currentRex, '<*>', line)
        return line

    def log_to_dataframe(self, log_file, regex, headers, logformat):
        """ Function to transform log file to dataframe
        """
        log_messages = []
        linecount = 0
        cnt = 0
        with open(log_file, 'r') as fin:
            for line in fin.readlines():
                cnt += 1
                try:
                    match = regex.search(line.strip())
                    message = [match.group(header) for header in headers]
                    log_messages.append(message)
                    linecount += 1
                except Exception as e:
                    # NOTE(review): lines that fail the format regex are
                    # silently dropped (linecount vs cnt shows how many).
                    # print("\n", line)
                    # print(e)
                    pass
        print("Total size after encoding is", linecount, cnt)
        logdf = pd.DataFrame(log_messages, columns=headers)
        logdf.insert(0, 'LineId', None)
        logdf['LineId'] = [i + 1 for i in range(linecount)]
        return logdf

    def generate_logformat_regex(self, logformat):
        """ Function to generate regular expression to split log messages
        """
        headers = []
        # Split "<Date> <Level> <Content>"-style formats into literal text
        # (even indices) and header names (odd indices).
        splitters = re.split(r'(<[^<>]+>)', logformat)
        regex = ''
        for k in range(len(splitters)):
            if k % 2 == 0:
                splitter = re.sub(' +', '\\\s+', splitters[k])
                regex += splitter
            else:
                header = splitters[k].strip('<').strip('>')
                regex += '(?P<%s>.*?)' % header
                headers.append(header)
        regex = re.compile('^' + regex + '$')
        return headers, regex

    def get_parameter_list(self, row):
        """Extract the concrete values a log line substituted for the '<*>'
        wildcards of its event template."""
        template_regex = re.sub(r"<.{1,5}>", "<*>", str(row["EventTemplate"]))
        if "<*>" not in template_regex: return []
        # Escape everything, then turn wildcards back into capture groups.
        template_regex = re.sub(r'([^A-Za-z0-9])', r'\\\1', template_regex)
        template_regex = re.sub(r' +', r'\\s+', template_regex)
        template_regex = "^" + template_regex.replace("\<\*\>", "(.*?)") + "$"
        parameter_list = re.findall(template_regex, row["Content"])
        parameter_list = parameter_list[0] if parameter_list else ()
        parameter_list = list(parameter_list) if isinstance(parameter_list, tuple) else [parameter_list]
        return parameter_list
logparser/Spell.py ADDED
@@ -0,0 +1,356 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Description : This file implements the Spell algorithm for log parsing
3
+ Author : LogPAI team
4
+ License : MIT
5
+ """
6
+
7
+ import sys
8
+ # import re
9
+ import regex as re
10
+ import os
11
+ import numpy as np
12
+ import pandas as pd
13
+ import hashlib
14
+ from datetime import datetime
15
+ import string
16
+ from tqdm import tqdm
17
+
18
+
19
class LCSObject:
    """A log group (cluster) whose member lines share one template.

    Attributes:
        logTemplate: token sequence representing the parsed template.
        logIDL: LineIds of the log messages assigned to this group.
    """

    def __init__(self, logTemplate='', logIDL=None):
        self.logTemplate = logTemplate
        # Bug fix: the original default ``logIDL=[]`` was a mutable default
        # argument — one list shared by every instance constructed without
        # an explicit logIDL, so appends leaked across clusters. Use a None
        # sentinel and build a fresh list per instance instead.
        self.logIDL = [] if logIDL is None else logIDL
26
+
27
+
28
class Node:
    """A node in Spell's prefix tree.

    Attributes:
        logClust: the LCSObject whose constant-token path ends here, if any.
        token: the token this node represents ('' for the root).
        templateNo: number of templates whose path passes through this node.
        childD: child nodes keyed by token.
    """

    def __init__(self, token='', templateNo=0):
        self.logClust = None
        self.token = token
        self.templateNo = templateNo
        self.childD = {}
37
+
38
+
39
class LogParser:
    """ LogParser class
    Attributes
    ----------
    path : the path of the input file
    logName : the file name of the input file
    savePath : the path of the output file
    tau : how much percentage of tokens matched to merge a log message
    """

    def __init__(self, indir='./', outdir='./result/', log_format=None, tau=0.5, rex=[], keep_para=True):
        self.path = indir
        self.logName = None
        self.savePath = outdir
        self.tau = tau
        self.logformat = log_format
        self.df_log = None
        # NOTE(review): mutable default rex=[] is shared across instances;
        # safe only while it is never mutated in place — confirm.
        self.rex = rex
        self.keep_para = keep_para

    def LCS(self, seq1, seq2):
        """Longest common subsequence of two token sequences (classic DP)."""
        lengths = [[0 for j in range(len(seq2) + 1)] for i in range(len(seq1) + 1)]
        # row 0 and column 0 are initialized to 0 already
        for i in range(len(seq1)):
            for j in range(len(seq2)):
                if seq1[i] == seq2[j]:
                    lengths[i + 1][j + 1] = lengths[i][j] + 1
                else:
                    lengths[i + 1][j + 1] = max(lengths[i + 1][j], lengths[i][j + 1])

        # read the substring out from the matrix
        result = []
        lenOfSeq1, lenOfSeq2 = len(seq1), len(seq2)
        while lenOfSeq1 != 0 and lenOfSeq2 != 0:
            if lengths[lenOfSeq1][lenOfSeq2] == lengths[lenOfSeq1 - 1][lenOfSeq2]:
                lenOfSeq1 -= 1
            elif lengths[lenOfSeq1][lenOfSeq2] == lengths[lenOfSeq1][lenOfSeq2 - 1]:
                lenOfSeq2 -= 1
            else:
                assert seq1[lenOfSeq1 - 1] == seq2[lenOfSeq2 - 1]
                result.insert(0, seq1[lenOfSeq1 - 1])
                lenOfSeq1 -= 1
                lenOfSeq2 -= 1
        return result

    # for each seq, find the corresponding log key(template)
    def SimpleLoopMatch(self, logClustL, seq):
        """Linear scan: return the first cluster whose tokens all occur in
        ``seq`` (or are wildcards) and whose template is at least half as long."""
        for logClust in logClustL:
            if float(len(logClust.logTemplate)) < 0.5 * len(seq):
                continue
            # Check the template is a subsequence of seq (we use set checking as a proxy here for speedup since
            # incorrect-ordering bad cases rarely occur in logs)
            token_set = set(seq)
            if all(token in token_set or token == '<*>' for token in logClust.logTemplate):
                return logClust
        return None

    def PrefixTreeMatch(self, parentn, seq, idx):
        """Walk the prefix tree along ``seq`` (starting at token ``idx``) and
        return the first stored cluster that satisfies the tau threshold."""
        retLogClust = None
        length = len(seq)
        for i in range(idx, length):
            if seq[i] in parentn.childD:
                childn = parentn.childD[seq[i]]
                if (childn.logClust is not None):
                    constLM = [w for w in childn.logClust.logTemplate if w != '<*>']
                    # Accept only if enough constant tokens are matched.
                    if float(len(constLM)) >= self.tau * length:
                        return childn.logClust
                else:
                    return self.PrefixTreeMatch(childn, seq, i + 1)

        return retLogClust

    # for each seq, find the corresponding log template using LCS
    def LCSMatch(self, logClustL, seq):
        retLogClust = None

        maxLen = -1
        maxlcs = []
        maxClust = None
        set_seq = set(seq)
        size_seq = len(seq)
        for logClust in logClustL:
            set_template = set(logClust.logTemplate)
            # Cheap pre-filter: skip clusters sharing < half of seq's tokens.
            if len(set_seq & set_template) < 0.5 * size_seq:
                continue
            lcs = self.LCS(seq, logClust.logTemplate)
            # Prefer the longest LCS; break ties with the shorter template.
            if len(lcs) > maxLen or (len(lcs) == maxLen and len(logClust.logTemplate) < len(maxClust.logTemplate)):
                maxLen = len(lcs)
                maxlcs = lcs
                maxClust = logClust

        # LCS should be large then tau * len(itself)
        if float(maxLen) >= self.tau * size_seq:
            retLogClust = maxClust

        return retLogClust

    def getTemplate(self, lcs, seq):
        """Rebuild a template from ``seq``: tokens in the LCS are kept,
        everything else becomes the '<*>' wildcard."""
        retVal = []
        if not lcs:
            return retVal

        # Reverse so the next expected LCS token is always at the end (pop()).
        lcs = lcs[::-1]
        i = 0
        for token in seq:
            i += 1
            if token == lcs[-1]:
                retVal.append(token)
                lcs.pop()
            else:
                retVal.append('<*>')
            if not lcs:
                break
        # Trailing tokens beyond the LCS collapse into a single wildcard.
        if i < len(seq):
            retVal.append('<*>')
        return retVal

    def addSeqToPrefixTree(self, rootn, newCluster):
        """Insert the constant tokens of ``newCluster`` into the prefix tree."""
        parentn = rootn
        seq = newCluster.logTemplate
        seq = [w for w in seq if w != '<*>']

        for i in range(len(seq)):
            tokenInSeq = seq[i]
            # Match
            if tokenInSeq in parentn.childD:
                parentn.childD[tokenInSeq].templateNo += 1
            # Do not Match
            else:
                parentn.childD[tokenInSeq] = Node(token=tokenInSeq, templateNo=1)
            parentn = parentn.childD[tokenInSeq]

        if parentn.logClust is None:
            parentn.logClust = newCluster

    def removeSeqFromPrefixTree(self, rootn, newCluster):
        """Remove one reference to ``newCluster``'s constant-token path,
        pruning the subtree once a node's reference count drops to zero."""
        parentn = rootn
        seq = newCluster.logTemplate
        seq = [w for w in seq if w != '<*>']

        for tokenInSeq in seq:
            if tokenInSeq in parentn.childD:
                matchedNode = parentn.childD[tokenInSeq]
                if matchedNode.templateNo == 1:
                    del parentn.childD[tokenInSeq]
                    break
                else:
                    matchedNode.templateNo -= 1
                    parentn = matchedNode

    def outputResult(self, logClustL):
        """Write <log>_structured.csv and <log>_templates.csv to savePath."""
        print("output result", self.savePath)
        templates = [0] * self.df_log.shape[0]
        ids = [0] * self.df_log.shape[0]
        df_event = []

        for logclust in tqdm(logClustL):
            template_str = ' '.join(logclust.logTemplate)
            # Event id is the first 8 hex chars of the template's MD5.
            eid = hashlib.md5(template_str.encode('utf-8')).hexdigest()[0:8]
            for logid in logclust.logIDL:
                # LineId is 1-based; list index is 0-based.
                templates[logid - 1] = template_str
                ids[logid - 1] = eid
            df_event.append([eid, template_str, len(logclust.logIDL)])

        df_event = pd.DataFrame(df_event, columns=['EventId', 'EventTemplate', 'Occurrences'])

        self.df_log['EventId'] = ids
        self.df_log['EventTemplate'] = templates
        if self.keep_para:
            self.df_log["ParameterList"] = self.df_log.apply(self.get_parameter_list, axis=1)
        self.df_log.to_csv(os.path.join(self.savePath, self.logname + '_structured.csv'), index=False)
        df_event.to_csv(os.path.join(self.savePath, self.logname + '_templates.csv'), index=False)

    def printTree(self, node, dep):
        """Debug helper: print the subtree rooted at ``node``.

        NOTE(review): ``dep`` is an int (the recursive call passes ``dep + 1``),
        so ``range(len(dep))`` raises TypeError — this method appears broken
        or unused; confirm before relying on it.
        """
        pStr = ''
        for i in range(len(dep)):
            pStr += '\t'

        if node.token == '':
            pStr += 'Root'
        else:
            pStr += node.token
            if node.logClust is not None:
                pStr += '-->' + ' '.join(node.logClust.logTemplate)
        print(pStr + ' (' + str(node.templateNo) + ')')

        for child in node.childD:
            self.printTree(node.childD[child], dep + 1)

    def parse(self, logname):
        """Parse ``logname`` from ``self.path`` and write structured output."""
        starttime = datetime.now()
        print('Parsing file: ' + os.path.join(self.path, logname))
        self.logname = logname
        self.load_data()
        rootNode = Node()
        logCluL = []
        # Punctuation used for tokenising, minus the wildcard chars <, *, >.
        punc = re.sub('[<*>]', '', string.punctuation)
        count = 0
        for idx, line in self.df_log.iterrows():
            logID = line['LineId']
            logmessageL = list(filter(lambda x: x.strip() != '', re.split(f'[{punc}]', self.preprocess(line['Content']))))
            constLogMessL = [w for w in logmessageL if w != '<*>']
            #constLogMessL = [w for w in logmessageL]

            # Find an existing matched log cluster
            matchCluster = self.PrefixTreeMatch(rootNode, constLogMessL, 0)

            if matchCluster is None:
                matchCluster = self.SimpleLoopMatch(logCluL, constLogMessL)

                if matchCluster is None:
                    matchCluster = self.LCSMatch(logCluL, logmessageL)

                    # Match no existing log cluster
                    if matchCluster is None:
                        newCluster = LCSObject(logTemplate=logmessageL, logIDL=[logID])
                        logCluL.append(newCluster)
                        self.addSeqToPrefixTree(rootNode, newCluster)
                    # Add the new log message to the existing cluster
                    else:
                        newTemplate = self.getTemplate(self.LCS(logmessageL, matchCluster.logTemplate),
                                                       matchCluster.logTemplate)
                        if ' '.join(newTemplate) != ' '.join(matchCluster.logTemplate):
                            # Template changed: re-index it in the prefix tree.
                            self.removeSeqFromPrefixTree(rootNode, matchCluster)
                            matchCluster.logTemplate = newTemplate
                            self.addSeqToPrefixTree(rootNode, matchCluster)
            if matchCluster:
                matchCluster.logIDL.append(logID)
            count += 1
            if count % 1000 == 0 or count == len(self.df_log):
                print('Processed {0:.1f}% of log lines.'.format(count * 100.0 / len(self.df_log)), end='\r')

        if not os.path.exists(self.savePath):
            os.makedirs(self.savePath)

        self.outputResult(logCluL)
        print('Parsing done. [Time taken: {!s}]'.format(datetime.now() - starttime))

    def load_data(self):
        # Build the per-format regex, then materialise the log as a DataFrame.
        headers, regex = self.generate_logformat_regex(self.logformat)
        self.df_log = self.log_to_dataframe(os.path.join(self.path, self.logname), regex, headers, self.logformat)

    def preprocess(self, line):
        # Replace every configured regex match with the '<*>' wildcard.
        for currentRex in self.rex:
            line = re.sub(currentRex, '<*>', line)
        return line

    def log_to_dataframe(self, log_file, regex, headers, logformat):
        """ Function to transform log file to dataframe
        """
        log_messages = []
        linecount = 0
        k = 0
        with open(log_file, 'r') as fin:
            for line in fin.readlines():
                #extract small size data
                k += 1
                if k%10000 == 0:
                    print("extracted {0} log lines from {1}".format(k, log_file))
                line = re.sub(r'[^\x00-\x7F]+', '<NASCII>', line) #replace non ASCII (\x00-\x7F) character with <NASCII>
                try:
                    match = regex.search(line.strip())
                    message = [match.group(header) for header in headers]
                    log_messages.append(message)
                    linecount += 1
                except Exception as e:
                    # NOTE(review): lines that fail the format regex are
                    # silently dropped — confirm this is acceptable.
                    pass
        logdf = pd.DataFrame(log_messages, columns=headers)
        logdf.insert(0, 'LineId', None)
        logdf['LineId'] = [i + 1 for i in range(linecount)]
        return logdf

    def generate_logformat_regex(self, logformat):
        """ Function to generate regular expression to split log messages
        """
        headers = []
        # Split "<Date> <Level> <Content>"-style formats into literal text
        # (even indices) and header names (odd indices).
        splitters = re.split(r'(<[^<>]+>)', logformat)
        regex = ''
        for k in range(len(splitters)):
            if k % 2 == 0:
                splitter = re.sub(' +', '\\\s+', splitters[k]) #re.sub(' +', '\s+', splitters[k])

                regex += splitter
            else:
                header = splitters[k].strip('<').strip('>')
                regex += '(?P<%s>.*?)' % header
                headers.append(header)
        regex = re.compile('^' + regex + '$')
        return headers, regex

    def get_parameter_list(self, row):
        """Extract the concrete values a log line substituted for the '<*>'
        wildcards of its event template."""
        template_regex = re.sub(r"\s<.{1,5}>\s", "<*>", row["EventTemplate"])
        if "<*>" not in template_regex: return []
        # Escape everything, then turn wildcards back into capture groups.
        template_regex = re.sub(r'([^A-Za-z0-9])', r'\\\1', template_regex)
        template_regex = re.sub(r'\\ +', r'[^A-Za-z0-9]+', template_regex)
        template_regex = "^" + template_regex.replace("\<\*\>", "(.*?)") + "$"
        parameter_list = re.findall(template_regex, row["Content"])
        parameter_list = parameter_list[0] if parameter_list else ()
        parameter_list = list(parameter_list) if isinstance(parameter_list, tuple) else [parameter_list]
        parameter_list = [para.strip(string.punctuation).strip(' ') for para in parameter_list]
        return parameter_list
339
+
340
+
341
# Ad-hoc demo: re-extract parameter lists from a previously parsed HDFS run.
if __name__ == "__main__":
    import os
    import pandas as pd
    print(os.getcwd())
    os.chdir("../")  # run relative to the repository root
    print(os.getcwd())

    # NOTE(review): LogParser() is constructed without a log_format
    # (log_format defaults to None), so only get_parameter_list — which does
    # not need it — can safely be called on this instance; confirm.
    lp = LogParser()
    # print(lp.LCS(seq1="abcbb", seq2="bc"))
    output_dir = 'demo/Spell_result/' # The output directory of parsing results
    log_file = 'HDFS.log' # The input log file name
    log_structured_file = output_dir + log_file + "_structured.csv"
    df = pd.read_csv(log_structured_file)
    for _, row in df.iterrows():
        lp.get_parameter_list(row)
356
+
model/bert/parameters.txt ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ device: cuda
2
+ output_dir: AI_MODELS/trained_models/Hadoop_logbert/
3
+ model_dir: AI_MODELS/trained_models/Hadoop_logbert/bert/
4
+ model_path: AI_MODELS/trained_models/Hadoop_logbert/bert/best_bert.pth
5
+ train_vocab: AI_MODELS/trained_models/Hadoop_logbert/train
6
+ vocab_path: AI_MODELS/trained_models/Hadoop_logbert/vocab.pkl
7
+ window_size: 64
8
+ adaptive_window: True
9
+ seq_len: 256
10
+ max_len: 256
11
+ min_len: 5
12
+ mask_ratio: 0.7
13
+ train_ratio: 1
14
+ valid_ratio: 0.25
15
+ test_ratio: 1
16
+ is_logkey: True
17
+ is_time: False
18
+ hypersphere_loss: True
19
+ hypersphere_loss_test: True
20
+ scale: standard
21
+ scale_path: AI_MODELS/trained_models/Hadoop_logbert/bert/scale.pkl
22
+ hidden: 512
23
+ layers: 6
24
+ attn_heads: 8
25
+ epochs: 2000
26
+ n_epochs_stop: 50
27
+ batch_size: 32
28
+ corpus_lines: None
29
+ on_memory: True
30
+ num_workers: 5
31
+ lr: 0.0003
32
+ adam_beta1: 0.9
33
+ adam_beta2: 0.999
34
+ adam_weight_decay: 0.0
35
+ with_cuda: True
36
+ cuda_devices: None
37
+ log_freq: None
38
+ num_candidates: 6
39
+ gaussian_mean: 0
40
+ gaussian_std: 1