Spaces:

MukeshKapoor25
/

logbert_processor

Runtime error

App Files Files Community

MukeshKapoor25 committed on Jul 23, 2025

Commit

07237a3

1 Parent(s): fbf5fc4

Refactor imports and improve code formatting in logbert_rca_pipeline_api.py; update requirements.txt to include botocore

Browse files

Files changed (2) hide show

logbert_rca_pipeline_api.py +72 -56
requirements.txt +2 -1

logbert_rca_pipeline_api.py CHANGED Viewed

@@ -1,33 +1,30 @@
 import os
 import sys
-import re
-import ast
-import json
-import time
-import torch
-import pandas as pd
-import numpy as np
-from tqdm import tqdm
-from collections import defaultdict
-from transformers import AutoTokenizer, AutoModelForCausalLM
-from torch.utils.data import DataLoader
-sys.path.append('../')
-sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
-from logparser import Drain
-from bert_pytorch.dataset import LogDataset, WordVocab
-from bert_pytorch.model.bert import BERT
 from bert_pytorch.model.log_model import BERTLog
 # === Constants ===
 TOP_EVENTS = 5
 MAX_RCA_TOKENS = 200
-MISTRAL_MODEL = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
-# HF_CACHE = "/content/drive/MyDrive/hf_cache"
 # === Log Parsing ===
 def parse_log_with_drain(log_file, input_dir, output_dir):
     regex = [
         r"appattempt_\d+_\d+_\d+",
@@ -39,9 +36,11 @@ def parse_log_with_drain(log_file, input_dir, output_dir):
         r"[a-f0-9]{8,}"
     ]
     log_format = r'\[<AppId>] <Date> <Time> <Level> \[<Process>] <Component>: <Content>'
-    parser = Drain.LogParser(log_format, indir=input_dir, outdir=output_dir, depth=5, st=0.5, rex=regex, keep_para=True)
     parser.parse(log_file)
 def hadoop_sampling(structured_log_path, sequence_output_path):
     df = pd.read_csv(structured_log_path)
     data_dict = defaultdict(list)
@@ -50,39 +49,51 @@ def hadoop_sampling(structured_log_path, sequence_output_path):
         event_id = row.get("EventId")
         if pd.notnull(app_id) and pd.notnull(event_id):
             data_dict[app_id].append(str(event_id))
-    pd.DataFrame(list(data_dict.items()), columns=['AppId', 'EventSequence']).to_csv(sequence_output_path, index=False)
 # === Utility Functions ===
 def load_parameters(param_path):
     options = {}
     with open(param_path, 'r') as f:
         for line in f:
-            if ':' not in line: continue
             key, val = line.strip().split(':', 1)
             key, val = key.strip(), val.strip()
             if val.lower() in ['true', 'false', 'none']:
                 val = eval(val.capitalize())
             else:
-                try: val = int(val)
                 except ValueError:
-                    try: val = float(val)
-                    except ValueError: pass
             options[key] = val
     return options
 def load_logbert_model(options, vocab):
     try:
         return torch.load(options["model_path"], map_location=options["device"])
     except:
-        bert = BERT(len(vocab), options["hidden"], options["layers"], options["attn_heads"], options["max_len"])
         model = BERTLog(bert, vocab_size=len(vocab)).to(options["device"])
-        model.load_state_dict(torch.load(options["model_path"], map_location=options["device"]))
         return model
 def load_center(path, device):
     center = torch.load(path, map_location=device)
     return center["center"] if isinstance(center, dict) else center
 def extract_sequences(path, min_len):
     df = pd.read_csv(path)
     data, app_ids = [], []
@@ -96,20 +107,25 @@ def extract_sequences(path, min_len):
             continue
     return data, app_ids
 def prepare_dataloader(sequences, vocab, options):
     dummy_times = [[0] * len(seq) for seq in sequences]
-    dataset = LogDataset(sequences, dummy_times, vocab, seq_len=options["seq_len"], on_memory=True, mask_ratio=options["mask_ratio"])
     return DataLoader(dataset, batch_size=1, shuffle=False, collate_fn=dataset.collate_fn)
 def calculate_mean_std(loader, model, center, device):
     scores = []
     with torch.no_grad():
         for batch in tqdm(loader, desc="📏 Computing train distances..."):
             batch = {k: v.to(device) for k, v in batch.items()}
-            cls_output = model(batch["bert_input"], batch["time_input"])["cls_output"]
             scores.append(torch.norm(cls_output - center, dim=1).item())
     return np.mean(scores), np.std(scores)
 def generate_prompt(event_templates):
     prompt = "The system encountered a failure. Below are the key log events preceding the anomaly:\n\n"
     for i, event in enumerate(event_templates, 1):
@@ -118,16 +134,8 @@ def generate_prompt(event_templates):
     prompt += "Explain the cause in one or two sentences, using technical reasoning if possible.\n"
     return prompt
-def call_mistral(prompt, tokenizer, model, device):
-    inputs = tokenizer(prompt, return_tensors="pt").to(device)
-    outputs = model.generate(
-        **inputs,
-        max_length=inputs['input_ids'].shape[1] + MAX_RCA_TOKENS,
-        do_sample=False,
-        top_k=50,
-        pad_token_id=tokenizer.eos_token_id
-    )
-    return tokenizer.decode(outputs[0], skip_special_tokens=True)[len(prompt):].strip()
 def compute_logkey_anomaly(masked_output, masked_label, top_k=5):
     num_undetected = 0
@@ -137,12 +145,16 @@ def compute_logkey_anomaly(masked_output, masked_label, top_k=5):
     return num_undetected, len(masked_label)
 # === API-Compatible RCA Pipeline ===
 def detect_anomalies_and_explain(input_log_path):
     log_file = os.path.basename(input_log_path)
     input_dir = os.path.dirname(input_log_path)
-    output_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), "model", "bert"))
-    log_structured_file = os.path.join(output_dir, log_file + "_structured.csv")
     log_templates_file = os.path.join(output_dir, log_file + "_templates.csv")
     log_sequence_file = os.path.join(output_dir, "rca_abnormal_sequence.csv")
     PARAMS_FILE = os.path.join(output_dir, "bert", "parameters.txt")
@@ -155,26 +167,28 @@ def detect_anomalies_and_explain(input_log_path):
     # Step 2: Load Models and Parameters
     options = load_parameters(PARAMS_FILE)
-    options["device"] = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-    # tokenizer = AutoTokenizer.from_pretrained(MISTRAL_MODEL)
-    # model_mistral = AutoModelForCausalLM.from_pretrained(MISTRAL_MODEL, torch_dtype=torch.float32).to(options["device"])
-    # model_mistral.eval()
     vocab = WordVocab.load_vocab(options["vocab_path"])
     model = load_logbert_model(options, vocab).to(options["device"]).eval()
     center = load_center(CENTER_PATH, options["device"])
     # Step 3: Prepare Data
-    test_sequences, app_ids = extract_sequences(log_sequence_file, options["min_len"])
     test_loader = prepare_dataloader(test_sequences, vocab, options)
-    train_sequences = [line.strip().split() for line in open(TRAIN_FILE) if len(line.strip().split()) >= options["min_len"]]
     train_loader = prepare_dataloader(train_sequences, vocab, options)
-    mean, std = calculate_mean_std(train_loader, model, center, options["device"])
     templates_df = pd.read_csv(log_templates_file)
-    event_template_dict = dict(zip(templates_df["EventId"], templates_df["EventTemplate"]))
     # Step 4: Analyze & Explain Anomalies
     results = []
@@ -185,7 +199,8 @@ def detect_anomalies_and_explain(input_log_path):
         score = torch.norm(cls_output - center, dim=1).item()
         z_score = (score - mean) / std
-        num_undetected, masked_total = compute_logkey_anomaly(output["logkey_output"][0], batch["bert_label"][0])
         undetected_ratio = num_undetected / masked_total if masked_total else 0
         status = "Abnormal" if z_score > 2 or undetected_ratio > 0.5 else "Normal"
@@ -193,16 +208,17 @@ def detect_anomalies_and_explain(input_log_path):
             continue
         top_eids = test_sequences[i][:TOP_EVENTS]
-        event_templates = [event_template_dict.get(eid, f"[Missing Event {eid}]") for eid in top_eids]
-        #prompt = ''#generate_prompt(event_templates)
-        #explanation = ''#call_mistral(prompt, tokenizer, model_mistral, options["device"])
         results.append({
             "AppId": app_ids[i],
             "Score": score,
             "z_score": z_score,
             "UndetectedRatio": undetected_ratio,
-            "status":status,
             "Events": event_templates,
             "Explanation": None
         })

 import os
 import sys
+# Ensure local logbert_processor and logparser are first in sys.path for all imports
+sys.path.insert(0, os.path.abspath(os.path.dirname(__file__)))
+sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), 'logparser')))
 from bert_pytorch.model.log_model import BERTLog
+from bert_pytorch.model.bert import BERT
+from bert_pytorch.dataset import LogDataset, WordVocab
+import Drain
+from torch.utils.data import DataLoader
+from collections import defaultdict
+from tqdm import tqdm
+import numpy as np
+import pandas as pd
+import torch
+import time
+import json
+import ast
+import re
 # === Constants ===
 TOP_EVENTS = 5
 MAX_RCA_TOKENS = 200
 # === Log Parsing ===
 def parse_log_with_drain(log_file, input_dir, output_dir):
     regex = [
         r"appattempt_\d+_\d+_\d+",
         r"[a-f0-9]{8,}"
     ]
     log_format = r'\[<AppId>] <Date> <Time> <Level> \[<Process>] <Component>: <Content>'
+    parser = Drain.LogParser(log_format, indir=input_dir,
+                             outdir=output_dir, depth=5, st=0.5, rex=regex, keep_para=True)
     parser.parse(log_file)
 def hadoop_sampling(structured_log_path, sequence_output_path):
     df = pd.read_csv(structured_log_path)
     data_dict = defaultdict(list)
         event_id = row.get("EventId")
         if pd.notnull(app_id) and pd.notnull(event_id):
             data_dict[app_id].append(str(event_id))
+    pd.DataFrame(list(data_dict.items()), columns=['AppId', 'EventSequence']).to_csv(
+        sequence_output_path, index=False)
 # === Utility Functions ===
 def load_parameters(param_path):
     """Parse a ``key: value`` parameter file into a dictionary.

     Every line that contains a colon is split on its first colon; lines
     without a colon are skipped. Keys and values are stripped of
     surrounding whitespace. Values are converted in this order:

     * ``true`` / ``false`` / ``none`` (any letter case) become
       ``True`` / ``False`` / ``None``;
     * text accepted by ``int()`` becomes an ``int``;
     * text accepted by ``float()`` becomes a ``float``;
     * anything else is kept as the stripped string.

     Args:
         param_path: Path of the parameter file to read.

     Returns:
         dict mapping each key to its converted value. If a key occurs
         more than once, the last occurrence wins.
     """
     # A lookup table gives the same results the previous eval() call did
     # for these three keywords, without evaluating text read from a file.
     literals = {"true": True, "false": False, "none": None}
     options = {}
     with open(param_path, 'r') as f:
         for line in f:
             if ':' not in line:
                 continue
             # Split on the first colon only so values such as paths or
             # URLs may themselves contain colons.
             key, val = line.strip().split(':', 1)
             key, val = key.strip(), val.strip()
             lowered = val.lower()
             if lowered in literals:
                 val = literals[lowered]
             else:
                 try:
                     val = int(val)
                 except ValueError:
                     try:
                         val = float(val)
                     except ValueError:
                         # Not numeric: keep the raw string.
                         pass
             options[key] = val
     return options
 def load_logbert_model(options, vocab):
     """Load the LogBERT model stored at ``options["model_path"]``.

     The checkpoint is read once and handled according to what it holds:

     * a ``torch.nn.Module`` is returned as loaded;
     * anything else is treated as a state dict and loaded into a freshly
       built ``BERTLog`` wrapping a ``BERT`` encoder configured from
       ``options``.

     Previously the fallback only ran when ``torch.load`` raised, and then
     repeated the identical ``torch.load`` call, so it could not succeed;
     a state-dict checkpoint (which loads without raising) was returned as
     a plain dict instead of a model.

     Args:
         options: Mapping with ``model_path`` and ``device``; the
             state-dict path also reads ``hidden``, ``layers``,
             ``attn_heads`` and ``max_len``.
         vocab: Vocabulary object; only ``len(vocab)`` is used here.

     Returns:
         The loaded model.

     Raises:
         Whatever ``torch.load`` or ``load_state_dict`` raise for an
         unreadable or mismatching checkpoint.
     """
     # NOTE(security): torch.load unpickles the file, so model_path must
     # only ever point at a trusted checkpoint.
     checkpoint = torch.load(
         options["model_path"], map_location=options["device"])
     if isinstance(checkpoint, torch.nn.Module):
         return checkpoint

     # Not a whole model: rebuild the architecture and load the weights.
     bert = BERT(len(vocab), options["hidden"], options["layers"],
                 options["attn_heads"], options["max_len"])
     model = BERTLog(bert, vocab_size=len(vocab)).to(options["device"])
     model.load_state_dict(checkpoint)
     return model
 def load_center(path, device):
     """Load the center saved at ``path`` onto ``device``.

     The file may hold the center directly or a dict that stores it under
     the ``"center"`` key; in both cases the center itself is returned.
     """
     loaded = torch.load(path, map_location=device)
     if isinstance(loaded, dict):
         return loaded["center"]
     return loaded
 def extract_sequences(path, min_len):
     df = pd.read_csv(path)
     data, app_ids = [], []
             continue
     return data, app_ids
 def prepare_dataloader(sequences, vocab, options):
     """Wrap event sequences in a ``LogDataset`` and return a DataLoader.

     No real timing data is supplied: each sequence is paired with a list
     of zeros of the same length. The loader yields one sequence per batch
     in the original order, collated by the dataset's own ``collate_fn``.

     Args:
         sequences: List of event-id sequences.
         vocab: Vocabulary handed to ``LogDataset``.
         options: Mapping providing ``seq_len`` and ``mask_ratio``.

     Returns:
         A ``DataLoader`` over the constructed dataset.
     """
     # Placeholder time values: one zero per event in each sequence.
     zero_times = []
     for seq in sequences:
         zero_times.append([0] * len(seq))

     log_dataset = LogDataset(
         sequences,
         zero_times,
         vocab,
         seq_len=options["seq_len"],
         on_memory=True,
         mask_ratio=options["mask_ratio"],
     )
     return DataLoader(
         log_dataset,
         batch_size=1,
         shuffle=False,
         collate_fn=log_dataset.collate_fn,
     )
 def calculate_mean_std(loader, model, center, device):
     """Return the mean and standard deviation of distances to ``center``.

     For every batch from ``loader`` the tensors are moved to ``device``,
     the model is run on ``bert_input`` and ``time_input`` without gradient
     tracking, and the norm (along dim 1) of ``cls_output - center`` is
     recorded. ``.item()`` is used on that norm, so each batch must yield
     exactly one distance.

     Args:
         loader: Iterable of dict batches containing ``bert_input`` and
             ``time_input`` tensors.
         model: Callable returning a mapping with a ``cls_output`` entry.
         center: Tensor subtracted from ``cls_output``.
         device: Device the batch tensors are moved to.

     Returns:
         Tuple ``(mean, std)`` computed with NumPy over all distances.
     """
     distances = []
     with torch.no_grad():
         progress = tqdm(loader, desc="📏 Computing train distances...")
         for raw_batch in progress:
             on_device = {}
             for name, tensor in raw_batch.items():
                 on_device[name] = tensor.to(device)
             outputs = model(on_device["bert_input"], on_device["time_input"])
             offset = outputs["cls_output"] - center
             distances.append(torch.norm(offset, dim=1).item())
     return np.mean(distances), np.std(distances)
 def generate_prompt(event_templates):
     prompt = "The system encountered a failure. Below are the key log events preceding the anomaly:\n\n"
     for i, event in enumerate(event_templates, 1):
     prompt += "Explain the cause in one or two sentences, using technical reasoning if possible.\n"
     return prompt
 def compute_logkey_anomaly(masked_output, masked_label, top_k=5):
     num_undetected = 0
     return num_undetected, len(masked_label)
 # === API-Compatible RCA Pipeline ===
 def detect_anomalies_and_explain(input_log_path):
     log_file = os.path.basename(input_log_path)
     input_dir = os.path.dirname(input_log_path)
+    output_dir = os.path.abspath(os.path.join(
+        os.path.dirname(__file__), "model", "bert"))
+    log_structured_file = os.path.join(
+        output_dir, log_file + "_structured.csv")
     log_templates_file = os.path.join(output_dir, log_file + "_templates.csv")
     log_sequence_file = os.path.join(output_dir, "rca_abnormal_sequence.csv")
     PARAMS_FILE = os.path.join(output_dir, "bert", "parameters.txt")
     # Step 2: Load Models and Parameters
     options = load_parameters(PARAMS_FILE)
+    options["device"] = torch.device(
+        "cuda" if torch.cuda.is_available() else "cpu")
     vocab = WordVocab.load_vocab(options["vocab_path"])
     model = load_logbert_model(options, vocab).to(options["device"]).eval()
     center = load_center(CENTER_PATH, options["device"])
     # Step 3: Prepare Data
+    test_sequences, app_ids = extract_sequences(
+        log_sequence_file, options["min_len"])
     test_loader = prepare_dataloader(test_sequences, vocab, options)
+    train_sequences = [line.strip().split() for line in open(
+        TRAIN_FILE) if len(line.strip().split()) >= options["min_len"]]
     train_loader = prepare_dataloader(train_sequences, vocab, options)
+    mean, std = calculate_mean_std(
+        train_loader, model, center, options["device"])
     templates_df = pd.read_csv(log_templates_file)
+    event_template_dict = dict(
+        zip(templates_df["EventId"], templates_df["EventTemplate"]))
     # Step 4: Analyze & Explain Anomalies
     results = []
         score = torch.norm(cls_output - center, dim=1).item()
         z_score = (score - mean) / std
+        num_undetected, masked_total = compute_logkey_anomaly(
+            output["logkey_output"][0], batch["bert_label"][0])
         undetected_ratio = num_undetected / masked_total if masked_total else 0
         status = "Abnormal" if z_score > 2 or undetected_ratio > 0.5 else "Normal"
             continue
         top_eids = test_sequences[i][:TOP_EVENTS]
+        event_templates = [event_template_dict.get(
+            eid, f"[Missing Event {eid}]") for eid in top_eids]
+        # Inject results to DB
         results.append({
             "AppId": app_ids[i],
             "Score": score,
             "z_score": z_score,
             "UndetectedRatio": undetected_ratio,
+            "status": status,
             "Events": event_templates,
             "Explanation": None
         })

requirements.txt CHANGED Viewed

@@ -16,4 +16,5 @@ sqlalchemy
 asyncpg
 logparser
 bert_pytorch
-seaborn

 asyncpg
 logparser
 bert_pytorch
+seaborn
+botocore