Upload 9 files
Browse files- README.md +53 -5
- app.py +187 -0
- classify.py +85 -0
- processor_bert.py +64 -0
- processor_llm.py +89 -0
- processor_regex.py +47 -0
- requirements.txt +7 -0
- synthetic_logs.csv +0 -0
- test.csv +11 -0
README.md
CHANGED
|
@@ -1,13 +1,61 @@
|
|
| 1 |
---
|
| 2 |
title: Log Classification System
|
| 3 |
-
emoji:
|
| 4 |
-
colorFrom:
|
| 5 |
-
colorTo:
|
| 6 |
sdk: gradio
|
| 7 |
-
sdk_version:
|
| 8 |
app_file: app.py
|
| 9 |
pinned: false
|
| 10 |
license: mit
|
| 11 |
---
|
| 12 |
|
| 13 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
---
|
| 2 |
title: Log Classification System
|
| 3 |
+
emoji: 🔍
|
| 4 |
+
colorFrom: blue
|
| 5 |
+
colorTo: indigo
|
| 6 |
sdk: gradio
|
| 7 |
+
sdk_version: 4.44.0
|
| 8 |
app_file: app.py
|
| 9 |
pinned: false
|
| 10 |
license: mit
|
| 11 |
---
|
| 12 |
|
| 13 |
+
# 🔍 Log Classification System
|
| 14 |
+
|
| 15 |
+
A **production-inspired hybrid log classification pipeline** that routes enterprise logs through 3 tiers — Regex → BERT + Logistic Regression → LLM — based on pattern confidence and source system.
|
| 16 |
+
|
| 17 |
+
## Architecture
|
| 18 |
+
|
| 19 |
+
```
|
| 20 |
+
Input Log
|
| 21 |
+
│
|
| 22 |
+
├─► [Tier 1] Regex Classifier → Fixed patterns (sub-ms latency)
|
| 23 |
+
│ │ No match?
|
| 24 |
+
│ ▼
|
| 25 |
+
├─► [Tier 2] BERT + LogReg → High-confidence ML (conf > 0.5)
|
| 26 |
+
│ │ Low confidence?
|
| 27 |
+
│ ▼
|
| 28 |
+
└─► [Tier 3] LLM (HF Inference) → LegacyCRM / rare patterns
|
| 29 |
+
```
|
| 30 |
+
|
| 31 |
+
## Categories
|
| 32 |
+
|
| 33 |
+
| Category | Tier Used |
|
| 34 |
+
|---|---|
|
| 35 |
+
| User Action | Regex |
|
| 36 |
+
| System Notification | Regex |
|
| 37 |
+
| HTTP Status | BERT |
|
| 38 |
+
| Security Alert | BERT |
|
| 39 |
+
| Critical Error | BERT |
|
| 40 |
+
| Error | BERT |
|
| 41 |
+
| Resource Usage | BERT |
|
| 42 |
+
| Workflow Error | LLM |
|
| 43 |
+
| Deprecation Warning | LLM |
|
| 44 |
+
|
| 45 |
+
## Setup
|
| 46 |
+
|
| 47 |
+
### HuggingFace Spaces Secrets Required
|
| 48 |
+
- `HF_TOKEN` — your HuggingFace token (for LLM inference on LegacyCRM logs)
|
| 49 |
+
|
| 50 |
+
### Local Setup
|
| 51 |
+
```bash
|
| 52 |
+
pip install -r requirements.txt
|
| 53 |
+
python app.py
|
| 54 |
+
```
|
| 55 |
+
|
| 56 |
+
## Source Systems
|
| 57 |
+
- `ModernCRM`, `ModernHR`, `BillingSystem`, `AnalyticsEngine`, `ThirdPartyAPI` → Regex → BERT
|
| 58 |
+
- `LegacyCRM` → LLM directly (too few training samples for ML)
|
| 59 |
+
|
| 60 |
+
## Tech Stack
|
| 61 |
+
`sentence-transformers` · `scikit-learn` · `huggingface-hub` · `gradio` · `fastapi` · `pandas`
|
app.py
ADDED
|
@@ -0,0 +1,187 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Log Classification System — HuggingFace Spaces
|
| 3 |
+
Gradio UI for the 3-tier hybrid log classification pipeline.
|
| 4 |
+
"""
|
| 5 |
+
from __future__ import annotations
|
| 6 |
+
import io
|
| 7 |
+
import time
|
| 8 |
+
import pandas as pd
|
| 9 |
+
import gradio as gr
|
| 10 |
+
from classify import classify_log, classify_csv
|
| 11 |
+
|
| 12 |
+
# ── UI configuration constants ──────────────────────────────────────────────
# Source systems the dropdown offers; must match what the router expects.
SOURCES = [
    "ModernCRM",
    "ModernHR",
    "BillingSystem",
    "AnalyticsEngine",
    "ThirdPartyAPI",
    "LegacyCRM",
]

# Colored badge shown next to the tier name in the results panel.
TIER_COLORS = {
    "Regex": "🟢",
    "BERT": "🔵",
    "LLM": "🟡",
    "LLM (fallback)": "🟠",
}

# (source, log_message) pairs wired into gr.Examples — one per routing path.
EXAMPLE_LOGS = [
    ["ModernCRM", "User User12345 logged in."],
    ["ModernHR", "Multiple login failures occurred on user 6454 account"],
    ["BillingSystem", "GET /v2/servers/detail HTTP/1.1 status: 200 len: 1583 time: 0.19"],
    ["AnalyticsEngine", "System crashed due to disk I/O failure on node-3"],
    ["LegacyCRM", "Case escalation for ticket ID 7324 failed — support agent is no longer active."],
    ["LegacyCRM", "The 'BulkEmailSender' feature will be deprecated in v5.0. Use 'EmailCampaignManager'."],
]
|
| 37 |
+
|
| 38 |
+
|
| 39 |
+
# ── Single log tab ──────────────────────────────────────────────────────────
def classify_single(source: str, log_message: str):
    """Classify one log and return (label, tier badge, confidence, latency) strings."""
    if not log_message.strip():
        # Blank input: leave all four output boxes as placeholders.
        return "—", "—", "—", "—"

    start = time.perf_counter()
    outcome = classify_log(source, log_message)
    elapsed_ms = (time.perf_counter() - start) * 1000

    tier_name = outcome["tier"]
    badge = TIER_COLORS.get(tier_name, "⚪")
    # Regex/BERT tiers carry a probability; the LLM tiers report None.
    conf_text = "N/A" if outcome["confidence"] is None else f"{outcome['confidence']:.1%}"

    return (
        outcome["label"],
        f"{badge} {tier_name}",
        conf_text,
        f"{elapsed_ms:.1f} ms",
    )
|
| 59 |
+
|
| 60 |
+
|
| 61 |
+
# ── Batch CSV tab ───────────────────────────────────────────────────────────
def classify_batch(file):
    """Classify every row of an uploaded CSV; return (output file path, stats text)."""
    if file is None:
        return None, "⚠️ Please upload a CSV file."

    try:
        output_path, df = classify_csv(file.name, "/tmp/classified_output.csv")
    except ValueError as e:
        # Schema problems (missing required columns) get the softer warning icon.
        return None, f"⚠️ {e}"
    except Exception as e:
        return None, f"❌ Error: {e}"

    total = len(df)
    by_tier = df["tier_used"].value_counts().to_dict()
    by_label = df["predicted_label"].value_counts().to_dict()

    tier_lines = "\n".join(
        f" {TIER_COLORS.get(k,'⚪')} {k}: {v} ({v/total:.0%})" for k, v in by_tier.items()
    )
    label_lines = "\n".join(f" • {k}: {v}" for k, v in by_label.items())

    stats = (
        f"✅ Classified {total} logs\n\n"
        f"📊 Tier breakdown:\n{tier_lines}\n\n"
        f"🏷️ Label distribution:\n{label_lines}"
    )

    return output_path, stats
|
| 87 |
+
|
| 88 |
+
|
| 89 |
+
# ── UI ──────────────────────────────────────────────────────────────────────
# Three tabs: interactive single-log demo, batch CSV processing, and a static
# architecture explainer. All callbacks are defined above this block.
with gr.Blocks(title="Log Classification System", theme=gr.themes.Soft()) as demo:

    gr.Markdown("""
    # 🔍 Log Classification System
    **3-tier hybrid pipeline** → 🟢 Regex · 🔵 BERT + LogReg · 🟡 LLM
    Built to mimic production enterprise log monitoring architecture.
    """)

    with gr.Tabs():

        # ── Tab 1: Single Log ────────────────────────────────────────────
        with gr.Tab("Single Log"):
            with gr.Row():
                source_input = gr.Dropdown(
                    choices=SOURCES, value="ModernCRM", label="Source System"
                )
                log_input = gr.Textbox(
                    label="Log Message",
                    placeholder="Paste a log message here...",
                    lines=3,
                )

            classify_btn = gr.Button("Classify", variant="primary")

            with gr.Row():
                label_out = gr.Textbox(label="🏷️ Predicted Label", interactive=False)
                tier_out = gr.Textbox(label="⚙️ Tier Used", interactive=False)
                confidence_out = gr.Textbox(label="📈 Confidence", interactive=False)
                latency_out = gr.Textbox(label="⏱️ Latency", interactive=False)

            classify_btn.click(
                fn=classify_single,
                inputs=[source_input, log_input],
                outputs=[label_out, tier_out, confidence_out, latency_out],
            )

            gr.Examples(
                examples=EXAMPLE_LOGS,
                inputs=[source_input, log_input],
                label="📋 Example Logs (click to try)",
            )

        # ── Tab 2: Batch CSV ─────────────────────────────────────────────
        with gr.Tab("Batch CSV Upload"):
            gr.Markdown("""
            Upload a CSV with columns: **`source`**, **`log_message`**
            Download the classified CSV with added columns: `predicted_label`, `tier_used`, `confidence`.
            """)
            with gr.Row():
                with gr.Column():
                    csv_input = gr.File(label="📂 Upload CSV", file_types=[".csv"])
                    batch_btn = gr.Button("Classify All", variant="primary")
                with gr.Column():
                    csv_output = gr.File(label="📥 Download Classified CSV")
                    stats_out = gr.Textbox(label="📊 Stats", lines=12, interactive=False)

            batch_btn.click(
                fn=classify_batch,
                inputs=[csv_input],
                outputs=[csv_output, stats_out],
            )

            gr.Markdown("""
            **Sample CSV format:**
            ```
            source,log_message
            ModernCRM,User User123 logged in.
            LegacyCRM,Case escalation for ticket ID 7324 failed.
            BillingSystem,GET /api/v2/invoice HTTP/1.1 status: 500
            ```
            """)

        # ── Tab 3: Architecture ──────────────────────────────────────────
        with gr.Tab("Architecture"):
            gr.Markdown("""
            ## 🏗️ 3-Tier Hybrid Pipeline

            | Tier | Method | Coverage | Latency | When Used |
            |------|--------|----------|---------|-----------|
            | 🟢 Regex | Python `re` patterns | ~21% | < 1ms | Fixed patterns (login, backup, etc.) |
            | 🔵 BERT | `all-MiniLM-L6-v2` + LogReg | ~79% | 20–80ms | High-volume categories with 150+ samples |
            | 🟡 LLM | HuggingFace Inference API | ~0.3% | 500–2000ms | LegacyCRM logs, rare patterns |

            ## 📊 Model Performance (from training)
            - **BERT + LogReg** trained on 2,410 synthetic enterprise logs
            - **Confidence threshold**: 0.5 (below → escalate to LLM)
            - **Source-aware routing**: `LegacyCRM` bypasses ML entirely (only 7 training samples)

            ## 🔑 Environment Variables
            | Secret | Required For |
            |--------|-------------|
            | `HF_TOKEN` | LLM inference (LegacyCRM logs) |
            """)

if __name__ == "__main__":
    demo.launch()
|
classify.py
ADDED
|
@@ -0,0 +1,85 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
import pandas as pd
|
| 3 |
+
from processor_regex import classify_with_regex
|
| 4 |
+
from processor_bert import classify_with_bert
|
| 5 |
+
from processor_llm import classify_with_llm
|
| 6 |
+
|
| 7 |
+
# Source whose logs always go straight to the LLM tier.
LEGACY_SOURCE = "LegacyCRM"


def classify_log(source: str, log_msg: str) -> dict:
    """
    Route a single log through the 3-tier hybrid pipeline.

    LegacyCRM logs go directly to Tier 3 (LLM) — too few training samples
    for ML. All other sources try Tier 1 (Regex) first, then Tier 2
    (BERT + LogReg), and finally fall back to the LLM when BERT's
    confidence is below threshold (it then returns "Unclassified").

    Returns a dict with keys: label, tier, confidence.
    """
    # Source-aware routing: LegacyCRM bypasses both regex and ML tiers.
    if source == LEGACY_SOURCE:
        return {"label": classify_with_llm(log_msg), "tier": "LLM", "confidence": None}

    # Tier 1 — fixed regex patterns (sub-millisecond, treated as certain).
    pattern_label = classify_with_regex(log_msg)
    if pattern_label:
        return {"label": pattern_label, "tier": "Regex", "confidence": 1.0}

    # Tier 2 — sentence embedding + logistic regression.
    ml_label, ml_conf = classify_with_bert(log_msg)
    if ml_label != "Unclassified":
        return {"label": ml_label, "tier": "BERT", "confidence": ml_conf}

    # Tier 3 — LLM picks up what BERT was not confident about.
    return {"label": classify_with_llm(log_msg), "tier": "LLM (fallback)", "confidence": None}
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
def classify(logs: list[tuple[str, str]]) -> list[dict]:
    """Run classify_log over a batch of (source, log_message) tuples."""
    return [classify_log(src, message) for src, message in logs]
|
| 44 |
+
|
| 45 |
+
|
| 46 |
+
def classify_csv(input_path: str, output_path: str = "output.csv") -> tuple[str, pd.DataFrame]:
    """
    Classify every row of a CSV that has 'source' and 'log_message' columns.

    Writes the augmented table (predicted_label, tier_used, confidence) to
    output_path. Raises ValueError when required columns are missing.
    Returns (output_path, result_dataframe).
    """
    df = pd.read_csv(input_path)

    # Validate schema up front so the caller gets a clear message.
    required = {"source", "log_message"}
    if not required.issubset(df.columns):
        raise ValueError(f"CSV must contain columns: {required}. Got: {set(df.columns)}")

    results = classify(list(zip(df["source"], df["log_message"])))

    labels, tiers, confidences = [], [], []
    for r in results:
        labels.append(r["label"])
        tiers.append(r["tier"])
        # LLM tiers have no probability — render as "N/A" instead.
        confidences.append("N/A" if r["confidence"] is None else f"{r['confidence']:.1%}")

    df["predicted_label"] = labels
    df["tier_used"] = tiers
    df["confidence"] = confidences

    df.to_csv(output_path, index=False)
    return output_path, df
|
| 68 |
+
|
| 69 |
+
|
| 70 |
+
if __name__ == "__main__":
    # Smoke-test the router with one representative log per routing path.
    sample_logs = [
        ("ModernCRM", "IP 192.168.133.114 blocked due to potential attack"),
        ("BillingSystem", "User User12345 logged in."),
        ("AnalyticsEngine", "File data_6957.csv uploaded successfully by user User265."),
        ("ModernHR", "GET /v2/servers/detail HTTP/1.1 status: 200 len: 1583 time: 0.19"),
        ("ModernHR", "Admin access escalation detected for user 9429"),
        ("LegacyCRM", "Case escalation for ticket ID 7324 failed because the assigned agent is no longer active."),
        ("LegacyCRM", "The 'ReportGenerator' module will be retired in v4.0. Migrate to 'AdvancedAnalyticsSuite'."),
    ]

    print(f"{'Source':<20} {'Tier':<15} {'Conf':>6} {'Label':<25} Log")
    print("─" * 110)
    for (source, log), result in zip(sample_logs, classify(sample_logs)):
        # Use `is not None`, not truthiness: a genuine 0% confidence should
        # still print as a percentage (matches classify_single / classify_csv,
        # which both test `confidence is not None`).
        conf = f"{result['confidence']:.0%}" if result['confidence'] is not None else " N/A"
        print(f"{source:<20} {result['tier']:<15} {conf:>6} {result['label']:<25} {log[:45]}")
|
processor_bert.py
ADDED
|
@@ -0,0 +1,64 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import joblib
|
| 3 |
+
import numpy as np
|
| 4 |
+
from sentence_transformers import SentenceTransformer
|
| 5 |
+
|
| 6 |
+
# ── Lazy-load models on first use (faster Spaces startup) ──────────────────
# Module-level singletons, populated by _load_models() on first call.
_embedding_model = None
_classifier = None

# Trained LogisticRegression artifact, shipped alongside this module.
MODEL_PATH = os.path.join(os.path.dirname(__file__), "models", "log_classifier.joblib")
CONFIDENCE_THRESHOLD = 0.5


def _load_models():
    """Populate the embedding-model and classifier singletons if not yet loaded."""
    global _embedding_model, _classifier

    if _embedding_model is None:
        _embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

    if _classifier is None:
        # Fail loudly with instructions rather than a bare joblib error.
        if not os.path.exists(MODEL_PATH):
            raise FileNotFoundError(
                f"Model not found at {MODEL_PATH}. "
                "Run the Colab training notebook first and upload log_classifier.joblib."
            )
        _classifier = joblib.load(MODEL_PATH)
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
def classify_with_bert(log_message: str) -> tuple[str, float]:
    """
    Tier 2: BERT embedding + Logistic Regression classifier.

    Returns (label, confidence). Returns ('Unclassified', max_prob) if
    no class exceeds CONFIDENCE_THRESHOLD.
    Latency: ~20-80ms on CPU.
    """
    _load_models()

    embedding = _embedding_model.encode([log_message])
    probabilities = _classifier.predict_proba(embedding)[0]
    best_idx = int(np.argmax(probabilities))
    max_prob = float(probabilities[best_idx])

    if max_prob < CONFIDENCE_THRESHOLD:
        return "Unclassified", max_prob

    # Derive the label from the probabilities we already computed instead of
    # a second predict() call — LogisticRegression.predict is argmax of
    # predict_proba, so this halves inference work with identical output.
    return _classifier.classes_[best_idx], max_prob
|
| 45 |
+
|
| 46 |
+
|
| 47 |
+
def get_classes() -> list[str]:
    """Return every label the trained BERT+LogReg classifier can emit."""
    _load_models()
    return [*_classifier.classes_]
|
| 51 |
+
|
| 52 |
+
|
| 53 |
+
if __name__ == "__main__":
    # Manual smoke test — requires models/log_classifier.joblib on disk.
    test_logs = [
        "GET /v2/servers/detail HTTP/1.1 status: 404 len: 1583 time: 0.19",
        "System crashed due to driver errors when restarting the server",
        "Multiple login failures occurred on user 6454 account",
        "Admin access escalation detected for user 9429",
        "CPU usage at 98% for the last 10 minutes on node-7",
        "Hey bro chill ya!",  # should be Unclassified
    ]
    for message in test_logs:
        predicted, score = classify_with_bert(message)
        print(f"[{score:.0%}] {predicted:25s} | {message[:70]}")
|
processor_llm.py
ADDED
|
@@ -0,0 +1,89 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import re
|
| 3 |
+
from huggingface_hub import InferenceClient
|
| 4 |
+
|
| 5 |
+
# ── Config ─────────────────────────────────────────────────────────────────
|
| 6 |
+
HF_TOKEN = os.getenv("HF_TOKEN") # Set as HuggingFace Space secret
|
| 7 |
+
LLM_MODEL = "mistralai/Mistral-7B-Instruct-v0.3"
|
| 8 |
+
|
| 9 |
+
VALID_CATEGORIES = ["Workflow Error", "Deprecation Warning"]
|
| 10 |
+
|
| 11 |
+
SYSTEM_PROMPT = (
|
| 12 |
+
"You are an enterprise log classifier. "
|
| 13 |
+
"Classify log messages into exactly one category. "
|
| 14 |
+
"Return ONLY the category name — no explanation, no punctuation."
|
| 15 |
+
)
|
| 16 |
+
|
| 17 |
+
FEW_SHOT_EXAMPLES = [
|
| 18 |
+
{
|
| 19 |
+
"log": "Case escalation for ticket ID 7324 failed because the assigned support agent is no longer active.",
|
| 20 |
+
"label": "Workflow Error",
|
| 21 |
+
},
|
| 22 |
+
{
|
| 23 |
+
"log": "The 'BulkEmailSender' feature is no longer supported. Use 'EmailCampaignManager' instead.",
|
| 24 |
+
"label": "Deprecation Warning",
|
| 25 |
+
},
|
| 26 |
+
{
|
| 27 |
+
"log": "Invoice generation aborted for order ID 8910 due to invalid tax calculation module.",
|
| 28 |
+
"label": "Workflow Error",
|
| 29 |
+
},
|
| 30 |
+
]
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
def _build_messages(log_msg: str) -> list[dict]:
|
| 34 |
+
categories_str = ", ".join(f'"{c}"' for c in VALID_CATEGORIES)
|
| 35 |
+
|
| 36 |
+
user_content = (
|
| 37 |
+
f"Classify the following log into one of these categories: {categories_str}.\n"
|
| 38 |
+
"If none fits, return \"Unclassified\".\n\n"
|
| 39 |
+
)
|
| 40 |
+
|
| 41 |
+
# Add few-shot examples
|
| 42 |
+
for ex in FEW_SHOT_EXAMPLES:
|
| 43 |
+
user_content += f'Log: {ex["log"]}\nCategory: {ex["label"]}\n\n'
|
| 44 |
+
|
| 45 |
+
user_content += f"Log: {log_msg}\nCategory:"
|
| 46 |
+
|
| 47 |
+
return [
|
| 48 |
+
{"role": "system", "content": SYSTEM_PROMPT},
|
| 49 |
+
{"role": "user", "content": user_content},
|
| 50 |
+
]
|
| 51 |
+
|
| 52 |
+
|
| 53 |
+
def classify_with_llm(log_msg: str) -> str:
    """
    Tier 3: LLM-based classifier using HuggingFace Inference API.
    Used for LegacyCRM logs where training data is insufficient for ML.
    Latency: 500–2000ms depending on model load.
    """
    try:
        client = InferenceClient(token=HF_TOKEN)
        response = client.chat.completions.create(
            model=LLM_MODEL,
            messages=_build_messages(log_msg),
            max_tokens=15,
            temperature=0.1,
        )
        raw = response.choices[0].message.content.strip()
    except Exception as e:
        # Best-effort tier: any API failure is reported, never raised.
        print(f"[LLM] Inference error: {e}")
        return "Unclassified"

    # Map the free-text reply back onto the first known category it mentions.
    lowered = raw.lower()
    matched = next((cat for cat in VALID_CATEGORIES if cat.lower() in lowered), None)
    return matched if matched is not None else "Unclassified"
|
| 78 |
+
|
| 79 |
+
|
| 80 |
+
if __name__ == "__main__":
    # Requires HF_TOKEN in environment
    test_logs = [
        "Case escalation for ticket ID 7324 failed because the assigned support agent is no longer active.",
        "The 'ReportGenerator' module will be retired in version 4.0. Migrate to 'AdvancedAnalyticsSuite'.",
        "System reboot initiated by user 12345.",  # should be Unclassified
    ]
    for sample in test_logs:
        category = classify_with_llm(sample)
        print(f"{category:25s} | {sample[:80]}")
|
processor_regex.py
ADDED
|
@@ -0,0 +1,47 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import re
|
| 2 |
+
|
| 3 |
+
REGEX_PATTERNS = {
|
| 4 |
+
r"User\s+\w+\d+\s+logged\s+(in|out)": "User Action",
|
| 5 |
+
r"Account\s+(?:with\s+)?ID\s+\S+\s+created\s+by": "User Action",
|
| 6 |
+
r"Backup\s+(started|ended|completed\s+successfully)": "System Notification",
|
| 7 |
+
r"System\s+updated\s+to\s+version": "System Notification",
|
| 8 |
+
r"File\s+\S+\s+uploaded\s+successfully\s+by\s+user": "System Notification",
|
| 9 |
+
r"Disk\s+cleanup\s+completed\s+successfully": "System Notification",
|
| 10 |
+
r"System\s+reboot\s+initiated\s+by\s+user": "System Notification",
|
| 11 |
+
r"Scheduled\s+maintenance\s+(started|completed)": "System Notification",
|
| 12 |
+
r"Service\s+\w+\s+restarted\s+successfully": "System Notification",
|
| 13 |
+
}
|
| 14 |
+
|
| 15 |
+
def classify_with_regex(log_message: str) -> str | None:
|
| 16 |
+
"""
|
| 17 |
+
Tier 1: Rule-based classifier using regex patterns.
|
| 18 |
+
Returns category label or None if no pattern matches.
|
| 19 |
+
Latency: sub-millisecond.
|
| 20 |
+
"""
|
| 21 |
+
for pattern, label in REGEX_PATTERNS.items():
|
| 22 |
+
if re.search(pattern, log_message, re.IGNORECASE):
|
| 23 |
+
return label
|
| 24 |
+
return None
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
def get_regex_coverage(log_messages: list[str]) -> dict:
    """
    Measure regex tier coverage on a list of log messages.

    Returns a dict with 'total', 'matched' and 'coverage_pct' keys.
    An empty input yields coverage_pct 0.0 instead of raising
    ZeroDivisionError (the original divided by len() unconditionally).
    """
    total = len(log_messages)
    matched = sum(1 for msg in log_messages if classify_with_regex(msg) is not None)
    return {
        "total": total,
        "matched": matched,
        "coverage_pct": round(matched / total * 100, 2) if total else 0.0,
    }
|
| 35 |
+
|
| 36 |
+
|
| 37 |
+
if __name__ == "__main__":
    # Quick smoke test: three expected hits, two expected misses.
    test_logs = [
        "User User123 logged in.",
        "Backup completed successfully.",
        "Account with ID 456 created by Admin.",
        "GET /api/v2/resource HTTP/1.1 status: 200",  # should be None
        "Hey bro chill ya!",  # should be None
    ]
    for entry in test_logs:
        label = classify_with_regex(entry)
        print(f"[{'✓' if label else '✗'}] {label or 'None':25s} | {entry[:60]}")
|
requirements.txt
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
gradio==4.44.0
|
| 2 |
+
sentence-transformers==3.0.1
|
| 3 |
+
scikit-learn==1.5.1
|
| 4 |
+
huggingface-hub==0.24.6
|
| 5 |
+
joblib==1.4.2
|
| 6 |
+
pandas==2.2.2
|
| 7 |
+
numpy==1.26.4
|
synthetic_logs.csv
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
test.csv
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
source,log_message
|
| 2 |
+
ModernCRM, "IP 192.168.133.114 blocked due to potential attack"
|
| 3 |
+
BillingSystem, "User 12345 logged in."
|
| 4 |
+
AnalyticsEngine, "File data_6957.csv uploaded successfully by user User265."
|
| 5 |
+
AnalyticsEngine, "Backup completed successfully."
|
| 6 |
+
ModernHR, "GET /v2/54fadb412c4e40cdbaed9335e4c35a9e/servers/detail HTTP/1.1 RCODE 200 len: 1583 time: 0.1878400"
|
| 7 |
+
ModernHR, "Admin access escalation detected for user 9429"
|
| 8 |
+
LegacyCRM, "Case escalation for ticket ID 7324 failed because the assigned support agent is no longer active."
|
| 9 |
+
LegacyCRM, "Invoice generation process aborted for order ID 8910 due to invalid tax calculation module."
|
| 10 |
+
LegacyCRM, "The 'BulkEmailSender' feature is no longer supported. Use 'EmailCampaignManager' for improved functionality."
|
| 11 |
+
LegacyCRM, "The 'ReportGenerator' module will be retired in version 4.0. Please migrate to the 'AdvancedAnalyticsSuite' by Dec 2025"
|