NOT-OMEGA committed on
Commit
abc86a6
·
verified ·
1 Parent(s): 435dc35

Upload 9 files

Browse files
Files changed (9) hide show
  1. README.md +53 -5
  2. app.py +187 -0
  3. classify.py +85 -0
  4. processor_bert.py +64 -0
  5. processor_llm.py +89 -0
  6. processor_regex.py +47 -0
  7. requirements.txt +7 -0
  8. synthetic_logs.csv +0 -0
  9. test.csv +11 -0
README.md CHANGED
@@ -1,13 +1,61 @@
1
  ---
2
  title: Log Classification System
3
- emoji: 📉
4
- colorFrom: green
5
- colorTo: pink
6
  sdk: gradio
7
- sdk_version: 6.10.0
8
  app_file: app.py
9
  pinned: false
10
  license: mit
11
  ---
12
 
13
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
  title: Log Classification System
3
+ emoji: 🔍
4
+ colorFrom: blue
5
+ colorTo: indigo
6
  sdk: gradio
7
+ sdk_version: 4.44.0
8
  app_file: app.py
9
  pinned: false
10
  license: mit
11
  ---
12
 
13
+ # 🔍 Log Classification System
14
+
15
+ A **production-inspired hybrid log classification pipeline** that routes enterprise logs through 3 tiers — Regex → BERT + Logistic Regression → LLM — based on pattern confidence and source system.
16
+
17
+ ## Architecture
18
+
19
+ ```
20
+ Input Log
21
+
22
+ ├─► [Tier 1] Regex Classifier → Fixed patterns (sub-ms latency)
23
+ │ │ No match?
24
+ │ ▼
25
+ ├─► [Tier 2] BERT + LogReg → High-confidence ML (conf > 0.5)
26
+ │ │ Low confidence?
27
+ │ ▼
28
+ └─► [Tier 3] LLM (HF Inference) → LegacyCRM / rare patterns
29
+ ```
30
+
31
+ ## Categories
32
+
33
+ | Category | Tier Used |
34
+ |---|---|
35
+ | User Action | Regex |
36
+ | System Notification | Regex |
37
+ | HTTP Status | BERT |
38
+ | Security Alert | BERT |
39
+ | Critical Error | BERT |
40
+ | Error | BERT |
41
+ | Resource Usage | BERT |
42
+ | Workflow Error | LLM |
43
+ | Deprecation Warning | LLM |
44
+
45
+ ## Setup
46
+
47
+ ### HuggingFace Spaces Secrets Required
48
+ - `HF_TOKEN` — your HuggingFace token (for LLM inference on LegacyCRM logs)
49
+
50
+ ### Local Setup
51
+ ```bash
52
+ pip install -r requirements.txt
53
+ python app.py
54
+ ```
55
+
56
+ ## Source Systems
57
+ - `ModernCRM`, `ModernHR`, `BillingSystem`, `AnalyticsEngine`, `ThirdPartyAPI` → Regex → BERT
58
+ - `LegacyCRM` → LLM directly (too few training samples for ML)
59
+
60
+ ## Tech Stack
61
+ `sentence-transformers` · `scikit-learn` · `huggingface-hub` · `gradio` · `fastapi` · `pandas`
app.py ADDED
@@ -0,0 +1,187 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Log Classification System — HuggingFace Spaces
3
+ Gradio UI for the 3-tier hybrid log classification pipeline.
4
+ """
5
+ from __future__ import annotations
6
+ import io
7
+ import time
8
+ import pandas as pd
9
+ import gradio as gr
10
+ from classify import classify_log, classify_csv
11
+
12
+ # ── Source options ──────────────────────────────────────────────────────────
13
+ SOURCES = [
14
+ "ModernCRM",
15
+ "ModernHR",
16
+ "BillingSystem",
17
+ "AnalyticsEngine",
18
+ "ThirdPartyAPI",
19
+ "LegacyCRM",
20
+ ]
21
+
22
+ TIER_COLORS = {
23
+ "Regex": "🟢",
24
+ "BERT": "🔵",
25
+ "LLM": "🟡",
26
+ "LLM (fallback)": "🟠",
27
+ }
28
+
29
+ EXAMPLE_LOGS = [
30
+ ["ModernCRM", "User User12345 logged in."],
31
+ ["ModernHR", "Multiple login failures occurred on user 6454 account"],
32
+ ["BillingSystem", "GET /v2/servers/detail HTTP/1.1 status: 200 len: 1583 time: 0.19"],
33
+ ["AnalyticsEngine", "System crashed due to disk I/O failure on node-3"],
34
+ ["LegacyCRM", "Case escalation for ticket ID 7324 failed — support agent is no longer active."],
35
+ ["LegacyCRM", "The 'BulkEmailSender' feature will be deprecated in v5.0. Use 'EmailCampaignManager'."],
36
+ ]
37
+
38
+
39
# ── Single log tab ──────────────────────────────────────────────────────────
def classify_single(source: str, log_message: str):
    """Classify one log and report (label, tier badge, confidence, latency).

    Blank/whitespace-only input short-circuits to placeholder dashes so the
    UI never calls the pipeline with an empty message.
    """
    if not log_message.strip():
        return "—", "—", "—", "—"

    start = time.perf_counter()
    outcome = classify_log(source, log_message)
    elapsed_ms = (time.perf_counter() - start) * 1000

    tier = outcome["tier"]
    conf = outcome["confidence"]
    # Regex/BERT tiers report a probability; the LLM tier reports None.
    conf_text = "N/A" if conf is None else f"{conf:.1%}"
    badge = TIER_COLORS.get(tier, "⚪")

    return (
        outcome["label"],
        f"{badge} {tier}",
        conf_text,
        f"{elapsed_ms:.1f} ms",
    )
59
+
60
+
61
# ── Batch CSV tab ───────────────────────────────────────────────────────────
def classify_batch(file):
    """Classify every row of an uploaded CSV and summarize the results.

    Args:
        file: value from gr.File — in gradio 4.x the default ``type="filepath"``
            yields a plain ``str`` path, while older versions yield a
            tempfile-like object with a ``.name`` attribute.

    Returns:
        (output_path, stats_text) on success, or (None, error_text) on failure.
    """
    if file is None:
        return None, "⚠️ Please upload a CSV file."

    # BUG FIX: `file.name` raises AttributeError when gradio hands back a str
    # path (the 4.x default for gr.File). Accept both representations.
    path = file if isinstance(file, str) else file.name

    try:
        output_path, df = classify_csv(path, "/tmp/classified_output.csv")
    except ValueError as e:
        # Schema errors (missing columns) get a friendly warning, not a crash.
        return None, f"⚠️ {e}"
    except Exception as e:
        return None, f"❌ Error: {e}"

    total = len(df)
    tier_counts = df["tier_used"].value_counts().to_dict()
    label_counts = df["predicted_label"].value_counts().to_dict()

    tier_lines = "\n".join(f"  {TIER_COLORS.get(k,'⚪')} {k}: {v} ({v/total:.0%})" for k, v in tier_counts.items())
    label_lines = "\n".join(f"  • {k}: {v}" for k, v in label_counts.items())

    stats = (
        f"✅ Classified {total} logs\n\n"
        f"📊 Tier breakdown:\n{tier_lines}\n\n"
        f"🏷️ Label distribution:\n{label_lines}"
    )

    return output_path, stats
87
+
88
+
89
# ── UI ──────────────────────────────────────────────────────────────────────
# Declarative gradio layout: three tabs (single log, batch CSV, architecture
# notes). All callbacks are wired to the functions defined above.
with gr.Blocks(title="Log Classification System", theme=gr.themes.Soft()) as demo:

    gr.Markdown("""
    # 🔍 Log Classification System
    **3-tier hybrid pipeline** → 🟢 Regex · 🔵 BERT + LogReg · 🟡 LLM
    Built to mimic production enterprise log monitoring architecture.
    """)

    with gr.Tabs():

        # ── Tab 1: Single Log ────────────────────────────────────────────
        with gr.Tab("Single Log"):
            with gr.Row():
                source_input = gr.Dropdown(
                    choices=SOURCES,
                    value="ModernCRM",
                    label="Source System",
                )
                log_input = gr.Textbox(
                    label="Log Message",
                    placeholder="Paste a log message here...",
                    lines=3,
                )

            classify_btn = gr.Button("Classify", variant="primary")

            # Four read-only result fields filled by classify_single.
            with gr.Row():
                label_out = gr.Textbox(label="🏷️ Predicted Label", interactive=False)
                tier_out = gr.Textbox(label="⚙️ Tier Used", interactive=False)
                confidence_out = gr.Textbox(label="📈 Confidence", interactive=False)
                latency_out = gr.Textbox(label="⏱️ Latency", interactive=False)

            classify_btn.click(
                fn=classify_single,
                inputs=[source_input, log_input],
                outputs=[label_out, tier_out, confidence_out, latency_out],
            )

            gr.Examples(
                examples=EXAMPLE_LOGS,
                inputs=[source_input, log_input],
                label="📋 Example Logs (click to try)",
            )

        # ── Tab 2: Batch CSV ─────────────────────────────────────────────
        with gr.Tab("Batch CSV Upload"):
            gr.Markdown("""
            Upload a CSV with columns: **`source`**, **`log_message`**
            Download the classified CSV with added columns: `predicted_label`, `tier_used`, `confidence`.
            """)
            with gr.Row():
                with gr.Column():
                    csv_input = gr.File(label="📂 Upload CSV", file_types=[".csv"])
                    batch_btn = gr.Button("Classify All", variant="primary")
                with gr.Column():
                    csv_output = gr.File(label="📥 Download Classified CSV")
                    stats_out = gr.Textbox(label="📊 Stats", lines=12, interactive=False)

            batch_btn.click(
                fn=classify_batch,
                inputs=[csv_input],
                outputs=[csv_output, stats_out],
            )

            gr.Markdown("""
            **Sample CSV format:**
            ```
            source,log_message
            ModernCRM,User User123 logged in.
            LegacyCRM,Case escalation for ticket ID 7324 failed.
            BillingSystem,GET /api/v2/invoice HTTP/1.1 status: 500
            ```
            """)

        # ── Tab 3: Architecture ──────────────────────────────────────────
        with gr.Tab("Architecture"):
            gr.Markdown("""
            ## 🏗️ 3-Tier Hybrid Pipeline

            | Tier | Method | Coverage | Latency | When Used |
            |------|--------|----------|---------|-----------|
            | 🟢 Regex | Python `re` patterns | ~21% | < 1ms | Fixed patterns (login, backup, etc.) |
            | 🔵 BERT | `all-MiniLM-L6-v2` + LogReg | ~79% | 20–80ms | High-volume categories with 150+ samples |
            | 🟡 LLM | HuggingFace Inference API | ~0.3% | 500–2000ms | LegacyCRM logs, rare patterns |

            ## 📊 Model Performance (from training)
            - **BERT + LogReg** trained on 2,410 synthetic enterprise logs
            - **Confidence threshold**: 0.5 (below → escalate to LLM)
            - **Source-aware routing**: `LegacyCRM` bypasses ML entirely (only 7 training samples)

            ## 🔑 Environment Variables
            | Secret | Required For |
            |--------|-------------|
            | `HF_TOKEN` | LLM inference (LegacyCRM logs) |
            """)

# Spaces imports this module and serves `demo`; launch only when run directly.
if __name__ == "__main__":
    demo.launch()
classify.py ADDED
@@ -0,0 +1,85 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from __future__ import annotations
import pandas as pd
from processor_regex import classify_with_regex
from processor_bert import classify_with_bert
from processor_llm import classify_with_llm

# Source whose logs bypass the regex/BERT tiers and go straight to the LLM
# (classify_log: too few training samples for the ML classifier).
LEGACY_SOURCE = "LegacyCRM"
8
+
9
+
10
def classify_log(source: str, log_msg: str) -> dict:
    """
    Route a single log through the 3-tier hybrid pipeline.

    Routing:
      * ``LegacyCRM`` logs go straight to Tier 3 (LLM) — there is not enough
        training data to trust the ML tier for that source.
      * Everything else tries Tier 1 (Regex), then Tier 2 (BERT + LogReg),
        then falls back to Tier 3 (LLM) when BERT's confidence is too low.

    Returns:
        dict with keys ``label``, ``tier``, ``confidence`` (``None`` when the
        LLM tier produced the label, since it reports no probability).
    """
    if source == LEGACY_SOURCE:
        return {
            "label": classify_with_llm(log_msg),
            "tier": "LLM",
            "confidence": None,
        }

    # Tier 1 — Regex: deterministic patterns, treated as 100% confident.
    regex_label = classify_with_regex(log_msg)
    if regex_label:
        return {"label": regex_label, "tier": "Regex", "confidence": 1.0}

    # Tier 2 — BERT + LogReg: accept unless the model abstained.
    bert_label, bert_conf = classify_with_bert(log_msg)
    if bert_label != "Unclassified":
        return {"label": bert_label, "tier": "BERT", "confidence": bert_conf}

    # Tier 3 — LLM fallback for low-confidence BERT predictions.
    return {
        "label": classify_with_llm(log_msg),
        "tier": "LLM (fallback)",
        "confidence": None,
    }
39
+
40
+
41
def classify(logs: list[tuple[str, str]]) -> list[dict]:
    """Run classify_log over a list of (source, log_message) tuples."""
    return [classify_log(*entry) for entry in logs]
44
+
45
+
46
def classify_csv(input_path: str, output_path: str = "output.csv") -> tuple[str, pd.DataFrame]:
    """
    Classify every row of a CSV through the hybrid pipeline.

    The input must contain ``source`` and ``log_message`` columns; three new
    columns (``predicted_label``, ``tier_used``, ``confidence``) are appended
    and the result is written to ``output_path``.

    Returns:
        (output_path, result_dataframe)

    Raises:
        ValueError: if the required columns are missing.
    """
    df = pd.read_csv(input_path)

    required = {"source", "log_message"}
    if not required.issubset(df.columns):
        raise ValueError(f"CSV must contain columns: {required}. Got: {set(df.columns)}")

    rows = list(zip(df["source"], df["log_message"]))
    predictions = classify(rows)

    df["predicted_label"] = [p["label"] for p in predictions]
    df["tier_used"] = [p["tier"] for p in predictions]
    # Confidence is stored pre-formatted; LLM-tier rows carry no probability.
    df["confidence"] = [
        "N/A" if p["confidence"] is None else f"{p['confidence']:.1%}"
        for p in predictions
    ]

    df.to_csv(output_path, index=False)
    return output_path, df
68
+
69
+
70
+ if __name__ == "__main__":
71
+ sample_logs = [
72
+ ("ModernCRM", "IP 192.168.133.114 blocked due to potential attack"),
73
+ ("BillingSystem", "User User12345 logged in."),
74
+ ("AnalyticsEngine", "File data_6957.csv uploaded successfully by user User265."),
75
+ ("ModernHR", "GET /v2/servers/detail HTTP/1.1 status: 200 len: 1583 time: 0.19"),
76
+ ("ModernHR", "Admin access escalation detected for user 9429"),
77
+ ("LegacyCRM", "Case escalation for ticket ID 7324 failed because the assigned agent is no longer active."),
78
+ ("LegacyCRM", "The 'ReportGenerator' module will be retired in v4.0. Migrate to 'AdvancedAnalyticsSuite'."),
79
+ ]
80
+
81
+ print(f"{'Source':<20} {'Tier':<15} {'Conf':>6} {'Label':<25} Log")
82
+ print("─" * 110)
83
+ for (source, log), result in zip(sample_logs, classify(sample_logs)):
84
+ conf = f"{result['confidence']:.0%}" if result['confidence'] else " N/A"
85
+ print(f"{source:<20} {result['tier']:<15} {conf:>6} {result['label']:<25} {log[:45]}")
processor_bert.py ADDED
@@ -0,0 +1,64 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os
import joblib
import numpy as np
from sentence_transformers import SentenceTransformer

# ── Lazy-load models on first use (faster Spaces startup) ──────────────────
# Both globals are populated by _load_models(); None until the first request.
_embedding_model = None
_classifier = None

# Pre-trained classifier bundle produced offline (Colab training notebook)
# and uploaded alongside this file.
MODEL_PATH = os.path.join(os.path.dirname(__file__), "models", "log_classifier.joblib")
# Below this max class probability, classify_with_bert reports "Unclassified".
CONFIDENCE_THRESHOLD = 0.5
12
+
13
+
14
def _load_models():
    """Idempotently populate the module-level model globals.

    Raises:
        FileNotFoundError: when the trained classifier bundle is missing.
    """
    global _embedding_model, _classifier

    if _classifier is None:
        if not os.path.exists(MODEL_PATH):
            raise FileNotFoundError(
                f"Model not found at {MODEL_PATH}. "
                "Run the Colab training notebook first and upload log_classifier.joblib."
            )
        _classifier = joblib.load(MODEL_PATH)

    if _embedding_model is None:
        # Downloaded from the HF hub on first use; cached afterwards.
        _embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
25
+
26
+
27
def classify_with_bert(log_message: str) -> tuple[str, float]:
    """
    Tier 2: BERT embedding + Logistic Regression classifier.

    Returns:
        (label, confidence). ('Unclassified', max_prob) when no class
        probability reaches CONFIDENCE_THRESHOLD.

    Latency: ~20-80ms on CPU.
    """
    _load_models()

    embedding = _embedding_model.encode([log_message])
    probabilities = _classifier.predict_proba(embedding)[0]
    best = int(np.argmax(probabilities))
    max_prob = float(probabilities[best])

    if max_prob < CONFIDENCE_THRESHOLD:
        return "Unclassified", max_prob

    # IMPROVED: derive the label from the same probability vector instead of
    # a second `_classifier.predict(...)` call — LogisticRegression.predict
    # is the argmax over predict_proba, so this is equivalent and avoids
    # running the classifier twice per log.
    return _classifier.classes_[best], max_prob
45
+
46
+
47
def get_classes() -> list[str]:
    """Return the label set known to the trained BERT+LogReg classifier."""
    _load_models()
    return [*_classifier.classes_]
51
+
52
+
53
+ if __name__ == "__main__":
54
+ test_logs = [
55
+ "GET /v2/servers/detail HTTP/1.1 status: 404 len: 1583 time: 0.19",
56
+ "System crashed due to driver errors when restarting the server",
57
+ "Multiple login failures occurred on user 6454 account",
58
+ "Admin access escalation detected for user 9429",
59
+ "CPU usage at 98% for the last 10 minutes on node-7",
60
+ "Hey bro chill ya!", # should be Unclassified
61
+ ]
62
+ for log in test_logs:
63
+ label, conf = classify_with_bert(log)
64
+ print(f"[{conf:.0%}] {label:25s} | {log[:70]}")
processor_llm.py ADDED
@@ -0,0 +1,89 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os
import re  # NOTE(review): appears unused in this module — confirm before removing
from huggingface_hub import InferenceClient

# ── Config ─────────────────────────────────────────────────────────────────
HF_TOKEN = os.getenv("HF_TOKEN")  # Set as HuggingFace Space secret
LLM_MODEL = "mistralai/Mistral-7B-Instruct-v0.3"

# Labels the LLM may return; anything else is normalized to "Unclassified"
# by classify_with_llm.
VALID_CATEGORIES = ["Workflow Error", "Deprecation Warning"]

SYSTEM_PROMPT = (
    "You are an enterprise log classifier. "
    "Classify log messages into exactly one category. "
    "Return ONLY the category name — no explanation, no punctuation."
)

# Few-shot examples embedded into the user prompt (see _build_messages).
FEW_SHOT_EXAMPLES = [
    {
        "log": "Case escalation for ticket ID 7324 failed because the assigned support agent is no longer active.",
        "label": "Workflow Error",
    },
    {
        "log": "The 'BulkEmailSender' feature is no longer supported. Use 'EmailCampaignManager' instead.",
        "label": "Deprecation Warning",
    },
    {
        "log": "Invoice generation aborted for order ID 8910 due to invalid tax calculation module.",
        "label": "Workflow Error",
    },
]
31
+
32
+
33
def _build_messages(log_msg: str) -> list[dict]:
    """Assemble the chat payload: system prompt + few-shot user prompt."""
    categories_str = ", ".join(f'"{c}"' for c in VALID_CATEGORIES)

    # Build the user message piecewise and join once at the end.
    parts = [
        f"Classify the following log into one of these categories: {categories_str}.\n"
        "If none fits, return \"Unclassified\".\n\n"
    ]
    parts.extend(
        f'Log: {ex["log"]}\nCategory: {ex["label"]}\n\n'
        for ex in FEW_SHOT_EXAMPLES
    )
    parts.append(f"Log: {log_msg}\nCategory:")

    return [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": "".join(parts)},
    ]
51
+
52
+
53
def classify_with_llm(log_msg: str) -> str:
    """
    Tier 3: LLM-based classifier via the HuggingFace Inference API.

    Used for LegacyCRM logs (insufficient ML training data) and as the
    low-confidence fallback. Any API failure degrades to "Unclassified".
    Latency: 500–2000ms depending on model load.
    """
    try:
        reply = InferenceClient(token=HF_TOKEN).chat.completions.create(
            model=LLM_MODEL,
            messages=_build_messages(log_msg),
            max_tokens=15,
            temperature=0.1,
        )
        raw = reply.choices[0].message.content.strip()
    except Exception as e:
        # Best-effort tier: log and degrade rather than crash the pipeline.
        print(f"[LLM] Inference error: {e}")
        return "Unclassified"

    # Map the free-text reply onto a known category (case-insensitive).
    lowered = raw.lower()
    for cat in VALID_CATEGORIES:
        if cat.lower() in lowered:
            return cat

    return "Unclassified"
78
+
79
+
80
+ if __name__ == "__main__":
81
+ # Requires HF_TOKEN in environment
82
+ test_logs = [
83
+ "Case escalation for ticket ID 7324 failed because the assigned support agent is no longer active.",
84
+ "The 'ReportGenerator' module will be retired in version 4.0. Migrate to 'AdvancedAnalyticsSuite'.",
85
+ "System reboot initiated by user 12345.", # should be Unclassified
86
+ ]
87
+ for log in test_logs:
88
+ result = classify_with_llm(log)
89
+ print(f"{result:25s} | {log[:80]}")
processor_regex.py ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+
3
+ REGEX_PATTERNS = {
4
+ r"User\s+\w+\d+\s+logged\s+(in|out)": "User Action",
5
+ r"Account\s+(?:with\s+)?ID\s+\S+\s+created\s+by": "User Action",
6
+ r"Backup\s+(started|ended|completed\s+successfully)": "System Notification",
7
+ r"System\s+updated\s+to\s+version": "System Notification",
8
+ r"File\s+\S+\s+uploaded\s+successfully\s+by\s+user": "System Notification",
9
+ r"Disk\s+cleanup\s+completed\s+successfully": "System Notification",
10
+ r"System\s+reboot\s+initiated\s+by\s+user": "System Notification",
11
+ r"Scheduled\s+maintenance\s+(started|completed)": "System Notification",
12
+ r"Service\s+\w+\s+restarted\s+successfully": "System Notification",
13
+ }
14
+
15
+ def classify_with_regex(log_message: str) -> str | None:
16
+ """
17
+ Tier 1: Rule-based classifier using regex patterns.
18
+ Returns category label or None if no pattern matches.
19
+ Latency: sub-millisecond.
20
+ """
21
+ for pattern, label in REGEX_PATTERNS.items():
22
+ if re.search(pattern, log_message, re.IGNORECASE):
23
+ return label
24
+ return None
25
+
26
+
27
def get_regex_coverage(log_messages: list[str]) -> dict:
    """Measure regex tier coverage on a list of log messages.

    Returns:
        dict with ``total``, ``matched``, ``coverage_pct`` keys.
        An empty input yields 0.0 coverage instead of raising.
    """
    total = len(log_messages)
    matched = sum(1 for msg in log_messages if classify_with_regex(msg) is not None)
    # BUG FIX: guard the division — the original raised ZeroDivisionError
    # when called with an empty list.
    coverage = round(matched / total * 100, 2) if total else 0.0
    return {
        "total": total,
        "matched": matched,
        "coverage_pct": coverage,
    }
35
+
36
+
37
+ if __name__ == "__main__":
38
+ test_logs = [
39
+ "User User123 logged in.",
40
+ "Backup completed successfully.",
41
+ "Account with ID 456 created by Admin.",
42
+ "GET /api/v2/resource HTTP/1.1 status: 200", # should be None
43
+ "Hey bro chill ya!", # should be None
44
+ ]
45
+ for log in test_logs:
46
+ result = classify_with_regex(log)
47
+ print(f"[{'✓' if result else '✗'}] {result or 'None':25s} | {log[:60]}")
requirements.txt ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ gradio==4.44.0
2
+ sentence-transformers==3.0.1
3
+ scikit-learn==1.5.1
4
+ huggingface-hub==0.24.6
5
+ joblib==1.4.2
6
+ pandas==2.2.2
7
+ numpy==1.26.4
synthetic_logs.csv ADDED
The diff for this file is too large to render. See raw diff
 
test.csv ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ source,log_message
2
+ ModernCRM,"IP 192.168.133.114 blocked due to potential attack"
3
+ BillingSystem,"User 12345 logged in."
4
+ AnalyticsEngine,"File data_6957.csv uploaded successfully by user User265."
5
+ AnalyticsEngine,"Backup completed successfully."
6
+ ModernHR,"GET /v2/54fadb412c4e40cdbaed9335e4c35a9e/servers/detail HTTP/1.1 RCODE 200 len: 1583 time: 0.1878400"
7
+ ModernHR,"Admin access escalation detected for user 9429"
8
+ LegacyCRM,"Case escalation for ticket ID 7324 failed because the assigned support agent is no longer active."
9
+ LegacyCRM,"Invoice generation process aborted for order ID 8910 due to invalid tax calculation module."
10
+ LegacyCRM,"The 'BulkEmailSender' feature is no longer supported. Use 'EmailCampaignManager' for improved functionality."
11
+ LegacyCRM,"The 'ReportGenerator' module will be retired in version 4.0. Please migrate to the 'AdvancedAnalyticsSuite' by Dec 2025"