Upload 9 files
Browse files- README.md +53 -5
- app.py +187 -0
- classify.py +85 -0
- processor_bert.py +64 -0
- processor_llm.py +89 -0
- processor_regex.py +47 -0
- requirements.txt +7 -0
- synthetic_logs.csv +0 -0
- test.csv +11 -0
README.md
CHANGED
|
@@ -1,13 +1,61 @@
|
|
| 1 |
---
|
| 2 |
title: Log Classification System
|
| 3 |
-
emoji:
|
| 4 |
-
colorFrom:
|
| 5 |
-
colorTo:
|
| 6 |
sdk: gradio
|
| 7 |
-
sdk_version:
|
| 8 |
app_file: app.py
|
| 9 |
pinned: false
|
| 10 |
license: mit
|
| 11 |
---
|
| 12 |
|
| 13 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
---
|
| 2 |
title: Log Classification System
|
| 3 |
+
emoji: 🔍
|
| 4 |
+
colorFrom: blue
|
| 5 |
+
colorTo: indigo
|
| 6 |
sdk: gradio
|
| 7 |
+
sdk_version: 4.44.0
|
| 8 |
app_file: app.py
|
| 9 |
pinned: false
|
| 10 |
license: mit
|
| 11 |
---
|
| 12 |
|
| 13 |
+
# 🔍 Log Classification System
|
| 14 |
+
|
| 15 |
+
A **production-inspired hybrid log classification pipeline** that routes enterprise logs through 3 tiers — Regex → BERT + Logistic Regression → LLM — based on pattern confidence and source system.
|
| 16 |
+
|
| 17 |
+
## Architecture
|
| 18 |
+
|
| 19 |
+
```
|
| 20 |
+
Input Log
|
| 21 |
+
│
|
| 22 |
+
├─► [Tier 1] Regex Classifier → Fixed patterns (sub-ms latency)
|
| 23 |
+
│ │ No match?
|
| 24 |
+
│ ▼
|
| 25 |
+
├─► [Tier 2] BERT + LogReg → High-confidence ML (conf > 0.5)
|
| 26 |
+
│ │ Low confidence?
|
| 27 |
+
│ ▼
|
| 28 |
+
└─► [Tier 3] LLM (HF Inference) → LegacyCRM / rare patterns
|
| 29 |
+
```
|
| 30 |
+
|
| 31 |
+
## Categories
|
| 32 |
+
|
| 33 |
+
| Category | Tier Used |
|
| 34 |
+
|---|---|
|
| 35 |
+
| User Action | Regex |
|
| 36 |
+
| System Notification | Regex |
|
| 37 |
+
| HTTP Status | BERT |
|
| 38 |
+
| Security Alert | BERT |
|
| 39 |
+
| Critical Error | BERT |
|
| 40 |
+
| Error | BERT |
|
| 41 |
+
| Resource Usage | BERT |
|
| 42 |
+
| Workflow Error | LLM |
|
| 43 |
+
| Deprecation Warning | LLM |
|
| 44 |
+
|
| 45 |
+
## Setup
|
| 46 |
+
|
| 47 |
+
### HuggingFace Spaces Secrets Required
|
| 48 |
+
- `HF_TOKEN` — your HuggingFace token (for LLM inference on LegacyCRM logs)
|
| 49 |
+
|
| 50 |
+
### Local Setup
|
| 51 |
+
```bash
|
| 52 |
+
pip install -r requirements.txt
|
| 53 |
+
python app.py
|
| 54 |
+
```
|
| 55 |
+
|
| 56 |
+
## Source Systems
|
| 57 |
+
- `ModernCRM`, `ModernHR`, `BillingSystem`, `AnalyticsEngine`, `ThirdPartyAPI` → Regex → BERT
|
| 58 |
+
- `LegacyCRM` → LLM directly (too few training samples for ML)
|
| 59 |
+
|
| 60 |
+
## Tech Stack
|
| 61 |
+
`sentence-transformers` · `scikit-learn` · `huggingface-hub` · `gradio` · `fastapi` · `pandas`
|
app.py
ADDED
|
@@ -0,0 +1,187 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Log Classification System — HuggingFace Spaces
|
| 3 |
+
Gradio UI for the 3-tier hybrid log classification pipeline.
|
| 4 |
+
"""
|
| 5 |
+
from __future__ import annotations
|
| 6 |
+
import io
|
| 7 |
+
import time
|
| 8 |
+
import pandas as pd
|
| 9 |
+
import gradio as gr
|
| 10 |
+
from classify import classify_log, classify_csv
|
| 11 |
+
|
| 12 |
+
# ── UI configuration constants ──────────────────────────────────────────────
# Source systems the dropdown offers; must match what the router expects.
SOURCES = [
    "ModernCRM",
    "ModernHR",
    "BillingSystem",
    "AnalyticsEngine",
    "ThirdPartyAPI",
    "LegacyCRM",
]

# Colored badge shown next to the tier name in the results panel.
TIER_COLORS = {
    "Regex": "🟢",
    "BERT": "🔵",
    "LLM": "🟡",
    "LLM (fallback)": "🟠",
}

# (source, log_message) pairs wired into gr.Examples — one per routing path.
EXAMPLE_LOGS = [
    ["ModernCRM", "User User12345 logged in."],
    ["ModernHR", "Multiple login failures occurred on user 6454 account"],
    ["BillingSystem", "GET /v2/servers/detail HTTP/1.1 status: 200 len: 1583 time: 0.19"],
    ["AnalyticsEngine", "System crashed due to disk I/O failure on node-3"],
    ["LegacyCRM", "Case escalation for ticket ID 7324 failed — support agent is no longer active."],
    ["LegacyCRM", "The 'BulkEmailSender' feature will be deprecated in v5.0. Use 'EmailCampaignManager'."],
]
|
| 37 |
+
|
| 38 |
+
|
| 39 |
+
# ── Single log tab ──────────────────────────────────────────────────────────
def classify_single(source: str, log_message: str):
    """Classify one log and return (label, tier badge, confidence, latency) strings."""
    if not log_message.strip():
        # Blank input: leave all four output boxes as placeholders.
        return "—", "—", "—", "—"

    start = time.perf_counter()
    outcome = classify_log(source, log_message)
    elapsed_ms = (time.perf_counter() - start) * 1000

    tier_name = outcome["tier"]
    badge = TIER_COLORS.get(tier_name, "⚪")
    # Regex/BERT tiers carry a probability; the LLM tiers report None.
    conf_text = "N/A" if outcome["confidence"] is None else f"{outcome['confidence']:.1%}"

    return (
        outcome["label"],
        f"{badge} {tier_name}",
        conf_text,
        f"{elapsed_ms:.1f} ms",
    )
|
| 59 |
+
|
| 60 |
+
|
| 61 |
+
# ── Batch CSV tab ───────────────────────────────────────────────────────────
def classify_batch(file):
    """Classify every row of an uploaded CSV; return (output file path, stats text)."""
    if file is None:
        return None, "⚠️ Please upload a CSV file."

    try:
        output_path, df = classify_csv(file.name, "/tmp/classified_output.csv")
    except ValueError as e:
        # Schema problems (missing required columns) get the softer warning icon.
        return None, f"⚠️ {e}"
    except Exception as e:
        return None, f"❌ Error: {e}"

    total = len(df)
    by_tier = df["tier_used"].value_counts().to_dict()
    by_label = df["predicted_label"].value_counts().to_dict()

    tier_lines = "\n".join(
        f" {TIER_COLORS.get(k,'⚪')} {k}: {v} ({v/total:.0%})" for k, v in by_tier.items()
    )
    label_lines = "\n".join(f" • {k}: {v}" for k, v in by_label.items())

    stats = (
        f"✅ Classified {total} logs\n\n"
        f"📊 Tier breakdown:\n{tier_lines}\n\n"
        f"🏷️ Label distribution:\n{label_lines}"
    )

    return output_path, stats
|
| 87 |
+
|
| 88 |
+
|
| 89 |
+
# ── UI ──────────────────────────────────────────────────────────────────────
# Three tabs: interactive single-log demo, batch CSV processing, and a static
# architecture explainer. All callbacks are defined above this block.
with gr.Blocks(title="Log Classification System", theme=gr.themes.Soft()) as demo:

    gr.Markdown("""
    # 🔍 Log Classification System
    **3-tier hybrid pipeline** → 🟢 Regex · 🔵 BERT + LogReg · 🟡 LLM
    Built to mimic production enterprise log monitoring architecture.
    """)

    with gr.Tabs():

        # ── Tab 1: Single Log ────────────────────────────────────────────
        with gr.Tab("Single Log"):
            with gr.Row():
                source_input = gr.Dropdown(
                    choices=SOURCES, value="ModernCRM", label="Source System"
                )
                log_input = gr.Textbox(
                    label="Log Message",
                    placeholder="Paste a log message here...",
                    lines=3,
                )

            classify_btn = gr.Button("Classify", variant="primary")

            with gr.Row():
                label_out = gr.Textbox(label="🏷️ Predicted Label", interactive=False)
                tier_out = gr.Textbox(label="⚙️ Tier Used", interactive=False)
                confidence_out = gr.Textbox(label="📈 Confidence", interactive=False)
                latency_out = gr.Textbox(label="⏱️ Latency", interactive=False)

            classify_btn.click(
                fn=classify_single,
                inputs=[source_input, log_input],
                outputs=[label_out, tier_out, confidence_out, latency_out],
            )

            gr.Examples(
                examples=EXAMPLE_LOGS,
                inputs=[source_input, log_input],
                label="📋 Example Logs (click to try)",
            )

        # ── Tab 2: Batch CSV ─────────────────────────────────────────────
        with gr.Tab("Batch CSV Upload"):
            gr.Markdown("""
            Upload a CSV with columns: **`source`**, **`log_message`**
            Download the classified CSV with added columns: `predicted_label`, `tier_used`, `confidence`.
            """)
            with gr.Row():
                with gr.Column():
                    csv_input = gr.File(label="📂 Upload CSV", file_types=[".csv"])
                    batch_btn = gr.Button("Classify All", variant="primary")
                with gr.Column():
                    csv_output = gr.File(label="📥 Download Classified CSV")
                    stats_out = gr.Textbox(label="📊 Stats", lines=12, interactive=False)

            batch_btn.click(
                fn=classify_batch,
                inputs=[csv_input],
                outputs=[csv_output, stats_out],
            )

            gr.Markdown("""
            **Sample CSV format:**
            ```
            source,log_message
            ModernCRM,User User123 logged in.
            LegacyCRM,Case escalation for ticket ID 7324 failed.
            BillingSystem,GET /api/v2/invoice HTTP/1.1 status: 500
            ```
            """)

        # ── Tab 3: Architecture ──────────────────────────────────────────
        with gr.Tab("Architecture"):
            gr.Markdown("""
            ## 🏗️ 3-Tier Hybrid Pipeline

            | Tier | Method | Coverage | Latency | When Used |
            |------|--------|----------|---------|-----------|
            | 🟢 Regex | Python `re` patterns | ~21% | < 1ms | Fixed patterns (login, backup, etc.) |
            | 🔵 BERT | `all-MiniLM-L6-v2` + LogReg | ~79% | 20–80ms | High-volume categories with 150+ samples |
            | 🟡 LLM | HuggingFace Inference API | ~0.3% | 500–2000ms | LegacyCRM logs, rare patterns |

            ## 📊 Model Performance (from training)
            - **BERT + LogReg** trained on 2,410 synthetic enterprise logs
            - **Confidence threshold**: 0.5 (below → escalate to LLM)
            - **Source-aware routing**: `LegacyCRM` bypasses ML entirely (only 7 training samples)

            ## 🔑 Environment Variables
            | Secret | Required For |
            |--------|-------------|
            | `HF_TOKEN` | LLM inference (LegacyCRM logs) |
            """)

if __name__ == "__main__":
    demo.launch()
|
classify.py
ADDED
|
@@ -0,0 +1,85 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
import pandas as pd
|
| 3 |
+
from processor_regex import classify_with_regex
|
| 4 |
+
from processor_bert import classify_with_bert
|
| 5 |
+
from processor_llm import classify_with_llm
|
| 6 |
+
|
| 7 |
+
# Source whose logs always go straight to the LLM tier.
LEGACY_SOURCE = "LegacyCRM"


def classify_log(source: str, log_msg: str) -> dict:
    """
    Route a single log through the 3-tier hybrid pipeline.

    LegacyCRM logs go directly to Tier 3 (LLM) — too few training samples
    for ML. All other sources try Tier 1 (Regex) first, then Tier 2
    (BERT + LogReg), and finally fall back to the LLM when BERT's
    confidence is below threshold (it then returns "Unclassified").

    Returns a dict with keys: label, tier, confidence.
    """
    # Source-aware routing: LegacyCRM bypasses both regex and ML tiers.
    if source == LEGACY_SOURCE:
        return {"label": classify_with_llm(log_msg), "tier": "LLM", "confidence": None}

    # Tier 1 — fixed regex patterns (sub-millisecond, treated as certain).
    pattern_label = classify_with_regex(log_msg)
    if pattern_label:
        return {"label": pattern_label, "tier": "Regex", "confidence": 1.0}

    # Tier 2 — sentence embedding + logistic regression.
    ml_label, ml_conf = classify_with_bert(log_msg)
    if ml_label != "Unclassified":
        return {"label": ml_label, "tier": "BERT", "confidence": ml_conf}

    # Tier 3 — LLM picks up what BERT was not confident about.
    return {"label": classify_with_llm(log_msg), "tier": "LLM (fallback)", "confidence": None}
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
def classify(logs: list[tuple[str, str]]) -> list[dict]:
    """Run classify_log over a batch of (source, log_message) tuples."""
    return [classify_log(src, message) for src, message in logs]
|
| 44 |
+
|
| 45 |
+
|
| 46 |
+
def classify_csv(input_path: str, output_path: str = "output.csv") -> tuple[str, pd.DataFrame]:
    """
    Classify every row of a CSV that has 'source' and 'log_message' columns.

    Writes the augmented table (predicted_label, tier_used, confidence) to
    output_path. Raises ValueError when required columns are missing.
    Returns (output_path, result_dataframe).
    """
    df = pd.read_csv(input_path)

    # Validate schema up front so the caller gets a clear message.
    required = {"source", "log_message"}
    if not required.issubset(df.columns):
        raise ValueError(f"CSV must contain columns: {required}. Got: {set(df.columns)}")

    results = classify(list(zip(df["source"], df["log_message"])))

    labels, tiers, confidences = [], [], []
    for r in results:
        labels.append(r["label"])
        tiers.append(r["tier"])
        # LLM tiers have no probability — render as "N/A" instead.
        confidences.append("N/A" if r["confidence"] is None else f"{r['confidence']:.1%}")

    df["predicted_label"] = labels
    df["tier_used"] = tiers
    df["confidence"] = confidences

    df.to_csv(output_path, index=False)
    return output_path, df
|
| 68 |
+
|
| 69 |
+
|
| 70 |
+
if __name__ == "__main__":
    # Smoke-test the router with one representative log per routing path.
    sample_logs = [
        ("ModernCRM", "IP 192.168.133.114 blocked due to potential attack"),
        ("BillingSystem", "User User12345 logged in."),
        ("AnalyticsEngine", "File data_6957.csv uploaded successfully by user User265."),
        ("ModernHR", "GET /v2/servers/detail HTTP/1.1 status: 200 len: 1583 time: 0.19"),
        ("ModernHR", "Admin access escalation detected for user 9429"),
        ("LegacyCRM", "Case escalation for ticket ID 7324 failed because the assigned agent is no longer active."),
        ("LegacyCRM", "The 'ReportGenerator' module will be retired in v4.0. Migrate to 'AdvancedAnalyticsSuite'."),
    ]

    print(f"{'Source':<20} {'Tier':<15} {'Conf':>6} {'Label':<25} Log")
    print("─" * 110)
    for (source, log), result in zip(sample_logs, classify(sample_logs)):
        # Use `is not None`, not truthiness: a genuine 0% confidence should
        # still print as a percentage (matches classify_single / classify_csv,
        # which both test `confidence is not None`).
        conf = f"{result['confidence']:.0%}" if result['confidence'] is not None else " N/A"
        print(f"{source:<20} {result['tier']:<15} {conf:>6} {result['label']:<25} {log[:45]}")
|
processor_bert.py
ADDED
|
@@ -0,0 +1,64 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import joblib
|
| 3 |
+
import numpy as np
|
| 4 |
+
from sentence_transformers import SentenceTransformer
|
| 5 |
+
|
| 6 |
+
# ── Lazy-load models on first use (faster Spaces startup) ──────────────────
# Module-level singletons, populated by _load_models() on first call.
_embedding_model = None
_classifier = None

# Trained LogisticRegression artifact, shipped alongside this module.
MODEL_PATH = os.path.join(os.path.dirname(__file__), "models", "log_classifier.joblib")
CONFIDENCE_THRESHOLD = 0.5


def _load_models():
    """Populate the embedding-model and classifier singletons if not yet loaded."""
    global _embedding_model, _classifier

    if _embedding_model is None:
        _embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

    if _classifier is None:
        # Fail loudly with instructions rather than a bare joblib error.
        if not os.path.exists(MODEL_PATH):
            raise FileNotFoundError(
                f"Model not found at {MODEL_PATH}. "
                "Run the Colab training notebook first and upload log_classifier.joblib."
            )
        _classifier = joblib.load(MODEL_PATH)
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
def classify_with_bert(log_message: str) -> tuple[str, float]:
    """
    Tier 2: BERT embedding + Logistic Regression classifier.

    Returns (label, confidence). Returns ('Unclassified', max_prob) if
    no class exceeds CONFIDENCE_THRESHOLD.
    Latency: ~20-80ms on CPU.
    """
    _load_models()

    embedding = _embedding_model.encode([log_message])
    probabilities = _classifier.predict_proba(embedding)[0]
    best_idx = int(np.argmax(probabilities))
    max_prob = float(probabilities[best_idx])

    if max_prob < CONFIDENCE_THRESHOLD:
        return "Unclassified", max_prob

    # Derive the label from the probabilities we already computed instead of
    # a second predict() call — LogisticRegression.predict is argmax of
    # predict_proba, so this halves inference work with identical output.
    return _classifier.classes_[best_idx], max_prob
|
| 45 |
+
|
| 46 |
+
|
| 47 |
+
def get_classes() -> list[str]:
    """Return every label the trained BERT+LogReg classifier can emit."""
    _load_models()
    return [*_classifier.classes_]
|
| 51 |
+
|
| 52 |
+
|
| 53 |
+
if __name__ == "__main__":
    # Manual smoke test — requires models/log_classifier.joblib on disk.
    test_logs = [
        "GET /v2/servers/detail HTTP/1.1 status: 404 len: 1583 time: 0.19",
        "System crashed due to driver errors when restarting the server",
        "Multiple login failures occurred on user 6454 account",
        "Admin access escalation detected for user 9429",
        "CPU usage at 98% for the last 10 minutes on node-7",
        "Hey bro chill ya!",  # should be Unclassified
    ]
    for message in test_logs:
        predicted, score = classify_with_bert(message)
        print(f"[{score:.0%}] {predicted:25s} | {message[:70]}")
|
processor_llm.py
ADDED
|
@@ -0,0 +1,89 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import re
|
| 3 |
+
from huggingface_hub import InferenceClient
|
| 4 |
+
|
| 5 |
+
# ── Config ─────────────────────────────────────────────────────────────────
|
| 6 |
+
HF_TOKEN = os.getenv("HF_TOKEN") # Set as HuggingFace Space secret
|
| 7 |
+
LLM_MODEL = "mistralai/Mistral-7B-Instruct-v0.3"
|
| 8 |
+
|
| 9 |
+
VALID_CATEGORIES = ["Workflow Error", "Deprecation Warning"]
|
| 10 |
+
|
| 11 |
+
SYSTEM_PROMPT = (
|
| 12 |
+
"You are an enterprise log classifier. "
|
| 13 |
+
"Classify log messages into exactly one category. "
|
| 14 |
+
"Return ONLY the category name — no explanation, no punctuation."
|
| 15 |
+
)
|
| 16 |
+
|
| 17 |
+
FEW_SHOT_EXAMPLES = [
|
| 18 |
+
{
|
| 19 |
+
"log": "Case escalation for ticket ID 7324 failed because the assigned support agent is no longer active.",
|
| 20 |
+
"label": "Workflow Error",
|
| 21 |
+
},
|
| 22 |
+
{
|
| 23 |
+
"log": "The 'BulkEmailSender' feature is no longer supported. Use 'EmailCampaignManager' instead.",
|
| 24 |
+
"label": "Deprecation Warning",
|
| 25 |
+
},
|
| 26 |
+
{
|
| 27 |
+
"log": "Invoice generation aborted for order ID 8910 due to invalid tax calculation module.",
|
| 28 |
+
"label": "Workflow Error",
|
| 29 |
+
},
|
| 30 |
+
]
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
def _build_messages(log_msg: str) -> list[dict]:
|
| 34 |
+
categories_str = ", ".join(f'"{c}"' for c in VALID_CATEGORIES)
|
| 35 |
+
|
| 36 |
+
user_content = (
|
| 37 |
+
f"Classify the following log into one of these categories: {categories_str}.\n"
|
| 38 |
+
"If none fits, return \"Unclassified\".\n\n"
|
| 39 |
+
)
|
| 40 |
+
|
| 41 |
+
# Add few-shot examples
|
| 42 |
+
for ex in FEW_SHOT_EXAMPLES:
|
| 43 |
+
user_content += f'Log: {ex["log"]}\nCategory: {ex["label"]}\n\n'
|
| 44 |
+
|
| 45 |
+
user_content += f"Log: {log_msg}\nCategory:"
|
| 46 |
+
|
| 47 |
+
return [
|
| 48 |
+
{"role": "system", "content": SYSTEM_PROMPT},
|
| 49 |
+
{"role": "user", "content": user_content},
|
| 50 |
+
]
|
| 51 |
+
|
| 52 |
+
|
| 53 |
+
def classify_with_llm(log_msg: str) -> str:
    """
    Tier 3: LLM-based classifier using HuggingFace Inference API.
    Used for LegacyCRM logs where training data is insufficient for ML.
    Latency: 500–2000ms depending on model load.
    """
    try:
        client = InferenceClient(token=HF_TOKEN)
        response = client.chat.completions.create(
            model=LLM_MODEL,
            messages=_build_messages(log_msg),
            max_tokens=15,
            temperature=0.1,
        )
        raw = response.choices[0].message.content.strip()
    except Exception as e:
        # Best-effort tier: any API failure is reported, never raised.
        print(f"[LLM] Inference error: {e}")
        return "Unclassified"

    # Map the free-text reply back onto the first known category it mentions.
    lowered = raw.lower()
    matched = next((cat for cat in VALID_CATEGORIES if cat.lower() in lowered), None)
    return matched if matched is not None else "Unclassified"
|
| 78 |
+
|
| 79 |
+
|
| 80 |
+
if __name__ == "__main__":
    # Requires HF_TOKEN in environment
    test_logs = [
        "Case escalation for ticket ID 7324 failed because the assigned support agent is no longer active.",
        "The 'ReportGenerator' module will be retired in version 4.0. Migrate to 'AdvancedAnalyticsSuite'.",
        "System reboot initiated by user 12345.",  # should be Unclassified
    ]
    for sample in test_logs:
        category = classify_with_llm(sample)
        print(f"{category:25s} | {sample[:80]}")
|
processor_regex.py
ADDED
|
@@ -0,0 +1,47 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import re
|
| 2 |
+
|
| 3 |
+
REGEX_PATTERNS = {
|
| 4 |
+
r"User\s+\w+\d+\s+logged\s+(in|out)": "User Action",
|
| 5 |
+
r"Account\s+(?:with\s+)?ID\s+\S+\s+created\s+by": "User Action",
|
| 6 |
+
r"Backup\s+(started|ended|completed\s+successfully)": "System Notification",
|
| 7 |
+
r"System\s+updated\s+to\s+version": "System Notification",
|
| 8 |
+
r"File\s+\S+\s+uploaded\s+successfully\s+by\s+user": "System Notification",
|
| 9 |
+
r"Disk\s+cleanup\s+completed\s+successfully": "System Notification",
|
| 10 |
+
r"System\s+reboot\s+initiated\s+by\s+user": "System Notification",
|
| 11 |
+
r"Scheduled\s+maintenance\s+(started|completed)": "System Notification",
|
| 12 |
+
r"Service\s+\w+\s+restarted\s+successfully": "System Notification",
|
| 13 |
+
}
|
| 14 |
+
|
| 15 |
+
def classify_with_regex(log_message: str) -> str | None:
|
| 16 |
+
"""
|
| 17 |
+
Tier 1: Rule-based classifier using regex patterns.
|
| 18 |
+
Returns category label or None if no pattern matches.
|
| 19 |
+
Latency: sub-millisecond.
|
| 20 |
+
"""
|
| 21 |
+
for pattern, label in REGEX_PATTERNS.items():
|
| 22 |
+
if re.search(pattern, log_message, re.IGNORECASE):
|
| 23 |
+
return label
|
| 24 |
+
return None
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
def get_regex_coverage(log_messages: list[str]) -> dict:
    """
    Measure regex tier coverage on a list of log messages.

    Returns a dict with 'total', 'matched' and 'coverage_pct' keys.
    An empty input yields coverage_pct 0.0 instead of raising
    ZeroDivisionError (the original divided by len() unconditionally).
    """
    total = len(log_messages)
    matched = sum(1 for msg in log_messages if classify_with_regex(msg) is not None)
    return {
        "total": total,
        "matched": matched,
        "coverage_pct": round(matched / total * 100, 2) if total else 0.0,
    }
|
| 35 |
+
|
| 36 |
+
|
| 37 |
+
if __name__ == "__main__":
    # Quick smoke test: three expected hits, two expected misses.
    test_logs = [
        "User User123 logged in.",
        "Backup completed successfully.",
        "Account with ID 456 created by Admin.",
        "GET /api/v2/resource HTTP/1.1 status: 200",  # should be None
        "Hey bro chill ya!",  # should be None
    ]
    for entry in test_logs:
        label = classify_with_regex(entry)
        print(f"[{'✓' if label else '✗'}] {label or 'None':25s} | {entry[:60]}")
|
requirements.txt
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
gradio==4.44.0
|
| 2 |
+
sentence-transformers==3.0.1
|
| 3 |
+
scikit-learn==1.5.1
|
| 4 |
+
huggingface-hub==0.24.6
|
| 5 |
+
joblib==1.4.2
|
| 6 |
+
pandas==2.2.2
|
| 7 |
+
numpy==1.26.4
|
synthetic_logs.csv
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
test.csv
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
source,log_message
|
| 2 |
+
ModernCRM, "IP 192.168.133.114 blocked due to potential attack"
|
| 3 |
+
BillingSystem, "User 12345 logged in."
|
| 4 |
+
AnalyticsEngine, "File data_6957.csv uploaded successfully by user User265."
|
| 5 |
+
AnalyticsEngine, "Backup completed successfully."
|
| 6 |
+
ModernHR, "GET /v2/54fadb412c4e40cdbaed9335e4c35a9e/servers/detail HTTP/1.1 RCODE 200 len: 1583 time: 0.1878400"
|
| 7 |
+
ModernHR, "Admin access escalation detected for user 9429"
|
| 8 |
+
LegacyCRM, "Case escalation for ticket ID 7324 failed because the assigned support agent is no longer active."
|
| 9 |
+
LegacyCRM, "Invoice generation process aborted for order ID 8910 due to invalid tax calculation module."
|
| 10 |
+
LegacyCRM, "The 'BulkEmailSender' feature is no longer supported. Use 'EmailCampaignManager' for improved functionality."
|
| 11 |
+
LegacyCRM, "The 'ReportGenerator' module will be retired in version 4.0. Please migrate to the 'AdvancedAnalyticsSuite' by Dec 2025"
|