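# NOTE: the next three lines are residue from the hosting page (avatar alt
# text, commit message, commit hash); they are not part of the program.
# NOT-OMEGA's picture
# Upload 9 files
# abc86a6 verified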
"""
Log Classification System — HuggingFace Spaces
Gradio UI for the 3-tier hybrid log classification pipeline.
"""
from __future__ import annotations

import io
import os
import shutil
import tempfile
import time

import gradio as gr
import pandas as pd

from classify import classify_log, classify_csv
# ── Source options ──────────────────────────────────────────────────────────
# Source systems offered in the "Source System" dropdown of the single-log tab.
SOURCES = [
    "ModernCRM",
    "ModernHR",
    "BillingSystem",
    "AnalyticsEngine",
    "ThirdPartyAPI",
    "LegacyCRM",
]

# Icon displayed next to each tier name. Every value is a single emoji code
# point; tiers missing from this mapping fall back to a white circle at the
# lookup sites.
TIER_COLORS = {
    "Regex": "🟢",
    "BERT": "🔵",
    "LLM": "🟡",
    "LLM (fallback)": "🟠",
}

# [source, log message] pairs used as clickable examples in the single-log tab.
EXAMPLE_LOGS = [
    ["ModernCRM", "User User12345 logged in."],
    ["ModernHR", "Multiple login failures occurred on user 6454 account"],
    ["BillingSystem", "GET /v2/servers/detail HTTP/1.1 status: 200 len: 1583 time: 0.19"],
    ["AnalyticsEngine", "System crashed due to disk I/O failure on node-3"],
    ["LegacyCRM", "Case escalation for ticket ID 7324 failed — support agent is no longer active."],
    ["LegacyCRM", "The 'BulkEmailSender' feature will be deprecated in v5.0. Use 'EmailCampaignManager'."],
]
# ── Single log tab ──────────────────────────────────────────────────────────
def classify_single(source: str, log_message: str):
    """Classify one log message and format the result for the four output boxes.

    Args:
        source: Name of the system that produced the log; passed through to
            ``classify_log`` unchanged.
        log_message: Raw log text. ``None``, empty, or whitespace-only input is
            treated as "nothing to classify".

    Returns:
        A 4-tuple ``(label, tier, confidence, latency)``. ``tier`` is prefixed
        with its icon, ``confidence`` is rendered as a percentage (or ``"N/A"``
        when the classifier reports ``None``), and ``latency`` is the
        wall-clock time of the ``classify_log`` call in milliseconds. For blank
        input every field is an em-dash placeholder.
    """
    # Guard against None as well as blank text so a cleared textbox or a direct
    # API call cannot crash on ``.strip()``.
    if not log_message or not log_message.strip():
        return "—", "—", "—", "—"

    started = time.perf_counter()
    result = classify_log(source, log_message)
    latency_ms = (time.perf_counter() - started) * 1000

    tier = result["tier"]
    confidence = result["confidence"]
    # ``.1%`` multiplies by 100, so the classifier presumably reports a
    # fraction in [0, 1] — TODO confirm against classify_log.
    confidence_text = f"{confidence:.1%}" if confidence is not None else "N/A"
    icon = TIER_COLORS.get(tier, "⚪")

    return (
        result["label"],
        f"{icon} {tier}",
        confidence_text,
        f"{latency_ms:.1f} ms",
    )
# ── Batch CSV tab ───────────────────────────────────────────────────────────
def classify_batch(file):
    """Classify every row of an uploaded CSV and summarise the outcome.

    Args:
        file: Value delivered by the upload component: ``None`` when nothing
            was uploaded, otherwise either a filesystem path or an object that
            exposes the path as ``.name``.

    Returns:
        A ``(output_path, stats)`` pair. ``output_path`` is the path reported
        by ``classify_csv`` for the written CSV, or ``None`` when nothing was
        produced. ``stats`` is a human-readable summary on success, or a
        warning/error message otherwise. Exceptions from ``classify_csv`` are
        reported in ``stats`` rather than raised.
    """
    if file is None:
        return None, "⚠️ Please upload a CSV file."

    # Accept both a plain path string and a tempfile-like wrapper with ``.name``.
    input_path = getattr(file, "name", file)

    # Write into a fresh directory per request: a single fixed path under /tmp
    # would let concurrent requests overwrite (or download) each other's
    # results. The file name itself is kept so the download name is unchanged.
    out_dir = tempfile.mkdtemp(prefix="log_classify_")
    target_path = os.path.join(out_dir, "classified_output.csv")

    try:
        output_path, df = classify_csv(input_path, target_path)
    except ValueError as e:
        shutil.rmtree(out_dir, ignore_errors=True)
        return None, f"⚠️ {e}"
    except Exception as e:  # UI boundary: show the failure instead of crashing
        shutil.rmtree(out_dir, ignore_errors=True)
        return None, f"❌ Error: {e}"

    total = len(df)
    tier_counts = df["tier_used"].value_counts().to_dict()
    label_counts = df["predicted_label"].value_counts().to_dict()

    # With zero rows both count dicts are empty, so the division never runs.
    tier_lines = "\n".join(
        f" {TIER_COLORS.get(tier, '⚪')} {tier}: {count} ({count / total:.0%})"
        for tier, count in tier_counts.items()
    )
    label_lines = "\n".join(
        f" • {label}: {count}" for label, count in label_counts.items()
    )

    stats = (
        f"✅ Classified {total} logs\n\n"
        f"📊 Tier breakdown:\n{tier_lines}\n\n"
        f"🏷️ Label distribution:\n{label_lines}"
    )
    return output_path, stats
# ── UI ──────────────────────────────────────────────────────────────────────
# Markdown shown in the UI. Kept as module-level constants so the text sits at
# column 0 and the layout code below stays readable.
_HEADER_MD = """
# 🔍 Log Classification System
**3-tier hybrid pipeline** → 🟢 Regex · 🔵 BERT + LogReg · 🟡 LLM
Built to mimic production enterprise log monitoring architecture.
"""

_BATCH_HELP_MD = """
Upload a CSV with columns: **`source`**, **`log_message`**
Download the classified CSV with added columns: `predicted_label`, `tier_used`, `confidence`.
"""

_SAMPLE_CSV_MD = """
**Sample CSV format:**
```
source,log_message
ModernCRM,User User123 logged in.
LegacyCRM,Case escalation for ticket ID 7324 failed.
BillingSystem,GET /api/v2/invoice HTTP/1.1 status: 500
```
"""

_ARCHITECTURE_MD = """
## 🏗️ 3-Tier Hybrid Pipeline
| Tier | Method | Coverage | Latency | When Used |
|------|--------|----------|---------|-----------|
| 🟢 Regex | Python `re` patterns | ~21% | < 1ms | Fixed patterns (login, backup, etc.) |
| 🔵 BERT | `all-MiniLM-L6-v2` + LogReg | ~79% | 20–80ms | High-volume categories with 150+ samples |
| 🟡 LLM | HuggingFace Inference API | ~0.3% | 500–2000ms | LegacyCRM logs, rare patterns |
## 📊 Model Performance (from training)
- **BERT + LogReg** trained on 2,410 synthetic enterprise logs
- **Confidence threshold**: 0.5 (below → escalate to LLM)
- **Source-aware routing**: `LegacyCRM` bypasses ML entirely (only 7 training samples)
## 🔑 Environment Variables
| Secret | Required For |
|--------|-------------|
| `HF_TOKEN` | LLM inference (LegacyCRM logs) |
"""

# NOTE(review): the nesting of rows/columns below was reconstructed from a
# flattened copy of this file — confirm the layout against the running app.
with gr.Blocks(title="Log Classification System", theme=gr.themes.Soft()) as demo:
    gr.Markdown(_HEADER_MD)

    with gr.Tabs():
        # ── Tab 1: Single Log ────────────────────────────────────────────
        with gr.Tab("Single Log"):
            with gr.Row():
                source_input = gr.Dropdown(
                    choices=SOURCES,
                    value="ModernCRM",
                    label="Source System",
                )
                log_input = gr.Textbox(
                    label="Log Message",
                    placeholder="Paste a log message here...",
                    lines=3,
                )

            classify_btn = gr.Button("Classify", variant="primary")

            # Read-only boxes, in the same order as classify_single's return tuple.
            with gr.Row():
                label_out = gr.Textbox(label="🏷️ Predicted Label", interactive=False)
                tier_out = gr.Textbox(label="⚙️ Tier Used", interactive=False)
                confidence_out = gr.Textbox(label="📈 Confidence", interactive=False)
                latency_out = gr.Textbox(label="⏱️ Latency", interactive=False)

            classify_btn.click(
                fn=classify_single,
                inputs=[source_input, log_input],
                outputs=[label_out, tier_out, confidence_out, latency_out],
            )

            gr.Examples(
                examples=EXAMPLE_LOGS,
                inputs=[source_input, log_input],
                label="📋 Example Logs (click to try)",
            )

        # ── Tab 2: Batch CSV ─────────────────────────────────────────────
        with gr.Tab("Batch CSV Upload"):
            gr.Markdown(_BATCH_HELP_MD)

            with gr.Row():
                with gr.Column():
                    csv_input = gr.File(label="📂 Upload CSV", file_types=[".csv"])
                    batch_btn = gr.Button("Classify All", variant="primary")
                with gr.Column():
                    csv_output = gr.File(label="📥 Download Classified CSV")
                    stats_out = gr.Textbox(label="📊 Stats", lines=12, interactive=False)

            batch_btn.click(
                fn=classify_batch,
                inputs=[csv_input],
                outputs=[csv_output, stats_out],
            )

            gr.Markdown(_SAMPLE_CSV_MD)

        # ── Tab 3: Architecture ──────────────────────────────────────────
        with gr.Tab("Architecture"):
            gr.Markdown(_ARCHITECTURE_MD)


if __name__ == "__main__":
    demo.launch()