File size: 7,456 Bytes
abc86a6 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 | """
Log Classification System — HuggingFace Spaces
Gradio UI for the 3-tier hybrid log classification pipeline.
"""
from __future__ import annotations
import io
import time
import pandas as pd
import gradio as gr
from classify import classify_log, classify_csv
# ── Source options ───────────────────────────────────────────────────────────
# Source systems offered in the UI dropdown. Per the Architecture tab,
# "LegacyCRM" is routed straight to the LLM tier downstream.
SOURCES = [
    "ModernCRM",
    "ModernHR",
    "BillingSystem",
    "AnalyticsEngine",
    "ThirdPartyAPI",
    "LegacyCRM",
]
# Emoji badge shown next to the tier name in the results panel. The circle
# colors match the tier markers used in the header markdown
# (🟢 Regex · 🔵 BERT · 🟡 LLM); orange marks low-confidence ML predictions
# that were escalated to the LLM.
TIER_COLORS = {
    "Regex": "🟢",
    "BERT": "🔵",
    "LLM": "🟡",
    "LLM (fallback)": "🟠",
}
# Clickable example rows for the gr.Examples widget: [source, log_message].
# The last two target LegacyCRM, exercising the LLM routing path described
# in the Architecture tab.
EXAMPLE_LOGS = [
    ["ModernCRM", "User User12345 logged in."],
    ["ModernHR", "Multiple login failures occurred on user 6454 account"],
    ["BillingSystem", "GET /v2/servers/detail HTTP/1.1 status: 200 len: 1583 time: 0.19"],
    ["AnalyticsEngine", "System crashed due to disk I/O failure on node-3"],
    ["LegacyCRM", "Case escalation for ticket ID 7324 failed — support agent is no longer active."],
    ["LegacyCRM", "The 'BulkEmailSender' feature will be deprecated in v5.0. Use 'EmailCampaignManager'."],
]
# ── Single log tab ───────────────────────────────────────────────────────────
def classify_single(source: str, log_message: str) -> tuple[str, str, str, str]:
    """Classify one log line and format the result for the four output boxes.

    Args:
        source: Source system name (one of SOURCES).
        log_message: Raw log text to classify.

    Returns:
        A 4-tuple of display strings:
        (predicted label, "<icon> <tier>", confidence percentage or "N/A",
        wall-clock latency such as "12.3 ms"). Blank/whitespace-only input
        short-circuits to "—" placeholders without invoking the pipeline.
    """
    if not log_message.strip():
        return "—", "—", "—", "—"
    t0 = time.perf_counter()
    result = classify_log(source, log_message)
    latency_ms = (time.perf_counter() - t0) * 1000
    label = result["label"]
    tier = result["tier"]
    # Confidence may be None when the tier reports no probability score.
    confidence = f"{result['confidence']:.1%}" if result["confidence"] is not None else "N/A"
    icon = TIER_COLORS.get(tier, "⚪")  # white circle for unknown tiers
    return (
        label,
        f"{icon} {tier}",
        confidence,
        f"{latency_ms:.1f} ms",
    )
# ── Batch CSV tab ────────────────────────────────────────────────────────────
def classify_batch(file):
    """Classify every row of an uploaded CSV and summarize the results.

    Args:
        file: Gradio file wrapper exposing a ``.name`` path, or None when
            nothing was uploaded.

    Returns:
        A 2-tuple ``(output_path, stats)``: the path of the classified CSV
        (or None on failure) and a human-readable summary/error string.
    """
    if file is None:
        return None, "⚠️ Please upload a CSV file."
    try:
        output_path, df = classify_csv(file.name, "/tmp/classified_output.csv")
    except ValueError as e:
        # Validation errors (e.g. missing columns) are shown as warnings.
        return None, f"⚠️ {e}"
    except Exception as e:
        # Surface anything else verbatim rather than crashing the UI.
        return None, f"❌ Error: {e}"
    total = len(df)
    tier_counts = df["tier_used"].value_counts().to_dict()
    label_counts = df["predicted_label"].value_counts().to_dict()
    tier_lines = "\n".join(
        f" {TIER_COLORS.get(k, '⚪')} {k}: {v} ({v / total:.0%})"
        for k, v in tier_counts.items()
    )
    label_lines = "\n".join(f" • {k}: {v}" for k, v in label_counts.items())
    stats = (
        f"✅ Classified {total} logs\n\n"
        f"📊 Tier breakdown:\n{tier_lines}\n\n"
        f"🏷️ Label distribution:\n{label_lines}"
    )
    return output_path, stats
# ── UI ───────────────────────────────────────────────────────────────────────
# Top-level Gradio layout: three tabs (single log, batch CSV, architecture
# notes). Wiring: the buttons call classify_single / classify_batch above.
with gr.Blocks(title="Log Classification System", theme=gr.themes.Soft()) as demo:
    gr.Markdown("""
    # 🚀 Log Classification System
    **3-tier hybrid pipeline** — 🟢 Regex · 🔵 BERT + LogReg · 🟡 LLM
    Built to mimic production enterprise log monitoring architecture.
    """)
    with gr.Tabs():
        # ── Tab 1: Single Log ────────────────────────────────────────────
        with gr.Tab("Single Log"):
            with gr.Row():
                source_input = gr.Dropdown(
                    choices=SOURCES,
                    value="ModernCRM",
                    label="Source System",
                )
                log_input = gr.Textbox(
                    label="Log Message",
                    placeholder="Paste a log message here...",
                    lines=3,
                )
            classify_btn = gr.Button("Classify", variant="primary")
            with gr.Row():
                label_out = gr.Textbox(label="🏷️ Predicted Label", interactive=False)
                tier_out = gr.Textbox(label="⚙️ Tier Used", interactive=False)
                confidence_out = gr.Textbox(label="📊 Confidence", interactive=False)
                latency_out = gr.Textbox(label="⏱️ Latency", interactive=False)
            classify_btn.click(
                fn=classify_single,
                inputs=[source_input, log_input],
                outputs=[label_out, tier_out, confidence_out, latency_out],
            )
            gr.Examples(
                examples=EXAMPLE_LOGS,
                inputs=[source_input, log_input],
                label="📋 Example Logs (click to try)",
            )
        # ── Tab 2: Batch CSV ─────────────────────────────────────────────
        with gr.Tab("Batch CSV Upload"):
            gr.Markdown("""
            Upload a CSV with columns: **`source`**, **`log_message`**
            Download the classified CSV with added columns: `predicted_label`, `tier_used`, `confidence`.
            """)
            with gr.Row():
                with gr.Column():
                    csv_input = gr.File(label="📁 Upload CSV", file_types=[".csv"])
                    batch_btn = gr.Button("Classify All", variant="primary")
                with gr.Column():
                    csv_output = gr.File(label="📥 Download Classified CSV")
                    stats_out = gr.Textbox(label="📊 Stats", lines=12, interactive=False)
            batch_btn.click(
                fn=classify_batch,
                inputs=[csv_input],
                outputs=[csv_output, stats_out],
            )
            gr.Markdown("""
            **Sample CSV format:**
            ```
            source,log_message
            ModernCRM,User User123 logged in.
            LegacyCRM,Case escalation for ticket ID 7324 failed.
            BillingSystem,GET /api/v2/invoice HTTP/1.1 status: 500
            ```
            """)
        # ── Tab 3: Architecture ──────────────────────────────────────────
        with gr.Tab("Architecture"):
            gr.Markdown("""
            ## 🏗️ 3-Tier Hybrid Pipeline
            | Tier | Method | Coverage | Latency | When Used |
            |------|--------|----------|---------|-----------|
            | 🟢 Regex | Python `re` patterns | ~21% | < 1ms | Fixed patterns (login, backup, etc.) |
            | 🔵 BERT | `all-MiniLM-L6-v2` + LogReg | ~79% | 20–80ms | High-volume categories with 150+ samples |
            | 🟡 LLM | HuggingFace Inference API | ~0.3% | 500–2000ms | LegacyCRM logs, rare patterns |
            ## 📈 Model Performance (from training)
            - **BERT + LogReg** trained on 2,410 synthetic enterprise logs
            - **Confidence threshold**: 0.5 (below → escalate to LLM)
            - **Source-aware routing**: `LegacyCRM` bypasses ML entirely (only 7 training samples)
            ## 🔐 Environment Variables
            | Secret | Required For |
            |--------|-------------|
            | `HF_TOKEN` | LLM inference (LegacyCRM logs) |
            """)

if __name__ == "__main__":
    demo.launch()
|