File size: 7,456 Bytes
abc86a6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
"""
Log Classification System β€” HuggingFace Spaces
Gradio UI for the 3-tier hybrid log classification pipeline.
"""
from __future__ import annotations
import io
import time
import pandas as pd
import gradio as gr
from classify import classify_log, classify_csv

# ── Source options ──────────────────────────────────────────────────────────
# Source systems offered in the "Source System" dropdown of the single-log tab.
SOURCES = [
    "ModernCRM",
    "ModernHR",
    "BillingSystem",
    "AnalyticsEngine",
    "ThirdPartyAPI",
    "LegacyCRM",
]

# Emoji badge displayed next to a tier name. Looked up with `.get()` by the
# classification callbacks, which fall back to a white circle for unknown tiers.
TIER_COLORS = {
    "Regex":        "🟢",
    "BERT":         "🔵",
    "LLM":          "🟡",
    "LLM (fallback)": "🟠",
}

# [source, log_message] pairs shown as clickable examples in the single-log tab.
EXAMPLE_LOGS = [
    ["ModernCRM",       "User User12345 logged in."],
    ["ModernHR",        "Multiple login failures occurred on user 6454 account"],
    ["BillingSystem",   "GET /v2/servers/detail HTTP/1.1 status: 200 len: 1583 time: 0.19"],
    ["AnalyticsEngine", "System crashed due to disk I/O failure on node-3"],
    ["LegacyCRM",       "Case escalation for ticket ID 7324 failed — support agent is no longer active."],
    ["LegacyCRM",       "The 'BulkEmailSender' feature will be deprecated in v5.0. Use 'EmailCampaignManager'."],
]


# ── Single log tab ──────────────────────────────────────────────────────────
def classify_single(source: str, log_message: str) -> tuple[str, str, str, str]:
    """Classify one log message and format the result for display.

    Args:
        source: Name of the originating system; forwarded to ``classify_log``.
        log_message: Raw log text. ``None``, empty, or whitespace-only input
            short-circuits without calling the classifier.

    Returns:
        A 4-tuple of display strings ``(label, tier, confidence, latency)``.
        For blank input every element is an em dash placeholder. The
        confidence is rendered as a percentage, or ``"N/A"`` when the
        classifier reports ``None``.
    """
    # Guard against None as well as blank text: calling .strip() on None
    # would raise AttributeError instead of showing the placeholder row.
    if not log_message or not log_message.strip():
        return "—", "—", "—", "—"

    # Time only the classifier call so the reported latency excludes formatting.
    t0 = time.perf_counter()
    result = classify_log(source, log_message)
    latency_ms = (time.perf_counter() - t0) * 1000

    label = result["label"]
    tier = result["tier"]
    confidence = result["confidence"]
    confidence_text = f"{confidence:.1%}" if confidence is not None else "N/A"
    icon = TIER_COLORS.get(tier, "⚪")  # white circle for tiers without a badge

    return (
        label,
        f"{icon} {tier}",
        confidence_text,
        f"{latency_ms:.1f} ms",
    )


# ── Batch CSV tab ───────────────────────────────────────────────────────────
def classify_batch(file):
    """Classify every row of an uploaded CSV and summarise the outcome.

    Args:
        file: The uploaded file, given either as a filesystem path string or
            as an object exposing the path via ``.name``; ``None`` when
            nothing was uploaded.

    Returns:
        ``(output_path, stats)``. ``output_path`` is the path of the
        classified CSV returned by ``classify_csv`` (``None`` on failure or
        missing upload). ``stats`` is a human-readable summary with the tier
        breakdown and label distribution, or a warning/error message.
    """
    if file is None:
        return None, "⚠️ Please upload a CSV file."

    import os
    import shutil
    import tempfile

    # Accept both a plain path string and a file wrapper carrying `.name`.
    input_path = getattr(file, "name", file)

    # Write into a fresh directory per call. A fixed /tmp path would be shared
    # by every request, so two uploads processed at the same time could
    # overwrite each other's output. The file name itself is kept unchanged.
    workdir = tempfile.mkdtemp(prefix="log_classify_")
    target = os.path.join(workdir, "classified_output.csv")

    try:
        output_path, df = classify_csv(input_path, target)
    except Exception as exc:
        # UI boundary: report the failure to the user instead of crashing.
        shutil.rmtree(workdir, ignore_errors=True)
        prefix = "⚠️ " if isinstance(exc, ValueError) else "❌ Error: "
        return None, f"{prefix}{exc}"

    total = len(df)
    tier_counts = df["tier_used"].value_counts().to_dict()
    label_counts = df["predicted_label"].value_counts().to_dict()

    # With an empty frame both dicts are empty, so v/total is never evaluated.
    tier_lines = "\n".join(
        f"  {TIER_COLORS.get(k, '⚪')} {k}: {v} ({v/total:.0%})"
        for k, v in tier_counts.items()
    )
    label_lines = "\n".join(f"  • {k}: {v}" for k, v in label_counts.items())

    stats = (
        f"✅ Classified {total} logs\n\n"
        f"📊 Tier breakdown:\n{tier_lines}\n\n"
        f"🏷️ Label distribution:\n{label_lines}"
    )

    return output_path, stats


# ── UI ──────────────────────────────────────────────────────────────────────
# Assemble the three-tab interface: single-log classification, batch CSV
# upload, and a static architecture overview. The resulting `demo` object is
# what the `__main__` guard at the bottom of the file launches.
with gr.Blocks(title="Log Classification System", theme=gr.themes.Soft()) as demo:

    gr.Markdown("""
# 🔍 Log Classification System
**3-tier hybrid pipeline** → 🟢 Regex · 🔵 BERT + LogReg · 🟡 LLM
Built to mimic production enterprise log monitoring architecture.
""")

    with gr.Tabs():

        # ── Tab 1: Single Log ────────────────────────────────────────────
        with gr.Tab("Single Log"):
            with gr.Row():
                source_input = gr.Dropdown(
                    choices=SOURCES,
                    value="ModernCRM",
                    label="Source System",
                )
                log_input = gr.Textbox(
                    label="Log Message",
                    placeholder="Paste a log message here...",
                    lines=3,
                )

            classify_btn = gr.Button("Classify", variant="primary")

            # Read-only boxes filled, in this order, by classify_single's 4-tuple.
            with gr.Row():
                label_out      = gr.Textbox(label="🏷️ Predicted Label",     interactive=False)
                tier_out       = gr.Textbox(label="⚙️ Tier Used",           interactive=False)
                confidence_out = gr.Textbox(label="📈 Confidence",          interactive=False)
                latency_out    = gr.Textbox(label="⏱️ Latency",             interactive=False)

            classify_btn.click(
                fn=classify_single,
                inputs=[source_input, log_input],
                outputs=[label_out, tier_out, confidence_out, latency_out],
            )

            # Examples only populate the two inputs; the user still clicks "Classify".
            gr.Examples(
                examples=EXAMPLE_LOGS,
                inputs=[source_input, log_input],
                label="📋 Example Logs (click to try)",
            )

        # ── Tab 2: Batch CSV ─────────────────────────────────────────────
        with gr.Tab("Batch CSV Upload"):
            gr.Markdown("""
Upload a CSV with columns: **`source`**, **`log_message`**  
Download the classified CSV with added columns: `predicted_label`, `tier_used`, `confidence`.
""")
            with gr.Row():
                with gr.Column():
                    csv_input  = gr.File(label="📂 Upload CSV", file_types=[".csv"])
                    batch_btn  = gr.Button("Classify All", variant="primary")
                with gr.Column():
                    csv_output = gr.File(label="📥 Download Classified CSV")
                    stats_out  = gr.Textbox(label="📊 Stats", lines=12, interactive=False)

            batch_btn.click(
                fn=classify_batch,
                inputs=[csv_input],
                outputs=[csv_output, stats_out],
            )

            gr.Markdown("""
**Sample CSV format:**
```
source,log_message
ModernCRM,User User123 logged in.
LegacyCRM,Case escalation for ticket ID 7324 failed.
BillingSystem,GET /api/v2/invoice HTTP/1.1 status: 500
```
""")

        # ── Tab 3: Architecture ──────────────────────────────────────────
        with gr.Tab("Architecture"):
            gr.Markdown("""
## 🏗️ 3-Tier Hybrid Pipeline

| Tier | Method | Coverage | Latency | When Used |
|------|--------|----------|---------|-----------|
| 🟢 Regex | Python `re` patterns | ~21% | < 1ms | Fixed patterns (login, backup, etc.) |
| 🔵 BERT | `all-MiniLM-L6-v2` + LogReg | ~79% | 20–80ms | High-volume categories with 150+ samples |
| 🟡 LLM | HuggingFace Inference API | ~0.3% | 500–2000ms | LegacyCRM logs, rare patterns |

## 📊 Model Performance (from training)
- **BERT + LogReg** trained on 2,410 synthetic enterprise logs
- **Confidence threshold**: 0.5 (below → escalate to LLM)
- **Source-aware routing**: `LegacyCRM` bypasses ML entirely (only 7 training samples)

## 🔑 Environment Variables
| Secret | Required For |
|--------|-------------|
| `HF_TOKEN` | LLM inference (LegacyCRM logs) |
""")

# Start the Gradio app only when this file is executed as a script, so that
# importing the module (e.g. to reuse `demo`) does not launch a server.
if __name__ == "__main__":
    demo.launch()