"""
Log Classification System — HuggingFace Spaces
Gradio UI for the 3-tier hybrid log classification pipeline.
"""
from __future__ import annotations

import io
import os
import shutil
import tempfile
import time

import gradio as gr
import pandas as pd

from classify import classify_log, classify_csv


# Source systems offered in the "Source System" dropdown of the single-log tab.
SOURCES = [
    "ModernCRM",
    "ModernHR",
    "BillingSystem",
    "AnalyticsEngine",
    "ThirdPartyAPI",
    "LegacyCRM",
]


# Icon displayed in front of each pipeline tier name in the UI output.
# Tiers missing from this mapping fall back to a neutral icon at the call site.
TIER_COLORS = {
    "Regex": "🟢",
    "BERT": "🔵",
    "LLM": "🟡",
    "LLM (fallback)": "🟠",
}


# [source, log_message] pairs passed to gr.Examples; clicking one fills the
# source dropdown and the log textbox of the single-log tab.
EXAMPLE_LOGS = [
    ["ModernCRM", "User User12345 logged in."],
    ["ModernHR", "Multiple login failures occurred on user 6454 account"],
    ["BillingSystem", "GET /v2/servers/detail HTTP/1.1 status: 200 len: 1583 time: 0.19"],
    ["AnalyticsEngine", "System crashed due to disk I/O failure on node-3"],
    ["LegacyCRM", "Case escalation for ticket ID 7324 failed — support agent is no longer active."],
    ["LegacyCRM", "The 'BulkEmailSender' feature will be deprecated in v5.0. Use 'EmailCampaignManager'."],
]


def classify_single(source: str, log_message: str) -> tuple[str, str, str, str]:
    """Classify one log message and format the result for the Gradio outputs.

    Args:
        source: Name of the system that emitted the log.
        log_message: Raw log text. ``None``, empty, or whitespace-only input
            is treated as "nothing to classify".

    Returns:
        A 4-tuple of display strings: the predicted label, the tier name
        prefixed with its icon, the confidence as a percentage (``"N/A"``
        when the result carries no confidence), and the wall-clock time of
        the ``classify_log`` call in milliseconds. For blank input every
        element is the placeholder ``"—"``.
    """
    # Check for None before calling .strip() so a missing value yields the
    # placeholder row instead of an AttributeError.
    if not log_message or not log_message.strip():
        return "—", "—", "—", "—"

    # Time only the classifier call, not the formatting below.
    started = time.perf_counter()
    result = classify_log(source, log_message)
    latency_ms = (time.perf_counter() - started) * 1000

    tier = result["tier"]
    confidence = result["confidence"]
    confidence_text = f"{confidence:.1%}" if confidence is not None else "N/A"
    # Tiers without a configured icon get a neutral white circle.
    icon = TIER_COLORS.get(tier, "⚪")

    return (
        result["label"],
        f"{icon} {tier}",
        confidence_text,
        f"{latency_ms:.1f} ms",
    )


def classify_batch(file):
    """Classify every row of an uploaded CSV and summarise the outcome.

    Args:
        file: The upload from the Gradio ``File`` component: an object that
            exposes its path as ``.name``, a plain path string, or ``None``
            when nothing was uploaded.

    Returns:
        ``(output_path, stats)`` on success, where ``output_path`` is the
        path returned by ``classify_csv`` and ``stats`` is a multi-line
        text summary (row count, per-tier and per-label counts).
        ``(None, message)`` when no file was given or classification raised.
    """
    if file is None:
        return None, "⚠️ Please upload a CSV file."

    # Accept both a file-like upload (path in .name) and a bare path string.
    input_path = getattr(file, "name", file)

    # Write into a fresh directory per request: a single fixed output path
    # would let concurrent users overwrite each other's results. The file
    # name itself is kept so the download is still "classified_output.csv".
    out_dir = tempfile.mkdtemp(prefix="log_classifier_")
    target_path = os.path.join(out_dir, "classified_output.csv")

    try:
        output_path, df = classify_csv(input_path, target_path)
    except Exception as exc:  # UI boundary: report the problem, never crash
        shutil.rmtree(out_dir, ignore_errors=True)
        # ValueError is shown as a warning; anything else is reported as an error.
        prefix = "⚠️" if isinstance(exc, ValueError) else "❌ Error:"
        return None, f"{prefix} {exc}"

    total = len(df)
    tier_counts = df["tier_used"].value_counts().to_dict()
    label_counts = df["predicted_label"].value_counts().to_dict()

    # An empty frame yields empty count dicts, so the division by `total`
    # below is never evaluated with total == 0.
    tier_lines = "\n".join(
        f" {TIER_COLORS.get(tier, '⚪')} {tier}: {count} ({count / total:.0%})"
        for tier, count in tier_counts.items()
    )
    label_lines = "\n".join(
        f" • {label}: {count}" for label, count in label_counts.items()
    )

    stats = (
        f"✅ Classified {total} logs\n\n"
        f"📊 Tier breakdown:\n{tier_lines}\n\n"
        f"🏷️ Label distribution:\n{label_lines}"
    )

    return output_path, stats


# Gradio UI: three tabs (single log, batch CSV upload, architecture notes).
# The Blocks object is bound to `demo` so it can be launched as a script.
with gr.Blocks(title="Log Classification System", theme=gr.themes.Soft()) as demo:

    gr.Markdown("""
    # 🔍 Log Classification System
    **3-tier hybrid pipeline** — 🟢 Regex · 🔵 BERT + LogReg · 🟡 LLM
    Built to mimic production enterprise log monitoring architecture.
    """)

    with gr.Tabs():

        # Tab 1: classify a single pasted log line.
        with gr.Tab("Single Log"):
            with gr.Row():
                source_input = gr.Dropdown(
                    choices=SOURCES,
                    value="ModernCRM",
                    label="Source System",
                )
                log_input = gr.Textbox(
                    label="Log Message",
                    placeholder="Paste a log message here...",
                    lines=3,
                )

            classify_btn = gr.Button("Classify", variant="primary")

            with gr.Row():
                label_out = gr.Textbox(label="🏷️ Predicted Label", interactive=False)
                tier_out = gr.Textbox(label="⚙️ Tier Used", interactive=False)
                confidence_out = gr.Textbox(label="📊 Confidence", interactive=False)
                latency_out = gr.Textbox(label="⏱️ Latency", interactive=False)

            # Output order must match the 4-tuple returned by classify_single.
            classify_btn.click(
                fn=classify_single,
                inputs=[source_input, log_input],
                outputs=[label_out, tier_out, confidence_out, latency_out],
            )

            gr.Examples(
                examples=EXAMPLE_LOGS,
                inputs=[source_input, log_input],
                label="📝 Example Logs (click to try)",
            )

        # Tab 2: classify a whole CSV and offer the result for download.
        with gr.Tab("Batch CSV Upload"):
            gr.Markdown("""
            Upload a CSV with columns: **`source`**, **`log_message`**
            Download the classified CSV with added columns: `predicted_label`, `tier_used`, `confidence`.
            """)
            with gr.Row():
                with gr.Column():
                    csv_input = gr.File(label="📂 Upload CSV", file_types=[".csv"])
                    batch_btn = gr.Button("Classify All", variant="primary")
                with gr.Column():
                    csv_output = gr.File(label="📥 Download Classified CSV")
                    stats_out = gr.Textbox(label="📊 Stats", lines=12, interactive=False)

            # Output order must match the (path, stats) pair from classify_batch.
            batch_btn.click(
                fn=classify_batch,
                inputs=[csv_input],
                outputs=[csv_output, stats_out],
            )

            gr.Markdown("""
            **Sample CSV format:**
            ```
            source,log_message
            ModernCRM,User User123 logged in.
            LegacyCRM,Case escalation for ticket ID 7324 failed.
            BillingSystem,GET /api/v2/invoice HTTP/1.1 status: 500
            ```
            """)

        # Tab 3: static description of the pipeline.
        with gr.Tab("Architecture"):
            gr.Markdown("""
            ## 🏗️ 3-Tier Hybrid Pipeline

            | Tier | Method | Coverage | Latency | When Used |
            |------|--------|----------|---------|-----------|
            | 🟢 Regex | Python `re` patterns | ~21% | < 1ms | Fixed patterns (login, backup, etc.) |
            | 🔵 BERT | `all-MiniLM-L6-v2` + LogReg | ~79% | 20–80ms | High-volume categories with 150+ samples |
            | 🟡 LLM | HuggingFace Inference API | ~0.3% | 500–2000ms | LegacyCRM logs, rare patterns |

            ## 📊 Model Performance (from training)
            - **BERT + LogReg** trained on 2,410 synthetic enterprise logs
            - **Confidence threshold**: 0.5 (below → escalate to LLM)
            - **Source-aware routing**: `LegacyCRM` bypasses ML entirely (only 7 training samples)

            ## 🔑 Environment Variables
            | Secret | Required For |
            |--------|-------------|
            | `HF_TOKEN` | LLM inference (LegacyCRM logs) |
            """)


# Start the Gradio server only when this file is run as a script, so that
# importing the module does not launch anything.
if __name__ == "__main__":
    demo.launch()