Spaces:
Sleeping
Sleeping
Upload 13 files
Browse files- hf_space/Dockerfile +20 -0
- hf_space/app_gradio.py +542 -0
- hf_space/classify.py +198 -0
- hf_space/models/log_classifier.joblib +3 -0
- hf_space/onnx_model/config.json +24 -0
- hf_space/onnx_model/special_tokens_map.json +37 -0
- hf_space/onnx_model/tokenizer.json +0 -0
- hf_space/onnx_model/tokenizer_config.json +65 -0
- hf_space/onnx_model/vocab.txt +0 -0
- hf_space/processor_bert.py +216 -0
- hf_space/processor_llm.py +192 -0
- hf_space/processor_regex.py +220 -0
- hf_space/requirements.txt +25 -0
hf_space/Dockerfile
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
FROM python:3.11-slim

# curl is the only extra OS package; the apt lists are dropped in the same
# layer so they never end up in the image.
RUN apt-get update && apt-get install -y --no-install-recommends \
        curl \
    && rm -rf /var/lib/apt/lists/*

# Create the unprivileged runtime user (UID 1000) before copying anything, so
# files can be copied with the right owner instead of being chown'ed afterwards.
RUN useradd -m -u 1000 appuser

WORKDIR /app
# Only the (still empty) directory itself needs its owner changed here.
RUN chown appuser:appuser /app

# Dependencies first: this layer is reused as long as requirements.txt is unchanged.
COPY --chown=appuser:appuser requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# A recursive `chown -R /app` after the copy would store every file (models
# included) a second time in a new layer; --chown sets the owner during the copy.
COPY --chown=appuser:appuser . .

USER appuser

EXPOSE 7860

CMD ["python", "app_gradio.py"]
|
hf_space/app_gradio.py
ADDED
|
@@ -0,0 +1,542 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Log Classification System — HuggingFace Spaces
|
| 3 |
+
Ultra-modern 3D UI with custom CSS
|
| 4 |
+
"""
|
| 5 |
+
from __future__ import annotations
|
| 6 |
+
import io
|
| 7 |
+
import time
|
| 8 |
+
import pandas as pd
|
| 9 |
+
import gradio as gr
|
| 10 |
+
from classify import classify_log, classify_csv
|
| 11 |
+
|
| 12 |
+
# Source systems offered in the "SOURCE SYSTEM" dropdown.
SOURCES = [
    "ModernCRM", "ModernHR", "BillingSystem",
    "AnalyticsEngine", "ThirdPartyAPI", "LegacyCRM",
]

# Icon displayed next to the name of the pipeline tier that produced a result.
TIER_COLORS = {
    "Regex": "🟢",
    "BERT": "🔵",
    "LLM": "🟡",
    "LLM (fallback)": "🟠",
}

# (source, log message) rows shown as clickable examples in the single-log tab.
EXAMPLE_LOGS = [
    ["ModernCRM", "User User12345 logged in."],
    ["ModernHR", "Multiple login failures occurred on user 6454 account"],
    ["BillingSystem", "GET /v2/servers/detail HTTP/1.1 status: 200 len: 1583 time: 0.19"],
    ["AnalyticsEngine", "System crashed due to disk I/O failure on node-3"],
    ["LegacyCRM", "Case escalation for ticket ID 7324 failed — support agent is no longer active."],
    ["LegacyCRM", "The 'BulkEmailSender' feature will be deprecated in v5.0. Use 'EmailCampaignManager'."],
]
|
| 32 |
+
|
| 33 |
+
# ── Custom CSS — 3D Modern Dark Theme ──────────────────────────────────────
|
| 34 |
+
CUSTOM_CSS = """
|
| 35 |
+
@import url('https://fonts.googleapis.com/css2?family=Rajdhani:wght@400;500;600;700&family=Share+Tech+Mono&family=Exo+2:wght@300;400;600;700&display=swap');
|
| 36 |
+
|
| 37 |
+
:root {
|
| 38 |
+
--bg-primary: #050810;
|
| 39 |
+
--bg-secondary: #0a0f1e;
|
| 40 |
+
--bg-card: #0d1425;
|
| 41 |
+
--bg-card-hover: #111a30;
|
| 42 |
+
--accent-cyan: #00d4ff;
|
| 43 |
+
--accent-blue: #0066ff;
|
| 44 |
+
--accent-purple: #7c3aed;
|
| 45 |
+
--accent-green: #00ff88;
|
| 46 |
+
--accent-orange: #ff6b00;
|
| 47 |
+
--text-primary: #e2e8f0;
|
| 48 |
+
--text-secondary: #94a3b8;
|
| 49 |
+
--text-muted: #475569;
|
| 50 |
+
--border-glow: rgba(0, 212, 255, 0.3);
|
| 51 |
+
--shadow-3d: 0 20px 60px rgba(0, 0, 0, 0.8), 0 0 40px rgba(0, 102, 255, 0.15);
|
| 52 |
+
--glow-cyan: 0 0 20px rgba(0, 212, 255, 0.4), 0 0 40px rgba(0, 212, 255, 0.2);
|
| 53 |
+
--glow-blue: 0 0 20px rgba(0, 102, 255, 0.4);
|
| 54 |
+
}
|
| 55 |
+
|
| 56 |
+
/* ββ Base ββ */
|
| 57 |
+
body, .gradio-container {
|
| 58 |
+
background: var(--bg-primary) !important;
|
| 59 |
+
font-family: 'Exo 2', sans-serif !important;
|
| 60 |
+
color: var(--text-primary) !important;
|
| 61 |
+
}
|
| 62 |
+
|
| 63 |
+
.gradio-container {
|
| 64 |
+
background:
|
| 65 |
+
radial-gradient(ellipse at 20% 20%, rgba(0, 102, 255, 0.08) 0%, transparent 50%),
|
| 66 |
+
radial-gradient(ellipse at 80% 80%, rgba(124, 58, 237, 0.08) 0%, transparent 50%),
|
| 67 |
+
radial-gradient(ellipse at 50% 50%, rgba(0, 212, 255, 0.03) 0%, transparent 70%),
|
| 68 |
+
var(--bg-primary) !important;
|
| 69 |
+
min-height: 100vh;
|
| 70 |
+
}
|
| 71 |
+
|
| 72 |
+
/* ββ Header ββ */
|
| 73 |
+
.main-header {
|
| 74 |
+
text-align: center;
|
| 75 |
+
padding: 48px 24px 32px;
|
| 76 |
+
position: relative;
|
| 77 |
+
}
|
| 78 |
+
|
| 79 |
+
.main-header::before {
|
| 80 |
+
content: '';
|
| 81 |
+
position: absolute;
|
| 82 |
+
top: 0; left: 50%;
|
| 83 |
+
transform: translateX(-50%);
|
| 84 |
+
width: 600px; height: 2px;
|
| 85 |
+
background: linear-gradient(90deg, transparent, var(--accent-cyan), var(--accent-blue), transparent);
|
| 86 |
+
box-shadow: var(--glow-cyan);
|
| 87 |
+
}
|
| 88 |
+
|
| 89 |
+
/* ββ Tab Navigation ββ */
|
| 90 |
+
.tab-nav {
|
| 91 |
+
background: rgba(13, 20, 37, 0.8) !important;
|
| 92 |
+
border: 1px solid rgba(0, 212, 255, 0.15) !important;
|
| 93 |
+
border-radius: 16px !important;
|
| 94 |
+
padding: 6px !important;
|
| 95 |
+
backdrop-filter: blur(20px) !important;
|
| 96 |
+
box-shadow: var(--shadow-3d) !important;
|
| 97 |
+
}
|
| 98 |
+
|
| 99 |
+
.tab-nav button {
|
| 100 |
+
font-family: 'Rajdhani', sans-serif !important;
|
| 101 |
+
font-weight: 600 !important;
|
| 102 |
+
font-size: 14px !important;
|
| 103 |
+
letter-spacing: 1.5px !important;
|
| 104 |
+
text-transform: uppercase !important;
|
| 105 |
+
color: var(--text-secondary) !important;
|
| 106 |
+
background: transparent !important;
|
| 107 |
+
border: none !important;
|
| 108 |
+
border-radius: 10px !important;
|
| 109 |
+
padding: 12px 24px !important;
|
| 110 |
+
transition: all 0.3s cubic-bezier(0.4, 0, 0.2, 1) !important;
|
| 111 |
+
}
|
| 112 |
+
|
| 113 |
+
.tab-nav button.selected {
|
| 114 |
+
color: var(--accent-cyan) !important;
|
| 115 |
+
background: linear-gradient(135deg, rgba(0, 212, 255, 0.1), rgba(0, 102, 255, 0.1)) !important;
|
| 116 |
+
box-shadow: 0 0 20px rgba(0, 212, 255, 0.2), inset 0 1px 0 rgba(0, 212, 255, 0.3) !important;
|
| 117 |
+
border: 1px solid rgba(0, 212, 255, 0.3) !important;
|
| 118 |
+
}
|
| 119 |
+
|
| 120 |
+
/* ββ Cards / Blocks ββ */
|
| 121 |
+
.gradio-group, .gr-group {
|
| 122 |
+
background: var(--bg-card) !important;
|
| 123 |
+
border: 1px solid rgba(0, 212, 255, 0.1) !important;
|
| 124 |
+
border-radius: 20px !important;
|
| 125 |
+
box-shadow: var(--shadow-3d), inset 0 1px 0 rgba(255,255,255,0.03) !important;
|
| 126 |
+
transition: all 0.4s ease !important;
|
| 127 |
+
transform: perspective(1000px) rotateX(0deg);
|
| 128 |
+
position: relative;
|
| 129 |
+
overflow: hidden;
|
| 130 |
+
}
|
| 131 |
+
|
| 132 |
+
.gradio-group::before {
|
| 133 |
+
content: '';
|
| 134 |
+
position: absolute;
|
| 135 |
+
top: 0; left: 0; right: 0;
|
| 136 |
+
height: 1px;
|
| 137 |
+
background: linear-gradient(90deg, transparent, rgba(0, 212, 255, 0.5), transparent);
|
| 138 |
+
}
|
| 139 |
+
|
| 140 |
+
.gradio-group:hover {
|
| 141 |
+
border-color: rgba(0, 212, 255, 0.25) !important;
|
| 142 |
+
box-shadow: var(--shadow-3d), var(--glow-cyan) !important;
|
| 143 |
+
transform: perspective(1000px) translateY(-4px) !important;
|
| 144 |
+
}
|
| 145 |
+
|
| 146 |
+
/* ββ Labels ββ */
|
| 147 |
+
label span, .gr-label {
|
| 148 |
+
font-family: 'Rajdhani', sans-serif !important;
|
| 149 |
+
font-weight: 600 !important;
|
| 150 |
+
letter-spacing: 1.5px !important;
|
| 151 |
+
text-transform: uppercase !important;
|
| 152 |
+
font-size: 11px !important;
|
| 153 |
+
color: var(--accent-cyan) !important;
|
| 154 |
+
opacity: 0.85;
|
| 155 |
+
}
|
| 156 |
+
|
| 157 |
+
/* ββ Inputs ββ */
|
| 158 |
+
input, textarea, select, .gr-input {
|
| 159 |
+
background: rgba(5, 8, 16, 0.8) !important;
|
| 160 |
+
border: 1px solid rgba(0, 212, 255, 0.15) !important;
|
| 161 |
+
border-radius: 12px !important;
|
| 162 |
+
color: var(--text-primary) !important;
|
| 163 |
+
font-family: 'Share Tech Mono', monospace !important;
|
| 164 |
+
font-size: 13px !important;
|
| 165 |
+
transition: all 0.3s ease !important;
|
| 166 |
+
padding: 12px 16px !important;
|
| 167 |
+
}
|
| 168 |
+
|
| 169 |
+
input:focus, textarea:focus {
|
| 170 |
+
border-color: var(--accent-cyan) !important;
|
| 171 |
+
box-shadow: 0 0 0 3px rgba(0, 212, 255, 0.1), var(--glow-cyan) !important;
|
| 172 |
+
outline: none !important;
|
| 173 |
+
background: rgba(0, 212, 255, 0.03) !important;
|
| 174 |
+
}
|
| 175 |
+
|
| 176 |
+
/* ββ Dropdown ββ */
|
| 177 |
+
.gr-dropdown select, .gradio-dropdown {
|
| 178 |
+
background: rgba(5, 8, 16, 0.9) !important;
|
| 179 |
+
border: 1px solid rgba(0, 212, 255, 0.2) !important;
|
| 180 |
+
border-radius: 12px !important;
|
| 181 |
+
color: var(--accent-cyan) !important;
|
| 182 |
+
font-family: 'Rajdhani', sans-serif !important;
|
| 183 |
+
font-weight: 600 !important;
|
| 184 |
+
}
|
| 185 |
+
|
| 186 |
+
/* ββ Primary Button ββ */
|
| 187 |
+
button.primary, .gr-button-primary, button[variant="primary"] {
|
| 188 |
+
font-family: 'Rajdhani', sans-serif !important;
|
| 189 |
+
font-weight: 700 !important;
|
| 190 |
+
font-size: 15px !important;
|
| 191 |
+
letter-spacing: 2px !important;
|
| 192 |
+
text-transform: uppercase !important;
|
| 193 |
+
background: linear-gradient(135deg, #0066ff 0%, #00d4ff 50%, #0066ff 100%) !important;
|
| 194 |
+
background-size: 200% 200% !important;
|
| 195 |
+
border: none !important;
|
| 196 |
+
border-radius: 12px !important;
|
| 197 |
+
padding: 14px 32px !important;
|
| 198 |
+
color: #fff !important;
|
| 199 |
+
box-shadow:
|
| 200 |
+
0 8px 32px rgba(0, 102, 255, 0.4),
|
| 201 |
+
0 2px 8px rgba(0, 0, 0, 0.5),
|
| 202 |
+
inset 0 1px 0 rgba(255,255,255,0.2) !important;
|
| 203 |
+
transition: all 0.3s cubic-bezier(0.4, 0, 0.2, 1) !important;
|
| 204 |
+
animation: gradientShift 3s ease infinite !important;
|
| 205 |
+
position: relative !important;
|
| 206 |
+
overflow: hidden !important;
|
| 207 |
+
}
|
| 208 |
+
|
| 209 |
+
button.primary::before {
|
| 210 |
+
content: '';
|
| 211 |
+
position: absolute;
|
| 212 |
+
top: -50%; left: -60%;
|
| 213 |
+
width: 40%; height: 200%;
|
| 214 |
+
background: rgba(255,255,255,0.1);
|
| 215 |
+
transform: skewX(-20deg);
|
| 216 |
+
transition: left 0.6s ease;
|
| 217 |
+
}
|
| 218 |
+
|
| 219 |
+
button.primary:hover::before {
|
| 220 |
+
left: 120%;
|
| 221 |
+
}
|
| 222 |
+
|
| 223 |
+
button.primary:hover {
|
| 224 |
+
transform: translateY(-3px) scale(1.02) !important;
|
| 225 |
+
box-shadow:
|
| 226 |
+
0 16px 48px rgba(0, 102, 255, 0.5),
|
| 227 |
+
0 0 30px rgba(0, 212, 255, 0.3),
|
| 228 |
+
inset 0 1px 0 rgba(255,255,255,0.3) !important;
|
| 229 |
+
}
|
| 230 |
+
|
| 231 |
+
button.primary:active {
|
| 232 |
+
transform: translateY(0px) scale(0.98) !important;
|
| 233 |
+
}
|
| 234 |
+
|
| 235 |
+
@keyframes gradientShift {
|
| 236 |
+
0%, 100% { background-position: 0% 50%; }
|
| 237 |
+
50% { background-position: 100% 50%; }
|
| 238 |
+
}
|
| 239 |
+
|
| 240 |
+
/* ββ Output Textboxes β 3D Result Cards ββ */
|
| 241 |
+
.output-card input, .output-card textarea {
|
| 242 |
+
background: linear-gradient(135deg, rgba(0, 212, 255, 0.05), rgba(0, 102, 255, 0.05)) !important;
|
| 243 |
+
border: 1px solid rgba(0, 212, 255, 0.2) !important;
|
| 244 |
+
border-radius: 14px !important;
|
| 245 |
+
font-family: 'Share Tech Mono', monospace !important;
|
| 246 |
+
font-size: 16px !important;
|
| 247 |
+
font-weight: bold !important;
|
| 248 |
+
color: var(--accent-cyan) !important;
|
| 249 |
+
text-align: center !important;
|
| 250 |
+
box-shadow: inset 0 2px 8px rgba(0,0,0,0.3), 0 0 20px rgba(0, 212, 255, 0.1) !important;
|
| 251 |
+
}
|
| 252 |
+
|
| 253 |
+
/* ββ Table / DataFrame ββ */
|
| 254 |
+
table {
|
| 255 |
+
border-collapse: separate !important;
|
| 256 |
+
border-spacing: 0 4px !important;
|
| 257 |
+
font-family: 'Share Tech Mono', monospace !important;
|
| 258 |
+
font-size: 12px !important;
|
| 259 |
+
}
|
| 260 |
+
|
| 261 |
+
th {
|
| 262 |
+
background: rgba(0, 102, 255, 0.2) !important;
|
| 263 |
+
color: var(--accent-cyan) !important;
|
| 264 |
+
font-family: 'Rajdhani', sans-serif !important;
|
| 265 |
+
letter-spacing: 1.5px !important;
|
| 266 |
+
text-transform: uppercase !important;
|
| 267 |
+
font-size: 11px !important;
|
| 268 |
+
padding: 10px 16px !important;
|
| 269 |
+
border: none !important;
|
| 270 |
+
}
|
| 271 |
+
|
| 272 |
+
td {
|
| 273 |
+
background: rgba(13, 20, 37, 0.6) !important;
|
| 274 |
+
color: var(--text-secondary) !important;
|
| 275 |
+
padding: 8px 16px !important;
|
| 276 |
+
border: none !important;
|
| 277 |
+
border-top: 1px solid rgba(0, 212, 255, 0.05) !important;
|
| 278 |
+
transition: background 0.2s ease !important;
|
| 279 |
+
}
|
| 280 |
+
|
| 281 |
+
tr:hover td {
|
| 282 |
+
background: rgba(0, 212, 255, 0.05) !important;
|
| 283 |
+
color: var(--text-primary) !important;
|
| 284 |
+
}
|
| 285 |
+
|
| 286 |
+
/* ββ Markdown ββ */
|
| 287 |
+
.prose, .markdown {
|
| 288 |
+
color: var(--text-secondary) !important;
|
| 289 |
+
font-family: 'Exo 2', sans-serif !important;
|
| 290 |
+
}
|
| 291 |
+
|
| 292 |
+
.prose h1, .markdown h1 {
|
| 293 |
+
font-family: 'Rajdhani', sans-serif !important;
|
| 294 |
+
font-size: 3rem !important;
|
| 295 |
+
font-weight: 700 !important;
|
| 296 |
+
letter-spacing: 3px !important;
|
| 297 |
+
text-transform: uppercase !important;
|
| 298 |
+
background: linear-gradient(135deg, #ffffff 0%, var(--accent-cyan) 40%, var(--accent-blue) 100%) !important;
|
| 299 |
+
-webkit-background-clip: text !important;
|
| 300 |
+
-webkit-text-fill-color: transparent !important;
|
| 301 |
+
background-clip: text !important;
|
| 302 |
+
filter: drop-shadow(0 0 30px rgba(0, 212, 255, 0.3)) !important;
|
| 303 |
+
margin-bottom: 8px !important;
|
| 304 |
+
}
|
| 305 |
+
|
| 306 |
+
.prose h2, .markdown h2 {
|
| 307 |
+
font-family: 'Rajdhani', sans-serif !important;
|
| 308 |
+
font-size: 1.4rem !important;
|
| 309 |
+
font-weight: 600 !important;
|
| 310 |
+
letter-spacing: 2px !important;
|
| 311 |
+
color: var(--accent-cyan) !important;
|
| 312 |
+
text-transform: uppercase !important;
|
| 313 |
+
border-bottom: 1px solid rgba(0, 212, 255, 0.2) !important;
|
| 314 |
+
padding-bottom: 8px !important;
|
| 315 |
+
}
|
| 316 |
+
|
| 317 |
+
.prose p, .markdown p {
|
| 318 |
+
color: var(--text-secondary) !important;
|
| 319 |
+
line-height: 1.7 !important;
|
| 320 |
+
font-size: 14px !important;
|
| 321 |
+
}
|
| 322 |
+
|
| 323 |
+
.prose strong, .markdown strong {
|
| 324 |
+
color: var(--accent-cyan) !important;
|
| 325 |
+
}
|
| 326 |
+
|
| 327 |
+
/* ββ Code blocks ββ */
|
| 328 |
+
code, pre {
|
| 329 |
+
font-family: 'Share Tech Mono', monospace !important;
|
| 330 |
+
background: rgba(0, 212, 255, 0.05) !important;
|
| 331 |
+
border: 1px solid rgba(0, 212, 255, 0.15) !important;
|
| 332 |
+
border-radius: 8px !important;
|
| 333 |
+
color: var(--accent-cyan) !important;
|
| 334 |
+
font-size: 12px !important;
|
| 335 |
+
}
|
| 336 |
+
|
| 337 |
+
/* ββ Examples Table ββ */
|
| 338 |
+
.examples {
|
| 339 |
+
background: var(--bg-card) !important;
|
| 340 |
+
border: 1px solid rgba(0, 212, 255, 0.1) !important;
|
| 341 |
+
border-radius: 14px !important;
|
| 342 |
+
overflow: hidden !important;
|
| 343 |
+
}
|
| 344 |
+
|
| 345 |
+
.examples table th {
|
| 346 |
+
background: rgba(0, 102, 255, 0.15) !important;
|
| 347 |
+
}
|
| 348 |
+
|
| 349 |
+
/* ββ File Upload ββ */
|
| 350 |
+
.gr-file {
|
| 351 |
+
background: rgba(5, 8, 16, 0.8) !important;
|
| 352 |
+
border: 2px dashed rgba(0, 212, 255, 0.25) !important;
|
| 353 |
+
border-radius: 16px !important;
|
| 354 |
+
transition: all 0.3s ease !important;
|
| 355 |
+
}
|
| 356 |
+
|
| 357 |
+
.gr-file:hover {
|
| 358 |
+
border-color: var(--accent-cyan) !important;
|
| 359 |
+
background: rgba(0, 212, 255, 0.03) !important;
|
| 360 |
+
box-shadow: var(--glow-cyan) !important;
|
| 361 |
+
}
|
| 362 |
+
|
| 363 |
+
/* ββ Scrollbar ββ */
|
| 364 |
+
::-webkit-scrollbar { width: 6px; height: 6px; }
|
| 365 |
+
::-webkit-scrollbar-track { background: var(--bg-secondary); }
|
| 366 |
+
::-webkit-scrollbar-thumb {
|
| 367 |
+
background: linear-gradient(var(--accent-blue), var(--accent-cyan));
|
| 368 |
+
border-radius: 3px;
|
| 369 |
+
}
|
| 370 |
+
|
| 371 |
+
/* ββ Pulsing accent line ββ */
|
| 372 |
+
@keyframes pulse-glow {
|
| 373 |
+
0%, 100% { opacity: 0.4; box-shadow: 0 0 10px rgba(0,212,255,0.3); }
|
| 374 |
+
50% { opacity: 1; box-shadow: 0 0 30px rgba(0,212,255,0.8); }
|
| 375 |
+
}
|
| 376 |
+
|
| 377 |
+
/* ββ Tier badge colors ββ */
|
| 378 |
+
.tier-regex { color: #00ff88 !important; }
|
| 379 |
+
.tier-bert { color: #00d4ff !important; }
|
| 380 |
+
.tier-llm { color: #ffd700 !important; }
|
| 381 |
+
"""
|
| 382 |
+
|
| 383 |
+
# ── Functions ───────────────────────────────────────────────────────────────
|
| 384 |
+
def classify_single(source: str, log_message: str):
    """Classify one log message and format the result for the four output boxes.

    Args:
        source: Name of the system the log came from.
        log_message: Raw log text; ``None``, empty or whitespace-only input is
            treated as "nothing to classify".

    Returns:
        A 4-tuple of display strings: predicted label, tier (prefixed with its
        icon), confidence as a percentage (``"N/A"`` when the classifier gave
        none) and wall-clock latency in milliseconds. All four are ``"—"``
        when there is nothing to classify.
    """
    # The textbox value may be missing entirely, not just blank.
    if not log_message or not log_message.strip():
        return "—", "—", "—", "—"

    t0 = time.perf_counter()
    result = classify_log(source, log_message)
    latency_ms = (time.perf_counter() - t0) * 1000

    label = result["label"]
    tier = result["tier"]
    confidence = f"{result['confidence']:.1%}" if result["confidence"] is not None else "N/A"
    icon = TIER_COLORS.get(tier, "⚪")  # neutral icon for tiers without an entry
    return label, f"{icon} {tier}", confidence, f"{latency_ms:.1f} ms"
|
| 395 |
+
|
| 396 |
+
|
| 397 |
+
def classify_batch(file):
    """Classify every row of an uploaded CSV and summarise the outcome.

    Args:
        file: The uploaded file as handed over by the file component: either
            a path-like value or an object exposing the path as ``.name``.
            ``None`` means nothing was uploaded.

    Returns:
        ``(output_path, stats)``: the path of the classified CSV and a
        human-readable summary. On any failure the path is ``None`` and the
        second element carries the message to display.
    """
    import os
    import tempfile
    import uuid

    if file is None:
        return None, "⚠️ Please upload a CSV file."

    input_path = os.fspath(file) if isinstance(file, (str, os.PathLike)) else file.name

    # One output file per request: a fixed path would let concurrent users
    # overwrite (and download) each other's results.
    result_path = os.path.join(
        tempfile.gettempdir(), f"classified_output_{uuid.uuid4().hex}.csv"
    )

    try:
        output_path, df = classify_csv(input_path, result_path)
    except ValueError as e:
        # Raised by classify_csv for invalid input (e.g. missing columns).
        return None, f"⚠️ {e}"
    except Exception as e:
        # UI boundary: report instead of crashing the request.
        return None, f"❌ Error: {e}"

    total = len(df)
    tier_counts = df["tier_used"].value_counts().to_dict()
    label_counts = df["predicted_label"].value_counts().to_dict()
    tier_lines = "\n".join(
        f"  {TIER_COLORS.get(k, '⚪')} {k}: {v} ({v/total:.0%})"
        for k, v in tier_counts.items()
    )
    label_lines = "\n".join(f"  • {k}: {v}" for k, v in label_counts.items())
    stats = (
        f"✅ Classified {total} logs\n\n"
        f"📊 Tier breakdown:\n{tier_lines}\n\n"
        f"🏷️ Label distribution:\n{label_lines}"
    )
    return output_path, stats
|
| 417 |
+
|
| 418 |
+
|
| 419 |
+
# ── UI ──────────────────────────────────────────────────────────────────────
|
| 420 |
+
# Build the three-tab Gradio app. The theme values mirror the palette used in
# CUSTOM_CSS, which is applied on top of the base theme.
# NOTE(review): several emoji/dash glyphs in the user-facing strings below look
# mis-encoded (e.g. "π’", "β"); they are reproduced as found — confirm against
# the original file's encoding before shipping.
with gr.Blocks(
    title="LOG CLASSIFICATION SYSTEM",
    theme=gr.themes.Base(
        primary_hue="blue",
        secondary_hue="cyan",
        neutral_hue="slate",
        font=[gr.themes.GoogleFont("Exo 2"), "sans-serif"],
        font_mono=[gr.themes.GoogleFont("Share Tech Mono"), "monospace"],
    ).set(
        body_background_fill="#050810",
        body_text_color="#e2e8f0",
        block_background_fill="#0d1425",
        block_border_color="rgba(0,212,255,0.15)",
        block_label_text_color="#00d4ff",
        input_background_fill="#050810",
        input_border_color="rgba(0,212,255,0.2)",
        button_primary_background_fill="linear-gradient(135deg, #0066ff, #00d4ff)",
        button_primary_text_color="#ffffff",
        border_color_accent="#00d4ff",
        color_accent_soft="rgba(0,212,255,0.1)",
    ),
    css=CUSTOM_CSS
) as demo:

    # Page header.
    gr.Markdown("""
    # π LOG CLASSIFICATION SYSTEM
    **3-tier hybrid pipeline** β π’ Regex Β· π΅ BERT + ML Β· π‘ LLM
    *Enterprise-grade log monitoring at production scale*
    """)

    with gr.Tabs():

        # ── Tab 1: Single Log ────────────────────────────────────────────
        with gr.Tab("β‘ SINGLE LOG"):
            with gr.Row():
                with gr.Column(scale=1):
                    source_input = gr.Dropdown(
                        choices=SOURCES,
                        value="ModernCRM",
                        label="SOURCE SYSTEM",
                    )
                with gr.Column(scale=3):
                    log_input = gr.Textbox(
                        label="LOG MESSAGE",
                        placeholder="Paste a log message here...",
                        lines=3,
                    )

            classify_btn = gr.Button("βΆ CLASSIFY LOG", variant="primary", size="lg")

            # Read-only result boxes, in the order classify_single returns them.
            with gr.Row():
                label_out = gr.Textbox(label="π·οΈ PREDICTED LABEL", interactive=False)
                tier_out = gr.Textbox(label="βοΈ TIER USED", interactive=False)
                confidence_out = gr.Textbox(label="π CONFIDENCE", interactive=False)
                latency_out = gr.Textbox(label="β±οΈ LATENCY", interactive=False)

            # classify_single's four return values fill the four boxes above.
            classify_btn.click(
                fn=classify_single,
                inputs=[source_input, log_input],
                outputs=[label_out, tier_out, confidence_out, latency_out],
            )

            # Clicking an example fills the source and message inputs.
            gr.Examples(
                examples=EXAMPLE_LOGS,
                inputs=[source_input, log_input],
                label="π EXAMPLE LOGS β click to try",
            )

        # ── Tab 2: Batch CSV ─────────────────────────────────────────────
        with gr.Tab("π¦ BATCH CSV"):
            gr.Markdown("""
            ### Bulk Classification
            Upload a CSV with columns: **`source`**, **`log_message`**
            Output includes: `predicted_label`, `tier_used`, `confidence`, `latency_ms`
            """)
            with gr.Row():
                with gr.Column():
                    csv_input = gr.File(label="π UPLOAD CSV", file_types=[".csv"])
                    batch_btn = gr.Button("βΆ CLASSIFY ALL", variant="primary")
                with gr.Column():
                    csv_output = gr.File(label="π₯ DOWNLOAD RESULTS")
                    stats_out = gr.Textbox(label="π STATISTICS", lines=12, interactive=False)

            # classify_batch returns (result file path, statistics text).
            batch_btn.click(
                fn=classify_batch,
                inputs=[csv_input],
                outputs=[csv_output, stats_out],
            )

            gr.Markdown("""
            **Sample CSV format:**
            ```
            source,log_message
            ModernCRM,User User123 logged in.
            LegacyCRM,Case escalation for ticket ID 7324 failed.
            BillingSystem,GET /api/v2/invoice HTTP/1.1 status: 500
            ```
            """)

        # ── Tab 3: Architecture ──────────────────────────────────────────
        # Static documentation only; no callbacks are attached in this tab.
        with gr.Tab("ποΈ ARCHITECTURE"):
            gr.Markdown("""
            ## 3-Tier Hybrid Pipeline

            | Tier | Method | Coverage | Latency | Trigger |
            |------|--------|----------|---------|---------|
            | π’ **Regex** | Python `re` patterns | ~21% | < 1ms | Fixed patterns |
            | π΅ **BERT** | `all-MiniLM-L6-v2` + LogReg | ~79% | 20β80ms | High-volume categories |
            | π‘ **LLM** | HuggingFace Inference API | ~0.3% | 500β2000ms | LegacyCRM + rare patterns |

            ## Model Performance
            - **Training data**: 2,410 synthetic enterprise logs
            - **Confidence threshold**: 0.5 (below β escalate to LLM)
            - **Source-aware routing**: `LegacyCRM` β LLM directly

            ## Environment Variables
            | Secret | Purpose |
            |--------|---------|
            | `HF_TOKEN` | LLM inference for LegacyCRM logs |
            """)

if __name__ == "__main__":
    # Bind to all interfaces on port 7860 (the port the Dockerfile EXPOSEs).
    demo.launch(server_name="0.0.0.0", server_port=7860)
|
hf_space/classify.py
ADDED
|
@@ -0,0 +1,198 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
classify.py — 3-Tier Hybrid Pipeline (V3 — Latency-Tracked)
|
| 3 |
+
|
| 4 |
+
Architecture:
|
| 5 |
+
LegacyCRM → LLM directly
|
| 6 |
+
Others → Regex → BERT (batch) → LLM fallback
|
| 7 |
+
|
| 8 |
+
Changes in V3:
|
| 9 |
+
- Tier-wise latency tracking (regex_ms, bert_ms, llm_ms)
|
| 10 |
+
- Pipeline summary with p50/p95 per tier
|
| 11 |
+
- Defensive: LLM timeout + retry baked in via processor_llm
|
| 12 |
+
- classify_logs returns richer result dict
|
| 13 |
+
"""
|
| 14 |
+
from __future__ import annotations
|
| 15 |
+
import time
|
| 16 |
+
import statistics
|
| 17 |
+
import pandas as pd
|
| 18 |
+
from processor_regex import classify_with_regex
|
| 19 |
+
from processor_bert import classify_batch as bert_batch
|
| 20 |
+
from processor_llm import classify_with_llm
|
| 21 |
+
|
| 22 |
+
LEGACY_SOURCE = "LegacyCRM"
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
# ── Result type ─────────────────────────────────────────────────────────────
|
| 26 |
+
def _make_result(label: str, tier: str, confidence, latency_ms: float) -> dict:
|
| 27 |
+
return {
|
| 28 |
+
"label": label,
|
| 29 |
+
"tier": tier,
|
| 30 |
+
"confidence": confidence,
|
| 31 |
+
"latency_ms": round(latency_ms, 3),
|
| 32 |
+
}
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
# ── Single log (backward-compatible) ────────────────────────────────────────
|
| 36 |
+
def classify_log(source: str, log_msg: str) -> dict:
    """Classify a single log. Returns label, tier, confidence, latency_ms."""
    # Delegate to the batch pipeline with a one-element batch.
    single_batch = [(source, log_msg)]
    return classify_logs(single_batch)[0]
|
| 40 |
+
|
| 41 |
+
|
| 42 |
+
# ── Batch pipeline (main entry point) ───────────────────────────────────────
|
| 43 |
+
def classify_logs(logs: list[tuple[str, str]]) -> list[dict]:
    """
    Batch classify with 3-tier routing + per-result latency.

    Args:
        logs: ``(source, log_message)`` pairs.

    Returns:
        One dict per input, in input order:
            { label, tier, confidence, latency_ms }

    Tier routing:
        LegacyCRM source → LLM directly
        Regex match      → done
        Remainder        → BERT batch → LLM when BERT answers "Unclassified"
    """
    results = [None] * len(logs)

    # ── Step 1: Route to groups ─────────────────────────────────────────────
    llm_indices = []
    bert_indices = []

    for i, (source, log_msg) in enumerate(logs):
        if source == LEGACY_SOURCE:
            llm_indices.append(i)
            continue

        t0 = time.perf_counter()
        label = classify_with_regex(log_msg)
        t1 = time.perf_counter()
        if label:
            results[i] = _make_result(label, "Regex", 1.0, (t1 - t0) * 1000)
        else:
            bert_indices.append(i)

    # ── Step 2: BERT batch ──────────────────────────────────────────────────
    if bert_indices:
        bert_msgs = [logs[i][1] for i in bert_indices]

        t_bert_start = time.perf_counter()
        bert_results = bert_batch(bert_msgs)
        t_bert_end = time.perf_counter()

        # The batch is timed as a whole; every log in it is charged the average.
        bert_ms_per_log = (t_bert_end - t_bert_start) * 1000 / len(bert_msgs)

        for idx, (label, conf) in zip(bert_indices, bert_results):
            if label != "Unclassified":
                results[idx] = _make_result(label, "BERT", conf, bert_ms_per_log)
            else:
                llm_indices.append(idx)

    # ── Step 3: LLM (LegacyCRM + BERT fallback) ─────────────────────────────
    for i in llm_indices:
        source, log_msg = logs[i]
        t0 = time.perf_counter()
        label = classify_with_llm(log_msg)
        t1 = time.perf_counter()
        # Distinguish logs routed here by source from those BERT could not label.
        tier = "LLM" if source == LEGACY_SOURCE else "LLM (fallback)"
        results[i] = _make_result(label, tier, None, (t1 - t0) * 1000)

    return results
|
| 103 |
+
|
| 104 |
+
|
| 105 |
+
# ── Pipeline summary ────────────────────────────────────────────────────────
|
| 106 |
+
def pipeline_summary(results: list[dict]) -> dict:
    """Summarise classify_logs output per tier and per label.

    Intended for dashboard and benchmark reporting.

    Args:
        results: Result dicts; each must provide the keys ``"tier"``,
            ``"latency_ms"`` and ``"label"``.

    Returns:
        A dict with
        ``"total"``        - number of results,
        ``"tier_stats"``   - per tier: count, share in percent, p50/p95/p99
                             and mean latency in milliseconds (rounded),
        ``"label_counts"`` - number of occurrences of each label.
    """
    latencies_by_tier: dict[str, list[float]] = {}
    label_counts: dict[str, int] = {}

    for entry in results:
        bucket = latencies_by_tier.setdefault(entry["tier"], [])
        bucket.append(entry["latency_ms"])
        label = entry["label"]
        label_counts[label] = label_counts.get(label, 0) + 1

    total = len(results)

    def _percentile(ordered: list[float], fraction: float) -> float:
        # Index-based pick from the sorted list, clamped to the last element.
        position = min(int(len(ordered) * fraction), len(ordered) - 1)
        return round(ordered[position], 2)

    tier_stats = {}
    for tier, latencies in latencies_by_tier.items():
        ordered = sorted(latencies)
        count = len(ordered)
        tier_stats[tier] = {
            "count": count,
            "pct": round(count / total * 100, 1),
            "p50_ms": round(statistics.median(ordered), 2),
            "p95_ms": _percentile(ordered, 0.95),
            "p99_ms": _percentile(ordered, 0.99),
            "mean_ms": round(statistics.mean(ordered), 2),
        }

    return {
        "total": total,
        "tier_stats": tier_stats,
        "label_counts": label_counts,
    }
|
| 138 |
+
|
| 139 |
+
|
| 140 |
+
# ── CSV batch classify ───────────────────────────────────────────────────────
|
| 141 |
+
def classify_csv(input_path: str, output_path: str = "output.csv") -> tuple[str, pd.DataFrame]:
    """Classify every row of a CSV file and write the annotated result.

    The input must contain the columns 'source' and 'log_message'. The columns
    'predicted_label', 'tier_used', 'latency_ms' and 'confidence' are added
    (in that order) and the frame is written to ``output_path`` without the
    index column.

    Args:
        input_path: Path of the CSV file to read.
        output_path: Path of the CSV file to write.

    Returns:
        ``(output_path, annotated DataFrame)``.

    Raises:
        ValueError: If a required column is missing from the input.
    """
    frame = pd.read_csv(input_path)

    required = {"source", "log_message"}
    if not required.issubset(frame.columns):
        raise ValueError(f"CSV mein ye columns chahiye: {required}. Mila: {set(frame.columns)}")

    outcomes = classify_logs(list(zip(frame["source"], frame["log_message"])))

    def _format_confidence(value):
        # Tiers that report no score (confidence is None) are shown as "N/A".
        return "N/A" if value is None else f"{value:.1%}"

    frame["predicted_label"] = [item["label"] for item in outcomes]
    frame["tier_used"] = [item["tier"] for item in outcomes]
    frame["latency_ms"] = [item["latency_ms"] for item in outcomes]
    frame["confidence"] = [_format_confidence(item["confidence"]) for item in outcomes]

    frame.to_csv(output_path, index=False)
    return output_path, frame
|
| 165 |
+
|
| 166 |
+
|
| 167 |
+
# Aliases
|
| 168 |
+
classify = classify_logs
|
| 169 |
+
|
| 170 |
+
|
| 171 |
+
# ── Self-test ────────────────────────────────────────────────────────────────
|
| 172 |
+
if __name__ == "__main__":
    # Smoke test: run a handful of sample logs through the whole pipeline and
    # print the per-log results followed by the aggregated summary.
    sample = [
        ("ModernCRM", "IP 192.168.133.114 blocked due to potential attack"),
        ("BillingSystem", "User User12345 logged in."),
        ("AnalyticsEngine", "File data_6957.csv uploaded successfully by user User265."),
        ("ModernHR", "GET /v2/servers/detail HTTP/1.1 status: 200 len: 1583 time: 0.19"),
        ("ModernHR", "Admin access escalation detected for user 9429"),
        ("LegacyCRM", "Case escalation for ticket ID 7324 failed because the assigned support agent is no longer active."),
        ("LegacyCRM", "The 'ReportGenerator' module will be retired in version 4.0."),
    ]

    print(f'{"Source":<20} {"Tier":<18} {"Conf":>6} {"Lat(ms)":>8} {"Label":<25} Log')
    print("─" * 115)
    results = classify_logs(sample)
    for (source, log), r in zip(sample, results):
        # Compare against None (as classify_csv does) so that a genuine
        # confidence of 0.0 is printed as "0%" instead of "N/A".
        conf = f"{r['confidence']:.0%}" if r["confidence"] is not None else " N/A"
        print(f'{source:<20} {r["tier"]:<18} {conf:>6} {r["latency_ms"]:>8.1f} {r["label"]:<25} {log[:40]}')

    summary = pipeline_summary(results)
    print("\n📊 Pipeline Summary:")
    for tier, stats in summary["tier_stats"].items():
        print(f" {tier}: {stats['count']} logs ({stats['pct']}%) | "
              f"p50={stats['p50_ms']}ms p95={stats['p95_ms']}ms p99={stats['p99_ms']}ms")

    print("\n🏷️ Label distribution:")
    # Most frequent label first.
    for label, count in sorted(summary["label_counts"].items(), key=lambda x: -x[1]):
        print(f" • {label}: {count}")
|
hf_space/models/log_classifier.joblib
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:9bfe9c71b71412797de0d426be2255566dbf6cf87b3f2ae5d2cd1fd69a98d18d
|
| 3 |
+
size 23997
|
hf_space/onnx_model/config.json
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"architectures": [
|
| 3 |
+
"BertModel"
|
| 4 |
+
],
|
| 5 |
+
"attention_probs_dropout_prob": 0.1,
|
| 6 |
+
"classifier_dropout": null,
|
| 7 |
+
"gradient_checkpointing": false,
|
| 8 |
+
"hidden_act": "gelu",
|
| 9 |
+
"hidden_dropout_prob": 0.1,
|
| 10 |
+
"hidden_size": 384,
|
| 11 |
+
"initializer_range": 0.02,
|
| 12 |
+
"intermediate_size": 1536,
|
| 13 |
+
"layer_norm_eps": 1e-12,
|
| 14 |
+
"max_position_embeddings": 512,
|
| 15 |
+
"model_type": "bert",
|
| 16 |
+
"num_attention_heads": 12,
|
| 17 |
+
"num_hidden_layers": 6,
|
| 18 |
+
"pad_token_id": 0,
|
| 19 |
+
"position_embedding_type": "absolute",
|
| 20 |
+
"transformers_version": "4.57.6",
|
| 21 |
+
"type_vocab_size": 2,
|
| 22 |
+
"use_cache": true,
|
| 23 |
+
"vocab_size": 30522
|
| 24 |
+
}
|
hf_space/onnx_model/special_tokens_map.json
ADDED
|
@@ -0,0 +1,37 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"cls_token": {
|
| 3 |
+
"content": "[CLS]",
|
| 4 |
+
"lstrip": false,
|
| 5 |
+
"normalized": false,
|
| 6 |
+
"rstrip": false,
|
| 7 |
+
"single_word": false
|
| 8 |
+
},
|
| 9 |
+
"mask_token": {
|
| 10 |
+
"content": "[MASK]",
|
| 11 |
+
"lstrip": false,
|
| 12 |
+
"normalized": false,
|
| 13 |
+
"rstrip": false,
|
| 14 |
+
"single_word": false
|
| 15 |
+
},
|
| 16 |
+
"pad_token": {
|
| 17 |
+
"content": "[PAD]",
|
| 18 |
+
"lstrip": false,
|
| 19 |
+
"normalized": false,
|
| 20 |
+
"rstrip": false,
|
| 21 |
+
"single_word": false
|
| 22 |
+
},
|
| 23 |
+
"sep_token": {
|
| 24 |
+
"content": "[SEP]",
|
| 25 |
+
"lstrip": false,
|
| 26 |
+
"normalized": false,
|
| 27 |
+
"rstrip": false,
|
| 28 |
+
"single_word": false
|
| 29 |
+
},
|
| 30 |
+
"unk_token": {
|
| 31 |
+
"content": "[UNK]",
|
| 32 |
+
"lstrip": false,
|
| 33 |
+
"normalized": false,
|
| 34 |
+
"rstrip": false,
|
| 35 |
+
"single_word": false
|
| 36 |
+
}
|
| 37 |
+
}
|
hf_space/onnx_model/tokenizer.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
hf_space/onnx_model/tokenizer_config.json
ADDED
|
@@ -0,0 +1,65 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"added_tokens_decoder": {
|
| 3 |
+
"0": {
|
| 4 |
+
"content": "[PAD]",
|
| 5 |
+
"lstrip": false,
|
| 6 |
+
"normalized": false,
|
| 7 |
+
"rstrip": false,
|
| 8 |
+
"single_word": false,
|
| 9 |
+
"special": true
|
| 10 |
+
},
|
| 11 |
+
"100": {
|
| 12 |
+
"content": "[UNK]",
|
| 13 |
+
"lstrip": false,
|
| 14 |
+
"normalized": false,
|
| 15 |
+
"rstrip": false,
|
| 16 |
+
"single_word": false,
|
| 17 |
+
"special": true
|
| 18 |
+
},
|
| 19 |
+
"101": {
|
| 20 |
+
"content": "[CLS]",
|
| 21 |
+
"lstrip": false,
|
| 22 |
+
"normalized": false,
|
| 23 |
+
"rstrip": false,
|
| 24 |
+
"single_word": false,
|
| 25 |
+
"special": true
|
| 26 |
+
},
|
| 27 |
+
"102": {
|
| 28 |
+
"content": "[SEP]",
|
| 29 |
+
"lstrip": false,
|
| 30 |
+
"normalized": false,
|
| 31 |
+
"rstrip": false,
|
| 32 |
+
"single_word": false,
|
| 33 |
+
"special": true
|
| 34 |
+
},
|
| 35 |
+
"103": {
|
| 36 |
+
"content": "[MASK]",
|
| 37 |
+
"lstrip": false,
|
| 38 |
+
"normalized": false,
|
| 39 |
+
"rstrip": false,
|
| 40 |
+
"single_word": false,
|
| 41 |
+
"special": true
|
| 42 |
+
}
|
| 43 |
+
},
|
| 44 |
+
"clean_up_tokenization_spaces": false,
|
| 45 |
+
"cls_token": "[CLS]",
|
| 46 |
+
"do_basic_tokenize": true,
|
| 47 |
+
"do_lower_case": true,
|
| 48 |
+
"extra_special_tokens": {},
|
| 49 |
+
"mask_token": "[MASK]",
|
| 50 |
+
"max_length": 128,
|
| 51 |
+
"model_max_length": 512,
|
| 52 |
+
"never_split": null,
|
| 53 |
+
"pad_to_multiple_of": null,
|
| 54 |
+
"pad_token": "[PAD]",
|
| 55 |
+
"pad_token_type_id": 0,
|
| 56 |
+
"padding_side": "right",
|
| 57 |
+
"sep_token": "[SEP]",
|
| 58 |
+
"stride": 0,
|
| 59 |
+
"strip_accents": null,
|
| 60 |
+
"tokenize_chinese_chars": true,
|
| 61 |
+
"tokenizer_class": "BertTokenizer",
|
| 62 |
+
"truncation_side": "right",
|
| 63 |
+
"truncation_strategy": "longest_first",
|
| 64 |
+
"unk_token": "[UNK]"
|
| 65 |
+
}
|
hf_space/onnx_model/vocab.txt
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
hf_space/processor_bert.py
ADDED
|
@@ -0,0 +1,216 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
processor_bert.py — ONNX Runtime powered BERT classifier
|
| 3 |
+
Speed: 82 logs/s → 2000+ logs/s
|
| 4 |
+
|
| 5 |
+
How it works:
|
| 6 |
+
1. ONNX Runtime: 3-5x faster than plain PyTorch
|
| 7 |
+
2. Batch processing: 64 logs are processed together
|
| 8 |
+
3. Pre-allocated buffers: no wasted memory
|
| 9 |
+
"""
|
| 10 |
+
from __future__ import annotations
|
| 11 |
+
import os
|
| 12 |
+
import numpy as np
|
| 13 |
+
import joblib
|
| 14 |
+
|
| 15 |
+
# ── Decide which backend to use (set when the models are loaded) ──────────────
|
| 16 |
+
_USE_ONNX = False
|
| 17 |
+
_embedding_model = None
|
| 18 |
+
_classifier = None
|
| 19 |
+
_ort_session = None
|
| 20 |
+
_ort_tokenizer = None
|
| 21 |
+
|
| 22 |
+
MODEL_PATH = os.path.join(os.path.dirname(__file__), 'models', 'log_classifier.joblib')
|
| 23 |
+
ONNX_DIR = os.path.join(os.path.dirname(__file__), 'models', 'onnx')
|
| 24 |
+
CONFIDENCE_THRESHOLD = 0.30
|
| 25 |
+
DEFAULT_BATCH = 64
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
def _load_models():
    """Load the classifier and one embedding backend on first use.

    Steps:
      1. Load the joblib classifier from ``MODEL_PATH``.
      2. If ``ONNX_DIR/model.onnx`` exists, try to build an ONNX Runtime
         session and its tokenizer. Any failure there is reported and the
         PyTorch backend is used instead.
      3. Without a usable ONNX backend, load the ``all-MiniLM-L6-v2``
         SentenceTransformer.

    Everything is loaded into locals first and the module globals are only
    assigned at the very end. ``_classifier`` doubles as the "already loaded"
    marker, so a failure half-way through (e.g. the embedding model cannot be
    loaded) leaves the module unloaded and the next call retries, instead of
    leaving a classifier without any embedding backend.

    Raises:
        FileNotFoundError: If the classifier file does not exist.
    """
    global _USE_ONNX, _embedding_model, _classifier, _ort_session, _ort_tokenizer

    if _classifier is not None:
        return  # Already loaded

    # Load the classifier.
    if not os.path.exists(MODEL_PATH):
        raise FileNotFoundError(
            f'Model nahi mila: {MODEL_PATH}\n'
            'Pehle Colab notebook run karo aur model download karo.'
        )
    # NOTE: joblib.load unpickles the file; only load model files you trust.
    classifier = joblib.load(MODEL_PATH)

    # Try ONNX first (fast); fall back to PyTorch otherwise.
    session = None
    tokenizer = None
    embedding_model = None
    onnx_model_file = os.path.join(ONNX_DIR, 'model.onnx')

    if os.path.exists(onnx_model_file):
        try:
            import onnxruntime as ort
            from transformers import AutoTokenizer

            # CPU-oriented session options.
            sess_opts = ort.SessionOptions()
            sess_opts.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL
            # os.cpu_count() may return None; fall back to a single thread.
            sess_opts.intra_op_num_threads = os.cpu_count() or 1
            sess_opts.execution_mode = ort.ExecutionMode.ORT_SEQUENTIAL

            session = ort.InferenceSession(
                onnx_model_file,
                sess_options=sess_opts,
                providers=['CPUExecutionProvider']
            )
            tokenizer = AutoTokenizer.from_pretrained(ONNX_DIR)
        except Exception as e:
            # Deliberate best effort: any ONNX problem means "use PyTorch".
            print(f'[BERT] ONNX load failed ({e}), fallback to PyTorch')
            session = None
            tokenizer = None

    use_onnx = session is not None and tokenizer is not None
    if use_onnx:
        print('[BERT] ✅ ONNX Runtime loaded — FAST MODE')
    else:
        from sentence_transformers import SentenceTransformer
        embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
        print('[BERT] ⚠️ PyTorch mode (install ONNX for 3-5x speedup)')

    # Commit the fully initialised state; the classifier goes last because it
    # is what the early-return guard above checks.
    _ort_session = session
    _ort_tokenizer = tokenizer
    _embedding_model = embedding_model
    _USE_ONNX = use_onnx
    _classifier = classifier
|
| 74 |
+
|
| 75 |
+
|
| 76 |
+
def _embed_onnx(texts: list[str]) -> np.ndarray:
    """Compute L2-normalised sentence embeddings with the ONNX session.

    The texts are tokenised (padded, truncated to 128 tokens), fed to the
    ONNX Runtime session, and the first session output is mean-pooled over
    the non-padding tokens before being L2-normalised.

    Requires ``_load_models()`` to have set ``_ort_session`` and
    ``_ort_tokenizer``.

    Args:
        texts: Batch of log messages.

    Returns:
        One embedding row per input text.
    """
    # No torch import here: the ONNX path works on NumPy arrays only, so it
    # must not require PyTorch to be installed.
    inputs = _ort_tokenizer(
        texts,
        padding=True,
        truncation=True,
        max_length=128,
        return_tensors='np'  # NumPy arrays can be passed to the session directly
    )

    ort_inputs = {
        'input_ids': inputs['input_ids'].astype(np.int64),
        'attention_mask': inputs['attention_mask'].astype(np.int64),
    }
    # Only pass token_type_ids when the exported graph declares that input.
    input_names = {node.name for node in _ort_session.get_inputs()}
    if 'token_type_ids' in input_names:
        ort_inputs['token_type_ids'] = inputs.get(
            'token_type_ids', np.zeros_like(inputs['input_ids'])
        ).astype(np.int64)

    outputs = _ort_session.run(None, ort_inputs)
    hidden = outputs[0]  # expected layout: (batch, seq_len, hidden)

    # Mean pooling weighted by the attention mask (padding contributes 0).
    mask = inputs['attention_mask'][:, :, None].astype(np.float32)
    summed = (hidden * mask).sum(axis=1)
    # Guard against a division by zero for a row without any attended token.
    counts = np.clip(mask.sum(axis=1), 1e-9, None)
    embeddings = summed / counts

    # L2 normalise; the epsilon avoids dividing by a zero norm.
    norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
    return embeddings / (norms + 1e-8)
|
| 110 |
+
|
| 111 |
+
|
| 112 |
+
def _embed_pytorch(texts: list[str]) -> np.ndarray:
    """Compute normalised embeddings with the SentenceTransformer fallback.

    Requires ``_load_models()`` to have set ``_embedding_model``.
    """
    encode_options = {
        "batch_size": DEFAULT_BATCH,
        "convert_to_numpy": True,
        "normalize_embeddings": True,
        "show_progress_bar": False,
    }
    return _embedding_model.encode(texts, **encode_options)
|
| 121 |
+
|
| 122 |
+
|
| 123 |
+
# ββ PUBLIC API ββββββββββββββββββββββββββββββββββββββββββββββ
|
| 124 |
+
|
| 125 |
+
def classify_with_bert(log_message: str) -> tuple[str, float]:
    """Classify a single log message.

    Thin wrapper that sends a one-element batch through ``classify_batch``.

    Returns:
        ``(label, confidence)`` for the message.
    """
    _load_models()
    return classify_batch([log_message])[0]
|
| 133 |
+
|
| 134 |
+
|
| 135 |
+
def classify_batch(log_messages: list[str]) -> list[tuple[str, float]]:
    """Classify many log messages at once (much faster than one by one).

    Messages are embedded and classified in chunks of ``DEFAULT_BATCH``.
    A prediction whose highest class probability is below
    ``CONFIDENCE_THRESHOLD`` is reported as ``'Unclassified'``.

    Returns:
        One ``(label, confidence)`` tuple per input message, in input order;
        an empty list for empty input.

    Example:
        results = classify_batch(['log1', 'log2', 'log3'])
        for label, conf in results:
            print(f'{label}: {conf:.1%}')
    """
    _load_models()

    if not log_messages:
        return []

    # Pick the embedding backend chosen while loading the models.
    embed = _embed_onnx if _USE_ONNX else _embed_pytorch
    classified: list[tuple[str, float]] = []

    for start in range(0, len(log_messages), DEFAULT_BATCH):
        chunk = log_messages[start:start + DEFAULT_BATCH]
        vectors = embed(chunk)

        probabilities = _classifier.predict_proba(vectors)
        top_probs = probabilities.max(axis=1)
        predicted = _classifier.predict(vectors)

        classified.extend(
            ('Unclassified' if prob < CONFIDENCE_THRESHOLD else str(name), float(prob))
            for name, prob in zip(predicted, top_probs)
        )

    return classified
|
| 174 |
+
|
| 175 |
+
|
| 176 |
+
def get_classes() -> list[str]:
    """Return the class labels known to the loaded classifier."""
    _load_models()
    known_labels = _classifier.classes_
    return list(known_labels)
|
| 180 |
+
|
| 181 |
+
|
| 182 |
+
def is_onnx_mode() -> bool:
    """Report whether embeddings are computed with ONNX Runtime.

    Loads the models first if that has not happened yet.
    """
    _load_models()
    return _USE_ONNX
|
| 186 |
+
|
| 187 |
+
|
| 188 |
+
# ββ TEST ββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 189 |
+
if __name__ == '__main__':
    # Manual check: classify a few sample logs one by one, report the active
    # backend, then measure batch throughput.
    import time

    test_logs = [
        'GET /v2/servers/detail HTTP/1.1 status: 404 len: 1583 time: 0.19',
        'System crashed due to driver errors when restarting the server',
        'Multiple login failures occurred on user 6454 account',
        'Admin access escalation detected for user 9429',
        'CPU usage at 98% for the last 10 minutes on node-7',
        'Backup completed successfully.',
        'User User123 logged in.',
        'Data replication task for shard 14 did not complete',
        'Hey bro chill ya!',  # should be Unclassified
    ]

    print('Single log test:')
    for log in test_logs:
        label, conf = classify_with_bert(log)
        print(f' [{conf:.0%}] {label:25s} | {log[:60]}')

    print(f'\nMode: {"ONNX 🚀" if is_onnx_mode() else "PyTorch"}')

    # Speed test on a repeated batch.
    big_batch = test_logs * 100
    t0 = time.perf_counter()
    classify_batch(big_batch)
    elapsed = time.perf_counter() - t0
    print(f'\nSpeed: {len(big_batch)/elapsed:.0f} logs/s ({elapsed*1000/len(big_batch):.1f}ms/log)')
|
hf_space/processor_llm.py
ADDED
|
@@ -0,0 +1,192 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
processor_llm.py — Tier 3: LLM-based Classifier
|
| 3 |
+
|
| 4 |
+
Used for:
|
| 5 |
+
- LegacyCRM logs (Workflow Error, Deprecation Warning)
|
| 6 |
+
- BERT fallback when confidence < threshold
|
| 7 |
+
|
| 8 |
+
Production hardening in V3:
|
| 9 |
+
- Timeout (configurable, default 5s)
|
| 10 |
+
- Retry with exponential backoff (max 2 retries)
|
| 11 |
+
- Explicit failure modes: returns "Unclassified" on all error paths
|
| 12 |
+
- Caching for repeated log patterns (hash-based, in-memory)
|
| 13 |
+
- Token budget enforcement (max_tokens=15)
|
| 14 |
+
"""
|
| 15 |
+
from __future__ import annotations
|
| 16 |
+
import os
|
| 17 |
+
import re
|
| 18 |
+
import time
|
| 19 |
+
import hashlib
|
| 20 |
+
import logging
|
| 21 |
+
from functools import lru_cache
|
| 22 |
+
from typing import Optional
|
| 23 |
+
|
| 24 |
+
logger = logging.getLogger(__name__)
|
| 25 |
+
|
| 26 |
+
# ββ Config βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 27 |
+
HF_TOKEN = os.getenv("HF_TOKEN")
|
| 28 |
+
LLM_MODEL = "mistralai/Mistral-7B-Instruct-v0.3"
|
| 29 |
+
|
| 30 |
+
VALID_CATEGORIES = ["Workflow Error", "Deprecation Warning"]
|
| 31 |
+
|
| 32 |
+
# Retry / timeout config
|
| 33 |
+
MAX_RETRIES = 2
|
| 34 |
+
RETRY_DELAY_SEC = 1.0 # doubles on each retry (exponential backoff)
|
| 35 |
+
REQUEST_TIMEOUT = 5 # seconds β fail fast, do not hang pipeline
|
| 36 |
+
|
| 37 |
+
# In-memory cache to avoid redundant LLM calls for repeated logs
|
| 38 |
+
_RESPONSE_CACHE: dict[str, str] = {}
|
| 39 |
+
MAX_CACHE_SIZE = 1000 # evict oldest when full (simple FIFO)
|
| 40 |
+
|
| 41 |
+
SYSTEM_PROMPT = (
|
| 42 |
+
"You are an enterprise log classifier. "
|
| 43 |
+
"Classify log messages into exactly one category. "
|
| 44 |
+
"Return ONLY the category name β no explanation, no punctuation."
|
| 45 |
+
)
|
| 46 |
+
|
| 47 |
+
FEW_SHOT_EXAMPLES = [
|
| 48 |
+
{
|
| 49 |
+
"log": "Case escalation for ticket ID 7324 failed because the assigned support agent is no longer active.",
|
| 50 |
+
"label": "Workflow Error",
|
| 51 |
+
},
|
| 52 |
+
{
|
| 53 |
+
"log": "The 'BulkEmailSender' feature is no longer supported. Use 'EmailCampaignManager' instead.",
|
| 54 |
+
"label": "Deprecation Warning",
|
| 55 |
+
},
|
| 56 |
+
{
|
| 57 |
+
"log": "Invoice generation aborted for order ID 8910 due to invalid tax calculation module.",
|
| 58 |
+
"label": "Workflow Error",
|
| 59 |
+
},
|
| 60 |
+
]
|
| 61 |
+
|
| 62 |
+
|
| 63 |
+
# ββ Cache helpers ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 64 |
+
def _cache_key(log_msg: str) -> str:
|
| 65 |
+
return hashlib.md5(log_msg.strip().encode()).hexdigest()
|
| 66 |
+
|
| 67 |
+
|
| 68 |
+
def _cache_get(log_msg: str) -> Optional[str]:
    """Return the cached label for ``log_msg``, or None when it is not cached."""
    key = _cache_key(log_msg)
    return _RESPONSE_CACHE.get(key)
|
| 70 |
+
|
| 71 |
+
|
| 72 |
+
def _cache_set(log_msg: str, label: str) -> None:
    """Store ``label`` for ``log_msg`` in the in-memory cache (FIFO eviction).

    When the cache is full and the key is new, the oldest inserted entry is
    dropped first. Overwriting an existing key never evicts anything, because
    it does not grow the cache.
    """
    key = _cache_key(log_msg)
    if key not in _RESPONSE_CACHE and len(_RESPONSE_CACHE) >= MAX_CACHE_SIZE:
        # dicts keep insertion order, so the first key is the oldest one.
        oldest = next(iter(_RESPONSE_CACHE))
        del _RESPONSE_CACHE[oldest]
    _RESPONSE_CACHE[key] = label
|
| 79 |
+
|
| 80 |
+
|
| 81 |
+
def get_cache_stats() -> dict:
    """Return the current number of cached entries and the cache capacity."""
    current_size = len(_RESPONSE_CACHE)
    return {"size": current_size, "max_size": MAX_CACHE_SIZE}
|
| 83 |
+
|
| 84 |
+
|
| 85 |
+
# ββ Prompt builder βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 86 |
+
def _build_messages(log_msg: str) -> list[dict]:
    """Build the chat messages (system + user) for one classification request.

    The user message lists the valid categories, then the few-shot examples,
    and ends with the log to classify followed by an open ``Category:`` line.
    """
    quoted = [f'"{c}"' for c in VALID_CATEGORIES]
    categories_str = ", ".join(quoted)

    parts = [
        f'Classify the following log into one of these categories: {categories_str}.\n',
        'If none fits, return "Unclassified".\n\n',
    ]
    parts.extend(
        f'Log: {ex["log"]}\nCategory: {ex["label"]}\n\n' for ex in FEW_SHOT_EXAMPLES
    )
    parts.append(f"Log: {log_msg}\nCategory:")

    system_message = {"role": "system", "content": SYSTEM_PROMPT}
    user_message = {"role": "user", "content": "".join(parts)}
    return [system_message, user_message]
|
| 100 |
+
|
| 101 |
+
|
| 102 |
+
# ββ Normalize raw LLM output βββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 103 |
+
def _normalize(raw: str) -> str:
    """Map raw LLM output to a valid category or 'Unclassified'.

    Surrounding whitespace and quotes are removed and the text is searched
    case-insensitively for each entry of ``VALID_CATEGORIES``; the first
    category found is returned. Empty or missing output (``None``) yields
    'Unclassified' instead of raising.
    """
    if not raw:
        return "Unclassified"
    # Lower-case once instead of once per category.
    cleaned = raw.strip().strip('"').strip("'").lower()
    for cat in VALID_CATEGORIES:
        if cat.lower() in cleaned:
            return cat
    return "Unclassified"
|
| 110 |
+
|
| 111 |
+
|
| 112 |
+
# ββ Main classify function ββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 113 |
+
def classify_with_llm(log_msg: str) -> str:
    """Classify one log message with the Tier 3 LLM.

    Behaviour:
      - In-memory cache: a message seen before is answered without an API call.
      - Without ``HF_TOKEN`` no request is made and "Unclassified" is returned.
      - Each request uses a timeout of ``REQUEST_TIMEOUT`` seconds.
      - A failed request is retried up to ``MAX_RETRIES`` times; the wait
        starts at ``RETRY_DELAY_SEC`` and doubles after every failure.
      - Every error path ends in "Unclassified"; failures are not cached.

    Latency: 500–2000ms on cache miss; ~0ms on cache hit.

    Args:
        log_msg: The log message to classify.

    Returns:
        One of ``VALID_CATEGORIES`` or "Unclassified".
    """
    # Cache hit: answer immediately.
    cached = _cache_get(log_msg)
    if cached is not None:
        logger.debug("[LLM] Cache hit for: %s", log_msg[:60])
        return cached

    if not HF_TOKEN:
        logger.warning("[LLM] HF_TOKEN not set — returning Unclassified")
        return "Unclassified"

    # Imported lazily so the module can be imported without huggingface_hub.
    from huggingface_hub import InferenceClient

    client = InferenceClient(token=HF_TOKEN, timeout=REQUEST_TIMEOUT)
    delay = RETRY_DELAY_SEC

    # One initial attempt plus MAX_RETRIES retries.
    for attempt in range(1, MAX_RETRIES + 2):
        try:
            response = client.chat.completions.create(
                model=LLM_MODEL,
                messages=_build_messages(log_msg),
                max_tokens=15,
                temperature=0.1,
            )
            # The content may be missing; treat that as empty output rather
            # than letting .strip() raise and trigger a pointless retry.
            raw = response.choices[0].message.content or ""
            label = _normalize(raw)

            _cache_set(log_msg, label)
            logger.debug("[LLM] Attempt %d: '%s' → '%s'", attempt, raw.strip(), label)
            return label

        except Exception as e:
            # Deliberately broad: any client/network error counts as a failed
            # attempt so the pipeline never crashes on the LLM tier.
            if attempt <= MAX_RETRIES:
                logger.warning(
                    "[LLM] Attempt %d failed (%s), retrying in %.1fs…", attempt, e, delay
                )
                time.sleep(delay)
                delay *= 2  # exponential backoff
            else:
                logger.error(
                    "[LLM] All %d attempts failed. Last error: %s", MAX_RETRIES + 1, e
                )

    return "Unclassified"
|
| 165 |
+
|
| 166 |
+
|
| 167 |
+
# ββ Batch classify (serial β LLM is already rate-limited) ββββββββββββββββββββ
|
| 168 |
+
def classify_batch_llm(log_msgs: list[str]) -> list[str]:
    """Classify several logs through the LLM, one after another.

    Calls are made sequentially (not in parallel) to respect rate limits.
    """
    return list(map(classify_with_llm, log_msgs))
|
| 171 |
+
|
| 172 |
+
|
| 173 |
+
# ββ CLI test βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 174 |
+
if __name__ == "__main__":
    # Manual check: classify three sample logs, then time a repeated call to
    # show the effect of the in-memory cache.
    logging.basicConfig(level=logging.INFO)

    test_logs = [
        "Case escalation for ticket ID 7324 failed because the assigned support agent is no longer active.",
        "The 'ReportGenerator' module will be retired in version 4.0. Migrate to 'AdvancedAnalyticsSuite'.",
        "System reboot initiated by user 12345.",  # should be Unclassified
    ]
    for log in test_logs:
        result = classify_with_llm(log)
        print(f"{result:25s} | {log[:80]}")

    # Cache hit test: the first sample was classified above.
    print("\n── Cache hit test ──")
    t0 = time.perf_counter()
    classify_with_llm(test_logs[0])
    t1 = time.perf_counter()
    print(f"Cache hit latency: {(t1-t0)*1000:.2f}ms")
    print(f"Cache stats: {get_cache_stats()}")
|
hf_space/processor_regex.py
ADDED
|
@@ -0,0 +1,220 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
processor_regex.py — Tier 1: Rule-based Classifier
|
| 3 |
+
|
| 4 |
+
Target coverage: 40%+ (up from 15%)
|
| 5 |
+
Latency: sub-millisecond per log
|
| 6 |
+
|
| 7 |
+
New pattern groups added:
|
| 8 |
+
- HTTP request/response logs (was completely missing!)
|
| 9 |
+
- Auth / credential events (login failures, MFA, lockouts)
|
| 10 |
+
- System/infra events (disk, CPU, memory, cron)
|
| 11 |
+
- Network / firewall events (IP block, port scan)
|
| 12 |
+
- Structured error codes (ERROR, CRITICAL prefix logs)
|
| 13 |
+
"""
|
| 14 |
+
from __future__ import annotations
|
| 15 |
+
import re
|
| 16 |
+
import time
|
| 17 |
+
from typing import Optional
|
| 18 |
+
|
| 19 |
+
# ---------------------------------------------------------------------------
|
| 20 |
+
# Pattern registry: (compiled_pattern, label)
|
| 21 |
+
# Order matters — more specific patterns FIRST to avoid mis-labeling.
|
| 22 |
+
# ---------------------------------------------------------------------------
|
| 23 |
+
# Raw (pattern, label) pairs. classify_with_regex returns the label of the
# FIRST pattern that matches, so the order of this list is part of the
# behaviour: narrower / higher-severity rules must precede generic ones.
_RAW_PATTERNS: list[tuple[str, str]] = [

    # ── HTTP Status ─────────────────────────────────────────────────────────
    # Covers: GET/POST/PUT/DELETE/PATCH + status code in request line
    (r"\b(GET|POST|PUT|DELETE|PATCH|HEAD|OPTIONS)\s+\S+\s+HTTP/\d", "HTTP Status"),
    # Nova / OpenStack style
    (r"nova\.\S+\s+(GET|POST|PUT|DELETE)\s+\S+\s+HTTP/\d", "HTTP Status"),
    # Status code only style: "returned HTTP 200" or "status: 404"
    (r"\bstatus[:\s]+\d{3}\b", "HTTP Status"),
    (r"\breturned\s+HTTP\s+\d{3}\b", "HTTP Status"),
    (r"\bHTTP\s+status\s+code\s*[:-]?\s*\d{3}\b", "HTTP Status"),
    # API response style
    (r"\bAPI\s+(call|request)\s+\S+\s+completed\s+with\s+status\s+\d{3}", "HTTP Status"),
    (r"\bEndpoint\s+\S+\s+responded\s+with\s+code\s+\d{3}", "HTTP Status"),

    # ── Security Alert ──────────────────────────────────────────────────────
    # Brute force / login failures
    (r"(multiple\s+)?(bad\s+|failed?\s+)?login\s+(failure|attempt|failures)", "Security Alert"),
    (r"brute[\s_-]force\s+(login|attack|attempt)", "Security Alert"),
    # Unauthorized access
    (r"unauthorized\s+(access|admin|privilege|attempt)", "Security Alert"),
    (r"access\s+denied\s+(for|to)\s+(user|ip|host)", "Security Alert"),
    # Privilege escalation
    (r"(admin\s+)?access\s+escalation\s+detected", "Security Alert"),
    (r"privilege\s+(elev|escalat)", "Security Alert"),
    # IP blocking / suspicious traffic
    (r"IP\s+\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}\s+blocked", "Security Alert"),
    (r"(suspicious|anomalous)\s+(login|traffic|activity|request)", "Security Alert"),
    (r"potential\s+(DDoS|attack|breach|intrusion)", "Security Alert"),
    (r"security\s+breach\s+suspected", "Security Alert"),
    (r"(API\s+security\s+breach|bypass\s+API\s+security)", "Security Alert"),
    (r"port\s+scan\s+(detected|attempt)", "Security Alert"),

    # ── User Action ─────────────────────────────────────────────────────────
    (r"User\s+\w+\d*\s+logged\s+(in|out)", "User Action"),
    (r"Account\s+(with\s+)?ID\s+\S+\s+created\s+by", "User Action"),
    (r"User\s+\w+\d*\s+(updated\s+profile|changed\s+password|enabled\s+two|downloaded|exported)", "User Action"),
    (r"(New\s+user|user\s+\w+\d*)\s+registered", "User Action"),
    (r"Account\s+\S+\s+deleted\s+by\s+(administrator|admin)", "User Action"),
    (r"User\s+\w+\d*\s+(tried|attempted)", "User Action"),

    # ── System Notification ─────────────────────────────────────────────────
    # Backup events
    (r"Backup\s+(started|ended|completed\s+successfully|failed|aborted)", "System Notification"),
    (r"System\s+updated\s+to\s+version", "System Notification"),
    (r"File\s+\S+\s+uploaded\s+successfully\s+by\s+user", "System Notification"),
    (r"Disk\s+cleanup\s+completed\s+successfully", "System Notification"),
    (r"System\s+reboot\s+initiated\s+by\s+user", "System Notification"),
    (r"Scheduled\s+maintenance\s+(started|completed)", "System Notification"),
    (r"Service\s+\w+\s+restarted\s+successfully", "System Notification"),
    # NEW: cache, cron, health check, cert, log rotation
    (r"Cache\s+cleared\s+successfully", "System Notification"),
    (r"Log\s+rotation\s+completed", "System Notification"),
    (r"Health\s+check\s+(passed|failed)\s+for\s+service", "System Notification"),
    # Only the success verbs are followed by "successfully"; "expired",
    # "revoked" and "failed" must match on their own, otherwise those
    # alternatives could only ever hit "... failed successfully".
    (r"Certificate\s+(renewed\s+successfully|expired|revoked)", "System Notification"),
    (r"Cron\s+job\s+\S+\s+((executed|completed)\s+successfully|failed)", "System Notification"),
    (r"(Disk|Storage)\s+(usage|space)\s+(at|reached|exceeded)\s+\d+%", "System Notification"),
    (r"CPU\s+usage\s+at\s+\d+%", "System Notification"),
    (r"Memory\s+(usage|limit)\s+(at|reached|exceeded)\s+\d+%", "System Notification"),
    # Deployment / config
    (r"Deployment\s+(of|for)\s+\S+\s+(completed|failed|started)", "System Notification"),
    (r"Configuration\s+(reloaded|updated|applied)\s+successfully", "System Notification"),

    # ── Critical Error ──────────────────────────────────────────────────────
    # Listed BEFORE the generic "Error" group: a line such as
    # "CRITICAL ERROR: ... failed" also satisfies the broad ERROR rule below,
    # and first-match-wins would otherwise downgrade it to "Error".
    (r"\bCRITICAL\b", "Critical Error"),
    # Leading \b keeps this from firing inside words such as "nonfatal".
    (r"\b(FATAL|PANIC)\b", "Critical Error"),
    (r"(data\s+loss|data\s+corruption)\s+(detected|occurred)", "Critical Error"),
    (r"(cluster|node|shard)\s+(failure|crashed|went\s+down)", "Critical Error"),
    (r"(catastrophic|unrecoverable)\s+(failure|error)", "Critical Error"),
    (r"kernel\s+panic", "Critical Error"),
    (r"out[\s-]of[\s-](memory|disk)\s+(error|killed|OOM)", "Critical Error"),

    # ── Error ───────────────────────────────────────────────────────────────
    (r"\bERROR\b.*\b(exception|failed|failure|crash|timeout|unavailable)\b", "Error"),
    (r"System\s+crashed\s+due\s+to", "Error"),
    (r"(connection|request|task|job)\s+(timed?\s*out|timeout)", "Error"),
    (r"service\s+\S+\s+(is\s+down|unavailable|unreachable)", "Error"),
    (r"database\s+connection\s+(failed|refused|lost|dropped)", "Error"),
    (r"disk\s+(I/O\s+)?failure", "Error"),
    (r"driver\s+error(s)?\s+(when|during|on)", "Error"),
    (r"(replication|sync)\s+task\s+(did\s+not\s+complete|failed)", "Error"),
    (r"null\s+pointer|segmentation\s+fault|stack\s+overflow", "Error"),
]

# Pre-compile all patterns at import time (not per-call).
# Every pattern is compiled case-insensitively; list order mirrors
# _RAW_PATTERNS so first-match priority is preserved.
REGEX_PATTERNS: list[tuple[re.Pattern, str]] = [
    (re.compile(pat, re.IGNORECASE), label)
    for pat, label in _RAW_PATTERNS
]
|
| 112 |
+
|
| 113 |
+
|
| 114 |
+
def classify_with_regex(log_message: str) -> Optional[str]:
    """
    Tier 1: rule-based classification of a single log line.

    Walks REGEX_PATTERNS in registry order and returns the label attached to
    the first pattern that is found anywhere in ``log_message``. Returns
    None when no pattern matches.
    """
    # Lazy generator: scanning stops at the first hit, exactly like an
    # early-returning loop.
    hits = (tag for rx, tag in REGEX_PATTERNS if rx.search(log_message))
    return next(hits, None)
|
| 124 |
+
|
| 125 |
+
|
| 126 |
+
def get_regex_coverage(log_messages: list[str]) -> dict:
    """
    Summarise how much of ``log_messages`` the regex tier can label.

    Returns a dict with the number of messages seen ("total"), how many got a
    label ("matched") or did not ("missed"), the matched share as a percentage
    rounded to two decimals ("coverage_pct", 0.0 for empty input), and a
    per-label hit count ("label_breakdown").
    """
    per_label: dict[str, int] = {}
    unmatched = 0

    for entry in log_messages:
        tag = classify_with_regex(entry)
        if not tag:
            unmatched += 1
            continue
        per_label[tag] = per_label.get(tag, 0) + 1

    n_total = len(log_messages)
    n_hit = n_total - unmatched

    # Guard the division so an empty batch reports 0.0 instead of raising.
    if n_total:
        pct = round(n_hit / n_total * 100, 2)
    else:
        pct = 0.0

    return {
        "total": n_total,
        "matched": n_hit,
        "missed": unmatched,
        "coverage_pct": pct,
        "label_breakdown": per_label,
    }
|
| 148 |
+
|
| 149 |
+
|
| 150 |
+
def benchmark_regex(log_messages: list[str], runs: int = 3) -> dict:
    """
    Measure regex tier latency (p50 / p95 / p99 / mean) over multiple runs.

    Each message is classified ``runs`` times and every individual call is
    timed with ``time.perf_counter``. The result maps "p50_ms", "p95_ms",
    "p99_ms" and "mean_ms" to milliseconds rounded to four decimals.

    When nothing was timed (empty ``log_messages`` or ``runs <= 0``) all four
    metrics are reported as 0.0 instead of raising
    ``statistics.StatisticsError``, matching get_regex_coverage, which also
    returns 0.0 for an empty batch.
    """
    import statistics

    per_log_ms: list[float] = []

    for _ in range(runs):
        for msg in log_messages:
            t0 = time.perf_counter()
            classify_with_regex(msg)
            per_log_ms.append((time.perf_counter() - t0) * 1000)

    if not per_log_ms:
        # No samples: median/mean are undefined and the percentile index
        # lookups would fail, so return a well-formed zero result.
        return {"p50_ms": 0.0, "p95_ms": 0.0, "p99_ms": 0.0, "mean_ms": 0.0}

    per_log_ms.sort()
    sample_count = len(per_log_ms)
    # int(n * q) with q < 1 is always <= n - 1, so the index stays in range
    # for any non-empty sample.
    return {
        "p50_ms": round(statistics.median(per_log_ms), 4),
        "p95_ms": round(per_log_ms[int(sample_count * 0.95)], 4),
        "p99_ms": round(per_log_ms[int(sample_count * 0.99)], 4),
        "mean_ms": round(statistics.mean(per_log_ms), 4),
    }
|
| 168 |
+
|
| 169 |
+
|
| 170 |
+
# ── CLI self-test ────────────────────────────────────────────────────────────
|
| 171 |
+
if __name__ == "__main__":
    # Self-test: (log line, expected label) pairs; None means the regex tier
    # is expected to leave the line unmatched.
    test_cases: list[tuple[str, Optional[str]]] = [
        # HTTP
        ("GET /api/v2/resource HTTP/1.1 status: 200 len: 1583 time: 0.19", "HTTP Status"),
        ("POST /v1/users HTTP/1.1 status: 201 len: 42 time: 0.05", "HTTP Status"),
        ("nova.osapi_compute.wsgi.server GET /v2/servers/detail HTTP/1.1 status: 404", "HTTP Status"),
        # Security
        ("Multiple login failures occurred on user 6454 account", "Security Alert"),
        ("IP 192.168.133.114 blocked due to potential attack", "Security Alert"),
        ("Brute force login attempt from 10.0.0.5 detected", "Security Alert"),
        ("Admin access escalation detected for user 9429", "Security Alert"),
        # User Action
        ("User User12345 logged in.", "User Action"),
        ("Account with ID 456 created by Admin.", "User Action"),
        # System Notification
        ("Backup completed successfully.", "System Notification"),
        ("CPU usage at 98% for the last 10 minutes on node-7", "System Notification"),
        ("Health check passed for service payments-api", "System Notification"),
        # Error
        ("System crashed due to disk I/O failure on node-3", "Error"),
        ("Database connection failed after 3 retries", "Error"),
        # Critical
        ("CRITICAL: data corruption detected on shard-14", "Critical Error"),
        ("kernel panic: not syncing: VFS: unable to mount root fs", "Critical Error"),
        # Should be None (unmatched)
        ("The 'BulkEmailSender' feature will be deprecated in v5.0.", None),
        ("Case escalation for ticket 7324 failed.", None),
    ]

    correct = 0
    print(f"{'Expected':<22} {'Got':<22} {'✓/✗'} | Log")
    print("─" * 100)
    for log, expected in test_cases:
        got = classify_with_regex(log)
        ok = got == expected
        correct += ok  # bool counts as 0/1
        # Distinct pass/fail markers so a failing row is visible at a glance.
        icon = "✓" if ok else "✗"
        print(f"{str(expected):<22} {str(got):<22} {icon}   | {log[:55]}")

    print(f"\n{correct}/{len(test_cases)} correct")

    # Coverage demo
    all_logs = [log for log, _ in test_cases]
    cov = get_regex_coverage(all_logs)
    print(f"\nCoverage: {cov['coverage_pct']}% ({cov['matched']}/{cov['total']} matched)")
    print("Label breakdown:", cov["label_breakdown"])

    # Latency benchmark
    lat = benchmark_regex(all_logs * 100)
    print(f"\nLatency (p50/p95/p99): {lat['p50_ms']}ms / {lat['p95_ms']}ms / {lat['p99_ms']}ms")
|
hf_space/requirements.txt
ADDED
|
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Core
|
| 2 |
+
gradio>=4.44.0
|
| 3 |
+
pandas>=2.0.0
|
| 4 |
+
numpy>=1.24.0
|
| 5 |
+
joblib>=1.3.0
|
| 6 |
+
scikit-learn>=1.3.0
|
| 7 |
+
|
| 8 |
+
# Embedding + BERT
|
| 9 |
+
sentence-transformers>=2.7.0
|
| 10 |
+
transformers>=4.38.0
|
| 11 |
+
|
| 12 |
+
# ONNX (optional, 3-5x speedup)
|
| 13 |
+
onnxruntime>=1.17.0
|
| 14 |
+
optimum[onnxruntime]>=1.16.0
|
| 15 |
+
|
| 16 |
+
# LLM
|
| 17 |
+
huggingface-hub>=0.21.0
|
| 18 |
+
|
| 19 |
+
# FastAPI (production API)
|
| 20 |
+
fastapi>=0.110.0
|
| 21 |
+
uvicorn[standard]>=0.29.0
|
| 22 |
+
pydantic>=2.0.0
|
| 23 |
+
|
| 24 |
+
# Observability
|
| 25 |
+
psutil>=5.9.0
|