NOT-OMEGA committed on
Commit
b5d6b20
·
verified ·
1 Parent(s): 238ab41

Upload 13 files

Browse files
hf_space/Dockerfile ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ FROM python:3.11-slim
2
+
3
+ RUN apt-get update && apt-get install -y --no-install-recommends \
4
+ curl \
5
+ && rm -rf /var/lib/apt/lists/*
6
+
7
+ WORKDIR /app
8
+
9
+ COPY requirements.txt .
10
+ RUN pip install --no-cache-dir -r requirements.txt
11
+
12
+ COPY . .
13
+
14
+ RUN useradd -m -u 1000 appuser \
15
+ && chown -R appuser:appuser /app
16
+ USER appuser
17
+
18
+ EXPOSE 7860
19
+
20
+ CMD ["python", "app_gradio.py"]
hf_space/app_gradio.py ADDED
@@ -0,0 +1,542 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Log Classification System — HuggingFace Spaces
3
+ Ultra-modern 3D UI with custom CSS
4
+ """
5
+ from __future__ import annotations
6
+ import io
7
+ import time
8
+ import pandas as pd
9
+ import gradio as gr
10
+ from classify import classify_log, classify_csv
11
+
12
+ SOURCES = [
13
+ "ModernCRM", "ModernHR", "BillingSystem",
14
+ "AnalyticsEngine", "ThirdPartyAPI", "LegacyCRM",
15
+ ]
16
+
17
+ TIER_COLORS = {
18
+ "Regex": "🟒",
19
+ "BERT": "πŸ”΅",
20
+ "LLM": "🟑",
21
+ "LLM (fallback)": "🟠",
22
+ }
23
+
24
+ EXAMPLE_LOGS = [
25
+ ["ModernCRM", "User User12345 logged in."],
26
+ ["ModernHR", "Multiple login failures occurred on user 6454 account"],
27
+ ["BillingSystem", "GET /v2/servers/detail HTTP/1.1 status: 200 len: 1583 time: 0.19"],
28
+ ["AnalyticsEngine", "System crashed due to disk I/O failure on node-3"],
29
+ ["LegacyCRM", "Case escalation for ticket ID 7324 failed β€” support agent is no longer active."],
30
+ ["LegacyCRM", "The 'BulkEmailSender' feature will be deprecated in v5.0. Use 'EmailCampaignManager'."],
31
+ ]
32
+
33
+ # ── Custom CSS β€” 3D Modern Dark Theme ──────────────────────────────────────
34
+ CUSTOM_CSS = """
35
+ @import url('https://fonts.googleapis.com/css2?family=Rajdhani:wght@400;500;600;700&family=Share+Tech+Mono&family=Exo+2:wght@300;400;600;700&display=swap');
36
+
37
+ :root {
38
+ --bg-primary: #050810;
39
+ --bg-secondary: #0a0f1e;
40
+ --bg-card: #0d1425;
41
+ --bg-card-hover: #111a30;
42
+ --accent-cyan: #00d4ff;
43
+ --accent-blue: #0066ff;
44
+ --accent-purple: #7c3aed;
45
+ --accent-green: #00ff88;
46
+ --accent-orange: #ff6b00;
47
+ --text-primary: #e2e8f0;
48
+ --text-secondary: #94a3b8;
49
+ --text-muted: #475569;
50
+ --border-glow: rgba(0, 212, 255, 0.3);
51
+ --shadow-3d: 0 20px 60px rgba(0, 0, 0, 0.8), 0 0 40px rgba(0, 102, 255, 0.15);
52
+ --glow-cyan: 0 0 20px rgba(0, 212, 255, 0.4), 0 0 40px rgba(0, 212, 255, 0.2);
53
+ --glow-blue: 0 0 20px rgba(0, 102, 255, 0.4);
54
+ }
55
+
56
+ /* ── Base ── */
57
+ body, .gradio-container {
58
+ background: var(--bg-primary) !important;
59
+ font-family: 'Exo 2', sans-serif !important;
60
+ color: var(--text-primary) !important;
61
+ }
62
+
63
+ .gradio-container {
64
+ background:
65
+ radial-gradient(ellipse at 20% 20%, rgba(0, 102, 255, 0.08) 0%, transparent 50%),
66
+ radial-gradient(ellipse at 80% 80%, rgba(124, 58, 237, 0.08) 0%, transparent 50%),
67
+ radial-gradient(ellipse at 50% 50%, rgba(0, 212, 255, 0.03) 0%, transparent 70%),
68
+ var(--bg-primary) !important;
69
+ min-height: 100vh;
70
+ }
71
+
72
+ /* ── Header ── */
73
+ .main-header {
74
+ text-align: center;
75
+ padding: 48px 24px 32px;
76
+ position: relative;
77
+ }
78
+
79
+ .main-header::before {
80
+ content: '';
81
+ position: absolute;
82
+ top: 0; left: 50%;
83
+ transform: translateX(-50%);
84
+ width: 600px; height: 2px;
85
+ background: linear-gradient(90deg, transparent, var(--accent-cyan), var(--accent-blue), transparent);
86
+ box-shadow: var(--glow-cyan);
87
+ }
88
+
89
+ /* ── Tab Navigation ── */
90
+ .tab-nav {
91
+ background: rgba(13, 20, 37, 0.8) !important;
92
+ border: 1px solid rgba(0, 212, 255, 0.15) !important;
93
+ border-radius: 16px !important;
94
+ padding: 6px !important;
95
+ backdrop-filter: blur(20px) !important;
96
+ box-shadow: var(--shadow-3d) !important;
97
+ }
98
+
99
+ .tab-nav button {
100
+ font-family: 'Rajdhani', sans-serif !important;
101
+ font-weight: 600 !important;
102
+ font-size: 14px !important;
103
+ letter-spacing: 1.5px !important;
104
+ text-transform: uppercase !important;
105
+ color: var(--text-secondary) !important;
106
+ background: transparent !important;
107
+ border: none !important;
108
+ border-radius: 10px !important;
109
+ padding: 12px 24px !important;
110
+ transition: all 0.3s cubic-bezier(0.4, 0, 0.2, 1) !important;
111
+ }
112
+
113
+ .tab-nav button.selected {
114
+ color: var(--accent-cyan) !important;
115
+ background: linear-gradient(135deg, rgba(0, 212, 255, 0.1), rgba(0, 102, 255, 0.1)) !important;
116
+ box-shadow: 0 0 20px rgba(0, 212, 255, 0.2), inset 0 1px 0 rgba(0, 212, 255, 0.3) !important;
117
+ border: 1px solid rgba(0, 212, 255, 0.3) !important;
118
+ }
119
+
120
+ /* ── Cards / Blocks ── */
121
+ .gradio-group, .gr-group {
122
+ background: var(--bg-card) !important;
123
+ border: 1px solid rgba(0, 212, 255, 0.1) !important;
124
+ border-radius: 20px !important;
125
+ box-shadow: var(--shadow-3d), inset 0 1px 0 rgba(255,255,255,0.03) !important;
126
+ transition: all 0.4s ease !important;
127
+ transform: perspective(1000px) rotateX(0deg);
128
+ position: relative;
129
+ overflow: hidden;
130
+ }
131
+
132
+ .gradio-group::before {
133
+ content: '';
134
+ position: absolute;
135
+ top: 0; left: 0; right: 0;
136
+ height: 1px;
137
+ background: linear-gradient(90deg, transparent, rgba(0, 212, 255, 0.5), transparent);
138
+ }
139
+
140
+ .gradio-group:hover {
141
+ border-color: rgba(0, 212, 255, 0.25) !important;
142
+ box-shadow: var(--shadow-3d), var(--glow-cyan) !important;
143
+ transform: perspective(1000px) translateY(-4px) !important;
144
+ }
145
+
146
+ /* ── Labels ── */
147
+ label span, .gr-label {
148
+ font-family: 'Rajdhani', sans-serif !important;
149
+ font-weight: 600 !important;
150
+ letter-spacing: 1.5px !important;
151
+ text-transform: uppercase !important;
152
+ font-size: 11px !important;
153
+ color: var(--accent-cyan) !important;
154
+ opacity: 0.85;
155
+ }
156
+
157
+ /* ── Inputs ── */
158
+ input, textarea, select, .gr-input {
159
+ background: rgba(5, 8, 16, 0.8) !important;
160
+ border: 1px solid rgba(0, 212, 255, 0.15) !important;
161
+ border-radius: 12px !important;
162
+ color: var(--text-primary) !important;
163
+ font-family: 'Share Tech Mono', monospace !important;
164
+ font-size: 13px !important;
165
+ transition: all 0.3s ease !important;
166
+ padding: 12px 16px !important;
167
+ }
168
+
169
+ input:focus, textarea:focus {
170
+ border-color: var(--accent-cyan) !important;
171
+ box-shadow: 0 0 0 3px rgba(0, 212, 255, 0.1), var(--glow-cyan) !important;
172
+ outline: none !important;
173
+ background: rgba(0, 212, 255, 0.03) !important;
174
+ }
175
+
176
+ /* ── Dropdown ── */
177
+ .gr-dropdown select, .gradio-dropdown {
178
+ background: rgba(5, 8, 16, 0.9) !important;
179
+ border: 1px solid rgba(0, 212, 255, 0.2) !important;
180
+ border-radius: 12px !important;
181
+ color: var(--accent-cyan) !important;
182
+ font-family: 'Rajdhani', sans-serif !important;
183
+ font-weight: 600 !important;
184
+ }
185
+
186
+ /* ── Primary Button ── */
187
+ button.primary, .gr-button-primary, button[variant="primary"] {
188
+ font-family: 'Rajdhani', sans-serif !important;
189
+ font-weight: 700 !important;
190
+ font-size: 15px !important;
191
+ letter-spacing: 2px !important;
192
+ text-transform: uppercase !important;
193
+ background: linear-gradient(135deg, #0066ff 0%, #00d4ff 50%, #0066ff 100%) !important;
194
+ background-size: 200% 200% !important;
195
+ border: none !important;
196
+ border-radius: 12px !important;
197
+ padding: 14px 32px !important;
198
+ color: #fff !important;
199
+ box-shadow:
200
+ 0 8px 32px rgba(0, 102, 255, 0.4),
201
+ 0 2px 8px rgba(0, 0, 0, 0.5),
202
+ inset 0 1px 0 rgba(255,255,255,0.2) !important;
203
+ transition: all 0.3s cubic-bezier(0.4, 0, 0.2, 1) !important;
204
+ animation: gradientShift 3s ease infinite !important;
205
+ position: relative !important;
206
+ overflow: hidden !important;
207
+ }
208
+
209
+ button.primary::before {
210
+ content: '';
211
+ position: absolute;
212
+ top: -50%; left: -60%;
213
+ width: 40%; height: 200%;
214
+ background: rgba(255,255,255,0.1);
215
+ transform: skewX(-20deg);
216
+ transition: left 0.6s ease;
217
+ }
218
+
219
+ button.primary:hover::before {
220
+ left: 120%;
221
+ }
222
+
223
+ button.primary:hover {
224
+ transform: translateY(-3px) scale(1.02) !important;
225
+ box-shadow:
226
+ 0 16px 48px rgba(0, 102, 255, 0.5),
227
+ 0 0 30px rgba(0, 212, 255, 0.3),
228
+ inset 0 1px 0 rgba(255,255,255,0.3) !important;
229
+ }
230
+
231
+ button.primary:active {
232
+ transform: translateY(0px) scale(0.98) !important;
233
+ }
234
+
235
+ @keyframes gradientShift {
236
+ 0%, 100% { background-position: 0% 50%; }
237
+ 50% { background-position: 100% 50%; }
238
+ }
239
+
240
+ /* ── Output Textboxes β€” 3D Result Cards ── */
241
+ .output-card input, .output-card textarea {
242
+ background: linear-gradient(135deg, rgba(0, 212, 255, 0.05), rgba(0, 102, 255, 0.05)) !important;
243
+ border: 1px solid rgba(0, 212, 255, 0.2) !important;
244
+ border-radius: 14px !important;
245
+ font-family: 'Share Tech Mono', monospace !important;
246
+ font-size: 16px !important;
247
+ font-weight: bold !important;
248
+ color: var(--accent-cyan) !important;
249
+ text-align: center !important;
250
+ box-shadow: inset 0 2px 8px rgba(0,0,0,0.3), 0 0 20px rgba(0, 212, 255, 0.1) !important;
251
+ }
252
+
253
+ /* ── Table / DataFrame ── */
254
+ table {
255
+ border-collapse: separate !important;
256
+ border-spacing: 0 4px !important;
257
+ font-family: 'Share Tech Mono', monospace !important;
258
+ font-size: 12px !important;
259
+ }
260
+
261
+ th {
262
+ background: rgba(0, 102, 255, 0.2) !important;
263
+ color: var(--accent-cyan) !important;
264
+ font-family: 'Rajdhani', sans-serif !important;
265
+ letter-spacing: 1.5px !important;
266
+ text-transform: uppercase !important;
267
+ font-size: 11px !important;
268
+ padding: 10px 16px !important;
269
+ border: none !important;
270
+ }
271
+
272
+ td {
273
+ background: rgba(13, 20, 37, 0.6) !important;
274
+ color: var(--text-secondary) !important;
275
+ padding: 8px 16px !important;
276
+ border: none !important;
277
+ border-top: 1px solid rgba(0, 212, 255, 0.05) !important;
278
+ transition: background 0.2s ease !important;
279
+ }
280
+
281
+ tr:hover td {
282
+ background: rgba(0, 212, 255, 0.05) !important;
283
+ color: var(--text-primary) !important;
284
+ }
285
+
286
+ /* ── Markdown ── */
287
+ .prose, .markdown {
288
+ color: var(--text-secondary) !important;
289
+ font-family: 'Exo 2', sans-serif !important;
290
+ }
291
+
292
+ .prose h1, .markdown h1 {
293
+ font-family: 'Rajdhani', sans-serif !important;
294
+ font-size: 3rem !important;
295
+ font-weight: 700 !important;
296
+ letter-spacing: 3px !important;
297
+ text-transform: uppercase !important;
298
+ background: linear-gradient(135deg, #ffffff 0%, var(--accent-cyan) 40%, var(--accent-blue) 100%) !important;
299
+ -webkit-background-clip: text !important;
300
+ -webkit-text-fill-color: transparent !important;
301
+ background-clip: text !important;
302
+ filter: drop-shadow(0 0 30px rgba(0, 212, 255, 0.3)) !important;
303
+ margin-bottom: 8px !important;
304
+ }
305
+
306
+ .prose h2, .markdown h2 {
307
+ font-family: 'Rajdhani', sans-serif !important;
308
+ font-size: 1.4rem !important;
309
+ font-weight: 600 !important;
310
+ letter-spacing: 2px !important;
311
+ color: var(--accent-cyan) !important;
312
+ text-transform: uppercase !important;
313
+ border-bottom: 1px solid rgba(0, 212, 255, 0.2) !important;
314
+ padding-bottom: 8px !important;
315
+ }
316
+
317
+ .prose p, .markdown p {
318
+ color: var(--text-secondary) !important;
319
+ line-height: 1.7 !important;
320
+ font-size: 14px !important;
321
+ }
322
+
323
+ .prose strong, .markdown strong {
324
+ color: var(--accent-cyan) !important;
325
+ }
326
+
327
+ /* ── Code blocks ── */
328
+ code, pre {
329
+ font-family: 'Share Tech Mono', monospace !important;
330
+ background: rgba(0, 212, 255, 0.05) !important;
331
+ border: 1px solid rgba(0, 212, 255, 0.15) !important;
332
+ border-radius: 8px !important;
333
+ color: var(--accent-cyan) !important;
334
+ font-size: 12px !important;
335
+ }
336
+
337
+ /* ── Examples Table ── */
338
+ .examples {
339
+ background: var(--bg-card) !important;
340
+ border: 1px solid rgba(0, 212, 255, 0.1) !important;
341
+ border-radius: 14px !important;
342
+ overflow: hidden !important;
343
+ }
344
+
345
+ .examples table th {
346
+ background: rgba(0, 102, 255, 0.15) !important;
347
+ }
348
+
349
+ /* ── File Upload ── */
350
+ .gr-file {
351
+ background: rgba(5, 8, 16, 0.8) !important;
352
+ border: 2px dashed rgba(0, 212, 255, 0.25) !important;
353
+ border-radius: 16px !important;
354
+ transition: all 0.3s ease !important;
355
+ }
356
+
357
+ .gr-file:hover {
358
+ border-color: var(--accent-cyan) !important;
359
+ background: rgba(0, 212, 255, 0.03) !important;
360
+ box-shadow: var(--glow-cyan) !important;
361
+ }
362
+
363
+ /* ── Scrollbar ── */
364
+ ::-webkit-scrollbar { width: 6px; height: 6px; }
365
+ ::-webkit-scrollbar-track { background: var(--bg-secondary); }
366
+ ::-webkit-scrollbar-thumb {
367
+ background: linear-gradient(var(--accent-blue), var(--accent-cyan));
368
+ border-radius: 3px;
369
+ }
370
+
371
+ /* ── Pulsing accent line ── */
372
+ @keyframes pulse-glow {
373
+ 0%, 100% { opacity: 0.4; box-shadow: 0 0 10px rgba(0,212,255,0.3); }
374
+ 50% { opacity: 1; box-shadow: 0 0 30px rgba(0,212,255,0.8); }
375
+ }
376
+
377
+ /* ── Tier badge colors ── */
378
+ .tier-regex { color: #00ff88 !important; }
379
+ .tier-bert { color: #00d4ff !important; }
380
+ .tier-llm { color: #ffd700 !important; }
381
+ """
382
+
383
+ # ── Functions ───────────────────────────────────────────────────────────────
384
def classify_single(source: str, log_message: str):
    """Classify one log message and format the outcome for the UI.

    Args:
        source: Name of the originating system; passed through unchanged to
            ``classify_log``.
        log_message: Raw log text. ``None``, empty, or whitespace-only input
            short-circuits to placeholder values without calling the
            classifier.

    Returns:
        A 4-tuple of display strings: predicted label, tier (prefixed with
        its icon from ``TIER_COLORS``), confidence, and wall-clock latency
        of the ``classify_log`` call in milliseconds.
    """
    # Calling .strip() on None raises AttributeError, so treat a missing
    # message exactly like a blank one.
    if not log_message or not log_message.strip():
        return "β€”", "β€”", "β€”", "β€”"

    t0 = time.perf_counter()
    result = classify_log(source, log_message)
    latency_ms = (time.perf_counter() - t0) * 1000

    label = result["label"]
    tier = result["tier"]
    raw_confidence = result["confidence"]
    # A confidence of None is rendered as "N/A" rather than a percentage.
    confidence = f"{raw_confidence:.1%}" if raw_confidence is not None else "N/A"
    icon = TIER_COLORS.get(tier, "βšͺ")
    return label, f"{icon} {tier}", confidence, f"{latency_ms:.1f} ms"
395
+
396
+
397
def classify_batch(file):
    """Classify every row of an uploaded CSV and summarise the outcome.

    Args:
        file: The upload handed over by the file component. Either ``None``
            (nothing uploaded), a path (``str`` / ``os.PathLike``), or an
            object exposing the path as ``.name``.

    Returns:
        ``(output_path, stats_text)`` on success, or ``(None, message)`` when
        no file was supplied or classification failed.
    """
    if file is None:
        return None, "⚠️ Please upload a CSV file."

    # Function-scope imports keep this block self-contained.
    import contextlib
    import os
    import tempfile

    # Accept both a plain path and a file wrapper with a ``.name`` attribute.
    input_path = file if isinstance(file, (str, os.PathLike)) else file.name

    # A unique output file per request: a fixed shared path would let two
    # concurrent requests overwrite each other's results.
    fd, tmp_path = tempfile.mkstemp(prefix="classified_output_", suffix=".csv")
    os.close(fd)

    try:
        output_path, df = classify_csv(input_path, tmp_path)
    except Exception as e:
        # Do not leave the unused placeholder file behind.
        with contextlib.suppress(OSError):
            os.remove(tmp_path)
        if isinstance(e, ValueError):
            return None, f"⚠️ {e}"
        return None, f"❌ Error: {e}"

    total = len(df)
    tier_counts = df["tier_used"].value_counts().to_dict()
    label_counts = df["predicted_label"].value_counts().to_dict()
    tier_lines = "\n".join(
        f" {TIER_COLORS.get(k,'βšͺ')} {k}: {v} ({v/total:.0%})"
        for k, v in tier_counts.items()
    )
    label_lines = "\n".join(f" β€’ {k}: {v}" for k, v in label_counts.items())
    stats = (
        f"βœ… Classified {total} logs\n\n"
        f"πŸ“Š Tier breakdown:\n{tier_lines}\n\n"
        f"🏷️ Label distribution:\n{label_lines}"
    )
    return output_path, stats
417
+
418
+
419
+ # ── UI ───────────────────────────────────────────────────────────────────────
420
+ with gr.Blocks(
421
+ title="LOG CLASSIFICATION SYSTEM",
422
+ theme=gr.themes.Base(
423
+ primary_hue="blue",
424
+ secondary_hue="cyan",
425
+ neutral_hue="slate",
426
+ font=[gr.themes.GoogleFont("Exo 2"), "sans-serif"],
427
+ font_mono=[gr.themes.GoogleFont("Share Tech Mono"), "monospace"],
428
+ ).set(
429
+ body_background_fill="#050810",
430
+ body_text_color="#e2e8f0",
431
+ block_background_fill="#0d1425",
432
+ block_border_color="rgba(0,212,255,0.15)",
433
+ block_label_text_color="#00d4ff",
434
+ input_background_fill="#050810",
435
+ input_border_color="rgba(0,212,255,0.2)",
436
+ button_primary_background_fill="linear-gradient(135deg, #0066ff, #00d4ff)",
437
+ button_primary_text_color="#ffffff",
438
+ border_color_accent="#00d4ff",
439
+ color_accent_soft="rgba(0,212,255,0.1)",
440
+ ),
441
+ css=CUSTOM_CSS
442
+ ) as demo:
443
+
444
+ gr.Markdown("""
445
+ # πŸ” LOG CLASSIFICATION SYSTEM
446
+ **3-tier hybrid pipeline** β€” 🟒 Regex Β· πŸ”΅ BERT + ML Β· 🟑 LLM
447
+ *Enterprise-grade log monitoring at production scale*
448
+ """)
449
+
450
+ with gr.Tabs():
451
+
452
+ # ── Tab 1: Single Log ─────────────────────────────────────────────
453
+ with gr.Tab("⚑ SINGLE LOG"):
454
+ with gr.Row():
455
+ with gr.Column(scale=1):
456
+ source_input = gr.Dropdown(
457
+ choices=SOURCES,
458
+ value="ModernCRM",
459
+ label="SOURCE SYSTEM",
460
+ )
461
+ with gr.Column(scale=3):
462
+ log_input = gr.Textbox(
463
+ label="LOG MESSAGE",
464
+ placeholder="Paste a log message here...",
465
+ lines=3,
466
+ )
467
+
468
+ classify_btn = gr.Button("β–Ά CLASSIFY LOG", variant="primary", size="lg")
469
+
470
+ with gr.Row():
471
+ label_out = gr.Textbox(label="🏷️ PREDICTED LABEL", interactive=False)
472
+ tier_out = gr.Textbox(label="βš™οΈ TIER USED", interactive=False)
473
+ confidence_out = gr.Textbox(label="πŸ“ˆ CONFIDENCE", interactive=False)
474
+ latency_out = gr.Textbox(label="⏱️ LATENCY", interactive=False)
475
+
476
+ classify_btn.click(
477
+ fn=classify_single,
478
+ inputs=[source_input, log_input],
479
+ outputs=[label_out, tier_out, confidence_out, latency_out],
480
+ )
481
+
482
+ gr.Examples(
483
+ examples=EXAMPLE_LOGS,
484
+ inputs=[source_input, log_input],
485
+ label="πŸ“‹ EXAMPLE LOGS β€” click to try",
486
+ )
487
+
488
+ # ── Tab 2: Batch CSV ──────────────────────────────────────────────
489
+ with gr.Tab("πŸ“¦ BATCH CSV"):
490
+ gr.Markdown("""
491
+ ### Bulk Classification
492
+ Upload a CSV with columns: **`source`**, **`log_message`**
493
+ Output includes: `predicted_label`, `tier_used`, `confidence`, `latency_ms`
494
+ """)
495
+ with gr.Row():
496
+ with gr.Column():
497
+ csv_input = gr.File(label="πŸ“‚ UPLOAD CSV", file_types=[".csv"])
498
+ batch_btn = gr.Button("β–Ά CLASSIFY ALL", variant="primary")
499
+ with gr.Column():
500
+ csv_output = gr.File(label="πŸ“₯ DOWNLOAD RESULTS")
501
+ stats_out = gr.Textbox(label="πŸ“Š STATISTICS", lines=12, interactive=False)
502
+
503
+ batch_btn.click(
504
+ fn=classify_batch,
505
+ inputs=[csv_input],
506
+ outputs=[csv_output, stats_out],
507
+ )
508
+
509
+ gr.Markdown("""
510
+ **Sample CSV format:**
511
+ ```
512
+ source,log_message
513
+ ModernCRM,User User123 logged in.
514
+ LegacyCRM,Case escalation for ticket ID 7324 failed.
515
+ BillingSystem,GET /api/v2/invoice HTTP/1.1 status: 500
516
+ ```
517
+ """)
518
+
519
+ # ── Tab 3: Architecture ───────────────────────────────────────────
520
+ with gr.Tab("πŸ—οΈ ARCHITECTURE"):
521
+ gr.Markdown("""
522
+ ## 3-Tier Hybrid Pipeline
523
+
524
+ | Tier | Method | Coverage | Latency | Trigger |
525
+ |------|--------|----------|---------|---------|
526
+ | 🟒 **Regex** | Python `re` patterns | ~21% | < 1ms | Fixed patterns |
527
+ | πŸ”΅ **BERT** | `all-MiniLM-L6-v2` + LogReg | ~79% | 20–80ms | High-volume categories |
528
+ | 🟑 **LLM** | HuggingFace Inference API | ~0.3% | 500–2000ms | LegacyCRM + rare patterns |
529
+
530
+ ## Model Performance
531
+ - **Training data**: 2,410 synthetic enterprise logs
532
+ - **Confidence threshold**: 0.5 (below β†’ escalate to LLM)
533
+ - **Source-aware routing**: `LegacyCRM` β†’ LLM directly
534
+
535
+ ## Environment Variables
536
+ | Secret | Purpose |
537
+ |--------|---------|
538
+ | `HF_TOKEN` | LLM inference for LegacyCRM logs |
539
+ """)
540
+
541
+ if __name__ == "__main__":
542
+ demo.launch(server_name="0.0.0.0", server_port=7860)
hf_space/classify.py ADDED
@@ -0,0 +1,198 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ classify.py — 3-Tier Hybrid Pipeline (V3 — Latency-Tracked)
3
+
4
+ Architecture:
5
+ LegacyCRM β†’ LLM directly
6
+ Others β†’ Regex β†’ BERT (batch) β†’ LLM fallback
7
+
8
+ Changes in V3:
9
+ - Tier-wise latency tracking (regex_ms, bert_ms, llm_ms)
10
+ - Pipeline summary with p50/p95 per tier
11
+ - Defensive: LLM timeout + retry baked in via processor_llm
12
+ - classify_logs returns richer result dict
13
+ """
14
+ from __future__ import annotations
15
+ import time
16
+ import statistics
17
+ import pandas as pd
18
+ from processor_regex import classify_with_regex
19
+ from processor_bert import classify_batch as bert_batch
20
+ from processor_llm import classify_with_llm
21
+
22
+ LEGACY_SOURCE = "LegacyCRM"
23
+
24
+
25
+ # ── Result type ─────────────────────────────────────────────────────────────
26
def _make_result(label: str, tier: str, confidence, latency_ms: float) -> dict:
    """Bundle one classification outcome into the pipeline's result dict.

    ``latency_ms`` is rounded to three decimal places; ``label``, ``tier``
    and ``confidence`` are stored exactly as given.
    """
    rounded_latency = round(latency_ms, 3)
    return dict(
        label=label,
        tier=tier,
        confidence=confidence,
        latency_ms=rounded_latency,
    )
33
+
34
+
35
+ # ── Single log (backward-compatible) ────────────────────────────────────────
36
def classify_log(source: str, log_msg: str) -> dict:
    """Classify a single log by sending it through the batch pipeline.

    The log is wrapped in a one-item batch for ``classify_logs`` and the
    first result dict (label, tier, confidence, latency_ms) is returned.
    """
    single_item_batch = [(source, log_msg)]
    return classify_logs(single_item_batch)[0]
40
+
41
+
42
+ # ── Batch pipeline (main entry point) ───────────────────────────────────────
43
def classify_logs(logs: list[tuple[str, str]]) -> list[dict]:
    """Classify a batch of logs with 3-tier routing and per-result latency.

    Routing:
        * ``source == LEGACY_SOURCE``  -> LLM directly (tier ``"LLM"``).
        * regex match                  -> done (tier ``"Regex"``, confidence 1.0).
        * everything else              -> one BERT batch call; items labelled
          ``"Unclassified"`` are escalated to the LLM
          (tier ``"LLM (fallback)"``).

    Args:
        logs: ``(source, log_message)`` pairs.

    Returns:
        One result dict per input, in input order, each with the keys
        ``label``, ``tier``, ``confidence`` and ``latency_ms``. BERT latency
        is the batch time divided evenly across the batch; LLM results carry
        ``confidence=None``.

    Raises:
        RuntimeError: If the BERT batch returns a different number of results
            than messages submitted (previously this silently left ``None``
            entries in the output).
    """
    results: list[dict | None] = [None] * len(logs)
    llm_indices: list[int] = []
    bert_indices: list[int] = []

    # ── Step 1: route each log; regex hits are finished immediately ──────────
    for i, (source, log_msg) in enumerate(logs):
        if source == LEGACY_SOURCE:
            llm_indices.append(i)
            continue
        t0 = time.perf_counter()
        label = classify_with_regex(log_msg)
        t1 = time.perf_counter()
        if label:
            results[i] = _make_result(label, "Regex", 1.0, (t1 - t0) * 1000)
        else:
            bert_indices.append(i)

    # ── Step 2: one BERT call for everything regex could not label ───────────
    if bert_indices:
        bert_msgs = [logs[i][1] for i in bert_indices]

        t_bert_start = time.perf_counter()
        bert_results = list(bert_batch(bert_msgs))
        t_bert_end = time.perf_counter()

        # zip() would silently drop the tail on a length mismatch and leave
        # None entries behind; fail loudly instead.
        if len(bert_results) != len(bert_indices):
            raise RuntimeError(
                f"BERT batch returned {len(bert_results)} results "
                f"for {len(bert_indices)} messages"
            )

        # The batch is timed as a whole, so latency is attributed evenly.
        bert_ms_per_log = (t_bert_end - t_bert_start) * 1000 / len(bert_msgs)

        for idx, (label, conf) in zip(bert_indices, bert_results):
            if label != "Unclassified":
                results[idx] = _make_result(label, "BERT", conf, bert_ms_per_log)
            else:
                llm_indices.append(idx)

    # ── Step 3: LLM for the legacy source and for BERT fallbacks ─────────────
    for i in llm_indices:
        source, log_msg = logs[i]
        t0 = time.perf_counter()
        label = classify_with_llm(log_msg)
        t1 = time.perf_counter()
        tier = "LLM" if source == LEGACY_SOURCE else "LLM (fallback)"
        results[i] = _make_result(label, tier, None, (t1 - t0) * 1000)

    return results
103
+
104
+
105
+ # ── Pipeline summary ─────────────────────────────────────────────────────────
106
def pipeline_summary(results: list[dict]) -> dict:
    """Aggregate per-tier latency statistics and label counts.

    Args:
        results: Result dicts as produced by ``classify_logs``; each must
            provide ``tier``, ``latency_ms`` and ``label``.

    Returns:
        A dict with ``total`` (number of results), ``tier_stats`` (per tier:
        ``count``, ``pct``, ``p50_ms``, ``p95_ms``, ``p99_ms``, ``mean_ms``)
        and ``label_counts`` (occurrences per label).
    """
    total = len(results)

    by_tier: dict[str, list[float]] = {}
    label_counts: dict[str, int] = {}
    for entry in results:
        bucket = by_tier.get(entry["tier"])
        if bucket is None:
            bucket = by_tier[entry["tier"]] = []
        bucket.append(entry["latency_ms"])

        name = entry["label"]
        if name in label_counts:
            label_counts[name] += 1
        else:
            label_counts[name] = 1

    def at_quantile(ordered: list[float], q: float) -> float:
        # Element at index floor(len * q), clamped to the last element.
        size = len(ordered)
        return ordered[min(int(size * q), size - 1)]

    def describe(samples: list[float]) -> dict:
        ordered = sorted(samples)
        count = len(ordered)
        return {
            "count": count,
            "pct": round(count / total * 100, 1),
            "p50_ms": round(statistics.median(ordered), 2),
            "p95_ms": round(at_quantile(ordered, 0.95), 2),
            "p99_ms": round(at_quantile(ordered, 0.99), 2),
            "mean_ms": round(statistics.mean(ordered), 2),
        }

    return {
        "total": total,
        "tier_stats": {tier: describe(samples) for tier, samples in by_tier.items()},
        "label_counts": label_counts,
    }
138
+
139
+
140
+ # ── CSV batch classify ───────────────────────────────────────────────────────
141
def classify_csv(input_path: str, output_path: str = "output.csv") -> tuple[str, pd.DataFrame]:
    """Classify every row of a CSV file and write the annotated result.

    Args:
        input_path: CSV to read; must contain the columns ``source`` and
            ``log_message``.
        output_path: Where the annotated CSV is written.

    Returns:
        ``(output_path, df)`` where ``df`` is the input frame with the added
        columns ``predicted_label``, ``tier_used``, ``latency_ms`` and
        ``confidence`` (formatted as a percentage, or ``"N/A"`` when the
        result carries no confidence).

    Raises:
        ValueError: If a required column is missing.
    """
    df = pd.read_csv(input_path)
    required = {"source", "log_message"}
    if not required.issubset(df.columns):
        raise ValueError(f"CSV mein ye columns chahiye: {required}. Mila: {set(df.columns)}")

    # Blank cells are read as float NaN; hand the classifiers real strings.
    # Only the values passed to the pipeline are normalised; the columns that
    # get written back out are left exactly as read.
    sources = df["source"].fillna("").astype(str)
    messages = df["log_message"].fillna("").astype(str)
    results = classify_logs(list(zip(sources, messages)))

    df["predicted_label"] = [r["label"] for r in results]
    df["tier_used"] = [r["tier"] for r in results]
    df["latency_ms"] = [r["latency_ms"] for r in results]
    df["confidence"] = [
        f"{r['confidence']:.1%}" if r["confidence"] is not None else "N/A"
        for r in results
    ]

    df.to_csv(output_path, index=False)
    return output_path, df
165
+
166
+
167
+ # Aliases
168
+ classify = classify_logs
169
+
170
+
171
+ # ── Self-test ────────────────────────────────────────────────────────────────
172
if __name__ == "__main__":
    # Smoke test: push a few sample logs through the pipeline and print a
    # per-log table followed by the aggregated summary.
    sample = [
        ("ModernCRM", "IP 192.168.133.114 blocked due to potential attack"),
        ("BillingSystem", "User User12345 logged in."),
        ("AnalyticsEngine", "File data_6957.csv uploaded successfully by user User265."),
        ("ModernHR", "GET /v2/servers/detail HTTP/1.1 status: 200 len: 1583 time: 0.19"),
        ("ModernHR", "Admin access escalation detected for user 9429"),
        ("LegacyCRM", "Case escalation for ticket ID 7324 failed because the assigned support agent is no longer active."),
        ("LegacyCRM", "The 'ReportGenerator' module will be retired in version 4.0."),
    ]

    print(f'{"Source":<20} {"Tier":<18} {"Conf":>6} {"Lat(ms)":>8} {"Label":<25} Log')
    print("─" * 115)
    results = classify_logs(sample)
    for (source, log), r in zip(sample, results):
        # Compare against None (as the rest of the module does) so that a
        # genuine 0.0 confidence is printed as a percentage, not as N/A.
        conf = f"{r['confidence']:.0%}" if r["confidence"] is not None else " N/A"
        print(f'{source:<20} {r["tier"]:<18} {conf:>6} {r["latency_ms"]:>8.1f} {r["label"]:<25} {log[:40]}')

    summary = pipeline_summary(results)
    print("\nπŸ“Š Pipeline Summary:")
    for tier, stats in summary["tier_stats"].items():
        print(f" {tier}: {stats['count']} logs ({stats['pct']}%) | "
              f"p50={stats['p50_ms']}ms p95={stats['p95_ms']}ms p99={stats['p99_ms']}ms")

    print("\n🏷️ Label distribution:")
    # Most frequent label first.
    for label, count in sorted(summary["label_counts"].items(), key=lambda x: -x[1]):
        print(f" β€’ {label}: {count}")
hf_space/models/log_classifier.joblib ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9bfe9c71b71412797de0d426be2255566dbf6cf87b3f2ae5d2cd1fd69a98d18d
3
+ size 23997
hf_space/onnx_model/config.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "BertModel"
4
+ ],
5
+ "attention_probs_dropout_prob": 0.1,
6
+ "classifier_dropout": null,
7
+ "gradient_checkpointing": false,
8
+ "hidden_act": "gelu",
9
+ "hidden_dropout_prob": 0.1,
10
+ "hidden_size": 384,
11
+ "initializer_range": 0.02,
12
+ "intermediate_size": 1536,
13
+ "layer_norm_eps": 1e-12,
14
+ "max_position_embeddings": 512,
15
+ "model_type": "bert",
16
+ "num_attention_heads": 12,
17
+ "num_hidden_layers": 6,
18
+ "pad_token_id": 0,
19
+ "position_embedding_type": "absolute",
20
+ "transformers_version": "4.57.6",
21
+ "type_vocab_size": 2,
22
+ "use_cache": true,
23
+ "vocab_size": 30522
24
+ }
hf_space/onnx_model/special_tokens_map.json ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cls_token": {
3
+ "content": "[CLS]",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "mask_token": {
10
+ "content": "[MASK]",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": {
17
+ "content": "[PAD]",
18
+ "lstrip": false,
19
+ "normalized": false,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ },
23
+ "sep_token": {
24
+ "content": "[SEP]",
25
+ "lstrip": false,
26
+ "normalized": false,
27
+ "rstrip": false,
28
+ "single_word": false
29
+ },
30
+ "unk_token": {
31
+ "content": "[UNK]",
32
+ "lstrip": false,
33
+ "normalized": false,
34
+ "rstrip": false,
35
+ "single_word": false
36
+ }
37
+ }
hf_space/onnx_model/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
hf_space/onnx_model/tokenizer_config.json ADDED
@@ -0,0 +1,65 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "added_tokens_decoder": {
3
+ "0": {
4
+ "content": "[PAD]",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "100": {
12
+ "content": "[UNK]",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "101": {
20
+ "content": "[CLS]",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ },
27
+ "102": {
28
+ "content": "[SEP]",
29
+ "lstrip": false,
30
+ "normalized": false,
31
+ "rstrip": false,
32
+ "single_word": false,
33
+ "special": true
34
+ },
35
+ "103": {
36
+ "content": "[MASK]",
37
+ "lstrip": false,
38
+ "normalized": false,
39
+ "rstrip": false,
40
+ "single_word": false,
41
+ "special": true
42
+ }
43
+ },
44
+ "clean_up_tokenization_spaces": false,
45
+ "cls_token": "[CLS]",
46
+ "do_basic_tokenize": true,
47
+ "do_lower_case": true,
48
+ "extra_special_tokens": {},
49
+ "mask_token": "[MASK]",
50
+ "max_length": 128,
51
+ "model_max_length": 512,
52
+ "never_split": null,
53
+ "pad_to_multiple_of": null,
54
+ "pad_token": "[PAD]",
55
+ "pad_token_type_id": 0,
56
+ "padding_side": "right",
57
+ "sep_token": "[SEP]",
58
+ "stride": 0,
59
+ "strip_accents": null,
60
+ "tokenize_chinese_chars": true,
61
+ "tokenizer_class": "BertTokenizer",
62
+ "truncation_side": "right",
63
+ "truncation_strategy": "longest_first",
64
+ "unk_token": "[UNK]"
65
+ }
hf_space/onnx_model/vocab.txt ADDED
The diff for this file is too large to render. See raw diff
 
hf_space/processor_bert.py ADDED
@@ -0,0 +1,216 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ processor_bert.py — ONNX Runtime powered BERT classifier
3
+ Speed: 82 logs/s → 2000+ logs/s
4
+
5
+ How it works:
6
+ 1. ONNX Runtime: 3-5x faster than plain PyTorch
7
+ 2. Batch processing: 64 logs are processed together
8
+ 3. Pre-allocated buffers: no wasted memory
9
+ """
10
+ from __future__ import annotations
11
+ import os
12
+ import numpy as np
13
+ import joblib
14
+
15
+ # ── Check karo kaunsa method use karna hai ──────────────────
16
# Lazily populated module state. Every value below keeps its initial value
# until _load_models() has run successfully for the first time.
_USE_ONNX = False  # True once the ONNX session and its tokenizer are loaded
_embedding_model = None  # SentenceTransformer used by the PyTorch fallback
_classifier = None  # joblib-loaded classifier applied to the embeddings
_ort_session = None  # onnxruntime.InferenceSession used by the fast path
_ort_tokenizer = None  # tokenizer loaded from ONNX_DIR for the fast path

# Location of the serialized classifier, relative to this file.
MODEL_PATH = os.path.join(os.path.dirname(__file__), 'models', 'log_classifier.joblib')
# Directory expected to hold model.onnx plus the tokenizer files.
# NOTE(review): the tokenizer files added in this upload live under
# `onnx_model/`, not `models/onnx` — confirm which directory is deployed,
# otherwise the loader will quietly take the PyTorch fallback.
ONNX_DIR = os.path.join(os.path.dirname(__file__), 'models', 'onnx')
# Predictions whose top probability is below this are reported as 'Unclassified'.
CONFIDENCE_THRESHOLD = 0.30
# Number of log messages embedded and classified per batch.
DEFAULT_BATCH = 64
26
+
27
+
28
def _load_models():
    """Load the classifier and one embedding backend on first use.

    The joblib classifier at MODEL_PATH is mandatory. For embeddings the
    ONNX Runtime session in ONNX_DIR is preferred; if `model.onnx` is
    missing or fails to load, the function falls back to the
    SentenceTransformer 'all-MiniLM-L6-v2' model.

    The module-level `_classifier` doubles as the "already initialised"
    flag, so it is assigned only after an embedding backend has loaded.
    A failed first attempt therefore gets retried on the next call instead
    of leaving the module half-initialised.

    Raises:
        FileNotFoundError: if MODEL_PATH does not exist.
    """
    global _USE_ONNX, _embedding_model, _classifier, _ort_session, _ort_tokenizer

    if _classifier is not None:
        return  # Fully initialised by an earlier call.

    # -- Classifier ---------------------------------------------------------
    if not os.path.exists(MODEL_PATH):
        raise FileNotFoundError(
            f'Model nahi mila: {MODEL_PATH}\n'
            'Pehle Colab notebook run karo aur model download karo.'
        )
    classifier = joblib.load(MODEL_PATH)

    # -- Embedding backend: try ONNX first, fall back to PyTorch ------------
    onnx_model_file = os.path.join(ONNX_DIR, 'model.onnx')
    use_onnx = False

    if os.path.exists(onnx_model_file):
        try:
            import onnxruntime as ort
            from transformers import AutoTokenizer

            sess_opts = ort.SessionOptions()
            sess_opts.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL
            # os.cpu_count() may return None; 0 lets ONNX Runtime pick a default
            # instead of failing and forcing the slow fallback.
            sess_opts.intra_op_num_threads = os.cpu_count() or 0
            sess_opts.execution_mode = ort.ExecutionMode.ORT_SEQUENTIAL

            session = ort.InferenceSession(
                onnx_model_file,
                sess_options=sess_opts,
                providers=['CPUExecutionProvider']
            )
            tokenizer = AutoTokenizer.from_pretrained(ONNX_DIR)

            # Publish session and tokenizer together so they never disagree.
            _ort_session = session
            _ort_tokenizer = tokenizer
            use_onnx = True
            print('[BERT] βœ… ONNX Runtime loaded β€” FAST MODE')

        except Exception as e:
            # Best effort: any ONNX problem degrades to the PyTorch path.
            print(f'[BERT] ONNX load failed ({e}), fallback to PyTorch')
            use_onnx = False

    if not use_onnx:
        from sentence_transformers import SentenceTransformer
        _embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
        print('[BERT] ⚠️ PyTorch mode (install ONNX for 3-5x speedup)')

    # Publish last: `_classifier` is what the early-return guard checks.
    _USE_ONNX = use_onnx
    _classifier = classifier
74
+
75
+
76
def _embed_onnx(texts: list[str]) -> np.ndarray:
    """Embed *texts* with the ONNX Runtime session.

    Tokenizes with padding and truncation to 128 tokens, runs the session,
    mean-pools the first model output using the attention mask as weights,
    and L2-normalizes each resulting vector.

    Requires `_load_models()` to have set `_ort_session` and
    `_ort_tokenizer`.

    Args:
        texts: Log messages to embed.

    Returns:
        One normalized embedding row per input text.
    """
    # NumPy tensors are fed to ONNX Runtime directly; torch is not needed here.
    inputs = _ort_tokenizer(
        texts,
        padding=True,
        truncation=True,
        max_length=128,
        return_tensors='np'
    )
    attention_mask = inputs['attention_mask']

    ort_inputs = {
        'input_ids': inputs['input_ids'].astype(np.int64),
        'attention_mask': attention_mask.astype(np.int64),
    }
    # Only pass token_type_ids when the exported graph declares that input.
    expected_inputs = {model_input.name for model_input in _ort_session.get_inputs()}
    if 'token_type_ids' in expected_inputs:
        ort_inputs['token_type_ids'] = inputs.get(
            'token_type_ids', np.zeros_like(inputs['input_ids'])
        ).astype(np.int64)

    outputs = _ort_session.run(None, ort_inputs)
    # assumes the first output is token-level states shaped
    # (batch, seq_len, hidden) — TODO confirm against the exported graph
    hidden = outputs[0]

    # Mean pooling weighted by the attention mask, so padding is ignored.
    mask = attention_mask[:, :, None].astype(np.float32)
    summed = (hidden * mask).sum(axis=1)
    # Floor the token count so an all-zero mask cannot divide by zero.
    counts = np.clip(mask.sum(axis=1), 1e-9, None)
    embeddings = summed / counts

    # L2 normalize; the epsilon protects all-zero rows.
    norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
    return embeddings / (norms + 1e-8)
110
+
111
+
112
def _embed_pytorch(texts: list[str]) -> np.ndarray:
    """Embed *texts* with the SentenceTransformer fallback model.

    Returns normalized NumPy embeddings, encoded DEFAULT_BATCH texts at a
    time with the progress bar disabled.
    """
    encode_options = {
        'batch_size': DEFAULT_BATCH,
        'convert_to_numpy': True,
        'normalize_embeddings': True,
        'show_progress_bar': False,
    }
    return _embedding_model.encode(texts, **encode_options)
121
+
122
+
123
+ # ── PUBLIC API ──────────────────────────────────────────────
124
+
125
def classify_with_bert(log_message: str) -> tuple[str, float]:
    """Classify one log message.

    Thin wrapper that sends a one-element list through `classify_batch`.

    Returns:
        A `(label, confidence)` tuple.
    """
    _load_models()
    return classify_batch([log_message])[0]
133
+
134
+
135
def classify_batch(log_messages: list[str]) -> list[tuple[str, float]]:
    """Classify many log messages, embedding them DEFAULT_BATCH at a time.

    Each prediction's confidence is the highest class probability. A
    prediction whose confidence is below CONFIDENCE_THRESHOLD is reported
    with the label 'Unclassified', keeping its confidence value.

    Example:
        results = classify_batch(['log1', 'log2', 'log3'])
        for label, conf in results:
            print(f'{label}: {conf:.1%}')

    Returns:
        One `(label, confidence)` tuple per input, in input order; an empty
        list for empty input.
    """
    _load_models()

    if not log_messages:
        return []

    # The backend is fixed once models are loaded, so choose it once.
    embed = _embed_onnx if _USE_ONNX else _embed_pytorch
    classified = []

    for start in range(0, len(log_messages), DEFAULT_BATCH):
        chunk = log_messages[start:start + DEFAULT_BATCH]
        vectors = embed(chunk)

        probabilities = _classifier.predict_proba(vectors)
        top_probs = probabilities.max(axis=1)
        predicted = _classifier.predict(vectors)

        classified.extend(
            ('Unclassified', float(prob)) if prob < CONFIDENCE_THRESHOLD
            else (str(name), float(prob))
            for name, prob in zip(predicted, top_probs)
        )

    return classified
174
+
175
+
176
def get_classes() -> list[str]:
    """Return the class labels known to the loaded classifier."""
    _load_models()
    known_labels = _classifier.classes_
    return list(known_labels)
180
+
181
+
182
def is_onnx_mode() -> bool:
    """Report whether embeddings come from ONNX Runtime.

    Triggers model loading first, so the answer reflects the backend that
    was actually selected.
    """
    _load_models()
    onnx_active = _USE_ONNX
    return onnx_active
186
+
187
+
188
+ # ── TEST ────────────────────────────────────────────────────
189
if __name__ == '__main__':
    # Manual smoke test: classify a few sample logs one by one, report the
    # active backend, then time one large batched call.
    import time

    test_logs = [
        'GET /v2/servers/detail HTTP/1.1 status: 404 len: 1583 time: 0.19',
        'System crashed due to driver errors when restarting the server',
        'Multiple login failures occurred on user 6454 account',
        'Admin access escalation detected for user 9429',
        'CPU usage at 98% for the last 10 minutes on node-7',
        'Backup completed successfully.',
        'User User123 logged in.',
        'Data replication task for shard 14 did not complete',
        'Hey bro chill ya!',  # should be Unclassified
    ]

    print('Single log test:')
    for sample in test_logs:
        predicted, confidence = classify_with_bert(sample)
        print(f' [{confidence:.0%}] {predicted:25s} | {sample[:60]}')

    mode_name = "ONNX πŸš€" if is_onnx_mode() else "PyTorch"
    print(f'\nMode: {mode_name}')

    # Throughput check over a repeated copy of the samples.
    big_batch = test_logs * 100
    started = time.perf_counter()
    classify_batch(big_batch)
    elapsed = time.perf_counter() - started
    n_logs = len(big_batch)
    print(f'\nSpeed: {n_logs/elapsed:.0f} logs/s ({elapsed*1000/n_logs:.1f}ms/log)')
hf_space/processor_llm.py ADDED
@@ -0,0 +1,192 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ processor_llm.py — Tier 3: LLM-based Classifier
3
+
4
+ Used for:
5
+ - LegacyCRM logs (Workflow Error, Deprecation Warning)
6
+ - BERT fallback when confidence < threshold
7
+
8
+ Production hardening in V3:
9
+ - Timeout (configurable, default 5s)
10
+ - Retry with exponential backoff (max 2 retries)
11
+ - Explicit failure modes: returns "Unclassified" on all error paths
12
+ - Caching for repeated log patterns (hash-based, in-memory)
13
+ - Token budget enforcement (max_tokens=15)
14
+ """
15
+ from __future__ import annotations
16
+ import os
17
+ import re
18
+ import time
19
+ import hashlib
20
+ import logging
21
+ from functools import lru_cache
22
+ from typing import Optional
23
+
24
logger = logging.getLogger(__name__)

# -- Config -------------------------------------------------------------------
# API token read once at import time; when unset, classify_with_llm returns
# "Unclassified" without calling the service.
HF_TOKEN = os.getenv("HF_TOKEN")
LLM_MODEL = "mistralai/Mistral-7B-Instruct-v0.3"

# The only labels this tier may return besides "Unclassified".
VALID_CATEGORIES = ["Workflow Error", "Deprecation Warning"]

# Retry / timeout config
MAX_RETRIES = 2  # retries after the initial attempt
RETRY_DELAY_SEC = 1.0  # doubles on each retry (exponential backoff)
REQUEST_TIMEOUT = 5  # seconds, passed to the inference client

# In-memory cache to avoid redundant LLM calls for repeated logs.
# Keys are produced by _cache_key(); values are the normalized labels.
_RESPONSE_CACHE: dict[str, str] = {}
MAX_CACHE_SIZE = 1000  # evict oldest when full (simple FIFO)

# System message sent with every request.
SYSTEM_PROMPT = (
    "You are an enterprise log classifier. "
    "Classify log messages into exactly one category. "
    "Return ONLY the category name β€” no explanation, no punctuation."
)

# Labelled examples that _build_messages places ahead of the log to classify.
FEW_SHOT_EXAMPLES = [
    {
        "log": "Case escalation for ticket ID 7324 failed because the assigned support agent is no longer active.",
        "label": "Workflow Error",
    },
    {
        "log": "The 'BulkEmailSender' feature is no longer supported. Use 'EmailCampaignManager' instead.",
        "label": "Deprecation Warning",
    },
    {
        "log": "Invoice generation aborted for order ID 8910 due to invalid tax calculation module.",
        "label": "Workflow Error",
    },
]
61
+
62
+
63
+ # ── Cache helpers ────────────────────────────────────────────────────────────
64
def _cache_key(log_msg: str) -> str:
    """Return the cache key for *log_msg*.

    The key is the MD5 hex digest of the message with surrounding
    whitespace stripped, so messages differing only in leading or trailing
    whitespace share one entry.
    """
    trimmed_bytes = log_msg.strip().encode()
    digest = hashlib.md5(trimmed_bytes)
    return digest.hexdigest()
66
+
67
+
68
def _cache_get(log_msg: str) -> Optional[str]:
    """Return the cached label for *log_msg*, or None when not cached."""
    lookup_key = _cache_key(log_msg)
    return _RESPONSE_CACHE.get(lookup_key)
70
+
71
+
72
def _cache_set(log_msg: str, label: str) -> None:
    """Store *label* for *log_msg*, evicting the oldest entry when full.

    Eviction is FIFO: dicts iterate in insertion order, so the first key is
    the oldest. Nothing is evicted when the key is already cached, because
    overwriting it does not grow the cache.
    """
    key = _cache_key(log_msg)
    if key not in _RESPONSE_CACHE and len(_RESPONSE_CACHE) >= MAX_CACHE_SIZE:
        # The None default covers an empty cache with MAX_CACHE_SIZE <= 0.
        oldest = next(iter(_RESPONSE_CACHE), None)
        if oldest is not None:
            _RESPONSE_CACHE.pop(oldest, None)
    _RESPONSE_CACHE[key] = label
79
+
80
+
81
def get_cache_stats() -> dict:
    """Return the current cache entry count and its configured capacity."""
    stats = {}
    stats["size"] = len(_RESPONSE_CACHE)
    stats["max_size"] = MAX_CACHE_SIZE
    return stats
83
+
84
+
85
+ # ── Prompt builder ───────────────────────────────────────────────────────────
86
def _build_messages(log_msg: str) -> list[dict]:
    """Build the chat messages for classifying *log_msg*.

    The user message lists the valid categories, then every few-shot
    example as a "Log / Category" pair, and ends with the target log and an
    open "Category:" line for the model to complete.

    Returns:
        A two-element list: the system message, then the user message.
    """
    quoted_categories = ", ".join(f'"{c}"' for c in VALID_CATEGORIES)

    pieces = [
        f'Classify the following log into one of these categories: {quoted_categories}.\n',
        'If none fits, return "Unclassified".\n\n',
    ]
    pieces.extend(
        f'Log: {example["log"]}\nCategory: {example["label"]}\n\n'
        for example in FEW_SHOT_EXAMPLES
    )
    pieces.append(f"Log: {log_msg}\nCategory:")
    prompt = "".join(pieces)

    system_message = {"role": "system", "content": SYSTEM_PROMPT}
    user_message = {"role": "user", "content": prompt}
    return [system_message, user_message]
100
+
101
+
102
+ # ── Normalize raw LLM output ─────────────────────────────────────────────────
103
def _normalize(raw: Optional[str]) -> str:
    """Map raw LLM output to a valid category or 'Unclassified'.

    Matching is a case-insensitive substring test against each entry of
    VALID_CATEGORIES, in order, after stripping whitespace and surrounding
    quotes. A missing or empty completion yields 'Unclassified' instead of
    raising.
    """
    if not raw:
        return "Unclassified"

    cleaned = raw.strip().strip('"').strip("'").lower()
    for cat in VALID_CATEGORIES:
        if cat.lower() in cleaned:
            return cat
    return "Unclassified"
110
+
111
+
112
+ # ── Main classify function ────────────────────────────────────────────────────
113
def classify_with_llm(log_msg: str) -> str:
    """Classify *log_msg* with the hosted LLM (tier 3).

    Steps:
      1. Return the cached label when this message was classified before.
      2. Return "Unclassified" at once when HF_TOKEN is not configured.
      3. Call the chat-completion endpoint, retrying up to MAX_RETRIES times
         with a delay that starts at RETRY_DELAY_SEC and doubles each time.

    Every failure path returns "Unclassified". That includes a missing
    `huggingface_hub` package, a failure to build the client, and a
    completion with no content. Only successful classifications are cached.

    Returns:
        One of VALID_CATEGORIES, or "Unclassified".
    """
    # -- Cache hit ------------------------------------------------------------
    cached = _cache_get(log_msg)
    if cached is not None:
        logger.debug("[LLM] Cache hit for: %s", log_msg[:60])
        return cached

    # -- Preconditions --------------------------------------------------------
    if not HF_TOKEN:
        logger.warning("[LLM] HF_TOKEN not set β€” returning Unclassified")
        return "Unclassified"

    try:
        from huggingface_hub import InferenceClient
        # NOTE(review): `client.chat.completions.create` needs a
        # huggingface_hub release that ships the OpenAI-style alias —
        # confirm the version pinned in requirements.txt is new enough.
        client = InferenceClient(token=HF_TOKEN, timeout=REQUEST_TIMEOUT)
    except Exception as e:
        # Setup problems are not transient, so retrying them is pointless.
        logger.error("[LLM] Could not create inference client: %s", e)
        return "Unclassified"

    # -- Inference with retry -------------------------------------------------
    messages = _build_messages(log_msg)  # identical for every attempt
    delay = RETRY_DELAY_SEC
    total_attempts = MAX_RETRIES + 1  # initial attempt + retries

    for attempt in range(1, total_attempts + 1):
        try:
            response = client.chat.completions.create(
                model=LLM_MODEL,
                messages=messages,
                max_tokens=15,
                temperature=0.1,
            )
            raw = response.choices[0].message.content
            if raw is None:
                # Treat an empty completion as a failed attempt.
                raise ValueError("completion returned no content")
            label = _normalize(raw)

            _cache_set(log_msg, label)
            logger.debug("[LLM] Attempt %d: '%s' β†’ '%s'", attempt, raw.strip(), label)
            return label

        except Exception as e:
            if attempt < total_attempts:
                logger.warning(
                    "[LLM] Attempt %d failed (%s), retrying in %.1fs…",
                    attempt, e, delay,
                )
                time.sleep(delay)
                delay *= 2  # exponential backoff
            else:
                logger.error(
                    "[LLM] All %d attempts failed. Last error: %s",
                    total_attempts, e,
                )

    return "Unclassified"
165
+
166
+
167
+ # ── Batch classify (serial β€” LLM is already rate-limited) ────────────────────
168
def classify_batch_llm(log_msgs: list[str]) -> list[str]:
    """Classify each message with `classify_with_llm`, one after another.

    Returns the labels in the same order as *log_msgs*.
    """
    labels = []
    for message in log_msgs:
        labels.append(classify_with_llm(message))
    return labels
171
+
172
+
173
+ # ── CLI test ─────────────────────────────────────────────────────────────────
174
if __name__ == "__main__":
    # Manual smoke test: classify three sample logs, then repeat the first
    # one to show the cache-hit latency.
    logging.basicConfig(level=logging.INFO)

    test_logs = [
        "Case escalation for ticket ID 7324 failed because the assigned support agent is no longer active.",
        "The 'ReportGenerator' module will be retired in version 4.0. Migrate to 'AdvancedAnalyticsSuite'.",
        "System reboot initiated by user 12345.",  # should be Unclassified
    ]
    for sample in test_logs:
        outcome = classify_with_llm(sample)
        print(f"{outcome:25s} | {sample[:80]}")

    # Cache hit test
    print("\nβ”€β”€ Cache hit test β”€β”€")
    started = time.perf_counter()
    classify_with_llm(test_logs[0])
    finished = time.perf_counter()
    print(f"Cache hit latency: {(finished-started)*1000:.2f}ms")
    print(f"Cache stats: {get_cache_stats()}")
hf_space/processor_regex.py ADDED
@@ -0,0 +1,220 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ processor_regex.py — Tier 1: Rule-based Classifier
3
+
4
+ Target coverage: 40%+ (up from 15%)
5
+ Latency: sub-millisecond per log
6
+
7
+ New pattern groups added:
8
+ - HTTP request/response logs (was completely missing!)
9
+ - Auth / credential events (login failures, MFA, lockouts)
10
+ - System/infra events (disk, CPU, memory, cron)
11
+ - Network / firewall events (IP block, port scan)
12
+ - Structured error codes (ERROR, CRITICAL prefix logs)
13
+ """
14
+ from __future__ import annotations
15
+ import re
16
+ import time
17
+ from typing import Optional
18
+
19
+ # ---------------------------------------------------------------------------
20
+ # Pattern registry: (compiled_pattern, label)
21
+ # Order matters β€” more specific patterns FIRST to avoid mis-labeling.
22
+ # ---------------------------------------------------------------------------
23
# Pattern registry as (regex source, label) pairs. classify_with_regex returns
# the label of the FIRST pattern that matches, so order is significant: a more
# specific group must precede any broader group that could match the same line.
# For that reason "Critical Error" is listed before "Error"; otherwise a line
# such as "CRITICAL ERROR: database connection failed" would be caught by the
# generic ERROR pattern and labelled "Error".
_RAW_PATTERNS: list[tuple[str, str]] = [

    # -- HTTP Status ----------------------------------------------------------
    # Request line: method, path, protocol
    (r"\b(GET|POST|PUT|DELETE|PATCH|HEAD|OPTIONS)\s+\S+\s+HTTP/\d", "HTTP Status"),
    # Nova / OpenStack style
    (r"nova\.\S+\s+(GET|POST|PUT|DELETE)\s+\S+\s+HTTP/\d", "HTTP Status"),
    # Status code only: "returned HTTP 200" or "status: 404"
    (r"\bstatus[:\s]+\d{3}\b", "HTTP Status"),
    (r"\breturned\s+HTTP\s+\d{3}\b", "HTTP Status"),
    (r"\bHTTP\s+status\s+code\s*[:-]?\s*\d{3}\b", "HTTP Status"),
    # API response style
    (r"\bAPI\s+(call|request)\s+\S+\s+completed\s+with\s+status\s+\d{3}", "HTTP Status"),
    (r"\bEndpoint\s+\S+\s+responded\s+with\s+code\s+\d{3}", "HTTP Status"),

    # -- Security Alert -------------------------------------------------------
    # Brute force / login failures
    (r"(multiple\s+)?(bad\s+|failed?\s+)?login\s+(failure|attempt|failures)", "Security Alert"),
    (r"brute[\s_-]force\s+(login|attack|attempt)", "Security Alert"),
    # Unauthorized access
    (r"unauthorized\s+(access|admin|privilege|attempt)", "Security Alert"),
    (r"access\s+denied\s+(for|to)\s+(user|ip|host)", "Security Alert"),
    # Privilege escalation
    (r"(admin\s+)?access\s+escalation\s+detected", "Security Alert"),
    (r"privilege\s+(elev|escalat)", "Security Alert"),
    # IP blocking / suspicious traffic
    (r"IP\s+\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}\s+blocked", "Security Alert"),
    (r"(suspicious|anomalous)\s+(login|traffic|activity|request)", "Security Alert"),
    (r"potential\s+(DDoS|attack|breach|intrusion)", "Security Alert"),
    (r"security\s+breach\s+suspected", "Security Alert"),
    (r"(API\s+security\s+breach|bypass\s+API\s+security)", "Security Alert"),
    (r"port\s+scan\s+(detected|attempt)", "Security Alert"),

    # -- User Action ----------------------------------------------------------
    (r"User\s+\w+\d*\s+logged\s+(in|out)", "User Action"),
    (r"Account\s+(with\s+)?ID\s+\S+\s+created\s+by", "User Action"),
    (r"User\s+\w+\d*\s+(updated\s+profile|changed\s+password|enabled\s+two|downloaded|exported)", "User Action"),
    (r"(New\s+user|user\s+\w+\d*)\s+registered", "User Action"),
    (r"Account\s+\S+\s+deleted\s+by\s+(administrator|admin)", "User Action"),
    (r"User\s+\w+\d*\s+(tried|attempted)", "User Action"),

    # -- System Notification --------------------------------------------------
    # Backup events
    (r"Backup\s+(started|ended|completed\s+successfully|failed|aborted)", "System Notification"),
    (r"System\s+updated\s+to\s+version", "System Notification"),
    (r"File\s+\S+\s+uploaded\s+successfully\s+by\s+user", "System Notification"),
    (r"Disk\s+cleanup\s+completed\s+successfully", "System Notification"),
    (r"System\s+reboot\s+initiated\s+by\s+user", "System Notification"),
    (r"Scheduled\s+maintenance\s+(started|completed)", "System Notification"),
    (r"Service\s+\w+\s+restarted\s+successfully", "System Notification"),
    # Cache, cron, health check, certificate, log rotation
    (r"Cache\s+cleared\s+successfully", "System Notification"),
    (r"Log\s+rotation\s+completed", "System Notification"),
    (r"Health\s+check\s+(passed|failed)\s+for\s+service", "System Notification"),
    (r"Certificate\s+(renewed|expired|revoked)\s+successfully", "System Notification"),
    (r"Cron\s+job\s+\S+\s+(executed|failed|completed)\s+successfully", "System Notification"),
    (r"(Disk|Storage)\s+(usage|space)\s+(at|reached|exceeded)\s+\d+%", "System Notification"),
    (r"CPU\s+usage\s+at\s+\d+%", "System Notification"),
    (r"Memory\s+(usage|limit)\s+(at|reached|exceeded)\s+\d+%", "System Notification"),
    # Deployment / config
    (r"Deployment\s+(of|for)\s+\S+\s+(completed|failed|started)", "System Notification"),
    (r"Configuration\s+(reloaded|updated|applied)\s+successfully", "System Notification"),

    # -- Critical Error (must precede the broader Error group) ----------------
    (r"\bCRITICAL\b", "Critical Error"),
    # Leading \b keeps words such as "nonfatal" from matching.
    (r"\b(FATAL|PANIC)\b", "Critical Error"),
    (r"(data\s+loss|data\s+corruption)\s+(detected|occurred)", "Critical Error"),
    (r"(cluster|node|shard)\s+(failure|crashed|went\s+down)", "Critical Error"),
    (r"(catastrophic|unrecoverable)\s+(failure|error)", "Critical Error"),
    (r"kernel\s+panic", "Critical Error"),
    (r"out[\s-]of[\s-](memory|disk)\s+(error|killed|OOM)", "Critical Error"),

    # -- Error ----------------------------------------------------------------
    (r"\bERROR\b.*\b(exception|failed|failure|crash|timeout|unavailable)\b", "Error"),
    (r"System\s+crashed\s+due\s+to", "Error"),
    (r"(connection|request|task|job)\s+(timed?\s*out|timeout)", "Error"),
    (r"service\s+\S+\s+(is\s+down|unavailable|unreachable)", "Error"),
    (r"database\s+connection\s+(failed|refused|lost|dropped)", "Error"),
    (r"disk\s+(I/O\s+)?failure", "Error"),
    (r"driver\s+error(s)?\s+(when|during|on)", "Error"),
    (r"(replication|sync)\s+task\s+(did\s+not\s+complete|failed)", "Error"),
    (r"null\s+pointer|segmentation\s+fault|stack\s+overflow", "Error"),
]

# Compile every pattern once at import time (case-insensitive), not per call.
REGEX_PATTERNS: list[tuple[re.Pattern, str]] = [
    (re.compile(pat, re.IGNORECASE), label)
    for pat, label in _RAW_PATTERNS
]
112
+
113
+
114
def classify_with_regex(log_message: str) -> Optional[str]:
    """Tier 1 rule-based classifier.

    Scans REGEX_PATTERNS in order and returns the label of the first
    pattern found anywhere in *log_message*, or None when none matches.
    """
    matching_labels = (
        label for pattern, label in REGEX_PATTERNS if pattern.search(log_message)
    )
    return next(matching_labels, None)
124
+
125
+
126
def get_regex_coverage(log_messages: list[str]) -> dict:
    """Summarize how many of *log_messages* the regex tier can label.

    Returns:
        A dict with `total`, `matched`, `missed`, `coverage_pct` (rounded
        to two decimals, 0.0 for empty input) and `label_breakdown`, which
        maps each label to its match count.
    """
    breakdown: dict[str, int] = {}
    unmatched = 0

    for entry in log_messages:
        found = classify_with_regex(entry)
        if not found:
            unmatched += 1
            continue
        breakdown[found] = breakdown.get(found, 0) + 1

    n_total = len(log_messages)
    n_matched = n_total - unmatched
    if n_total:
        pct = round(n_matched / n_total * 100, 2)
    else:
        pct = 0.0

    return {
        "total": n_total,
        "matched": n_matched,
        "missed": unmatched,
        "coverage_pct": pct,
        "label_breakdown": breakdown,
    }
148
+
149
+
150
def benchmark_regex(log_messages: list[str], runs: int = 3) -> dict:
    """Measure per-message latency of `classify_with_regex`.

    Every message is timed individually, `runs` times over.

    Args:
        log_messages: Messages to classify.
        runs: Number of passes over *log_messages*.

    Returns:
        A dict with `p50_ms`, `p95_ms`, `p99_ms` and `mean_ms`, each in
        milliseconds and rounded to four decimals. All four are 0.0 when no
        sample was collected (empty input or `runs <= 0`).
    """
    import statistics
    per_log_ms: list[float] = []

    for _ in range(runs):
        for msg in log_messages:
            t0 = time.perf_counter()
            classify_with_regex(msg)
            per_log_ms.append((time.perf_counter() - t0) * 1000)

    if not per_log_ms:
        # Nothing measured: statistics.median([]) and indexing would raise.
        return {"p50_ms": 0.0, "p95_ms": 0.0, "p99_ms": 0.0, "mean_ms": 0.0}

    per_log_ms.sort()
    last = len(per_log_ms) - 1
    return {
        "p50_ms": round(statistics.median(per_log_ms), 4),
        "p95_ms": round(per_log_ms[min(int(len(per_log_ms) * 0.95), last)], 4),
        "p99_ms": round(per_log_ms[min(int(len(per_log_ms) * 0.99), last)], 4),
        "mean_ms": round(statistics.mean(per_log_ms), 4),
    }
168
+
169
+
170
+ # ── CLI self-test ────────────────────────────────────────────────────────────
171
if __name__ == "__main__":
    # Self-test: compare predicted and expected labels for known samples,
    # then print coverage and latency figures for the same messages.
    test_cases: list[tuple[str, str]] = [
        # HTTP
        ("GET /api/v2/resource HTTP/1.1 status: 200 len: 1583 time: 0.19", "HTTP Status"),
        ("POST /v1/users HTTP/1.1 status: 201 len: 42 time: 0.05", "HTTP Status"),
        ("nova.osapi_compute.wsgi.server GET /v2/servers/detail HTTP/1.1 status: 404", "HTTP Status"),
        # Security
        ("Multiple login failures occurred on user 6454 account", "Security Alert"),
        ("IP 192.168.133.114 blocked due to potential attack", "Security Alert"),
        ("Brute force login attempt from 10.0.0.5 detected", "Security Alert"),
        ("Admin access escalation detected for user 9429", "Security Alert"),
        # User Action
        ("User User12345 logged in.", "User Action"),
        ("Account with ID 456 created by Admin.", "User Action"),
        # System Notification
        ("Backup completed successfully.", "System Notification"),
        ("CPU usage at 98% for the last 10 minutes on node-7", "System Notification"),
        ("Health check passed for service payments-api", "System Notification"),
        # Error
        ("System crashed due to disk I/O failure on node-3", "Error"),
        ("Database connection failed after 3 retries", "Error"),
        # Critical
        ("CRITICAL: data corruption detected on shard-14", "Critical Error"),
        ("kernel panic: not syncing: VFS: unable to mount root fs", "Critical Error"),
        # Should be None (unmatched)
        ("The 'BulkEmailSender' feature will be deprecated in v5.0.", None),
        ("Case escalation for ticket 7324 failed.", None),
    ]

    n_correct = 0
    print(f"{'Expected':<22} {'Got':<22} {'βœ“/βœ—'} | Log")
    print("β”€" * 100)
    for sample, wanted in test_cases:
        actual = classify_with_regex(sample)
        matches = actual == wanted
        n_correct += matches
        mark = "βœ“" if matches else "βœ—"
        print(f"{str(wanted):<22} {str(actual):<22} {mark} | {sample[:55]}")

    print(f"\n{n_correct}/{len(test_cases)} correct")

    # Coverage demo
    all_logs = [sample for sample, _ in test_cases]
    coverage = get_regex_coverage(all_logs)
    print(f"\nCoverage: {coverage['coverage_pct']}% ({coverage['matched']}/{coverage['total']} matched)")
    print("Label breakdown:", coverage["label_breakdown"])

    # Latency benchmark
    latency = benchmark_regex(all_logs * 100)
    print(f"\nLatency (p50/p95/p99): {latency['p50_ms']}ms / {latency['p95_ms']}ms / {latency['p99_ms']}ms")
hf_space/requirements.txt ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Core
2
+ gradio>=4.44.0
3
+ pandas>=2.0.0
4
+ numpy>=1.24.0
5
+ joblib>=1.3.0
6
+ scikit-learn>=1.3.0
7
+
8
+ # Embedding + BERT
9
+ sentence-transformers>=2.7.0
10
+ transformers>=4.38.0
11
+
12
+ # ONNX (optional, 3-5x speedup)
13
+ onnxruntime>=1.17.0
14
+ optimum[onnxruntime]>=1.16.0
15
+
16
+ # LLM
17
+ huggingface-hub>=0.21.0
18
+
19
+ # FastAPI (production API)
20
+ fastapi>=0.110.0
21
+ uvicorn[standard]>=0.29.0
22
+ pydantic>=2.0.0
23
+
24
+ # Observability
25
+ psutil>=5.9.0