NOT-OMEGA commited on
Commit
4561114
Β·
verified Β·
1 Parent(s): 0c1b7a7

Upload 10 files

Browse files
HF/Dockerfile ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ FROM python:3.11-slim
2
+
3
+ # Build args
4
+ ARG PORT=8000
5
+
6
+ # System deps
7
+ RUN apt-get update && apt-get install -y --no-install-recommends \
8
+ curl \
9
+ && rm -rf /var/lib/apt/lists/*
10
+
11
+ WORKDIR /app
12
+
13
+ # Install Python deps first (layer cache)
14
+ COPY requirements.txt .
15
+ RUN pip install --no-cache-dir -r requirements.txt
16
+
17
+ # Copy source
18
+ COPY . .
19
+
20
+ # Non-root user
21
+ RUN adduser --disabled-password --gecos "" appuser \
22
+ && chown -R appuser:appuser /app
23
+ USER appuser
24
+
25
+ # Health check
26
+ HEALTHCHECK --interval=30s --timeout=10s --start-period=60s --retries=3 \
27
+ CMD curl -f http://localhost:${PORT}/health || exit 1
28
+
29
+ EXPOSE ${PORT}
30
+
31
+ # Production: single worker (CPU-bound inference β€” scale via replicas, not threads)
32
+ CMD ["uvicorn", "api:app", "--host", "0.0.0.0", "--port", "8000", "--workers", "1", "--log-level", "info"]
HF/api.py ADDED
@@ -0,0 +1,314 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ api.py β€” Async FastAPI Inference Service
3
+
4
+ Endpoints:
5
+ POST /classify β€” Single log
6
+ POST /classify/batch β€” Batch of logs (up to 512)
7
+ GET /health β€” Liveness check
8
+ GET /ready β€” Readiness check (model loaded?)
9
+ GET /metrics β€” Request counts, throughput, latency stats
10
+
11
+ Features:
12
+ - Async request handling (non-blocking)
13
+ - Worker pool via asyncio semaphore (bounded concurrency)
14
+ - Structured JSON logs with request_id
15
+ - Rate limiting (configurable)
16
+ - Request ID tracing
17
+ - Batch queue aggregation for small requests
18
+
19
+ Run:
20
+ uvicorn api:app --host 0.0.0.0 --port 8000 --workers 1
21
+
22
+ Example:
23
+ curl -X POST http://localhost:8000/classify \
24
+ -H "Content-Type: application/json" \
25
+ -d '{"source": "ModernCRM", "log_message": "User User123 logged in."}'
26
+ """
27
+ from __future__ import annotations
28
+ import asyncio
29
+ import logging
30
+ import os
31
+ import time
32
+ import uuid
33
+ import statistics
34
+ from collections import deque
35
+ from contextlib import asynccontextmanager
36
+ from typing import Optional
37
+
38
+ from fastapi import FastAPI, HTTPException, Request, status
39
+ from fastapi.middleware.cors import CORSMiddleware
40
+ from fastapi.responses import JSONResponse
41
+ from pydantic import BaseModel, Field, field_validator
42
+
43
+ # ── Logging setup ─────────────────────────────────────────────────────────────
44
+ logging.basicConfig(
45
+ level=logging.INFO,
46
+ format='{"time":"%(asctime)s","level":"%(levelname)s","logger":"%(name)s","msg":"%(message)s"}'
47
+ )
48
+ logger = logging.getLogger("log-classifier-api")
49
+
50
+ # ── Config ─────────────────────────────────────────────────────────────────────
51
+ MAX_BATCH_SIZE = int(os.getenv("MAX_BATCH_SIZE", "512"))
52
+ MAX_CONCURRENT = int(os.getenv("MAX_CONCURRENT", "4")) # concurrency cap
53
+ RATE_LIMIT_PER_MIN = int(os.getenv("RATE_LIMIT_PER_MIN", "1000"))
54
+ LOG_MAX_CHARS = 2048 # truncate huge logs before classify
55
+
56
+ # ── Global state ───────────────────────────────────────────────────────────────
57
+ _semaphore: asyncio.Semaphore = None # type: ignore
58
+ _model_ready: bool = False
59
+
60
+ # Metrics ring buffer (last 1000 requests)
61
+ _latencies_ms: deque = deque(maxlen=1000)
62
+ _request_count = 0
63
+ _error_count = 0
64
+ _start_time = time.time()
65
+
66
+ # Rate limiter (simple sliding window per process)
67
+ _rate_window: deque = deque(maxlen=RATE_LIMIT_PER_MIN)
68
+
69
+
70
+ # ── Lifespan: load models on startup ──────────────────────────────────────────
71
+ @asynccontextmanager
72
+ async def lifespan(app: FastAPI):
73
+ global _semaphore, _model_ready
74
+
75
+ logger.info("Starting up β€” loading models…")
76
+ _semaphore = asyncio.Semaphore(MAX_CONCURRENT)
77
+
78
+ # Load models in a thread pool (blocking I/O, don't block event loop)
79
+ loop = asyncio.get_event_loop()
80
+ try:
81
+ await loop.run_in_executor(None, _load_models_blocking)
82
+ _model_ready = True
83
+ logger.info("βœ… Models loaded β€” API ready")
84
+ except Exception as e:
85
+ logger.error(f"❌ Model load failed: {e}")
86
+ # Service starts but /ready will return 503
87
+
88
+ yield
89
+
90
+ logger.info("Shutting down")
91
+
92
+
93
+ def _load_models_blocking():
94
+ """Load BERT + classifier (blocks β€” run in executor)."""
95
+ from processor_bert import classify_batch as _
96
+ logger.info("BERT model loaded")
97
+
98
+
99
+ # ── App factory ────────────────────────────────────────────────────────────────
100
+ app = FastAPI(
101
+ title="Log Classification API",
102
+ description="3-tier hybrid pipeline: Regex β†’ BERT β†’ LLM",
103
+ version="3.0.0",
104
+ lifespan=lifespan,
105
+ )
106
+
107
+ app.add_middleware(
108
+ CORSMiddleware,
109
+ allow_origins=["*"],
110
+ allow_methods=["*"],
111
+ allow_headers=["*"],
112
+ )
113
+
114
+
115
+ # ── Request / Response schemas ─────────────────────────────────────────────────
116
+ class LogRequest(BaseModel):
117
+ source: str = Field(..., example="ModernCRM")
118
+ log_message: str = Field(..., example="User User123 logged in.", min_length=1)
119
+
120
+ @field_validator("log_message")
121
+ @classmethod
122
+ def truncate_long_logs(cls, v: str) -> str:
123
+ return v[:LOG_MAX_CHARS]
124
+
125
+
126
+ class LogResponse(BaseModel):
127
+ request_id: str
128
+ label: str
129
+ tier: str
130
+ confidence: Optional[float]
131
+ latency_ms: float
132
+ cached: bool = False
133
+
134
+
135
+ class BatchRequest(BaseModel):
136
+ logs: list[LogRequest] = Field(..., max_length=MAX_BATCH_SIZE)
137
+
138
+
139
+ class BatchResponse(BaseModel):
140
+ request_id: str
141
+ total: int
142
+ elapsed_ms: float
143
+ throughput: float
144
+ results: list[LogResponse]
145
+
146
+
147
+ class HealthResponse(BaseModel):
148
+ status: str
149
+ uptime_s: float
150
+
151
+
152
+ class MetricsResponse(BaseModel):
153
+ total_requests: int
154
+ total_errors: int
155
+ uptime_s: float
156
+ requests_per_min: float
157
+ latency_p50_ms: Optional[float]
158
+ latency_p95_ms: Optional[float]
159
+ latency_p99_ms: Optional[float]
160
+
161
+
162
+ # ── Rate limiter ───────────────────────────────────────────────────────────────
163
+ def _check_rate_limit() -> None:
164
+ now = time.time()
165
+ _rate_window.append(now)
166
+ # Window = last 60 seconds
167
+ recent = [t for t in _rate_window if now - t < 60]
168
+ if len(recent) > RATE_LIMIT_PER_MIN:
169
+ raise HTTPException(
170
+ status_code=status.HTTP_429_TOO_MANY_REQUESTS,
171
+ detail=f"Rate limit exceeded: {RATE_LIMIT_PER_MIN} req/min",
172
+ )
173
+
174
+
175
+ # ── Middleware: request logging ────────────────────────────────────────────────
176
+ @app.middleware("http")
177
+ async def log_requests(request: Request, call_next):
178
+ rid = request.headers.get("X-Request-ID", str(uuid.uuid4())[:8])
179
+ request.state.request_id = rid
180
+ t0 = time.perf_counter()
181
+ response = await call_next(request)
182
+ elapsed = (time.perf_counter() - t0) * 1000
183
+ logger.info(
184
+ f"method={request.method} path={request.url.path} "
185
+ f"status={response.status_code} latency={elapsed:.1f}ms rid={rid}"
186
+ )
187
+ response.headers["X-Request-ID"] = rid
188
+ return response
189
+
190
+
191
+ # ── Health & readiness ─────────────────────────────────────────────────────────
192
+ @app.get("/health", response_model=HealthResponse, tags=["ops"])
193
+ async def health():
194
+ return {"status": "ok", "uptime_s": round(time.time() - _start_time, 1)}
195
+
196
+
197
+ @app.get("/ready", tags=["ops"])
198
+ async def ready():
199
+ if not _model_ready:
200
+ raise HTTPException(status_code=503, detail="Models not yet loaded")
201
+ return {"status": "ready"}
202
+
203
+
204
+ # ── Metrics ────────────────────────────────────────────────────────────────────
205
+ @app.get("/metrics", response_model=MetricsResponse, tags=["ops"])
206
+ async def metrics():
207
+ uptime = time.time() - _start_time
208
+ lats = sorted(_latencies_ms) if _latencies_ms else []
209
+ n = len(lats)
210
+
211
+ def pct(p):
212
+ return round(lats[min(int(n * p), n - 1)], 2) if n else None
213
+
214
+ return {
215
+ "total_requests": _request_count,
216
+ "total_errors": _error_count,
217
+ "uptime_s": round(uptime, 1),
218
+ "requests_per_min": round(_request_count / max(uptime / 60, 1), 1),
219
+ "latency_p50_ms": pct(0.50),
220
+ "latency_p95_ms": pct(0.95),
221
+ "latency_p99_ms": pct(0.99),
222
+ }
223
+
224
+
225
+ # ── Classify single ────────────────────────────────────────────────────────────
226
+ @app.post("/classify", response_model=LogResponse, tags=["inference"])
227
+ async def classify_single(req: LogRequest, request: Request):
228
+ global _request_count, _error_count
229
+ _check_rate_limit()
230
+ _request_count += 1
231
+ rid = getattr(request.state, "request_id", str(uuid.uuid4())[:8])
232
+
233
+ async with _semaphore:
234
+ loop = asyncio.get_event_loop()
235
+ t0 = time.perf_counter()
236
+ try:
237
+ result = await loop.run_in_executor(
238
+ None, _classify_blocking, req.source, req.log_message
239
+ )
240
+ except Exception as e:
241
+ _error_count += 1
242
+ logger.error(f"rid={rid} classify error: {e}")
243
+ raise HTTPException(status_code=500, detail=str(e))
244
+
245
+ latency = (time.perf_counter() - t0) * 1000
246
+ _latencies_ms.append(latency)
247
+
248
+ return LogResponse(
249
+ request_id = rid,
250
+ label = result["label"],
251
+ tier = result["tier"],
252
+ confidence = result.get("confidence"),
253
+ latency_ms = round(latency, 2),
254
+ )
255
+
256
+
257
+ def _classify_blocking(source: str, log_message: str) -> dict:
258
+ from classify import classify_log
259
+ return classify_log(source, log_message)
260
+
261
+
262
+ # ── Classify batch ─────────────────────────────────────────────────────────────
263
+ @app.post("/classify/batch", response_model=BatchResponse, tags=["inference"])
264
+ async def classify_batch_endpoint(req: BatchRequest, request: Request):
265
+ global _request_count, _error_count
266
+ _check_rate_limit()
267
+ _request_count += 1
268
+ rid = getattr(request.state, "request_id", str(uuid.uuid4())[:8])
269
+
270
+ log_pairs = [(r.source, r.log_message) for r in req.logs]
271
+
272
+ async with _semaphore:
273
+ loop = asyncio.get_event_loop()
274
+ t0 = time.perf_counter()
275
+ try:
276
+ results = await loop.run_in_executor(
277
+ None, _classify_batch_blocking, log_pairs
278
+ )
279
+ except Exception as e:
280
+ _error_count += 1
281
+ logger.error(f"rid={rid} batch error: {e}")
282
+ raise HTTPException(status_code=500, detail=str(e))
283
+
284
+ elapsed_ms = (time.perf_counter() - t0) * 1000
285
+ throughput = round(len(log_pairs) / (elapsed_ms / 1000), 1)
286
+ _latencies_ms.extend([elapsed_ms / len(log_pairs)] * len(log_pairs))
287
+
288
+ return BatchResponse(
289
+ request_id = rid,
290
+ total = len(log_pairs),
291
+ elapsed_ms = round(elapsed_ms, 2),
292
+ throughput = throughput,
293
+ results = [
294
+ LogResponse(
295
+ request_id = rid,
296
+ label = r["label"],
297
+ tier = r["tier"],
298
+ confidence = r.get("confidence"),
299
+ latency_ms = round(elapsed_ms / len(log_pairs), 2),
300
+ )
301
+ for r in results
302
+ ],
303
+ )
304
+
305
+
306
+ def _classify_batch_blocking(log_pairs: list[tuple[str, str]]) -> list[dict]:
307
+ from classify import classify_logs
308
+ return classify_logs(log_pairs)
309
+
310
+
311
+ # ── Dev runner ──────────────────────────────────────────────────────────────────
312
+ if __name__ == "__main__":
313
+ import uvicorn
314
+ uvicorn.run("api:app", host="0.0.0.0", port=8000, reload=False, workers=1)
HF/app_gradio.py ADDED
@@ -0,0 +1,187 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Log Classification System β€” HuggingFace Spaces
3
+ Gradio UI for the 3-tier hybrid log classification pipeline.
4
+ """
5
+ from __future__ import annotations
6
+ import io
7
+ import time
8
+ import pandas as pd
9
+ import gradio as gr
10
+ from classify import classify_log, classify_csv
11
+
12
+ # ── Source options ──────────────────────────────────────────────────────────
13
+ SOURCES = [
14
+ "ModernCRM",
15
+ "ModernHR",
16
+ "BillingSystem",
17
+ "AnalyticsEngine",
18
+ "ThirdPartyAPI",
19
+ "LegacyCRM",
20
+ ]
21
+
22
+ TIER_COLORS = {
23
+ "Regex": "🟒",
24
+ "BERT": "πŸ”΅",
25
+ "LLM": "🟑",
26
+ "LLM (fallback)": "🟠",
27
+ }
28
+
29
+ EXAMPLE_LOGS = [
30
+ ["ModernCRM", "User User12345 logged in."],
31
+ ["ModernHR", "Multiple login failures occurred on user 6454 account"],
32
+ ["BillingSystem", "GET /v2/servers/detail HTTP/1.1 status: 200 len: 1583 time: 0.19"],
33
+ ["AnalyticsEngine", "System crashed due to disk I/O failure on node-3"],
34
+ ["LegacyCRM", "Case escalation for ticket ID 7324 failed β€” support agent is no longer active."],
35
+ ["LegacyCRM", "The 'BulkEmailSender' feature will be deprecated in v5.0. Use 'EmailCampaignManager'."],
36
+ ]
37
+
38
+
39
+ # ── Single log tab ──────────────────────────────────────────────────────────
40
+ def classify_single(source: str, log_message: str):
41
+ if not log_message.strip():
42
+ return "β€”", "β€”", "β€”", "β€”"
43
+
44
+ t0 = time.perf_counter()
45
+ result = classify_log(source, log_message)
46
+ latency_ms = (time.perf_counter() - t0) * 1000
47
+
48
+ label = result["label"]
49
+ tier = result["tier"]
50
+ confidence = f"{result['confidence']:.1%}" if result["confidence"] is not None else "N/A"
51
+ icon = TIER_COLORS.get(tier, "βšͺ")
52
+
53
+ return (
54
+ label,
55
+ f"{icon} {tier}",
56
+ confidence,
57
+ f"{latency_ms:.1f} ms",
58
+ )
59
+
60
+
61
+ # ── Batch CSV tab ───────────────────────────────────────────────────────────
62
+ def classify_batch(file):
63
+ if file is None:
64
+ return None, "⚠️ Please upload a CSV file."
65
+
66
+ try:
67
+ output_path, df = classify_csv(file.name, "/tmp/classified_output.csv")
68
+ except ValueError as e:
69
+ return None, f"⚠️ {e}"
70
+ except Exception as e:
71
+ return None, f"❌ Error: {e}"
72
+
73
+ total = len(df)
74
+ tier_counts = df["tier_used"].value_counts().to_dict()
75
+ label_counts = df["predicted_label"].value_counts().to_dict()
76
+
77
+ tier_lines = "\n".join(f" {TIER_COLORS.get(k,'βšͺ')} {k}: {v} ({v/total:.0%})" for k, v in tier_counts.items())
78
+ label_lines = "\n".join(f" β€’ {k}: {v}" for k, v in label_counts.items())
79
+
80
+ stats = (
81
+ f"βœ… Classified {total} logs\n\n"
82
+ f"πŸ“Š Tier breakdown:\n{tier_lines}\n\n"
83
+ f"🏷️ Label distribution:\n{label_lines}"
84
+ )
85
+
86
+ return output_path, stats
87
+
88
+
89
+ # ── UI ──────────────────────────────────────────────────────────────────────
90
+ with gr.Blocks(title="Log Classification System", theme=gr.themes.Soft()) as demo:
91
+
92
+ gr.Markdown("""
93
+ # πŸ” Log Classification System
94
+ **3-tier hybrid pipeline** β†’ 🟒 Regex Β· πŸ”΅ BERT + LogReg Β· 🟑 LLM
95
+ Built to mimic production enterprise log monitoring architecture.
96
+ """)
97
+
98
+ with gr.Tabs():
99
+
100
+ # ── Tab 1: Single Log ────────────────────────────────────────────
101
+ with gr.Tab("Single Log"):
102
+ with gr.Row():
103
+ source_input = gr.Dropdown(
104
+ choices=SOURCES,
105
+ value="ModernCRM",
106
+ label="Source System",
107
+ )
108
+ log_input = gr.Textbox(
109
+ label="Log Message",
110
+ placeholder="Paste a log message here...",
111
+ lines=3,
112
+ )
113
+
114
+ classify_btn = gr.Button("Classify", variant="primary")
115
+
116
+ with gr.Row():
117
+ label_out = gr.Textbox(label="🏷️ Predicted Label", interactive=False)
118
+ tier_out = gr.Textbox(label="βš™οΈ Tier Used", interactive=False)
119
+ confidence_out = gr.Textbox(label="πŸ“ˆ Confidence", interactive=False)
120
+ latency_out = gr.Textbox(label="⏱️ Latency", interactive=False)
121
+
122
+ classify_btn.click(
123
+ fn=classify_single,
124
+ inputs=[source_input, log_input],
125
+ outputs=[label_out, tier_out, confidence_out, latency_out],
126
+ )
127
+
128
+ gr.Examples(
129
+ examples=EXAMPLE_LOGS,
130
+ inputs=[source_input, log_input],
131
+ label="πŸ“‹ Example Logs (click to try)",
132
+ )
133
+
134
+ # ── Tab 2: Batch CSV ─────────────────────────────────────────────
135
+ with gr.Tab("Batch CSV Upload"):
136
+ gr.Markdown("""
137
+ Upload a CSV with columns: **`source`**, **`log_message`**
138
+ Download the classified CSV with added columns: `predicted_label`, `tier_used`, `confidence`.
139
+ """)
140
+ with gr.Row():
141
+ with gr.Column():
142
+ csv_input = gr.File(label="πŸ“‚ Upload CSV", file_types=[".csv"])
143
+ batch_btn = gr.Button("Classify All", variant="primary")
144
+ with gr.Column():
145
+ csv_output = gr.File(label="πŸ“₯ Download Classified CSV")
146
+ stats_out = gr.Textbox(label="πŸ“Š Stats", lines=12, interactive=False)
147
+
148
+ batch_btn.click(
149
+ fn=classify_batch,
150
+ inputs=[csv_input],
151
+ outputs=[csv_output, stats_out],
152
+ )
153
+
154
+ gr.Markdown("""
155
+ **Sample CSV format:**
156
+ ```
157
+ source,log_message
158
+ ModernCRM,User User123 logged in.
159
+ LegacyCRM,Case escalation for ticket ID 7324 failed.
160
+ BillingSystem,GET /api/v2/invoice HTTP/1.1 status: 500
161
+ ```
162
+ """)
163
+
164
+ # ── Tab 3: Architecture ──────────────────────────────────────────
165
+ with gr.Tab("Architecture"):
166
+ gr.Markdown("""
167
+ ## πŸ—οΈ 3-Tier Hybrid Pipeline
168
+
169
+ | Tier | Method | Coverage | Latency | When Used |
170
+ |------|--------|----------|---------|-----------|
171
+ | 🟒 Regex | Python `re` patterns | ~21% | < 1ms | Fixed patterns (login, backup, etc.) |
172
+ | πŸ”΅ BERT | `all-MiniLM-L6-v2` + LogReg | ~79% | 20–80ms | High-volume categories with 150+ samples |
173
+ | 🟑 LLM | HuggingFace Inference API | ~0.3% | 500–2000ms | LegacyCRM logs, rare patterns |
174
+
175
+ ## πŸ“Š Model Performance (from training)
176
+ - **BERT + LogReg** trained on 2,410 synthetic enterprise logs
177
+ - **Confidence threshold**: 0.5 (below β†’ escalate to LLM)
178
+ - **Source-aware routing**: `LegacyCRM` bypasses ML entirely (only 7 training samples)
179
+
180
+ ## πŸ”‘ Environment Variables
181
+ | Secret | Required For |
182
+ |--------|-------------|
183
+ | `HF_TOKEN` | LLM inference (LegacyCRM logs) |
184
+ """)
185
+
186
+ if __name__ == "__main__":
187
+ demo.launch()
HF/benchmark.py ADDED
@@ -0,0 +1,214 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ benchmark.py β€” Full Benchmark Harness
3
+
4
+ Outputs a CSV with columns:
5
+ batch_size, mode, throughput_logs_sec, p50_ms, p95_ms, p99_ms, cpu_pct, ram_mb, tier_regex_pct, tier_bert_pct, tier_llm_pct
6
+
7
+ Usage:
8
+ python benchmark.py --logs 5000 --output benchmark_results.csv
9
+
10
+ What it measures:
11
+ - Batch size sweep: 1, 8, 16, 32, 64, 128
12
+ - Throughput (logs/sec)
13
+ - Latency: p50 / p95 / p99 (per-log)
14
+ - CPU and RAM during inference
15
+ - Tier distribution (Regex % / BERT % / LLM %)
16
+
17
+ Google interview talking point:
18
+ "I designed a benchmark harness that sweeps batch sizes and measures
19
+ latency percentiles + resource utilization, so I can show the
20
+ throughput-latency tradeoff curve empirically."
21
+ """
22
+ from __future__ import annotations
23
+ import argparse
24
+ import csv
25
+ import os
26
+ import random
27
+ import sys
28
+ import time
29
+ import statistics
30
+ from pathlib import Path
31
+
32
+ import psutil
33
+
34
+ # ── Synthetic log generator (no external deps needed) ────────────────────────
35
+ SOURCES = ["ModernCRM", "ModernHR", "BillingSystem", "AnalyticsEngine", "ThirdPartyAPI"]
36
+
37
+ _LOG_TEMPLATES = [
38
+ ("ModernCRM", "User User{id} logged in."),
39
+ ("ModernCRM", "IP {ip} blocked due to potential attack"),
40
+ ("ModernHR", "Multiple login failures occurred on user {id} account"),
41
+ ("ModernHR", "Admin access escalation detected for user {id}"),
42
+ ("BillingSystem", "GET /api/v2/invoices HTTP/1.1 status: {code} len: {len} time: {t}"),
43
+ ("BillingSystem", "POST /api/v1/payments HTTP/1.1 status: {code} len: {len} time: {t}"),
44
+ ("AnalyticsEngine", "System crashed due to disk I/O failure on node-{n}"),
45
+ ("AnalyticsEngine", "Backup completed successfully."),
46
+ ("ThirdPartyAPI", "Service payments-api is unreachable after 3 retries"),
47
+ ("ThirdPartyAPI", "CPU usage at {pct}% for the last 10 minutes on node-{n}"),
48
+ ("AnalyticsEngine", "CRITICAL: data corruption detected on shard-{n}"),
49
+ ("ModernCRM", "Health check passed for service {svc}"),
50
+ ]
51
+
52
+ def _rand_ip():
53
+ return f"{random.randint(10,192)}.{random.randint(0,255)}.{random.randint(0,255)}.{random.randint(1,254)}"
54
+
55
+ def _fill(template: str) -> str:
56
+ return (template
57
+ .replace("{id}", str(random.randint(100, 99999)))
58
+ .replace("{ip}", _rand_ip())
59
+ .replace("{code}", random.choice(["200", "201", "400", "404", "500", "503"]))
60
+ .replace("{len}", str(random.randint(100, 9999)))
61
+ .replace("{t}", f"{random.uniform(0.01, 2.5):.2f}")
62
+ .replace("{n}", str(random.randint(1, 20)))
63
+ .replace("{pct}", str(random.randint(60, 99)))
64
+ .replace("{svc}", random.choice(["auth-api", "billing", "analytics", "events"]))
65
+ )
66
+
67
+ def generate_logs(n: int) -> list[tuple[str, str]]:
68
+ random.seed(42)
69
+ return [
70
+ (src, _fill(tmpl))
71
+ for src, tmpl in random.choices(_LOG_TEMPLATES, k=n)
72
+ ]
73
+
74
+
75
+ # ── Benchmark runner ─────────────────────────────────────────────────────────
76
+ def run_benchmark(
77
+ logs: list[tuple[str, str]],
78
+ batch_sizes: list[int],
79
+ output_csv: str,
80
+ warmup_n: int = 50,
81
+ ) -> list[dict]:
82
+ from classify import classify_logs, pipeline_summary
83
+
84
+ proc = psutil.Process(os.getpid())
85
+ rows: list[dict] = []
86
+
87
+ # Warmup (model load, JIT, etc.)
88
+ print(f"πŸ”₯ Warming up with {warmup_n} logs…")
89
+ classify_logs(logs[:warmup_n])
90
+
91
+ for bs in batch_sizes:
92
+ # Slice logs into batches of size `bs`
93
+ batches = [logs[i:i + bs] for i in range(0, len(logs), bs)]
94
+ if not batches:
95
+ continue
96
+
97
+ per_log_latencies: list[float] = []
98
+ cpu_samples: list[float] = []
99
+ ram_samples: list[float] = []
100
+ all_results: list[dict] = []
101
+
102
+ print(f"\nπŸ“ Batch size = {bs} ({len(batches)} batches Γ— {bs} logs)…")
103
+
104
+ wall_start = time.perf_counter()
105
+
106
+ for batch in batches:
107
+ t0 = time.perf_counter()
108
+ results = classify_logs(batch)
109
+ t1 = time.perf_counter()
110
+ batch_ms = (t1 - t0) * 1000
111
+ per_log_ms = batch_ms / len(batch)
112
+
113
+ per_log_latencies.extend([per_log_ms] * len(batch))
114
+ all_results.extend(results)
115
+
116
+ # Resource snapshot
117
+ cpu_samples.append(proc.cpu_percent(interval=None))
118
+ ram_samples.append(proc.memory_info().rss / 1_048_576) # MB
119
+
120
+ wall_elapsed = time.perf_counter() - wall_start
121
+ total_logs = len(logs)
122
+ throughput = round(total_logs / wall_elapsed, 1)
123
+
124
+ per_log_latencies.sort()
125
+ n = len(per_log_latencies)
126
+
127
+ summary = pipeline_summary(all_results)
128
+ tier_stats = summary["tier_stats"]
129
+
130
+ def tier_pct(name):
131
+ return tier_stats.get(name, {}).get("pct", 0.0)
132
+
133
+ row = {
134
+ "batch_size": bs,
135
+ "total_logs": total_logs,
136
+ "elapsed_sec": round(wall_elapsed, 2),
137
+ "throughput_logs_sec": throughput,
138
+ "p50_ms": round(statistics.median(per_log_latencies), 3),
139
+ "p95_ms": round(per_log_latencies[min(int(n * 0.95), n - 1)], 3),
140
+ "p99_ms": round(per_log_latencies[min(int(n * 0.99), n - 1)], 3),
141
+ "mean_ms": round(statistics.mean(per_log_latencies), 3),
142
+ "cpu_mean_pct": round(statistics.mean(cpu_samples), 1) if cpu_samples else 0,
143
+ "cpu_max_pct": round(max(cpu_samples), 1) if cpu_samples else 0,
144
+ "ram_mean_mb": round(statistics.mean(ram_samples), 1) if ram_samples else 0,
145
+ "ram_max_mb": round(max(ram_samples), 1) if ram_samples else 0,
146
+ "tier_regex_pct": tier_pct("Regex"),
147
+ "tier_bert_pct": tier_pct("BERT"),
148
+ "tier_llm_pct": tier_pct("LLM") + tier_pct("LLM (fallback)"),
149
+ }
150
+ rows.append(row)
151
+
152
+ print(f" βœ… Throughput: {throughput} logs/sec | "
153
+ f"p50={row['p50_ms']}ms p95={row['p95_ms']}ms p99={row['p99_ms']}ms | "
154
+ f"CPU={row['cpu_mean_pct']}% RAM={row['ram_mean_mb']}MB")
155
+ print(f" πŸ“Š Tiers: Regex={row['tier_regex_pct']}% "
156
+ f"BERT={row['tier_bert_pct']}% "
157
+ f"LLM={row['tier_llm_pct']}%")
158
+
159
+ # Write CSV
160
+ Path(output_csv).parent.mkdir(parents=True, exist_ok=True)
161
+ with open(output_csv, "w", newline="") as f:
162
+ writer = csv.DictWriter(f, fieldnames=rows[0].keys())
163
+ writer.writeheader()
164
+ writer.writerows(rows)
165
+
166
+ print(f"\nβœ… Benchmark results saved β†’ {output_csv}")
167
+ return rows
168
+
169
+
170
+ # ── Scaling stress test ──────────────────────────────────────────────────────
171
+ def stress_test(sizes: list[int] = [5_000, 20_000, 50_000, 100_000]) -> None:
172
+ """Quick throughput check at different total log counts."""
173
+ from classify import classify_logs
174
+
175
+ print("\nπŸ”₯ Stress Test β€” Scaling")
176
+ print(f"{'N logs':>10} {'Elapsed(s)':>12} {'Throughput':>12} {'p95_ms':>10}")
177
+ print("─" * 50)
178
+
179
+ for n in sizes:
180
+ logs = generate_logs(n)
181
+ t0 = time.perf_counter()
182
+ classify_logs(logs)
183
+ elapsed = time.perf_counter() - t0
184
+ tput = n / elapsed
185
+ # Rough p95 approximation: time / n * correction factor
186
+ p95_approx = (elapsed / n * 1000) * 1.5
187
+ print(f"{n:>10,} {elapsed:>12.2f}s {tput:>12.1f}/s {p95_approx:>10.1f}ms")
188
+
189
+
190
+ # ── CLI ──────────────────────────────────────────────────────────────────────
191
+ def main():
192
+ parser = argparse.ArgumentParser(description="Log pipeline benchmark harness")
193
+ parser.add_argument("--logs", type=int, default=5_000,
194
+ help="Number of logs to benchmark (default: 5000)")
195
+ parser.add_argument("--output", default="benchmark_results.csv",
196
+ help="Output CSV path")
197
+ parser.add_argument("--stress", action="store_true",
198
+ help="Run scaling stress test (5k, 20k, 50k, 100k)")
199
+ parser.add_argument("--batches", default="1,8,16,32,64,128",
200
+ help="Comma-separated batch sizes to sweep")
201
+ args = parser.parse_args()
202
+
203
+ batch_sizes = [int(x) for x in args.batches.split(",")]
204
+ logs = generate_logs(args.logs)
205
+ print(f"πŸ“¦ Generated {len(logs):,} synthetic logs")
206
+
207
+ run_benchmark(logs, batch_sizes, args.output)
208
+
209
+ if args.stress:
210
+ stress_test()
211
+
212
+
213
+ if __name__ == "__main__":
214
+ main()
HF/classify.py ADDED
@@ -0,0 +1,198 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ classify.py β€” 3-Tier Hybrid Pipeline (V3 β€” Latency-Tracked)
3
+
4
+ Architecture:
5
+ LegacyCRM β†’ LLM directly
6
+ Others β†’ Regex β†’ BERT (batch) β†’ LLM fallback
7
+
8
+ Changes in V3:
9
+ - Tier-wise latency tracking (regex_ms, bert_ms, llm_ms)
10
+ - Pipeline summary with p50/p95 per tier
11
+ - Defensive: LLM timeout + retry baked in via processor_llm
12
+ - classify_logs returns richer result dict
13
+ """
14
+ from __future__ import annotations
15
+ import time
16
+ import statistics
17
+ import pandas as pd
18
+ from processor_regex import classify_with_regex
19
+ from processor_bert import classify_batch as bert_batch
20
+ from processor_llm import classify_with_llm
21
+
22
+ LEGACY_SOURCE = "LegacyCRM"
23
+
24
+
25
+ # ── Result type ─────────────────────────────────────────────────────────────
26
+ def _make_result(label: str, tier: str, confidence, latency_ms: float) -> dict:
27
+ return {
28
+ "label": label,
29
+ "tier": tier,
30
+ "confidence": confidence,
31
+ "latency_ms": round(latency_ms, 3),
32
+ }
33
+
34
+
35
+ # ── Single log (backward-compatible) ────────────────────────────────────────
36
+ def classify_log(source: str, log_msg: str) -> dict:
37
+ """Single log classify karo. Returns label, tier, confidence, latency_ms."""
38
+ results = classify_logs([(source, log_msg)])
39
+ return results[0]
40
+
41
+
42
+ # ── Batch pipeline (main entry point) ───────────────────────────────────────
43
+ def classify_logs(logs: list[tuple[str, str]]) -> list[dict]:
44
+ """
45
+ Batch classify with 3-tier routing + per-result latency.
46
+
47
+ Returns list of dicts:
48
+ { label, tier, confidence, latency_ms }
49
+
50
+ Tier routing:
51
+ LegacyCRM source β†’ LLM directly
52
+ Regex match β†’ done (sub-ms)
53
+ Remainder β†’ BERT batch β†’ LLM if low confidence
54
+ """
55
+ n = len(logs)
56
+ results = [None] * n
57
+
58
+ # ── Step 1: Route to groups ─────────────────────────────────────────────
59
+ llm_indices = []
60
+ bert_indices = []
61
+ entry_times = [time.perf_counter()] * n # approximate per-log start
62
+
63
+ t_route_start = time.perf_counter()
64
+ for i, (source, log_msg) in enumerate(logs):
65
+ entry_times[i] = time.perf_counter()
66
+ if source == LEGACY_SOURCE:
67
+ llm_indices.append(i)
68
+ else:
69
+ t0 = time.perf_counter()
70
+ label = classify_with_regex(log_msg)
71
+ t1 = time.perf_counter()
72
+ if label:
73
+ results[i] = _make_result(label, "Regex", 1.0, (t1 - t0) * 1000)
74
+ else:
75
+ bert_indices.append(i)
76
+
77
+ # ── Step 2: BERT batch ──────────────────────────────────────────────────
78
+ if bert_indices:
79
+ bert_msgs = [logs[i][1] for i in bert_indices]
80
+
81
+ t_bert_start = time.perf_counter()
82
+ bert_results = bert_batch(bert_msgs)
83
+ t_bert_end = time.perf_counter()
84
+
85
+ bert_ms_per_log = (t_bert_end - t_bert_start) * 1000 / len(bert_msgs)
86
+
87
+ for idx, (label, conf) in zip(bert_indices, bert_results):
88
+ if label != "Unclassified":
89
+ results[idx] = _make_result(label, "BERT", conf, bert_ms_per_log)
90
+ else:
91
+ llm_indices.append(idx)
92
+
93
+ # ── Step 3: LLM (LegacyCRM + BERT fallback) ────────────────────────────
94
+ for i in llm_indices:
95
+ _, log_msg = logs[i]
96
+ t0 = time.perf_counter()
97
+ label = classify_with_llm(log_msg)
98
+ t1 = time.perf_counter()
99
+ tier = "LLM" if logs[i][0] == LEGACY_SOURCE else "LLM (fallback)"
100
+ results[i] = _make_result(label, tier, None, (t1 - t0) * 1000)
101
+
102
+ return results
103
+
104
+
105
+ # ── Pipeline summary ─────────────────────────────────────────────────────────
106
+ def pipeline_summary(results: list[dict]) -> dict:
107
+ """
108
+ Aggregate stats from classify_logs output.
109
+ Useful for dashboard and benchmark reporting.
110
+ """
111
+ tier_groups: dict[str, list[float]] = {}
112
+ label_counts: dict[str, int] = {}
113
+
114
+ for r in results:
115
+ tier = r["tier"]
116
+ tier_groups.setdefault(tier, []).append(r["latency_ms"])
117
+ label_counts[r["label"]] = label_counts.get(r["label"], 0) + 1
118
+
119
+ total = len(results)
120
+ tier_stats = {}
121
+ for tier, latencies in tier_groups.items():
122
+ latencies_sorted = sorted(latencies)
123
+ n = len(latencies_sorted)
124
+ tier_stats[tier] = {
125
+ "count": n,
126
+ "pct": round(n / total * 100, 1),
127
+ "p50_ms": round(statistics.median(latencies_sorted), 2),
128
+ "p95_ms": round(latencies_sorted[min(int(n * 0.95), n - 1)], 2),
129
+ "p99_ms": round(latencies_sorted[min(int(n * 0.99), n - 1)], 2),
130
+ "mean_ms": round(statistics.mean(latencies_sorted), 2),
131
+ }
132
+
133
+ return {
134
+ "total": total,
135
+ "tier_stats": tier_stats,
136
+ "label_counts": label_counts,
137
+ }
138
+
139
+
140
+ # ── CSV batch classify ───────────────────────────────────────────────────────
141
+ def classify_csv(input_path: str, output_path: str = "output.csv") -> tuple[str, pd.DataFrame]:
142
+ """
143
+ CSV file classify karo.
144
+ Required columns: 'source', 'log_message'
145
+ Output: adds 'predicted_label', 'tier_used', 'confidence', 'latency_ms'
146
+ """
147
+ df = pd.read_csv(input_path)
148
+ required = {"source", "log_message"}
149
+ if not required.issubset(df.columns):
150
+ raise ValueError(f"CSV mein ye columns chahiye: {required}. Mila: {set(df.columns)}")
151
+
152
+ log_pairs = list(zip(df["source"], df["log_message"]))
153
+ results = classify_logs(log_pairs)
154
+
155
+ df["predicted_label"] = [r["label"] for r in results]
156
+ df["tier_used"] = [r["tier"] for r in results]
157
+ df["latency_ms"] = [r["latency_ms"] for r in results]
158
+ df["confidence"] = [
159
+ f"{r['confidence']:.1%}" if r["confidence"] is not None else "N/A"
160
+ for r in results
161
+ ]
162
+
163
+ df.to_csv(output_path, index=False)
164
+ return output_path, df
165
+
166
+
167
+ # Aliases
168
+ classify = classify_logs
169
+
170
+
171
+ # ── Self-test ────────────────────────────────────────────────────────────────
172
+ if __name__ == "__main__":
173
+ sample = [
174
+ ("ModernCRM", "IP 192.168.133.114 blocked due to potential attack"),
175
+ ("BillingSystem", "User User12345 logged in."),
176
+ ("AnalyticsEngine", "File data_6957.csv uploaded successfully by user User265."),
177
+ ("ModernHR", "GET /v2/servers/detail HTTP/1.1 status: 200 len: 1583 time: 0.19"),
178
+ ("ModernHR", "Admin access escalation detected for user 9429"),
179
+ ("LegacyCRM", "Case escalation for ticket ID 7324 failed because the assigned support agent is no longer active."),
180
+ ("LegacyCRM", "The 'ReportGenerator' module will be retired in version 4.0."),
181
+ ]
182
+
183
+ print(f'{"Source":<20} {"Tier":<18} {"Conf":>6} {"Lat(ms)":>8} {"Label":<25} Log')
184
+ print("─" * 115)
185
+ results = classify_logs(sample)
186
+ for (source, log), r in zip(sample, results):
187
+ conf = f"{r['confidence']:.0%}" if r["confidence"] else " N/A"
188
+ print(f'{source:<20} {r["tier"]:<18} {conf:>6} {r["latency_ms"]:>8.1f} {r["label"]:<25} {log[:40]}')
189
+
190
+ summary = pipeline_summary(results)
191
+ print("\nπŸ“Š Pipeline Summary:")
192
+ for tier, stats in summary["tier_stats"].items():
193
+ print(f" {tier}: {stats['count']} logs ({stats['pct']}%) | "
194
+ f"p50={stats['p50_ms']}ms p95={stats['p95_ms']}ms p99={stats['p99_ms']}ms")
195
+
196
+ print("\n🏷️ Label distribution:")
197
+ for label, count in sorted(summary["label_counts"].items(), key=lambda x: -x[1]):
198
+ print(f" β€’ {label}: {count}")
HF/error_analysis.py ADDED
@@ -0,0 +1,250 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ error_analysis.py β€” Deep Dive into Unclassified / Misclassified Logs
3
+
4
+ This script addresses the 76 unclassified logs from the 20k run.
5
+ It answers:
6
+ 1. What do these logs look like? (print + group)
7
+ 2. Why did the model fail? (pattern analysis)
8
+ 3. What should we do? (actionable fix suggestions)
9
+
10
+ Google interview talking point:
11
+ "I performed structured error analysis on my model's failure cases.
12
+ I grouped them by failure type β€” vocabulary mismatch, ambiguous intent,
13
+ formatting noise β€” and used that to drive targeted improvements."
14
+
15
+ Usage:
16
+ python error_analysis.py --input output.csv # post-classify CSV
17
+ python error_analysis.py --simulate # demo with synthetic data
18
+ """
19
+ from __future__ import annotations
20
+ import argparse
21
+ import re
22
+ import sys
23
+ from collections import Counter, defaultdict
24
+ from typing import Optional
25
+ import pandas as pd
26
+
27
+
28
+ # ── Failure mode taxonomy ────────────────────────────────────────────────────
29
+ class FailureMode:
30
+ RARE_VOCAB = "rare_vocabulary" # domain-specific terms not in training
31
+ AMBIGUOUS = "ambiguous_intent" # log could match multiple categories
32
+ LEGACY_FORMAT = "legacy_format" # non-standard / old-school formatting
33
+ TRUNCATED = "truncated_or_noisy" # partial / malformed log line
34
+ NUMERIC_ONLY = "mostly_numeric" # ID/code-heavy, no semantic signal
35
+ MULTI_EVENT = "multi_event" # one line, multiple events
36
+ UNKNOWN = "unknown"
37
+
38
+
39
+ def _detect_failure_mode(log: str) -> str:
40
+ """Heuristic: guess WHY this log was unclassified."""
41
+ log_l = log.lower()
42
+
43
+ if len(log) < 20:
44
+ return FailureMode.TRUNCATED
45
+
46
+ # Check ratio of digits to total chars
47
+ digit_ratio = sum(c.isdigit() for c in log) / max(len(log), 1)
48
+ if digit_ratio > 0.40:
49
+ return FailureMode.NUMERIC_ONLY
50
+
51
+ # Looks like it has 2+ events joined
52
+ if log.count(";") >= 2 or log.count(" AND ") >= 1 or log.count(" | ") >= 2:
53
+ return FailureMode.MULTI_EVENT
54
+
55
+ # Legacy / unusual format signals
56
+ legacy_signals = ["ticket", "escalation", "crm", "deprecated", "retire",
57
+ "module will be", "workflow", "assigned agent"]
58
+ if any(s in log_l for s in legacy_signals):
59
+ return FailureMode.LEGACY_FORMAT
60
+
61
+ # Ambiguity signals β€” could be error OR security
62
+ ambiguous_signals = ["failed", "error", "unauthorized", "denied", "blocked"]
63
+ if sum(1 for s in ambiguous_signals if s in log_l) >= 2:
64
+ return FailureMode.AMBIGUOUS
65
+
66
+ # Rare vocabulary
67
+ rare_signals = ["sla", "oncall", "runbook", "pagerduty", "janitor", "gc ", "eviction"]
68
+ if any(s in log_l for s in rare_signals):
69
+ return FailureMode.RARE_VOCAB
70
+
71
+ return FailureMode.UNKNOWN
72
+
73
+
74
+ def _suggest_fix(mode: str) -> str:
75
+ fixes = {
76
+ FailureMode.RARE_VOCAB: "Add 5–10 training examples covering this vocabulary; or add regex rule.",
77
+ FailureMode.AMBIGUOUS: "Use multi-label or add a dedicated 'Ambiguous' class; review confidence threshold.",
78
+ FailureMode.LEGACY_FORMAT: "Route all legacy-format logs to LLM tier; add few-shot examples for LLM prompt.",
79
+ FailureMode.TRUNCATED: "Add input validation: reject/flag logs under 15 chars before classification.",
80
+ FailureMode.NUMERIC_ONLY: "Add regex patterns for structured numeric formats (job IDs, error codes, etc.).",
81
+ FailureMode.MULTI_EVENT: "Pre-process: split multi-event lines on ';' or ' | ' before classifying.",
82
+ FailureMode.UNKNOWN: "Manually review and add to training data or LLM few-shot examples.",
83
+ }
84
+ return fixes.get(mode, "Manual review required.")
85
+
86
+
87
+ # ── Core analysis ────────────────────────────────────────────────────────────
88
+ def analyze_unclassified(df: pd.DataFrame, label_col: str = "predicted_label") -> None:
89
+ """Full error analysis on a classified CSV DataFrame."""
90
+
91
+ unclassified = df[df[label_col] == "Unclassified"].copy()
92
+ total_unclassified = len(unclassified)
93
+
94
+ if total_unclassified == 0:
95
+ print("βœ… No unclassified logs found!")
96
+ return
97
+
98
+ print(f"\n{'='*70}")
99
+ print(f"πŸ” ERROR ANALYSIS: {total_unclassified} Unclassified Logs")
100
+ print(f"{'='*70}\n")
101
+
102
+ # ── Step 1: Print all unclassified logs ─────────────────────────────────
103
+ log_col = "log_message" if "log_message" in df.columns else df.columns[-1]
104
+ print(f"{'#':>4} {'Log Message'}")
105
+ print("─" * 80)
106
+ for i, (_, row) in enumerate(unclassified.iterrows(), 1):
107
+ log = str(row.get(log_col, ""))
108
+ print(f"{i:>4}. {log[:120]}")
109
+
110
+ # ── Step 2: Group by failure mode ───────────────────────────────────────
111
+ print(f"\n{'='*70}")
112
+ print("πŸ“‚ GROUPING BY FAILURE MODE")
113
+ print("─" * 70)
114
+
115
+ groups: dict[str, list[str]] = defaultdict(list)
116
+ for _, row in unclassified.iterrows():
117
+ log = str(row.get(log_col, ""))
118
+ mode = _detect_failure_mode(log)
119
+ groups[mode].append(log)
120
+
121
+ for mode, logs in sorted(groups.items(), key=lambda x: -len(x[1])):
122
+ pct = len(logs) / total_unclassified * 100
123
+ print(f"\nπŸ”Ή {mode} β€” {len(logs)} logs ({pct:.1f}%)")
124
+ print(f" πŸ’‘ Fix: {_suggest_fix(mode)}")
125
+ print(f" Examples:")
126
+ for log in logs[:3]:
127
+ print(f" β€’ {log[:110]}")
128
+
129
+ # ── Step 3: Token frequency analysis ────────────────────────────────────
130
+ print(f"\n{'='*70}")
131
+ print("πŸ“Š COMMON TOKENS IN UNCLASSIFIED LOGS")
132
+ print("─" * 70)
133
+
134
+ STOPWORDS = {"the", "a", "an", "is", "in", "on", "for", "to", "of",
135
+ "and", "or", "by", "at", "with", "has", "was", "be",
136
+ "this", "that", "it", "not", "are", "from", "as"}
137
+
138
+ all_tokens: list[str] = []
139
+ for _, row in unclassified.iterrows():
140
+ log = str(row.get(log_col, "")).lower()
141
+ tokens = re.findall(r"[a-z]{3,}", log)
142
+ all_tokens.extend(t for t in tokens if t not in STOPWORDS)
143
+
144
+ counter = Counter(all_tokens)
145
+ print("Top 20 tokens in unclassified logs:")
146
+ for token, count in counter.most_common(20):
147
+ bar = "β–ˆ" * min(count, 40)
148
+ print(f" {token:<20} {count:>4} {bar}")
149
+
150
+ # ── Step 4: Length distribution ─────────────────────────────────────────
151
+ lengths = unclassified[log_col].apply(lambda x: len(str(x)))
152
+ print(f"\n{'='*70}")
153
+ print("πŸ“ LOG LENGTH DISTRIBUTION (Unclassified)")
154
+ print(f" Mean: {lengths.mean():.1f} chars")
155
+ print(f" Median: {lengths.median():.1f} chars")
156
+ print(f" Min: {lengths.min()} chars")
157
+ print(f" Max: {lengths.max()} chars")
158
+
159
+ short = (lengths < 30).sum()
160
+ if short:
161
+ print(f" ⚠️ {short} logs under 30 chars β€” likely truncated/noisy")
162
+
163
+ # ── Step 5: Source breakdown ─────────────────────────────────────────────
164
+ if "source" in df.columns:
165
+ print(f"\n{'='*70}")
166
+ print("🏷️ UNCLASSIFIED BY SOURCE")
167
+ src_counts = unclassified["source"].value_counts()
168
+ for src, cnt in src_counts.items():
169
+ bar = "β–ˆ" * min(cnt, 40)
170
+ print(f" {src:<22} {cnt:>4} {bar}")
171
+
172
+ # ── Step 6: Actionable summary ───────────────────────────────────────────
173
+ print(f"\n{'='*70}")
174
+ print("βœ… ACTIONABLE FIXES (Priority Order)")
175
+ print("─" * 70)
176
+ dominant_mode = max(groups.items(), key=lambda x: len(x[1]))[0] if groups else FailureMode.UNKNOWN
177
+ fixes = [
178
+ (1, "regex", "Add patterns for top unclassified tokens to processor_regex.py"),
179
+ (2, "training", "Add 10–20 examples per failure mode to training data"),
180
+ (3, "llm", "For LEGACY_FORMAT failures: add to LLM few-shot examples"),
181
+ (4, "preproc", "Pre-process: split multi-event logs, reject truncated logs"),
182
+ (5, "threshold","Tune BERT confidence threshold (currently 0.30 β€” try 0.40)"),
183
+ ]
184
+ for priority, area, fix in fixes:
185
+ print(f" {priority}. [{area.upper():^10}] {fix}")
186
+
187
+ print(f"\nπŸ“Œ Dominant failure mode: '{dominant_mode}' ({len(groups.get(dominant_mode,[]))} logs)")
188
+ print(f" Start here: {_suggest_fix(dominant_mode)}\n")
189
+
190
+
191
+ # ── Simulate 76 unclassified logs for demo ────────────────────────────────────
192
+ def _simulate_unclassified() -> pd.DataFrame:
193
+ """Generate synthetic 'unclassified' logs that mimic real failure patterns."""
194
+ logs = [
195
+ # Legacy format / CRM
196
+ "Case escalation for ticket ID 9021 failed: agent inactive.",
197
+ "CRM module 'ReportGenerator' will be retired in v4.1.",
198
+ "Workflow for approval chain #4421 stalled at step 3.",
199
+ "SLA breach detected for case ID 7701 (P1, 4h breach).",
200
+ # Ambiguous
201
+ "Service auth-api failed and unauthorized access was logged.",
202
+ "Error: blocked request from 10.0.0.5 β€” reason unknown.",
203
+ # Truncated / noisy
204
+ "ERR",
205
+ "srv timeout",
206
+ "node-7",
207
+ # Numeric-heavy
208
+ "8821 9001 443 0 0 DROP IN=eth0 OUT= MAC=",
209
+ "16 0 0 1 2024-01-14 03:21:00.001",
210
+ # Multi-event
211
+ "Backup started; disk usage at 92%; health check failed | node-3",
212
+ # Rare vocab
213
+ "PagerDuty alert triggered for on-call rotation P1-incident.",
214
+ "GC eviction: 3.2GB heap compacted in 420ms.",
215
+ "Janitor job completed: 14,000 stale tokens purged.",
216
+ "Runbook auto-remediation triggered for alert ALT-9021.",
217
+ ]
218
+ # Pad to ~76
219
+ padded = (logs * 5)[:76]
220
+ return pd.DataFrame({
221
+ "source": ["ModernCRM"] * 30 + ["LegacyCRM"] * 20 + ["AnalyticsEngine"] * 26,
222
+ "log_message": padded,
223
+ "predicted_label": ["Unclassified"] * 76,
224
+ })
225
+
226
+
227
+ # ── CLI ──────────────────────────────────────────────────────────────────────
228
+ def main():
229
+ parser = argparse.ArgumentParser(description="Analyze unclassified/misclassified logs")
230
+ parser.add_argument("--input", help="Path to classified CSV from classify_csv()")
231
+ parser.add_argument("--simulate", action="store_true",
232
+ help="Run with synthetic unclassified logs (no CSV needed)")
233
+ parser.add_argument("--label-col", default="predicted_label",
234
+ help="Column name that holds the predicted label")
235
+ args = parser.parse_args()
236
+
237
+ if args.simulate:
238
+ df = _simulate_unclassified()
239
+ print("🎭 Running with SIMULATED 76 unclassified logs…")
240
+ elif args.input:
241
+ df = pd.read_csv(args.input)
242
+ else:
243
+ parser.print_help()
244
+ sys.exit(1)
245
+
246
+ analyze_unclassified(df, label_col=args.label_col)
247
+
248
+
249
+ if __name__ == "__main__":
250
+ main()
HF/processor_bert.py ADDED
@@ -0,0 +1,216 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ processor_bert_fast.py β€” ONNX Runtime powered BERT classifier
3
+ Speed: 82 logs/s β†’ 2000+ logs/s
4
+
5
+ Kaise kaam karta hai:
6
+ 1. ONNX Runtime: Normal PyTorch se 3-5x faster
7
+ 2. Batch processing: 64 logs ek saath process
8
+ 3. Pre-allocated buffers: Memory waste nahi
9
+ """
10
+ from __future__ import annotations
11
+ import os
12
+ import numpy as np
13
+ import joblib
14
+
15
+ # ── Check karo kaunsa method use karna hai ──────────────────
16
+ _USE_ONNX = False
17
+ _embedding_model = None
18
+ _classifier = None
19
+ _ort_session = None
20
+ _ort_tokenizer = None
21
+
22
+ MODEL_PATH = os.path.join(os.path.dirname(__file__), 'models', 'log_classifier.joblib')
23
+ ONNX_DIR = os.path.join(os.path.dirname(__file__), 'models', 'onnx')
24
+ CONFIDENCE_THRESHOLD = 0.30
25
+ DEFAULT_BATCH = 64
26
+
27
+
28
+ def _load_models():
29
+ """Lazily load models β€” pehli call pe hi load hoga, baar baar nahi."""
30
+ global _USE_ONNX, _embedding_model, _classifier, _ort_session, _ort_tokenizer
31
+
32
+ if _classifier is not None:
33
+ return # Already loaded
34
+
35
+ # ── Classifier load karo ───────────────────────────────
36
+ if not os.path.exists(MODEL_PATH):
37
+ raise FileNotFoundError(
38
+ f'Model nahi mila: {MODEL_PATH}\n'
39
+ 'Pehle Colab notebook run karo aur model download karo.'
40
+ )
41
+ _classifier = joblib.load(MODEL_PATH)
42
+
43
+ # ── ONNX try karo (fast), fallback to PyTorch ──────────
44
+ onnx_model_file = os.path.join(ONNX_DIR, 'model.onnx')
45
+
46
+ if os.path.exists(onnx_model_file):
47
+ try:
48
+ import onnxruntime as ort
49
+ from transformers import AutoTokenizer
50
+
51
+ # CPU optimized session options
52
+ sess_opts = ort.SessionOptions()
53
+ sess_opts.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL
54
+ sess_opts.intra_op_num_threads = os.cpu_count()
55
+ sess_opts.execution_mode = ort.ExecutionMode.ORT_SEQUENTIAL
56
+
57
+ _ort_session = ort.InferenceSession(
58
+ onnx_model_file,
59
+ sess_options=sess_opts,
60
+ providers=['CPUExecutionProvider']
61
+ )
62
+ _ort_tokenizer = AutoTokenizer.from_pretrained(ONNX_DIR)
63
+ _USE_ONNX = True
64
+ print('[BERT] βœ… ONNX Runtime loaded β€” FAST MODE')
65
+
66
+ except Exception as e:
67
+ print(f'[BERT] ONNX load failed ({e}), fallback to PyTorch')
68
+ _USE_ONNX = False
69
+
70
+ if not _USE_ONNX:
71
+ from sentence_transformers import SentenceTransformer
72
+ _embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
73
+ print('[BERT] ⚠️ PyTorch mode (install ONNX for 3-5x speedup)')
74
+
75
+
76
+ def _embed_onnx(texts: list[str]) -> np.ndarray:
77
+ """ONNX Runtime se embeddings generate karo β€” FAST."""
78
+ import torch
79
+
80
+ inputs = _ort_tokenizer(
81
+ texts,
82
+ padding=True,
83
+ truncation=True,
84
+ max_length=128,
85
+ return_tensors='np' # NumPy directly (faster than PyTorch tensors)
86
+ )
87
+
88
+ # ONNX session run
89
+ ort_inputs = {
90
+ 'input_ids': inputs['input_ids'].astype(np.int64),
91
+ 'attention_mask': inputs['attention_mask'].astype(np.int64),
92
+ }
93
+ if 'token_type_ids' in [i.name for i in _ort_session.get_inputs()]:
94
+ ort_inputs['token_type_ids'] = inputs.get(
95
+ 'token_type_ids', np.zeros_like(inputs['input_ids'])
96
+ ).astype(np.int64)
97
+
98
+ outputs = _ort_session.run(None, ort_inputs)
99
+ hidden = outputs[0] # (batch, seq_len, hidden)
100
+
101
+ # Mean pooling (attention mask weighted)
102
+ mask = inputs['attention_mask'][:, :, None].astype(np.float32)
103
+ summed = (hidden * mask).sum(axis=1)
104
+ counts = mask.sum(axis=1)
105
+ embeddings = summed / counts
106
+
107
+ # L2 normalize
108
+ norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
109
+ return embeddings / (norms + 1e-8)
110
+
111
+
112
+ def _embed_pytorch(texts: list[str]) -> np.ndarray:
113
+ """PyTorch fallback."""
114
+ return _embedding_model.encode(
115
+ texts,
116
+ batch_size=DEFAULT_BATCH,
117
+ convert_to_numpy=True,
118
+ normalize_embeddings=True,
119
+ show_progress_bar=False
120
+ )
121
+
122
+
123
+ # ── PUBLIC API ──────────────────────────────────────────────
124
+
125
+ def classify_with_bert(log_message: str) -> tuple[str, float]:
126
+ """
127
+ Single log classify karo.
128
+ Returns: (label, confidence)
129
+ """
130
+ _load_models()
131
+ results = classify_batch([log_message])
132
+ return results[0]
133
+
134
+
135
+ def classify_batch(log_messages: list[str]) -> list[tuple[str, float]]:
136
+ """
137
+ Multiple logs ek saath classify karo β€” MUCH FASTER!
138
+ Returns: list of (label, confidence) tuples
139
+
140
+ Example:
141
+ results = classify_batch(['log1', 'log2', 'log3'])
142
+ for label, conf in results:
143
+ print(f'{label}: {conf:.1%}')
144
+ """
145
+ _load_models()
146
+
147
+ if not log_messages:
148
+ return []
149
+
150
+ results = []
151
+
152
+ # Process in batches
153
+ for i in range(0, len(log_messages), DEFAULT_BATCH):
154
+ batch = log_messages[i:i + DEFAULT_BATCH]
155
+
156
+ # Generate embeddings
157
+ if _USE_ONNX:
158
+ embeddings = _embed_onnx(batch)
159
+ else:
160
+ embeddings = _embed_pytorch(batch)
161
+
162
+ # Classify
163
+ probs = _classifier.predict_proba(embeddings)
164
+ max_probs = probs.max(axis=1)
165
+ labels = _classifier.predict(embeddings)
166
+
167
+ for label, conf in zip(labels, max_probs):
168
+ if conf < CONFIDENCE_THRESHOLD:
169
+ results.append(('Unclassified', float(conf)))
170
+ else:
171
+ results.append((str(label), float(conf)))
172
+
173
+ return results
174
+
175
+
176
+ def get_classes() -> list[str]:
177
+ """Classifier ke classes return karo."""
178
+ _load_models()
179
+ return list(_classifier.classes_)
180
+
181
+
182
+ def is_onnx_mode() -> bool:
183
+ """Check karo ONNX use ho raha hai ya nahi."""
184
+ _load_models()
185
+ return _USE_ONNX
186
+
187
+
188
+ # ── TEST ────────────────────────────────────────────────────
189
+ if __name__ == '__main__':
190
+ import time
191
+
192
+ test_logs = [
193
+ 'GET /v2/servers/detail HTTP/1.1 status: 404 len: 1583 time: 0.19',
194
+ 'System crashed due to driver errors when restarting the server',
195
+ 'Multiple login failures occurred on user 6454 account',
196
+ 'Admin access escalation detected for user 9429',
197
+ 'CPU usage at 98% for the last 10 minutes on node-7',
198
+ 'Backup completed successfully.',
199
+ 'User User123 logged in.',
200
+ 'Data replication task for shard 14 did not complete',
201
+ 'Hey bro chill ya!', # should be Unclassified
202
+ ]
203
+
204
+ print('Single log test:')
205
+ for log in test_logs:
206
+ label, conf = classify_with_bert(log)
207
+ print(f' [{conf:.0%}] {label:25s} | {log[:60]}')
208
+
209
+ print(f'\nMode: {"ONNX πŸš€" if is_onnx_mode() else "PyTorch"}')
210
+
211
+ # Speed test
212
+ big_batch = test_logs * 100
213
+ t0 = time.perf_counter()
214
+ classify_batch(big_batch)
215
+ elapsed = time.perf_counter() - t0
216
+ print(f'\nSpeed: {len(big_batch)/elapsed:.0f} logs/s ({elapsed*1000/len(big_batch):.1f}ms/log)')
HF/processor_llm.py ADDED
@@ -0,0 +1,192 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ processor_llm.py β€” Tier 3: LLM-based Classifier
3
+
4
+ Used for:
5
+ - LegacyCRM logs (Workflow Error, Deprecation Warning)
6
+ - BERT fallback when confidence < threshold
7
+
8
+ Production hardening in V3:
9
+ - Timeout (configurable, default 5s)
10
+ - Retry with exponential backoff (max 2 retries)
11
+ - Explicit failure modes: returns "Unclassified" on all error paths
12
+ - Caching for repeated log patterns (hash-based, in-memory)
13
+ - Token budget enforcement (max_tokens=15)
14
+ """
15
+ from __future__ import annotations
16
+ import os
17
+ import re
18
+ import time
19
+ import hashlib
20
+ import logging
21
+ from functools import lru_cache
22
+ from typing import Optional
23
+
24
+ logger = logging.getLogger(__name__)
25
+
26
+ # ── Config ─────────────────────────────────────────────────────────────────
27
+ HF_TOKEN = os.getenv("HF_TOKEN")
28
+ LLM_MODEL = "mistralai/Mistral-7B-Instruct-v0.3"
29
+
30
+ VALID_CATEGORIES = ["Workflow Error", "Deprecation Warning"]
31
+
32
+ # Retry / timeout config
33
+ MAX_RETRIES = 2
34
+ RETRY_DELAY_SEC = 1.0 # doubles on each retry (exponential backoff)
35
+ REQUEST_TIMEOUT = 5 # seconds β€” fail fast, do not hang pipeline
36
+
37
+ # In-memory cache to avoid redundant LLM calls for repeated logs
38
+ _RESPONSE_CACHE: dict[str, str] = {}
39
+ MAX_CACHE_SIZE = 1000 # evict oldest when full (simple FIFO)
40
+
41
+ SYSTEM_PROMPT = (
42
+ "You are an enterprise log classifier. "
43
+ "Classify log messages into exactly one category. "
44
+ "Return ONLY the category name β€” no explanation, no punctuation."
45
+ )
46
+
47
+ FEW_SHOT_EXAMPLES = [
48
+ {
49
+ "log": "Case escalation for ticket ID 7324 failed because the assigned support agent is no longer active.",
50
+ "label": "Workflow Error",
51
+ },
52
+ {
53
+ "log": "The 'BulkEmailSender' feature is no longer supported. Use 'EmailCampaignManager' instead.",
54
+ "label": "Deprecation Warning",
55
+ },
56
+ {
57
+ "log": "Invoice generation aborted for order ID 8910 due to invalid tax calculation module.",
58
+ "label": "Workflow Error",
59
+ },
60
+ ]
61
+
62
+
63
+ # ── Cache helpers ────────────────────────────────────────────────────────────
64
+ def _cache_key(log_msg: str) -> str:
65
+ return hashlib.md5(log_msg.strip().encode()).hexdigest()
66
+
67
+
68
+ def _cache_get(log_msg: str) -> Optional[str]:
69
+ return _RESPONSE_CACHE.get(_cache_key(log_msg))
70
+
71
+
72
+ def _cache_set(log_msg: str, label: str) -> None:
73
+ key = _cache_key(log_msg)
74
+ if len(_RESPONSE_CACHE) >= MAX_CACHE_SIZE:
75
+ # Evict oldest (first inserted) key
76
+ oldest = next(iter(_RESPONSE_CACHE))
77
+ del _RESPONSE_CACHE[oldest]
78
+ _RESPONSE_CACHE[key] = label
79
+
80
+
81
+ def get_cache_stats() -> dict:
82
+ return {"size": len(_RESPONSE_CACHE), "max_size": MAX_CACHE_SIZE}
83
+
84
+
85
+ # ── Prompt builder ───────────────────────────────────────────────────────────
86
+ def _build_messages(log_msg: str) -> list[dict]:
87
+ categories_str = ", ".join(f'"{c}"' for c in VALID_CATEGORIES)
88
+ user_content = (
89
+ f'Classify the following log into one of these categories: {categories_str}.\n'
90
+ 'If none fits, return "Unclassified".\n\n'
91
+ )
92
+ for ex in FEW_SHOT_EXAMPLES:
93
+ user_content += f'Log: {ex["log"]}\nCategory: {ex["label"]}\n\n'
94
+ user_content += f"Log: {log_msg}\nCategory:"
95
+
96
+ return [
97
+ {"role": "system", "content": SYSTEM_PROMPT},
98
+ {"role": "user", "content": user_content},
99
+ ]
100
+
101
+
102
+ # ── Normalize raw LLM output ─────────────────────────────────────────────────
103
+ def _normalize(raw: str) -> str:
104
+ """Map raw LLM output to a valid category or 'Unclassified'."""
105
+ raw = raw.strip().strip('"').strip("'")
106
+ for cat in VALID_CATEGORIES:
107
+ if cat.lower() in raw.lower():
108
+ return cat
109
+ return "Unclassified"
110
+
111
+
112
+ # ── Main classify function ────────────────────────────────────────────────────
113
+ def classify_with_llm(log_msg: str) -> str:
114
+ """
115
+ Tier 3 LLM classifier with:
116
+ - In-memory cache (avoids duplicate API calls)
117
+ - Timeout (REQUEST_TIMEOUT seconds)
118
+ - Retry with exponential backoff (MAX_RETRIES attempts)
119
+ - Explicit fallback to "Unclassified" on all error paths
120
+
121
+ Latency: 500–2000ms on cache miss; ~0ms on cache hit.
122
+ """
123
+ # ── Cache hit ────────────────────────────────────────────────────────────
124
+ cached = _cache_get(log_msg)
125
+ if cached is not None:
126
+ logger.debug(f"[LLM] Cache hit for: {log_msg[:60]}")
127
+ return cached
128
+
129
+ # ── Inference with retry ─────────────────────────────────────────────────
130
+ if not HF_TOKEN:
131
+ logger.warning("[LLM] HF_TOKEN not set β€” returning Unclassified")
132
+ return "Unclassified"
133
+
134
+ from huggingface_hub import InferenceClient
135
+
136
+ client = InferenceClient(token=HF_TOKEN, timeout=REQUEST_TIMEOUT)
137
+ delay = RETRY_DELAY_SEC
138
+ last_err: Optional[Exception] = None
139
+
140
+ for attempt in range(1, MAX_RETRIES + 2): # +2: initial + MAX_RETRIES
141
+ try:
142
+ response = client.chat.completions.create(
143
+ model=LLM_MODEL,
144
+ messages=_build_messages(log_msg),
145
+ max_tokens=15,
146
+ temperature=0.1,
147
+ )
148
+ raw = response.choices[0].message.content
149
+ label = _normalize(raw)
150
+
151
+ _cache_set(log_msg, label)
152
+ logger.debug(f"[LLM] Attempt {attempt}: '{raw.strip()}' β†’ '{label}'")
153
+ return label
154
+
155
+ except Exception as e:
156
+ last_err = e
157
+ if attempt <= MAX_RETRIES:
158
+ logger.warning(f"[LLM] Attempt {attempt} failed ({e}), retrying in {delay:.1f}s…")
159
+ time.sleep(delay)
160
+ delay *= 2 # exponential backoff
161
+ else:
162
+ logger.error(f"[LLM] All {MAX_RETRIES + 1} attempts failed. Last error: {e}")
163
+
164
+ return "Unclassified"
165
+
166
+
167
+ # ── Batch classify (serial β€” LLM is already rate-limited) ────────────────────
168
+ def classify_batch_llm(log_msgs: list[str]) -> list[str]:
169
+ """Classify multiple logs through LLM. Each call is sequential to respect rate limits."""
170
+ return [classify_with_llm(msg) for msg in log_msgs]
171
+
172
+
173
+ # ── CLI test ─────────────────────────────────────────────────────────────────
174
+ if __name__ == "__main__":
175
+ logging.basicConfig(level=logging.INFO)
176
+
177
+ test_logs = [
178
+ "Case escalation for ticket ID 7324 failed because the assigned support agent is no longer active.",
179
+ "The 'ReportGenerator' module will be retired in version 4.0. Migrate to 'AdvancedAnalyticsSuite'.",
180
+ "System reboot initiated by user 12345.", # should be Unclassified
181
+ ]
182
+ for log in test_logs:
183
+ result = classify_with_llm(log)
184
+ print(f"{result:25s} | {log[:80]}")
185
+
186
+ # Cache hit test
187
+ print("\n── Cache hit test ──")
188
+ t0 = time.perf_counter()
189
+ classify_with_llm(test_logs[0])
190
+ t1 = time.perf_counter()
191
+ print(f"Cache hit latency: {(t1-t0)*1000:.2f}ms")
192
+ print(f"Cache stats: {get_cache_stats()}")
HF/processor_regex.py ADDED
@@ -0,0 +1,220 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ processor_regex.py β€” Tier 1: Rule-based Classifier
3
+
4
+ Target coverage: 40%+ (up from 15%)
5
+ Latency: sub-millisecond per log
6
+
7
+ New pattern groups added:
8
+ - HTTP request/response logs (was completely missing!)
9
+ - Auth / credential events (login failures, MFA, lockouts)
10
+ - System/infra events (disk, CPU, memory, cron)
11
+ - Network / firewall events (IP block, port scan)
12
+ - Structured error codes (ERROR, CRITICAL prefix logs)
13
+ """
14
+ from __future__ import annotations
15
+ import re
16
+ import time
17
+ from typing import Optional
18
+
19
+ # ---------------------------------------------------------------------------
20
+ # Pattern registry: (compiled_pattern, label)
21
+ # Order matters β€” more specific patterns FIRST to avoid mis-labeling.
22
+ # ---------------------------------------------------------------------------
23
+ _RAW_PATTERNS: list[tuple[str, str]] = [
24
+
25
+ # ── HTTP Status ─────────────────────────────────────────────────────────
26
+ # Covers: GET/POST/PUT/DELETE/PATCH + status code in request line
27
+ (r"\b(GET|POST|PUT|DELETE|PATCH|HEAD|OPTIONS)\s+\S+\s+HTTP/\d", "HTTP Status"),
28
+ # Nova / OpenStack style
29
+ (r"nova\.\S+\s+(GET|POST|PUT|DELETE)\s+\S+\s+HTTP/\d", "HTTP Status"),
30
+ # Status code only style: "returned HTTP 200" or "status: 404"
31
+ (r"\bstatus[:\s]+\d{3}\b", "HTTP Status"),
32
+ (r"\breturned\s+HTTP\s+\d{3}\b", "HTTP Status"),
33
+ (r"\bHTTP\s+status\s+code\s*[:-]?\s*\d{3}\b", "HTTP Status"),
34
+ # API response style
35
+ (r"\bAPI\s+(call|request)\s+\S+\s+completed\s+with\s+status\s+\d{3}", "HTTP Status"),
36
+ (r"\bEndpoint\s+\S+\s+responded\s+with\s+code\s+\d{3}", "HTTP Status"),
37
+
38
+ # ── Security Alert ──────────────────────────────────────────────────────
39
+ # Brute force / login failures
40
+ (r"(multiple\s+)?(bad\s+|failed?\s+)?login\s+(failure|attempt|failures)", "Security Alert"),
41
+ (r"brute[\s_-]force\s+(login|attack|attempt)", "Security Alert"),
42
+ # Unauthorized access
43
+ (r"unauthorized\s+(access|admin|privilege|attempt)", "Security Alert"),
44
+ (r"access\s+denied\s+(for|to)\s+(user|ip|host)", "Security Alert"),
45
+ # Privilege escalation
46
+ (r"(admin\s+)?access\s+escalation\s+detected", "Security Alert"),
47
+ (r"privilege\s+(elev|escalat)", "Security Alert"),
48
+ # IP blocking / suspicious traffic
49
+ (r"IP\s+\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}\s+blocked", "Security Alert"),
50
+ (r"(suspicious|anomalous)\s+(login|traffic|activity|request)", "Security Alert"),
51
+ (r"potential\s+(DDoS|attack|breach|intrusion)", "Security Alert"),
52
+ (r"security\s+breach\s+suspected", "Security Alert"),
53
+ (r"(API\s+security\s+breach|bypass\s+API\s+security)", "Security Alert"),
54
+ (r"port\s+scan\s+(detected|attempt)", "Security Alert"),
55
+
56
+ # ── User Action ─────────────────────────────────────────────────────────
57
+ (r"User\s+\w+\d*\s+logged\s+(in|out)", "User Action"),
58
+ (r"Account\s+(with\s+)?ID\s+\S+\s+created\s+by", "User Action"),
59
+ (r"User\s+\w+\d*\s+(updated\s+profile|changed\s+password|enabled\s+two|downloaded|exported)", "User Action"),
60
+ (r"(New\s+user|user\s+\w+\d*)\s+registered", "User Action"),
61
+ (r"Account\s+\S+\s+deleted\s+by\s+(administrator|admin)", "User Action"),
62
+ (r"User\s+\w+\d*\s+(tried|attempted)", "User Action"),
63
+
64
+ # ── System Notification ─────────────────────────────────────────────────
65
+ # Backup events
66
+ (r"Backup\s+(started|ended|completed\s+successfully|failed|aborted)", "System Notification"),
67
+ (r"System\s+updated\s+to\s+version", "System Notification"),
68
+ (r"File\s+\S+\s+uploaded\s+successfully\s+by\s+user", "System Notification"),
69
+ (r"Disk\s+cleanup\s+completed\s+successfully", "System Notification"),
70
+ (r"System\s+reboot\s+initiated\s+by\s+user", "System Notification"),
71
+ (r"Scheduled\s+maintenance\s+(started|completed)", "System Notification"),
72
+ (r"Service\s+\w+\s+restarted\s+successfully", "System Notification"),
73
+ # NEW: cache, cron, health check, cert, log rotation
74
+ (r"Cache\s+cleared\s+successfully", "System Notification"),
75
+ (r"Log\s+rotation\s+completed", "System Notification"),
76
+ (r"Health\s+check\s+(passed|failed)\s+for\s+service", "System Notification"),
77
+ (r"Certificate\s+(renewed|expired|revoked)\s+successfully", "System Notification"),
78
+ (r"Cron\s+job\s+\S+\s+(executed|failed|completed)\s+successfully", "System Notification"),
79
+ (r"(Disk|Storage)\s+(usage|space)\s+(at|reached|exceeded)\s+\d+%", "System Notification"),
80
+ (r"CPU\s+usage\s+at\s+\d+%", "System Notification"),
81
+ (r"Memory\s+(usage|limit)\s+(at|reached|exceeded)\s+\d+%", "System Notification"),
82
+ # Deployment / config
83
+ (r"Deployment\s+(of|for)\s+\S+\s+(completed|failed|started)", "System Notification"),
84
+ (r"Configuration\s+(reloaded|updated|applied)\s+successfully", "System Notification"),
85
+
86
+ # ── Error ───────────────────────────────────────────────────────────────
87
+ (r"\bERROR\b.*\b(exception|failed|failure|crash|timeout|unavailable)\b", "Error"),
88
+ (r"System\s+crashed\s+due\s+to", "Error"),
89
+ (r"(connection|request|task|job)\s+(timed?\s*out|timeout)", "Error"),
90
+ (r"service\s+\S+\s+(is\s+down|unavailable|unreachable)", "Error"),
91
+ (r"database\s+connection\s+(failed|refused|lost|dropped)", "Error"),
92
+ (r"disk\s+(I/O\s+)?failure", "Error"),
93
+ (r"driver\s+error(s)?\s+(when|during|on)", "Error"),
94
+ (r"(replication|sync)\s+task\s+(did\s+not\s+complete|failed)", "Error"),
95
+ (r"null\s+pointer|segmentation\s+fault|stack\s+overflow", "Error"),
96
+
97
+ # ── Critical Error ──────────────────────────────────────────────────────
98
+ (r"\bCRITICAL\b", "Critical Error"),
99
+ (r"(FATAL|PANIC)\b", "Critical Error"),
100
+ (r"(data\s+loss|data\s+corruption)\s+(detected|occurred)", "Critical Error"),
101
+ (r"(cluster|node|shard)\s+(failure|crashed|went\s+down)", "Critical Error"),
102
+ (r"(catastrophic|unrecoverable)\s+(failure|error)", "Critical Error"),
103
+ (r"kernel\s+panic", "Critical Error"),
104
+ (r"out[\s-]of[\s-](memory|disk)\s+(error|killed|OOM)", "Critical Error"),
105
+ ]
106
+
107
+ # Pre-compile all patterns at import time (not per-call)
108
+ REGEX_PATTERNS: list[tuple[re.Pattern, str]] = [
109
+ (re.compile(pat, re.IGNORECASE), label)
110
+ for pat, label in _RAW_PATTERNS
111
+ ]
112
+
113
+
114
+ def classify_with_regex(log_message: str) -> Optional[str]:
115
+ """
116
+ Tier 1: Rule-based classifier.
117
+ Returns category label, or None if no pattern matches.
118
+ Latency: sub-millisecond (patterns pre-compiled at import).
119
+ """
120
+ for pattern, label in REGEX_PATTERNS:
121
+ if pattern.search(log_message):
122
+ return label
123
+ return None
124
+
125
+
126
+ def get_regex_coverage(log_messages: list[str]) -> dict:
127
+ """Measure regex tier coverage and per-label breakdown."""
128
+ label_counts: dict[str, int] = {}
129
+ missed = 0
130
+
131
+ for msg in log_messages:
132
+ label = classify_with_regex(msg)
133
+ if label:
134
+ label_counts[label] = label_counts.get(label, 0) + 1
135
+ else:
136
+ missed += 1
137
+
138
+ total = len(log_messages)
139
+ matched = total - missed
140
+
141
+ return {
142
+ "total": total,
143
+ "matched": matched,
144
+ "missed": missed,
145
+ "coverage_pct": round(matched / total * 100, 2) if total else 0.0,
146
+ "label_breakdown": label_counts,
147
+ }
148
+
149
+
150
+ def benchmark_regex(log_messages: list[str], runs: int = 3) -> dict:
151
+ """Measure regex tier latency (p50 / p95 / p99) over multiple runs."""
152
+ import statistics
153
+ per_log_ms: list[float] = []
154
+
155
+ for _ in range(runs):
156
+ for msg in log_messages:
157
+ t0 = time.perf_counter()
158
+ classify_with_regex(msg)
159
+ per_log_ms.append((time.perf_counter() - t0) * 1000)
160
+
161
+ per_log_ms.sort()
162
+ return {
163
+ "p50_ms": round(statistics.median(per_log_ms), 4),
164
+ "p95_ms": round(per_log_ms[int(len(per_log_ms) * 0.95)], 4),
165
+ "p99_ms": round(per_log_ms[int(len(per_log_ms) * 0.99)], 4),
166
+ "mean_ms": round(statistics.mean(per_log_ms), 4),
167
+ }
168
+
169
+
170
+ # ── CLI self-test ────────────────────────────────────────────────────────────
171
+ if __name__ == "__main__":
172
+ test_cases: list[tuple[str, str]] = [
173
+ # HTTP
174
+ ("GET /api/v2/resource HTTP/1.1 status: 200 len: 1583 time: 0.19", "HTTP Status"),
175
+ ("POST /v1/users HTTP/1.1 status: 201 len: 42 time: 0.05", "HTTP Status"),
176
+ ("nova.osapi_compute.wsgi.server GET /v2/servers/detail HTTP/1.1 status: 404", "HTTP Status"),
177
+ # Security
178
+ ("Multiple login failures occurred on user 6454 account", "Security Alert"),
179
+ ("IP 192.168.133.114 blocked due to potential attack", "Security Alert"),
180
+ ("Brute force login attempt from 10.0.0.5 detected", "Security Alert"),
181
+ ("Admin access escalation detected for user 9429", "Security Alert"),
182
+ # User Action
183
+ ("User User12345 logged in.", "User Action"),
184
+ ("Account with ID 456 created by Admin.", "User Action"),
185
+ # System Notification
186
+ ("Backup completed successfully.", "System Notification"),
187
+ ("CPU usage at 98% for the last 10 minutes on node-7", "System Notification"),
188
+ ("Health check passed for service payments-api", "System Notification"),
189
+ # Error
190
+ ("System crashed due to disk I/O failure on node-3", "Error"),
191
+ ("Database connection failed after 3 retries", "Error"),
192
+ # Critical
193
+ ("CRITICAL: data corruption detected on shard-14", "Critical Error"),
194
+ ("kernel panic: not syncing: VFS: unable to mount root fs", "Critical Error"),
195
+ # Should be None (unmatched)
196
+ ("The 'BulkEmailSender' feature will be deprecated in v5.0.", None),
197
+ ("Case escalation for ticket 7324 failed.", None),
198
+ ]
199
+
200
+ correct = 0
201
+ print(f"{'Expected':<22} {'Got':<22} {'βœ“/βœ—'} | Log")
202
+ print("─" * 100)
203
+ for log, expected in test_cases:
204
+ got = classify_with_regex(log)
205
+ ok = got == expected
206
+ correct += ok
207
+ icon = "βœ“" if ok else "βœ—"
208
+ print(f"{str(expected):<22} {str(got):<22} {icon} | {log[:55]}")
209
+
210
+ print(f"\n{correct}/{len(test_cases)} correct")
211
+
212
+ # Coverage demo
213
+ all_logs = [log for log, _ in test_cases]
214
+ cov = get_regex_coverage(all_logs)
215
+ print(f"\nCoverage: {cov['coverage_pct']}% ({cov['matched']}/{cov['total']} matched)")
216
+ print("Label breakdown:", cov["label_breakdown"])
217
+
218
+ # Latency benchmark
219
+ lat = benchmark_regex(all_logs * 100)
220
+ print(f"\nLatency (p50/p95/p99): {lat['p50_ms']}ms / {lat['p95_ms']}ms / {lat['p99_ms']}ms")
HF/requirements.txt ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Core
2
+ gradio>=4.0.0
3
+ pandas>=2.0.0
4
+ numpy>=1.24.0
5
+ joblib>=1.3.0
6
+ scikit-learn>=1.3.0
7
+
8
+ # Embedding + BERT
9
+ sentence-transformers>=2.7.0
10
+ transformers>=4.38.0
11
+
12
+ # ONNX (optional, 3-5x speedup)
13
+ onnxruntime>=1.17.0
14
+ optimum[onnxruntime]>=1.16.0
15
+
16
+ # LLM
17
+ huggingface-hub>=0.21.0
18
+
19
+ # FastAPI (production API)
20
+ fastapi>=0.110.0
21
+ uvicorn[standard]>=0.29.0
22
+ pydantic>=2.0.0
23
+
24
+ # Observability
25
+ psutil>=5.9.0
26
+
27
+ # Testing
28
+ pytest>=8.0.0
29
+ pytest-asyncio>=0.23.0
30
+ httpx>=0.27.0 # for FastAPI test client