Spaces:
Sleeping
Sleeping
Upload 10 files
Browse files- HF/Dockerfile +32 -0
- HF/api.py +314 -0
- HF/app_gradio.py +187 -0
- HF/benchmark.py +214 -0
- HF/classify.py +198 -0
- HF/error_analysis.py +250 -0
- HF/processor_bert.py +216 -0
- HF/processor_llm.py +192 -0
- HF/processor_regex.py +220 -0
- HF/requirements.txt +30 -0
HF/Dockerfile
ADDED
|
@@ -0,0 +1,32 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
FROM python:3.11-slim
|
| 2 |
+
|
| 3 |
+
# Build args
|
| 4 |
+
ARG PORT=8000
|
| 5 |
+
|
| 6 |
+
# System deps
|
| 7 |
+
RUN apt-get update && apt-get install -y --no-install-recommends \
|
| 8 |
+
curl \
|
| 9 |
+
&& rm -rf /var/lib/apt/lists/*
|
| 10 |
+
|
| 11 |
+
WORKDIR /app
|
| 12 |
+
|
| 13 |
+
# Install Python deps first (layer cache)
|
| 14 |
+
COPY requirements.txt .
|
| 15 |
+
RUN pip install --no-cache-dir -r requirements.txt
|
| 16 |
+
|
| 17 |
+
# Copy source
|
| 18 |
+
COPY . .
|
| 19 |
+
|
| 20 |
+
# Non-root user
|
| 21 |
+
RUN adduser --disabled-password --gecos "" appuser \
|
| 22 |
+
&& chown -R appuser:appuser /app
|
| 23 |
+
USER appuser
|
| 24 |
+
|
| 25 |
+
# Health check
|
| 26 |
+
HEALTHCHECK --interval=30s --timeout=10s --start-period=60s --retries=3 \
|
| 27 |
+
CMD curl -f http://localhost:${PORT}/health || exit 1
|
| 28 |
+
|
| 29 |
+
EXPOSE ${PORT}
|
| 30 |
+
|
| 31 |
+
# Production: single worker (CPU-bound inference β scale via replicas, not threads)
|
| 32 |
+
CMD ["uvicorn", "api:app", "--host", "0.0.0.0", "--port", "8000", "--workers", "1", "--log-level", "info"]
|
HF/api.py
ADDED
|
@@ -0,0 +1,314 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
api.py β Async FastAPI Inference Service
|
| 3 |
+
|
| 4 |
+
Endpoints:
|
| 5 |
+
POST /classify β Single log
|
| 6 |
+
POST /classify/batch β Batch of logs (up to 512)
|
| 7 |
+
GET /health β Liveness check
|
| 8 |
+
GET /ready β Readiness check (model loaded?)
|
| 9 |
+
GET /metrics β Request counts, throughput, latency stats
|
| 10 |
+
|
| 11 |
+
Features:
|
| 12 |
+
- Async request handling (non-blocking)
|
| 13 |
+
- Worker pool via asyncio semaphore (bounded concurrency)
|
| 14 |
+
- Structured JSON logs with request_id
|
| 15 |
+
- Rate limiting (configurable)
|
| 16 |
+
- Request ID tracing
|
| 17 |
+
- Batch queue aggregation for small requests
|
| 18 |
+
|
| 19 |
+
Run:
|
| 20 |
+
uvicorn api:app --host 0.0.0.0 --port 8000 --workers 1
|
| 21 |
+
|
| 22 |
+
Example:
|
| 23 |
+
curl -X POST http://localhost:8000/classify \
|
| 24 |
+
-H "Content-Type: application/json" \
|
| 25 |
+
-d '{"source": "ModernCRM", "log_message": "User User123 logged in."}'
|
| 26 |
+
"""
|
| 27 |
+
from __future__ import annotations
|
| 28 |
+
import asyncio
|
| 29 |
+
import logging
|
| 30 |
+
import os
|
| 31 |
+
import time
|
| 32 |
+
import uuid
|
| 33 |
+
import statistics
|
| 34 |
+
from collections import deque
|
| 35 |
+
from contextlib import asynccontextmanager
|
| 36 |
+
from typing import Optional
|
| 37 |
+
|
| 38 |
+
from fastapi import FastAPI, HTTPException, Request, status
|
| 39 |
+
from fastapi.middleware.cors import CORSMiddleware
|
| 40 |
+
from fastapi.responses import JSONResponse
|
| 41 |
+
from pydantic import BaseModel, Field, field_validator
|
| 42 |
+
|
| 43 |
+
# ββ Logging setup βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 44 |
+
logging.basicConfig(
|
| 45 |
+
level=logging.INFO,
|
| 46 |
+
format='{"time":"%(asctime)s","level":"%(levelname)s","logger":"%(name)s","msg":"%(message)s"}'
|
| 47 |
+
)
|
| 48 |
+
logger = logging.getLogger("log-classifier-api")
|
| 49 |
+
|
| 50 |
+
# ββ Config βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 51 |
+
MAX_BATCH_SIZE = int(os.getenv("MAX_BATCH_SIZE", "512"))
|
| 52 |
+
MAX_CONCURRENT = int(os.getenv("MAX_CONCURRENT", "4")) # concurrency cap
|
| 53 |
+
RATE_LIMIT_PER_MIN = int(os.getenv("RATE_LIMIT_PER_MIN", "1000"))
|
| 54 |
+
LOG_MAX_CHARS = 2048 # truncate huge logs before classify
|
| 55 |
+
|
| 56 |
+
# ββ Global state βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 57 |
+
_semaphore: asyncio.Semaphore = None # type: ignore
|
| 58 |
+
_model_ready: bool = False
|
| 59 |
+
|
| 60 |
+
# Metrics ring buffer (last 1000 requests)
|
| 61 |
+
_latencies_ms: deque = deque(maxlen=1000)
|
| 62 |
+
_request_count = 0
|
| 63 |
+
_error_count = 0
|
| 64 |
+
_start_time = time.time()
|
| 65 |
+
|
| 66 |
+
# Rate limiter (simple sliding window per process)
|
| 67 |
+
_rate_window: deque = deque(maxlen=RATE_LIMIT_PER_MIN)
|
| 68 |
+
|
| 69 |
+
|
| 70 |
+
# ββ Lifespan: load models on startup ββββββββββββββββββββββββββββββββββββββββββ
|
| 71 |
+
@asynccontextmanager
|
| 72 |
+
async def lifespan(app: FastAPI):
|
| 73 |
+
global _semaphore, _model_ready
|
| 74 |
+
|
| 75 |
+
logger.info("Starting up β loading modelsβ¦")
|
| 76 |
+
_semaphore = asyncio.Semaphore(MAX_CONCURRENT)
|
| 77 |
+
|
| 78 |
+
# Load models in a thread pool (blocking I/O, don't block event loop)
|
| 79 |
+
loop = asyncio.get_event_loop()
|
| 80 |
+
try:
|
| 81 |
+
await loop.run_in_executor(None, _load_models_blocking)
|
| 82 |
+
_model_ready = True
|
| 83 |
+
logger.info("β
Models loaded β API ready")
|
| 84 |
+
except Exception as e:
|
| 85 |
+
logger.error(f"β Model load failed: {e}")
|
| 86 |
+
# Service starts but /ready will return 503
|
| 87 |
+
|
| 88 |
+
yield
|
| 89 |
+
|
| 90 |
+
logger.info("Shutting down")
|
| 91 |
+
|
| 92 |
+
|
| 93 |
+
def _load_models_blocking():
|
| 94 |
+
"""Load BERT + classifier (blocks β run in executor)."""
|
| 95 |
+
from processor_bert import classify_batch as _
|
| 96 |
+
logger.info("BERT model loaded")
|
| 97 |
+
|
| 98 |
+
|
| 99 |
+
# ββ App factory ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 100 |
+
app = FastAPI(
|
| 101 |
+
title="Log Classification API",
|
| 102 |
+
description="3-tier hybrid pipeline: Regex β BERT β LLM",
|
| 103 |
+
version="3.0.0",
|
| 104 |
+
lifespan=lifespan,
|
| 105 |
+
)
|
| 106 |
+
|
| 107 |
+
app.add_middleware(
|
| 108 |
+
CORSMiddleware,
|
| 109 |
+
allow_origins=["*"],
|
| 110 |
+
allow_methods=["*"],
|
| 111 |
+
allow_headers=["*"],
|
| 112 |
+
)
|
| 113 |
+
|
| 114 |
+
|
| 115 |
+
# ββ Request / Response schemas βββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 116 |
+
class LogRequest(BaseModel):
|
| 117 |
+
source: str = Field(..., example="ModernCRM")
|
| 118 |
+
log_message: str = Field(..., example="User User123 logged in.", min_length=1)
|
| 119 |
+
|
| 120 |
+
@field_validator("log_message")
|
| 121 |
+
@classmethod
|
| 122 |
+
def truncate_long_logs(cls, v: str) -> str:
|
| 123 |
+
return v[:LOG_MAX_CHARS]
|
| 124 |
+
|
| 125 |
+
|
| 126 |
+
class LogResponse(BaseModel):
|
| 127 |
+
request_id: str
|
| 128 |
+
label: str
|
| 129 |
+
tier: str
|
| 130 |
+
confidence: Optional[float]
|
| 131 |
+
latency_ms: float
|
| 132 |
+
cached: bool = False
|
| 133 |
+
|
| 134 |
+
|
| 135 |
+
class BatchRequest(BaseModel):
|
| 136 |
+
logs: list[LogRequest] = Field(..., max_length=MAX_BATCH_SIZE)
|
| 137 |
+
|
| 138 |
+
|
| 139 |
+
class BatchResponse(BaseModel):
|
| 140 |
+
request_id: str
|
| 141 |
+
total: int
|
| 142 |
+
elapsed_ms: float
|
| 143 |
+
throughput: float
|
| 144 |
+
results: list[LogResponse]
|
| 145 |
+
|
| 146 |
+
|
| 147 |
+
class HealthResponse(BaseModel):
|
| 148 |
+
status: str
|
| 149 |
+
uptime_s: float
|
| 150 |
+
|
| 151 |
+
|
| 152 |
+
class MetricsResponse(BaseModel):
|
| 153 |
+
total_requests: int
|
| 154 |
+
total_errors: int
|
| 155 |
+
uptime_s: float
|
| 156 |
+
requests_per_min: float
|
| 157 |
+
latency_p50_ms: Optional[float]
|
| 158 |
+
latency_p95_ms: Optional[float]
|
| 159 |
+
latency_p99_ms: Optional[float]
|
| 160 |
+
|
| 161 |
+
|
| 162 |
+
# ββ Rate limiter βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 163 |
+
def _check_rate_limit() -> None:
|
| 164 |
+
now = time.time()
|
| 165 |
+
_rate_window.append(now)
|
| 166 |
+
# Window = last 60 seconds
|
| 167 |
+
recent = [t for t in _rate_window if now - t < 60]
|
| 168 |
+
if len(recent) > RATE_LIMIT_PER_MIN:
|
| 169 |
+
raise HTTPException(
|
| 170 |
+
status_code=status.HTTP_429_TOO_MANY_REQUESTS,
|
| 171 |
+
detail=f"Rate limit exceeded: {RATE_LIMIT_PER_MIN} req/min",
|
| 172 |
+
)
|
| 173 |
+
|
| 174 |
+
|
| 175 |
+
# ββ Middleware: request logging ββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 176 |
+
@app.middleware("http")
|
| 177 |
+
async def log_requests(request: Request, call_next):
|
| 178 |
+
rid = request.headers.get("X-Request-ID", str(uuid.uuid4())[:8])
|
| 179 |
+
request.state.request_id = rid
|
| 180 |
+
t0 = time.perf_counter()
|
| 181 |
+
response = await call_next(request)
|
| 182 |
+
elapsed = (time.perf_counter() - t0) * 1000
|
| 183 |
+
logger.info(
|
| 184 |
+
f"method={request.method} path={request.url.path} "
|
| 185 |
+
f"status={response.status_code} latency={elapsed:.1f}ms rid={rid}"
|
| 186 |
+
)
|
| 187 |
+
response.headers["X-Request-ID"] = rid
|
| 188 |
+
return response
|
| 189 |
+
|
| 190 |
+
|
| 191 |
+
# ββ Health & readiness βββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 192 |
+
@app.get("/health", response_model=HealthResponse, tags=["ops"])
|
| 193 |
+
async def health():
|
| 194 |
+
return {"status": "ok", "uptime_s": round(time.time() - _start_time, 1)}
|
| 195 |
+
|
| 196 |
+
|
| 197 |
+
@app.get("/ready", tags=["ops"])
|
| 198 |
+
async def ready():
|
| 199 |
+
if not _model_ready:
|
| 200 |
+
raise HTTPException(status_code=503, detail="Models not yet loaded")
|
| 201 |
+
return {"status": "ready"}
|
| 202 |
+
|
| 203 |
+
|
| 204 |
+
# ββ Metrics ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 205 |
+
@app.get("/metrics", response_model=MetricsResponse, tags=["ops"])
|
| 206 |
+
async def metrics():
|
| 207 |
+
uptime = time.time() - _start_time
|
| 208 |
+
lats = sorted(_latencies_ms) if _latencies_ms else []
|
| 209 |
+
n = len(lats)
|
| 210 |
+
|
| 211 |
+
def pct(p):
|
| 212 |
+
return round(lats[min(int(n * p), n - 1)], 2) if n else None
|
| 213 |
+
|
| 214 |
+
return {
|
| 215 |
+
"total_requests": _request_count,
|
| 216 |
+
"total_errors": _error_count,
|
| 217 |
+
"uptime_s": round(uptime, 1),
|
| 218 |
+
"requests_per_min": round(_request_count / max(uptime / 60, 1), 1),
|
| 219 |
+
"latency_p50_ms": pct(0.50),
|
| 220 |
+
"latency_p95_ms": pct(0.95),
|
| 221 |
+
"latency_p99_ms": pct(0.99),
|
| 222 |
+
}
|
| 223 |
+
|
| 224 |
+
|
| 225 |
+
# ββ Classify single ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 226 |
+
@app.post("/classify", response_model=LogResponse, tags=["inference"])
|
| 227 |
+
async def classify_single(req: LogRequest, request: Request):
|
| 228 |
+
global _request_count, _error_count
|
| 229 |
+
_check_rate_limit()
|
| 230 |
+
_request_count += 1
|
| 231 |
+
rid = getattr(request.state, "request_id", str(uuid.uuid4())[:8])
|
| 232 |
+
|
| 233 |
+
async with _semaphore:
|
| 234 |
+
loop = asyncio.get_event_loop()
|
| 235 |
+
t0 = time.perf_counter()
|
| 236 |
+
try:
|
| 237 |
+
result = await loop.run_in_executor(
|
| 238 |
+
None, _classify_blocking, req.source, req.log_message
|
| 239 |
+
)
|
| 240 |
+
except Exception as e:
|
| 241 |
+
_error_count += 1
|
| 242 |
+
logger.error(f"rid={rid} classify error: {e}")
|
| 243 |
+
raise HTTPException(status_code=500, detail=str(e))
|
| 244 |
+
|
| 245 |
+
latency = (time.perf_counter() - t0) * 1000
|
| 246 |
+
_latencies_ms.append(latency)
|
| 247 |
+
|
| 248 |
+
return LogResponse(
|
| 249 |
+
request_id = rid,
|
| 250 |
+
label = result["label"],
|
| 251 |
+
tier = result["tier"],
|
| 252 |
+
confidence = result.get("confidence"),
|
| 253 |
+
latency_ms = round(latency, 2),
|
| 254 |
+
)
|
| 255 |
+
|
| 256 |
+
|
| 257 |
+
def _classify_blocking(source: str, log_message: str) -> dict:
|
| 258 |
+
from classify import classify_log
|
| 259 |
+
return classify_log(source, log_message)
|
| 260 |
+
|
| 261 |
+
|
| 262 |
+
# ββ Classify batch βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 263 |
+
@app.post("/classify/batch", response_model=BatchResponse, tags=["inference"])
|
| 264 |
+
async def classify_batch_endpoint(req: BatchRequest, request: Request):
|
| 265 |
+
global _request_count, _error_count
|
| 266 |
+
_check_rate_limit()
|
| 267 |
+
_request_count += 1
|
| 268 |
+
rid = getattr(request.state, "request_id", str(uuid.uuid4())[:8])
|
| 269 |
+
|
| 270 |
+
log_pairs = [(r.source, r.log_message) for r in req.logs]
|
| 271 |
+
|
| 272 |
+
async with _semaphore:
|
| 273 |
+
loop = asyncio.get_event_loop()
|
| 274 |
+
t0 = time.perf_counter()
|
| 275 |
+
try:
|
| 276 |
+
results = await loop.run_in_executor(
|
| 277 |
+
None, _classify_batch_blocking, log_pairs
|
| 278 |
+
)
|
| 279 |
+
except Exception as e:
|
| 280 |
+
_error_count += 1
|
| 281 |
+
logger.error(f"rid={rid} batch error: {e}")
|
| 282 |
+
raise HTTPException(status_code=500, detail=str(e))
|
| 283 |
+
|
| 284 |
+
elapsed_ms = (time.perf_counter() - t0) * 1000
|
| 285 |
+
throughput = round(len(log_pairs) / (elapsed_ms / 1000), 1)
|
| 286 |
+
_latencies_ms.extend([elapsed_ms / len(log_pairs)] * len(log_pairs))
|
| 287 |
+
|
| 288 |
+
return BatchResponse(
|
| 289 |
+
request_id = rid,
|
| 290 |
+
total = len(log_pairs),
|
| 291 |
+
elapsed_ms = round(elapsed_ms, 2),
|
| 292 |
+
throughput = throughput,
|
| 293 |
+
results = [
|
| 294 |
+
LogResponse(
|
| 295 |
+
request_id = rid,
|
| 296 |
+
label = r["label"],
|
| 297 |
+
tier = r["tier"],
|
| 298 |
+
confidence = r.get("confidence"),
|
| 299 |
+
latency_ms = round(elapsed_ms / len(log_pairs), 2),
|
| 300 |
+
)
|
| 301 |
+
for r in results
|
| 302 |
+
],
|
| 303 |
+
)
|
| 304 |
+
|
| 305 |
+
|
| 306 |
+
def _classify_batch_blocking(log_pairs: list[tuple[str, str]]) -> list[dict]:
|
| 307 |
+
from classify import classify_logs
|
| 308 |
+
return classify_logs(log_pairs)
|
| 309 |
+
|
| 310 |
+
|
| 311 |
+
# ββ Dev runner ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 312 |
+
if __name__ == "__main__":
|
| 313 |
+
import uvicorn
|
| 314 |
+
uvicorn.run("api:app", host="0.0.0.0", port=8000, reload=False, workers=1)
|
HF/app_gradio.py
ADDED
|
@@ -0,0 +1,187 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Log Classification System β HuggingFace Spaces
|
| 3 |
+
Gradio UI for the 3-tier hybrid log classification pipeline.
|
| 4 |
+
"""
|
| 5 |
+
from __future__ import annotations
|
| 6 |
+
import io
|
| 7 |
+
import time
|
| 8 |
+
import pandas as pd
|
| 9 |
+
import gradio as gr
|
| 10 |
+
from classify import classify_log, classify_csv
|
| 11 |
+
|
| 12 |
+
# ββ Source options ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 13 |
+
SOURCES = [
|
| 14 |
+
"ModernCRM",
|
| 15 |
+
"ModernHR",
|
| 16 |
+
"BillingSystem",
|
| 17 |
+
"AnalyticsEngine",
|
| 18 |
+
"ThirdPartyAPI",
|
| 19 |
+
"LegacyCRM",
|
| 20 |
+
]
|
| 21 |
+
|
| 22 |
+
TIER_COLORS = {
|
| 23 |
+
"Regex": "π’",
|
| 24 |
+
"BERT": "π΅",
|
| 25 |
+
"LLM": "π‘",
|
| 26 |
+
"LLM (fallback)": "π ",
|
| 27 |
+
}
|
| 28 |
+
|
| 29 |
+
EXAMPLE_LOGS = [
|
| 30 |
+
["ModernCRM", "User User12345 logged in."],
|
| 31 |
+
["ModernHR", "Multiple login failures occurred on user 6454 account"],
|
| 32 |
+
["BillingSystem", "GET /v2/servers/detail HTTP/1.1 status: 200 len: 1583 time: 0.19"],
|
| 33 |
+
["AnalyticsEngine", "System crashed due to disk I/O failure on node-3"],
|
| 34 |
+
["LegacyCRM", "Case escalation for ticket ID 7324 failed β support agent is no longer active."],
|
| 35 |
+
["LegacyCRM", "The 'BulkEmailSender' feature will be deprecated in v5.0. Use 'EmailCampaignManager'."],
|
| 36 |
+
]
|
| 37 |
+
|
| 38 |
+
|
| 39 |
+
# ββ Single log tab ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 40 |
+
def classify_single(source: str, log_message: str):
|
| 41 |
+
if not log_message.strip():
|
| 42 |
+
return "β", "β", "β", "β"
|
| 43 |
+
|
| 44 |
+
t0 = time.perf_counter()
|
| 45 |
+
result = classify_log(source, log_message)
|
| 46 |
+
latency_ms = (time.perf_counter() - t0) * 1000
|
| 47 |
+
|
| 48 |
+
label = result["label"]
|
| 49 |
+
tier = result["tier"]
|
| 50 |
+
confidence = f"{result['confidence']:.1%}" if result["confidence"] is not None else "N/A"
|
| 51 |
+
icon = TIER_COLORS.get(tier, "βͺ")
|
| 52 |
+
|
| 53 |
+
return (
|
| 54 |
+
label,
|
| 55 |
+
f"{icon} {tier}",
|
| 56 |
+
confidence,
|
| 57 |
+
f"{latency_ms:.1f} ms",
|
| 58 |
+
)
|
| 59 |
+
|
| 60 |
+
|
| 61 |
+
# ββ Batch CSV tab βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 62 |
+
def classify_batch(file):
|
| 63 |
+
if file is None:
|
| 64 |
+
return None, "β οΈ Please upload a CSV file."
|
| 65 |
+
|
| 66 |
+
try:
|
| 67 |
+
output_path, df = classify_csv(file.name, "/tmp/classified_output.csv")
|
| 68 |
+
except ValueError as e:
|
| 69 |
+
return None, f"β οΈ {e}"
|
| 70 |
+
except Exception as e:
|
| 71 |
+
return None, f"β Error: {e}"
|
| 72 |
+
|
| 73 |
+
total = len(df)
|
| 74 |
+
tier_counts = df["tier_used"].value_counts().to_dict()
|
| 75 |
+
label_counts = df["predicted_label"].value_counts().to_dict()
|
| 76 |
+
|
| 77 |
+
tier_lines = "\n".join(f" {TIER_COLORS.get(k,'βͺ')} {k}: {v} ({v/total:.0%})" for k, v in tier_counts.items())
|
| 78 |
+
label_lines = "\n".join(f" β’ {k}: {v}" for k, v in label_counts.items())
|
| 79 |
+
|
| 80 |
+
stats = (
|
| 81 |
+
f"β
Classified {total} logs\n\n"
|
| 82 |
+
f"π Tier breakdown:\n{tier_lines}\n\n"
|
| 83 |
+
f"π·οΈ Label distribution:\n{label_lines}"
|
| 84 |
+
)
|
| 85 |
+
|
| 86 |
+
return output_path, stats
|
| 87 |
+
|
| 88 |
+
|
| 89 |
+
# ββ UI ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 90 |
+
with gr.Blocks(title="Log Classification System", theme=gr.themes.Soft()) as demo:
|
| 91 |
+
|
| 92 |
+
gr.Markdown("""
|
| 93 |
+
# π Log Classification System
|
| 94 |
+
**3-tier hybrid pipeline** β π’ Regex Β· π΅ BERT + LogReg Β· π‘ LLM
|
| 95 |
+
Built to mimic production enterprise log monitoring architecture.
|
| 96 |
+
""")
|
| 97 |
+
|
| 98 |
+
with gr.Tabs():
|
| 99 |
+
|
| 100 |
+
# ββ Tab 1: Single Log ββββββββββββββββββββββββββββββββββββββββββββ
|
| 101 |
+
with gr.Tab("Single Log"):
|
| 102 |
+
with gr.Row():
|
| 103 |
+
source_input = gr.Dropdown(
|
| 104 |
+
choices=SOURCES,
|
| 105 |
+
value="ModernCRM",
|
| 106 |
+
label="Source System",
|
| 107 |
+
)
|
| 108 |
+
log_input = gr.Textbox(
|
| 109 |
+
label="Log Message",
|
| 110 |
+
placeholder="Paste a log message here...",
|
| 111 |
+
lines=3,
|
| 112 |
+
)
|
| 113 |
+
|
| 114 |
+
classify_btn = gr.Button("Classify", variant="primary")
|
| 115 |
+
|
| 116 |
+
with gr.Row():
|
| 117 |
+
label_out = gr.Textbox(label="π·οΈ Predicted Label", interactive=False)
|
| 118 |
+
tier_out = gr.Textbox(label="βοΈ Tier Used", interactive=False)
|
| 119 |
+
confidence_out = gr.Textbox(label="π Confidence", interactive=False)
|
| 120 |
+
latency_out = gr.Textbox(label="β±οΈ Latency", interactive=False)
|
| 121 |
+
|
| 122 |
+
classify_btn.click(
|
| 123 |
+
fn=classify_single,
|
| 124 |
+
inputs=[source_input, log_input],
|
| 125 |
+
outputs=[label_out, tier_out, confidence_out, latency_out],
|
| 126 |
+
)
|
| 127 |
+
|
| 128 |
+
gr.Examples(
|
| 129 |
+
examples=EXAMPLE_LOGS,
|
| 130 |
+
inputs=[source_input, log_input],
|
| 131 |
+
label="π Example Logs (click to try)",
|
| 132 |
+
)
|
| 133 |
+
|
| 134 |
+
# ββ Tab 2: Batch CSV βββββββββββββββββββββββββββββββββββββββββββββ
|
| 135 |
+
with gr.Tab("Batch CSV Upload"):
|
| 136 |
+
gr.Markdown("""
|
| 137 |
+
Upload a CSV with columns: **`source`**, **`log_message`**
|
| 138 |
+
Download the classified CSV with added columns: `predicted_label`, `tier_used`, `confidence`.
|
| 139 |
+
""")
|
| 140 |
+
with gr.Row():
|
| 141 |
+
with gr.Column():
|
| 142 |
+
csv_input = gr.File(label="π Upload CSV", file_types=[".csv"])
|
| 143 |
+
batch_btn = gr.Button("Classify All", variant="primary")
|
| 144 |
+
with gr.Column():
|
| 145 |
+
csv_output = gr.File(label="π₯ Download Classified CSV")
|
| 146 |
+
stats_out = gr.Textbox(label="π Stats", lines=12, interactive=False)
|
| 147 |
+
|
| 148 |
+
batch_btn.click(
|
| 149 |
+
fn=classify_batch,
|
| 150 |
+
inputs=[csv_input],
|
| 151 |
+
outputs=[csv_output, stats_out],
|
| 152 |
+
)
|
| 153 |
+
|
| 154 |
+
gr.Markdown("""
|
| 155 |
+
**Sample CSV format:**
|
| 156 |
+
```
|
| 157 |
+
source,log_message
|
| 158 |
+
ModernCRM,User User123 logged in.
|
| 159 |
+
LegacyCRM,Case escalation for ticket ID 7324 failed.
|
| 160 |
+
BillingSystem,GET /api/v2/invoice HTTP/1.1 status: 500
|
| 161 |
+
```
|
| 162 |
+
""")
|
| 163 |
+
|
| 164 |
+
# ββ Tab 3: Architecture ββββββββββββββββββββββββββββββββββββββββββ
|
| 165 |
+
with gr.Tab("Architecture"):
|
| 166 |
+
gr.Markdown("""
|
| 167 |
+
## ποΈ 3-Tier Hybrid Pipeline
|
| 168 |
+
|
| 169 |
+
| Tier | Method | Coverage | Latency | When Used |
|
| 170 |
+
|------|--------|----------|---------|-----------|
|
| 171 |
+
| π’ Regex | Python `re` patterns | ~21% | < 1ms | Fixed patterns (login, backup, etc.) |
|
| 172 |
+
| π΅ BERT | `all-MiniLM-L6-v2` + LogReg | ~79% | 20β80ms | High-volume categories with 150+ samples |
|
| 173 |
+
| π‘ LLM | HuggingFace Inference API | ~0.3% | 500β2000ms | LegacyCRM logs, rare patterns |
|
| 174 |
+
|
| 175 |
+
## π Model Performance (from training)
|
| 176 |
+
- **BERT + LogReg** trained on 2,410 synthetic enterprise logs
|
| 177 |
+
- **Confidence threshold**: 0.5 (below β escalate to LLM)
|
| 178 |
+
- **Source-aware routing**: `LegacyCRM` bypasses ML entirely (only 7 training samples)
|
| 179 |
+
|
| 180 |
+
## π Environment Variables
|
| 181 |
+
| Secret | Required For |
|
| 182 |
+
|--------|-------------|
|
| 183 |
+
| `HF_TOKEN` | LLM inference (LegacyCRM logs) |
|
| 184 |
+
""")
|
| 185 |
+
|
| 186 |
+
if __name__ == "__main__":
|
| 187 |
+
demo.launch()
|
HF/benchmark.py
ADDED
|
@@ -0,0 +1,214 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
benchmark.py β Full Benchmark Harness
|
| 3 |
+
|
| 4 |
+
Outputs a CSV with columns:
|
| 5 |
+
batch_size, mode, throughput_logs_sec, p50_ms, p95_ms, p99_ms, cpu_pct, ram_mb, tier_regex_pct, tier_bert_pct, tier_llm_pct
|
| 6 |
+
|
| 7 |
+
Usage:
|
| 8 |
+
python benchmark.py --logs 5000 --output benchmark_results.csv
|
| 9 |
+
|
| 10 |
+
What it measures:
|
| 11 |
+
- Batch size sweep: 1, 8, 16, 32, 64, 128
|
| 12 |
+
- Throughput (logs/sec)
|
| 13 |
+
- Latency: p50 / p95 / p99 (per-log)
|
| 14 |
+
- CPU and RAM during inference
|
| 15 |
+
- Tier distribution (Regex % / BERT % / LLM %)
|
| 16 |
+
|
| 17 |
+
Google interview talking point:
|
| 18 |
+
"I designed a benchmark harness that sweeps batch sizes and measures
|
| 19 |
+
latency percentiles + resource utilization, so I can show the
|
| 20 |
+
throughput-latency tradeoff curve empirically."
|
| 21 |
+
"""
|
| 22 |
+
from __future__ import annotations
|
| 23 |
+
import argparse
|
| 24 |
+
import csv
|
| 25 |
+
import os
|
| 26 |
+
import random
|
| 27 |
+
import sys
|
| 28 |
+
import time
|
| 29 |
+
import statistics
|
| 30 |
+
from pathlib import Path
|
| 31 |
+
|
| 32 |
+
import psutil
|
| 33 |
+
|
| 34 |
+
# ββ Synthetic log generator (no external deps needed) ββββββββββββββββββββββββ
|
| 35 |
+
SOURCES = ["ModernCRM", "ModernHR", "BillingSystem", "AnalyticsEngine", "ThirdPartyAPI"]
|
| 36 |
+
|
| 37 |
+
_LOG_TEMPLATES = [
|
| 38 |
+
("ModernCRM", "User User{id} logged in."),
|
| 39 |
+
("ModernCRM", "IP {ip} blocked due to potential attack"),
|
| 40 |
+
("ModernHR", "Multiple login failures occurred on user {id} account"),
|
| 41 |
+
("ModernHR", "Admin access escalation detected for user {id}"),
|
| 42 |
+
("BillingSystem", "GET /api/v2/invoices HTTP/1.1 status: {code} len: {len} time: {t}"),
|
| 43 |
+
("BillingSystem", "POST /api/v1/payments HTTP/1.1 status: {code} len: {len} time: {t}"),
|
| 44 |
+
("AnalyticsEngine", "System crashed due to disk I/O failure on node-{n}"),
|
| 45 |
+
("AnalyticsEngine", "Backup completed successfully."),
|
| 46 |
+
("ThirdPartyAPI", "Service payments-api is unreachable after 3 retries"),
|
| 47 |
+
("ThirdPartyAPI", "CPU usage at {pct}% for the last 10 minutes on node-{n}"),
|
| 48 |
+
("AnalyticsEngine", "CRITICAL: data corruption detected on shard-{n}"),
|
| 49 |
+
("ModernCRM", "Health check passed for service {svc}"),
|
| 50 |
+
]
|
| 51 |
+
|
| 52 |
+
def _rand_ip():
|
| 53 |
+
return f"{random.randint(10,192)}.{random.randint(0,255)}.{random.randint(0,255)}.{random.randint(1,254)}"
|
| 54 |
+
|
| 55 |
+
def _fill(template: str) -> str:
|
| 56 |
+
return (template
|
| 57 |
+
.replace("{id}", str(random.randint(100, 99999)))
|
| 58 |
+
.replace("{ip}", _rand_ip())
|
| 59 |
+
.replace("{code}", random.choice(["200", "201", "400", "404", "500", "503"]))
|
| 60 |
+
.replace("{len}", str(random.randint(100, 9999)))
|
| 61 |
+
.replace("{t}", f"{random.uniform(0.01, 2.5):.2f}")
|
| 62 |
+
.replace("{n}", str(random.randint(1, 20)))
|
| 63 |
+
.replace("{pct}", str(random.randint(60, 99)))
|
| 64 |
+
.replace("{svc}", random.choice(["auth-api", "billing", "analytics", "events"]))
|
| 65 |
+
)
|
| 66 |
+
|
| 67 |
+
def generate_logs(n: int) -> list[tuple[str, str]]:
|
| 68 |
+
random.seed(42)
|
| 69 |
+
return [
|
| 70 |
+
(src, _fill(tmpl))
|
| 71 |
+
for src, tmpl in random.choices(_LOG_TEMPLATES, k=n)
|
| 72 |
+
]
|
| 73 |
+
|
| 74 |
+
|
| 75 |
+
# ββ Benchmark runner βββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 76 |
+
def run_benchmark(
|
| 77 |
+
logs: list[tuple[str, str]],
|
| 78 |
+
batch_sizes: list[int],
|
| 79 |
+
output_csv: str,
|
| 80 |
+
warmup_n: int = 50,
|
| 81 |
+
) -> list[dict]:
|
| 82 |
+
from classify import classify_logs, pipeline_summary
|
| 83 |
+
|
| 84 |
+
proc = psutil.Process(os.getpid())
|
| 85 |
+
rows: list[dict] = []
|
| 86 |
+
|
| 87 |
+
# Warmup (model load, JIT, etc.)
|
| 88 |
+
print(f"π₯ Warming up with {warmup_n} logsβ¦")
|
| 89 |
+
classify_logs(logs[:warmup_n])
|
| 90 |
+
|
| 91 |
+
for bs in batch_sizes:
|
| 92 |
+
# Slice logs into batches of size `bs`
|
| 93 |
+
batches = [logs[i:i + bs] for i in range(0, len(logs), bs)]
|
| 94 |
+
if not batches:
|
| 95 |
+
continue
|
| 96 |
+
|
| 97 |
+
per_log_latencies: list[float] = []
|
| 98 |
+
cpu_samples: list[float] = []
|
| 99 |
+
ram_samples: list[float] = []
|
| 100 |
+
all_results: list[dict] = []
|
| 101 |
+
|
| 102 |
+
print(f"\nπ Batch size = {bs} ({len(batches)} batches Γ {bs} logs)β¦")
|
| 103 |
+
|
| 104 |
+
wall_start = time.perf_counter()
|
| 105 |
+
|
| 106 |
+
for batch in batches:
|
| 107 |
+
t0 = time.perf_counter()
|
| 108 |
+
results = classify_logs(batch)
|
| 109 |
+
t1 = time.perf_counter()
|
| 110 |
+
batch_ms = (t1 - t0) * 1000
|
| 111 |
+
per_log_ms = batch_ms / len(batch)
|
| 112 |
+
|
| 113 |
+
per_log_latencies.extend([per_log_ms] * len(batch))
|
| 114 |
+
all_results.extend(results)
|
| 115 |
+
|
| 116 |
+
# Resource snapshot
|
| 117 |
+
cpu_samples.append(proc.cpu_percent(interval=None))
|
| 118 |
+
ram_samples.append(proc.memory_info().rss / 1_048_576) # MB
|
| 119 |
+
|
| 120 |
+
wall_elapsed = time.perf_counter() - wall_start
|
| 121 |
+
total_logs = len(logs)
|
| 122 |
+
throughput = round(total_logs / wall_elapsed, 1)
|
| 123 |
+
|
| 124 |
+
per_log_latencies.sort()
|
| 125 |
+
n = len(per_log_latencies)
|
| 126 |
+
|
| 127 |
+
summary = pipeline_summary(all_results)
|
| 128 |
+
tier_stats = summary["tier_stats"]
|
| 129 |
+
|
| 130 |
+
def tier_pct(name):
|
| 131 |
+
return tier_stats.get(name, {}).get("pct", 0.0)
|
| 132 |
+
|
| 133 |
+
row = {
|
| 134 |
+
"batch_size": bs,
|
| 135 |
+
"total_logs": total_logs,
|
| 136 |
+
"elapsed_sec": round(wall_elapsed, 2),
|
| 137 |
+
"throughput_logs_sec": throughput,
|
| 138 |
+
"p50_ms": round(statistics.median(per_log_latencies), 3),
|
| 139 |
+
"p95_ms": round(per_log_latencies[min(int(n * 0.95), n - 1)], 3),
|
| 140 |
+
"p99_ms": round(per_log_latencies[min(int(n * 0.99), n - 1)], 3),
|
| 141 |
+
"mean_ms": round(statistics.mean(per_log_latencies), 3),
|
| 142 |
+
"cpu_mean_pct": round(statistics.mean(cpu_samples), 1) if cpu_samples else 0,
|
| 143 |
+
"cpu_max_pct": round(max(cpu_samples), 1) if cpu_samples else 0,
|
| 144 |
+
"ram_mean_mb": round(statistics.mean(ram_samples), 1) if ram_samples else 0,
|
| 145 |
+
"ram_max_mb": round(max(ram_samples), 1) if ram_samples else 0,
|
| 146 |
+
"tier_regex_pct": tier_pct("Regex"),
|
| 147 |
+
"tier_bert_pct": tier_pct("BERT"),
|
| 148 |
+
"tier_llm_pct": tier_pct("LLM") + tier_pct("LLM (fallback)"),
|
| 149 |
+
}
|
| 150 |
+
rows.append(row)
|
| 151 |
+
|
| 152 |
+
print(f" β
Throughput: {throughput} logs/sec | "
|
| 153 |
+
f"p50={row['p50_ms']}ms p95={row['p95_ms']}ms p99={row['p99_ms']}ms | "
|
| 154 |
+
f"CPU={row['cpu_mean_pct']}% RAM={row['ram_mean_mb']}MB")
|
| 155 |
+
print(f" π Tiers: Regex={row['tier_regex_pct']}% "
|
| 156 |
+
f"BERT={row['tier_bert_pct']}% "
|
| 157 |
+
f"LLM={row['tier_llm_pct']}%")
|
| 158 |
+
|
| 159 |
+
# Write CSV
|
| 160 |
+
Path(output_csv).parent.mkdir(parents=True, exist_ok=True)
|
| 161 |
+
with open(output_csv, "w", newline="") as f:
|
| 162 |
+
writer = csv.DictWriter(f, fieldnames=rows[0].keys())
|
| 163 |
+
writer.writeheader()
|
| 164 |
+
writer.writerows(rows)
|
| 165 |
+
|
| 166 |
+
print(f"\nβ
Benchmark results saved β {output_csv}")
|
| 167 |
+
return rows
|
| 168 |
+
|
| 169 |
+
|
| 170 |
+
# ββ Scaling stress test ββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 171 |
+
def stress_test(sizes: list[int] = [5_000, 20_000, 50_000, 100_000]) -> None:
|
| 172 |
+
"""Quick throughput check at different total log counts."""
|
| 173 |
+
from classify import classify_logs
|
| 174 |
+
|
| 175 |
+
print("\nπ₯ Stress Test β Scaling")
|
| 176 |
+
print(f"{'N logs':>10} {'Elapsed(s)':>12} {'Throughput':>12} {'p95_ms':>10}")
|
| 177 |
+
print("β" * 50)
|
| 178 |
+
|
| 179 |
+
for n in sizes:
|
| 180 |
+
logs = generate_logs(n)
|
| 181 |
+
t0 = time.perf_counter()
|
| 182 |
+
classify_logs(logs)
|
| 183 |
+
elapsed = time.perf_counter() - t0
|
| 184 |
+
tput = n / elapsed
|
| 185 |
+
# Rough p95 approximation: time / n * correction factor
|
| 186 |
+
p95_approx = (elapsed / n * 1000) * 1.5
|
| 187 |
+
print(f"{n:>10,} {elapsed:>12.2f}s {tput:>12.1f}/s {p95_approx:>10.1f}ms")
|
| 188 |
+
|
| 189 |
+
|
| 190 |
+
# ββ CLI ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 191 |
+
def main():
|
| 192 |
+
parser = argparse.ArgumentParser(description="Log pipeline benchmark harness")
|
| 193 |
+
parser.add_argument("--logs", type=int, default=5_000,
|
| 194 |
+
help="Number of logs to benchmark (default: 5000)")
|
| 195 |
+
parser.add_argument("--output", default="benchmark_results.csv",
|
| 196 |
+
help="Output CSV path")
|
| 197 |
+
parser.add_argument("--stress", action="store_true",
|
| 198 |
+
help="Run scaling stress test (5k, 20k, 50k, 100k)")
|
| 199 |
+
parser.add_argument("--batches", default="1,8,16,32,64,128",
|
| 200 |
+
help="Comma-separated batch sizes to sweep")
|
| 201 |
+
args = parser.parse_args()
|
| 202 |
+
|
| 203 |
+
batch_sizes = [int(x) for x in args.batches.split(",")]
|
| 204 |
+
logs = generate_logs(args.logs)
|
| 205 |
+
print(f"π¦ Generated {len(logs):,} synthetic logs")
|
| 206 |
+
|
| 207 |
+
run_benchmark(logs, batch_sizes, args.output)
|
| 208 |
+
|
| 209 |
+
if args.stress:
|
| 210 |
+
stress_test()
|
| 211 |
+
|
| 212 |
+
|
| 213 |
+
if __name__ == "__main__":
|
| 214 |
+
main()
|
HF/classify.py
ADDED
|
@@ -0,0 +1,198 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
classify.py β 3-Tier Hybrid Pipeline (V3 β Latency-Tracked)
|
| 3 |
+
|
| 4 |
+
Architecture:
|
| 5 |
+
LegacyCRM β LLM directly
|
| 6 |
+
Others β Regex β BERT (batch) β LLM fallback
|
| 7 |
+
|
| 8 |
+
Changes in V3:
|
| 9 |
+
- Tier-wise latency tracking (regex_ms, bert_ms, llm_ms)
|
| 10 |
+
- Pipeline summary with p50/p95 per tier
|
| 11 |
+
- Defensive: LLM timeout + retry baked in via processor_llm
|
| 12 |
+
- classify_logs returns richer result dict
|
| 13 |
+
"""
|
| 14 |
+
from __future__ import annotations
|
| 15 |
+
import time
|
| 16 |
+
import statistics
|
| 17 |
+
import pandas as pd
|
| 18 |
+
from processor_regex import classify_with_regex
|
| 19 |
+
from processor_bert import classify_batch as bert_batch
|
| 20 |
+
from processor_llm import classify_with_llm
|
| 21 |
+
|
| 22 |
+
LEGACY_SOURCE = "LegacyCRM"
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
# ββ Result type βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 26 |
+
def _make_result(label: str, tier: str, confidence, latency_ms: float) -> dict:
|
| 27 |
+
return {
|
| 28 |
+
"label": label,
|
| 29 |
+
"tier": tier,
|
| 30 |
+
"confidence": confidence,
|
| 31 |
+
"latency_ms": round(latency_ms, 3),
|
| 32 |
+
}
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
# ββ Single log (backward-compatible) ββββββββββββββββββββββββββββββββββββββββ
|
| 36 |
+
def classify_log(source: str, log_msg: str) -> dict:
|
| 37 |
+
"""Single log classify karo. Returns label, tier, confidence, latency_ms."""
|
| 38 |
+
results = classify_logs([(source, log_msg)])
|
| 39 |
+
return results[0]
|
| 40 |
+
|
| 41 |
+
|
| 42 |
+
# ββ Batch pipeline (main entry point) βββββββββββββββββββββββββββββββββββββββ
|
| 43 |
+
def classify_logs(logs: list[tuple[str, str]]) -> list[dict]:
|
| 44 |
+
"""
|
| 45 |
+
Batch classify with 3-tier routing + per-result latency.
|
| 46 |
+
|
| 47 |
+
Returns list of dicts:
|
| 48 |
+
{ label, tier, confidence, latency_ms }
|
| 49 |
+
|
| 50 |
+
Tier routing:
|
| 51 |
+
LegacyCRM source β LLM directly
|
| 52 |
+
Regex match β done (sub-ms)
|
| 53 |
+
Remainder β BERT batch β LLM if low confidence
|
| 54 |
+
"""
|
| 55 |
+
n = len(logs)
|
| 56 |
+
results = [None] * n
|
| 57 |
+
|
| 58 |
+
# ββ Step 1: Route to groups βββββββββββββββββββββββββββββββββββββββββββββ
|
| 59 |
+
llm_indices = []
|
| 60 |
+
bert_indices = []
|
| 61 |
+
entry_times = [time.perf_counter()] * n # approximate per-log start
|
| 62 |
+
|
| 63 |
+
t_route_start = time.perf_counter()
|
| 64 |
+
for i, (source, log_msg) in enumerate(logs):
|
| 65 |
+
entry_times[i] = time.perf_counter()
|
| 66 |
+
if source == LEGACY_SOURCE:
|
| 67 |
+
llm_indices.append(i)
|
| 68 |
+
else:
|
| 69 |
+
t0 = time.perf_counter()
|
| 70 |
+
label = classify_with_regex(log_msg)
|
| 71 |
+
t1 = time.perf_counter()
|
| 72 |
+
if label:
|
| 73 |
+
results[i] = _make_result(label, "Regex", 1.0, (t1 - t0) * 1000)
|
| 74 |
+
else:
|
| 75 |
+
bert_indices.append(i)
|
| 76 |
+
|
| 77 |
+
# ββ Step 2: BERT batch ββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 78 |
+
if bert_indices:
|
| 79 |
+
bert_msgs = [logs[i][1] for i in bert_indices]
|
| 80 |
+
|
| 81 |
+
t_bert_start = time.perf_counter()
|
| 82 |
+
bert_results = bert_batch(bert_msgs)
|
| 83 |
+
t_bert_end = time.perf_counter()
|
| 84 |
+
|
| 85 |
+
bert_ms_per_log = (t_bert_end - t_bert_start) * 1000 / len(bert_msgs)
|
| 86 |
+
|
| 87 |
+
for idx, (label, conf) in zip(bert_indices, bert_results):
|
| 88 |
+
if label != "Unclassified":
|
| 89 |
+
results[idx] = _make_result(label, "BERT", conf, bert_ms_per_log)
|
| 90 |
+
else:
|
| 91 |
+
llm_indices.append(idx)
|
| 92 |
+
|
| 93 |
+
# ββ Step 3: LLM (LegacyCRM + BERT fallback) ββββββββββββββββββββββββββββ
|
| 94 |
+
for i in llm_indices:
|
| 95 |
+
_, log_msg = logs[i]
|
| 96 |
+
t0 = time.perf_counter()
|
| 97 |
+
label = classify_with_llm(log_msg)
|
| 98 |
+
t1 = time.perf_counter()
|
| 99 |
+
tier = "LLM" if logs[i][0] == LEGACY_SOURCE else "LLM (fallback)"
|
| 100 |
+
results[i] = _make_result(label, tier, None, (t1 - t0) * 1000)
|
| 101 |
+
|
| 102 |
+
return results
|
| 103 |
+
|
| 104 |
+
|
| 105 |
+
# ββ Pipeline summary βββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 106 |
+
def pipeline_summary(results: list[dict]) -> dict:
|
| 107 |
+
"""
|
| 108 |
+
Aggregate stats from classify_logs output.
|
| 109 |
+
Useful for dashboard and benchmark reporting.
|
| 110 |
+
"""
|
| 111 |
+
tier_groups: dict[str, list[float]] = {}
|
| 112 |
+
label_counts: dict[str, int] = {}
|
| 113 |
+
|
| 114 |
+
for r in results:
|
| 115 |
+
tier = r["tier"]
|
| 116 |
+
tier_groups.setdefault(tier, []).append(r["latency_ms"])
|
| 117 |
+
label_counts[r["label"]] = label_counts.get(r["label"], 0) + 1
|
| 118 |
+
|
| 119 |
+
total = len(results)
|
| 120 |
+
tier_stats = {}
|
| 121 |
+
for tier, latencies in tier_groups.items():
|
| 122 |
+
latencies_sorted = sorted(latencies)
|
| 123 |
+
n = len(latencies_sorted)
|
| 124 |
+
tier_stats[tier] = {
|
| 125 |
+
"count": n,
|
| 126 |
+
"pct": round(n / total * 100, 1),
|
| 127 |
+
"p50_ms": round(statistics.median(latencies_sorted), 2),
|
| 128 |
+
"p95_ms": round(latencies_sorted[min(int(n * 0.95), n - 1)], 2),
|
| 129 |
+
"p99_ms": round(latencies_sorted[min(int(n * 0.99), n - 1)], 2),
|
| 130 |
+
"mean_ms": round(statistics.mean(latencies_sorted), 2),
|
| 131 |
+
}
|
| 132 |
+
|
| 133 |
+
return {
|
| 134 |
+
"total": total,
|
| 135 |
+
"tier_stats": tier_stats,
|
| 136 |
+
"label_counts": label_counts,
|
| 137 |
+
}
|
| 138 |
+
|
| 139 |
+
|
| 140 |
+
# ββ CSV batch classify βββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 141 |
+
def classify_csv(input_path: str, output_path: str = "output.csv") -> tuple[str, pd.DataFrame]:
|
| 142 |
+
"""
|
| 143 |
+
CSV file classify karo.
|
| 144 |
+
Required columns: 'source', 'log_message'
|
| 145 |
+
Output: adds 'predicted_label', 'tier_used', 'confidence', 'latency_ms'
|
| 146 |
+
"""
|
| 147 |
+
df = pd.read_csv(input_path)
|
| 148 |
+
required = {"source", "log_message"}
|
| 149 |
+
if not required.issubset(df.columns):
|
| 150 |
+
raise ValueError(f"CSV mein ye columns chahiye: {required}. Mila: {set(df.columns)}")
|
| 151 |
+
|
| 152 |
+
log_pairs = list(zip(df["source"], df["log_message"]))
|
| 153 |
+
results = classify_logs(log_pairs)
|
| 154 |
+
|
| 155 |
+
df["predicted_label"] = [r["label"] for r in results]
|
| 156 |
+
df["tier_used"] = [r["tier"] for r in results]
|
| 157 |
+
df["latency_ms"] = [r["latency_ms"] for r in results]
|
| 158 |
+
df["confidence"] = [
|
| 159 |
+
f"{r['confidence']:.1%}" if r["confidence"] is not None else "N/A"
|
| 160 |
+
for r in results
|
| 161 |
+
]
|
| 162 |
+
|
| 163 |
+
df.to_csv(output_path, index=False)
|
| 164 |
+
return output_path, df
|
| 165 |
+
|
| 166 |
+
|
| 167 |
+
# Aliases
|
| 168 |
+
classify = classify_logs
|
| 169 |
+
|
| 170 |
+
|
| 171 |
+
# ββ Self-test ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 172 |
+
if __name__ == "__main__":
|
| 173 |
+
sample = [
|
| 174 |
+
("ModernCRM", "IP 192.168.133.114 blocked due to potential attack"),
|
| 175 |
+
("BillingSystem", "User User12345 logged in."),
|
| 176 |
+
("AnalyticsEngine", "File data_6957.csv uploaded successfully by user User265."),
|
| 177 |
+
("ModernHR", "GET /v2/servers/detail HTTP/1.1 status: 200 len: 1583 time: 0.19"),
|
| 178 |
+
("ModernHR", "Admin access escalation detected for user 9429"),
|
| 179 |
+
("LegacyCRM", "Case escalation for ticket ID 7324 failed because the assigned support agent is no longer active."),
|
| 180 |
+
("LegacyCRM", "The 'ReportGenerator' module will be retired in version 4.0."),
|
| 181 |
+
]
|
| 182 |
+
|
| 183 |
+
print(f'{"Source":<20} {"Tier":<18} {"Conf":>6} {"Lat(ms)":>8} {"Label":<25} Log')
|
| 184 |
+
print("β" * 115)
|
| 185 |
+
results = classify_logs(sample)
|
| 186 |
+
for (source, log), r in zip(sample, results):
|
| 187 |
+
conf = f"{r['confidence']:.0%}" if r["confidence"] else " N/A"
|
| 188 |
+
print(f'{source:<20} {r["tier"]:<18} {conf:>6} {r["latency_ms"]:>8.1f} {r["label"]:<25} {log[:40]}')
|
| 189 |
+
|
| 190 |
+
summary = pipeline_summary(results)
|
| 191 |
+
print("\nπ Pipeline Summary:")
|
| 192 |
+
for tier, stats in summary["tier_stats"].items():
|
| 193 |
+
print(f" {tier}: {stats['count']} logs ({stats['pct']}%) | "
|
| 194 |
+
f"p50={stats['p50_ms']}ms p95={stats['p95_ms']}ms p99={stats['p99_ms']}ms")
|
| 195 |
+
|
| 196 |
+
print("\nπ·οΈ Label distribution:")
|
| 197 |
+
for label, count in sorted(summary["label_counts"].items(), key=lambda x: -x[1]):
|
| 198 |
+
print(f" β’ {label}: {count}")
|
HF/error_analysis.py
ADDED
|
@@ -0,0 +1,250 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
error_analysis.py β Deep Dive into Unclassified / Misclassified Logs
|
| 3 |
+
|
| 4 |
+
This script addresses the 76 unclassified logs from the 20k run.
|
| 5 |
+
It answers:
|
| 6 |
+
1. What do these logs look like? (print + group)
|
| 7 |
+
2. Why did the model fail? (pattern analysis)
|
| 8 |
+
3. What should we do? (actionable fix suggestions)
|
| 9 |
+
|
| 10 |
+
Google interview talking point:
|
| 11 |
+
"I performed structured error analysis on my model's failure cases.
|
| 12 |
+
I grouped them by failure type β vocabulary mismatch, ambiguous intent,
|
| 13 |
+
formatting noise β and used that to drive targeted improvements."
|
| 14 |
+
|
| 15 |
+
Usage:
|
| 16 |
+
python error_analysis.py --input output.csv # post-classify CSV
|
| 17 |
+
python error_analysis.py --simulate # demo with synthetic data
|
| 18 |
+
"""
|
| 19 |
+
from __future__ import annotations
|
| 20 |
+
import argparse
|
| 21 |
+
import re
|
| 22 |
+
import sys
|
| 23 |
+
from collections import Counter, defaultdict
|
| 24 |
+
from typing import Optional
|
| 25 |
+
import pandas as pd
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
# ββ Failure mode taxonomy ββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 29 |
+
class FailureMode:
|
| 30 |
+
RARE_VOCAB = "rare_vocabulary" # domain-specific terms not in training
|
| 31 |
+
AMBIGUOUS = "ambiguous_intent" # log could match multiple categories
|
| 32 |
+
LEGACY_FORMAT = "legacy_format" # non-standard / old-school formatting
|
| 33 |
+
TRUNCATED = "truncated_or_noisy" # partial / malformed log line
|
| 34 |
+
NUMERIC_ONLY = "mostly_numeric" # ID/code-heavy, no semantic signal
|
| 35 |
+
MULTI_EVENT = "multi_event" # one line, multiple events
|
| 36 |
+
UNKNOWN = "unknown"
|
| 37 |
+
|
| 38 |
+
|
| 39 |
+
def _detect_failure_mode(log: str) -> str:
|
| 40 |
+
"""Heuristic: guess WHY this log was unclassified."""
|
| 41 |
+
log_l = log.lower()
|
| 42 |
+
|
| 43 |
+
if len(log) < 20:
|
| 44 |
+
return FailureMode.TRUNCATED
|
| 45 |
+
|
| 46 |
+
# Check ratio of digits to total chars
|
| 47 |
+
digit_ratio = sum(c.isdigit() for c in log) / max(len(log), 1)
|
| 48 |
+
if digit_ratio > 0.40:
|
| 49 |
+
return FailureMode.NUMERIC_ONLY
|
| 50 |
+
|
| 51 |
+
# Looks like it has 2+ events joined
|
| 52 |
+
if log.count(";") >= 2 or log.count(" AND ") >= 1 or log.count(" | ") >= 2:
|
| 53 |
+
return FailureMode.MULTI_EVENT
|
| 54 |
+
|
| 55 |
+
# Legacy / unusual format signals
|
| 56 |
+
legacy_signals = ["ticket", "escalation", "crm", "deprecated", "retire",
|
| 57 |
+
"module will be", "workflow", "assigned agent"]
|
| 58 |
+
if any(s in log_l for s in legacy_signals):
|
| 59 |
+
return FailureMode.LEGACY_FORMAT
|
| 60 |
+
|
| 61 |
+
# Ambiguity signals β could be error OR security
|
| 62 |
+
ambiguous_signals = ["failed", "error", "unauthorized", "denied", "blocked"]
|
| 63 |
+
if sum(1 for s in ambiguous_signals if s in log_l) >= 2:
|
| 64 |
+
return FailureMode.AMBIGUOUS
|
| 65 |
+
|
| 66 |
+
# Rare vocabulary
|
| 67 |
+
rare_signals = ["sla", "oncall", "runbook", "pagerduty", "janitor", "gc ", "eviction"]
|
| 68 |
+
if any(s in log_l for s in rare_signals):
|
| 69 |
+
return FailureMode.RARE_VOCAB
|
| 70 |
+
|
| 71 |
+
return FailureMode.UNKNOWN
|
| 72 |
+
|
| 73 |
+
|
| 74 |
+
def _suggest_fix(mode: str) -> str:
|
| 75 |
+
fixes = {
|
| 76 |
+
FailureMode.RARE_VOCAB: "Add 5β10 training examples covering this vocabulary; or add regex rule.",
|
| 77 |
+
FailureMode.AMBIGUOUS: "Use multi-label or add a dedicated 'Ambiguous' class; review confidence threshold.",
|
| 78 |
+
FailureMode.LEGACY_FORMAT: "Route all legacy-format logs to LLM tier; add few-shot examples for LLM prompt.",
|
| 79 |
+
FailureMode.TRUNCATED: "Add input validation: reject/flag logs under 15 chars before classification.",
|
| 80 |
+
FailureMode.NUMERIC_ONLY: "Add regex patterns for structured numeric formats (job IDs, error codes, etc.).",
|
| 81 |
+
FailureMode.MULTI_EVENT: "Pre-process: split multi-event lines on ';' or ' | ' before classifying.",
|
| 82 |
+
FailureMode.UNKNOWN: "Manually review and add to training data or LLM few-shot examples.",
|
| 83 |
+
}
|
| 84 |
+
return fixes.get(mode, "Manual review required.")
|
| 85 |
+
|
| 86 |
+
|
| 87 |
+
# ββ Core analysis ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 88 |
+
def analyze_unclassified(df: pd.DataFrame, label_col: str = "predicted_label") -> None:
|
| 89 |
+
"""Full error analysis on a classified CSV DataFrame."""
|
| 90 |
+
|
| 91 |
+
unclassified = df[df[label_col] == "Unclassified"].copy()
|
| 92 |
+
total_unclassified = len(unclassified)
|
| 93 |
+
|
| 94 |
+
if total_unclassified == 0:
|
| 95 |
+
print("β
No unclassified logs found!")
|
| 96 |
+
return
|
| 97 |
+
|
| 98 |
+
print(f"\n{'='*70}")
|
| 99 |
+
print(f"π ERROR ANALYSIS: {total_unclassified} Unclassified Logs")
|
| 100 |
+
print(f"{'='*70}\n")
|
| 101 |
+
|
| 102 |
+
# ββ Step 1: Print all unclassified logs βββββββββββββββββββββββββββββββββ
|
| 103 |
+
log_col = "log_message" if "log_message" in df.columns else df.columns[-1]
|
| 104 |
+
print(f"{'#':>4} {'Log Message'}")
|
| 105 |
+
print("β" * 80)
|
| 106 |
+
for i, (_, row) in enumerate(unclassified.iterrows(), 1):
|
| 107 |
+
log = str(row.get(log_col, ""))
|
| 108 |
+
print(f"{i:>4}. {log[:120]}")
|
| 109 |
+
|
| 110 |
+
# ββ Step 2: Group by failure mode βββββββββββββββββββββββββββββββββββββββ
|
| 111 |
+
print(f"\n{'='*70}")
|
| 112 |
+
print("π GROUPING BY FAILURE MODE")
|
| 113 |
+
print("β" * 70)
|
| 114 |
+
|
| 115 |
+
groups: dict[str, list[str]] = defaultdict(list)
|
| 116 |
+
for _, row in unclassified.iterrows():
|
| 117 |
+
log = str(row.get(log_col, ""))
|
| 118 |
+
mode = _detect_failure_mode(log)
|
| 119 |
+
groups[mode].append(log)
|
| 120 |
+
|
| 121 |
+
for mode, logs in sorted(groups.items(), key=lambda x: -len(x[1])):
|
| 122 |
+
pct = len(logs) / total_unclassified * 100
|
| 123 |
+
print(f"\nπΉ {mode} β {len(logs)} logs ({pct:.1f}%)")
|
| 124 |
+
print(f" π‘ Fix: {_suggest_fix(mode)}")
|
| 125 |
+
print(f" Examples:")
|
| 126 |
+
for log in logs[:3]:
|
| 127 |
+
print(f" β’ {log[:110]}")
|
| 128 |
+
|
| 129 |
+
# ββ Step 3: Token frequency analysis ββββββββββββββββββββββββββββββββββββ
|
| 130 |
+
print(f"\n{'='*70}")
|
| 131 |
+
print("π COMMON TOKENS IN UNCLASSIFIED LOGS")
|
| 132 |
+
print("β" * 70)
|
| 133 |
+
|
| 134 |
+
STOPWORDS = {"the", "a", "an", "is", "in", "on", "for", "to", "of",
|
| 135 |
+
"and", "or", "by", "at", "with", "has", "was", "be",
|
| 136 |
+
"this", "that", "it", "not", "are", "from", "as"}
|
| 137 |
+
|
| 138 |
+
all_tokens: list[str] = []
|
| 139 |
+
for _, row in unclassified.iterrows():
|
| 140 |
+
log = str(row.get(log_col, "")).lower()
|
| 141 |
+
tokens = re.findall(r"[a-z]{3,}", log)
|
| 142 |
+
all_tokens.extend(t for t in tokens if t not in STOPWORDS)
|
| 143 |
+
|
| 144 |
+
counter = Counter(all_tokens)
|
| 145 |
+
print("Top 20 tokens in unclassified logs:")
|
| 146 |
+
for token, count in counter.most_common(20):
|
| 147 |
+
bar = "β" * min(count, 40)
|
| 148 |
+
print(f" {token:<20} {count:>4} {bar}")
|
| 149 |
+
|
| 150 |
+
# ββ Step 4: Length distribution βββββββββββββββββββββββββββββββββββββββββ
|
| 151 |
+
lengths = unclassified[log_col].apply(lambda x: len(str(x)))
|
| 152 |
+
print(f"\n{'='*70}")
|
| 153 |
+
print("π LOG LENGTH DISTRIBUTION (Unclassified)")
|
| 154 |
+
print(f" Mean: {lengths.mean():.1f} chars")
|
| 155 |
+
print(f" Median: {lengths.median():.1f} chars")
|
| 156 |
+
print(f" Min: {lengths.min()} chars")
|
| 157 |
+
print(f" Max: {lengths.max()} chars")
|
| 158 |
+
|
| 159 |
+
short = (lengths < 30).sum()
|
| 160 |
+
if short:
|
| 161 |
+
print(f" β οΈ {short} logs under 30 chars β likely truncated/noisy")
|
| 162 |
+
|
| 163 |
+
# ββ Step 5: Source breakdown βββββββββββββββββββββββββββββββββββββββββββββ
|
| 164 |
+
if "source" in df.columns:
|
| 165 |
+
print(f"\n{'='*70}")
|
| 166 |
+
print("π·οΈ UNCLASSIFIED BY SOURCE")
|
| 167 |
+
src_counts = unclassified["source"].value_counts()
|
| 168 |
+
for src, cnt in src_counts.items():
|
| 169 |
+
bar = "β" * min(cnt, 40)
|
| 170 |
+
print(f" {src:<22} {cnt:>4} {bar}")
|
| 171 |
+
|
| 172 |
+
# ββ Step 6: Actionable summary βββββββββββββββββββββββββββββββββββββββββββ
|
| 173 |
+
print(f"\n{'='*70}")
|
| 174 |
+
print("β
ACTIONABLE FIXES (Priority Order)")
|
| 175 |
+
print("β" * 70)
|
| 176 |
+
dominant_mode = max(groups.items(), key=lambda x: len(x[1]))[0] if groups else FailureMode.UNKNOWN
|
| 177 |
+
fixes = [
|
| 178 |
+
(1, "regex", "Add patterns for top unclassified tokens to processor_regex.py"),
|
| 179 |
+
(2, "training", "Add 10β20 examples per failure mode to training data"),
|
| 180 |
+
(3, "llm", "For LEGACY_FORMAT failures: add to LLM few-shot examples"),
|
| 181 |
+
(4, "preproc", "Pre-process: split multi-event logs, reject truncated logs"),
|
| 182 |
+
(5, "threshold","Tune BERT confidence threshold (currently 0.30 β try 0.40)"),
|
| 183 |
+
]
|
| 184 |
+
for priority, area, fix in fixes:
|
| 185 |
+
print(f" {priority}. [{area.upper():^10}] {fix}")
|
| 186 |
+
|
| 187 |
+
print(f"\nπ Dominant failure mode: '{dominant_mode}' ({len(groups.get(dominant_mode,[]))} logs)")
|
| 188 |
+
print(f" Start here: {_suggest_fix(dominant_mode)}\n")
|
| 189 |
+
|
| 190 |
+
|
| 191 |
+
# ββ Simulate 76 unclassified logs for demo ββββββββββββββββββββββββββββββββββββ
|
| 192 |
+
def _simulate_unclassified() -> pd.DataFrame:
|
| 193 |
+
"""Generate synthetic 'unclassified' logs that mimic real failure patterns."""
|
| 194 |
+
logs = [
|
| 195 |
+
# Legacy format / CRM
|
| 196 |
+
"Case escalation for ticket ID 9021 failed: agent inactive.",
|
| 197 |
+
"CRM module 'ReportGenerator' will be retired in v4.1.",
|
| 198 |
+
"Workflow for approval chain #4421 stalled at step 3.",
|
| 199 |
+
"SLA breach detected for case ID 7701 (P1, 4h breach).",
|
| 200 |
+
# Ambiguous
|
| 201 |
+
"Service auth-api failed and unauthorized access was logged.",
|
| 202 |
+
"Error: blocked request from 10.0.0.5 β reason unknown.",
|
| 203 |
+
# Truncated / noisy
|
| 204 |
+
"ERR",
|
| 205 |
+
"srv timeout",
|
| 206 |
+
"node-7",
|
| 207 |
+
# Numeric-heavy
|
| 208 |
+
"8821 9001 443 0 0 DROP IN=eth0 OUT= MAC=",
|
| 209 |
+
"16 0 0 1 2024-01-14 03:21:00.001",
|
| 210 |
+
# Multi-event
|
| 211 |
+
"Backup started; disk usage at 92%; health check failed | node-3",
|
| 212 |
+
# Rare vocab
|
| 213 |
+
"PagerDuty alert triggered for on-call rotation P1-incident.",
|
| 214 |
+
"GC eviction: 3.2GB heap compacted in 420ms.",
|
| 215 |
+
"Janitor job completed: 14,000 stale tokens purged.",
|
| 216 |
+
"Runbook auto-remediation triggered for alert ALT-9021.",
|
| 217 |
+
]
|
| 218 |
+
# Pad to ~76
|
| 219 |
+
padded = (logs * 5)[:76]
|
| 220 |
+
return pd.DataFrame({
|
| 221 |
+
"source": ["ModernCRM"] * 30 + ["LegacyCRM"] * 20 + ["AnalyticsEngine"] * 26,
|
| 222 |
+
"log_message": padded,
|
| 223 |
+
"predicted_label": ["Unclassified"] * 76,
|
| 224 |
+
})
|
| 225 |
+
|
| 226 |
+
|
| 227 |
+
# ββ CLI ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 228 |
+
def main():
|
| 229 |
+
parser = argparse.ArgumentParser(description="Analyze unclassified/misclassified logs")
|
| 230 |
+
parser.add_argument("--input", help="Path to classified CSV from classify_csv()")
|
| 231 |
+
parser.add_argument("--simulate", action="store_true",
|
| 232 |
+
help="Run with synthetic unclassified logs (no CSV needed)")
|
| 233 |
+
parser.add_argument("--label-col", default="predicted_label",
|
| 234 |
+
help="Column name that holds the predicted label")
|
| 235 |
+
args = parser.parse_args()
|
| 236 |
+
|
| 237 |
+
if args.simulate:
|
| 238 |
+
df = _simulate_unclassified()
|
| 239 |
+
print("π Running with SIMULATED 76 unclassified logsβ¦")
|
| 240 |
+
elif args.input:
|
| 241 |
+
df = pd.read_csv(args.input)
|
| 242 |
+
else:
|
| 243 |
+
parser.print_help()
|
| 244 |
+
sys.exit(1)
|
| 245 |
+
|
| 246 |
+
analyze_unclassified(df, label_col=args.label_col)
|
| 247 |
+
|
| 248 |
+
|
| 249 |
+
if __name__ == "__main__":
|
| 250 |
+
main()
|
HF/processor_bert.py
ADDED
|
@@ -0,0 +1,216 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
processor_bert_fast.py β ONNX Runtime powered BERT classifier
|
| 3 |
+
Speed: 82 logs/s β 2000+ logs/s
|
| 4 |
+
|
| 5 |
+
Kaise kaam karta hai:
|
| 6 |
+
1. ONNX Runtime: Normal PyTorch se 3-5x faster
|
| 7 |
+
2. Batch processing: 64 logs ek saath process
|
| 8 |
+
3. Pre-allocated buffers: Memory waste nahi
|
| 9 |
+
"""
|
| 10 |
+
from __future__ import annotations
|
| 11 |
+
import os
|
| 12 |
+
import numpy as np
|
| 13 |
+
import joblib
|
| 14 |
+
|
| 15 |
+
# ββ Check karo kaunsa method use karna hai ββββββββββββββββββ
|
| 16 |
+
_USE_ONNX = False
|
| 17 |
+
_embedding_model = None
|
| 18 |
+
_classifier = None
|
| 19 |
+
_ort_session = None
|
| 20 |
+
_ort_tokenizer = None
|
| 21 |
+
|
| 22 |
+
MODEL_PATH = os.path.join(os.path.dirname(__file__), 'models', 'log_classifier.joblib')
|
| 23 |
+
ONNX_DIR = os.path.join(os.path.dirname(__file__), 'models', 'onnx')
|
| 24 |
+
CONFIDENCE_THRESHOLD = 0.30
|
| 25 |
+
DEFAULT_BATCH = 64
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
def _load_models():
|
| 29 |
+
"""Lazily load models β pehli call pe hi load hoga, baar baar nahi."""
|
| 30 |
+
global _USE_ONNX, _embedding_model, _classifier, _ort_session, _ort_tokenizer
|
| 31 |
+
|
| 32 |
+
if _classifier is not None:
|
| 33 |
+
return # Already loaded
|
| 34 |
+
|
| 35 |
+
# ββ Classifier load karo βββββββββββββββββββββββββββββββ
|
| 36 |
+
if not os.path.exists(MODEL_PATH):
|
| 37 |
+
raise FileNotFoundError(
|
| 38 |
+
f'Model nahi mila: {MODEL_PATH}\n'
|
| 39 |
+
'Pehle Colab notebook run karo aur model download karo.'
|
| 40 |
+
)
|
| 41 |
+
_classifier = joblib.load(MODEL_PATH)
|
| 42 |
+
|
| 43 |
+
# ββ ONNX try karo (fast), fallback to PyTorch ββββββββββ
|
| 44 |
+
onnx_model_file = os.path.join(ONNX_DIR, 'model.onnx')
|
| 45 |
+
|
| 46 |
+
if os.path.exists(onnx_model_file):
|
| 47 |
+
try:
|
| 48 |
+
import onnxruntime as ort
|
| 49 |
+
from transformers import AutoTokenizer
|
| 50 |
+
|
| 51 |
+
# CPU optimized session options
|
| 52 |
+
sess_opts = ort.SessionOptions()
|
| 53 |
+
sess_opts.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL
|
| 54 |
+
sess_opts.intra_op_num_threads = os.cpu_count()
|
| 55 |
+
sess_opts.execution_mode = ort.ExecutionMode.ORT_SEQUENTIAL
|
| 56 |
+
|
| 57 |
+
_ort_session = ort.InferenceSession(
|
| 58 |
+
onnx_model_file,
|
| 59 |
+
sess_options=sess_opts,
|
| 60 |
+
providers=['CPUExecutionProvider']
|
| 61 |
+
)
|
| 62 |
+
_ort_tokenizer = AutoTokenizer.from_pretrained(ONNX_DIR)
|
| 63 |
+
_USE_ONNX = True
|
| 64 |
+
print('[BERT] β
ONNX Runtime loaded β FAST MODE')
|
| 65 |
+
|
| 66 |
+
except Exception as e:
|
| 67 |
+
print(f'[BERT] ONNX load failed ({e}), fallback to PyTorch')
|
| 68 |
+
_USE_ONNX = False
|
| 69 |
+
|
| 70 |
+
if not _USE_ONNX:
|
| 71 |
+
from sentence_transformers import SentenceTransformer
|
| 72 |
+
_embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
|
| 73 |
+
print('[BERT] β οΈ PyTorch mode (install ONNX for 3-5x speedup)')
|
| 74 |
+
|
| 75 |
+
|
| 76 |
+
def _embed_onnx(texts: list[str]) -> np.ndarray:
|
| 77 |
+
"""ONNX Runtime se embeddings generate karo β FAST."""
|
| 78 |
+
import torch
|
| 79 |
+
|
| 80 |
+
inputs = _ort_tokenizer(
|
| 81 |
+
texts,
|
| 82 |
+
padding=True,
|
| 83 |
+
truncation=True,
|
| 84 |
+
max_length=128,
|
| 85 |
+
return_tensors='np' # NumPy directly (faster than PyTorch tensors)
|
| 86 |
+
)
|
| 87 |
+
|
| 88 |
+
# ONNX session run
|
| 89 |
+
ort_inputs = {
|
| 90 |
+
'input_ids': inputs['input_ids'].astype(np.int64),
|
| 91 |
+
'attention_mask': inputs['attention_mask'].astype(np.int64),
|
| 92 |
+
}
|
| 93 |
+
if 'token_type_ids' in [i.name for i in _ort_session.get_inputs()]:
|
| 94 |
+
ort_inputs['token_type_ids'] = inputs.get(
|
| 95 |
+
'token_type_ids', np.zeros_like(inputs['input_ids'])
|
| 96 |
+
).astype(np.int64)
|
| 97 |
+
|
| 98 |
+
outputs = _ort_session.run(None, ort_inputs)
|
| 99 |
+
hidden = outputs[0] # (batch, seq_len, hidden)
|
| 100 |
+
|
| 101 |
+
# Mean pooling (attention mask weighted)
|
| 102 |
+
mask = inputs['attention_mask'][:, :, None].astype(np.float32)
|
| 103 |
+
summed = (hidden * mask).sum(axis=1)
|
| 104 |
+
counts = mask.sum(axis=1)
|
| 105 |
+
embeddings = summed / counts
|
| 106 |
+
|
| 107 |
+
# L2 normalize
|
| 108 |
+
norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
|
| 109 |
+
return embeddings / (norms + 1e-8)
|
| 110 |
+
|
| 111 |
+
|
| 112 |
+
def _embed_pytorch(texts: list[str]) -> np.ndarray:
|
| 113 |
+
"""PyTorch fallback."""
|
| 114 |
+
return _embedding_model.encode(
|
| 115 |
+
texts,
|
| 116 |
+
batch_size=DEFAULT_BATCH,
|
| 117 |
+
convert_to_numpy=True,
|
| 118 |
+
normalize_embeddings=True,
|
| 119 |
+
show_progress_bar=False
|
| 120 |
+
)
|
| 121 |
+
|
| 122 |
+
|
| 123 |
+
# ββ PUBLIC API ββββββββββββββββββββββββββββββββββββββββββββββ
|
| 124 |
+
|
| 125 |
+
def classify_with_bert(log_message: str) -> tuple[str, float]:
|
| 126 |
+
"""
|
| 127 |
+
Single log classify karo.
|
| 128 |
+
Returns: (label, confidence)
|
| 129 |
+
"""
|
| 130 |
+
_load_models()
|
| 131 |
+
results = classify_batch([log_message])
|
| 132 |
+
return results[0]
|
| 133 |
+
|
| 134 |
+
|
| 135 |
+
def classify_batch(log_messages: list[str]) -> list[tuple[str, float]]:
|
| 136 |
+
"""
|
| 137 |
+
Multiple logs ek saath classify karo β MUCH FASTER!
|
| 138 |
+
Returns: list of (label, confidence) tuples
|
| 139 |
+
|
| 140 |
+
Example:
|
| 141 |
+
results = classify_batch(['log1', 'log2', 'log3'])
|
| 142 |
+
for label, conf in results:
|
| 143 |
+
print(f'{label}: {conf:.1%}')
|
| 144 |
+
"""
|
| 145 |
+
_load_models()
|
| 146 |
+
|
| 147 |
+
if not log_messages:
|
| 148 |
+
return []
|
| 149 |
+
|
| 150 |
+
results = []
|
| 151 |
+
|
| 152 |
+
# Process in batches
|
| 153 |
+
for i in range(0, len(log_messages), DEFAULT_BATCH):
|
| 154 |
+
batch = log_messages[i:i + DEFAULT_BATCH]
|
| 155 |
+
|
| 156 |
+
# Generate embeddings
|
| 157 |
+
if _USE_ONNX:
|
| 158 |
+
embeddings = _embed_onnx(batch)
|
| 159 |
+
else:
|
| 160 |
+
embeddings = _embed_pytorch(batch)
|
| 161 |
+
|
| 162 |
+
# Classify
|
| 163 |
+
probs = _classifier.predict_proba(embeddings)
|
| 164 |
+
max_probs = probs.max(axis=1)
|
| 165 |
+
labels = _classifier.predict(embeddings)
|
| 166 |
+
|
| 167 |
+
for label, conf in zip(labels, max_probs):
|
| 168 |
+
if conf < CONFIDENCE_THRESHOLD:
|
| 169 |
+
results.append(('Unclassified', float(conf)))
|
| 170 |
+
else:
|
| 171 |
+
results.append((str(label), float(conf)))
|
| 172 |
+
|
| 173 |
+
return results
|
| 174 |
+
|
| 175 |
+
|
| 176 |
+
def get_classes() -> list[str]:
|
| 177 |
+
"""Classifier ke classes return karo."""
|
| 178 |
+
_load_models()
|
| 179 |
+
return list(_classifier.classes_)
|
| 180 |
+
|
| 181 |
+
|
| 182 |
+
def is_onnx_mode() -> bool:
|
| 183 |
+
"""Check karo ONNX use ho raha hai ya nahi."""
|
| 184 |
+
_load_models()
|
| 185 |
+
return _USE_ONNX
|
| 186 |
+
|
| 187 |
+
|
| 188 |
+
# ββ TEST ββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 189 |
+
if __name__ == '__main__':
|
| 190 |
+
import time
|
| 191 |
+
|
| 192 |
+
test_logs = [
|
| 193 |
+
'GET /v2/servers/detail HTTP/1.1 status: 404 len: 1583 time: 0.19',
|
| 194 |
+
'System crashed due to driver errors when restarting the server',
|
| 195 |
+
'Multiple login failures occurred on user 6454 account',
|
| 196 |
+
'Admin access escalation detected for user 9429',
|
| 197 |
+
'CPU usage at 98% for the last 10 minutes on node-7',
|
| 198 |
+
'Backup completed successfully.',
|
| 199 |
+
'User User123 logged in.',
|
| 200 |
+
'Data replication task for shard 14 did not complete',
|
| 201 |
+
'Hey bro chill ya!', # should be Unclassified
|
| 202 |
+
]
|
| 203 |
+
|
| 204 |
+
print('Single log test:')
|
| 205 |
+
for log in test_logs:
|
| 206 |
+
label, conf = classify_with_bert(log)
|
| 207 |
+
print(f' [{conf:.0%}] {label:25s} | {log[:60]}')
|
| 208 |
+
|
| 209 |
+
print(f'\nMode: {"ONNX π" if is_onnx_mode() else "PyTorch"}')
|
| 210 |
+
|
| 211 |
+
# Speed test
|
| 212 |
+
big_batch = test_logs * 100
|
| 213 |
+
t0 = time.perf_counter()
|
| 214 |
+
classify_batch(big_batch)
|
| 215 |
+
elapsed = time.perf_counter() - t0
|
| 216 |
+
print(f'\nSpeed: {len(big_batch)/elapsed:.0f} logs/s ({elapsed*1000/len(big_batch):.1f}ms/log)')
|
HF/processor_llm.py
ADDED
|
@@ -0,0 +1,192 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
processor_llm.py β Tier 3: LLM-based Classifier
|
| 3 |
+
|
| 4 |
+
Used for:
|
| 5 |
+
- LegacyCRM logs (Workflow Error, Deprecation Warning)
|
| 6 |
+
- BERT fallback when confidence < threshold
|
| 7 |
+
|
| 8 |
+
Production hardening in V3:
|
| 9 |
+
- Timeout (configurable, default 5s)
|
| 10 |
+
- Retry with exponential backoff (max 2 retries)
|
| 11 |
+
- Explicit failure modes: returns "Unclassified" on all error paths
|
| 12 |
+
- Caching for repeated log patterns (hash-based, in-memory)
|
| 13 |
+
- Token budget enforcement (max_tokens=15)
|
| 14 |
+
"""
|
| 15 |
+
from __future__ import annotations
|
| 16 |
+
import os
|
| 17 |
+
import re
|
| 18 |
+
import time
|
| 19 |
+
import hashlib
|
| 20 |
+
import logging
|
| 21 |
+
from functools import lru_cache
|
| 22 |
+
from typing import Optional
|
| 23 |
+
|
| 24 |
+
logger = logging.getLogger(__name__)
|
| 25 |
+
|
| 26 |
+
# ββ Config βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 27 |
+
HF_TOKEN = os.getenv("HF_TOKEN")
|
| 28 |
+
LLM_MODEL = "mistralai/Mistral-7B-Instruct-v0.3"
|
| 29 |
+
|
| 30 |
+
VALID_CATEGORIES = ["Workflow Error", "Deprecation Warning"]
|
| 31 |
+
|
| 32 |
+
# Retry / timeout config
|
| 33 |
+
MAX_RETRIES = 2
|
| 34 |
+
RETRY_DELAY_SEC = 1.0 # doubles on each retry (exponential backoff)
|
| 35 |
+
REQUEST_TIMEOUT = 5 # seconds β fail fast, do not hang pipeline
|
| 36 |
+
|
| 37 |
+
# In-memory cache to avoid redundant LLM calls for repeated logs
|
| 38 |
+
_RESPONSE_CACHE: dict[str, str] = {}
|
| 39 |
+
MAX_CACHE_SIZE = 1000 # evict oldest when full (simple FIFO)
|
| 40 |
+
|
| 41 |
+
SYSTEM_PROMPT = (
|
| 42 |
+
"You are an enterprise log classifier. "
|
| 43 |
+
"Classify log messages into exactly one category. "
|
| 44 |
+
"Return ONLY the category name β no explanation, no punctuation."
|
| 45 |
+
)
|
| 46 |
+
|
| 47 |
+
FEW_SHOT_EXAMPLES = [
|
| 48 |
+
{
|
| 49 |
+
"log": "Case escalation for ticket ID 7324 failed because the assigned support agent is no longer active.",
|
| 50 |
+
"label": "Workflow Error",
|
| 51 |
+
},
|
| 52 |
+
{
|
| 53 |
+
"log": "The 'BulkEmailSender' feature is no longer supported. Use 'EmailCampaignManager' instead.",
|
| 54 |
+
"label": "Deprecation Warning",
|
| 55 |
+
},
|
| 56 |
+
{
|
| 57 |
+
"log": "Invoice generation aborted for order ID 8910 due to invalid tax calculation module.",
|
| 58 |
+
"label": "Workflow Error",
|
| 59 |
+
},
|
| 60 |
+
]
|
| 61 |
+
|
| 62 |
+
|
| 63 |
+
# ββ Cache helpers ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 64 |
+
def _cache_key(log_msg: str) -> str:
|
| 65 |
+
return hashlib.md5(log_msg.strip().encode()).hexdigest()
|
| 66 |
+
|
| 67 |
+
|
| 68 |
+
def _cache_get(log_msg: str) -> Optional[str]:
|
| 69 |
+
return _RESPONSE_CACHE.get(_cache_key(log_msg))
|
| 70 |
+
|
| 71 |
+
|
| 72 |
+
def _cache_set(log_msg: str, label: str) -> None:
|
| 73 |
+
key = _cache_key(log_msg)
|
| 74 |
+
if len(_RESPONSE_CACHE) >= MAX_CACHE_SIZE:
|
| 75 |
+
# Evict oldest (first inserted) key
|
| 76 |
+
oldest = next(iter(_RESPONSE_CACHE))
|
| 77 |
+
del _RESPONSE_CACHE[oldest]
|
| 78 |
+
_RESPONSE_CACHE[key] = label
|
| 79 |
+
|
| 80 |
+
|
| 81 |
+
def get_cache_stats() -> dict:
|
| 82 |
+
return {"size": len(_RESPONSE_CACHE), "max_size": MAX_CACHE_SIZE}
|
| 83 |
+
|
| 84 |
+
|
| 85 |
+
# ββ Prompt builder βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 86 |
+
def _build_messages(log_msg: str) -> list[dict]:
|
| 87 |
+
categories_str = ", ".join(f'"{c}"' for c in VALID_CATEGORIES)
|
| 88 |
+
user_content = (
|
| 89 |
+
f'Classify the following log into one of these categories: {categories_str}.\n'
|
| 90 |
+
'If none fits, return "Unclassified".\n\n'
|
| 91 |
+
)
|
| 92 |
+
for ex in FEW_SHOT_EXAMPLES:
|
| 93 |
+
user_content += f'Log: {ex["log"]}\nCategory: {ex["label"]}\n\n'
|
| 94 |
+
user_content += f"Log: {log_msg}\nCategory:"
|
| 95 |
+
|
| 96 |
+
return [
|
| 97 |
+
{"role": "system", "content": SYSTEM_PROMPT},
|
| 98 |
+
{"role": "user", "content": user_content},
|
| 99 |
+
]
|
| 100 |
+
|
| 101 |
+
|
| 102 |
+
# ββ Normalize raw LLM output βββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 103 |
+
def _normalize(raw: str) -> str:
|
| 104 |
+
"""Map raw LLM output to a valid category or 'Unclassified'."""
|
| 105 |
+
raw = raw.strip().strip('"').strip("'")
|
| 106 |
+
for cat in VALID_CATEGORIES:
|
| 107 |
+
if cat.lower() in raw.lower():
|
| 108 |
+
return cat
|
| 109 |
+
return "Unclassified"
|
| 110 |
+
|
| 111 |
+
|
| 112 |
+
# ββ Main classify function ββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 113 |
+
def classify_with_llm(log_msg: str) -> str:
|
| 114 |
+
"""
|
| 115 |
+
Tier 3 LLM classifier with:
|
| 116 |
+
- In-memory cache (avoids duplicate API calls)
|
| 117 |
+
- Timeout (REQUEST_TIMEOUT seconds)
|
| 118 |
+
- Retry with exponential backoff (MAX_RETRIES attempts)
|
| 119 |
+
- Explicit fallback to "Unclassified" on all error paths
|
| 120 |
+
|
| 121 |
+
Latency: 500β2000ms on cache miss; ~0ms on cache hit.
|
| 122 |
+
"""
|
| 123 |
+
# ββ Cache hit ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 124 |
+
cached = _cache_get(log_msg)
|
| 125 |
+
if cached is not None:
|
| 126 |
+
logger.debug(f"[LLM] Cache hit for: {log_msg[:60]}")
|
| 127 |
+
return cached
|
| 128 |
+
|
| 129 |
+
# ββ Inference with retry βββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 130 |
+
if not HF_TOKEN:
|
| 131 |
+
logger.warning("[LLM] HF_TOKEN not set β returning Unclassified")
|
| 132 |
+
return "Unclassified"
|
| 133 |
+
|
| 134 |
+
from huggingface_hub import InferenceClient
|
| 135 |
+
|
| 136 |
+
client = InferenceClient(token=HF_TOKEN, timeout=REQUEST_TIMEOUT)
|
| 137 |
+
delay = RETRY_DELAY_SEC
|
| 138 |
+
last_err: Optional[Exception] = None
|
| 139 |
+
|
| 140 |
+
for attempt in range(1, MAX_RETRIES + 2): # +2: initial + MAX_RETRIES
|
| 141 |
+
try:
|
| 142 |
+
response = client.chat.completions.create(
|
| 143 |
+
model=LLM_MODEL,
|
| 144 |
+
messages=_build_messages(log_msg),
|
| 145 |
+
max_tokens=15,
|
| 146 |
+
temperature=0.1,
|
| 147 |
+
)
|
| 148 |
+
raw = response.choices[0].message.content
|
| 149 |
+
label = _normalize(raw)
|
| 150 |
+
|
| 151 |
+
_cache_set(log_msg, label)
|
| 152 |
+
logger.debug(f"[LLM] Attempt {attempt}: '{raw.strip()}' β '{label}'")
|
| 153 |
+
return label
|
| 154 |
+
|
| 155 |
+
except Exception as e:
|
| 156 |
+
last_err = e
|
| 157 |
+
if attempt <= MAX_RETRIES:
|
| 158 |
+
logger.warning(f"[LLM] Attempt {attempt} failed ({e}), retrying in {delay:.1f}sβ¦")
|
| 159 |
+
time.sleep(delay)
|
| 160 |
+
delay *= 2 # exponential backoff
|
| 161 |
+
else:
|
| 162 |
+
logger.error(f"[LLM] All {MAX_RETRIES + 1} attempts failed. Last error: {e}")
|
| 163 |
+
|
| 164 |
+
return "Unclassified"
|
| 165 |
+
|
| 166 |
+
|
| 167 |
+
# ββ Batch classify (serial β LLM is already rate-limited) ββββββββββββββββββββ
|
| 168 |
+
def classify_batch_llm(log_msgs: list[str]) -> list[str]:
|
| 169 |
+
"""Classify multiple logs through LLM. Each call is sequential to respect rate limits."""
|
| 170 |
+
return [classify_with_llm(msg) for msg in log_msgs]
|
| 171 |
+
|
| 172 |
+
|
| 173 |
+
# ββ CLI test βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 174 |
+
if __name__ == "__main__":
|
| 175 |
+
logging.basicConfig(level=logging.INFO)
|
| 176 |
+
|
| 177 |
+
test_logs = [
|
| 178 |
+
"Case escalation for ticket ID 7324 failed because the assigned support agent is no longer active.",
|
| 179 |
+
"The 'ReportGenerator' module will be retired in version 4.0. Migrate to 'AdvancedAnalyticsSuite'.",
|
| 180 |
+
"System reboot initiated by user 12345.", # should be Unclassified
|
| 181 |
+
]
|
| 182 |
+
for log in test_logs:
|
| 183 |
+
result = classify_with_llm(log)
|
| 184 |
+
print(f"{result:25s} | {log[:80]}")
|
| 185 |
+
|
| 186 |
+
# Cache hit test
|
| 187 |
+
print("\nββ Cache hit test ββ")
|
| 188 |
+
t0 = time.perf_counter()
|
| 189 |
+
classify_with_llm(test_logs[0])
|
| 190 |
+
t1 = time.perf_counter()
|
| 191 |
+
print(f"Cache hit latency: {(t1-t0)*1000:.2f}ms")
|
| 192 |
+
print(f"Cache stats: {get_cache_stats()}")
|
HF/processor_regex.py
ADDED
|
@@ -0,0 +1,220 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
processor_regex.py β Tier 1: Rule-based Classifier
|
| 3 |
+
|
| 4 |
+
Target coverage: 40%+ (up from 15%)
|
| 5 |
+
Latency: sub-millisecond per log
|
| 6 |
+
|
| 7 |
+
New pattern groups added:
|
| 8 |
+
- HTTP request/response logs (was completely missing!)
|
| 9 |
+
- Auth / credential events (login failures, MFA, lockouts)
|
| 10 |
+
- System/infra events (disk, CPU, memory, cron)
|
| 11 |
+
- Network / firewall events (IP block, port scan)
|
| 12 |
+
- Structured error codes (ERROR, CRITICAL prefix logs)
|
| 13 |
+
"""
|
| 14 |
+
from __future__ import annotations
|
| 15 |
+
import re
|
| 16 |
+
import time
|
| 17 |
+
from typing import Optional
|
| 18 |
+
|
| 19 |
+
# ---------------------------------------------------------------------------
|
| 20 |
+
# Pattern registry: (compiled_pattern, label)
|
| 21 |
+
# Order matters β more specific patterns FIRST to avoid mis-labeling.
|
| 22 |
+
# ---------------------------------------------------------------------------
|
| 23 |
+
_RAW_PATTERNS: list[tuple[str, str]] = [
|
| 24 |
+
|
| 25 |
+
# ββ HTTP Status βββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 26 |
+
# Covers: GET/POST/PUT/DELETE/PATCH + status code in request line
|
| 27 |
+
(r"\b(GET|POST|PUT|DELETE|PATCH|HEAD|OPTIONS)\s+\S+\s+HTTP/\d", "HTTP Status"),
|
| 28 |
+
# Nova / OpenStack style
|
| 29 |
+
(r"nova\.\S+\s+(GET|POST|PUT|DELETE)\s+\S+\s+HTTP/\d", "HTTP Status"),
|
| 30 |
+
# Status code only style: "returned HTTP 200" or "status: 404"
|
| 31 |
+
(r"\bstatus[:\s]+\d{3}\b", "HTTP Status"),
|
| 32 |
+
(r"\breturned\s+HTTP\s+\d{3}\b", "HTTP Status"),
|
| 33 |
+
(r"\bHTTP\s+status\s+code\s*[:-]?\s*\d{3}\b", "HTTP Status"),
|
| 34 |
+
# API response style
|
| 35 |
+
(r"\bAPI\s+(call|request)\s+\S+\s+completed\s+with\s+status\s+\d{3}", "HTTP Status"),
|
| 36 |
+
(r"\bEndpoint\s+\S+\s+responded\s+with\s+code\s+\d{3}", "HTTP Status"),
|
| 37 |
+
|
| 38 |
+
# ββ Security Alert ββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 39 |
+
# Brute force / login failures
|
| 40 |
+
(r"(multiple\s+)?(bad\s+|failed?\s+)?login\s+(failure|attempt|failures)", "Security Alert"),
|
| 41 |
+
(r"brute[\s_-]force\s+(login|attack|attempt)", "Security Alert"),
|
| 42 |
+
# Unauthorized access
|
| 43 |
+
(r"unauthorized\s+(access|admin|privilege|attempt)", "Security Alert"),
|
| 44 |
+
(r"access\s+denied\s+(for|to)\s+(user|ip|host)", "Security Alert"),
|
| 45 |
+
# Privilege escalation
|
| 46 |
+
(r"(admin\s+)?access\s+escalation\s+detected", "Security Alert"),
|
| 47 |
+
(r"privilege\s+(elev|escalat)", "Security Alert"),
|
| 48 |
+
# IP blocking / suspicious traffic
|
| 49 |
+
(r"IP\s+\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}\s+blocked", "Security Alert"),
|
| 50 |
+
(r"(suspicious|anomalous)\s+(login|traffic|activity|request)", "Security Alert"),
|
| 51 |
+
(r"potential\s+(DDoS|attack|breach|intrusion)", "Security Alert"),
|
| 52 |
+
(r"security\s+breach\s+suspected", "Security Alert"),
|
| 53 |
+
(r"(API\s+security\s+breach|bypass\s+API\s+security)", "Security Alert"),
|
| 54 |
+
(r"port\s+scan\s+(detected|attempt)", "Security Alert"),
|
| 55 |
+
|
| 56 |
+
# ββ User Action βββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 57 |
+
(r"User\s+\w+\d*\s+logged\s+(in|out)", "User Action"),
|
| 58 |
+
(r"Account\s+(with\s+)?ID\s+\S+\s+created\s+by", "User Action"),
|
| 59 |
+
(r"User\s+\w+\d*\s+(updated\s+profile|changed\s+password|enabled\s+two|downloaded|exported)", "User Action"),
|
| 60 |
+
(r"(New\s+user|user\s+\w+\d*)\s+registered", "User Action"),
|
| 61 |
+
(r"Account\s+\S+\s+deleted\s+by\s+(administrator|admin)", "User Action"),
|
| 62 |
+
(r"User\s+\w+\d*\s+(tried|attempted)", "User Action"),
|
| 63 |
+
|
| 64 |
+
# ββ System Notification βββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 65 |
+
# Backup events
|
| 66 |
+
(r"Backup\s+(started|ended|completed\s+successfully|failed|aborted)", "System Notification"),
|
| 67 |
+
(r"System\s+updated\s+to\s+version", "System Notification"),
|
| 68 |
+
(r"File\s+\S+\s+uploaded\s+successfully\s+by\s+user", "System Notification"),
|
| 69 |
+
(r"Disk\s+cleanup\s+completed\s+successfully", "System Notification"),
|
| 70 |
+
(r"System\s+reboot\s+initiated\s+by\s+user", "System Notification"),
|
| 71 |
+
(r"Scheduled\s+maintenance\s+(started|completed)", "System Notification"),
|
| 72 |
+
(r"Service\s+\w+\s+restarted\s+successfully", "System Notification"),
|
| 73 |
+
# NEW: cache, cron, health check, cert, log rotation
|
| 74 |
+
(r"Cache\s+cleared\s+successfully", "System Notification"),
|
| 75 |
+
(r"Log\s+rotation\s+completed", "System Notification"),
|
| 76 |
+
(r"Health\s+check\s+(passed|failed)\s+for\s+service", "System Notification"),
|
| 77 |
+
(r"Certificate\s+(renewed|expired|revoked)\s+successfully", "System Notification"),
|
| 78 |
+
(r"Cron\s+job\s+\S+\s+(executed|failed|completed)\s+successfully", "System Notification"),
|
| 79 |
+
(r"(Disk|Storage)\s+(usage|space)\s+(at|reached|exceeded)\s+\d+%", "System Notification"),
|
| 80 |
+
(r"CPU\s+usage\s+at\s+\d+%", "System Notification"),
|
| 81 |
+
(r"Memory\s+(usage|limit)\s+(at|reached|exceeded)\s+\d+%", "System Notification"),
|
| 82 |
+
# Deployment / config
|
| 83 |
+
(r"Deployment\s+(of|for)\s+\S+\s+(completed|failed|started)", "System Notification"),
|
| 84 |
+
(r"Configuration\s+(reloaded|updated|applied)\s+successfully", "System Notification"),
|
| 85 |
+
|
| 86 |
+
# ββ Error βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 87 |
+
(r"\bERROR\b.*\b(exception|failed|failure|crash|timeout|unavailable)\b", "Error"),
|
| 88 |
+
(r"System\s+crashed\s+due\s+to", "Error"),
|
| 89 |
+
(r"(connection|request|task|job)\s+(timed?\s*out|timeout)", "Error"),
|
| 90 |
+
(r"service\s+\S+\s+(is\s+down|unavailable|unreachable)", "Error"),
|
| 91 |
+
(r"database\s+connection\s+(failed|refused|lost|dropped)", "Error"),
|
| 92 |
+
(r"disk\s+(I/O\s+)?failure", "Error"),
|
| 93 |
+
(r"driver\s+error(s)?\s+(when|during|on)", "Error"),
|
| 94 |
+
(r"(replication|sync)\s+task\s+(did\s+not\s+complete|failed)", "Error"),
|
| 95 |
+
(r"null\s+pointer|segmentation\s+fault|stack\s+overflow", "Error"),
|
| 96 |
+
|
| 97 |
+
# ββ Critical Error ββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 98 |
+
(r"\bCRITICAL\b", "Critical Error"),
|
| 99 |
+
(r"(FATAL|PANIC)\b", "Critical Error"),
|
| 100 |
+
(r"(data\s+loss|data\s+corruption)\s+(detected|occurred)", "Critical Error"),
|
| 101 |
+
(r"(cluster|node|shard)\s+(failure|crashed|went\s+down)", "Critical Error"),
|
| 102 |
+
(r"(catastrophic|unrecoverable)\s+(failure|error)", "Critical Error"),
|
| 103 |
+
(r"kernel\s+panic", "Critical Error"),
|
| 104 |
+
(r"out[\s-]of[\s-](memory|disk)\s+(error|killed|OOM)", "Critical Error"),
|
| 105 |
+
]
|
| 106 |
+
|
| 107 |
+
# Pre-compile all patterns at import time (not per-call)
|
| 108 |
+
REGEX_PATTERNS: list[tuple[re.Pattern, str]] = [
|
| 109 |
+
(re.compile(pat, re.IGNORECASE), label)
|
| 110 |
+
for pat, label in _RAW_PATTERNS
|
| 111 |
+
]
|
| 112 |
+
|
| 113 |
+
|
| 114 |
+
def classify_with_regex(log_message: str) -> Optional[str]:
|
| 115 |
+
"""
|
| 116 |
+
Tier 1: Rule-based classifier.
|
| 117 |
+
Returns category label, or None if no pattern matches.
|
| 118 |
+
Latency: sub-millisecond (patterns pre-compiled at import).
|
| 119 |
+
"""
|
| 120 |
+
for pattern, label in REGEX_PATTERNS:
|
| 121 |
+
if pattern.search(log_message):
|
| 122 |
+
return label
|
| 123 |
+
return None
|
| 124 |
+
|
| 125 |
+
|
| 126 |
+
def get_regex_coverage(log_messages: list[str]) -> dict:
|
| 127 |
+
"""Measure regex tier coverage and per-label breakdown."""
|
| 128 |
+
label_counts: dict[str, int] = {}
|
| 129 |
+
missed = 0
|
| 130 |
+
|
| 131 |
+
for msg in log_messages:
|
| 132 |
+
label = classify_with_regex(msg)
|
| 133 |
+
if label:
|
| 134 |
+
label_counts[label] = label_counts.get(label, 0) + 1
|
| 135 |
+
else:
|
| 136 |
+
missed += 1
|
| 137 |
+
|
| 138 |
+
total = len(log_messages)
|
| 139 |
+
matched = total - missed
|
| 140 |
+
|
| 141 |
+
return {
|
| 142 |
+
"total": total,
|
| 143 |
+
"matched": matched,
|
| 144 |
+
"missed": missed,
|
| 145 |
+
"coverage_pct": round(matched / total * 100, 2) if total else 0.0,
|
| 146 |
+
"label_breakdown": label_counts,
|
| 147 |
+
}
|
| 148 |
+
|
| 149 |
+
|
| 150 |
+
def benchmark_regex(log_messages: list[str], runs: int = 3) -> dict:
|
| 151 |
+
"""Measure regex tier latency (p50 / p95 / p99) over multiple runs."""
|
| 152 |
+
import statistics
|
| 153 |
+
per_log_ms: list[float] = []
|
| 154 |
+
|
| 155 |
+
for _ in range(runs):
|
| 156 |
+
for msg in log_messages:
|
| 157 |
+
t0 = time.perf_counter()
|
| 158 |
+
classify_with_regex(msg)
|
| 159 |
+
per_log_ms.append((time.perf_counter() - t0) * 1000)
|
| 160 |
+
|
| 161 |
+
per_log_ms.sort()
|
| 162 |
+
return {
|
| 163 |
+
"p50_ms": round(statistics.median(per_log_ms), 4),
|
| 164 |
+
"p95_ms": round(per_log_ms[int(len(per_log_ms) * 0.95)], 4),
|
| 165 |
+
"p99_ms": round(per_log_ms[int(len(per_log_ms) * 0.99)], 4),
|
| 166 |
+
"mean_ms": round(statistics.mean(per_log_ms), 4),
|
| 167 |
+
}
|
| 168 |
+
|
| 169 |
+
|
| 170 |
+
# ββ CLI self-test ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 171 |
+
if __name__ == "__main__":
|
| 172 |
+
test_cases: list[tuple[str, str]] = [
|
| 173 |
+
# HTTP
|
| 174 |
+
("GET /api/v2/resource HTTP/1.1 status: 200 len: 1583 time: 0.19", "HTTP Status"),
|
| 175 |
+
("POST /v1/users HTTP/1.1 status: 201 len: 42 time: 0.05", "HTTP Status"),
|
| 176 |
+
("nova.osapi_compute.wsgi.server GET /v2/servers/detail HTTP/1.1 status: 404", "HTTP Status"),
|
| 177 |
+
# Security
|
| 178 |
+
("Multiple login failures occurred on user 6454 account", "Security Alert"),
|
| 179 |
+
("IP 192.168.133.114 blocked due to potential attack", "Security Alert"),
|
| 180 |
+
("Brute force login attempt from 10.0.0.5 detected", "Security Alert"),
|
| 181 |
+
("Admin access escalation detected for user 9429", "Security Alert"),
|
| 182 |
+
# User Action
|
| 183 |
+
("User User12345 logged in.", "User Action"),
|
| 184 |
+
("Account with ID 456 created by Admin.", "User Action"),
|
| 185 |
+
# System Notification
|
| 186 |
+
("Backup completed successfully.", "System Notification"),
|
| 187 |
+
("CPU usage at 98% for the last 10 minutes on node-7", "System Notification"),
|
| 188 |
+
("Health check passed for service payments-api", "System Notification"),
|
| 189 |
+
# Error
|
| 190 |
+
("System crashed due to disk I/O failure on node-3", "Error"),
|
| 191 |
+
("Database connection failed after 3 retries", "Error"),
|
| 192 |
+
# Critical
|
| 193 |
+
("CRITICAL: data corruption detected on shard-14", "Critical Error"),
|
| 194 |
+
("kernel panic: not syncing: VFS: unable to mount root fs", "Critical Error"),
|
| 195 |
+
# Should be None (unmatched)
|
| 196 |
+
("The 'BulkEmailSender' feature will be deprecated in v5.0.", None),
|
| 197 |
+
("Case escalation for ticket 7324 failed.", None),
|
| 198 |
+
]
|
| 199 |
+
|
| 200 |
+
correct = 0
|
| 201 |
+
print(f"{'Expected':<22} {'Got':<22} {'β/β'} | Log")
|
| 202 |
+
print("β" * 100)
|
| 203 |
+
for log, expected in test_cases:
|
| 204 |
+
got = classify_with_regex(log)
|
| 205 |
+
ok = got == expected
|
| 206 |
+
correct += ok
|
| 207 |
+
icon = "β" if ok else "β"
|
| 208 |
+
print(f"{str(expected):<22} {str(got):<22} {icon} | {log[:55]}")
|
| 209 |
+
|
| 210 |
+
print(f"\n{correct}/{len(test_cases)} correct")
|
| 211 |
+
|
| 212 |
+
# Coverage demo
|
| 213 |
+
all_logs = [log for log, _ in test_cases]
|
| 214 |
+
cov = get_regex_coverage(all_logs)
|
| 215 |
+
print(f"\nCoverage: {cov['coverage_pct']}% ({cov['matched']}/{cov['total']} matched)")
|
| 216 |
+
print("Label breakdown:", cov["label_breakdown"])
|
| 217 |
+
|
| 218 |
+
# Latency benchmark
|
| 219 |
+
lat = benchmark_regex(all_logs * 100)
|
| 220 |
+
print(f"\nLatency (p50/p95/p99): {lat['p50_ms']}ms / {lat['p95_ms']}ms / {lat['p99_ms']}ms")
|
HF/requirements.txt
ADDED
|
@@ -0,0 +1,30 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Core
|
| 2 |
+
gradio>=4.0.0
|
| 3 |
+
pandas>=2.0.0
|
| 4 |
+
numpy>=1.24.0
|
| 5 |
+
joblib>=1.3.0
|
| 6 |
+
scikit-learn>=1.3.0
|
| 7 |
+
|
| 8 |
+
# Embedding + BERT
|
| 9 |
+
sentence-transformers>=2.7.0
|
| 10 |
+
transformers>=4.38.0
|
| 11 |
+
|
| 12 |
+
# ONNX (optional, 3-5x speedup)
|
| 13 |
+
onnxruntime>=1.17.0
|
| 14 |
+
optimum[onnxruntime]>=1.16.0
|
| 15 |
+
|
| 16 |
+
# LLM
|
| 17 |
+
huggingface-hub>=0.21.0
|
| 18 |
+
|
| 19 |
+
# FastAPI (production API)
|
| 20 |
+
fastapi>=0.110.0
|
| 21 |
+
uvicorn[standard]>=0.29.0
|
| 22 |
+
pydantic>=2.0.0
|
| 23 |
+
|
| 24 |
+
# Observability
|
| 25 |
+
psutil>=5.9.0
|
| 26 |
+
|
| 27 |
+
# Testing
|
| 28 |
+
pytest>=8.0.0
|
| 29 |
+
pytest-asyncio>=0.23.0
|
| 30 |
+
httpx>=0.27.0 # for FastAPI test client
|