Spaces:

NOT-OMEGA
/

LogAI-Engine

Sleeping

App Files Files Community

NOT-OMEGA committed on Apr 14

Commit

b5d6b20

verified ·

1 Parent(s): 238ab41

Upload 13 files

Browse files

Files changed (13) hide show

hf_space/Dockerfile +20 -0
hf_space/app_gradio.py +542 -0
hf_space/classify.py +198 -0
hf_space/models/log_classifier.joblib +3 -0
hf_space/onnx_model/config.json +24 -0
hf_space/onnx_model/special_tokens_map.json +37 -0
hf_space/onnx_model/tokenizer.json +0 -0
hf_space/onnx_model/tokenizer_config.json +65 -0
hf_space/onnx_model/vocab.txt +0 -0
hf_space/processor_bert.py +216 -0
hf_space/processor_llm.py +192 -0
hf_space/processor_regex.py +220 -0
hf_space/requirements.txt +25 -0

hf_space/Dockerfile ADDED Viewed

	@@ -0,0 +1,20 @@

+# Slim Python 3.11 base image.
+FROM python:3.11-slim
+# Install curl, then delete the apt package lists so they are not kept in the image layer.
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    curl \
+    && rm -rf /var/lib/apt/lists/*
+WORKDIR /app
+# Copy requirements.txt on its own first so the dependency layer can be reused when only source files change.
+COPY requirements.txt .
+RUN pip install --no-cache-dir -r requirements.txt
+COPY . .
+# Create an unprivileged user (UID 1000), give it ownership of /app, and run the app as that user.
+RUN useradd -m -u 1000 appuser \
+    && chown -R appuser:appuser /app
+USER appuser
+# Same port that app_gradio.py passes to demo.launch().
+EXPOSE 7860
+CMD ["python", "app_gradio.py"]

hf_space/app_gradio.py ADDED Viewed

	@@ -0,0 +1,542 @@

+"""
+Log Classification System — HuggingFace Spaces
+Ultra-modern 3D UI with custom CSS
+"""
+from __future__ import annotations
+import io
+import time
+import pandas as pd
+import gradio as gr
+from classify import classify_log, classify_csv
+# Source systems offered in the "SOURCE SYSTEM" dropdown.
+SOURCES = [
+    "ModernCRM", "ModernHR", "BillingSystem",
+    "AnalyticsEngine", "ThirdPartyAPI", "LegacyCRM",
+]
+# Emoji marker shown next to each tier name; the keys match the "tier" values produced by classify.py.
+TIER_COLORS = {
+    "Regex":          "🟢",
+    "BERT":           "🔵",
+    "LLM":            "🟡",
+    "LLM (fallback)": "🟠",
+}
+# [source, log message] rows passed to gr.Examples on the single-log tab.
+EXAMPLE_LOGS = [
+    ["ModernCRM",       "User User12345 logged in."],
+    ["ModernHR",        "Multiple login failures occurred on user 6454 account"],
+    ["BillingSystem",   "GET /v2/servers/detail HTTP/1.1 status: 200 len: 1583 time: 0.19"],
+    ["AnalyticsEngine", "System crashed due to disk I/O failure on node-3"],
+    ["LegacyCRM",       "Case escalation for ticket ID 7324 failed — support agent is no longer active."],
+    ["LegacyCRM",       "The 'BulkEmailSender' feature will be deprecated in v5.0. Use 'EmailCampaignManager'."],
+]
+# ── Custom CSS — 3D Modern Dark Theme ──────────────────────────────────────
+CUSTOM_CSS = """
+@import url('https://fonts.googleapis.com/css2?family=Rajdhani:wght@400;500;600;700&family=Share+Tech+Mono&family=Exo+2:wght@300;400;600;700&display=swap');
+:root {
+    --bg-primary: #050810;
+    --bg-secondary: #0a0f1e;
+    --bg-card: #0d1425;
+    --bg-card-hover: #111a30;
+    --accent-cyan: #00d4ff;
+    --accent-blue: #0066ff;
+    --accent-purple: #7c3aed;
+    --accent-green: #00ff88;
+    --accent-orange: #ff6b00;
+    --text-primary: #e2e8f0;
+    --text-secondary: #94a3b8;
+    --text-muted: #475569;
+    --border-glow: rgba(0, 212, 255, 0.3);
+    --shadow-3d: 0 20px 60px rgba(0, 0, 0, 0.8), 0 0 40px rgba(0, 102, 255, 0.15);
+    --glow-cyan: 0 0 20px rgba(0, 212, 255, 0.4), 0 0 40px rgba(0, 212, 255, 0.2);
+    --glow-blue: 0 0 20px rgba(0, 102, 255, 0.4);
+}
+/* ── Base ── */
+body, .gradio-container {
+    background: var(--bg-primary) !important;
+    font-family: 'Exo 2', sans-serif !important;
+    color: var(--text-primary) !important;
+}
+.gradio-container {
+    background:
+        radial-gradient(ellipse at 20% 20%, rgba(0, 102, 255, 0.08) 0%, transparent 50%),
+        radial-gradient(ellipse at 80% 80%, rgba(124, 58, 237, 0.08) 0%, transparent 50%),
+        radial-gradient(ellipse at 50% 50%, rgba(0, 212, 255, 0.03) 0%, transparent 70%),
+        var(--bg-primary) !important;
+    min-height: 100vh;
+}
+/* ── Header ── */
+.main-header {
+    text-align: center;
+    padding: 48px 24px 32px;
+    position: relative;
+}
+.main-header::before {
+    content: '';
+    position: absolute;
+    top: 0; left: 50%;
+    transform: translateX(-50%);
+    width: 600px; height: 2px;
+    background: linear-gradient(90deg, transparent, var(--accent-cyan), var(--accent-blue), transparent);
+    box-shadow: var(--glow-cyan);
+}
+/* ── Tab Navigation ── */
+.tab-nav {
+    background: rgba(13, 20, 37, 0.8) !important;
+    border: 1px solid rgba(0, 212, 255, 0.15) !important;
+    border-radius: 16px !important;
+    padding: 6px !important;
+    backdrop-filter: blur(20px) !important;
+    box-shadow: var(--shadow-3d) !important;
+}
+.tab-nav button {
+    font-family: 'Rajdhani', sans-serif !important;
+    font-weight: 600 !important;
+    font-size: 14px !important;
+    letter-spacing: 1.5px !important;
+    text-transform: uppercase !important;
+    color: var(--text-secondary) !important;
+    background: transparent !important;
+    border: none !important;
+    border-radius: 10px !important;
+    padding: 12px 24px !important;
+    transition: all 0.3s cubic-bezier(0.4, 0, 0.2, 1) !important;
+}
+.tab-nav button.selected {
+    color: var(--accent-cyan) !important;
+    background: linear-gradient(135deg, rgba(0, 212, 255, 0.1), rgba(0, 102, 255, 0.1)) !important;
+    box-shadow: 0 0 20px rgba(0, 212, 255, 0.2), inset 0 1px 0 rgba(0, 212, 255, 0.3) !important;
+    border: 1px solid rgba(0, 212, 255, 0.3) !important;
+}
+/* ── Cards / Blocks ── */
+.gradio-group, .gr-group {
+    background: var(--bg-card) !important;
+    border: 1px solid rgba(0, 212, 255, 0.1) !important;
+    border-radius: 20px !important;
+    box-shadow: var(--shadow-3d), inset 0 1px 0 rgba(255,255,255,0.03) !important;
+    transition: all 0.4s ease !important;
+    transform: perspective(1000px) rotateX(0deg);
+    position: relative;
+    overflow: hidden;
+}
+.gradio-group::before {
+    content: '';
+    position: absolute;
+    top: 0; left: 0; right: 0;
+    height: 1px;
+    background: linear-gradient(90deg, transparent, rgba(0, 212, 255, 0.5), transparent);
+}
+.gradio-group:hover {
+    border-color: rgba(0, 212, 255, 0.25) !important;
+    box-shadow: var(--shadow-3d), var(--glow-cyan) !important;
+    transform: perspective(1000px) translateY(-4px) !important;
+}
+/* ── Labels ── */
+label span, .gr-label {
+    font-family: 'Rajdhani', sans-serif !important;
+    font-weight: 600 !important;
+    letter-spacing: 1.5px !important;
+    text-transform: uppercase !important;
+    font-size: 11px !important;
+    color: var(--accent-cyan) !important;
+    opacity: 0.85;
+}
+/* ── Inputs ── */
+input, textarea, select, .gr-input {
+    background: rgba(5, 8, 16, 0.8) !important;
+    border: 1px solid rgba(0, 212, 255, 0.15) !important;
+    border-radius: 12px !important;
+    color: var(--text-primary) !important;
+    font-family: 'Share Tech Mono', monospace !important;
+    font-size: 13px !important;
+    transition: all 0.3s ease !important;
+    padding: 12px 16px !important;
+}
+input:focus, textarea:focus {
+    border-color: var(--accent-cyan) !important;
+    box-shadow: 0 0 0 3px rgba(0, 212, 255, 0.1), var(--glow-cyan) !important;
+    outline: none !important;
+    background: rgba(0, 212, 255, 0.03) !important;
+}
+/* ── Dropdown ── */
+.gr-dropdown select, .gradio-dropdown {
+    background: rgba(5, 8, 16, 0.9) !important;
+    border: 1px solid rgba(0, 212, 255, 0.2) !important;
+    border-radius: 12px !important;
+    color: var(--accent-cyan) !important;
+    font-family: 'Rajdhani', sans-serif !important;
+    font-weight: 600 !important;
+}
+/* ── Primary Button ── */
+button.primary, .gr-button-primary, button[variant="primary"] {
+    font-family: 'Rajdhani', sans-serif !important;
+    font-weight: 700 !important;
+    font-size: 15px !important;
+    letter-spacing: 2px !important;
+    text-transform: uppercase !important;
+    background: linear-gradient(135deg, #0066ff 0%, #00d4ff 50%, #0066ff 100%) !important;
+    background-size: 200% 200% !important;
+    border: none !important;
+    border-radius: 12px !important;
+    padding: 14px 32px !important;
+    color: #fff !important;
+    box-shadow:
+        0 8px 32px rgba(0, 102, 255, 0.4),
+        0 2px 8px rgba(0, 0, 0, 0.5),
+        inset 0 1px 0 rgba(255,255,255,0.2) !important;
+    transition: all 0.3s cubic-bezier(0.4, 0, 0.2, 1) !important;
+    animation: gradientShift 3s ease infinite !important;
+    position: relative !important;
+    overflow: hidden !important;
+}
+button.primary::before {
+    content: '';
+    position: absolute;
+    top: -50%; left: -60%;
+    width: 40%; height: 200%;
+    background: rgba(255,255,255,0.1);
+    transform: skewX(-20deg);
+    transition: left 0.6s ease;
+}
+button.primary:hover::before {
+    left: 120%;
+}
+button.primary:hover {
+    transform: translateY(-3px) scale(1.02) !important;
+    box-shadow:
+        0 16px 48px rgba(0, 102, 255, 0.5),
+        0 0 30px rgba(0, 212, 255, 0.3),
+        inset 0 1px 0 rgba(255,255,255,0.3) !important;
+}
+button.primary:active {
+    transform: translateY(0px) scale(0.98) !important;
+}
+@keyframes gradientShift {
+    0%, 100% { background-position: 0% 50%; }
+    50% { background-position: 100% 50%; }
+}
+/* ── Output Textboxes — 3D Result Cards ── */
+.output-card input, .output-card textarea {
+    background: linear-gradient(135deg, rgba(0, 212, 255, 0.05), rgba(0, 102, 255, 0.05)) !important;
+    border: 1px solid rgba(0, 212, 255, 0.2) !important;
+    border-radius: 14px !important;
+    font-family: 'Share Tech Mono', monospace !important;
+    font-size: 16px !important;
+    font-weight: bold !important;
+    color: var(--accent-cyan) !important;
+    text-align: center !important;
+    box-shadow: inset 0 2px 8px rgba(0,0,0,0.3), 0 0 20px rgba(0, 212, 255, 0.1) !important;
+}
+/* ── Table / DataFrame ── */
+table {
+    border-collapse: separate !important;
+    border-spacing: 0 4px !important;
+    font-family: 'Share Tech Mono', monospace !important;
+    font-size: 12px !important;
+}
+th {
+    background: rgba(0, 102, 255, 0.2) !important;
+    color: var(--accent-cyan) !important;
+    font-family: 'Rajdhani', sans-serif !important;
+    letter-spacing: 1.5px !important;
+    text-transform: uppercase !important;
+    font-size: 11px !important;
+    padding: 10px 16px !important;
+    border: none !important;
+}
+td {
+    background: rgba(13, 20, 37, 0.6) !important;
+    color: var(--text-secondary) !important;
+    padding: 8px 16px !important;
+    border: none !important;
+    border-top: 1px solid rgba(0, 212, 255, 0.05) !important;
+    transition: background 0.2s ease !important;
+}
+tr:hover td {
+    background: rgba(0, 212, 255, 0.05) !important;
+    color: var(--text-primary) !important;
+}
+/* ── Markdown ── */
+.prose, .markdown {
+    color: var(--text-secondary) !important;
+    font-family: 'Exo 2', sans-serif !important;
+}
+.prose h1, .markdown h1 {
+    font-family: 'Rajdhani', sans-serif !important;
+    font-size: 3rem !important;
+    font-weight: 700 !important;
+    letter-spacing: 3px !important;
+    text-transform: uppercase !important;
+    background: linear-gradient(135deg, #ffffff 0%, var(--accent-cyan) 40%, var(--accent-blue) 100%) !important;
+    -webkit-background-clip: text !important;
+    -webkit-text-fill-color: transparent !important;
+    background-clip: text !important;
+    filter: drop-shadow(0 0 30px rgba(0, 212, 255, 0.3)) !important;
+    margin-bottom: 8px !important;
+}
+.prose h2, .markdown h2 {
+    font-family: 'Rajdhani', sans-serif !important;
+    font-size: 1.4rem !important;
+    font-weight: 600 !important;
+    letter-spacing: 2px !important;
+    color: var(--accent-cyan) !important;
+    text-transform: uppercase !important;
+    border-bottom: 1px solid rgba(0, 212, 255, 0.2) !important;
+    padding-bottom: 8px !important;
+}
+.prose p, .markdown p {
+    color: var(--text-secondary) !important;
+    line-height: 1.7 !important;
+    font-size: 14px !important;
+}
+.prose strong, .markdown strong {
+    color: var(--accent-cyan) !important;
+}
+/* ── Code blocks ── */
+code, pre {
+    font-family: 'Share Tech Mono', monospace !important;
+    background: rgba(0, 212, 255, 0.05) !important;
+    border: 1px solid rgba(0, 212, 255, 0.15) !important;
+    border-radius: 8px !important;
+    color: var(--accent-cyan) !important;
+    font-size: 12px !important;
+}
+/* ── Examples Table ── */
+.examples {
+    background: var(--bg-card) !important;
+    border: 1px solid rgba(0, 212, 255, 0.1) !important;
+    border-radius: 14px !important;
+    overflow: hidden !important;
+}
+.examples table th {
+    background: rgba(0, 102, 255, 0.15) !important;
+}
+/* ── File Upload ── */
+.gr-file {
+    background: rgba(5, 8, 16, 0.8) !important;
+    border: 2px dashed rgba(0, 212, 255, 0.25) !important;
+    border-radius: 16px !important;
+    transition: all 0.3s ease !important;
+}
+.gr-file:hover {
+    border-color: var(--accent-cyan) !important;
+    background: rgba(0, 212, 255, 0.03) !important;
+    box-shadow: var(--glow-cyan) !important;
+}
+/* ── Scrollbar ── */
+::-webkit-scrollbar { width: 6px; height: 6px; }
+::-webkit-scrollbar-track { background: var(--bg-secondary); }
+::-webkit-scrollbar-thumb {
+    background: linear-gradient(var(--accent-blue), var(--accent-cyan));
+    border-radius: 3px;
+}
+/* ── Pulsing accent line ── */
+@keyframes pulse-glow {
+    0%, 100% { opacity: 0.4; box-shadow: 0 0 10px rgba(0,212,255,0.3); }
+    50% { opacity: 1; box-shadow: 0 0 30px rgba(0,212,255,0.8); }
+}
+/* ── Tier badge colors ── */
+.tier-regex  { color: #00ff88 !important; }
+.tier-bert   { color: #00d4ff !important; }
+.tier-llm    { color: #ffd700 !important; }
+"""
+# ── Functions ───────────────────────────────────────────────────────────────
+def classify_single(source: str, log_message: str):
+    """Classify one log message and format the result for the four output textboxes.
+    Args:
+        source: Name of the system that produced the log.
+        log_message: Raw log text; may be empty or None when nothing was entered.
+    Returns:
+        A 4-tuple of display strings: (label, "<icon> <tier>", confidence, latency).
+        All four are "—" when there is no message to classify.
+    """
+    # Check for None before calling .strip() so a missing value cannot raise AttributeError.
+    if not log_message or not log_message.strip():
+        return "—", "—", "—", "—"
+    t0 = time.perf_counter()
+    result = classify_log(source, log_message)
+    # Wall-clock time of the whole classify_log call, in milliseconds.
+    latency_ms = (time.perf_counter() - t0) * 1000
+    label      = result["label"]
+    tier       = result["tier"]
+    # Results without a confidence value are shown as "N/A".
+    confidence = f"{result['confidence']:.1%}" if result["confidence"] is not None else "N/A"
+    # Unknown tiers fall back to a neutral white marker.
+    icon       = TIER_COLORS.get(tier, "⚪")
+    return label, f"{icon} {tier}", confidence, f"{latency_ms:.1f} ms"
+def classify_batch(file):
+    """Classify an uploaded CSV and build a human-readable statistics summary.
+    Args:
+        file: The upload handed over by the gr.File input — either an object with a
+            ``.name`` path attribute or a plain path string; None when nothing was uploaded.
+    Returns:
+        (output_path, stats_text). output_path is None when classification did not run
+        or failed, in which case stats_text carries the message to show.
+    """
+    import os
+    import tempfile
+    if file is None:
+        return None, "⚠️ Please upload a CSV file."
+    # Accept both file-like upload objects and plain path strings.
+    input_path = getattr(file, "name", file)
+    # Give every request its own output file; a fixed path lets concurrent users overwrite each other's results.
+    fd, tmp_path = tempfile.mkstemp(prefix="classified_", suffix=".csv")
+    os.close(fd)
+    try:
+        output_path, df = classify_csv(input_path, tmp_path)
+    except ValueError as e:
+        os.remove(tmp_path)  # nothing useful was written
+        return None, f"⚠️ {e}"
+    except Exception as e:
+        # UI boundary: surface any failure as a message instead of crashing the app.
+        os.remove(tmp_path)
+        return None, f"❌ Error: {e}"
+    total = len(df)
+    tier_counts  = df["tier_used"].value_counts().to_dict()
+    label_counts = df["predicted_label"].value_counts().to_dict()
+    # With zero rows both dicts are empty, so v/total is never evaluated with total == 0.
+    tier_lines  = "\n".join(f"  {TIER_COLORS.get(k,'⚪')} {k}: {v} ({v/total:.0%})" for k, v in tier_counts.items())
+    label_lines = "\n".join(f"  • {k}: {v}" for k, v in label_counts.items())
+    stats = (
+        f"✅ Classified {total} logs\n\n"
+        f"📊 Tier breakdown:\n{tier_lines}\n\n"
+        f"🏷️ Label distribution:\n{label_lines}"
+    )
+    return output_path, stats
+# ── UI ───────────────────────────────────────────────────────────────────────
+# Build the Gradio app: Base theme with dark colour overrides, CUSTOM_CSS on top, three tabs.
+with gr.Blocks(
+    title="LOG CLASSIFICATION SYSTEM",
+    theme=gr.themes.Base(
+        primary_hue="blue",
+        secondary_hue="cyan",
+        neutral_hue="slate",
+        font=[gr.themes.GoogleFont("Exo 2"), "sans-serif"],
+        font_mono=[gr.themes.GoogleFont("Share Tech Mono"), "monospace"],
+    ).set(
+        # Theme-level colours; these repeat the palette declared in CUSTOM_CSS.
+        body_background_fill="#050810",
+        body_text_color="#e2e8f0",
+        block_background_fill="#0d1425",
+        block_border_color="rgba(0,212,255,0.15)",
+        block_label_text_color="#00d4ff",
+        input_background_fill="#050810",
+        input_border_color="rgba(0,212,255,0.2)",
+        button_primary_background_fill="linear-gradient(135deg, #0066ff, #00d4ff)",
+        button_primary_text_color="#ffffff",
+        border_color_accent="#00d4ff",
+        color_accent_soft="rgba(0,212,255,0.1)",
+    ),
+    css=CUSTOM_CSS
+) as demo:
+    # Page header.
+    gr.Markdown("""
+# 🔍 LOG CLASSIFICATION SYSTEM
+**3-tier hybrid pipeline** — 🟢 Regex · 🔵 BERT + ML · 🟡 LLM
+*Enterprise-grade log monitoring at production scale*
+""")
+    with gr.Tabs():
+        # ── Tab 1: Single Log ─────────────────────────────────────────────
+        with gr.Tab("⚡ SINGLE LOG"):
+            with gr.Row():
+                with gr.Column(scale=1):
+                    source_input = gr.Dropdown(
+                        choices=SOURCES,
+                        value="ModernCRM",
+                        label="SOURCE SYSTEM",
+                    )
+                with gr.Column(scale=3):
+                    log_input = gr.Textbox(
+                        label="LOG MESSAGE",
+                        placeholder="Paste a log message here...",
+                        lines=3,
+                    )
+            classify_btn = gr.Button("▶  CLASSIFY LOG", variant="primary", size="lg")
+            # Read-only result fields, in the same order as classify_single's return tuple.
+            with gr.Row():
+                label_out      = gr.Textbox(label="🏷️ PREDICTED LABEL",  interactive=False)
+                tier_out       = gr.Textbox(label="⚙️  TIER USED",        interactive=False)
+                confidence_out = gr.Textbox(label="📈 CONFIDENCE",        interactive=False)
+                latency_out    = gr.Textbox(label="⏱️  LATENCY",          interactive=False)
+            classify_btn.click(
+                fn=classify_single,
+                inputs=[source_input, log_input],
+                outputs=[label_out, tier_out, confidence_out, latency_out],
+            )
+            # No fn is given here, so the examples only fill the two inputs.
+            gr.Examples(
+                examples=EXAMPLE_LOGS,
+                inputs=[source_input, log_input],
+                label="📋 EXAMPLE LOGS — click to try",
+            )
+        # ── Tab 2: Batch CSV ──────────────────────────────────────────────
+        with gr.Tab("📦 BATCH CSV"):
+            gr.Markdown("""
+### Bulk Classification
+Upload a CSV with columns: **`source`**, **`log_message`**
+Output includes: `predicted_label`, `tier_used`, `confidence`, `latency_ms`
+""")
+            with gr.Row():
+                with gr.Column():
+                    csv_input  = gr.File(label="📂 UPLOAD CSV", file_types=[".csv"])
+                    batch_btn  = gr.Button("▶  CLASSIFY ALL", variant="primary")
+                with gr.Column():
+                    csv_output = gr.File(label="📥 DOWNLOAD RESULTS")
+                    stats_out  = gr.Textbox(label="📊 STATISTICS", lines=12, interactive=False)
+            # classify_batch returns (output file path, statistics text).
+            batch_btn.click(
+                fn=classify_batch,
+                inputs=[csv_input],
+                outputs=[csv_output, stats_out],
+            )
+            gr.Markdown("""
+**Sample CSV format:**
+```
+source,log_message
+ModernCRM,User User123 logged in.
+LegacyCRM,Case escalation for ticket ID 7324 failed.
+BillingSystem,GET /api/v2/invoice HTTP/1.1 status: 500
+```
+""")
+        # ── Tab 3: Architecture ───────────────────────────────────────────
+        with gr.Tab("🏗️ ARCHITECTURE"):
+            # NOTE(review): the text below states a 0.5 confidence threshold, while
+            # processor_bert.py defines CONFIDENCE_THRESHOLD = 0.30 — confirm which is current.
+            gr.Markdown("""
+## 3-Tier Hybrid Pipeline
+| Tier | Method | Coverage | Latency | Trigger |
+|------|--------|----------|---------|---------|
+| 🟢 **Regex** | Python `re` patterns | ~21% | < 1ms | Fixed patterns |
+| 🔵 **BERT** | `all-MiniLM-L6-v2` + LogReg | ~79% | 20–80ms | High-volume categories |
+| 🟡 **LLM** | HuggingFace Inference API | ~0.3% | 500–2000ms | LegacyCRM + rare patterns |
+## Model Performance
+- **Training data**: 2,410 synthetic enterprise logs
+- **Confidence threshold**: 0.5 (below → escalate to LLM)
+- **Source-aware routing**: `LegacyCRM` → LLM directly
+## Environment Variables
+| Secret | Purpose |
+|--------|---------|
+| `HF_TOKEN` | LLM inference for LegacyCRM logs |
+""")
+# Listen on all interfaces on port 7860, the port the Dockerfile exposes.
+if __name__ == "__main__":
+    demo.launch(server_name="0.0.0.0", server_port=7860)

hf_space/classify.py ADDED Viewed

	@@ -0,0 +1,198 @@

+"""
+classify.py — 3-Tier Hybrid Pipeline (V3 — Latency-Tracked)
+Architecture:
+  LegacyCRM → LLM directly
+  Others    → Regex → BERT (batch) → LLM fallback
+Changes in V3:
+  - Tier-wise latency tracking (regex_ms, bert_ms, llm_ms)
+  - Pipeline summary with p50/p95 per tier
+  - Defensive: LLM timeout + retry baked in via processor_llm
+  - classify_logs returns richer result dict
+"""
+from __future__ import annotations
+import time
+import statistics
+import pandas as pd
+from processor_regex import classify_with_regex
+from processor_bert  import classify_batch as bert_batch
+from processor_llm   import classify_with_llm
+LEGACY_SOURCE = "LegacyCRM"
+# ── Result type ─────────────────────────────────────────────────────────────
+def _make_result(label: str, tier: str, confidence, latency_ms: float) -> dict:
+    """Bundle one classification outcome into the pipeline's result dict.
+    The latency is rounded to three decimal places; the other values are stored as given.
+    """
+    rounded_latency = round(latency_ms, 3)
+    return dict(label=label, tier=tier, confidence=confidence, latency_ms=rounded_latency)
+# ── Single log (backward-compatible) ────────────────────────────────────────
+def classify_log(source: str, log_msg: str) -> dict:
+    """Classify a single log by running it through classify_logs as a one-item batch.
+    Returns the result dict for that log: label, tier, confidence, latency_ms.
+    """
+    single_item_batch = [(source, log_msg)]
+    return classify_logs(single_item_batch)[0]
+# ── Batch pipeline (main entry point) ───────────────────────────────────────
+def classify_logs(logs: list[tuple[str, str]]) -> list[dict]:
+    """
+    Batch classify with 3-tier routing + per-result latency.
+    Args:
+      logs: (source, log_message) pairs.
+    Returns list of dicts, one per input and in input order:
+      { label, tier, confidence, latency_ms }
+    Tier routing:
+      LegacyCRM source → LLM directly
+      Regex match      → done (sub-ms)
+      Remainder        → BERT batch → LLM when BERT answers "Unclassified"
+    """
+    results = [None] * len(logs)
+    # ── Step 1: Route to groups ─────────────────────────────────────────────
+    llm_indices  = []
+    bert_indices = []
+    for i, (source, log_msg) in enumerate(logs):
+        if source == LEGACY_SOURCE:
+            llm_indices.append(i)
+            continue
+        t0    = time.perf_counter()
+        label = classify_with_regex(log_msg)
+        t1    = time.perf_counter()
+        if label:
+            # A regex hit is final and reported with confidence 1.0.
+            results[i] = _make_result(label, "Regex", 1.0, (t1 - t0) * 1000)
+        else:
+            bert_indices.append(i)
+    # ── Step 2: BERT batch ──────────────────────────────────────────────────
+    if bert_indices:
+        bert_msgs = [logs[i][1] for i in bert_indices]
+        t_bert_start = time.perf_counter()
+        bert_results = bert_batch(bert_msgs)
+        t_bert_end   = time.perf_counter()
+        # The batch is timed as a whole, so each log is charged the average time per log.
+        bert_ms_per_log = (t_bert_end - t_bert_start) * 1000 / len(bert_msgs)
+        for idx, (label, conf) in zip(bert_indices, bert_results):
+            if label != "Unclassified":
+                results[idx] = _make_result(label, "BERT", conf, bert_ms_per_log)
+            else:
+                # BERT declined to label this log; escalate it to the LLM.
+                llm_indices.append(idx)
+    # ── Step 3: LLM (LegacyCRM + BERT fallback) ────────────────────────────
+    for i in llm_indices:
+        source, log_msg = logs[i]
+        t0    = time.perf_counter()
+        label = classify_with_llm(log_msg)
+        t1    = time.perf_counter()
+        # Direct LegacyCRM routing and BERT fallbacks get different tier names.
+        tier  = "LLM" if source == LEGACY_SOURCE else "LLM (fallback)"
+        # LLM results carry no confidence score.
+        results[i] = _make_result(label, tier, None, (t1 - t0) * 1000)
+    return results
+# ── Pipeline summary ─────────────────────────────────────────────────────────
+def pipeline_summary(results: list[dict]) -> dict:
+    """
+    Summarise classify_logs output per tier and per label.
+    Returns {"total", "tier_stats", "label_counts"}; tier_stats maps each tier to its
+    count, share of the total in percent, and p50/p95/p99/mean latency in milliseconds.
+    """
+    latencies_by_tier: dict[str, list[float]] = {}
+    label_counts: dict[str, int] = {}
+    for item in results:
+        latencies_by_tier.setdefault(item["tier"], []).append(item["latency_ms"])
+        label_counts[item["label"]] = label_counts.get(item["label"], 0) + 1
+    total = len(results)
+    def _at_fraction(ordered: list[float], fraction: float) -> float:
+        # Element at index int(len * fraction), clamped to the last element.
+        return ordered[min(int(len(ordered) * fraction), len(ordered) - 1)]
+    tier_stats = {}
+    for tier, samples in latencies_by_tier.items():
+        ordered = sorted(samples)
+        count = len(ordered)
+        tier_stats[tier] = {
+            "count":    count,
+            "pct":      round(count / total * 100, 1),
+            "p50_ms":   round(statistics.median(ordered), 2),
+            "p95_ms":   round(_at_fraction(ordered, 0.95), 2),
+            "p99_ms":   round(_at_fraction(ordered, 0.99), 2),
+            "mean_ms":  round(statistics.mean(ordered), 2),
+        }
+    return {
+        "total":        total,
+        "tier_stats":   tier_stats,
+        "label_counts": label_counts,
+    }
+# ── CSV batch classify ───────────────────────────────────────────────────────
+def classify_csv(input_path: str, output_path: str = "output.csv") -> tuple[str, pd.DataFrame]:
+    """
+    Classify every row of a CSV file and write the annotated table to output_path.
+    Required columns: 'source', 'log_message'
+    Output: adds 'predicted_label', 'tier_used', 'confidence', 'latency_ms'
+    Returns: (output_path, annotated DataFrame)
+    Raises: ValueError if a required column is missing.
+    """
+    df = pd.read_csv(input_path)
+    required = {"source", "log_message"}
+    if not required.issubset(df.columns):
+        raise ValueError(f"CSV mein ye columns chahiye: {required}. Mila: {set(df.columns)}")
+    # read_csv turns empty cells into NaN and numeric-only cells into numbers.
+    # Hand the classifiers plain strings, but leave the original columns in df untouched.
+    sources   = df["source"].fillna("").astype(str)
+    messages  = df["log_message"].fillna("").astype(str)
+    log_pairs = list(zip(sources, messages))
+    results   = classify_logs(log_pairs)
+    df["predicted_label"] = [r["label"]      for r in results]
+    df["tier_used"]        = [r["tier"]       for r in results]
+    df["latency_ms"]       = [r["latency_ms"] for r in results]
+    # Confidence is written as a percentage string; results without one get "N/A".
+    df["confidence"]       = [
+        f"{r['confidence']:.1%}" if r["confidence"] is not None else "N/A"
+        for r in results
+    ]
+    df.to_csv(output_path, index=False)
+    return output_path, df
+# Aliases
+classify = classify_logs
+# ── Self-test ────────────────────────────────────────────────────────────────
+# Manual smoke test: classify a few sample logs and print a per-log table plus the summary.
+if __name__ == "__main__":
+    sample = [
+        ("ModernCRM",       "IP 192.168.133.114 blocked due to potential attack"),
+        ("BillingSystem",   "User User12345 logged in."),
+        ("AnalyticsEngine", "File data_6957.csv uploaded successfully by user User265."),
+        ("ModernHR",        "GET /v2/servers/detail HTTP/1.1 status: 200 len: 1583 time: 0.19"),
+        ("ModernHR",        "Admin access escalation detected for user 9429"),
+        ("LegacyCRM",       "Case escalation for ticket ID 7324 failed because the assigned support agent is no longer active."),
+        ("LegacyCRM",       "The 'ReportGenerator' module will be retired in version 4.0."),
+    ]
+    print(f'{"Source":<20} {"Tier":<18} {"Conf":>6} {"Lat(ms)":>8}  {"Label":<25} Log')
+    print("─" * 115)
+    results = classify_logs(sample)
+    for (source, log), r in zip(sample, results):
+        # Compare against None so a genuine 0.0 confidence is printed as 0%, not N/A.
+        conf = f"{r['confidence']:.0%}" if r["confidence"] is not None else "  N/A"
+        print(f'{source:<20} {r["tier"]:<18} {conf:>6} {r["latency_ms"]:>8.1f}  {r["label"]:<25} {log[:40]}')
+    summary = pipeline_summary(results)
+    print("\n📊 Pipeline Summary:")
+    for tier, stats in summary["tier_stats"].items():
+        print(f"  {tier}: {stats['count']} logs ({stats['pct']}%) | "
+              f"p50={stats['p50_ms']}ms p95={stats['p95_ms']}ms p99={stats['p99_ms']}ms")
+    print("\n🏷️  Label distribution:")
+    # Most frequent label first.
+    for label, count in sorted(summary["label_counts"].items(), key=lambda x: -x[1]):
+        print(f"  • {label}: {count}")

hf_space/models/log_classifier.joblib ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9bfe9c71b71412797de0d426be2255566dbf6cf87b3f2ae5d2cd1fd69a98d18d
+size 23997

hf_space/onnx_model/config.json ADDED Viewed

	@@ -0,0 +1,24 @@

+{
+  "architectures": [
+    "BertModel"
+  ],
+  "attention_probs_dropout_prob": 0.1,
+  "classifier_dropout": null,
+  "gradient_checkpointing": false,
+  "hidden_act": "gelu",
+  "hidden_dropout_prob": 0.1,
+  "hidden_size": 384,
+  "initializer_range": 0.02,
+  "intermediate_size": 1536,
+  "layer_norm_eps": 1e-12,
+  "max_position_embeddings": 512,
+  "model_type": "bert",
+  "num_attention_heads": 12,
+  "num_hidden_layers": 6,
+  "pad_token_id": 0,
+  "position_embedding_type": "absolute",
+  "transformers_version": "4.57.6",
+  "type_vocab_size": 2,
+  "use_cache": true,
+  "vocab_size": 30522
+}

hf_space/onnx_model/special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,37 @@

+{
+  "cls_token": {
+    "content": "[CLS]",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "mask_token": {
+    "content": "[MASK]",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "[PAD]",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "sep_token": {
+    "content": "[SEP]",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "unk_token": {
+    "content": "[UNK]",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

hf_space/onnx_model/tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

hf_space/onnx_model/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,65 @@

+{
+  "added_tokens_decoder": {
+    "0": {
+      "content": "[PAD]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "100": {
+      "content": "[UNK]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "101": {
+      "content": "[CLS]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "102": {
+      "content": "[SEP]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "103": {
+      "content": "[MASK]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "clean_up_tokenization_spaces": false,
+  "cls_token": "[CLS]",
+  "do_basic_tokenize": true,
+  "do_lower_case": true,
+  "extra_special_tokens": {},
+  "mask_token": "[MASK]",
+  "max_length": 128,
+  "model_max_length": 512,
+  "never_split": null,
+  "pad_to_multiple_of": null,
+  "pad_token": "[PAD]",
+  "pad_token_type_id": 0,
+  "padding_side": "right",
+  "sep_token": "[SEP]",
+  "stride": 0,
+  "strip_accents": null,
+  "tokenize_chinese_chars": true,
+  "tokenizer_class": "BertTokenizer",
+  "truncation_side": "right",
+  "truncation_strategy": "longest_first",
+  "unk_token": "[UNK]"
+}

hf_space/onnx_model/vocab.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

hf_space/processor_bert.py ADDED Viewed

	@@ -0,0 +1,216 @@

+"""
+processor_bert.py — ONNX Runtime powered BERT classifier
+Speed: 82 logs/s → 2000+ logs/s
+How it works:
+1. ONNX Runtime: 3-5x faster than plain PyTorch
+2. Batch processing: 64 logs processed together
+3. Pre-allocated buffers: no wasted memory
+"""
+from __future__ import annotations
+import os
+import numpy as np
+import joblib
+# ── Backend selection and lazily-loaded model state (filled in by _load_models) ──
+_USE_ONNX = False
+_embedding_model = None
+_classifier       = None
+_ort_session      = None
+_ort_tokenizer    = None
+# Both paths are resolved relative to this file's directory.
+MODEL_PATH    = os.path.join(os.path.dirname(__file__), 'models', 'log_classifier.joblib')
+# NOTE(review): this commit adds tokenizer/config files under onnx_model/, not models/onnx — confirm this path.
+ONNX_DIR      = os.path.join(os.path.dirname(__file__), 'models', 'onnx')
+# NOTE(review): the Architecture tab in app_gradio.py quotes a 0.5 threshold — confirm which value is intended.
+CONFIDENCE_THRESHOLD = 0.30
+DEFAULT_BATCH = 64
+def _load_models():
+    """Load the classifier and an embedding backend once, on first use.
+    ONNX Runtime is used when model.onnx exists under ONNX_DIR and loads cleanly;
+    otherwise the SentenceTransformer model is used. Later calls return immediately.
+    Raises:
+        FileNotFoundError: if the classifier file at MODEL_PATH does not exist.
+    """
+    global _USE_ONNX, _embedding_model, _classifier, _ort_session, _ort_tokenizer
+    if _classifier is not None:
+        return  # Already loaded
+    # ── Load the classifier ────────────────────────────────
+    if not os.path.exists(MODEL_PATH):
+        raise FileNotFoundError(
+            f'Model nahi mila: {MODEL_PATH}\n'
+            'Pehle Colab notebook run karo aur model download karo.'
+        )
+    # Keep the classifier in a local until a backend is ready (see the end of the function).
+    # MODEL_PATH is a file next to this module, not user-supplied input.
+    classifier = joblib.load(MODEL_PATH)
+    # ── Try ONNX (fast), fall back to PyTorch ──────────────
+    onnx_model_file = os.path.join(ONNX_DIR, 'model.onnx')
+    if os.path.exists(onnx_model_file):
+        try:
+            import onnxruntime as ort
+            from transformers import AutoTokenizer
+            # CPU optimized session options
+            sess_opts = ort.SessionOptions()
+            sess_opts.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL
+            # os.cpu_count() may return None; fall back to a single thread in that case.
+            sess_opts.intra_op_num_threads = os.cpu_count() or 1
+            sess_opts.execution_mode = ort.ExecutionMode.ORT_SEQUENTIAL
+            session = ort.InferenceSession(
+                onnx_model_file,
+                sess_options=sess_opts,
+                providers=['CPUExecutionProvider']
+            )
+            tokenizer = AutoTokenizer.from_pretrained(ONNX_DIR)
+            # Publish session and tokenizer together, only after both loaded.
+            _ort_session, _ort_tokenizer = session, tokenizer
+            _USE_ONNX = True
+            print('[BERT] ✅ ONNX Runtime loaded — FAST MODE')
+        except Exception as e:
+            # Best-effort: any ONNX problem just means the slower backend is used.
+            print(f'[BERT] ONNX load failed ({e}), fallback to PyTorch')
+            _USE_ONNX = False
+    if not _USE_ONNX:
+        from sentence_transformers import SentenceTransformer
+        _embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
+        print('[BERT] ⚠️  PyTorch mode (install ONNX for 3-5x speedup)')
+    # Set the "already loaded" marker last: if backend loading raised above, the next
+    # call retries instead of returning early with no embedding backend.
+    _classifier = classifier
+def _embed_onnx(texts: list[str]) -> np.ndarray:
+    """ONNX Runtime se embeddings generate karo — FAST."""
+    import torch
+    inputs = _ort_tokenizer(
+        texts,
+        padding=True,
+        truncation=True,
+        max_length=128,
+        return_tensors='np'  # NumPy directly (faster than PyTorch tensors)
+    )
+    # ONNX session run
+    ort_inputs = {
+        'input_ids':      inputs['input_ids'].astype(np.int64),
+        'attention_mask': inputs['attention_mask'].astype(np.int64),
+    }
+    if 'token_type_ids' in [i.name for i in _ort_session.get_inputs()]:
+        ort_inputs['token_type_ids'] = inputs.get(
+            'token_type_ids', np.zeros_like(inputs['input_ids'])
+        ).astype(np.int64)
+    outputs = _ort_session.run(None, ort_inputs)
+    hidden  = outputs[0]  # (batch, seq_len, hidden)
+    # Mean pooling (attention mask weighted)
+    mask    = inputs['attention_mask'][:, :, None].astype(np.float32)
+    summed  = (hidden * mask).sum(axis=1)
+    counts  = mask.sum(axis=1)
+    embeddings = summed / counts
+    # L2 normalize
+    norms  = np.linalg.norm(embeddings, axis=1, keepdims=True)
+    return  embeddings / (norms + 1e-8)
+def _embed_pytorch(texts: list[str]) -> np.ndarray:
+    """PyTorch fallback."""
+    return _embedding_model.encode(
+        texts,
+        batch_size=DEFAULT_BATCH,
+        convert_to_numpy=True,
+        normalize_embeddings=True,
+        show_progress_bar=False
+    )
+# ── PUBLIC API ──────────────────────────────────────────────
+def classify_with_bert(log_message: str) -> tuple[str, float]:
+    """
+    Single log classify karo.
+    Returns: (label, confidence)
+    """
+    _load_models()
+    results = classify_batch([log_message])
+    return results[0]
+def classify_batch(log_messages: list[str]) -> list[tuple[str, float]]:
+    """
+    Multiple logs ek saath classify karo — MUCH FASTER!
+    Returns: list of (label, confidence) tuples
+    Example:
+        results = classify_batch(['log1', 'log2', 'log3'])
+        for label, conf in results:
+            print(f'{label}: {conf:.1%}')
+    """
+    _load_models()
+    if not log_messages:
+        return []
+    results = []
+    # Process in batches
+    for i in range(0, len(log_messages), DEFAULT_BATCH):
+        batch = log_messages[i:i + DEFAULT_BATCH]
+        # Generate embeddings
+        if _USE_ONNX:
+            embeddings = _embed_onnx(batch)
+        else:
+            embeddings = _embed_pytorch(batch)
+        # Classify
+        probs   = _classifier.predict_proba(embeddings)
+        max_probs = probs.max(axis=1)
+        labels    = _classifier.predict(embeddings)
+        for label, conf in zip(labels, max_probs):
+            if conf < CONFIDENCE_THRESHOLD:
+                results.append(('Unclassified', float(conf)))
+            else:
+                results.append((str(label), float(conf)))
+    return results
+def get_classes() -> list[str]:
+    """Classifier ke classes return karo."""
+    _load_models()
+    return list(_classifier.classes_)
+def is_onnx_mode() -> bool:
+    """Check karo ONNX use ho raha hai ya nahi."""
+    _load_models()
+    return _USE_ONNX
+# ── TEST ────────────────────────────────────────────────────
+if __name__ == '__main__':
+    # Manual smoke test: classify a few sample logs one by one, print the
+    # active backend, then time one large batched call.
+    import time
+    test_logs = [
+        'GET /v2/servers/detail HTTP/1.1 status: 404 len: 1583 time: 0.19',
+        'System crashed due to driver errors when restarting the server',
+        'Multiple login failures occurred on user 6454 account',
+        'Admin access escalation detected for user 9429',
+        'CPU usage at 98% for the last 10 minutes on node-7',
+        'Backup completed successfully.',
+        'User User123 logged in.',
+        'Data replication task for shard 14 did not complete',
+        'Hey bro chill ya!',     # should be Unclassified
+    ]
+    print('Single log test:')
+    for log in test_logs:
+        label, conf = classify_with_bert(log)
+        print(f'  [{conf:.0%}] {label:25s} | {log[:60]}')
+    print(f'\nMode: {"ONNX 🚀" if is_onnx_mode() else "PyTorch"}')
+    # Speed test: the sample list repeated 100 times, sent as one batch call.
+    big_batch = test_logs * 100
+    t0 = time.perf_counter()
+    classify_batch(big_batch)
+    elapsed = time.perf_counter() - t0
+    print(f'\nSpeed: {len(big_batch)/elapsed:.0f} logs/s  ({elapsed*1000/len(big_batch):.1f}ms/log)')

hf_space/processor_llm.py ADDED Viewed

	@@ -0,0 +1,192 @@

+"""
+processor_llm.py — Tier 3: LLM-based Classifier
+Used for:
+  - LegacyCRM logs (Workflow Error, Deprecation Warning)
+  - BERT fallback when confidence < threshold
+Production hardening in V3:
+  - Timeout (configurable, default 5s)
+  - Retry with exponential backoff (max 2 retries)
+  - Explicit failure modes: returns "Unclassified" on all error paths
+  - Caching for repeated log patterns (hash-based, in-memory)
+  - Token budget enforcement (max_tokens=15)
+"""
+from __future__ import annotations
+import os
+import re
+import time
+import hashlib
+import logging
+from functools import lru_cache
+from typing import Optional
+logger = logging.getLogger(__name__)
+# ── Config ─────────────────────────────────────────────────────────────────
+# Read once at import; classify_with_llm returns "Unclassified" when it is unset.
+HF_TOKEN   = os.getenv("HF_TOKEN")
+LLM_MODEL  = "mistralai/Mistral-7B-Instruct-v0.3"
+# The only labels this tier may emit (besides "Unclassified").
+VALID_CATEGORIES = ["Workflow Error", "Deprecation Warning"]
+# Retry / timeout config
+MAX_RETRIES     = 2     # retries after the initial attempt (3 attempts in total)
+RETRY_DELAY_SEC = 1.0   # doubles on each retry (exponential backoff)
+REQUEST_TIMEOUT = 5     # seconds — fail fast, do not hang pipeline
+# In-memory cache to avoid redundant LLM calls for repeated logs
+# Maps the MD5 hex digest of the stripped log text to its label.
+_RESPONSE_CACHE: dict[str, str] = {}
+MAX_CACHE_SIZE = 1000  # evict oldest when full (simple FIFO)
+SYSTEM_PROMPT = (
+    "You are an enterprise log classifier. "
+    "Classify log messages into exactly one category. "
+    "Return ONLY the category name — no explanation, no punctuation."
+)
+# Worked examples appended to every prompt by _build_messages.
+FEW_SHOT_EXAMPLES = [
+    {
+        "log":   "Case escalation for ticket ID 7324 failed because the assigned support agent is no longer active.",
+        "label": "Workflow Error",
+    },
+    {
+        "log":   "The 'BulkEmailSender' feature is no longer supported. Use 'EmailCampaignManager' instead.",
+        "label": "Deprecation Warning",
+    },
+    {
+        "log":   "Invoice generation aborted for order ID 8910 due to invalid tax calculation module.",
+        "label": "Workflow Error",
+    },
+]
+# ── Cache helpers ────────────────────────────────────────────────────────────
+def _cache_key(log_msg: str) -> str:
+    return hashlib.md5(log_msg.strip().encode()).hexdigest()
+def _cache_get(log_msg: str) -> Optional[str]:
+    return _RESPONSE_CACHE.get(_cache_key(log_msg))
+def _cache_set(log_msg: str, label: str) -> None:
+    key = _cache_key(log_msg)
+    if len(_RESPONSE_CACHE) >= MAX_CACHE_SIZE:
+        # Evict oldest (first inserted) key
+        oldest = next(iter(_RESPONSE_CACHE))
+        del _RESPONSE_CACHE[oldest]
+    _RESPONSE_CACHE[key] = label
+def get_cache_stats() -> dict:
+    return {"size": len(_RESPONSE_CACHE), "max_size": MAX_CACHE_SIZE}
+# ── Prompt builder ───────────────────────────────────────────────────────────
+def _build_messages(log_msg: str) -> list[dict]:
+    categories_str = ", ".join(f'"{c}"' for c in VALID_CATEGORIES)
+    user_content = (
+        f'Classify the following log into one of these categories: {categories_str}.\n'
+        'If none fits, return "Unclassified".\n\n'
+    )
+    for ex in FEW_SHOT_EXAMPLES:
+        user_content += f'Log: {ex["log"]}\nCategory: {ex["label"]}\n\n'
+    user_content += f"Log: {log_msg}\nCategory:"
+    return [
+        {"role": "system", "content": SYSTEM_PROMPT},
+        {"role": "user",   "content": user_content},
+    ]
+# ── Normalize raw LLM output ─────────────────────────────────────────────────
+def _normalize(raw: str) -> str:
+    """Map raw LLM output to a valid category or 'Unclassified'."""
+    raw = raw.strip().strip('"').strip("'")
+    for cat in VALID_CATEGORIES:
+        if cat.lower() in raw.lower():
+            return cat
+    return "Unclassified"
+# ── Main classify function ────────────────────────────────────────────────────
+def classify_with_llm(log_msg: str) -> str:
+    """
+    Tier 3 LLM classifier with:
+      - In-memory cache (avoids duplicate API calls)
+      - Timeout (REQUEST_TIMEOUT seconds)
+      - Retry with exponential backoff (MAX_RETRIES attempts)
+      - Explicit fallback to "Unclassified" on all error paths
+    Latency: 500–2000ms on cache miss; ~0ms on cache hit.
+    """
+    # ── Cache hit ────────────────────────────────────────────────────────────
+    cached = _cache_get(log_msg)
+    if cached is not None:
+        logger.debug(f"[LLM] Cache hit for: {log_msg[:60]}")
+        return cached
+    # ── Inference with retry ─────────────────────────────────────────────────
+    if not HF_TOKEN:
+        logger.warning("[LLM] HF_TOKEN not set — returning Unclassified")
+        return "Unclassified"
+    from huggingface_hub import InferenceClient
+    client  = InferenceClient(token=HF_TOKEN, timeout=REQUEST_TIMEOUT)
+    delay   = RETRY_DELAY_SEC
+    last_err: Optional[Exception] = None
+    for attempt in range(1, MAX_RETRIES + 2):  # +2: initial + MAX_RETRIES
+        try:
+            response = client.chat.completions.create(
+                model=LLM_MODEL,
+                messages=_build_messages(log_msg),
+                max_tokens=15,
+                temperature=0.1,
+            )
+            raw   = response.choices[0].message.content
+            label = _normalize(raw)
+            _cache_set(log_msg, label)
+            logger.debug(f"[LLM] Attempt {attempt}: '{raw.strip()}' → '{label}'")
+            return label
+        except Exception as e:
+            last_err = e
+            if attempt <= MAX_RETRIES:
+                logger.warning(f"[LLM] Attempt {attempt} failed ({e}), retrying in {delay:.1f}s…")
+                time.sleep(delay)
+                delay *= 2  # exponential backoff
+            else:
+                logger.error(f"[LLM] All {MAX_RETRIES + 1} attempts failed. Last error: {e}")
+    return "Unclassified"
+# ── Batch classify (serial — LLM is already rate-limited) ────────────────────
+def classify_batch_llm(log_msgs: list[str]) -> list[str]:
+    """Classify multiple logs through LLM. Each call is sequential to respect rate limits."""
+    return [classify_with_llm(msg) for msg in log_msgs]
+# ── CLI test ─────────────────────────────────────────────────────────────────
+if __name__ == "__main__":
+    # Manual smoke test: classify three sample logs, then repeat the first
+    # one to time the cache-hit path and print the cache statistics.
+    logging.basicConfig(level=logging.INFO)
+    test_logs = [
+        "Case escalation for ticket ID 7324 failed because the assigned support agent is no longer active.",
+        "The 'ReportGenerator' module will be retired in version 4.0. Migrate to 'AdvancedAnalyticsSuite'.",
+        "System reboot initiated by user 12345.",   # should be Unclassified
+    ]
+    for log in test_logs:
+        result = classify_with_llm(log)
+        print(f"{result:25s} | {log[:80]}")
+    # Cache hit test
+    # NOTE: this is only a cache hit if the first call above succeeded;
+    # failed calls are not cached.
+    print("\n── Cache hit test ──")
+    t0 = time.perf_counter()
+    classify_with_llm(test_logs[0])
+    t1 = time.perf_counter()
+    print(f"Cache hit latency: {(t1-t0)*1000:.2f}ms")
+    print(f"Cache stats: {get_cache_stats()}")

hf_space/processor_regex.py ADDED Viewed

	@@ -0,0 +1,220 @@

+"""
+processor_regex.py — Tier 1: Rule-based Classifier
+Target coverage: 40%+ (up from 15%)
+Latency: sub-millisecond per log
+New pattern groups added:
+  - HTTP request/response logs   (was completely missing!)
+  - Auth / credential events     (login failures, MFA, lockouts)
+  - System/infra events          (disk, CPU, memory, cron)
+  - Network / firewall events    (IP block, port scan)
+  - Structured error codes       (ERROR, CRITICAL prefix logs)
+"""
+from __future__ import annotations
+import re
+import time
+from typing import Optional
+# ---------------------------------------------------------------------------
+# Pattern registry: (raw_pattern, label)
+# Entries are raw regex strings here; they are compiled (case-insensitively)
+# into REGEX_PATTERNS below.
+# Order matters — the first pattern that matches decides the label, so more
+# specific patterns must come FIRST to avoid mis-labeling.
+# ---------------------------------------------------------------------------
+_RAW_PATTERNS: list[tuple[str, str]] = [
+    # ── HTTP Status ─────────────────────────────────────────────────────────
+    # Covers: GET/POST/PUT/DELETE/PATCH + status code in request line
+    (r"\b(GET|POST|PUT|DELETE|PATCH|HEAD|OPTIONS)\s+\S+\s+HTTP/\d", "HTTP Status"),
+    # Nova / OpenStack style
+    # NOTE(review): the generic request-line pattern above already matches
+    # these lines, so this entry looks redundant — confirm before removing.
+    (r"nova\.\S+\s+(GET|POST|PUT|DELETE)\s+\S+\s+HTTP/\d", "HTTP Status"),
+    # Status code only style: "returned HTTP 200" or "status: 404"
+    (r"\bstatus[:\s]+\d{3}\b", "HTTP Status"),
+    (r"\breturned\s+HTTP\s+\d{3}\b", "HTTP Status"),
+    (r"\bHTTP\s+status\s+code\s*[:-]?\s*\d{3}\b", "HTTP Status"),
+    # API response style
+    (r"\bAPI\s+(call|request)\s+\S+\s+completed\s+with\s+status\s+\d{3}", "HTTP Status"),
+    (r"\bEndpoint\s+\S+\s+responded\s+with\s+code\s+\d{3}", "HTTP Status"),
+    # ── Security Alert ──────────────────────────────────────────────────────
+    # Brute force / login failures
+    (r"(multiple\s+)?(bad\s+|failed?\s+)?login\s+(failure|attempt|failures)", "Security Alert"),
+    (r"brute[\s_-]force\s+(login|attack|attempt)", "Security Alert"),
+    # Unauthorized access
+    (r"unauthorized\s+(access|admin|privilege|attempt)", "Security Alert"),
+    (r"access\s+denied\s+(for|to)\s+(user|ip|host)", "Security Alert"),
+    # Privilege escalation
+    (r"(admin\s+)?access\s+escalation\s+detected", "Security Alert"),
+    (r"privilege\s+(elev|escalat)", "Security Alert"),
+    # IP blocking / suspicious traffic
+    (r"IP\s+\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}\s+blocked", "Security Alert"),
+    (r"(suspicious|anomalous)\s+(login|traffic|activity|request)", "Security Alert"),
+    (r"potential\s+(DDoS|attack|breach|intrusion)", "Security Alert"),
+    (r"security\s+breach\s+suspected", "Security Alert"),
+    (r"(API\s+security\s+breach|bypass\s+API\s+security)", "Security Alert"),
+    (r"port\s+scan\s+(detected|attempt)", "Security Alert"),
+    # ── User Action ─────────────────────────────────────────────────────────
+    (r"User\s+\w+\d*\s+logged\s+(in|out)", "User Action"),
+    (r"Account\s+(with\s+)?ID\s+\S+\s+created\s+by", "User Action"),
+    (r"User\s+\w+\d*\s+(updated\s+profile|changed\s+password|enabled\s+two|downloaded|exported)", "User Action"),
+    (r"(New\s+user|user\s+\w+\d*)\s+registered", "User Action"),
+    (r"Account\s+\S+\s+deleted\s+by\s+(administrator|admin)", "User Action"),
+    (r"User\s+\w+\d*\s+(tried|attempted)", "User Action"),
+    # ── System Notification ─────────────────────────────────────────────────
+    # Backup events
+    (r"Backup\s+(started|ended|completed\s+successfully|failed|aborted)", "System Notification"),
+    (r"System\s+updated\s+to\s+version", "System Notification"),
+    (r"File\s+\S+\s+uploaded\s+successfully\s+by\s+user", "System Notification"),
+    (r"Disk\s+cleanup\s+completed\s+successfully", "System Notification"),
+    (r"System\s+reboot\s+initiated\s+by\s+user", "System Notification"),
+    (r"Scheduled\s+maintenance\s+(started|completed)", "System Notification"),
+    (r"Service\s+\w+\s+restarted\s+successfully", "System Notification"),
+    # NEW: cache, cron, health check, cert, log rotation
+    (r"Cache\s+cleared\s+successfully", "System Notification"),
+    (r"Log\s+rotation\s+completed", "System Notification"),
+    (r"Health\s+check\s+(passed|failed)\s+for\s+service", "System Notification"),
+    # NOTE(review): the next two patterns require the word "successfully" even
+    # after "expired"/"revoked"/"failed" — confirm that is intended.
+    (r"Certificate\s+(renewed|expired|revoked)\s+successfully", "System Notification"),
+    (r"Cron\s+job\s+\S+\s+(executed|failed|completed)\s+successfully", "System Notification"),
+    (r"(Disk|Storage)\s+(usage|space)\s+(at|reached|exceeded)\s+\d+%", "System Notification"),
+    (r"CPU\s+usage\s+at\s+\d+%", "System Notification"),
+    (r"Memory\s+(usage|limit)\s+(at|reached|exceeded)\s+\d+%", "System Notification"),
+    # Deployment / config
+    (r"Deployment\s+(of|for)\s+\S+\s+(completed|failed|started)", "System Notification"),
+    (r"Configuration\s+(reloaded|updated|applied)\s+successfully", "System Notification"),
+    # ── Error ───────────────────────────────────────────────────────────────
+    # These come before the Critical Error group, so a line matching both an
+    # Error pattern and e.g. CRITICAL is labeled "Error" (first match wins).
+    (r"\bERROR\b.*\b(exception|failed|failure|crash|timeout|unavailable)\b", "Error"),
+    (r"System\s+crashed\s+due\s+to", "Error"),
+    (r"(connection|request|task|job)\s+(timed?\s*out|timeout)", "Error"),
+    (r"service\s+\S+\s+(is\s+down|unavailable|unreachable)", "Error"),
+    (r"database\s+connection\s+(failed|refused|lost|dropped)", "Error"),
+    (r"disk\s+(I/O\s+)?failure", "Error"),
+    (r"driver\s+error(s)?\s+(when|during|on)", "Error"),
+    (r"(replication|sync)\s+task\s+(did\s+not\s+complete|failed)", "Error"),
+    (r"null\s+pointer|segmentation\s+fault|stack\s+overflow", "Error"),
+    # ── Critical Error ──────────────────────────────────────────────────────
+    (r"\bCRITICAL\b", "Critical Error"),
+    (r"(FATAL|PANIC)\b", "Critical Error"),
+    (r"(data\s+loss|data\s+corruption)\s+(detected|occurred)", "Critical Error"),
+    (r"(cluster|node|shard)\s+(failure|crashed|went\s+down)", "Critical Error"),
+    (r"(catastrophic|unrecoverable)\s+(failure|error)", "Critical Error"),
+    (r"kernel\s+panic", "Critical Error"),
+    (r"out[\s-]of[\s-](memory|disk)\s+(error|killed|OOM)", "Critical Error"),
+]
+# Pre-compile all patterns at import time (not per-call)
+# re.IGNORECASE applies to every pattern, so keyword case in the table is
+# irrelevant to matching.
+REGEX_PATTERNS: list[tuple[re.Pattern, str]] = [
+    (re.compile(pat, re.IGNORECASE), label)
+    for pat, label in _RAW_PATTERNS
+]
+def classify_with_regex(log_message: str) -> Optional[str]:
+    """
+    Tier 1: Rule-based classifier.
+    Returns category label, or None if no pattern matches.
+    Latency: sub-millisecond (patterns pre-compiled at import).
+    """
+    for pattern, label in REGEX_PATTERNS:
+        if pattern.search(log_message):
+            return label
+    return None
+def get_regex_coverage(log_messages: list[str]) -> dict:
+    """Measure regex tier coverage and per-label breakdown."""
+    label_counts: dict[str, int] = {}
+    missed = 0
+    for msg in log_messages:
+        label = classify_with_regex(msg)
+        if label:
+            label_counts[label] = label_counts.get(label, 0) + 1
+        else:
+            missed += 1
+    total   = len(log_messages)
+    matched = total - missed
+    return {
+        "total":        total,
+        "matched":      matched,
+        "missed":       missed,
+        "coverage_pct": round(matched / total * 100, 2) if total else 0.0,
+        "label_breakdown": label_counts,
+    }
+def benchmark_regex(log_messages: list[str], runs: int = 3) -> dict:
+    """Measure regex tier latency (p50 / p95 / p99) over multiple runs."""
+    import statistics
+    per_log_ms: list[float] = []
+    for _ in range(runs):
+        for msg in log_messages:
+            t0 = time.perf_counter()
+            classify_with_regex(msg)
+            per_log_ms.append((time.perf_counter() - t0) * 1000)
+    per_log_ms.sort()
+    return {
+        "p50_ms":  round(statistics.median(per_log_ms), 4),
+        "p95_ms":  round(per_log_ms[int(len(per_log_ms) * 0.95)], 4),
+        "p99_ms":  round(per_log_ms[int(len(per_log_ms) * 0.99)], 4),
+        "mean_ms": round(statistics.mean(per_log_ms), 4),
+    }
+# ── CLI self-test ────────────────────────────────────────────────────────────
+if __name__ == "__main__":
+    # Self-test: each case is (log line, expected label); None means the
+    # regex tier is expected to leave the line unmatched.
+    test_cases: list[tuple[str, str]] = [
+        # HTTP
+        ("GET /api/v2/resource HTTP/1.1 status: 200 len: 1583 time: 0.19", "HTTP Status"),
+        ("POST /v1/users HTTP/1.1 status: 201 len: 42 time: 0.05", "HTTP Status"),
+        ("nova.osapi_compute.wsgi.server GET /v2/servers/detail HTTP/1.1 status: 404", "HTTP Status"),
+        # Security
+        ("Multiple login failures occurred on user 6454 account", "Security Alert"),
+        ("IP 192.168.133.114 blocked due to potential attack", "Security Alert"),
+        ("Brute force login attempt from 10.0.0.5 detected", "Security Alert"),
+        ("Admin access escalation detected for user 9429", "Security Alert"),
+        # User Action
+        ("User User12345 logged in.", "User Action"),
+        ("Account with ID 456 created by Admin.", "User Action"),
+        # System Notification
+        ("Backup completed successfully.", "System Notification"),
+        ("CPU usage at 98% for the last 10 minutes on node-7", "System Notification"),
+        ("Health check passed for service payments-api", "System Notification"),
+        # Error
+        ("System crashed due to disk I/O failure on node-3", "Error"),
+        ("Database connection failed after 3 retries", "Error"),
+        # Critical
+        ("CRITICAL: data corruption detected on shard-14", "Critical Error"),
+        ("kernel panic: not syncing: VFS: unable to mount root fs", "Critical Error"),
+        # Should be None (unmatched)
+        ("The 'BulkEmailSender' feature will be deprecated in v5.0.", None),
+        ("Case escalation for ticket 7324 failed.", None),
+    ]
+    correct = 0
+    print(f"{'Expected':<22} {'Got':<22} {'✓/✗'} | Log")
+    print("─" * 100)
+    for log, expected in test_cases:
+        got = classify_with_regex(log)
+        ok  = got == expected
+        correct += ok  # bool adds as 0/1
+        icon = "✓" if ok else "✗"
+        print(f"{str(expected):<22} {str(got):<22} {icon}   | {log[:55]}")
+    print(f"\n{correct}/{len(test_cases)} correct")
+    # Coverage demo
+    all_logs = [log for log, _ in test_cases]
+    cov = get_regex_coverage(all_logs)
+    print(f"\nCoverage: {cov['coverage_pct']}%  ({cov['matched']}/{cov['total']} matched)")
+    print("Label breakdown:", cov["label_breakdown"])
+    # Latency benchmark
+    lat = benchmark_regex(all_logs * 100)
+    print(f"\nLatency (p50/p95/p99): {lat['p50_ms']}ms / {lat['p95_ms']}ms / {lat['p99_ms']}ms")

hf_space/requirements.txt ADDED Viewed

	@@ -0,0 +1,25 @@

+# Core
+gradio>=4.44.0
+pandas>=2.0.0
+numpy>=1.24.0
+joblib>=1.3.0
+scikit-learn>=1.3.0
+# Embedding + BERT
+sentence-transformers>=2.7.0
+transformers>=4.38.0
+# ONNX (optional, 3-5x speedup)
+onnxruntime>=1.17.0
+optimum[onnxruntime]>=1.16.0
+# LLM
+huggingface-hub>=0.21.0
+# FastAPI (production API)
+fastapi>=0.110.0
+uvicorn[standard]>=0.29.0
+pydantic>=2.0.0
+# Observability
+psutil>=5.9.0