Spaces:

NOT-OMEGA
/

LogAI-Engine

Sleeping

App Files Files Community

NOT-OMEGA committed on Apr 14

Commit

9399c84

verified ·

1 Parent(s): b5d6b20

Delete hf_space

Browse files

Files changed (13) hide show

hf_space/Dockerfile +0 -20
hf_space/app_gradio.py +0 -542
hf_space/classify.py +0 -198
hf_space/models/log_classifier.joblib +0 -3
hf_space/onnx_model/config.json +0 -24
hf_space/onnx_model/special_tokens_map.json +0 -37
hf_space/onnx_model/tokenizer.json +0 -0
hf_space/onnx_model/tokenizer_config.json +0 -65
hf_space/onnx_model/vocab.txt +0 -0
hf_space/processor_bert.py +0 -216
hf_space/processor_llm.py +0 -192
hf_space/processor_regex.py +0 -220
hf_space/requirements.txt +0 -25

hf_space/Dockerfile DELETED Viewed

@@ -1,20 +0,0 @@
-# Container image for the Gradio app; starts from the slim Python 3.11 base.
-FROM python:3.11-slim
-# Install curl and remove the apt package lists in the same layer.
-RUN apt-get update && apt-get install -y --no-install-recommends \
-    curl \
-    && rm -rf /var/lib/apt/lists/*
-WORKDIR /app
-# requirements.txt is copied on its own before the sources, so the pip layer
-# is only rebuilt when the dependency list changes.
-COPY requirements.txt .
-RUN pip install --no-cache-dir -r requirements.txt
-COPY . .
-# Create an unprivileged user (UID 1000) that owns /app and run as that user.
-RUN useradd -m -u 1000 appuser \
-    && chown -R appuser:appuser /app
-USER appuser
-# Same port that app_gradio.py passes to demo.launch().
-EXPOSE 7860
-CMD ["python", "app_gradio.py"]

hf_space/app_gradio.py DELETED Viewed

@@ -1,542 +0,0 @@
-"""
-Log Classification System — HuggingFace Spaces
-Ultra-modern 3D UI with custom CSS
-"""
-from __future__ import annotations
-import io
-import time
-import pandas as pd
-import gradio as gr
-from classify import classify_log, classify_csv
-SOURCES = [
-    "ModernCRM", "ModernHR", "BillingSystem",
-    "AnalyticsEngine", "ThirdPartyAPI", "LegacyCRM",
-]
-TIER_COLORS = {
-    "Regex":          "🟢",
-    "BERT":           "🔵",
-    "LLM":            "🟡",
-    "LLM (fallback)": "🟠",
-}
-EXAMPLE_LOGS = [
-    ["ModernCRM",       "User User12345 logged in."],
-    ["ModernHR",        "Multiple login failures occurred on user 6454 account"],
-    ["BillingSystem",   "GET /v2/servers/detail HTTP/1.1 status: 200 len: 1583 time: 0.19"],
-    ["AnalyticsEngine", "System crashed due to disk I/O failure on node-3"],
-    ["LegacyCRM",       "Case escalation for ticket ID 7324 failed — support agent is no longer active."],
-    ["LegacyCRM",       "The 'BulkEmailSender' feature will be deprecated in v5.0. Use 'EmailCampaignManager'."],
-]
-# ── Custom CSS — 3D Modern Dark Theme ──────────────────────────────────────
-CUSTOM_CSS = """
-@import url('https://fonts.googleapis.com/css2?family=Rajdhani:wght@400;500;600;700&family=Share+Tech+Mono&family=Exo+2:wght@300;400;600;700&display=swap');
-:root {
-    --bg-primary: #050810;
-    --bg-secondary: #0a0f1e;
-    --bg-card: #0d1425;
-    --bg-card-hover: #111a30;
-    --accent-cyan: #00d4ff;
-    --accent-blue: #0066ff;
-    --accent-purple: #7c3aed;
-    --accent-green: #00ff88;
-    --accent-orange: #ff6b00;
-    --text-primary: #e2e8f0;
-    --text-secondary: #94a3b8;
-    --text-muted: #475569;
-    --border-glow: rgba(0, 212, 255, 0.3);
-    --shadow-3d: 0 20px 60px rgba(0, 0, 0, 0.8), 0 0 40px rgba(0, 102, 255, 0.15);
-    --glow-cyan: 0 0 20px rgba(0, 212, 255, 0.4), 0 0 40px rgba(0, 212, 255, 0.2);
-    --glow-blue: 0 0 20px rgba(0, 102, 255, 0.4);
-}
-/* ── Base ── */
-body, .gradio-container {
-    background: var(--bg-primary) !important;
-    font-family: 'Exo 2', sans-serif !important;
-    color: var(--text-primary) !important;
-}
-.gradio-container {
-    background:
-        radial-gradient(ellipse at 20% 20%, rgba(0, 102, 255, 0.08) 0%, transparent 50%),
-        radial-gradient(ellipse at 80% 80%, rgba(124, 58, 237, 0.08) 0%, transparent 50%),
-        radial-gradient(ellipse at 50% 50%, rgba(0, 212, 255, 0.03) 0%, transparent 70%),
-        var(--bg-primary) !important;
-    min-height: 100vh;
-}
-/* ── Header ── */
-.main-header {
-    text-align: center;
-    padding: 48px 24px 32px;
-    position: relative;
-}
-.main-header::before {
-    content: '';
-    position: absolute;
-    top: 0; left: 50%;
-    transform: translateX(-50%);
-    width: 600px; height: 2px;
-    background: linear-gradient(90deg, transparent, var(--accent-cyan), var(--accent-blue), transparent);
-    box-shadow: var(--glow-cyan);
-}
-/* ── Tab Navigation ── */
-.tab-nav {
-    background: rgba(13, 20, 37, 0.8) !important;
-    border: 1px solid rgba(0, 212, 255, 0.15) !important;
-    border-radius: 16px !important;
-    padding: 6px !important;
-    backdrop-filter: blur(20px) !important;
-    box-shadow: var(--shadow-3d) !important;
-}
-.tab-nav button {
-    font-family: 'Rajdhani', sans-serif !important;
-    font-weight: 600 !important;
-    font-size: 14px !important;
-    letter-spacing: 1.5px !important;
-    text-transform: uppercase !important;
-    color: var(--text-secondary) !important;
-    background: transparent !important;
-    border: none !important;
-    border-radius: 10px !important;
-    padding: 12px 24px !important;
-    transition: all 0.3s cubic-bezier(0.4, 0, 0.2, 1) !important;
-}
-.tab-nav button.selected {
-    color: var(--accent-cyan) !important;
-    background: linear-gradient(135deg, rgba(0, 212, 255, 0.1), rgba(0, 102, 255, 0.1)) !important;
-    box-shadow: 0 0 20px rgba(0, 212, 255, 0.2), inset 0 1px 0 rgba(0, 212, 255, 0.3) !important;
-    border: 1px solid rgba(0, 212, 255, 0.3) !important;
-}
-/* ── Cards / Blocks ── */
-.gradio-group, .gr-group {
-    background: var(--bg-card) !important;
-    border: 1px solid rgba(0, 212, 255, 0.1) !important;
-    border-radius: 20px !important;
-    box-shadow: var(--shadow-3d), inset 0 1px 0 rgba(255,255,255,0.03) !important;
-    transition: all 0.4s ease !important;
-    transform: perspective(1000px) rotateX(0deg);
-    position: relative;
-    overflow: hidden;
-}
-.gradio-group::before {
-    content: '';
-    position: absolute;
-    top: 0; left: 0; right: 0;
-    height: 1px;
-    background: linear-gradient(90deg, transparent, rgba(0, 212, 255, 0.5), transparent);
-}
-.gradio-group:hover {
-    border-color: rgba(0, 212, 255, 0.25) !important;
-    box-shadow: var(--shadow-3d), var(--glow-cyan) !important;
-    transform: perspective(1000px) translateY(-4px) !important;
-}
-/* ── Labels ── */
-label span, .gr-label {
-    font-family: 'Rajdhani', sans-serif !important;
-    font-weight: 600 !important;
-    letter-spacing: 1.5px !important;
-    text-transform: uppercase !important;
-    font-size: 11px !important;
-    color: var(--accent-cyan) !important;
-    opacity: 0.85;
-}
-/* ── Inputs ── */
-input, textarea, select, .gr-input {
-    background: rgba(5, 8, 16, 0.8) !important;
-    border: 1px solid rgba(0, 212, 255, 0.15) !important;
-    border-radius: 12px !important;
-    color: var(--text-primary) !important;
-    font-family: 'Share Tech Mono', monospace !important;
-    font-size: 13px !important;
-    transition: all 0.3s ease !important;
-    padding: 12px 16px !important;
-}
-input:focus, textarea:focus {
-    border-color: var(--accent-cyan) !important;
-    box-shadow: 0 0 0 3px rgba(0, 212, 255, 0.1), var(--glow-cyan) !important;
-    outline: none !important;
-    background: rgba(0, 212, 255, 0.03) !important;
-}
-/* ── Dropdown ── */
-.gr-dropdown select, .gradio-dropdown {
-    background: rgba(5, 8, 16, 0.9) !important;
-    border: 1px solid rgba(0, 212, 255, 0.2) !important;
-    border-radius: 12px !important;
-    color: var(--accent-cyan) !important;
-    font-family: 'Rajdhani', sans-serif !important;
-    font-weight: 600 !important;
-}
-/* ── Primary Button ── */
-button.primary, .gr-button-primary, button[variant="primary"] {
-    font-family: 'Rajdhani', sans-serif !important;
-    font-weight: 700 !important;
-    font-size: 15px !important;
-    letter-spacing: 2px !important;
-    text-transform: uppercase !important;
-    background: linear-gradient(135deg, #0066ff 0%, #00d4ff 50%, #0066ff 100%) !important;
-    background-size: 200% 200% !important;
-    border: none !important;
-    border-radius: 12px !important;
-    padding: 14px 32px !important;
-    color: #fff !important;
-    box-shadow:
-        0 8px 32px rgba(0, 102, 255, 0.4),
-        0 2px 8px rgba(0, 0, 0, 0.5),
-        inset 0 1px 0 rgba(255,255,255,0.2) !important;
-    transition: all 0.3s cubic-bezier(0.4, 0, 0.2, 1) !important;
-    animation: gradientShift 3s ease infinite !important;
-    position: relative !important;
-    overflow: hidden !important;
-}
-button.primary::before {
-    content: '';
-    position: absolute;
-    top: -50%; left: -60%;
-    width: 40%; height: 200%;
-    background: rgba(255,255,255,0.1);
-    transform: skewX(-20deg);
-    transition: left 0.6s ease;
-}
-button.primary:hover::before {
-    left: 120%;
-}
-button.primary:hover {
-    transform: translateY(-3px) scale(1.02) !important;
-    box-shadow:
-        0 16px 48px rgba(0, 102, 255, 0.5),
-        0 0 30px rgba(0, 212, 255, 0.3),
-        inset 0 1px 0 rgba(255,255,255,0.3) !important;
-}
-button.primary:active {
-    transform: translateY(0px) scale(0.98) !important;
-}
-@keyframes gradientShift {
-    0%, 100% { background-position: 0% 50%; }
-    50% { background-position: 100% 50%; }
-}
-/* ── Output Textboxes — 3D Result Cards ── */
-.output-card input, .output-card textarea {
-    background: linear-gradient(135deg, rgba(0, 212, 255, 0.05), rgba(0, 102, 255, 0.05)) !important;
-    border: 1px solid rgba(0, 212, 255, 0.2) !important;
-    border-radius: 14px !important;
-    font-family: 'Share Tech Mono', monospace !important;
-    font-size: 16px !important;
-    font-weight: bold !important;
-    color: var(--accent-cyan) !important;
-    text-align: center !important;
-    box-shadow: inset 0 2px 8px rgba(0,0,0,0.3), 0 0 20px rgba(0, 212, 255, 0.1) !important;
-}
-/* ── Table / DataFrame ── */
-table {
-    border-collapse: separate !important;
-    border-spacing: 0 4px !important;
-    font-family: 'Share Tech Mono', monospace !important;
-    font-size: 12px !important;
-}
-th {
-    background: rgba(0, 102, 255, 0.2) !important;
-    color: var(--accent-cyan) !important;
-    font-family: 'Rajdhani', sans-serif !important;
-    letter-spacing: 1.5px !important;
-    text-transform: uppercase !important;
-    font-size: 11px !important;
-    padding: 10px 16px !important;
-    border: none !important;
-}
-td {
-    background: rgba(13, 20, 37, 0.6) !important;
-    color: var(--text-secondary) !important;
-    padding: 8px 16px !important;
-    border: none !important;
-    border-top: 1px solid rgba(0, 212, 255, 0.05) !important;
-    transition: background 0.2s ease !important;
-}
-tr:hover td {
-    background: rgba(0, 212, 255, 0.05) !important;
-    color: var(--text-primary) !important;
-}
-/* ── Markdown ── */
-.prose, .markdown {
-    color: var(--text-secondary) !important;
-    font-family: 'Exo 2', sans-serif !important;
-}
-.prose h1, .markdown h1 {
-    font-family: 'Rajdhani', sans-serif !important;
-    font-size: 3rem !important;
-    font-weight: 700 !important;
-    letter-spacing: 3px !important;
-    text-transform: uppercase !important;
-    background: linear-gradient(135deg, #ffffff 0%, var(--accent-cyan) 40%, var(--accent-blue) 100%) !important;
-    -webkit-background-clip: text !important;
-    -webkit-text-fill-color: transparent !important;
-    background-clip: text !important;
-    filter: drop-shadow(0 0 30px rgba(0, 212, 255, 0.3)) !important;
-    margin-bottom: 8px !important;
-}
-.prose h2, .markdown h2 {
-    font-family: 'Rajdhani', sans-serif !important;
-    font-size: 1.4rem !important;
-    font-weight: 600 !important;
-    letter-spacing: 2px !important;
-    color: var(--accent-cyan) !important;
-    text-transform: uppercase !important;
-    border-bottom: 1px solid rgba(0, 212, 255, 0.2) !important;
-    padding-bottom: 8px !important;
-}
-.prose p, .markdown p {
-    color: var(--text-secondary) !important;
-    line-height: 1.7 !important;
-    font-size: 14px !important;
-}
-.prose strong, .markdown strong {
-    color: var(--accent-cyan) !important;
-}
-/* ── Code blocks ── */
-code, pre {
-    font-family: 'Share Tech Mono', monospace !important;
-    background: rgba(0, 212, 255, 0.05) !important;
-    border: 1px solid rgba(0, 212, 255, 0.15) !important;
-    border-radius: 8px !important;
-    color: var(--accent-cyan) !important;
-    font-size: 12px !important;
-}
-/* ── Examples Table ── */
-.examples {
-    background: var(--bg-card) !important;
-    border: 1px solid rgba(0, 212, 255, 0.1) !important;
-    border-radius: 14px !important;
-    overflow: hidden !important;
-}
-.examples table th {
-    background: rgba(0, 102, 255, 0.15) !important;
-}
-/* ── File Upload ── */
-.gr-file {
-    background: rgba(5, 8, 16, 0.8) !important;
-    border: 2px dashed rgba(0, 212, 255, 0.25) !important;
-    border-radius: 16px !important;
-    transition: all 0.3s ease !important;
-}
-.gr-file:hover {
-    border-color: var(--accent-cyan) !important;
-    background: rgba(0, 212, 255, 0.03) !important;
-    box-shadow: var(--glow-cyan) !important;
-}
-/* ── Scrollbar ── */
-::-webkit-scrollbar { width: 6px; height: 6px; }
-::-webkit-scrollbar-track { background: var(--bg-secondary); }
-::-webkit-scrollbar-thumb {
-    background: linear-gradient(var(--accent-blue), var(--accent-cyan));
-    border-radius: 3px;
-}
-/* ── Pulsing accent line ── */
-@keyframes pulse-glow {
-    0%, 100% { opacity: 0.4; box-shadow: 0 0 10px rgba(0,212,255,0.3); }
-    50% { opacity: 1; box-shadow: 0 0 30px rgba(0,212,255,0.8); }
-}
-/* ── Tier badge colors ── */
-.tier-regex  { color: #00ff88 !important; }
-.tier-bert   { color: #00d4ff !important; }
-.tier-llm    { color: #ffd700 !important; }
-"""
-# ── Functions ───────────────────────────────────────────────────────────────
-def classify_single(source: str, log_message: str):
-    if not log_message.strip():
-        return "—", "—", "—", "—"
-    t0 = time.perf_counter()
-    result = classify_log(source, log_message)
-    latency_ms = (time.perf_counter() - t0) * 1000
-    label      = result["label"]
-    tier       = result["tier"]
-    confidence = f"{result['confidence']:.1%}" if result["confidence"] is not None else "N/A"
-    icon       = TIER_COLORS.get(tier, "⚪")
-    return label, f"{icon} {tier}", confidence, f"{latency_ms:.1f} ms"
-def classify_batch(file):
-    if file is None:
-        return None, "⚠️ Please upload a CSV file."
-    try:
-        output_path, df = classify_csv(file.name, "/tmp/classified_output.csv")
-    except ValueError as e:
-        return None, f"⚠️ {e}"
-    except Exception as e:
-        return None, f"❌ Error: {e}"
-    total = len(df)
-    tier_counts  = df["tier_used"].value_counts().to_dict()
-    label_counts = df["predicted_label"].value_counts().to_dict()
-    tier_lines  = "\n".join(f"  {TIER_COLORS.get(k,'⚪')} {k}: {v} ({v/total:.0%})" for k, v in tier_counts.items())
-    label_lines = "\n".join(f"  • {k}: {v}" for k, v in label_counts.items())
-    stats = (
-        f"✅ Classified {total} logs\n\n"
-        f"📊 Tier breakdown:\n{tier_lines}\n\n"
-        f"🏷️ Label distribution:\n{label_lines}"
-    )
-    return output_path, stats
-# ── UI ───────────────────────────────────────────────────────────────────────
-with gr.Blocks(
-    title="LOG CLASSIFICATION SYSTEM",
-    theme=gr.themes.Base(
-        primary_hue="blue",
-        secondary_hue="cyan",
-        neutral_hue="slate",
-        font=[gr.themes.GoogleFont("Exo 2"), "sans-serif"],
-        font_mono=[gr.themes.GoogleFont("Share Tech Mono"), "monospace"],
-    ).set(
-        body_background_fill="#050810",
-        body_text_color="#e2e8f0",
-        block_background_fill="#0d1425",
-        block_border_color="rgba(0,212,255,0.15)",
-        block_label_text_color="#00d4ff",
-        input_background_fill="#050810",
-        input_border_color="rgba(0,212,255,0.2)",
-        button_primary_background_fill="linear-gradient(135deg, #0066ff, #00d4ff)",
-        button_primary_text_color="#ffffff",
-        border_color_accent="#00d4ff",
-        color_accent_soft="rgba(0,212,255,0.1)",
-    ),
-    css=CUSTOM_CSS
-) as demo:
-    gr.Markdown("""
-# 🔍 LOG CLASSIFICATION SYSTEM
-**3-tier hybrid pipeline** — 🟢 Regex · 🔵 BERT + ML · 🟡 LLM
-*Enterprise-grade log monitoring at production scale*
-""")
-    with gr.Tabs():
-        # ── Tab 1: Single Log ─────────────────────────────────────────────
-        with gr.Tab("⚡ SINGLE LOG"):
-            with gr.Row():
-                with gr.Column(scale=1):
-                    source_input = gr.Dropdown(
-                        choices=SOURCES,
-                        value="ModernCRM",
-                        label="SOURCE SYSTEM",
-                    )
-                with gr.Column(scale=3):
-                    log_input = gr.Textbox(
-                        label="LOG MESSAGE",
-                        placeholder="Paste a log message here...",
-                        lines=3,
-                    )
-            classify_btn = gr.Button("▶  CLASSIFY LOG", variant="primary", size="lg")
-            with gr.Row():
-                label_out      = gr.Textbox(label="🏷️ PREDICTED LABEL",  interactive=False)
-                tier_out       = gr.Textbox(label="⚙️  TIER USED",        interactive=False)
-                confidence_out = gr.Textbox(label="📈 CONFIDENCE",        interactive=False)
-                latency_out    = gr.Textbox(label="⏱️  LATENCY",          interactive=False)
-            classify_btn.click(
-                fn=classify_single,
-                inputs=[source_input, log_input],
-                outputs=[label_out, tier_out, confidence_out, latency_out],
-            )
-            gr.Examples(
-                examples=EXAMPLE_LOGS,
-                inputs=[source_input, log_input],
-                label="📋 EXAMPLE LOGS — click to try",
-            )
-        # ── Tab 2: Batch CSV ──────────────────────────────────────────────
-        with gr.Tab("📦 BATCH CSV"):
-            gr.Markdown("""
-### Bulk Classification
-Upload a CSV with columns: **`source`**, **`log_message`**
-Output includes: `predicted_label`, `tier_used`, `confidence`, `latency_ms`
-""")
-            with gr.Row():
-                with gr.Column():
-                    csv_input  = gr.File(label="📂 UPLOAD CSV", file_types=[".csv"])
-                    batch_btn  = gr.Button("▶  CLASSIFY ALL", variant="primary")
-                with gr.Column():
-                    csv_output = gr.File(label="📥 DOWNLOAD RESULTS")
-                    stats_out  = gr.Textbox(label="📊 STATISTICS", lines=12, interactive=False)
-            batch_btn.click(
-                fn=classify_batch,
-                inputs=[csv_input],
-                outputs=[csv_output, stats_out],
-            )
-            gr.Markdown("""
-**Sample CSV format:**
-```
-source,log_message
-ModernCRM,User User123 logged in.
-LegacyCRM,Case escalation for ticket ID 7324 failed.
-BillingSystem,GET /api/v2/invoice HTTP/1.1 status: 500
-```
-""")
-        # ── Tab 3: Architecture ───────────────────────────────────────────
-        with gr.Tab("🏗️ ARCHITECTURE"):
-            gr.Markdown("""
-## 3-Tier Hybrid Pipeline
-| Tier | Method | Coverage | Latency | Trigger |
-|------|--------|----------|---------|---------|
-| 🟢 **Regex** | Python `re` patterns | ~21% | < 1ms | Fixed patterns |
-| 🔵 **BERT** | `all-MiniLM-L6-v2` + LogReg | ~79% | 20–80ms | High-volume categories |
-| 🟡 **LLM** | HuggingFace Inference API | ~0.3% | 500–2000ms | LegacyCRM + rare patterns |
-## Model Performance
-- **Training data**: 2,410 synthetic enterprise logs
-- **Confidence threshold**: 0.5 (below → escalate to LLM)
-- **Source-aware routing**: `LegacyCRM` → LLM directly
-## Environment Variables
-| Secret | Purpose |
-|--------|---------|
-| `HF_TOKEN` | LLM inference for LegacyCRM logs |
-""")
-if __name__ == "__main__":
-    demo.launch(server_name="0.0.0.0", server_port=7860)

hf_space/classify.py DELETED Viewed

@@ -1,198 +0,0 @@
-"""
-classify.py — 3-Tier Hybrid Pipeline (V3 — Latency-Tracked)
-Architecture:
-  LegacyCRM → LLM directly
-  Others    → Regex → BERT (batch) → LLM fallback
-Changes in V3:
-  - Tier-wise latency tracking (regex_ms, bert_ms, llm_ms)
-  - Pipeline summary with p50/p95 per tier
-  - Defensive: LLM timeout + retry baked in via processor_llm
-  - classify_logs returns richer result dict
-"""
-from __future__ import annotations
-import time
-import statistics
-import pandas as pd
-from processor_regex import classify_with_regex
-from processor_bert  import classify_batch as bert_batch
-from processor_llm   import classify_with_llm
-LEGACY_SOURCE = "LegacyCRM"
-# ── Result type ─────────────────────────────────────────────────────────────
-def _make_result(label: str, tier: str, confidence, latency_ms: float) -> dict:
-    return {
-        "label":      label,
-        "tier":       tier,
-        "confidence": confidence,
-        "latency_ms": round(latency_ms, 3),
-    }
-# ── Single log (backward-compatible) ────────────────────────────────────────
-def classify_log(source: str, log_msg: str) -> dict:
-    """Single log classify karo. Returns label, tier, confidence, latency_ms."""
-    results = classify_logs([(source, log_msg)])
-    return results[0]
-# ── Batch pipeline (main entry point) ───────────────────────────────────────
-def classify_logs(logs: list[tuple[str, str]]) -> list[dict]:
-    """
-    Batch classify with 3-tier routing + per-result latency.
-    Returns list of dicts:
-      { label, tier, confidence, latency_ms }
-    Tier routing:
-      LegacyCRM source → LLM directly
-      Regex match      → done (sub-ms)
-      Remainder        → BERT batch → LLM if low confidence
-    """
-    n       = len(logs)
-    results = [None] * n
-    # ── Step 1: Route to groups ─────────────────────────────────────────────
-    llm_indices   = []
-    bert_indices  = []
-    entry_times   = [time.perf_counter()] * n  # approximate per-log start
-    t_route_start = time.perf_counter()
-    for i, (source, log_msg) in enumerate(logs):
-        entry_times[i] = time.perf_counter()
-        if source == LEGACY_SOURCE:
-            llm_indices.append(i)
-        else:
-            t0    = time.perf_counter()
-            label = classify_with_regex(log_msg)
-            t1    = time.perf_counter()
-            if label:
-                results[i] = _make_result(label, "Regex", 1.0, (t1 - t0) * 1000)
-            else:
-                bert_indices.append(i)
-    # ── Step 2: BERT batch ──────────────────────────────────────────────────
-    if bert_indices:
-        bert_msgs = [logs[i][1] for i in bert_indices]
-        t_bert_start = time.perf_counter()
-        bert_results = bert_batch(bert_msgs)
-        t_bert_end   = time.perf_counter()
-        bert_ms_per_log = (t_bert_end - t_bert_start) * 1000 / len(bert_msgs)
-        for idx, (label, conf) in zip(bert_indices, bert_results):
-            if label != "Unclassified":
-                results[idx] = _make_result(label, "BERT", conf, bert_ms_per_log)
-            else:
-                llm_indices.append(idx)
-    # ── Step 3: LLM (LegacyCRM + BERT fallback) ────────────────────────────
-    for i in llm_indices:
-        _, log_msg = logs[i]
-        t0    = time.perf_counter()
-        label = classify_with_llm(log_msg)
-        t1    = time.perf_counter()
-        tier  = "LLM" if logs[i][0] == LEGACY_SOURCE else "LLM (fallback)"
-        results[i] = _make_result(label, tier, None, (t1 - t0) * 1000)
-    return results
-# ── Pipeline summary ─────────────────────────────────────────────────────────
-def pipeline_summary(results: list[dict]) -> dict:
-    """
-    Aggregate stats from classify_logs output.
-    Useful for dashboard and benchmark reporting.
-    """
-    tier_groups: dict[str, list[float]] = {}
-    label_counts: dict[str, int] = {}
-    for r in results:
-        tier = r["tier"]
-        tier_groups.setdefault(tier, []).append(r["latency_ms"])
-        label_counts[r["label"]] = label_counts.get(r["label"], 0) + 1
-    total = len(results)
-    tier_stats = {}
-    for tier, latencies in tier_groups.items():
-        latencies_sorted = sorted(latencies)
-        n = len(latencies_sorted)
-        tier_stats[tier] = {
-            "count":    n,
-            "pct":      round(n / total * 100, 1),
-            "p50_ms":   round(statistics.median(latencies_sorted), 2),
-            "p95_ms":   round(latencies_sorted[min(int(n * 0.95), n - 1)], 2),
-            "p99_ms":   round(latencies_sorted[min(int(n * 0.99), n - 1)], 2),
-            "mean_ms":  round(statistics.mean(latencies_sorted), 2),
-        }
-    return {
-        "total":        total,
-        "tier_stats":   tier_stats,
-        "label_counts": label_counts,
-    }
-# ── CSV batch classify ───────────────────────────────────────────────────────
-def classify_csv(input_path: str, output_path: str = "output.csv") -> tuple[str, pd.DataFrame]:
-    """
-    CSV file classify karo.
-    Required columns: 'source', 'log_message'
-    Output: adds 'predicted_label', 'tier_used', 'confidence', 'latency_ms'
-    """
-    df = pd.read_csv(input_path)
-    required = {"source", "log_message"}
-    if not required.issubset(df.columns):
-        raise ValueError(f"CSV mein ye columns chahiye: {required}. Mila: {set(df.columns)}")
-    log_pairs = list(zip(df["source"], df["log_message"]))
-    results   = classify_logs(log_pairs)
-    df["predicted_label"] = [r["label"]      for r in results]
-    df["tier_used"]        = [r["tier"]       for r in results]
-    df["latency_ms"]       = [r["latency_ms"] for r in results]
-    df["confidence"]       = [
-        f"{r['confidence']:.1%}" if r["confidence"] is not None else "N/A"
-        for r in results
-    ]
-    df.to_csv(output_path, index=False)
-    return output_path, df
-# Aliases
-classify = classify_logs
-# ── Self-test ────────────────────────────────────────────────────────────────
-if __name__ == "__main__":
-    sample = [
-        ("ModernCRM",       "IP 192.168.133.114 blocked due to potential attack"),
-        ("BillingSystem",   "User User12345 logged in."),
-        ("AnalyticsEngine", "File data_6957.csv uploaded successfully by user User265."),
-        ("ModernHR",        "GET /v2/servers/detail HTTP/1.1 status: 200 len: 1583 time: 0.19"),
-        ("ModernHR",        "Admin access escalation detected for user 9429"),
-        ("LegacyCRM",       "Case escalation for ticket ID 7324 failed because the assigned support agent is no longer active."),
-        ("LegacyCRM",       "The 'ReportGenerator' module will be retired in version 4.0."),
-    ]
-    print(f'{"Source":<20} {"Tier":<18} {"Conf":>6} {"Lat(ms)":>8}  {"Label":<25} Log')
-    print("─" * 115)
-    results = classify_logs(sample)
-    for (source, log), r in zip(sample, results):
-        conf = f"{r['confidence']:.0%}" if r["confidence"] else "  N/A"
-        print(f'{source:<20} {r["tier"]:<18} {conf:>6} {r["latency_ms"]:>8.1f}  {r["label"]:<25} {log[:40]}')
-    summary = pipeline_summary(results)
-    print("\n📊 Pipeline Summary:")
-    for tier, stats in summary["tier_stats"].items():
-        print(f"  {tier}: {stats['count']} logs ({stats['pct']}%) | "
-              f"p50={stats['p50_ms']}ms p95={stats['p95_ms']}ms p99={stats['p99_ms']}ms")
-    print("\n🏷️  Label distribution:")
-    for label, count in sorted(summary["label_counts"].items(), key=lambda x: -x[1]):
-        print(f"  • {label}: {count}")

hf_space/models/log_classifier.joblib DELETED Viewed

@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:9bfe9c71b71412797de0d426be2255566dbf6cf87b3f2ae5d2cd1fd69a98d18d
-size 23997

hf_space/onnx_model/config.json DELETED Viewed

@@ -1,24 +0,0 @@
-{
-  "architectures": [
-    "BertModel"
-  ],
-  "attention_probs_dropout_prob": 0.1,
-  "classifier_dropout": null,
-  "gradient_checkpointing": false,
-  "hidden_act": "gelu",
-  "hidden_dropout_prob": 0.1,
-  "hidden_size": 384,
-  "initializer_range": 0.02,
-  "intermediate_size": 1536,
-  "layer_norm_eps": 1e-12,
-  "max_position_embeddings": 512,
-  "model_type": "bert",
-  "num_attention_heads": 12,
-  "num_hidden_layers": 6,
-  "pad_token_id": 0,
-  "position_embedding_type": "absolute",
-  "transformers_version": "4.57.6",
-  "type_vocab_size": 2,
-  "use_cache": true,
-  "vocab_size": 30522
-}

hf_space/onnx_model/special_tokens_map.json DELETED Viewed

@@ -1,37 +0,0 @@
-{
-  "cls_token": {
-    "content": "[CLS]",
-    "lstrip": false,
-    "normalized": false,
-    "rstrip": false,
-    "single_word": false
-  },
-  "mask_token": {
-    "content": "[MASK]",
-    "lstrip": false,
-    "normalized": false,
-    "rstrip": false,
-    "single_word": false
-  },
-  "pad_token": {
-    "content": "[PAD]",
-    "lstrip": false,
-    "normalized": false,
-    "rstrip": false,
-    "single_word": false
-  },
-  "sep_token": {
-    "content": "[SEP]",
-    "lstrip": false,
-    "normalized": false,
-    "rstrip": false,
-    "single_word": false
-  },
-  "unk_token": {
-    "content": "[UNK]",
-    "lstrip": false,
-    "normalized": false,
-    "rstrip": false,
-    "single_word": false
-  }
-}

hf_space/onnx_model/tokenizer.json DELETED Viewed

The diff for this file is too large to render. See raw diff

hf_space/onnx_model/tokenizer_config.json DELETED Viewed

@@ -1,65 +0,0 @@
-{
-  "added_tokens_decoder": {
-    "0": {
-      "content": "[PAD]",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    },
-    "100": {
-      "content": "[UNK]",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    },
-    "101": {
-      "content": "[CLS]",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    },
-    "102": {
-      "content": "[SEP]",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    },
-    "103": {
-      "content": "[MASK]",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    }
-  },
-  "clean_up_tokenization_spaces": false,
-  "cls_token": "[CLS]",
-  "do_basic_tokenize": true,
-  "do_lower_case": true,
-  "extra_special_tokens": {},
-  "mask_token": "[MASK]",
-  "max_length": 128,
-  "model_max_length": 512,
-  "never_split": null,
-  "pad_to_multiple_of": null,
-  "pad_token": "[PAD]",
-  "pad_token_type_id": 0,
-  "padding_side": "right",
-  "sep_token": "[SEP]",
-  "stride": 0,
-  "strip_accents": null,
-  "tokenize_chinese_chars": true,
-  "tokenizer_class": "BertTokenizer",
-  "truncation_side": "right",
-  "truncation_strategy": "longest_first",
-  "unk_token": "[UNK]"
-}

hf_space/onnx_model/vocab.txt DELETED Viewed

The diff for this file is too large to render. See raw diff

hf_space/processor_bert.py DELETED Viewed

@@ -1,216 +0,0 @@
-"""
-processor_bert_fast.py — ONNX Runtime powered BERT classifier
-Speed: 82 logs/s → 2000+ logs/s
-Kaise kaam karta hai:
-1. ONNX Runtime: Normal PyTorch se 3-5x faster
-2. Batch processing: 64 logs ek saath process
-3. Pre-allocated buffers: Memory waste nahi
-"""
-from __future__ import annotations
-import os
-import numpy as np
-import joblib
# ── Backend selection state — populated lazily by _load_models() ──
_USE_ONNX = False          # flipped to True once an ONNX Runtime session is ready
_classifier = None         # object returned by joblib.load(MODEL_PATH)
_embedding_model = None    # SentenceTransformer instance (fallback backend)
_ort_session = None        # onnxruntime.InferenceSession (fast backend)
_ort_tokenizer = None      # tokenizer paired with the ONNX session

# Directory containing this module; model artefacts are resolved relative to it.
_HERE = os.path.dirname(__file__)
MODEL_PATH = os.path.join(_HERE, 'models', 'log_classifier.joblib')
# NOTE(review): the tokenizer files shipped with this space appear to live under
# "onnx_model/", not "models/onnx" — confirm this path, otherwise the ONNX
# branch is never taken and the PyTorch fallback is always used.
ONNX_DIR = os.path.join(_HERE, 'models', 'onnx')

# Predictions whose top probability is below this are reported as 'Unclassified'.
CONFIDENCE_THRESHOLD = 0.30
# Number of log lines embedded and classified per batch.
DEFAULT_BATCH = 64
def _load_models():
    """Load the classifier and one embedding backend the first time it is needed.

    Later calls return immediately once ``_classifier`` is set. The ONNX
    Runtime backend is tried first when ``ONNX_DIR/model.onnx`` exists; any
    failure while setting it up falls back to the SentenceTransformer
    backend.

    Everything is loaded into locals and published to the module globals
    only at the very end. ``_classifier`` doubles as the "already loaded"
    flag, so assigning it before the embedding backend is ready would leave
    the module permanently half-initialised if a later step raised.

    Raises:
        FileNotFoundError: if ``MODEL_PATH`` does not exist.
    """
    global _USE_ONNX, _embedding_model, _classifier, _ort_session, _ort_tokenizer
    if _classifier is not None:
        return  # Already loaded

    # ── Classifier ─────────────────────────────────────────
    if not os.path.exists(MODEL_PATH):
        raise FileNotFoundError(
            f'Model nahi mila: {MODEL_PATH}\n'
            'Pehle Colab notebook run karo aur model download karo.'
        )
    # NOTE(review): joblib.load unpickles arbitrary objects — only point
    # MODEL_PATH at trusted files.
    classifier = joblib.load(MODEL_PATH)

    # ── Embedding backend: ONNX first (fast), PyTorch as fallback ──
    session = None
    tokenizer = None
    embedding_model = None
    use_onnx = False

    onnx_model_file = os.path.join(ONNX_DIR, 'model.onnx')
    if os.path.exists(onnx_model_file):
        try:
            import onnxruntime as ort
            from transformers import AutoTokenizer

            # CPU-oriented session options
            sess_opts = ort.SessionOptions()
            sess_opts.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL
            # os.cpu_count() may return None; 0 lets ONNX Runtime pick its default.
            sess_opts.intra_op_num_threads = os.cpu_count() or 0
            sess_opts.execution_mode = ort.ExecutionMode.ORT_SEQUENTIAL
            session = ort.InferenceSession(
                onnx_model_file,
                sess_options=sess_opts,
                providers=['CPUExecutionProvider']
            )
            tokenizer = AutoTokenizer.from_pretrained(ONNX_DIR)
            use_onnx = True
            print('[BERT] ✅ ONNX Runtime loaded — FAST MODE')
        except Exception as e:
            # Deliberate best-effort: any ONNX problem degrades to PyTorch.
            print(f'[BERT] ONNX load failed ({e}), fallback to PyTorch')
            session = None
            tokenizer = None
            use_onnx = False

    if not use_onnx:
        from sentence_transformers import SentenceTransformer
        embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
        print('[BERT] ⚠️  PyTorch mode (install ONNX for 3-5x speedup)')

    # ── Publish atomically; _classifier last because it is the loaded flag ──
    _ort_session = session
    _ort_tokenizer = tokenizer
    _embedding_model = embedding_model
    _USE_ONNX = use_onnx
    _classifier = classifier
def _embed_onnx(texts: list[str]) -> np.ndarray:
    """Embed *texts* with the ONNX Runtime session.

    Tokenizes with padding and truncation (max 128 tokens), runs the
    session, mean-pools the first output over the attention mask and
    L2-normalizes each row.

    Args:
        texts: log messages to embed.

    Returns:
        One L2-normalized embedding row per input text.
    """
    # NumPy tensors are requested directly, so no PyTorch import is needed here.
    inputs = _ort_tokenizer(
        texts,
        padding=True,
        truncation=True,
        max_length=128,
        return_tensors='np'
    )

    ort_inputs = {
        'input_ids':      inputs['input_ids'].astype(np.int64),
        'attention_mask': inputs['attention_mask'].astype(np.int64),
    }
    # Only feed token_type_ids when the exported graph declares that input;
    # fall back to zeros if the tokenizer did not produce them.
    graph_input_names = {graph_input.name for graph_input in _ort_session.get_inputs()}
    if 'token_type_ids' in graph_input_names:
        ort_inputs['token_type_ids'] = inputs.get(
            'token_type_ids', np.zeros_like(inputs['input_ids'])
        ).astype(np.int64)

    outputs = _ort_session.run(None, ort_inputs)
    hidden = outputs[0]  # presumably (batch, seq_len, hidden) — depends on the exported model

    # Attention-mask-weighted mean pooling: padded positions contribute nothing.
    mask = inputs['attention_mask'][:, :, None].astype(np.float32)
    summed = (hidden * mask).sum(axis=1)
    # Clamp so an all-zero mask row cannot cause a division by zero.
    counts = np.clip(mask.sum(axis=1), 1e-9, None)
    embeddings = summed / counts

    # L2 normalize; the epsilon keeps zero vectors finite.
    norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
    return embeddings / (norms + 1e-8)
def _embed_pytorch(texts: list[str]) -> np.ndarray:
    """Embed *texts* with the SentenceTransformer fallback backend.

    Returns the result of ``encode`` requested as normalized NumPy output.
    """
    encode_options = {
        'batch_size': DEFAULT_BATCH,
        'convert_to_numpy': True,
        'normalize_embeddings': True,
        'show_progress_bar': False,
    }
    return _embedding_model.encode(texts, **encode_options)
-# ── PUBLIC API ──────────────────────────────────────────────
def classify_with_bert(log_message: str) -> tuple[str, float]:
    """Classify a single log message.

    Thin wrapper that sends a one-element batch through ``classify_batch``.

    Returns:
        ``(label, confidence)`` for the message.
    """
    _load_models()
    single_batch = [log_message]
    return classify_batch(single_batch)[0]
def classify_batch(log_messages: list[str]) -> list[tuple[str, float]]:
    """Classify many log messages, ``DEFAULT_BATCH`` at a time.

    Each chunk is embedded with whichever backend ``_load_models`` selected
    and then scored by the classifier. A prediction whose highest class
    probability is below ``CONFIDENCE_THRESHOLD`` is reported as
    ``'Unclassified'`` (the probability is still returned).

    Returns:
        One ``(label, confidence)`` tuple per input, in input order.

    Example:
        results = classify_batch(['log1', 'log2', 'log3'])
        for label, conf in results:
            print(f'{label}: {conf:.1%}')
    """
    _load_models()
    if not log_messages:
        return []

    embed = _embed_onnx if _USE_ONNX else _embed_pytorch
    classified: list[tuple[str, float]] = []

    for start in range(0, len(log_messages), DEFAULT_BATCH):
        chunk = log_messages[start:start + DEFAULT_BATCH]
        vectors = embed(chunk)

        probabilities = _classifier.predict_proba(vectors)
        top_probabilities = probabilities.max(axis=1)
        predicted_labels = _classifier.predict(vectors)

        classified.extend(
            ('Unclassified', float(prob)) if prob < CONFIDENCE_THRESHOLD
            else (str(predicted), float(prob))
            for predicted, prob in zip(predicted_labels, top_probabilities)
        )

    return classified
def get_classes() -> list[str]:
    """Return the loaded classifier's ``classes_`` as a plain list."""
    _load_models()
    known_classes = _classifier.classes_
    return list(known_classes)
def is_onnx_mode() -> bool:
    """Report whether the ONNX backend is in use (loads models if needed)."""
    _load_models()
    onnx_active = _USE_ONNX
    return onnx_active
-# ── TEST ────────────────────────────────────────────────────
if __name__ == '__main__':
    import time

    # Smoke-test inputs; the last one is expected to come back Unclassified.
    test_logs = [
        'GET /v2/servers/detail HTTP/1.1 status: 404 len: 1583 time: 0.19',
        'System crashed due to driver errors when restarting the server',
        'Multiple login failures occurred on user 6454 account',
        'Admin access escalation detected for user 9429',
        'CPU usage at 98% for the last 10 minutes on node-7',
        'Backup completed successfully.',
        'User User123 logged in.',
        'Data replication task for shard 14 did not complete',
        'Hey bro chill ya!',
    ]

    print('Single log test:')
    for entry in test_logs:
        predicted, confidence = classify_with_bert(entry)
        print(f'  [{confidence:.0%}] {predicted:25s} | {entry[:60]}')

    mode_name = "ONNX 🚀" if is_onnx_mode() else "PyTorch"
    print(f'\nMode: {mode_name}')

    # Throughput check on a larger repeated batch.
    big_batch = test_logs * 100
    started = time.perf_counter()
    classify_batch(big_batch)
    elapsed = time.perf_counter() - started
    batch_size = len(big_batch)
    print(f'\nSpeed: {batch_size/elapsed:.0f} logs/s  ({elapsed*1000/batch_size:.1f}ms/log)')

hf_space/processor_llm.py DELETED Viewed

@@ -1,192 +0,0 @@
-"""
-processor_llm.py — Tier 3: LLM-based Classifier
-Used for:
-  - LegacyCRM logs (Workflow Error, Deprecation Warning)
-  - BERT fallback when confidence < threshold
-Production hardening in V3:
-  - Timeout (configurable, default 5s)
-  - Retry with exponential backoff (max 2 retries)
-  - Explicit failure modes: returns "Unclassified" on all error paths
-  - Caching for repeated log patterns (hash-based, in-memory)
-  - Token budget enforcement (max_tokens=15)
-"""
-from __future__ import annotations
-import os
-import re
-import time
-import hashlib
-import logging
-from functools import lru_cache
-from typing import Optional
# Module-level logger; handlers/levels are left to the application.
logger = logging.getLogger(__name__)
# ── Config ─────────────────────────────────────────────────────────────────
# API token read once at import time; classify_with_llm returns "Unclassified"
# without calling the API when this is unset.
HF_TOKEN   = os.getenv("HF_TOKEN")
# Model id passed to the chat-completions call.
LLM_MODEL  = "mistralai/Mistral-7B-Instruct-v0.3"
# The only labels this tier may emit (besides "Unclassified").
VALID_CATEGORIES = ["Workflow Error", "Deprecation Warning"]
# Retry / timeout config
MAX_RETRIES     = 2     # retries after the initial attempt
RETRY_DELAY_SEC = 1.0   # doubles on each retry (exponential backoff)
REQUEST_TIMEOUT = 5     # seconds — fail fast, do not hang pipeline
# In-memory cache to avoid redundant LLM calls for repeated logs.
# Maps the MD5 hex digest of the stripped log message to its label.
_RESPONSE_CACHE: dict[str, str] = {}
MAX_CACHE_SIZE = 1000  # evict oldest when full (simple FIFO)
# System message sent with every request; the model must answer with the
# bare category name so _normalize can match it.
SYSTEM_PROMPT = (
    "You are an enterprise log classifier. "
    "Classify log messages into exactly one category. "
    "Return ONLY the category name — no explanation, no punctuation."
)
# Few-shot examples prepended to every user prompt, in this order.
FEW_SHOT_EXAMPLES = [
    {
        "log":   "Case escalation for ticket ID 7324 failed because the assigned support agent is no longer active.",
        "label": "Workflow Error",
    },
    {
        "log":   "The 'BulkEmailSender' feature is no longer supported. Use 'EmailCampaignManager' instead.",
        "label": "Deprecation Warning",
    },
    {
        "log":   "Invoice generation aborted for order ID 8910 due to invalid tax calculation module.",
        "label": "Workflow Error",
    },
]
-# ── Cache helpers ────────────────────────────────────────────────────────────
-def _cache_key(log_msg: str) -> str:
-    return hashlib.md5(log_msg.strip().encode()).hexdigest()
def _cache_get(log_msg: str) -> Optional[str]:
    """Look up the cached label for *log_msg*; ``None`` when absent."""
    key = _cache_key(log_msg)
    return _RESPONSE_CACHE.get(key)
def _cache_set(log_msg: str, label: str) -> None:
    """Store *label* for *log_msg*, evicting the oldest entry when full.

    Eviction is FIFO by insertion order and happens only when a *new* key
    is about to be added. Overwriting an existing key does not grow the
    cache, so it must not push out an unrelated entry.
    """
    key = _cache_key(log_msg)
    is_new_key = key not in _RESPONSE_CACHE
    if is_new_key and len(_RESPONSE_CACHE) >= MAX_CACHE_SIZE:
        # dicts preserve insertion order, so the first key is the oldest.
        oldest = next(iter(_RESPONSE_CACHE))
        del _RESPONSE_CACHE[oldest]
    _RESPONSE_CACHE[key] = label
def get_cache_stats() -> dict:
    """Return the current entry count and the configured capacity of the cache."""
    current_size = len(_RESPONSE_CACHE)
    return dict(size=current_size, max_size=MAX_CACHE_SIZE)
-# ── Prompt builder ───────────────────────────────────────────────────────────
def _build_messages(log_msg: str) -> list[dict]:
    """Build the chat payload: system prompt plus one few-shot user prompt.

    The user prompt lists the valid categories, repeats every entry of
    ``FEW_SHOT_EXAMPLES`` as a ``Log:`` / ``Category:`` pair and ends with
    the target log and an open ``Category:`` for the model to complete.
    """
    quoted_categories = ", ".join(f'"{c}"' for c in VALID_CATEGORIES)
    instructions = (
        f'Classify the following log into one of these categories: {quoted_categories}.\n'
        'If none fits, return "Unclassified".\n\n'
    )
    examples = "".join(
        f'Log: {ex["log"]}\nCategory: {ex["label"]}\n\n'
        for ex in FEW_SHOT_EXAMPLES
    )
    question = f"Log: {log_msg}\nCategory:"

    system_message = {"role": "system", "content": SYSTEM_PROMPT}
    user_message = {"role": "user",   "content": instructions + examples + question}
    return [system_message, user_message]
-# ── Normalize raw LLM output ─────────────────────────────────────────────────
def _normalize(raw: str) -> str:
    """Map raw model output onto a valid category, else ``'Unclassified'``.

    Surrounding whitespace and quotes are removed, then the first entry of
    ``VALID_CATEGORIES`` found as a case-insensitive substring wins.
    """
    cleaned = raw.strip().strip('"').strip("'").lower()
    return next(
        (category for category in VALID_CATEGORIES if category.lower() in cleaned),
        "Unclassified",
    )
-# ── Main classify function ────────────────────────────────────────────────────
def classify_with_llm(log_msg: str) -> str:
    """Classify *log_msg* with the hosted LLM (tier 3).

    Order of operations:
      1. Return the cached label if this message was already classified.
      2. Return "Unclassified" when ``HF_TOKEN`` is not configured.
      3. Call the chat-completions endpoint, making up to
         ``MAX_RETRIES + 1`` attempts with a delay that doubles after each
         failure (starting at ``RETRY_DELAY_SEC``).
      4. Return "Unclassified" if every attempt fails.

    Only successful responses are cached.
    """
    # ── Cache hit ────────────────────────────────────────────────────────────
    hit = _cache_get(log_msg)
    if hit is not None:
        logger.debug(f"[LLM] Cache hit for: {log_msg[:60]}")
        return hit

    # ── No credentials: fail closed without touching the network ─────────────
    if not HF_TOKEN:
        logger.warning("[LLM] HF_TOKEN not set — returning Unclassified")
        return "Unclassified"

    # ── Inference with retry ─────────────────────────────────────────────────
    from huggingface_hub import InferenceClient
    client = InferenceClient(token=HF_TOKEN, timeout=REQUEST_TIMEOUT)

    total_attempts = MAX_RETRIES + 1   # the initial try plus the retries
    delay = RETRY_DELAY_SEC
    for attempt in range(1, total_attempts + 1):
        try:
            response = client.chat.completions.create(
                model=LLM_MODEL,
                messages=_build_messages(log_msg),
                max_tokens=15,
                temperature=0.1,
            )
            raw = response.choices[0].message.content
            label = _normalize(raw)
            _cache_set(log_msg, label)
            logger.debug(f"[LLM] Attempt {attempt}: '{raw.strip()}' → '{label}'")
            return label
        except Exception as e:
            # Deliberately broad: every failure mode must degrade to "Unclassified".
            if attempt > MAX_RETRIES:
                logger.error(f"[LLM] All {MAX_RETRIES + 1} attempts failed. Last error: {e}")
                continue
            logger.warning(f"[LLM] Attempt {attempt} failed ({e}), retrying in {delay:.1f}s…")
            time.sleep(delay)
            delay *= 2  # exponential backoff

    return "Unclassified"
-# ── Batch classify (serial — LLM is already rate-limited) ────────────────────
def classify_batch_llm(log_msgs: list[str]) -> list[str]:
    """Run ``classify_with_llm`` over *log_msgs* one at a time, in order."""
    labels: list[str] = []
    for message in log_msgs:
        labels.append(classify_with_llm(message))
    return labels
-# ── CLI test ─────────────────────────────────────────────────────────────────
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)

    # One example per category; the last one is expected to be Unclassified.
    samples = [
        "Case escalation for ticket ID 7324 failed because the assigned support agent is no longer active.",
        "The 'ReportGenerator' module will be retired in version 4.0. Migrate to 'AdvancedAnalyticsSuite'.",
        "System reboot initiated by user 12345.",
    ]
    for sample in samples:
        result = classify_with_llm(sample)
        print(f"{result:25s} | {sample[:80]}")

    # Re-classify the first sample to time the cache path.
    print("\n── Cache hit test ──")
    started = time.perf_counter()
    classify_with_llm(samples[0])
    finished = time.perf_counter()
    print(f"Cache hit latency: {(finished-started)*1000:.2f}ms")
    print(f"Cache stats: {get_cache_stats()}")

hf_space/processor_regex.py DELETED Viewed

@@ -1,220 +0,0 @@
-"""
-processor_regex.py — Tier 1: Rule-based Classifier
-Target coverage: 40%+ (up from 15%)
-Latency: sub-millisecond per log
-New pattern groups added:
-  - HTTP request/response logs   (was completely missing!)
-  - Auth / credential events     (login failures, MFA, lockouts)
-  - System/infra events          (disk, CPU, memory, cron)
-  - Network / firewall events    (IP block, port scan)
-  - Structured error codes       (ERROR, CRITICAL prefix logs)
-"""
-from __future__ import annotations
-import re
-import time
-from typing import Optional
-# ---------------------------------------------------------------------------
-# Pattern registry: (compiled_pattern, label)
-# Order matters — more specific patterns FIRST to avoid mis-labeling.
-# ---------------------------------------------------------------------------
# Each entry is (regex source, label). The list is compiled case-insensitively
# into REGEX_PATTERNS, and classify_with_regex returns the label of the FIRST
# pattern that matches, so the position of an entry is part of its behavior.
_RAW_PATTERNS: list[tuple[str, str]] = [
    # ── HTTP Status ─────────────────────────────────────────────────────────
    # Covers: GET/POST/PUT/DELETE/PATCH + status code in request line
    (r"\b(GET|POST|PUT|DELETE|PATCH|HEAD|OPTIONS)\s+\S+\s+HTTP/\d", "HTTP Status"),
    # Nova / OpenStack style
    (r"nova\.\S+\s+(GET|POST|PUT|DELETE)\s+\S+\s+HTTP/\d", "HTTP Status"),
    # Status code only style: "returned HTTP 200" or "status: 404"
    # NOTE(review): "status" followed by any 3 digits is broad and sits first
    # in the list, so it wins over every later group — confirm this is intended.
    (r"\bstatus[:\s]+\d{3}\b", "HTTP Status"),
    (r"\breturned\s+HTTP\s+\d{3}\b", "HTTP Status"),
    (r"\bHTTP\s+status\s+code\s*[:-]?\s*\d{3}\b", "HTTP Status"),
    # API response style
    (r"\bAPI\s+(call|request)\s+\S+\s+completed\s+with\s+status\s+\d{3}", "HTTP Status"),
    (r"\bEndpoint\s+\S+\s+responded\s+with\s+code\s+\d{3}", "HTTP Status"),
    # ── Security Alert ──────────────────────────────────────────────────────
    # Brute force / login failures
    (r"(multiple\s+)?(bad\s+|failed?\s+)?login\s+(failure|attempt|failures)", "Security Alert"),
    (r"brute[\s_-]force\s+(login|attack|attempt)", "Security Alert"),
    # Unauthorized access
    (r"unauthorized\s+(access|admin|privilege|attempt)", "Security Alert"),
    (r"access\s+denied\s+(for|to)\s+(user|ip|host)", "Security Alert"),
    # Privilege escalation
    (r"(admin\s+)?access\s+escalation\s+detected", "Security Alert"),
    (r"privilege\s+(elev|escalat)", "Security Alert"),
    # IP blocking / suspicious traffic
    (r"IP\s+\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}\s+blocked", "Security Alert"),
    (r"(suspicious|anomalous)\s+(login|traffic|activity|request)", "Security Alert"),
    (r"potential\s+(DDoS|attack|breach|intrusion)", "Security Alert"),
    (r"security\s+breach\s+suspected", "Security Alert"),
    (r"(API\s+security\s+breach|bypass\s+API\s+security)", "Security Alert"),
    (r"port\s+scan\s+(detected|attempt)", "Security Alert"),
    # ── User Action ─────────────────────────────────────────────────────────
    (r"User\s+\w+\d*\s+logged\s+(in|out)", "User Action"),
    (r"Account\s+(with\s+)?ID\s+\S+\s+created\s+by", "User Action"),
    (r"User\s+\w+\d*\s+(updated\s+profile|changed\s+password|enabled\s+two|downloaded|exported)", "User Action"),
    (r"(New\s+user|user\s+\w+\d*)\s+registered", "User Action"),
    (r"Account\s+\S+\s+deleted\s+by\s+(administrator|admin)", "User Action"),
    (r"User\s+\w+\d*\s+(tried|attempted)", "User Action"),
    # ── System Notification ─────────────────────────────────────────────────
    # Backup events
    (r"Backup\s+(started|ended|completed\s+successfully|failed|aborted)", "System Notification"),
    (r"System\s+updated\s+to\s+version", "System Notification"),
    (r"File\s+\S+\s+uploaded\s+successfully\s+by\s+user", "System Notification"),
    (r"Disk\s+cleanup\s+completed\s+successfully", "System Notification"),
    (r"System\s+reboot\s+initiated\s+by\s+user", "System Notification"),
    (r"Scheduled\s+maintenance\s+(started|completed)", "System Notification"),
    (r"Service\s+\w+\s+restarted\s+successfully", "System Notification"),
    # Cache, cron, health check, certificates, log rotation
    (r"Cache\s+cleared\s+successfully", "System Notification"),
    (r"Log\s+rotation\s+completed", "System Notification"),
    (r"Health\s+check\s+(passed|failed)\s+for\s+service", "System Notification"),
    # NOTE(review): the next two patterns require a trailing "successfully"
    # even for the "expired"/"revoked"/"failed" alternatives, which looks
    # unintended — verify against real log samples.
    (r"Certificate\s+(renewed|expired|revoked)\s+successfully", "System Notification"),
    (r"Cron\s+job\s+\S+\s+(executed|failed|completed)\s+successfully", "System Notification"),
    (r"(Disk|Storage)\s+(usage|space)\s+(at|reached|exceeded)\s+\d+%", "System Notification"),
    (r"CPU\s+usage\s+at\s+\d+%", "System Notification"),
    (r"Memory\s+(usage|limit)\s+(at|reached|exceeded)\s+\d+%", "System Notification"),
    # Deployment / config
    (r"Deployment\s+(of|for)\s+\S+\s+(completed|failed|started)", "System Notification"),
    (r"Configuration\s+(reloaded|updated|applied)\s+successfully", "System Notification"),
    # ── Error ───────────────────────────────────────────────────────────────
    # NOTE(review): this group is listed before "Critical Error", so a line
    # such as "CRITICAL ERROR: ... failed" is labelled "Error" by the first
    # pattern below — confirm whether Critical Error should take precedence.
    (r"\bERROR\b.*\b(exception|failed|failure|crash|timeout|unavailable)\b", "Error"),
    (r"System\s+crashed\s+due\s+to", "Error"),
    (r"(connection|request|task|job)\s+(timed?\s*out|timeout)", "Error"),
    (r"service\s+\S+\s+(is\s+down|unavailable|unreachable)", "Error"),
    (r"database\s+connection\s+(failed|refused|lost|dropped)", "Error"),
    (r"disk\s+(I/O\s+)?failure", "Error"),
    (r"driver\s+error(s)?\s+(when|during|on)", "Error"),
    (r"(replication|sync)\s+task\s+(did\s+not\s+complete|failed)", "Error"),
    (r"null\s+pointer|segmentation\s+fault|stack\s+overflow", "Error"),
    # ── Critical Error ──────────────────────────────────────────────────────
    (r"\bCRITICAL\b", "Critical Error"),
    # No leading \b here, so this also matches inside longer words ending in
    # FATAL/PANIC.
    (r"(FATAL|PANIC)\b", "Critical Error"),
    (r"(data\s+loss|data\s+corruption)\s+(detected|occurred)", "Critical Error"),
    (r"(cluster|node|shard)\s+(failure|crashed|went\s+down)", "Critical Error"),
    (r"(catastrophic|unrecoverable)\s+(failure|error)", "Critical Error"),
    (r"kernel\s+panic", "Critical Error"),
    (r"out[\s-]of[\s-](memory|disk)\s+(error|killed|OOM)", "Critical Error"),
]
-# Pre-compile all patterns at import time (not per-call)
# Compiled once at import, in the same order as _RAW_PATTERNS, so matching
# never pays the compile cost per call.
REGEX_PATTERNS: list[tuple[re.Pattern, str]] = [
    (re.compile(entry[0], flags=re.IGNORECASE), entry[1])
    for entry in _RAW_PATTERNS
]
def classify_with_regex(log_message: str) -> Optional[str]:
    """Tier 1 rule-based classifier.

    Scans ``REGEX_PATTERNS`` in order and returns the label attached to the
    first pattern found anywhere in *log_message*.

    Returns:
        The matching label, or ``None`` when no pattern matches.
    """
    matching_labels = (
        label for pattern, label in REGEX_PATTERNS if pattern.search(log_message)
    )
    return next(matching_labels, None)
def get_regex_coverage(log_messages: list[str]) -> dict:
    """Report how many messages the regex tier labels, with a per-label tally.

    Returns:
        A dict with ``total``, ``matched``, ``missed``, ``coverage_pct``
        (rounded to two decimals, ``0.0`` for empty input) and
        ``label_breakdown`` (label -> count).
    """
    breakdown: dict[str, int] = {}
    unmatched = 0
    for message in log_messages:
        hit = classify_with_regex(message)
        if not hit:
            unmatched += 1
            continue
        breakdown[hit] = breakdown.get(hit, 0) + 1

    total = len(log_messages)
    matched = total - unmatched
    if total:
        coverage = round(matched / total * 100, 2)
    else:
        coverage = 0.0

    return {
        "total":        total,
        "matched":      matched,
        "missed":       unmatched,
        "coverage_pct": coverage,
        "label_breakdown": breakdown,
    }
def benchmark_regex(log_messages: list[str], runs: int = 3) -> dict:
    """Measure per-message latency of ``classify_with_regex``.

    Every message is timed individually on each of *runs* passes; the
    pooled samples are summarised in milliseconds.

    Args:
        log_messages: messages to classify.
        runs: number of passes over *log_messages*.

    Returns:
        A dict with ``p50_ms``, ``p95_ms``, ``p99_ms`` and ``mean_ms``,
        each rounded to four decimals. When nothing was timed (empty
        input or ``runs <= 0``) every value is ``0.0`` rather than
        raising from ``statistics`` on an empty sample.
    """
    import statistics

    per_log_ms: list[float] = []
    for _ in range(runs):
        for msg in log_messages:
            t0 = time.perf_counter()
            classify_with_regex(msg)
            per_log_ms.append((time.perf_counter() - t0) * 1000)

    if not per_log_ms:
        return {"p50_ms": 0.0, "p95_ms": 0.0, "p99_ms": 0.0, "mean_ms": 0.0}

    per_log_ms.sort()
    sample_count = len(per_log_ms)
    # Nearest-rank style lookup: int(n * q) is always < n for q < 1.
    p95 = per_log_ms[int(sample_count * 0.95)]
    p99 = per_log_ms[int(sample_count * 0.99)]
    return {
        "p50_ms":  round(statistics.median(per_log_ms), 4),
        "p95_ms":  round(p95, 4),
        "p99_ms":  round(p99, 4),
        "mean_ms": round(statistics.mean(per_log_ms), 4),
    }
-# ── CLI self-test ────────────────────────────────────────────────────────────
if __name__ == "__main__":
    # (log line, expected label) pairs; None means "no pattern should match".
    test_cases: list[tuple[str, str]] = [
        # HTTP
        ("GET /api/v2/resource HTTP/1.1 status: 200 len: 1583 time: 0.19", "HTTP Status"),
        ("POST /v1/users HTTP/1.1 status: 201 len: 42 time: 0.05", "HTTP Status"),
        ("nova.osapi_compute.wsgi.server GET /v2/servers/detail HTTP/1.1 status: 404", "HTTP Status"),
        # Security
        ("Multiple login failures occurred on user 6454 account", "Security Alert"),
        ("IP 192.168.133.114 blocked due to potential attack", "Security Alert"),
        ("Brute force login attempt from 10.0.0.5 detected", "Security Alert"),
        ("Admin access escalation detected for user 9429", "Security Alert"),
        # User Action
        ("User User12345 logged in.", "User Action"),
        ("Account with ID 456 created by Admin.", "User Action"),
        # System Notification
        ("Backup completed successfully.", "System Notification"),
        ("CPU usage at 98% for the last 10 minutes on node-7", "System Notification"),
        ("Health check passed for service payments-api", "System Notification"),
        # Error
        ("System crashed due to disk I/O failure on node-3", "Error"),
        ("Database connection failed after 3 retries", "Error"),
        # Critical
        ("CRITICAL: data corruption detected on shard-14", "Critical Error"),
        ("kernel panic: not syncing: VFS: unable to mount root fs", "Critical Error"),
        # Should be None (unmatched)
        ("The 'BulkEmailSender' feature will be deprecated in v5.0.", None),
        ("Case escalation for ticket 7324 failed.", None),
    ]

    # Accuracy table
    print(f"{'Expected':<22} {'Got':<22} {'✓/✗'} | Log")
    print("─" * 100)
    correct = 0
    for line, wanted in test_cases:
        actual = classify_with_regex(line)
        passed = actual == wanted
        if passed:
            correct += 1
        icon = "✓" if passed else "✗"
        print(f"{str(wanted):<22} {str(actual):<22} {icon}   | {line[:55]}")
    print(f"\n{correct}/{len(test_cases)} correct")

    # Coverage demo
    all_logs = [line for line, _ in test_cases]
    cov = get_regex_coverage(all_logs)
    print(f"\nCoverage: {cov['coverage_pct']}%  ({cov['matched']}/{cov['total']} matched)")
    print("Label breakdown:", cov["label_breakdown"])

    # Latency benchmark
    lat = benchmark_regex(all_logs * 100)
    print(f"\nLatency (p50/p95/p99): {lat['p50_ms']}ms / {lat['p95_ms']}ms / {lat['p99_ms']}ms")

hf_space/requirements.txt DELETED Viewed

@@ -1,25 +0,0 @@
-# Core
-gradio>=4.44.0
-pandas>=2.0.0
-numpy>=1.24.0
-joblib>=1.3.0
-scikit-learn>=1.3.0
-# Embedding + BERT
-sentence-transformers>=2.7.0
-transformers>=4.38.0
-# ONNX (optional, 3-5x speedup)
-onnxruntime>=1.17.0
-optimum[onnxruntime]>=1.16.0
-# LLM
-huggingface-hub>=0.21.0
-# FastAPI (production API)
-fastapi>=0.110.0
-uvicorn[standard]>=0.29.0
-pydantic>=2.0.0
-# Observability
-psutil>=5.9.0