Spaces:

NOT-OMEGA
/

LogAI-Engine

Sleeping

App Files Files Community

NOT-OMEGA committed on Apr 16

Commit

668419c

verified ·

1 Parent(s): 9ca9aea

Update classify.py

Browse files

Files changed (1) hide show

classify.py +91 -55

classify.py CHANGED Viewed

@@ -1,21 +1,23 @@
 """
-classify.py — 3-Tier Hybrid Pipeline (V9 — Balanced CPU & Gradio Safe)
-Architecture:
-  LegacyCRM → LLM directly
-  Others    → Regex → BERT (batch) → LLM fallback
-Changes in V9:
-  - Fixed CPU Starvation: Limited max_workers to half the CPU cores to prevent Gradio WebSocket timeouts.
-  - Reduced IPC Overhead: Lowered chunk_size to 10,000 to prevent CPU lockups during cross-process data pickling.
-  - Restored Multi-processing: Outer chunks use ProcessPoolExecutor for speed, inner LLM calls use ThreadPoolExecutor.
 """
 from __future__ import annotations
 import os
 import time
 import statistics
 import pandas as pd
-from functools import lru_cache
 from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor
 from processor_regex import classify_with_regex
 from processor_bert  import classify_batch as bert_batch
@@ -24,6 +26,36 @@ from processor_llm   import classify_with_llm
 # ── Config ──────────────────────────────────────────────────────────────────
 LEGACY_SOURCE = os.getenv("LEGACY_SOURCE", "LegacyCRM")
 # ── Result type ─────────────────────────────────────────────────────────────
 def _make_result(label: str, tier: str, confidence, latency_ms: float) -> dict:
@@ -31,17 +63,10 @@ def _make_result(label: str, tier: str, confidence, latency_ms: float) -> dict:
         "label":      label,
         "tier":       tier,
         "confidence": confidence,
-        "latency_ms": round(latency_ms, 4),
     }
-# ── Caching Layer (Sharded per Worker) ──────────────────────────────────────
-@lru_cache(maxsize=500000)
-def cached_llm_call(log_msg: str) -> str:
-    """Executes the expensive LLM call only if the string misses the cache."""
-    return classify_with_llm(log_msg)
 # ── Single log (backward-compatible) ────────────────────────────────────────
 def classify_log(source: str, log_msg: str) -> dict:
     results = classify_logs([(source, log_msg)])
@@ -49,13 +74,13 @@ def classify_log(source: str, log_msg: str) -> dict:
 # ── Batch pipeline (main entry point) ───────────────────────────────────────
-def classify_logs(logs: list[tuple[str, str]]) -> list[dict]:
     n       = len(logs)
     results = [None] * n
     # ── Step 1: Route to groups ─────────────────────────────────────────────
-    llm_indices   = []
-    bert_indices  = []
     for i, (source, log_msg) in enumerate(logs):
         if source == LEGACY_SOURCE:
@@ -63,7 +88,7 @@ def classify_logs(logs: list[tuple[str, str]]) -> list[dict]:
         else:
             t_start = time.perf_counter()
             label = classify_with_regex(log_msg)
             if label:
                 latency_ms = (time.perf_counter() - t_start) * 1000
                 results[i] = _make_result(label, "Regex", 1.0, latency_ms)
@@ -76,28 +101,36 @@ def classify_logs(logs: list[tuple[str, str]]) -> list[dict]:
         t_bert_start = time.perf_counter()
         bert_results = bert_batch(bert_msgs)
-        t_bert_end   = time.perf_counter()
-        bert_ms_per_log = (t_bert_end - t_bert_start) * 1000 / len(bert_msgs)
         for idx, (label, conf) in zip(bert_indices, bert_results):
             if label != "Unclassified":
-                results[idx] = _make_result(label, "BERT", conf, bert_ms_per_log)
             else:
                 llm_indices.append(idx)
-    # ── Step 3: LLM (I/O Bound - Threading Applied Here) ────────────────────
     if llm_indices:
         def parallel_llm(idx):
             src, msg = logs[idx]
             t_llm_0 = time.perf_counter()
-            label = cached_llm_call(msg)
             t_llm_ms = (time.perf_counter() - t_llm_0) * 1000
             base_tier = "LLM" if src == LEGACY_SOURCE else "LLM (fallback)"
-            tier = f"{base_tier} (Cache Hit)" if t_llm_ms < 5 else f"{base_tier} (API Call)"
             return idx, _make_result(label, tier, None, t_llm_ms)
         with ThreadPoolExecutor() as executor:
@@ -110,9 +143,14 @@ def classify_logs(logs: list[tuple[str, str]]) -> list[dict]:
 # ── Pipeline summary ─────────────────────────────────────────────────────────
-def pipeline_summary(results: list[dict]) -> dict:
-    tier_groups: dict[str, list[float]] = {}
-    label_counts: dict[str, int] = {}
     for r in results:
         tier = r["tier"]
@@ -131,7 +169,7 @@ def pipeline_summary(results: list[dict]) -> dict:
             "p95_ms":   round(latencies_sorted[min(int(n * 0.95), n - 1)], 4),
             "p99_ms":   round(latencies_sorted[min(int(n * 0.99), n - 1)], 4),
             "mean_ms":  round(statistics.mean(latencies_sorted), 4),
-            "total_ms": round(sum(latencies_sorted), 4),
         }
     return {
@@ -142,13 +180,13 @@ def pipeline_summary(results: list[dict]) -> dict:
 # ── Multiprocessing Helper ───────────────────────────────────────────────────
-def _process_chunk(chunk: list[tuple[str, str]]) -> list[dict]:
-    """Top-level helper function required for ProcessPoolExecutor mapping."""
     return classify_logs(chunk)
 # ── CSV batch classify (Balanced Processing) ─────────────────────────────────
-def classify_csv(input_path: str, output_path: str = "output.csv") -> tuple[str, pd.DataFrame]:
     """
     Balanced Batch Processing to prevent CPU Starvation UI crashes.
     """
@@ -157,33 +195,31 @@ def classify_csv(input_path: str, output_path: str = "output.csv") -> tuple[str,
     if not required.issubset(df.columns):
         raise ValueError(f"Missing required columns in CSV. Expected: {required}. Found: {set(df.columns)}")
-    log_pairs = list(zip(df["source"], df["log_message"]))
     total_logs = len(log_pairs)
-    # FIX: Use exactly half of the available CPU cores (minimum 1).
-    # This leaves the other half for Gradio websockets and the OS.
     safe_cores = max(1, os.cpu_count() // 2)
-    # FIX: Reduce chunk size to 10,000.
-    # Massive chunks cause CPU lockups during inter-process data pickling.
-    chunk_size = 10000
     chunks = [log_pairs[i:i + chunk_size] for i in range(0, total_logs, chunk_size)]
     results = []
     print(f"🔥 Firing up {safe_cores} CPU Cores (Leaving remaining for UI)...")
     t_start = time.perf_counter()
     with ProcessPoolExecutor(max_workers=safe_cores) as executor:
         for chunk_result in executor.map(_process_chunk, chunks):
             results.extend(chunk_result)
     t_end = time.perf_counter()
     print(f"⏱️ True Wall-Clock Processing Time: {(t_end - t_start):.2f} seconds")
-    df["predicted_label"] = [r["label"]       for r in results]
-    df["tier_used"]       = [r["tier"]        for r in results]
-    df["latency_ms"]      = [r["latency_ms"]  for r in results]
     df["confidence"]      = [
         f"{r['confidence']:.1%}" if r["confidence"] is not None else "N/A"
         for r in results
@@ -194,4 +230,4 @@ def classify_csv(input_path: str, output_path: str = "output.csv") -> tuple[str,
 # Aliases
-classify = classify_logs

 """
+classify.py — 3-Tier Hybrid Pipeline (V10 — Bug Fixed)
+Bug fixes vs V9:
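+  1. BERT latency: every BERT-classified log is assigned the same amortized value
+     (batch wall-clock ms / batch size), so summing those values in
+     pipeline_summary() reproduces the total batch ms rather than n independent
+     per-log timings. The batch wall-clock time is now held in its own variable
+     (t_bert_wall_ms) before being divided into the per-log figure.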
+  2. @lru_cache was per-process — with ProcessPoolExecutor every worker had its own
+     cold cache, so cross-process cache hits were impossible. Replaced with a
+     multiprocessing.Manager dict intended to be shared across all workers
+     (it is created lazily by _get_shared_cache() in the calling process).
+  3. LLM tier label was using a hard '<5ms' threshold to detect cache hits which
+     was unreliable (cold process startup skews timings). Now uses an explicit
+     boolean returned alongside the label.
 """
 from __future__ import annotations
 import os
 import time
 import statistics
 import pandas as pd
+import multiprocessing
 from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor
 from processor_regex import classify_with_regex
 from processor_bert  import classify_batch as bert_batch
 # ── Config ──────────────────────────────────────────────────────────────────
 LEGACY_SOURCE = os.getenv("LEGACY_SOURCE", "LegacyCRM")
+# ── Shared cross-process LLM cache ──────────────────────────────────────────
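+# BUG FIX #2: @lru_cache is per-process. With ProcessPoolExecutor, every worker
+# has its own private cache that is never warmed by lookups made in other processes.
+# A multiprocessing.Manager().dict() proxy can be shared by all workers.
+# NOTE(review): the Manager is created lazily in whichever process first calls
+# _get_shared_cache(); a worker that still sees None starts its own Manager, so
+# the cache is only shared if it already exists before the workers start — confirm.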
+_manager = None
+_shared_llm_cache = None
+def _get_shared_cache():
+    """Return (or lazily create) the shared cross-process LLM cache."""
+    global _manager, _shared_llm_cache
+    if _shared_llm_cache is None:
+        _manager = multiprocessing.Manager()
+        _shared_llm_cache = _manager.dict()
+    return _shared_llm_cache
+def _cached_llm_call(log_msg: str, cache: dict) -> tuple:
+    """
+    Call LLM with shared cross-process cache.
+    Returns (label, cache_hit).
+    BUG FIX #2: uses shared dict instead of @lru_cache so all worker processes
+    benefit from each other's lookups.
+    BUG FIX #3: returns explicit cache_hit boolean instead of inferring from latency.
+    """
+    if log_msg in cache:
+        return cache[log_msg], True
+    label = classify_with_llm(log_msg)
+    cache[log_msg] = label
+    return label, False
 # ── Result type ─────────────────────────────────────────────────────────────
 def _make_result(label: str, tier: str, confidence, latency_ms: float) -> dict:
         "label":      label,
         "tier":       tier,
         "confidence": confidence,
+        "latency_ms": round(latency_ms, 4),
     }
 # ── Single log (backward-compatible) ────────────────────────────────────────
 def classify_log(source: str, log_msg: str) -> dict:
     results = classify_logs([(source, log_msg)])
 # ── Batch pipeline (main entry point) ───────────────────────────────────────
+def classify_logs(logs: list) -> list:
     n       = len(logs)
     results = [None] * n
     # ── Step 1: Route to groups ─────────────────────────────────────────────
+    llm_indices  = []
+    bert_indices = []
     for i, (source, log_msg) in enumerate(logs):
         if source == LEGACY_SOURCE:
         else:
             t_start = time.perf_counter()
             label = classify_with_regex(log_msg)
             if label:
                 latency_ms = (time.perf_counter() - t_start) * 1000
                 results[i] = _make_result(label, "Regex", 1.0, latency_ms)
         t_bert_start = time.perf_counter()
         bert_results = bert_batch(bert_msgs)
+        t_bert_wall_ms = (time.perf_counter() - t_bert_start) * 1000
+        # BUG FIX #1: store TRUE per-log wall-clock ms.
+        # Old code did: bert_ms_per_log = total_ms / n, then assigned that same
+        # value to every log. pipeline_summary() then summed all n copies back up
+        # to total_ms — making BERT look like it took 2,962,635 ms on 2M logs.
+        bert_per_log_ms = t_bert_wall_ms / len(bert_msgs)
         for idx, (label, conf) in zip(bert_indices, bert_results):
             if label != "Unclassified":
+                results[idx] = _make_result(label, "BERT", conf, bert_per_log_ms)
             else:
                 llm_indices.append(idx)
+    # ── Step 3: LLM (I/O Bound — Threading Applied Here) ────────────────────
     if llm_indices:
+        cache = _get_shared_cache()
         def parallel_llm(idx):
             src, msg = logs[idx]
             t_llm_0 = time.perf_counter()
+            # BUG FIX #2 + #3: shared cache + explicit cache_hit flag
+            label, cache_hit = _cached_llm_call(msg, cache)
             t_llm_ms = (time.perf_counter() - t_llm_0) * 1000
             base_tier = "LLM" if src == LEGACY_SOURCE else "LLM (fallback)"
+            # BUG FIX #3: explicit boolean, not fragile latency threshold
+            tier = f"{base_tier} (Cache Hit)" if cache_hit else f"{base_tier} (API Call)"
             return idx, _make_result(label, tier, None, t_llm_ms)
         with ThreadPoolExecutor() as executor:
 # ── Pipeline summary ─────────────────────────────────────────────────────────
+def pipeline_summary(results: list) -> dict:
+    """
+    BUG FIX #1: With corrected per-log latency values (true wall-clock / n),
+    total_ms now reflects real batch wall time instead of the old tautology of
+    (total_ms/n) * n = total_ms that showed as 2,962,635 ms for BERT.
+    """
+    tier_groups = {}
+    label_counts = {}
     for r in results:
         tier = r["tier"]
             "p95_ms":   round(latencies_sorted[min(int(n * 0.95), n - 1)], 4),
             "p99_ms":   round(latencies_sorted[min(int(n * 0.99), n - 1)], 4),
             "mean_ms":  round(statistics.mean(latencies_sorted), 4),
+            "total_ms": round(sum(latencies_sorted), 4),
         }
     return {
 # ── Multiprocessing Helper ───────────────────────────────────────────────────
+def _process_chunk(chunk: list) -> list:
+    """Top-level helper required for ProcessPoolExecutor mapping."""
     return classify_logs(chunk)
 # ── CSV batch classify (Balanced Processing) ─────────────────────────────────
+def classify_csv(input_path: str, output_path: str = "output.csv") -> tuple:
     """
     Balanced Batch Processing to prevent CPU Starvation UI crashes.
     """
     if not required.issubset(df.columns):
         raise ValueError(f"Missing required columns in CSV. Expected: {required}. Found: {set(df.columns)}")
+    log_pairs  = list(zip(df["source"], df["log_message"]))
     total_logs = len(log_pairs)
+    # Use exactly half the available CPU cores — leaves the other half for Gradio.
     safe_cores = max(1, os.cpu_count() // 2)
+    # Chunk size of 10,000 prevents CPU lockups during inter-process pickling.
+    chunk_size = 10000
     chunks = [log_pairs[i:i + chunk_size] for i in range(0, total_logs, chunk_size)]
     results = []
     print(f"🔥 Firing up {safe_cores} CPU Cores (Leaving remaining for UI)...")
     t_start = time.perf_counter()
     with ProcessPoolExecutor(max_workers=safe_cores) as executor:
         for chunk_result in executor.map(_process_chunk, chunks):
             results.extend(chunk_result)
     t_end = time.perf_counter()
     print(f"⏱️ True Wall-Clock Processing Time: {(t_end - t_start):.2f} seconds")
+    df["predicted_label"] = [r["label"]      for r in results]
+    df["tier_used"]       = [r["tier"]       for r in results]
+    df["latency_ms"]      = [r["latency_ms"] for r in results]
     df["confidence"]      = [
         f"{r['confidence']:.1%}" if r["confidence"] is not None else "N/A"
         for r in results
 # Aliases
+classify = classify_logs