Spaces:

NOT-OMEGA
/

LogAI-Engine

Sleeping

App Files Files Community

NOT-OMEGA committed on Apr 15

Commit

ca8312a

verified ·

1 Parent(s): 3812273

Update classify.py

Browse files

Files changed (1) hide show

classify.py +20 -77

classify.py CHANGED Viewed

@@ -1,14 +1,14 @@
 """
-classify.py — 3-Tier Hybrid Pipeline (V9 — Balanced CPU & Gradio Safe)
 Architecture:
   LegacyCRM → LLM directly
   Others    → Regex → BERT (batch) → LLM fallback
-Changes in V9:
-  - Fixed CPU Starvation: Limited max_workers to half the CPU cores to prevent Gradio WebSocket timeouts.
-  - Reduced IPC Overhead: Lowered chunk_size to 10,000 to prevent CPU lockups during cross-process data pickling.
-  - Restored Multi-processing: Outer chunks use ProcessPoolExecutor for speed, inner LLM calls use ThreadPoolExecutor.
 """
 from __future__ import annotations
 import os
@@ -16,7 +16,7 @@ import time
 import statistics
 import pandas as pd
 from functools import lru_cache
-from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor
 from processor_regex import classify_with_regex
 from processor_bert  import classify_batch as bert_batch
 from processor_llm   import classify_with_llm
@@ -24,6 +24,8 @@ from processor_llm   import classify_with_llm
 # ── Config ──────────────────────────────────────────────────────────────────
 LEGACY_SOURCE = os.getenv("LEGACY_SOURCE", "LegacyCRM")
 # ── Result type ─────────────────────────────────────────────────────────────
 def _make_result(label: str, tier: str, confidence, latency_ms: float) -> dict:
@@ -34,26 +36,22 @@ def _make_result(label: str, tier: str, confidence, latency_ms: float) -> dict:
         "latency_ms": round(latency_ms, 4),
     }
-# ── Caching Layer (Sharded per Worker) ──────────────────────────────────────
 @lru_cache(maxsize=500000)
 def cached_llm_call(log_msg: str) -> str:
     """Executes the expensive LLM call only if the string misses the cache."""
     return classify_with_llm(log_msg)
 # ── Single log (backward-compatible) ────────────────────────────────────────
 def classify_log(source: str, log_msg: str) -> dict:
     results = classify_logs([(source, log_msg)])
     return results[0]
 # ── Batch pipeline (main entry point) ───────────────────────────────────────
 def classify_logs(logs: list[tuple[str, str]]) -> list[dict]:
     n       = len(logs)
     results = [None] * n
-    # ── Step 1: Route to groups ─────────────────────────────────────────────
     llm_indices   = []
     bert_indices  = []
@@ -70,7 +68,7 @@ def classify_logs(logs: list[tuple[str, str]]) -> list[dict]:
             else:
                 bert_indices.append(i)
-    # ── Step 2: BERT batch (CPU Bound) ──────────────────────────────────────
     if bert_indices:
         bert_msgs = [logs[i][1] for i in bert_indices]
@@ -86,7 +84,7 @@ def classify_logs(logs: list[tuple[str, str]]) -> list[dict]:
             else:
                 llm_indices.append(idx)
-    # ── Step 3: LLM (I/O Bound - Threading Applied Here) ────────────────────
     if llm_indices:
         def parallel_llm(idx):
             src, msg = logs[idx]
@@ -100,58 +98,16 @@ def classify_logs(logs: list[tuple[str, str]]) -> list[dict]:
             return idx, _make_result(label, tier, None, t_llm_ms)
-        with ThreadPoolExecutor() as executor:
-            llm_results = list(executor.map(parallel_llm, llm_indices))
-        for idx, res in llm_results:
             results[idx] = res
     return results
-# ── Pipeline summary ─────────────────────────────────────────────────────────
-def pipeline_summary(results: list[dict]) -> dict:
-    tier_groups: dict[str, list[float]] = {}
-    label_counts: dict[str, int] = {}
-    for r in results:
-        tier = r["tier"]
-        tier_groups.setdefault(tier, []).append(r["latency_ms"])
-        label_counts[r["label"]] = label_counts.get(r["label"], 0) + 1
-    total = len(results)
-    tier_stats = {}
-    for tier, latencies in tier_groups.items():
-        latencies_sorted = sorted(latencies)
-        n = len(latencies_sorted)
-        tier_stats[tier] = {
-            "count":    n,
-            "pct":      round(n / total * 100, 1),
-            "p50_ms":   round(statistics.median(latencies_sorted), 4),
-            "p95_ms":   round(latencies_sorted[min(int(n * 0.95), n - 1)], 4),
-            "p99_ms":   round(latencies_sorted[min(int(n * 0.99), n - 1)], 4),
-            "mean_ms":  round(statistics.mean(latencies_sorted), 4),
-            "total_ms": round(sum(latencies_sorted), 4),
-        }
-    return {
-        "total":        total,
-        "tier_stats":   tier_stats,
-        "label_counts": label_counts,
-    }
-# ── Multiprocessing Helper ───────────────────────────────────────────────────
-def _process_chunk(chunk: list[tuple[str, str]]) -> list[dict]:
-    """Top-level helper function required for ProcessPoolExecutor mapping."""
-    return classify_logs(chunk)
-# ── CSV batch classify (Balanced Processing) ─────────────────────────────────
 def classify_csv(input_path: str, output_path: str = "output.csv") -> tuple[str, pd.DataFrame]:
-    """
-    Balanced Batch Processing to prevent CPU Starvation UI crashes.
-    """
     df = pd.read_csv(input_path)
     required = {"source", "log_message"}
     if not required.issubset(df.columns):
@@ -160,25 +116,14 @@ def classify_csv(input_path: str, output_path: str = "output.csv") -> tuple[str,
     log_pairs = list(zip(df["source"], df["log_message"]))
     total_logs = len(log_pairs)
-    # FIX: Use exactly half of the available CPU cores (minimum 1).
-    # This leaves the other half for Gradio websockets and the OS.
-    safe_cores = max(1, os.cpu_count() // 2)
-    # FIX: Reduce chunk size to 10,000.
-    # Massive chunks cause CPU lockups during inter-process data pickling.
-    chunk_size = 10000
-    chunks = [log_pairs[i:i + chunk_size] for i in range(0, total_logs, chunk_size)]
-    results = []
-    print(f"🔥 Firing up {safe_cores} CPU Cores (Leaving remaining for UI)...")
-    t_start = time.perf_counter()
-    with ProcessPoolExecutor(max_workers=safe_cores) as executor:
-        for chunk_result in executor.map(_process_chunk, chunks):
-            results.extend(chunk_result)
     t_end = time.perf_counter()
     print(f"⏱️ True Wall-Clock Processing Time: {(t_end - t_start):.2f} seconds")
     df["predicted_label"] = [r["label"]       for r in results]
@@ -192,6 +137,4 @@ def classify_csv(input_path: str, output_path: str = "output.csv") -> tuple[str,
     df.to_csv(output_path, index=False)
     return output_path, df
-# Aliases
 classify = classify_logs

 """
+classify.py — 3-Tier Hybrid Pipeline (V10 — Thread-Safe & Shared Cache)
 Architecture:
   LegacyCRM → LLM directly
   Others    → Regex → BERT (batch) → LLM fallback
+Changes in V10:
+  - Removed buggy ProcessPoolExecutor (Fixes fork deadlocks & memory spikes).
+  - Global ThreadPoolExecutor for LLM (Fixes thread thrashing & context switching).
+  - LRU Cache is now genuinely shared across the entire run.
 """
 from __future__ import annotations
 import os
 import statistics
 import pandas as pd
 from functools import lru_cache
+from concurrent.futures import ThreadPoolExecutor
 from processor_regex import classify_with_regex
 from processor_bert  import classify_batch as bert_batch
 from processor_llm   import classify_with_llm
 # ── Config ──────────────────────────────────────────────────────────────────
 LEGACY_SOURCE = os.getenv("LEGACY_SOURCE", "LegacyCRM")
+# FIX: One global pool to prevent OS thread thrashing per chunk.
+_llm_executor = ThreadPoolExecutor(max_workers=min(32, (os.cpu_count() or 1) * 4))
 # ── Result type ─────────────────────────────────────────────────────────────
 def _make_result(label: str, tier: str, confidence, latency_ms: float) -> dict:
         "latency_ms": round(latency_ms, 4),
     }
+# ── Caching Layer (Now Global) ──────────────────────────────────────────────
 @lru_cache(maxsize=500000)
 def cached_llm_call(log_msg: str) -> str:
     """Executes the expensive LLM call only if the string misses the cache."""
     return classify_with_llm(log_msg)
 # ── Single log (backward-compatible) ────────────────────────────────────────
 def classify_log(source: str, log_msg: str) -> dict:
     results = classify_logs([(source, log_msg)])
     return results[0]
 # ── Batch pipeline (main entry point) ───────────────────────────────────────
 def classify_logs(logs: list[tuple[str, str]]) -> list[dict]:
     n       = len(logs)
     results = [None] * n
     llm_indices   = []
     bert_indices  = []
             else:
                 bert_indices.append(i)
+    # ── Step 2: BERT batch (ONNX handles its own multi-threading) ───────────
     if bert_indices:
         bert_msgs = [logs[i][1] for i in bert_indices]
             else:
                 llm_indices.append(idx)
+    # ── Step 3: LLM (I/O Bound - Using Global Thread Pool) ──────────────────
     if llm_indices:
         def parallel_llm(idx):
             src, msg = logs[idx]
             return idx, _make_result(label, tier, None, t_llm_ms)
+        # Delegate entirely to the pre-warmed global thread pool
+        futures = [_llm_executor.submit(parallel_llm, idx) for idx in llm_indices]
+        for future in futures:
+            idx, res = future.result()
             results[idx] = res
     return results
 def classify_csv(input_path: str, output_path: str = "output.csv") -> tuple[str, pd.DataFrame]:
+    """Single-process batch processing (relying on ONNX C++ threads + Python network threads)"""
     df = pd.read_csv(input_path)
     required = {"source", "log_message"}
     if not required.issubset(df.columns):
     log_pairs = list(zip(df["source"], df["log_message"]))
     total_logs = len(log_pairs)
+    print(f"🔥 Processing {total_logs} logs (Thread Pool active for LLMs)...")
+    t_start = time.perf_counter()
+    # Process everything in one go - let classify_logs handle the internal batching
+    results = classify_logs(log_pairs)
     t_end = time.perf_counter()
     print(f"⏱️ True Wall-Clock Processing Time: {(t_end - t_start):.2f} seconds")
     df["predicted_label"] = [r["label"]       for r in results]
     df.to_csv(output_path, index=False)
     return output_path, df
 classify = classify_logs