Spaces:

NOT-OMEGA
/

LogAI-Engine

Sleeping

App Files Files Community

NOT-OMEGA committed on Apr 15

Commit

1a9b340

verified ·

1 Parent(s): 7d3f899

Update classify.py

Browse files

Files changed (1) hide show

classify.py +81 -27

classify.py CHANGED Viewed

@@ -1,12 +1,20 @@
 """
-classify.py — 3-Tier Hybrid Pipeline (V11 — MAX SPEED + SAFE MULTIPROCESSING + UI FIX)
 """
 from __future__ import annotations
 import os
 import time
 import statistics
 import pandas as pd
-import multiprocessing as mp
 from functools import lru_cache
 from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor
 from processor_regex import classify_with_regex
@@ -16,6 +24,7 @@ from processor_llm   import classify_with_llm
 # ── Config ──────────────────────────────────────────────────────────────────
 LEGACY_SOURCE = os.getenv("LEGACY_SOURCE", "LegacyCRM")
 # ── Result type ─────────────────────────────────────────────────────────────
 def _make_result(label: str, tier: str, confidence, latency_ms: float) -> dict:
     return {
@@ -25,26 +34,29 @@ def _make_result(label: str, tier: str, confidence, latency_ms: float) -> dict:
         "latency_ms": round(latency_ms, 4),
     }
-# ── Caching Layer ───────────────────────────────────────────────────────────
-@lru_cache(maxsize=10000) # Reduced maxsize per-worker to prevent OOM
 def cached_llm_call(log_msg: str) -> str:
     return classify_with_llm(log_msg)
-# ── Single log (backward-compatible for UI) ─────────────────────────────────
 def classify_log(source: str, log_msg: str) -> dict:
-    """Used by Gradio real-time analyzer tab."""
     results = classify_logs([(source, log_msg)])
     return results[0]
 def classify_logs(logs: list[tuple[str, str]]) -> list[dict]:
-    """Processes a chunk of logs."""
     n       = len(logs)
     results = [None] * n
     llm_indices   = []
     bert_indices  = []
-    # Step 1: Regex (Now running on multiple cores in parallel!)
     for i, (source, log_msg) in enumerate(logs):
         if source == LEGACY_SOURCE:
             llm_indices.append(i)
@@ -58,9 +70,10 @@ def classify_logs(logs: list[tuple[str, str]]) -> list[dict]:
             else:
                 bert_indices.append(i)
-    # Step 2: BERT
     if bert_indices:
         bert_msgs = [logs[i][1] for i in bert_indices]
         t_bert_start = time.perf_counter()
         bert_results = bert_batch(bert_msgs)
         t_bert_end   = time.perf_counter()
@@ -73,10 +86,11 @@ def classify_logs(logs: list[tuple[str, str]]) -> list[dict]:
             else:
                 llm_indices.append(idx)
-    # Step 3: LLM (Threaded inside each process)
     if llm_indices:
         def parallel_llm(idx):
             src, msg = logs[idx]
             t_llm_0 = time.perf_counter()
             label = cached_llm_call(msg)
             t_llm_ms = (time.perf_counter() - t_llm_0) * 1000
@@ -86,45 +100,83 @@ def classify_logs(logs: list[tuple[str, str]]) -> list[dict]:
             return idx, _make_result(label, tier, None, t_llm_ms)
-        # Inner ThreadPool for API network requests
-        with ThreadPoolExecutor(max_workers=10) as executor:
-            for idx, res in executor.map(parallel_llm, llm_indices):
-                results[idx] = res
     return results
 def _process_chunk(chunk: list[tuple[str, str]]) -> list[dict]:
-    """Helper function for mapping."""
     return classify_logs(chunk)
-# ── CSV batch classify (Safe Spawn Multiprocessing) ─────────────────────────
 def classify_csv(input_path: str, output_path: str = "output.csv") -> tuple[str, pd.DataFrame]:
     df = pd.read_csv(input_path)
     required = {"source", "log_message"}
     if not required.issubset(df.columns):
-        raise ValueError(f"Missing required columns in CSV.")
     log_pairs = list(zip(df["source"], df["log_message"]))
     total_logs = len(log_pairs)
-    # Use max cores for speed, but leave 1 for the OS/Gradio UI
-    safe_cores = max(1, (os.cpu_count() or 1) - 1)
-    chunk_size = 5000 # Slightly smaller chunks so data copies faster between processes
     chunks = [log_pairs[i:i + chunk_size] for i in range(0, total_logs, chunk_size)]
     results = []
-    print(f"🔥 Firing up {safe_cores} CPU Cores with SAFE SPAWN context...")
     t_start = time.perf_counter()
-    # FIX: Correctly pass the spawn context to ProcessPoolExecutor
-    ctx = mp.get_context('spawn')
-    with ProcessPoolExecutor(max_workers=safe_cores, mp_context=ctx) as executor:
         for chunk_result in executor.map(_process_chunk, chunks):
             results.extend(chunk_result)
     t_end = time.perf_counter()
     print(f"⏱️ True Wall-Clock Processing Time: {(t_end - t_start):.2f} seconds")
@@ -140,4 +192,6 @@ def classify_csv(input_path: str, output_path: str = "output.csv") -> tuple[str,
     df.to_csv(output_path, index=False)
     return output_path, df
 classify = classify_logs

 """
+classify.py — 3-Tier Hybrid Pipeline (V9 — Balanced CPU & Gradio Safe)
+Architecture:
+  LegacyCRM → LLM directly
+  Others    → Regex → BERT (batch) → LLM fallback
+Changes in V9:
+  - Fixed CPU Starvation: Limited max_workers to half the CPU cores to prevent Gradio WebSocket timeouts.
+  - Reduced IPC Overhead: Lowered chunk_size to 10,000 to prevent CPU lockups during cross-process data pickling.
+  - Restored Multi-processing: Outer chunks use ProcessPoolExecutor for speed, inner LLM calls use ThreadPoolExecutor.
 """
 from __future__ import annotations
 import os
 import time
 import statistics
 import pandas as pd
 from functools import lru_cache
 from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor
 from processor_regex import classify_with_regex
 # ── Config ──────────────────────────────────────────────────────────────────
 LEGACY_SOURCE = os.getenv("LEGACY_SOURCE", "LegacyCRM")
 # ── Result type ─────────────────────────────────────────────────────────────
 def _make_result(label: str, tier: str, confidence, latency_ms: float) -> dict:
     return {
         "latency_ms": round(latency_ms, 4),
     }
+# ── Caching Layer (Sharded per Worker) ──────────────────────────────────────
+@lru_cache(maxsize=500000)
 def cached_llm_call(log_msg: str) -> str:
+    """Executes the expensive LLM call only if the string misses the cache."""
     return classify_with_llm(log_msg)
+# ── Single log (backward-compatible) ────────────────────────────────────────
 def classify_log(source: str, log_msg: str) -> dict:
     results = classify_logs([(source, log_msg)])
     return results[0]
+# ── Batch pipeline (main entry point) ───────────────────────────────────────
 def classify_logs(logs: list[tuple[str, str]]) -> list[dict]:
     n       = len(logs)
     results = [None] * n
+    # ── Step 1: Route to groups ─────────────────────────────────────────────
     llm_indices   = []
     bert_indices  = []
     for i, (source, log_msg) in enumerate(logs):
         if source == LEGACY_SOURCE:
             llm_indices.append(i)
             else:
                 bert_indices.append(i)
+    # ── Step 2: BERT batch (CPU Bound) ──────────────────────────────────────
     if bert_indices:
         bert_msgs = [logs[i][1] for i in bert_indices]
         t_bert_start = time.perf_counter()
         bert_results = bert_batch(bert_msgs)
         t_bert_end   = time.perf_counter()
             else:
                 llm_indices.append(idx)
+    # ── Step 3: LLM (I/O Bound - Threading Applied Here) ────────────────────
     if llm_indices:
         def parallel_llm(idx):
             src, msg = logs[idx]
             t_llm_0 = time.perf_counter()
             label = cached_llm_call(msg)
             t_llm_ms = (time.perf_counter() - t_llm_0) * 1000
             return idx, _make_result(label, tier, None, t_llm_ms)
+        with ThreadPoolExecutor() as executor:
+            llm_results = list(executor.map(parallel_llm, llm_indices))
+        for idx, res in llm_results:
+            results[idx] = res
     return results
+# ── Pipeline summary ─────────────────────────────────────────────────────────
+def pipeline_summary(results: list[dict]) -> dict:
+    tier_groups: dict[str, list[float]] = {}
+    label_counts: dict[str, int] = {}
+    for r in results:
+        tier = r["tier"]
+        tier_groups.setdefault(tier, []).append(r["latency_ms"])
+        label_counts[r["label"]] = label_counts.get(r["label"], 0) + 1
+    total = len(results)
+    tier_stats = {}
+    for tier, latencies in tier_groups.items():
+        latencies_sorted = sorted(latencies)
+        n = len(latencies_sorted)
+        tier_stats[tier] = {
+            "count":    n,
+            "pct":      round(n / total * 100, 1),
+            "p50_ms":   round(statistics.median(latencies_sorted), 4),
+            "p95_ms":   round(latencies_sorted[min(int(n * 0.95), n - 1)], 4),
+            "p99_ms":   round(latencies_sorted[min(int(n * 0.99), n - 1)], 4),
+            "mean_ms":  round(statistics.mean(latencies_sorted), 4),
+            "total_ms": round(sum(latencies_sorted), 4),
+        }
+    return {
+        "total":        total,
+        "tier_stats":   tier_stats,
+        "label_counts": label_counts,
+    }
+# ── Multiprocessing Helper ───────────────────────────────────────────────────
 def _process_chunk(chunk: list[tuple[str, str]]) -> list[dict]:
+    """Top-level helper function required for ProcessPoolExecutor mapping."""
     return classify_logs(chunk)
+# ── CSV batch classify (Balanced Processing) ─────────────────────────────────
 def classify_csv(input_path: str, output_path: str = "output.csv") -> tuple[str, pd.DataFrame]:
+    """
+    Balanced Batch Processing to prevent CPU Starvation UI crashes.
+    """
     df = pd.read_csv(input_path)
     required = {"source", "log_message"}
     if not required.issubset(df.columns):
+        raise ValueError(f"Missing required columns in CSV. Expected: {required}. Found: {set(df.columns)}")
     log_pairs = list(zip(df["source"], df["log_message"]))
     total_logs = len(log_pairs)
+    # FIX: Use exactly half of the available CPU cores (minimum 1).
+    # This leaves the other half for Gradio websockets and the OS.
+    safe_cores = max(1, os.cpu_count() // 2)
+    # FIX: Reduce chunk size to 10,000.
+    # Massive chunks cause CPU lockups during inter-process data pickling.
+    chunk_size = 10000
     chunks = [log_pairs[i:i + chunk_size] for i in range(0, total_logs, chunk_size)]
     results = []
+    print(f"🔥 Firing up {safe_cores} CPU Cores (Leaving remaining for UI)...")
     t_start = time.perf_counter()
+    with ProcessPoolExecutor(max_workers=safe_cores) as executor:
         for chunk_result in executor.map(_process_chunk, chunks):
             results.extend(chunk_result)
     t_end = time.perf_counter()
     print(f"⏱️ True Wall-Clock Processing Time: {(t_end - t_start):.2f} seconds")
     df.to_csv(output_path, index=False)
     return output_path, df
+# Aliases
 classify = classify_logs