Spaces:

NOT-OMEGA
/

LogAI-Engine

Sleeping

App Files Community

NOT-OMEGA committed on Apr 15

Commit

6ffecdd

verified ·

1 Parent(s): a65e2b6

Update classify.py

Browse files

Files changed (1) hide show

classify.py +22 -15

classify.py CHANGED Viewed

@@ -1,14 +1,14 @@
 """
-classify.py — 3-Tier Hybrid Pipeline (V6 — Hybrid Concurrency)
 Architecture:
   LegacyCRM → LLM directly
   Others    → Regex → BERT (batch) → LLM fallback
-Changes in V6 (Final Polish):
-  - Solved the GIL Bottleneck: Chunks run sequentially so BERT gets 100% CPU without thread-locking.
-  - LLM concurrency: ThreadPoolExecutor is retained ONLY for LLM I/O calls.
-  - Perfect Caching: Outer chunks are sequential, meaning the 500k @lru_cache stays in the main process memory for the entire 2M logs.
 """
 from __future__ import annotations
 import os
@@ -16,7 +16,7 @@ import time
 import statistics
 import pandas as pd
 from functools import lru_cache
-from concurrent.futures import ThreadPoolExecutor
 from processor_regex import classify_with_regex
 from processor_bert  import classify_batch as bert_batch
 from processor_llm   import classify_with_llm
@@ -35,7 +35,7 @@ def _make_result(label: str, tier: str, confidence, latency_ms: float) -> dict:
     }
-# ── Caching Layer (Max RAM Eater) ───────────────────────────────────────────
 @lru_cache(maxsize=500000)
 def cached_llm_call(log_msg: str) -> str:
     """Executes the expensive LLM call only if the string misses the cache."""
@@ -70,7 +70,7 @@ def classify_logs(logs: list[tuple[str, str]]) -> list[dict]:
             else:
                 bert_indices.append(i)
-    # ── Step 2: BERT batch (CPU Bound - No Threads Allowed Here) ────────────
     if bert_indices:
         bert_msgs = [logs[i][1] for i in bert_indices]
@@ -141,12 +141,18 @@ def pipeline_summary(results: list[dict]) -> dict:
     }
 # ── CSV batch classify (Hybrid Processing) ───────────────────────────────────
 def classify_csv(input_path: str, output_path: str = "output.csv") -> tuple[str, pd.DataFrame]:
     """
     Ultra-Optimized Batch Processing for 2M+ Logs.
-    Outer chunks run sequentially (bypasses GIL for BERT, preserves main memory cache).
-    Inner LLM calls thread automatically inside classify_logs.
     """
     df = pd.read_csv(input_path)
     required = {"source", "log_message"}
@@ -156,18 +162,19 @@ def classify_csv(input_path: str, output_path: str = "output.csv") -> tuple[str,
     log_pairs = list(zip(df["source"], df["log_message"]))
     total_logs = len(log_pairs)
-    # Chunk size controls how much data is in RAM at once
     chunk_size = 50000
     chunks = [log_pairs[i:i + chunk_size] for i in range(0, total_logs, chunk_size)]
     results = []
-    print(f"🔥 Processing {len(chunks)} chunks... (BERT handles CPU batching, LLM handles I/O threads)")
     t_start = time.perf_counter()
-    # Process sequentially to avoid GIL locks on BERT and keep the cache in one memory block
-    for chunk in chunks:
-        results.extend(classify_logs(chunk))
     t_end = time.perf_counter()
     print(f"⏱️ True Wall-Clock Processing Time: {(t_end - t_start):.2f} seconds")

 """
+classify.py — 3-Tier Hybrid Pipeline (V7 — Maximum Speed & Sharded Caching)
 Architecture:
   LegacyCRM → LLM directly
   Others    → Regex → BERT (batch) → LLM fallback
+Changes in V7:
+  - Unfroze the Gradio UI and restored Processing Speed: Brought back ProcessPoolExecutor for the outer CSV chunks to utilize ALL CPU cores.
+  - LLM concurrency: ThreadPoolExecutor is retained inside classify_logs specifically for LLM I/O calls.
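+  - Cache Architecture: Using a "Sharded Cache" approach. Each CPU worker process gets its own independent 500k-entry @lru_cache (entries are not shared between worker processes, so a message cached in one worker can still miss in another), which is perfectly safe for 18GB RAM and avoids GIL locks entirely.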
 """
 from __future__ import annotations
 import os
 import statistics
 import pandas as pd
 from functools import lru_cache
+from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor
 from processor_regex import classify_with_regex
 from processor_bert  import classify_batch as bert_batch
 from processor_llm   import classify_with_llm
     }
+# ── Caching Layer (Sharded per CPU Core) ────────────────────────────────────
 @lru_cache(maxsize=500000)
 def cached_llm_call(log_msg: str) -> str:
     """Executes the expensive LLM call only if the string misses the cache."""
             else:
                 bert_indices.append(i)
+    # ── Step 2: BERT batch (CPU Bound - Uses full core without GIL) ─────────
     if bert_indices:
         bert_msgs = [logs[i][1] for i in bert_indices]
     }
+# ── Multiprocessing Helper ───────────────────────────────────────────────────
+def _process_chunk(chunk: list[tuple[str, str]]) -> list[dict]:
+    """Top-level helper function required for ProcessPoolExecutor mapping."""
+    return classify_logs(chunk)
 # ── CSV batch classify (Hybrid Processing) ───────────────────────────────────
 def classify_csv(input_path: str, output_path: str = "output.csv") -> tuple[str, pd.DataFrame]:
     """
     Ultra-Optimized Batch Processing for 2M+ Logs.
+    Outer chunks use ProcessPoolExecutor to smash through BERT on all CPU cores.
+    Inner LLM calls automatically use ThreadPoolExecutor to handle network I/O.
     """
     df = pd.read_csv(input_path)
     required = {"source", "log_message"}
     log_pairs = list(zip(df["source"], df["log_message"]))
     total_logs = len(log_pairs)
+    max_cores = max(1, os.cpu_count() - 1)
     chunk_size = 50000
     chunks = [log_pairs[i:i + chunk_size] for i in range(0, total_logs, chunk_size)]
     results = []
+    print(f"🔥 Firing up {max_cores} Process Cores... (BERT gets raw CPU, LLM gets Threads)")
     t_start = time.perf_counter()
+    # Brought ProcessPoolExecutor back to unblock the CPU and UI
+    with ProcessPoolExecutor(max_workers=max_cores) as executor:
+        for chunk_result in executor.map(_process_chunk, chunks):
+            results.extend(chunk_result)
     t_end = time.perf_counter()
     print(f"⏱️ True Wall-Clock Processing Time: {(t_end - t_start):.2f} seconds")