Spaces:

NOT-OMEGA
/

LogAI-Engine

Sleeping

App Files Files Community

NOT-OMEGA committed on Apr 15

Commit

5b20649

verified ·

1 Parent(s): 6ffecdd

Update classify.py

Browse files

Files changed (1) hide show

classify.py +19 -26

classify.py CHANGED Viewed

@@ -1,14 +1,14 @@
 """
-classify.py — 3-Tier Hybrid Pipeline (V7 — Maximum Speed & Sharded Caching)
 Architecture:
   LegacyCRM → LLM directly
   Others    → Regex → BERT (batch) → LLM fallback
-Changes in V7:
-  - Unfroze the Gradio UI and restored Processing Speed: Brought back ProcessPoolExecutor for the outer CSV chunks to utilize ALL CPU cores.
-  - LLM concurrency: ThreadPoolExecutor is retained inside classify_logs specifically for LLM I/O calls.
-  - Cache Architecture: Using a "Sharded Cache" approach. Each CPU worker process gets its own 500k @lru_cache, which is perfectly safe for 18GB RAM and avoids GIL locks entirely.
 """
 from __future__ import annotations
 import os
@@ -16,7 +16,7 @@ import time
 import statistics
 import pandas as pd
 from functools import lru_cache
-from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor
 from processor_regex import classify_with_regex
 from processor_bert  import classify_batch as bert_batch
 from processor_llm   import classify_with_llm
@@ -35,7 +35,7 @@ def _make_result(label: str, tier: str, confidence, latency_ms: float) -> dict:
     }
-# ── Caching Layer (Sharded per CPU Core) ────────────────────────────────────
 @lru_cache(maxsize=500000)
 def cached_llm_call(log_msg: str) -> str:
     """Executes the expensive LLM call only if the string misses the cache."""
@@ -70,7 +70,7 @@ def classify_logs(logs: list[tuple[str, str]]) -> list[dict]:
             else:
                 bert_indices.append(i)
-    # ── Step 2: BERT batch (CPU Bound - Uses full core without GIL) ─────────
     if bert_indices:
         bert_msgs = [logs[i][1] for i in bert_indices]
@@ -86,7 +86,7 @@ def classify_logs(logs: list[tuple[str, str]]) -> list[dict]:
             else:
                 llm_indices.append(idx)
-    # ── Step 3: LLM (I/O Bound - Threading Applied Here) ────────────────────
     if llm_indices:
         def parallel_llm(idx):
             src, msg = logs[idx]
@@ -100,6 +100,7 @@ def classify_logs(logs: list[tuple[str, str]]) -> list[dict]:
             return idx, _make_result(label, tier, None, t_llm_ms)
         with ThreadPoolExecutor() as executor:
             llm_results = list(executor.map(parallel_llm, llm_indices))
@@ -141,18 +142,11 @@ def pipeline_summary(results: list[dict]) -> dict:
     }
-# ── Multiprocessing Helper ───────────────────────────────────────────────────
-def _process_chunk(chunk: list[tuple[str, str]]) -> list[dict]:
-    """Top-level helper function required for ProcessPoolExecutor mapping."""
-    return classify_logs(chunk)
-# ── CSV batch classify (Hybrid Processing) ───────────────────────────────────
 def classify_csv(input_path: str, output_path: str = "output.csv") -> tuple[str, pd.DataFrame]:
     """
-    Ultra-Optimized Batch Processing for 2M+ Logs.
-    Outer chunks use ProcessPoolExecutor to smash through BERT on all CPU cores.
-    Inner LLM calls automatically use ThreadPoolExecutor to handle network I/O.
     """
     df = pd.read_csv(input_path)
     required = {"source", "log_message"}
@@ -162,19 +156,18 @@ def classify_csv(input_path: str, output_path: str = "output.csv") -> tuple[str,
     log_pairs = list(zip(df["source"], df["log_message"]))
     total_logs = len(log_pairs)
-    max_cores = max(1, os.cpu_count() - 1)
-    chunk_size = 50000
     chunks = [log_pairs[i:i + chunk_size] for i in range(0, total_logs, chunk_size)]
     results = []
-    print(f"🔥 Firing up {max_cores} Process Cores... (BERT gets raw CPU, LLM gets Threads)")
     t_start = time.perf_counter()
-    # Brought ProcessPoolExecutor back to unblock the CPU and UI
-    with ProcessPoolExecutor(max_workers=max_cores) as executor:
-        for chunk_result in executor.map(_process_chunk, chunks):
-            results.extend(chunk_result)
     t_end = time.perf_counter()
     print(f"⏱️ True Wall-Clock Processing Time: {(t_end - t_start):.2f} seconds")

 """
+classify.py — 3-Tier Hybrid Pipeline (V8 — Cloud Container Safe)
 Architecture:
   LegacyCRM → LLM directly
   Others    → Regex → BERT (batch) → LLM fallback
+Changes in V8 (Stability First):
+  - Removed ProcessPoolExecutor: It was causing Out-Of-Memory (OOM) crashes on Hugging Face Spaces by duplicating the BERT model across CPU cores.
+  - Reverted to Sequential Chunks: Protects the 16GB RAM limit and keeps the 500k @lru_cache perfectly intact in the main process.
+  - Retained ThreadPoolExecutor: Only used for LLM API calls (I/O bound), which is safe and won't crash the container.
 """
 from __future__ import annotations
 import os
 import statistics
 import pandas as pd
 from functools import lru_cache
+from concurrent.futures import ThreadPoolExecutor
 from processor_regex import classify_with_regex
 from processor_bert  import classify_batch as bert_batch
 from processor_llm   import classify_with_llm
     }
+# ── Caching Layer (Single Process - RAM Safe) ───────────────────────────────
 @lru_cache(maxsize=500000)
 def cached_llm_call(log_msg: str) -> str:
     """Executes the expensive LLM call only if the string misses the cache."""
             else:
                 bert_indices.append(i)
+    # ── Step 2: BERT batch (Sequential - RAM Safe) ──────────────────────────
     if bert_indices:
         bert_msgs = [logs[i][1] for i in bert_indices]
             else:
                 llm_indices.append(idx)
+    # ── Step 3: LLM (I/O Bound - Threading Safe) ────────────────────────────
     if llm_indices:
         def parallel_llm(idx):
             src, msg = logs[idx]
             return idx, _make_result(label, tier, None, t_llm_ms)
+        # ThreadPoolExecutor is safe for Gradio/HF Spaces because it shares memory
         with ThreadPoolExecutor() as executor:
             llm_results = list(executor.map(parallel_llm, llm_indices))
     }
+# ── CSV batch classify (Container Safe Processing) ───────────────────────────
 def classify_csv(input_path: str, output_path: str = "output.csv") -> tuple[str, pd.DataFrame]:
     """
+    Stable Batch Processing for 2M+ Logs on Hugging Face Spaces.
+    Runs chunks sequentially to prevent out-of-memory (OOM) crashes.
     """
     df = pd.read_csv(input_path)
     required = {"source", "log_message"}
     log_pairs = list(zip(df["source"], df["log_message"]))
     total_logs = len(log_pairs)
+    # Halved the chunk size (50,000 -> 25,000) to give the container more breathing room
+    chunk_size = 25000
     chunks = [log_pairs[i:i + chunk_size] for i in range(0, total_logs, chunk_size)]
     results = []
+    print(f"🔥 Processing {len(chunks)} chunks sequentially to protect RAM...")
     t_start = time.perf_counter()
+    # Sequential loop: Prevents Gradio from crashing and keeps memory stable
+    for chunk in chunks:
+        results.extend(classify_logs(chunk))
     t_end = time.perf_counter()
     print(f"⏱️ True Wall-Clock Processing Time: {(t_end - t_start):.2f} seconds")