shaliz-kong committed on
Commit
14aa120
Β·
1 Parent(s): 6439a99

refactored mapper and analytics worker

Browse files
Files changed (2) hide show
  1. app/mapper.py +217 -352
  2. app/tasks/analytics_worker.py +492 -185
app/mapper.py CHANGED
@@ -1,17 +1,16 @@
1
- # app/mapper.py – BULLETPROOF VERSION
2
  import os
3
  import json
4
- # import duckdb
5
  import pandas as pd
6
  import numpy as np
7
  from datetime import datetime, timedelta
8
- from app.db import get_conn, ensure_raw_table, transactional_conn,ensure_schema_versions_table
9
- # app/mapper.py (add line 1)
10
- from app.hybrid_entity_detector import hybrid_detect_entity_type
11
  import time
 
 
 
12
  from app.core.event_hub import event_hub
13
 
14
- # ---------------------- Canonical schema base ---------------------- #
15
  CANONICAL = {
16
  "timestamp": ["timestamp", "date", "sale_date", "created_at"],
17
  "product_id": ["sku", "barcode", "plu", "product_id", "item_code"],
@@ -25,45 +24,20 @@ CANONICAL = {
25
 
26
  ALIAS_FILE = "./db/alias_memory.json"
27
 
 
 
 
 
28
  def map_pandas_to_duck(col: str, series: pd.Series) -> str:
 
29
  if pd.api.types.is_bool_dtype(series): return "BOOLEAN"
30
  if pd.api.types.is_integer_dtype(series): return "BIGINT"
31
  if pd.api.types.is_float_dtype(series): return "DOUBLE"
32
  if pd.api.types.is_datetime64_any_dtype(series): return "TIMESTAMP"
33
  return "VARCHAR"
34
 
35
- # ---------- entity detection(uses ai to detect entity from the data) ---------- #
36
- def ensure_canonical_table(duck: duckdb.DuckDBPyConnection, df: pd.DataFrame, entity_type: str) -> str:
37
- """Creates entity-specific table: main.sales_canonical, main.inventory_canonical, etc."""
38
- table_name = f"main.{entity_type}_canonical"
39
-
40
- # Create base table if doesn't exist
41
- duck.execute(f"""
42
- CREATE TABLE IF NOT EXISTS {table_name} (
43
- id UUID DEFAULT uuid(),
44
- _ingested_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
45
- )
46
- """)
47
-
48
- # Get existing columns (lowercase for comparison)
49
- existing_cols_raw = duck.execute(f"PRAGMA table_info('{table_name}')").fetchall()
50
- existing_cols = {str(r[0]).lower() for r in existing_cols_raw}
51
-
52
- # βœ… BULLETPROOF: Add missing columns with safe name handling
53
- for col in df.columns:
54
- col_name = str(col).lower().strip() # βœ… FORCE STRING
55
- if col_name not in existing_cols:
56
- try:
57
- dtype = map_pandas_to_duck(col_name, df[col])
58
- print(f"[mapper] βž• Adding column '{col_name}:{dtype}'")
59
- duck.execute(f"ALTER TABLE {table_name} ADD COLUMN {col_name} {dtype}")
60
- except Exception as e:
61
- print(f"[mapper] ⚠️ Skipping column {col_name}: {e}")
62
-
63
- return table_name
64
-
65
- # ---------- Alias Memory ---------- #
66
  def load_dynamic_aliases() -> None:
 
67
  if os.path.exists(ALIAS_FILE):
68
  try:
69
  with open(ALIAS_FILE) as f:
@@ -74,22 +48,24 @@ def load_dynamic_aliases() -> None:
74
  else:
75
  CANONICAL[k] = v
76
  except Exception as e:
77
- print(f"[mapper] ⚠️ failed to load alias memory: {e}")
78
 
79
  def save_dynamic_aliases() -> None:
 
80
  os.makedirs(os.path.dirname(ALIAS_FILE), exist_ok=True)
81
  with open(ALIAS_FILE, "w") as f:
82
  json.dump(CANONICAL, f, indent=2)
83
- # βœ… Module-level cache: (org_id, source_id) -> entity_info
84
- _ENTITY_CACHE = {}
85
- _INDUSTRY_CACHE = {} # NEW
86
- def poll_for_entity(org_id: str, source_id: str, timeout: int = 30) -> dict:
87
  """
88
- 🎯 Capped at 2 Redis calls (immediate + after 5s sleep).
89
- In-memory cache prevents re-polling the same source.
90
  """
91
- # 1. Check cache (zero Redis calls)
92
  cache_key = (org_id, source_id)
 
 
93
  if cache_key in _ENTITY_CACHE:
94
  print(f"[poll] πŸ’Ύ CACHE HIT: {cache_key}")
95
  return _ENTITY_CACHE[cache_key]
@@ -97,62 +73,75 @@ def poll_for_entity(org_id: str, source_id: str, timeout: int = 30) -> dict:
97
  entity_key = f"entity:{org_id}:{source_id}"
98
  print(f"[poll] ⏳ Polling for key: {entity_key}")
99
 
100
- # 2. First attempt (immediate)
101
  data = event_hub.get_key(entity_key)
102
  if data:
103
  entity_info = json.loads(data)
104
- print(f"[poll] βœ… SUCCESS on first attempt: {entity_info['entity_type']}")
105
  _ENTITY_CACHE[cache_key] = entity_info
106
  return entity_info
107
 
108
- # 3. Sleep 5 seconds (gives worker time)
109
- print("[poll] πŸ”„ First check failed, sleeping 5s...")
110
- time.sleep(5.0)
111
 
112
- # 4. Second attempt (final)
113
  data = event_hub.get_key(entity_key)
114
  if data:
115
  entity_info = json.loads(data)
116
- print(f"[poll] βœ… SUCCESS on second attempt: {entity_info['entity_type']}")
117
  _ENTITY_CACHE[cache_key] = entity_info
118
  return entity_info
119
 
120
- # 5. Emergency fallback (worker is dead)
121
- print("[poll] ⚠️ Both attempts failed - using direct detection")
122
-
123
- # Use the combined fallback so we only hit DuckDB once and write both
124
- # entity AND industry keys atomically when possible.
125
  entity_info, industry_info = _fallback_combined(org_id, source_id)
126
-
127
- # Invalidate local cache entry so that subsequent callers read Redis first
128
- _ENTITY_CACHE.pop((org_id, source_id), None)
129
-
130
  return entity_info
131
 
132
-
133
- # OLD: _fallback_detection kept for reference (commented out during refactor)
134
- """
135
- def _fallback_detection(org_id: str, source_id: str) -> dict:
136
- # (original implementation)
137
- ...
138
- """
139
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
140
 
141
  def _fallback_combined(org_id: str, source_id: str) -> tuple[dict, dict]:
142
- """Single DuckDB query to produce both entity and industry detections.
143
-
144
- Guarantees:
145
- - Always writes `entity:{org_id}:{source_id}` and
146
- `industry:{org_id}:{source_id}` to Redis (or logs if write fails).
147
- - Updates module caches then invalidates them so readers re-check Redis.
148
- - Attempts parallel detection to reduce latency.
149
  """
150
  print(f"[fallback_combined] 🚨 Running combined fallback for {org_id}/{source_id}")
151
-
152
- # Default UNKNOWN placeholders
153
  entity_info = {"entity_type": "UNKNOWN", "confidence": 0.0}
154
  industry_info = {"industry": "UNKNOWN", "confidence": 0.0}
155
-
156
  try:
157
  conn = get_conn(org_id)
158
  rows = conn.execute("""
@@ -161,152 +150,71 @@ def _fallback_combined(org_id: str, source_id: str) -> tuple[dict, dict]:
161
  WHERE row_data IS NOT NULL
162
  USING SAMPLE 100
163
  """).fetchall()
164
-
165
  if rows:
166
  parsed = [json.loads(r[0]) for r in rows if r[0]]
167
  df = pd.DataFrame(parsed)
168
  df.columns = [str(col).lower().strip() for col in df.columns]
169
-
170
- # Run both detectors concurrently (thread pool for CPU/IO-bound work)
171
- from concurrent.futures import ThreadPoolExecutor
172
-
173
- def run_entity():
174
  try:
175
  return hybrid_detect_entity_type(org_id, df, f"{source_id}.json")
176
  except Exception as e:
177
- print(f"[fallback_combined] ❌ entity detection failed: {e}")
178
  return ("UNKNOWN", 0.0, False)
179
-
180
- def run_industry():
181
  try:
182
  from app.hybrid_industry_detector import hybrid_detect_industry_type
183
  return hybrid_detect_industry_type(org_id, df, source_id)
184
  except Exception as e:
185
- print(f"[fallback_combined] ❌ industry detection failed: {e}")
186
  return ("UNKNOWN", 0.0, False)
187
-
188
  with ThreadPoolExecutor(max_workers=2) as ex:
189
- ent_f = ex.submit(run_entity)
190
- ind_f = ex.submit(run_industry)
191
- ent_res = ent_f.result()
192
- ind_res = ind_f.result()
193
-
194
- entity_info = {"entity_type": ent_res[0], "confidence": ent_res[1]}
195
- industry_info = {"industry": ind_res[0], "confidence": ind_res[1]}
196
-
197
- print(f"[fallback_combined] βœ… Entity: {entity_info['entity_type']} ({entity_info['confidence']:.2%})")
198
- print(f"[fallback_combined] βœ… Industry: {industry_info['industry']} ({industry_info['confidence']:.2%})")
199
-
200
  except Exception as e:
201
- print(f"[fallback_combined] ❌ Combined fallback failed: {e}")
202
-
203
- # Persist to Redis; prefer pipeline when available for minimal window
204
- e_key = f"entity:{org_id}:{source_id}"
205
- i_key = f"industry:{org_id}:{source_id}"
206
-
207
  try:
208
- pipe = event_hub.pipeline()
209
- if pipe is not None:
210
- try:
211
- pipe.setex(e_key, 3600, json.dumps(entity_info))
212
- pipe.setex(i_key, 3600, json.dumps(industry_info))
213
- # Also add a per-source readiness nudging stream entry
214
- try:
215
- # Structured readiness message for per-source stream
216
- msg = json.dumps({
217
- "org_id": org_id,
218
- "source_id": source_id,
219
- "status": "ready"
220
- })
221
- pipe.xadd(event_hub.stream_key(org_id, source_id), {"message": msg})
222
- except Exception:
223
- # Not fatal; continue
224
- pass
225
- pipe.execute()
226
- except Exception as e:
227
- print(f"[fallback_combined] ❌ Pipeline execute failed: {e}")
228
- # Fall back to sequential writes
229
- event_hub.setex(e_key, 3600, json.dumps(entity_info))
230
- event_hub.setex(i_key, 3600, json.dumps(industry_info))
231
- else:
232
- # Pipeline not available; do sequential writes
233
- event_hub.setex(e_key, 3600, json.dumps(entity_info))
234
- event_hub.setex(i_key, 3600, json.dumps(industry_info))
235
-
236
- except Exception as e:
237
- print(f"[fallback_combined] ❌ Redis write failed: {e}")
238
-
239
- # Update caches (then immediately invalidate to avoid stale-reads window)
240
- _ENTITY_CACHE[(org_id, source_id)] = entity_info
241
- _INDUSTRY_CACHE[(org_id, source_id)] = industry_info
242
- _ENTITY_CACHE.pop((org_id, source_id), None)
243
- _INDUSTRY_CACHE.pop((org_id, source_id), None)
244
-
245
- return entity_info, industry_info
246
- #poll for industry from redis
247
- def poll_for_industry(org_id: str, source_id: str, timeout: int = 10) -> dict:
248
- """
249
- 🎯 Polls Redis for industry detection result (user-facing dashboard label).
250
- Capped at 2 Redis calls (immediate + after 5s sleep).
251
- In-memory cache prevents re-polling the same source.
252
 
253
- Returns:
254
- dict: {"industry": str, "confidence": float}
255
- """
256
  cache_key = (org_id, source_id)
 
 
257
 
258
- # 1. Check cache FIRST
259
- if cache_key in _INDUSTRY_CACHE:
260
- print(f"[poll_industry] πŸ’Ύ CACHE HIT: {cache_key}")
261
- return _INDUSTRY_CACHE[cache_key]
262
-
263
- industry_key = f"industry:{org_id}:{source_id}"
264
- print(f"[poll_industry] ⏳ Polling for key: {industry_key}")
265
-
266
- # 2. First attempt (immediate)
267
- data = event_hub.get_key(industry_key)
268
- if data:
269
- industry_info = json.loads(data)
270
- _INDUSTRY_CACHE[cache_key] = industry_info
271
- print(f"[poll_industry] βœ… SUCCESS on first attempt: {industry_info['industry']}")
272
- return industry_info
273
-
274
- # 3. Sleep 5 seconds (gives worker time)
275
- print("[poll_industry] πŸ”„ First check failed, sleeping 5s...")
276
- time.sleep(5.0)
277
-
278
- # 4. Second attempt (final)
279
- data = event_hub.get_key(industry_key)
280
- if data:
281
- industry_info = json.loads(data)
282
- _INDUSTRY_CACHE[cache_key] = industry_info
283
- print(f"[poll_industry] βœ… SUCCESS on second attempt: {industry_info['industry']}")
284
- return industry_info
285
-
286
- # 5. Emergency fallback (worker is dead)
287
- print("[poll_industry] ⚠️ Both attempts failed - using direct detection")
288
- industry_info = _fallback_industry_detection(org_id, source_id)
289
-
290
- # 🎯 NEW: Force write to Redis (ensure it's there)
291
- event_hub.setex(
292
- f"industry:{org_id}:{source_id}",
293
- 3600,
294
- json.dumps(industry_info)
295
- )
296
-
297
- # 🎯 NEW: Clear stale cache so next read is fresh
298
- if (org_id, source_id) in _INDUSTRY_CACHE:
299
- del _INDUSTRY_CACHE[(org_id, source_id)]
300
-
301
- return industry_info
302
- #fallback industry detection
303
  def _fallback_industry_detection(org_id: str, source_id: str) -> dict:
304
  """
305
- Emergency: Run industry detection directly from DuckDB data.
306
- Uses the actual hybrid detector module you have.
307
- Writes result to Redis and cache for recovery.
308
  """
309
- print(f"[fallback_industry] 🚨 Running fallback for {org_id}/{source_id}")
310
 
311
  try:
312
  conn = get_conn(org_id)
@@ -318,90 +226,80 @@ def _fallback_industry_detection(org_id: str, source_id: str) -> dict:
318
  """).fetchall()
319
 
320
  if not rows:
321
- print("[fallback_industry] ❌ No data found, returning UNKNOWN")
322
- industry_info = {"industry": "UNKNOWN", "confidence": 0.0}
323
- else:
324
- parsed = [json.loads(r[0]) for r in rows if r[0]]
325
- df = pd.DataFrame(parsed)
326
- # βœ… ADD THIS LINE - Normalize column names before detection
327
- df.columns = [str(col).lower().strip() for col in df.columns]
328
- # βœ… CORRECT: Import from your actual module
329
- from app.hybrid_industry_detector import hybrid_detect_industry_type
330
-
331
- # Call it (note: it returns 3 values: industry, confidence, is_confident)
332
- industry, confidence, _ = hybrid_detect_industry_type(org_id, df, source_id)
333
- industry_info = {"industry": industry, "confidence": confidence}
334
- print(f"[fallback_industry] βœ… Direct detection: {industry} ({confidence:.2%})")
335
 
336
- # βœ… CRITICAL: Write to Redis BEFORE returning
 
 
 
337
  redis_key = f"industry:{org_id}:{source_id}"
338
  event_hub.setex(redis_key, 3600, json.dumps(industry_info))
339
- print(f"[fallback_industry] πŸ’Ύ WRITTEN TO REDIS: {redis_key}")
340
-
341
- # βœ… Also populate module cache
342
- cache_key = (org_id, source_id)
343
- _INDUSTRY_CACHE[cache_key] = industry_info
344
 
345
  return industry_info
346
 
347
  except Exception as e:
348
  print(f"[fallback_industry] ❌ Failed: {e}")
349
- # βœ… Even on error, write UNKNOWN to Redis so worker doesn't hang
350
  redis_key = f"industry:{org_id}:{source_id}"
351
  event_hub.setex(redis_key, 3600, json.dumps({"industry": "UNKNOWN", "confidence": 0.0}))
352
  return {"industry": "UNKNOWN", "confidence": 0.0}
353
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
354
 
 
355
 
356
  def canonify_df(org_id: str, source_id: str, hours_window: int = 24) -> tuple[pd.DataFrame, str, float]:
357
  """
358
- ENTERPRISE DATA INGESTION PIPELINE (v2.0)
359
- ==========================================
360
-
361
- Transforms raw audit data into queryable canonical format while:
362
- βœ… Preserving ALL original columns (no data loss)
363
- βœ… Mapping recognized fields to canonical schema
364
- βœ… Versioning schema changes for audit & rollback
365
- βœ… Enforcing minimum schema contracts
366
- βœ… Operating transactionally for data integrity
367
- βœ… Handling background worker failures gracefully
368
-
369
- Flow:
370
- 1. Fetch raw audit trail from main.raw_rows
371
- 2. Parse nested JSON (handles {tables: {...}}, {data: [...]}, etc.)
372
- 3. Normalize column names (force string, lowercase, dedupe)
373
- 4. Map to canonical schema BUT keep unmapped columns intact
374
- 5. Learn new column aliases for future mapping improvements
375
- 6. Type-cast canonical fields (timestamp, qty, total, etc.)
376
- 7. Poll Redis for entity type & industry (with fallback)
377
- 8. Version the schema if structure changed
378
- 9. Enforce schema contract (ensure required canonical columns exist)
379
- 10. Transactionally insert into entity-specific table
380
- 11. Return full DataFrame + industry metadata for frontend
381
-
382
- Args:
383
- org_id: Tenant identifier (e.g., "org_synth_123")
384
- source_id: Data source UUID for entity/industry detection
385
- hours_window: Hours of raw data to consider (default: 24h)
386
-
387
- Returns:
388
- tuple: (DataFrame with ALL columns, industry: str, confidence: float)
389
-
390
- Raises:
391
- ValueError: If schema contract is violated (missing required columns)
392
- HTTPException: On critical failures (quota, insertion errors)
393
  """
394
  start_time = datetime.now()
395
  print(f"\n[canonify] πŸš€ Starting pipeline for {org_id}/{source_id}")
396
 
397
- # Load dynamic aliases before mapping
398
  load_dynamic_aliases()
399
-
400
- # 1️⃣ FETCH RAW AUDIT DATA
401
  with get_conn(org_id) as conn:
402
  ensure_raw_table(conn)
403
-
404
- # βœ… FIXED: Calculate cutoff in Python, bind properly
405
  cutoff_time = datetime.now() - timedelta(hours=hours_window)
406
 
407
  try:
@@ -415,26 +313,26 @@ def canonify_df(org_id: str, source_id: str, hours_window: int = 24) -> tuple[pd
415
  except Exception as e:
416
  print(f"[canonify] ❌ SQL read error: {e}")
417
  return pd.DataFrame(), "unknown", 0.0
418
-
419
  if not rows:
420
- print("[canonify] ⚠️ No audit rows found in window")
421
  return pd.DataFrame(), "unknown", 0.0
422
-
423
- # 2️⃣ PARSE NESTED JSON PAYLOADS
424
  parsed, malformed_count = [], 0
425
  for r in rows:
426
  raw = r[0]
427
  if not raw:
428
  malformed_count += 1
429
  continue
430
-
431
  try:
432
  obj = raw if isinstance(raw, (dict, list)) else json.loads(str(raw))
433
  except Exception:
434
  malformed_count += 1
435
  continue
436
-
437
- # Extract rows from various payload structures
438
  if isinstance(obj, dict):
439
  if "rows" in obj and isinstance(obj["rows"], list):
440
  parsed.extend(obj["rows"])
@@ -450,34 +348,31 @@ def canonify_df(org_id: str, source_id: str, hours_window: int = 24) -> tuple[pd
450
  parsed.extend(obj)
451
  else:
452
  malformed_count += 1
453
-
454
  if malformed_count:
455
  print(f"[canonify] ⚠️ Skipped {malformed_count} malformed rows")
456
  if not parsed:
457
  print("[canonify] ❌ No valid data after parsing")
458
  return pd.DataFrame(), "unknown", 0.0
459
-
460
- # 3️⃣ NORMALIZE COLUMN NAMES (Bulletproof)
461
  df = pd.DataFrame(parsed)
462
  df.columns = [str(col).lower().strip() for col in df.columns]
463
  df = df.loc[:, ~df.columns.duplicated()]
464
  print(f"[canonify] πŸ“Š Parsed DataFrame: {len(df)} rows Γ— {len(df.columns)} cols")
465
-
466
- # 4️⃣ MAP TO CANONICAL SCHEMA (Preserve All Columns)
467
- # Build mapping: original_col β†’ canonical_col
468
  mapping, canonical_used = {}, set()
469
  for canon, aliases in CANONICAL.items():
470
  for col in df.columns:
471
  if any(str(alias).lower() in col for alias in aliases):
472
- # If multiple cols map to same canonical (e.g., begin/end datetime),
473
- # keep first as canonical, others stay original
474
  if canon not in canonical_used:
475
  mapping[col] = canon
476
  canonical_used.add(canon)
477
  print(f"[canonify] πŸ”€ Mapped '{col}' β†’ canonical '{canon}'")
478
  break
479
-
480
- # Learn new aliases for future improvements
481
  for col in df.columns:
482
  for canon in CANONICAL.keys():
483
  if str(canon).lower() in col and col not in CANONICAL[canon]:
@@ -485,11 +380,10 @@ def canonify_df(org_id: str, source_id: str, hours_window: int = 24) -> tuple[pd
485
  print(f"[canonify] 🧠 Learned new alias: {canon} ← {col}")
486
 
487
  save_dynamic_aliases()
488
-
489
- # Apply mapping but keep ALL columns
490
  renamed = df.rename(columns=mapping)
491
 
492
- # Build final column list: canonicals first (deduped), then originals
493
  final_columns, seen = [], set()
494
  for col in renamed.columns:
495
  if col in CANONICAL.keys():
@@ -501,8 +395,8 @@ def canonify_df(org_id: str, source_id: str, hours_window: int = 24) -> tuple[pd
501
 
502
  df = renamed[final_columns].copy()
503
  print(f"[canonify] βœ… Kept columns: {list(df.columns)}")
504
-
505
- # 5️⃣ TYPE CONVERSIONS (Best Effort)
506
  try:
507
  if "timestamp" in df:
508
  df["timestamp"] = pd.to_datetime(df["timestamp"], errors="coerce")
@@ -514,24 +408,25 @@ def canonify_df(org_id: str, source_id: str, hours_window: int = 24) -> tuple[pd
514
  if col in df:
515
  df[col] = pd.to_numeric(df[col], errors="coerce").fillna(0)
516
  except Exception as e:
517
- print(f"[canonify] ⚠️ Type conversion warning (non-critical): {e}")
518
-
519
- # 6️⃣ DETECT ENTITY & INDUSTRY (with worker fallback)
520
  entity_info = poll_for_entity(org_id, source_id)
521
  entity_type = entity_info["entity_type"]
522
 
 
523
  industry_info = poll_for_industry(org_id, source_id)
524
  industry = industry_info["industry"]
525
  industry_confidence = industry_info["confidence"]
526
  print(f"[canonify] 🎯 Entity: {entity_type}, Industry: {industry} ({industry_confidence:.2%})")
527
-
528
- # 8️⃣ SCHEMA VERSIONING & TRANSACTIONAL INSERT
529
  os.makedirs("./db", exist_ok=True)
530
 
531
  with transactional_conn(org_id) as duck:
532
  ensure_schema_versions_table(duck)
533
 
534
- # 8a) Detect schema changes
535
  current_schema = {col: map_pandas_to_duck(col, df[col]) for col in df.columns}
536
  existing_schema_row = duck.execute("""
537
  SELECT schema_json, version_id FROM main.schema_versions
@@ -546,7 +441,6 @@ def canonify_df(org_id: str, source_id: str, hours_window: int = 24) -> tuple[pd
546
 
547
  version_id = None
548
  if is_new_schema:
549
- # Manual auto-increment for DuckDB 0.10.3 compatibility
550
  version_id = duck.execute("""
551
  INSERT INTO main.schema_versions
552
  (version_id, table_name, schema_json, status)
@@ -555,11 +449,10 @@ def canonify_df(org_id: str, source_id: str, hours_window: int = 24) -> tuple[pd
555
  """, (f"{entity_type}_canonical", json.dumps(current_schema))).fetchone()[0]
556
  print(f"[canonify] πŸ“ Created schema v{version_id} for {entity_type}_canonical")
557
 
558
- # 8b) Ensure table exists with current schema
559
  table_name = ensure_canonical_table(duck, df, entity_type)
560
 
561
- # 8c) Transactional insert
562
- # 8d) Clean and insert data
563
  if not df.empty:
564
  table_info = duck.execute(f"PRAGMA table_info('{table_name}')").fetchall()
565
  table_cols = [str(r[0]) for r in table_info]
@@ -567,7 +460,6 @@ def canonify_df(org_id: str, source_id: str, hours_window: int = 24) -> tuple[pd
567
  df_to_insert = df[[col for col in df.columns if col in table_cols]]
568
 
569
  if not df_to_insert.empty:
570
- # πŸ”§ CRITICAL: Replace NaN/Infinity with None for JSON compliance
571
  df_to_insert = df_to_insert.replace([np.inf, -np.inf, np.nan], None)
572
 
573
  cols_str = ", ".join(df_to_insert.columns)
@@ -579,7 +471,7 @@ def canonify_df(org_id: str, source_id: str, hours_window: int = 24) -> tuple[pd
579
  )
580
  print(f"[canonify] πŸ’Ύ Inserted {len(df_to_insert)} rows into {table_name}")
581
 
582
- # 8d) Mark schema as applied post-insert
583
  if is_new_schema and version_id:
584
  try:
585
  duck.execute("""
@@ -589,61 +481,34 @@ def canonify_df(org_id: str, source_id: str, hours_window: int = 24) -> tuple[pd
589
  """, (version_id,))
590
  print(f"[canonify] βœ… Schema v{version_id} marked as applied")
591
  except Exception as e:
592
- print(f"[canonify] ⚠️ Schema update warning (non-critical): {e}")
593
- # At the very end of canonify_df function, line ~470
594
- df = df.replace([np.inf, -np.inf, np.nan], None) # Clean for JSON response
 
595
  duration_ms = (datetime.now() - start_time).total_seconds() * 1000
596
  print(f"[canonify] βœ… Pipeline complete in {duration_ms:.2f}ms for {org_id}")
597
-
598
- # After line: print(f"[canonify] βœ… Pipeline complete in {duration_ms:.2f}ms")
599
- if not df.empty:
600
- # At the end of the canonify pipeline: ensure Redis keys for entity/industry
601
- # are present (defensive) and nudge workers via stream AFTER commit.
602
- try:
603
- e_key = f"entity:{org_id}:{source_id}"
604
- i_key = f"industry:{org_id}:{source_id}"
605
- entity_payload = json.dumps({"entity_type": entity_type, "confidence": 1.0})
606
- industry_payload = json.dumps({"industry": industry, "confidence": industry_confidence})
607
-
608
- pipe = event_hub.pipeline()
609
- if pipe is not None:
610
- try:
611
- pipe.setex(e_key, 3600, entity_payload)
612
- pipe.setex(i_key, 3600, industry_payload)
613
- # per-source readiness nudge
614
- msg = json.dumps({"org_id": org_id, "source_id": source_id, "status": "ready"})
615
- pipe.xadd(event_hub.stream_key(org_id, source_id), {"message": msg})
616
- pipe.execute()
617
- except Exception as e:
618
- print(f"[canonify] ⚠️ Pipeline nudge failed: {e}")
619
- # Fallback to sequential writes
620
- try:
621
- event_hub.setex(e_key, 3600, entity_payload)
622
- event_hub.setex(i_key, 3600, industry_payload)
623
- except Exception as re:
624
- print(f"[canonify] ❌ Redis setex fallback failed: {re}")
625
- else:
626
- # Pipeline not available; write sequentially
627
- try:
628
- event_hub.setex(e_key, 3600, entity_payload)
629
- event_hub.setex(i_key, 3600, industry_payload)
630
- msg = json.dumps({"org_id": org_id, "source_id": source_id, "status": "ready"})
631
- event_hub.redis.xadd(event_hub.stream_key(org_id, source_id), {"message": msg})
632
- except Exception as e:
633
- print(f"[canonify] ❌ Redis nudge failed: {e}")
634
-
635
- # Emit central trigger for worker manager
636
- try:
637
- event_hub.emit_analytics_trigger(org_id, source_id, {
638
- "type": "kpi_compute",
639
- "entity_type": entity_type,
640
- "industry": industry,
641
- "rows_inserted": len(df)
642
- })
643
- print(f"[canonify] πŸš€ Triggered analytics for {source_id}")
644
- except Exception as e:
645
- print(f"[canonify] ⚠️ Analytics trigger failed (non-critical): {e}")
646
- except Exception as e:
647
- print(f"[canonify] ⚠️ Finalization nudge failed: {e}")
648
 
649
  return df, industry, industry_confidence
 
 
1
  import os
2
  import json
 
3
  import pandas as pd
4
  import numpy as np
5
  from datetime import datetime, timedelta
6
+ from concurrent.futures import ThreadPoolExecutor
 
 
7
  import time
8
+
9
+ from app.db import get_conn, ensure_raw_table, transactional_conn, ensure_schema_versions_table
10
+ from app.hybrid_entity_detector import hybrid_detect_entity_type
11
  from app.core.event_hub import event_hub
12
 
13
+ # ---------------------- Canonical Schema ---------------------- #
14
  CANONICAL = {
15
  "timestamp": ["timestamp", "date", "sale_date", "created_at"],
16
  "product_id": ["sku", "barcode", "plu", "product_id", "item_code"],
 
24
 
25
  ALIAS_FILE = "./db/alias_memory.json"
26
 
27
+ # Module-level caches
28
+ _ENTITY_CACHE = {}
29
+ _INDUSTRY_CACHE = {}
30
+
31
  def map_pandas_to_duck(col: str, series: pd.Series) -> str:
32
+ """Map pandas dtype to DuckDB type"""
33
  if pd.api.types.is_bool_dtype(series): return "BOOLEAN"
34
  if pd.api.types.is_integer_dtype(series): return "BIGINT"
35
  if pd.api.types.is_float_dtype(series): return "DOUBLE"
36
  if pd.api.types.is_datetime64_any_dtype(series): return "TIMESTAMP"
37
  return "VARCHAR"
38
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
39
  def load_dynamic_aliases() -> None:
40
+ """Load column alias mappings from disk"""
41
  if os.path.exists(ALIAS_FILE):
42
  try:
43
  with open(ALIAS_FILE) as f:
 
48
  else:
49
  CANONICAL[k] = v
50
  except Exception as e:
51
+ print(f"[mapper] ⚠️ Failed to load alias memory: {e}")
52
 
53
  def save_dynamic_aliases() -> None:
54
+ """Save column alias mappings to disk"""
55
  os.makedirs(os.path.dirname(ALIAS_FILE), exist_ok=True)
56
  with open(ALIAS_FILE, "w") as f:
57
  json.dump(CANONICAL, f, indent=2)
58
+
59
+ # ==================== ENTITY & INDUSTRY DETECTION ====================
60
+
61
+ def poll_for_entity(org_id: str, source_id: str, timeout: int = 10) -> dict:
62
  """
63
+ Poll Redis for entity detection result.
64
+ Uses cache first, then Redis, then fallback.
65
  """
 
66
  cache_key = (org_id, source_id)
67
+
68
+ # 1. Check cache (zero Redis calls)
69
  if cache_key in _ENTITY_CACHE:
70
  print(f"[poll] πŸ’Ύ CACHE HIT: {cache_key}")
71
  return _ENTITY_CACHE[cache_key]
 
73
  entity_key = f"entity:{org_id}:{source_id}"
74
  print(f"[poll] ⏳ Polling for key: {entity_key}")
75
 
76
+ # 2. Try Redis (immediate)
77
  data = event_hub.get_key(entity_key)
78
  if data:
79
  entity_info = json.loads(data)
80
+ print(f"[poll] βœ… Redis hit: {entity_info['entity_type']}")
81
  _ENTITY_CACHE[cache_key] = entity_info
82
  return entity_info
83
 
84
+ # 3. Sleep briefly
85
+ print("[poll] πŸ”„ First check failed, sleeping 3s...")
86
+ time.sleep(3.0)
87
 
88
+ # 4. Try Redis again
89
  data = event_hub.get_key(entity_key)
90
  if data:
91
  entity_info = json.loads(data)
 
92
  _ENTITY_CACHE[cache_key] = entity_info
93
  return entity_info
94
 
95
+ # 5. Fallback (single DuckDB query for both entity & industry)
96
+ print("[poll] ⚠️ Using combined fallback")
 
 
 
97
  entity_info, industry_info = _fallback_combined(org_id, source_id)
98
+
99
+ # 6. Populate industry cache too (since we have it)
100
+ _INDUSTRY_CACHE[cache_key] = industry_info
101
+
102
  return entity_info
103
 
104
+ def poll_for_industry(org_id: str, source_id: str, timeout: int = 10) -> dict:
105
+ """
106
+ Poll Redis for industry detection result.
107
+ Reuses data from entity poll to avoid duplicate Redis calls.
108
+ """
109
+ cache_key = (org_id, source_id)
110
+
111
+ # 1. Check cache (filled by poll_for_entity)
112
+ if cache_key in _INDUSTRY_CACHE:
113
+ print(f"[poll_industry] πŸ’Ύ CACHE HIT: {cache_key}")
114
+ return _INDUSTRY_CACHE[cache_key]
115
+
116
+ # 2. If cache missed but entity was polled, the fallback already ran
117
+ # So just check Redis one more time
118
+ industry_key = f"industry:{org_id}:{source_id}"
119
+ data = event_hub.get_key(industry_key)
120
+
121
+ if data:
122
+ industry_info = json.loads(data)
123
+ _INDUSTRY_CACHE[cache_key] = industry_info
124
+ return industry_info
125
+
126
+ # 3. Rare: fallback failed to write industry, run emergency fallback
127
+ print("[poll_industry] ⚠️ Cache miss, running emergency fallback")
128
+ industry_info = _fallback_industry_detection(org_id, source_id)
129
+ _INDUSTRY_CACHE[cache_key] = industry_info
130
+
131
+ return industry_info
132
 
133
  def _fallback_combined(org_id: str, source_id: str) -> tuple[dict, dict]:
134
+ """
135
+ SINGLE DuckDB query to detect BOTH entity and industry.
136
+ Writes BOTH keys to Redis atomically.
137
+ Updates caches WITHOUT immediately invalidating them.
 
 
 
138
  """
139
  print(f"[fallback_combined] 🚨 Running combined fallback for {org_id}/{source_id}")
140
+
141
+ # Default values
142
  entity_info = {"entity_type": "UNKNOWN", "confidence": 0.0}
143
  industry_info = {"industry": "UNKNOWN", "confidence": 0.0}
144
+
145
  try:
146
  conn = get_conn(org_id)
147
  rows = conn.execute("""
 
150
  WHERE row_data IS NOT NULL
151
  USING SAMPLE 100
152
  """).fetchall()
153
+
154
  if rows:
155
  parsed = [json.loads(r[0]) for r in rows if r[0]]
156
  df = pd.DataFrame(parsed)
157
  df.columns = [str(col).lower().strip() for col in df.columns]
158
+
159
+ # Parallel detection
160
+ def detect_entity():
 
 
161
  try:
162
  return hybrid_detect_entity_type(org_id, df, f"{source_id}.json")
163
  except Exception as e:
164
+ print(f"[fallback] Entity detection failed: {e}")
165
  return ("UNKNOWN", 0.0, False)
166
+
167
+ def detect_industry():
168
  try:
169
  from app.hybrid_industry_detector import hybrid_detect_industry_type
170
  return hybrid_detect_industry_type(org_id, df, source_id)
171
  except Exception as e:
172
+ print(f"[fallback] Industry detection failed: {e}")
173
  return ("UNKNOWN", 0.0, False)
174
+
175
  with ThreadPoolExecutor(max_workers=2) as ex:
176
+ ent_future = ex.submit(detect_entity)
177
+ ind_future = ex.submit(detect_industry)
178
+
179
+ entity_type, ent_conf, _ = ent_future.result()
180
+ industry, ind_conf, _ = ind_future.result()
181
+
182
+ entity_info = {"entity_type": entity_type, "confidence": ent_conf}
183
+ industry_info = {"industry": industry, "confidence": ind_conf}
184
+
185
+ print(f"[fallback] βœ… Entity: {entity_type} ({ent_conf:.2%}), Industry: {industry} ({ind_conf:.2%})")
186
+
187
  except Exception as e:
188
+ print(f"[fallback_combined] ❌ Failed: {e}")
189
+
190
+ # GUARANTEE: Write to Redis (pipeline for atomicity)
 
 
 
191
  try:
192
+ e_key = f"entity:{org_id}:{source_id}"
193
+ i_key = f"industry:{org_id}:{source_id}"
194
+
195
+ pipe = event_hub.redis.pipeline()
196
+ pipe.setex(e_key, 3600, json.dumps(entity_info))
197
+ pipe.setex(i_key, 3600, json.dumps(industry_info))
198
+ pipe.execute()
199
+
200
+ print(f"[fallback] πŸ’Ύ WRITTEN to Redis: {e_key}, {i_key}")
201
+
202
+ except Exception as re:
203
+ print(f"[fallback] ❌ Redis write failed: {re}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
204
 
205
+ # Update caches (keep them valid!)
 
 
206
  cache_key = (org_id, source_id)
207
+ _ENTITY_CACHE[cache_key] = entity_info
208
+ _INDUSTRY_CACHE[cache_key] = industry_info
209
 
210
+ return entity_info, industry_info
211
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
212
  def _fallback_industry_detection(org_id: str, source_id: str) -> dict:
213
  """
214
+ Emergency fallback for industry only (rarely used).
215
+ Should only trigger if combined fallback fails.
 
216
  """
217
+ print(f"[fallback_industry] 🚨 Emergency fallback for {org_id}/{source_id}")
218
 
219
  try:
220
  conn = get_conn(org_id)
 
226
  """).fetchall()
227
 
228
  if not rows:
229
+ print("[fallback_industry] ❌ No data found")
230
+ return {"industry": "UNKNOWN", "confidence": 0.0}
231
+
232
+ parsed = [json.loads(r[0]) for r in rows if r[0]]
233
+ df = pd.DataFrame(parsed)
234
+ df.columns = [str(col).lower().strip() for col in df.columns]
235
+
236
+ from app.hybrid_industry_detector import hybrid_detect_industry_type
237
+ industry, confidence, _ = hybrid_detect_industry_type(org_id, df, source_id)
 
 
 
 
 
238
 
239
+ industry_info = {"industry": industry, "confidence": confidence}
240
+ print(f"[fallback_industry] βœ… Detected: {industry} ({confidence:.2%})")
241
+
242
+ # Write to Redis
243
  redis_key = f"industry:{org_id}:{source_id}"
244
  event_hub.setex(redis_key, 3600, json.dumps(industry_info))
245
+ print(f"[fallback_industry] πŸ’Ύ WRITTEN to Redis: {redis_key}")
 
 
 
 
246
 
247
  return industry_info
248
 
249
  except Exception as e:
250
  print(f"[fallback_industry] ❌ Failed: {e}")
251
+ # Even on error, write UNKNOWN
252
  redis_key = f"industry:{org_id}:{source_id}"
253
  event_hub.setex(redis_key, 3600, json.dumps({"industry": "UNKNOWN", "confidence": 0.0}))
254
  return {"industry": "UNKNOWN", "confidence": 0.0}
255
 
256
+ # ==================== ENTITY TABLE CREATION ====================
257
+
258
def ensure_canonical_table(duck, df: pd.DataFrame, entity_type: str) -> str:
    """Create (if needed) the entity-specific canonical table and align its columns with *df*.

    Args:
        duck: Open DuckDB connection (expected to be inside a transaction).
        df: Incoming batch; every one of its columns must exist on the table.
        entity_type: Detected entity type, e.g. "sales" -> main.sales_canonical.

    Returns:
        Fully qualified table name that now holds every column of *df*.
    """
    table_name = f"main.{entity_type}_canonical"

    # Base table with surrogate key + ingestion timestamp.
    duck.execute(f"""
        CREATE TABLE IF NOT EXISTS {table_name} (
            id UUID DEFAULT uuid(),
            _ingested_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
        )
    """)

    # BUGFIX: PRAGMA table_info rows are (cid, name, type, notnull, dflt_value, pk).
    # The column NAME is at index 1, not 0 — using r[0] compared integer cids
    # against column names, so the membership test below never matched and every
    # ALTER was re-attempted (and swallowed by the except) on each batch.
    existing_cols_raw = duck.execute(f"PRAGMA table_info('{table_name}')").fetchall()
    existing_cols = {str(r[1]).lower() for r in existing_cols_raw}

    # Add any column present in the batch but missing from the table.
    for col in df.columns:
        col_name = str(col).lower().strip()
        if col_name not in existing_cols:
            try:
                dtype = map_pandas_to_duck(col_name, df[col])
                print(f"[mapper] βž• Adding column '{col_name}:{dtype}'")
                duck.execute(f"ALTER TABLE {table_name} ADD COLUMN {col_name} {dtype}")
            except Exception as e:
                # Non-fatal: a concurrent writer may have added it first.
                print(f"[mapper] ⚠️ Skipping column {col_name}: {e}")

    return table_name
286
 
287
+ # ==================== MAIN PIPELINE ====================
288
 
289
  def canonify_df(org_id: str, source_id: str, hours_window: int = 24) -> tuple[pd.DataFrame, str, float]:
290
  """
291
+ ENTERPRISE DATA INGESTION PIPELINE
292
+ Safe, idempotent, and Redis-efficient.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
293
  """
294
  start_time = datetime.now()
295
  print(f"\n[canonify] πŸš€ Starting pipeline for {org_id}/{source_id}")
296
 
297
+ # Load aliases
298
  load_dynamic_aliases()
299
+
300
+ # 1️⃣ FETCH RAW DATA
301
  with get_conn(org_id) as conn:
302
  ensure_raw_table(conn)
 
 
303
  cutoff_time = datetime.now() - timedelta(hours=hours_window)
304
 
305
  try:
 
313
  except Exception as e:
314
  print(f"[canonify] ❌ SQL read error: {e}")
315
  return pd.DataFrame(), "unknown", 0.0
316
+
317
  if not rows:
318
+ print("[canonify] ⚠️ No audit rows found")
319
  return pd.DataFrame(), "unknown", 0.0
320
+
321
+ # 2️⃣ PARSE JSON
322
  parsed, malformed_count = [], 0
323
  for r in rows:
324
  raw = r[0]
325
  if not raw:
326
  malformed_count += 1
327
  continue
328
+
329
  try:
330
  obj = raw if isinstance(raw, (dict, list)) else json.loads(str(raw))
331
  except Exception:
332
  malformed_count += 1
333
  continue
334
+
335
+ # Extract rows from various structures
336
  if isinstance(obj, dict):
337
  if "rows" in obj and isinstance(obj["rows"], list):
338
  parsed.extend(obj["rows"])
 
348
  parsed.extend(obj)
349
  else:
350
  malformed_count += 1
351
+
352
  if malformed_count:
353
  print(f"[canonify] ⚠️ Skipped {malformed_count} malformed rows")
354
  if not parsed:
355
  print("[canonify] ❌ No valid data after parsing")
356
  return pd.DataFrame(), "unknown", 0.0
357
+
358
+ # 3️⃣ NORMALIZE COLUMNS
359
  df = pd.DataFrame(parsed)
360
  df.columns = [str(col).lower().strip() for col in df.columns]
361
  df = df.loc[:, ~df.columns.duplicated()]
362
  print(f"[canonify] πŸ“Š Parsed DataFrame: {len(df)} rows Γ— {len(df.columns)} cols")
363
+
364
+ # 4️⃣ MAP TO CANONICAL SCHEMA
 
365
  mapping, canonical_used = {}, set()
366
  for canon, aliases in CANONICAL.items():
367
  for col in df.columns:
368
  if any(str(alias).lower() in col for alias in aliases):
 
 
369
  if canon not in canonical_used:
370
  mapping[col] = canon
371
  canonical_used.add(canon)
372
  print(f"[canonify] πŸ”€ Mapped '{col}' β†’ canonical '{canon}'")
373
  break
374
+
375
+ # Learn new aliases
376
  for col in df.columns:
377
  for canon in CANONICAL.keys():
378
  if str(canon).lower() in col and col not in CANONICAL[canon]:
 
380
  print(f"[canonify] 🧠 Learned new alias: {canon} ← {col}")
381
 
382
  save_dynamic_aliases()
383
+
384
+ # Apply mapping, keep all columns
385
  renamed = df.rename(columns=mapping)
386
 
 
387
  final_columns, seen = [], set()
388
  for col in renamed.columns:
389
  if col in CANONICAL.keys():
 
395
 
396
  df = renamed[final_columns].copy()
397
  print(f"[canonify] βœ… Kept columns: {list(df.columns)}")
398
+
399
+ # 5️⃣ TYPE CONVERSIONS
400
  try:
401
  if "timestamp" in df:
402
  df["timestamp"] = pd.to_datetime(df["timestamp"], errors="coerce")
 
408
  if col in df:
409
  df[col] = pd.to_numeric(df[col], errors="coerce").fillna(0)
410
  except Exception as e:
411
+ print(f"[canonify] ⚠️ Type conversion warning: {e}")
412
+
413
+ # 6️⃣ DETECT ENTITY & INDUSTRY
414
  entity_info = poll_for_entity(org_id, source_id)
415
  entity_type = entity_info["entity_type"]
416
 
417
+ # Industry is fetched from cache filled by poll_for_entity
418
  industry_info = poll_for_industry(org_id, source_id)
419
  industry = industry_info["industry"]
420
  industry_confidence = industry_info["confidence"]
421
  print(f"[canonify] 🎯 Entity: {entity_type}, Industry: {industry} ({industry_confidence:.2%})")
422
+
423
+ # 7️⃣ SCHEMA VERSIONING & TRANSACTIONAL INSERT
424
  os.makedirs("./db", exist_ok=True)
425
 
426
  with transactional_conn(org_id) as duck:
427
  ensure_schema_versions_table(duck)
428
 
429
+ # Detect schema changes
430
  current_schema = {col: map_pandas_to_duck(col, df[col]) for col in df.columns}
431
  existing_schema_row = duck.execute("""
432
  SELECT schema_json, version_id FROM main.schema_versions
 
441
 
442
  version_id = None
443
  if is_new_schema:
 
444
  version_id = duck.execute("""
445
  INSERT INTO main.schema_versions
446
  (version_id, table_name, schema_json, status)
 
449
  """, (f"{entity_type}_canonical", json.dumps(current_schema))).fetchone()[0]
450
  print(f"[canonify] πŸ“ Created schema v{version_id} for {entity_type}_canonical")
451
 
452
+ # Ensure table exists
453
  table_name = ensure_canonical_table(duck, df, entity_type)
454
 
455
+ # Insert data
 
456
  if not df.empty:
457
  table_info = duck.execute(f"PRAGMA table_info('{table_name}')").fetchall()
458
  table_cols = [str(r[0]) for r in table_info]
 
460
  df_to_insert = df[[col for col in df.columns if col in table_cols]]
461
 
462
  if not df_to_insert.empty:
 
463
  df_to_insert = df_to_insert.replace([np.inf, -np.inf, np.nan], None)
464
 
465
  cols_str = ", ".join(df_to_insert.columns)
 
471
  )
472
  print(f"[canonify] πŸ’Ύ Inserted {len(df_to_insert)} rows into {table_name}")
473
 
474
+ # Mark schema as applied
475
  if is_new_schema and version_id:
476
  try:
477
  duck.execute("""
 
481
  """, (version_id,))
482
  print(f"[canonify] βœ… Schema v{version_id} marked as applied")
483
  except Exception as e:
484
+ print(f"[canonify] ⚠️ Schema update warning: {e}")
485
+
486
+ # 8️⃣ FINAL: Clean DataFrame for response
487
+ df = df.replace([np.inf, -np.inf, np.nan], None)
488
  duration_ms = (datetime.now() - start_time).total_seconds() * 1000
489
  print(f"[canonify] βœ… Pipeline complete in {duration_ms:.2f}ms for {org_id}")
490
+
491
+ # 9️⃣ SINGLE, SAFE WORKER TRIGGER (idempotent)
492
+ try:
493
+ # Defensive: ensure keys exist (they should from poll_for_entity)
494
+ e_key = f"entity:{org_id}:{source_id}"
495
+ i_key = f"industry:{org_id}:{source_id}"
496
+
497
+ if not event_hub.exists(e_key) or not event_hub.exists(i_key):
498
+ print(f"[canonify] ⚠️ Keys missing, running fallback to ensure")
499
+ _fallback_combined(org_id, source_id)
500
+
501
+ # 🎯 ONE trigger message to worker manager
502
+ event_hub.emit_analytics_trigger(org_id, source_id, {
503
+ "type": "kpi_compute",
504
+ "entity_type": entity_type,
505
+ "industry": industry,
506
+ "rows_inserted": len(df),
507
+ "timestamp": datetime.now().isoformat()
508
+ })
509
+ print(f"[canonify] πŸš€ Triggered analytics for {source_id}")
510
+
511
+ except Exception as e:
512
+ print(f"[canonify] ⚠️ Analytics trigger failed: {e}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
513
 
514
  return df, industry, industry_confidence
app/tasks/analytics_worker.py CHANGED
@@ -1,29 +1,39 @@
1
- # app/tasks/analytics_worker.py
2
  import asyncio
3
  import json
 
 
 
 
 
 
4
  import pandas as pd
5
  import logging
6
- from datetime import datetime,timedelta
7
- from typing import Dict, Any
8
- import time
9
 
10
  from app.core.event_hub import event_hub
11
  from app.db import get_conn
12
- from app.schemas.org_schema import OrgSchema # AI schema mapper
13
- from app.service.column_embedding_service import ColumnEmbeddingService # Vector engine
14
- from app.service.vector_service import VectorService # AI query storage
15
  from app.engine.kpi_calculators.registry import get_kpi_calculator
16
- from app.service.embedding_service import EmbeddingService # HF API fallback
17
 
18
- logging.basicConfig(level=logging.INFO)
 
 
 
 
19
  logger = logging.getLogger(__name__)
20
 
 
 
 
21
 
22
  class AnalyticsWorker:
23
  """
24
  🧠+πŸš€ Hybrid: Deep reasoning + Async efficiency
25
- - Solves column mapping for any data shape
26
- - Non-blocking, cached, zero downtime
 
27
  """
28
 
29
  def __init__(self, org_id: str, source_id: str, hours_window: int = 24):
@@ -32,122 +42,208 @@ class AnalyticsWorker:
32
  self.hours_window = hours_window
33
 
34
  # Core engines
35
- self.schema = OrgSchema(org_id) # AI-powered schema resolver
36
- self.col_embedder = ColumnEmbeddingService() # For column mapping
37
- self.txn_embedder = EmbeddingService() # For transaction embeddings
38
- self.vector_service = VectorService(org_id) # For AI queries
 
 
 
 
 
 
 
39
 
40
- self.computed_at = None
41
- self._entity_type = None
42
 
43
  async def run(self) -> Dict[str, Any]:
44
  """
45
  🎯 THE ENGINE - Zero gaps, pure flow
46
 
47
- 1. Load data from DuckDB (wait for table)
48
- 2. Discover column mapping (AI, cached)
49
- 3. Alias columns for KPI calculator
50
- 4. Embed transactions (async, for AI queries)
51
- 5. Compute KPIs (industry-aware)
52
- 6. Publish to Redis (UI + AI channels)
53
- 7. Cache results (5 min)
 
 
 
54
  """
55
  start_time = datetime.now()
56
- logger.info(f"\n[WORKER] πŸš€ STARTING {self.org_id}/{self.source_id}")
57
- # 🎯 NEW: Wait for entity/industry keys to exist
58
- await self._wait_for_entity_and_industry()
 
 
 
 
 
 
 
 
 
59
  try:
60
- # 1️⃣ LOAD DATA (handles missing tables)
 
 
 
 
 
61
  df = await self._load_dataframe()
62
  if df.empty:
63
  await self._publish_status("error", "No data")
64
- return {"error": "No data"}
65
 
66
  logger.info(f"[WORKER] πŸ“Š Loaded {len(df)} rows Γ— {len(df.columns)} cols")
67
 
68
-
69
- # Fast from cache (~0ms), slow on first run (~30s)
70
  mapping = await self._discover_schema(df)
71
  if not mapping:
72
  await self._publish_status("error", "Schema discovery failed")
73
- return {"error": "No schema mapping"}
74
 
75
- logger.info(f"[WORKER] πŸ”€ Mapping: {list(mapping.items())[:5]}...") # Log first 5
76
 
77
- # 3️⃣ ALIAS COLUMNS (clean code)
78
  df = self._alias_columns(df, mapping)
79
 
80
- # 4️⃣ EMBED TRANSACTIONS (Elon's rocket - async)
81
- # Does NOT block KPI computation
82
  embed_task = asyncio.create_task(
83
- self._embed_transactions(df.head(1000)), # Top 1000 for performance
84
- name=f"embed-{self.org_id}"
85
  )
86
 
87
- # 5️⃣ COMPUTE KPIs (industry-aware)
88
  industry = await self._get_industry()
89
  calculator = get_kpi_calculator(industry, self.org_id, df, self.source_id)
90
-
91
- # Run CPU-heavy work in thread pool
92
  results = await asyncio.to_thread(calculator.compute_all)
93
- self.computed_at = datetime.now()
94
 
95
- logger.info(f"[WORKER] βœ… KPIs computed in {(self.computed_at - start_time).total_seconds():.2f}s")
 
 
96
 
97
- # 6️⃣ PUBLISH TO REDIS (multiple channels)
98
  await self._publish(results)
99
 
100
- # 7️⃣ CACHE (5 min TTL)
101
- self._cache(results)
 
 
 
102
 
103
- # Wait for embeddings (non-critical)
104
  try:
105
  await asyncio.wait_for(embed_task, timeout=30)
106
  logger.info("[WORKER] βœ… Embeddings completed")
107
  except asyncio.TimeoutError:
108
  logger.warning("[WORKER] ⚠️ Embedding timeout, but KPIs published")
109
 
110
- duration = (self.computed_at - start_time).total_seconds()
111
- logger.info(f"[WORKER] 🎯 COMPLETE: {duration:.2f}s for {self.org_id}")
112
-
113
  return results
114
 
115
  except Exception as e:
116
  logger.error(f"[WORKER] ❌ CRITICAL: {e}", exc_info=True)
117
  await self._publish_status("error", str(e))
118
- return {"error": str(e)}
 
 
 
 
119
 
120
- # ==================== INTERNAL METHODS ====================
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
121
 
122
  async def _load_dataframe(self) -> pd.DataFrame:
123
- """🐒 Sync load with table readiness check"""
124
  return await asyncio.to_thread(self._sync_load_dataframe)
125
 
126
  def _sync_load_dataframe(self) -> pd.DataFrame:
127
- """Waits up to 30s for table + data"""
128
  conn = None
129
  MAX_WAIT = 30
130
- RETRY_INTERVAL = 2
131
 
132
  try:
133
- # Get entity type from hub-backed Redis
134
- entity_key = f"entity:{self.org_id}:{self.source_id}"
135
- entity_info = event_hub.get_key(entity_key)
136
-
137
  if not entity_info:
138
- logger.warning(f"[LOAD] No entity info: {entity_key}")
139
  return pd.DataFrame()
140
 
141
- self._entity_type = json.loads(entity_info)["entity_type"]
142
  table_name = f"main.{self._entity_type}_canonical"
143
  cutoff = datetime.now() - timedelta(hours=self.hours_window)
144
 
145
  conn = get_conn(self.org_id)
 
146
 
147
- # Wait for table + data
148
  start = time.time()
149
  while (time.time() - start) < MAX_WAIT:
150
  try:
 
151
  count = conn.execute(
152
  f"SELECT COUNT(*) FROM {table_name} WHERE timestamp >= ?",
153
  [cutoff]
@@ -156,19 +252,23 @@ class AnalyticsWorker:
156
  if count > 0:
157
  logger.info(f"[LOAD] Table ready: {count} rows (waited {(time.time() - start):.1f}s)")
158
  break
159
- logger.info(f"[LOAD] Table empty (waited {(time.time() - start):.1f}s)")
 
 
160
  except Exception as e:
161
  if "does not exist" in str(e).lower():
162
- logger.info(f"[LOAD] Table doesn't exist (waited {(time.time() - start):.1f}s)")
163
  else:
164
  logger.warning(f"[LOAD] Error: {e}")
165
 
166
- time.sleep(RETRY_INTERVAL)
 
 
167
  else:
168
  logger.error(f"[LOAD] Timeout after {MAX_WAIT}s")
169
  return pd.DataFrame()
170
 
171
- # Load data
172
  df = conn.execute(
173
  f"SELECT * FROM {table_name} WHERE timestamp >= ? ORDER BY timestamp DESC",
174
  [cutoff]
@@ -188,20 +288,95 @@ class AnalyticsWorker:
188
  except:
189
  pass
190
 
191
- async def _discover_schema(self, df: pd.DataFrame) -> Dict[str, str]:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
192
  """
193
- 🧠 Einstein's discovery engine
194
- Pattern β†’ Vector β†’ LLM (3-tier)
195
  """
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
196
  try:
197
- # Fast: Redis cache (via hub)
198
  cache_key = f"schema:mapping:{self.org_id}"
 
 
199
  if cached := event_hub.get_key(cache_key):
200
- logger.info("[SCHEMA] Cache hit")
201
  return json.loads(cached)
202
 
203
  # Slow: AI discovery
204
- logger.info("[SCHEMA] Cache miss, discovering...")
205
  mapping = self.schema.get_mapping()
206
 
207
  if not mapping:
@@ -210,38 +385,39 @@ class AnalyticsWorker:
210
 
211
  # Cache for 24h
212
  event_hub.setex(cache_key, 86400, json.dumps(mapping))
213
- logger.info(f"[SCHEMA] Discovered {len(mapping)} mappings")
214
 
215
  return mapping
216
 
217
  except Exception as e:
218
- logger.error(f"[SCHEMA] Discovery failed: {e}", exc_info=True)
219
  return {}
220
 
221
  def _alias_columns(self, df: pd.DataFrame, mapping: Dict[str, str]) -> pd.DataFrame:
222
- """πŸ”€ Renames actual columns to semantic names"""
223
  try:
224
- rename_map = {actual: semantic for semantic, actual in mapping.items() if actual in df.columns}
 
 
 
 
225
 
226
  if not rename_map:
227
  logger.warning("[ALIAS] No columns to alias")
228
  return df
229
 
230
- logger.info(f"[ALIAS] Renaming {len(rename_map)} columns: {rename_map}")
231
  return df.rename(columns=rename_map)
232
 
233
  except Exception as e:
234
- logger.error(f"[ALIAS] Error: {e}")
235
  return df
236
 
237
  async def _embed_transactions(self, df: pd.DataFrame):
238
- """
239
- πŸš€ Elon's vector engine: Embeds for AI queries
240
- Non-critical, runs async
241
- """
242
  try:
243
  if df.empty:
244
- logger.warning("[EMBED] No data")
245
  return
246
 
247
  # Build semantic texts
@@ -268,10 +444,10 @@ class AnalyticsWorker:
268
  })
269
 
270
  if not texts:
271
- logger.warning("[EMBED] No valid texts")
272
  return
273
 
274
- # Generate embeddings (HF API or local)
275
  logger.info(f"[EMBED] Generating {len(texts)} embeddings...")
276
  embeddings = []
277
 
@@ -293,126 +469,257 @@ class AnalyticsWorker:
293
  logger.info(f"[EMBED] βœ… Stored {len(embeddings)} vectors")
294
 
295
  except Exception as e:
296
- logger.error(f"[EMBED] Failed: {e}", exc_info=True)
297
  # Non-critical - don't raise
298
 
299
- async def _get_industry(self) -> str:
300
- """Get industry from Redis"""
 
 
301
  try:
302
- key = f"industry:{self.org_id}:{self.source_id}"
303
- if data := event_hub.get_key(key):
304
- return json.loads(data).get("industry", "supermarket").lower()
305
- return "supermarket"
306
- except:
307
- return "supermarket"
308
- async def _wait_for_entity_and_industry(self):
309
- """Block until entity and industry are detected (max 30s)"""
310
- max_wait = 30
311
- start = time.time()
312
-
313
- while (time.time() - start) < max_wait:
314
- entity_key = f"entity:{self.org_id}:{self.source_id}"
315
- industry_key = f"industry:{self.org_id}:{self.source_id}"
316
-
317
- try:
318
- # If in-memory cache exists but Redis does not, invalidate cache
319
- from app.mapper import _ENTITY_CACHE, _INDUSTRY_CACHE
320
-
321
- cache_ent = _ENTITY_CACHE.get((self.org_id, self.source_id))
322
- cache_ind = _INDUSTRY_CACHE.get((self.org_id, self.source_id))
323
-
324
- ent_exists = event_hub.exists(entity_key)
325
- ind_exists = event_hub.exists(industry_key)
326
-
327
- if cache_ent and not ent_exists:
328
- _ENTITY_CACHE.pop((self.org_id, self.source_id), None)
329
- logger.debug(f"[WORKER] Cleared stale _ENTITY_CACHE for {self.org_id}/{self.source_id}")
330
- if cache_ind and not ind_exists:
331
- _INDUSTRY_CACHE.pop((self.org_id, self.source_id), None)
332
- logger.debug(f"[WORKER] Cleared stale _INDUSTRY_CACHE for {self.org_id}/{self.source_id}")
333
-
334
- if ent_exists and ind_exists:
335
- logger.info("[WORKER] βœ… Entity & industry keys found")
336
- return
337
-
338
- except Exception as e:
339
- logger.debug(f"[WORKER] Redis/cache check error: {e}")
340
-
341
- logger.info("[WORKER] ⏳ Waiting for entity/industry keys...")
342
- await asyncio.sleep(2)
343
-
344
- logger.warning("[WORKER] ⚠️ Timeout waiting for keys, proceeding anyway")
345
- # Change _publish() method to use streams
346
- async def _publish(self, results: Dict[str, Any]):
347
- try:
348
- ts = self.computed_at.isoformat() if self.computed_at else datetime.now().isoformat()
349
-
350
- # Publish via central hub (streams + structured messages)
351
- event_hub.emit_kpi_update(self.org_id, self.source_id, {
352
- "data": results,
353
- "rows": results.get("metadata", {}).get("rows_analyzed", 0),
354
- "timestamp": ts
355
- })
356
-
357
- # Publish insights
358
- for alert in results.get("predictive", {}).get("alerts", []):
359
- event_hub.emit_insight(self.org_id, self.source_id, alert)
360
-
361
- logger.info(f"[PUBLISH] πŸ“€ Sent to stream for {self.org_id}/{self.source_id}")
362
-
363
- except Exception as e:
364
- logger.error(f"[PUBLISH] Error: {e}", exc_info=True)
365
 
366
- async def _publish_status(self, status: str, message: str = ""):
367
- """Publish status"""
368
  try:
369
- event_hub.emit_status(self.org_id, self.source_id, status, message)
 
 
370
  except Exception as e:
371
- logger.error(f"[STATUS] Error: {e}")
372
 
373
- def _cache(self, results: Dict[str, Any]):
374
- """Cache for 5 min"""
375
  try:
376
- event_hub.setex(f"kpi_cache:{self.org_id}:{self.source_id}", 300, json.dumps(results))
377
- logger.debug("[CACHE] Cached results")
 
 
 
 
 
 
 
 
 
378
  except Exception as e:
379
- logger.warning(f"[CACHE] Error: {e}")
380
 
381
 
382
- # ---- Redis Listener (The Glue) ---- #
383
- async def redis_listener():
 
384
  """
385
- 🎧 Runs forever, triggers workers on Redis messages
386
- Start this with: `asyncio.create_task(redis_listener())` in main.py
387
  """
388
- pubsub = event_hub.redis.pubsub()
389
- pubsub.psubscribe("analytics_trigger:*")
390
 
391
- logger.info("🎧 Redis listener active - Einstein+Elon mode ENGAGED")
 
 
392
 
393
- async for message in pubsub.listen():
394
- if message["type"] == "pmessage":
395
- try:
396
- trigger = json.loads(message["data"])
397
- logger.info(f"πŸ“‘ Received: {trigger}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
398
 
399
- # Non-blocking worker spawn
400
- worker = AnalyticsWorker(
401
- trigger["org_id"],
402
- trigger["source_id"]
403
- )
404
- asyncio.create_task(worker.run())
405
 
406
- except Exception as e:
407
- logger.error(f"Listener error: {e}", exc_info=True)
 
 
 
 
 
 
 
 
 
 
408
 
409
 
410
- # ---- FastAPI Integration ---- #
411
  async def trigger_kpi_computation(org_id: str, source_id: str):
412
- """Trigger the worker via Redis pubsub"""
 
 
 
413
  try:
414
- # Use the hub which writes both to pubsub and to a small stream
415
- event_hub.emit_analytics_trigger(org_id, source_id)
 
 
 
 
 
 
 
416
  logger.info(f"🎯 Triggered KPI computation: {org_id}/{source_id}")
 
 
417
  except Exception as e:
418
- logger.error(f"Trigger failed: {e}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import asyncio
2
  import json
3
+ import os
4
+ import time
5
+ from asyncio import Lock
6
+ from datetime import datetime, timedelta
7
+ from typing import Dict, Any, Optional
8
+
9
  import pandas as pd
10
  import logging
 
 
 
11
 
12
  from app.core.event_hub import event_hub
13
  from app.db import get_conn
14
+ from app.schemas.org_schema import OrgSchema
15
+ from app.service.column_embedding_service import ColumnEmbeddingService
16
+ from app.service.vector_service import VectorService
17
  from app.engine.kpi_calculators.registry import get_kpi_calculator
18
+ from app.service.embedding_service import EmbeddingService
19
 
20
+ # Configure logging with request context
21
+ logging.basicConfig(
22
+ level=logging.INFO,
23
+ format='%(asctime)s | %(levelname)s | [%(name)s] %(message)s'
24
+ )
25
  logger = logging.getLogger(__name__)
26
 
27
+ # Global lock registry to prevent duplicate workers per org/source
28
+ _WORKER_LOCKS: Dict[str, Lock] = {}
29
+
30
 
31
  class AnalyticsWorker:
32
  """
33
  🧠+πŸš€ Hybrid: Deep reasoning + Async efficiency
34
+ - Solves column mapping for any data shape
35
+ - Non-blocking, cached, zero downtime
36
+ - Deduplication guaranteed via Redis + in-process locks
37
  """
38
 
39
  def __init__(self, org_id: str, source_id: str, hours_window: int = 24):
 
42
  self.hours_window = hours_window
43
 
44
  # Core engines
45
+ self.schema = OrgSchema(org_id)
46
+ self.col_embedder = ColumnEmbeddingService()
47
+ self.txn_embedder = EmbeddingService()
48
+ self.vector_service = VectorService(org_id)
49
+
50
+ self.computed_at: Optional[datetime] = None
51
+ self._entity_type: Optional[str] = None
52
+
53
+ # Deduplication keys
54
+ self.lock_key = f"worker:lock:{org_id}:{source_id}"
55
+ self.processed_key = f"worker:processed:{org_id}:{source_id}"
56
 
57
+ # Get or create in-process lock for this org/source pair
58
+ self._process_lock = _WORKER_LOCKS.setdefault(self.lock_key, Lock())
59
 
60
  async def run(self) -> Dict[str, Any]:
61
  """
62
  🎯 THE ENGINE - Zero gaps, pure flow
63
 
64
+ 1. Acquire distributed lock (prevent duplicates across workers)
65
+ 2. Wait for entity/industry keys with exponential backoff
66
+ 3. Load data from DuckDB
67
+ 4. Discover column mapping (AI, cached)
68
+ 5. Alias columns for KPI calculator
69
+ 6. Embed transactions (async, non-blocking)
70
+ 7. Compute KPIs (industry-aware)
71
+ 8. Publish to Redis (UI + AI channels)
72
+ 9. Cache results (5 min)
73
+ 10. Release lock & cleanup
74
  """
75
  start_time = datetime.now()
76
+ worker_id = f"{self.org_id}/{self.source_id}"
77
+
78
+ # 🎯 STEP 0: Check if already processed recently (idempotency)
79
+ if await self._is_already_processed():
80
+ logger.warning(f"[WORKER] ⚠️ Already processed {worker_id} in last 5min, skipping")
81
+ return {"status": "skipped", "reason": "already_processed"}
82
+
83
+ # 🎯 STEP 1: Acquire distributed lock (Redis + in-process)
84
+ if not await self._acquire_lock():
85
+ logger.warning(f"[WORKER] ❌ Lock not acquired for {worker_id}")
86
+ return {"status": "skipped", "reason": "lock_failed"}
87
+
88
  try:
89
+ logger.info(f"\n[WORKER] πŸš€ STARTING {worker_id}")
90
+
91
+ # 🎯 STEP 2: Wait for entity/industry keys (exponential backoff)
92
+ await self._wait_for_entity_and_industry()
93
+
94
+ # 🎯 STEP 3: Load data with retry logic
95
  df = await self._load_dataframe()
96
  if df.empty:
97
  await self._publish_status("error", "No data")
98
+ return {"status": "error", "reason": "no_data"}
99
 
100
  logger.info(f"[WORKER] πŸ“Š Loaded {len(df)} rows Γ— {len(df.columns)} cols")
101
 
102
+ # 🎯 STEP 4: Schema discovery (cached)
 
103
  mapping = await self._discover_schema(df)
104
  if not mapping:
105
  await self._publish_status("error", "Schema discovery failed")
106
+ return {"status": "error", "reason": "no_schema"}
107
 
108
+ logger.info(f"[WORKER] πŸ”€ Mapping: {list(mapping.items())[:5]}...")
109
 
110
+ # 🎯 STEP 5: Alias columns
111
  df = self._alias_columns(df, mapping)
112
 
113
+ # 🎯 STEP 6: Embed transactions (fire-and-forget, non-blocking)
 
114
  embed_task = asyncio.create_task(
115
+ self._embed_transactions(df.head(1000)),
116
+ name=f"embed-{self.org_id}-{self.source_id}"
117
  )
118
 
119
+ # 🎯 STEP 7: Compute KPIs (CPU-bound, run in thread pool)
120
  industry = await self._get_industry()
121
  calculator = get_kpi_calculator(industry, self.org_id, df, self.source_id)
 
 
122
  results = await asyncio.to_thread(calculator.compute_all)
 
123
 
124
+ self.computed_at = datetime.now()
125
+ duration = (self.computed_at - start_time).total_seconds()
126
+ logger.info(f"[WORKER] βœ… KPIs computed in {duration:.2f}s")
127
 
128
+ # 🎯 STEP 8: Publish results (atomic pipeline)
129
  await self._publish(results)
130
 
131
+ # 🎯 STEP 9: Cache with TTL
132
+ await self._cache_results(results)
133
+
134
+ # 🎯 STEP 10: Mark as processed (idempotency)
135
+ await self._mark_processed()
136
 
137
+ # Wait for embeddings (30s timeout, non-critical)
138
  try:
139
  await asyncio.wait_for(embed_task, timeout=30)
140
  logger.info("[WORKER] βœ… Embeddings completed")
141
  except asyncio.TimeoutError:
142
  logger.warning("[WORKER] ⚠️ Embedding timeout, but KPIs published")
143
 
144
+ logger.info(f"[WORKER] 🎯 COMPLETE: {worker_id} in {duration:.2f}s")
 
 
145
  return results
146
 
147
  except Exception as e:
148
  logger.error(f"[WORKER] ❌ CRITICAL: {e}", exc_info=True)
149
  await self._publish_status("error", str(e))
150
+ return {"status": "error", "reason": str(e)}
151
+
152
+ finally:
153
+ # 🎯 STEP 11: ALWAYS release lock
154
+ await self._release_lock()
155
 
156
+ # ==================== DEDUPLICATION & LOCKING ====================
157
+
158
+ async def _is_already_processed(self) -> bool:
159
+ """Check if this job was processed in last 5 minutes"""
160
+ try:
161
+ # Use Redis TTL to track processed jobs
162
+ return event_hub.redis.exists(self.processed_key)
163
+ except Exception as e:
164
+ logger.error(f"[LOCK] Error checking processed key: {e}")
165
+ return False
166
+
167
+ async def _acquire_lock(self) -> bool:
168
+ """
169
+ Acquire distributed lock using Redis SETNX + in-process lock.
170
+ Returns True if lock acquired, False otherwise.
171
+ """
172
+ try:
173
+ # Try Redis SETNX (set if not exists)
174
+ lock_acquired = event_hub.redis.setnx(self.lock_key, "1")
175
+ if not lock_acquired:
176
+ return False
177
+
178
+ # Set expiry (safety for crashed workers)
179
+ event_hub.redis.expire(self.lock_key, 300) # 5 minute max runtime
180
+
181
+ # Also acquire in-process lock (prevents same-process duplicates)
182
+ acquired = await asyncio.wait_for(self._process_lock.acquire(), timeout=1.0)
183
+ if not acquired:
184
+ # Release Redis lock if in-process lock fails
185
+ event_hub.redis.delete(self.lock_key)
186
+ return False
187
+
188
+ logger.info(f"[LOCK] βœ… Acquired for {self.lock_key}")
189
+ return True
190
+
191
+ except Exception as e:
192
+ logger.error(f"[LOCK] Failed to acquire: {e}")
193
+ return False
194
+
195
+ async def _release_lock(self):
196
+ """Release both Redis and in-process locks"""
197
+ try:
198
+ # Release in-process lock
199
+ if self._process_lock.locked():
200
+ self._process_lock.release()
201
+
202
+ # Release Redis lock
203
+ event_hub.redis.delete(self.lock_key)
204
+ logger.info(f"[LOCK] πŸ”“ Released for {self.lock_key}")
205
+
206
+ except Exception as e:
207
+ logger.error(f"[LOCK] Error releasing: {e}")
208
+
209
+ async def _mark_processed(self):
210
+ """Mark this job as processed (TTL 5 minutes)"""
211
+ try:
212
+ event_hub.redis.setex(self.processed_key, 300, "1")
213
+ except Exception as e:
214
+ logger.error(f"[LOCK] Failed to mark processed: {e}")
215
+
216
+ # ==================== DATA LOADING ====================
217
 
218
  async def _load_dataframe(self) -> pd.DataFrame:
219
+ """Async wrapper for sync data loading"""
220
  return await asyncio.to_thread(self._sync_load_dataframe)
221
 
222
  def _sync_load_dataframe(self) -> pd.DataFrame:
223
+ """Wait for table + data with exponential backoff"""
224
  conn = None
225
  MAX_WAIT = 30
226
+ INITIAL_RETRY = 0.5
227
 
228
  try:
229
+ # Get entity type from Redis
230
+ entity_info = self._get_entity_info()
 
 
231
  if not entity_info:
232
+ logger.error(f"[LOAD] No entity info for {self.org_id}/{self.source_id}")
233
  return pd.DataFrame()
234
 
235
+ self._entity_type = entity_info["entity_type"]
236
  table_name = f"main.{self._entity_type}_canonical"
237
  cutoff = datetime.now() - timedelta(hours=self.hours_window)
238
 
239
  conn = get_conn(self.org_id)
240
+ retry_delay = INITIAL_RETRY
241
 
242
+ # Exponential backoff wait
243
  start = time.time()
244
  while (time.time() - start) < MAX_WAIT:
245
  try:
246
+ # Check if table exists and has data
247
  count = conn.execute(
248
  f"SELECT COUNT(*) FROM {table_name} WHERE timestamp >= ?",
249
  [cutoff]
 
252
  if count > 0:
253
  logger.info(f"[LOAD] Table ready: {count} rows (waited {(time.time() - start):.1f}s)")
254
  break
255
+
256
+ logger.debug(f"[LOAD] Table empty, retrying in {retry_delay}s...")
257
+
258
  except Exception as e:
259
  if "does not exist" in str(e).lower():
260
+ logger.debug(f"[LOAD] Table doesn't exist yet, retrying...")
261
  else:
262
  logger.warning(f"[LOAD] Error: {e}")
263
 
264
+ time.sleep(retry_delay)
265
+ retry_delay = min(retry_delay * 1.5, 5.0) # Cap at 5s
266
+
267
  else:
268
  logger.error(f"[LOAD] Timeout after {MAX_WAIT}s")
269
  return pd.DataFrame()
270
 
271
+ # Load the data
272
  df = conn.execute(
273
  f"SELECT * FROM {table_name} WHERE timestamp >= ? ORDER BY timestamp DESC",
274
  [cutoff]
 
288
  except:
289
  pass
290
 
291
+ def _get_entity_info(self) -> Optional[Dict[str, Any]]:
292
+ """Get entity info from Redis with cache invalidation"""
293
+ try:
294
+ from app.mapper import _ENTITY_CACHE
295
+ cache_key = (self.org_id, self.source_id)
296
+
297
+ entity_key = f"entity:{self.org_id}:{self.source_id}"
298
+ data = event_hub.get_key(entity_key)
299
+
300
+ if data:
301
+ entity_info = json.loads(data)
302
+ # Update cache but keep it fresh
303
+ _ENTITY_CACHE[cache_key] = entity_info
304
+ return entity_info
305
+
306
+ # Cache miss or stale: invalidate
307
+ _ENTITY_CACHE.pop(cache_key, None)
308
+ return None
309
+
310
+ except Exception as e:
311
+ logger.error(f"[ENTITY] Error: {e}")
312
+ return None
313
+
314
+ # ==================== ENTITY/INDUSTRY WAITING ====================
315
+
316
+ async def _wait_for_entity_and_industry(self):
317
  """
318
+ Wait for entity and industry keys with exponential backoff.
319
+ Also handles cache invalidation if Redis shows different data.
320
  """
321
+ MAX_WAIT = 30.0
322
+ INITIAL_DELAY = 0.5
323
+ MAX_DELAY = 5.0
324
+
325
+ entity_key = f"entity:{self.org_id}:{self.source_id}"
326
+ industry_key = f"industry:{self.org_id}:{self.source_id}"
327
+
328
+ delay = INITIAL_DELAY
329
+ start = time.time()
330
+
331
+ while (time.time() - start) < MAX_WAIT:
332
+ try:
333
+ # Check Redis directly (source of truth)
334
+ ent_exists = event_hub.exists(entity_key)
335
+ ind_exists = event_hub.exists(industry_key)
336
+
337
+ # If both exist, validate cache consistency
338
+ if ent_exists and ind_exists:
339
+ from app.mapper import _ENTITY_CACHE, _INDUSTRY_CACHE
340
+ cache_key = (self.org_id, self.source_id)
341
+
342
+ # Invalidate cache if Redis has data but cache doesn't
343
+ if cache_key not in _ENTITY_CACHE:
344
+ data = event_hub.get_key(entity_key)
345
+ if data:
346
+ _ENTITY_CACHE[cache_key] = json.loads(data)
347
+
348
+ if cache_key not in _INDUSTRY_CACHE:
349
+ data = event_hub.get_key(industry_key)
350
+ if data:
351
+ _INDUSTRY_CACHE[cache_key] = json.loads(data)
352
+
353
+ logger.info("[WORKER] βœ… Entity & industry keys found and validated")
354
+ return
355
+
356
+ logger.info(f"[WORKER] ⏳ Waiting for keys (entity={ent_exists}, industry={ind_exists})...")
357
+
358
+ except Exception as e:
359
+ logger.debug(f"[WORKER] Redis check error: {e}")
360
+
361
+ await asyncio.sleep(delay)
362
+ delay = min(delay * 1.5, MAX_DELAY)
363
+
364
+ logger.warning(f"[WORKER] ⚠️ Timeout waiting for keys after {MAX_WAIT}s, proceeding anyway")
365
+
366
+ # ==================== SCHEMA & EMBEDDING ====================
367
+
368
+ async def _discover_schema(self, df: pd.DataFrame) -> Dict[str, str]:
369
+ """🧠 Einstein's discovery engine with caching"""
370
  try:
 
371
  cache_key = f"schema:mapping:{self.org_id}"
372
+
373
+ # Fast: Redis cache
374
  if cached := event_hub.get_key(cache_key):
375
+ logger.info("[SCHEMA] πŸ’Ύ Cache hit")
376
  return json.loads(cached)
377
 
378
  # Slow: AI discovery
379
+ logger.info("[SCHEMA] 🧠 Cache miss, discovering...")
380
  mapping = self.schema.get_mapping()
381
 
382
  if not mapping:
 
385
 
386
  # Cache for 24h
387
  event_hub.setex(cache_key, 86400, json.dumps(mapping))
388
+ logger.info(f"[SCHEMA] βœ… Discovered {len(mapping)} mappings")
389
 
390
  return mapping
391
 
392
  except Exception as e:
393
+ logger.error(f"[SCHEMA] ❌ Discovery failed: {e}", exc_info=True)
394
  return {}
395
 
396
  def _alias_columns(self, df: pd.DataFrame, mapping: Dict[str, str]) -> pd.DataFrame:
397
+ """πŸ”€ Renames columns to semantic names"""
398
  try:
399
+ rename_map = {
400
+ actual: semantic
401
+ for semantic, actual in mapping.items()
402
+ if actual in df.columns
403
+ }
404
 
405
  if not rename_map:
406
  logger.warning("[ALIAS] No columns to alias")
407
  return df
408
 
409
+ logger.info(f"[ALIAS] πŸ”€ Renaming {len(rename_map)} columns")
410
  return df.rename(columns=rename_map)
411
 
412
  except Exception as e:
413
+ logger.error(f"[ALIAS] ❌ Error: {e}", exc_info=True)
414
  return df
415
 
416
  async def _embed_transactions(self, df: pd.DataFrame):
417
+ """πŸš€ Elon's vector engine (fire-and-forget)"""
 
 
 
418
  try:
419
  if df.empty:
420
+ logger.warning("[EMBED] No data to embed")
421
  return
422
 
423
  # Build semantic texts
 
444
  })
445
 
446
  if not texts:
447
+ logger.warning("[EMBED] No valid texts generated")
448
  return
449
 
450
+ # Generate embeddings in batches
451
  logger.info(f"[EMBED] Generating {len(texts)} embeddings...")
452
  embeddings = []
453
 
 
469
  logger.info(f"[EMBED] βœ… Stored {len(embeddings)} vectors")
470
 
471
  except Exception as e:
472
+ logger.error(f"[EMBED] ❌ Failed: {e}", exc_info=True)
473
  # Non-critical - don't raise
474
 
475
+ # ==================== PUBLISHING & CACHING ====================
476
+
477
+ async def _publish(self, results: Dict[str, Any]):
478
+ """πŸ“€ Publish results to Redis (atomic pipeline)"""
479
  try:
480
+ ts = self.computed_at.isoformat() if self.computed_at else datetime.now().isoformat()
481
+
482
+ # Use atomic pipeline for minimal Redis calls
483
+ pipe = event_hub.redis.pipeline()
484
+
485
+ # Publish KPI update
486
+ kpi_data = {
487
+ "data": results,
488
+ "rows": results.get("metadata", {}).get("rows_analyzed", 0),
489
+ "timestamp": ts
490
+ }
491
+ pipe.setex(
492
+ f"kpi_cache:{self.org_id}:{self.source_id}",
493
+ 300, # 5 min TTL
494
+ json.dumps(kpi_data)
495
+ )
496
+
497
+ # Publish insights
498
+ for alert in results.get("predictive", {}).get("alerts", []):
499
+ pipe.lpush(
500
+ f"insights:{self.org_id}:{self.source_id}",
501
+ json.dumps(alert)
502
+ )
503
+ pipe.expire(f"insights:{self.org_id}:{self.source_id}", 300)
504
+
505
+ pipe.execute()
506
+ logger.info(f"[PUBLISH] πŸ“€ Published KPIs for {self.org_id}/{self.source_id}")
507
+
508
+ except Exception as e:
509
+ logger.error(f"[PUBLISH] ❌ Error: {e}", exc_info=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
510
 
511
+ async def _cache_results(self, results: Dict[str, Any]):
512
+ """πŸ’Ύ Cache results for 5 minutes"""
513
  try:
514
+ cache_key = f"kpi_cache:{self.org_id}:{self.source_id}"
515
+ event_hub.setex(cache_key, 300, json.dumps(results))
516
+ logger.debug("[CACHE] βœ… Results cached")
517
  except Exception as e:
518
+ logger.warning(f"[CACHE] ⚠️ Failed: {e}")
519
 
520
+ async def _publish_status(self, status: str, message: str = ""):
521
+ """πŸ“’ Publish worker status"""
522
  try:
523
+ status_data = {
524
+ "status": status,
525
+ "message": message,
526
+ "timestamp": datetime.now().isoformat(),
527
+ "worker_id": f"{self.org_id}:{self.source_id}"
528
+ }
529
+ event_hub.redis.setex(
530
+ f"worker:status:{self.org_id}:{self.source_id}",
531
+ 60,
532
+ json.dumps(status_data)
533
+ )
534
  except Exception as e:
535
+ logger.error(f"[STATUS] ❌ Failed: {e}")
536
 
537
 
538
+ # ==================== WORKER MANAGER & LISTENER ====================
539
+
540
class WorkerManager:
    """
    πŸŽ›οΈ Manages worker lifecycle and prevents Redis hammering
    """

    def __init__(self):
        # worker_id ("org:source") -> running asyncio task
        self.active_workers: Dict[str, asyncio.Task] = {}
        self._shutdown = False

    async def start_listener(self):
        """🎧 Listen to Redis pubsub for triggers.

        Polls in a worker thread so the blocking redis-py call never stalls
        the event loop, then dispatches each trigger to a worker task.
        """
        pubsub = event_hub.redis.pubsub()
        pubsub.psubscribe("analytics_trigger:*")

        logger.info("🎧 Worker Manager: Einstein+Elon mode ENGAGED")

        try:
            while not self._shutdown:
                # BUGFIX: get_message(timeout=5.0) is a blocking sync call;
                # invoking it directly here froze the event loop for up to
                # 5s per iteration. Run it off-loop instead.
                message = await asyncio.to_thread(pubsub.get_message, timeout=5.0)

                if message and message["type"] == "pmessage":
                    await self._handle_trigger(message)

                # Cleanup completed tasks
                self._cleanup_completed_workers()

        except asyncio.CancelledError:
            logger.info("[MANAGER] πŸ›‘ Listener cancelled")
        finally:
            pubsub.close()

    async def _handle_trigger(self, message: Dict[str, Any]):
        """Process a single trigger message; spawns at most one worker per id."""
        try:
            trigger = json.loads(message["data"])
            org_id = trigger["org_id"]
            source_id = trigger["source_id"]

            worker_id = f"{org_id}:{source_id}"

            # Skip if worker already running
            if worker_id in self.active_workers:
                logger.debug(f"[MANAGER] Worker {worker_id} already running")
                return

            # Spawn worker (non-blocking)
            worker = AnalyticsWorker(org_id, source_id)
            task = asyncio.create_task(
                worker.run(),
                name=f"worker-{org_id}-{source_id}"
            )

            self.active_workers[worker_id] = task
            logger.info(f"[MANAGER] πŸš€ Spawned worker for {worker_id}")

        except Exception as e:
            logger.error(f"[MANAGER] Trigger handling failed: {e}", exc_info=True)

    def _cleanup_completed_workers(self):
        """Drop finished workers from the registry, logging each outcome."""
        completed = []
        for worker_id, task in self.active_workers.items():
            if not task.done():
                continue
            completed.append(worker_id)

            # BUGFIX: task.exception() raises CancelledError on a cancelled
            # task — check cancellation before querying the exception.
            if task.cancelled():
                logger.debug(f"[MANAGER] Worker {worker_id} cancelled")
            elif task.exception():
                logger.error(f"[MANAGER] Worker {worker_id} failed: {task.exception()}")
            else:
                logger.debug(f"[MANAGER] Worker {worker_id} completed")

        for worker_id in completed:
            self.active_workers.pop(worker_id, None)

    async def shutdown(self):
        """Graceful shutdown: cancel all workers and wait for them to exit."""
        self._shutdown = True
        logger.info("[MANAGER] πŸ›‘ Shutting down workers...")

        # Cancel all active tasks
        for task in self.active_workers.values():
            task.cancel()

        # Wait for cancellation (exceptions collected, not raised)
        if self.active_workers:
            await asyncio.gather(*self.active_workers.values(), return_exceptions=True)

        logger.info("[MANAGER] βœ… All workers terminated")
629
 
 
 
 
 
 
 
630
 
631
+ # ==================== FASTAPI INTEGRATION ====================
632
+
633
+ # Global manager instance
634
+ _worker_manager: Optional[WorkerManager] = None
635
+
636
+
637
async def get_worker_manager() -> WorkerManager:
    """Return the process-wide WorkerManager, creating it on first use."""
    global _worker_manager
    if _worker_manager is not None:
        return _worker_manager
    _worker_manager = WorkerManager()
    return _worker_manager
643
 
644
 
 
645
async def trigger_kpi_computation(org_id: str, source_id: str):
    """
    🎯 FastAPI endpoint handler - triggers worker via Redis pubsub
    Idempotent: multiple calls won't spawn duplicate workers
    """
    try:
        payload = {
            "type": "kpi_compute",
            "timestamp": datetime.now().isoformat()
        }
        # event_hub writes to both pubsub and a stream for reliability.
        event_hub.emit_analytics_trigger(org_id, source_id, payload)

        logger.info(f"🎯 Triggered KPI computation: {org_id}/{source_id}")
        return {"status": "triggered", "org_id": org_id, "source_id": source_id}

    except Exception as exc:
        logger.error(f"Trigger failed: {exc}", exc_info=True)
        return {"status": "error", "message": str(exc)}
666
+
667
+
668
+ # ==================== BACKGROUND TASK (Optional) ====================
669
+
670
async def continuous_kpi_refresh(manager: WorkerManager):
    """
    πŸŽ›οΈ Gentle background refresh - runs every 5 minutes
    Only triggers for stale data (no active worker, no fresh cache)
    """
    await asyncio.sleep(10)  # Let app startup complete

    while True:
        try:
            # BUGFIX: KEYS is O(N) and blocks the Redis server on large
            # keyspaces; SCAN iterates incrementally without blocking.
            scanned = 0
            for key in event_hub.redis.scan_iter("entity:*:*"):
                if scanned >= 10:  # Max 10 per cycle
                    break
                scanned += 1

                # BUGFIX: redis-py returns str (not bytes) when the client
                # uses decode_responses=True — handle both.
                key_str = key.decode() if isinstance(key, bytes) else key
                # maxsplit guards against ids that themselves contain ':'.
                _, org_id, source_id = key_str.split(":", 2)

                worker_id = f"{org_id}:{source_id}"

                # Skip if worker already running
                if worker_id in manager.active_workers:
                    continue

                # Skip if KPIs are fresh (< 5 min old)
                cache_key = f"kpi_cache:{org_id}:{source_id}"
                if event_hub.redis.exists(cache_key):
                    continue

                # Trigger refresh
                await trigger_kpi_computation(org_id, source_id)
                await asyncio.sleep(1)  # 1s gap

        except Exception as e:
            logger.error(f"[AUTO] Error: {e}", exc_info=True)

        await asyncio.sleep(300)  # ⭐ Sleep 5 minutes
705
+
706
+
707
+ # ==================== MAIN.PY INTEGRATION EXAMPLE ====================
708
+
709
+ # In your main.py:
710
+ #
711
+ # from app.tasks.analytics_worker import get_worker_manager, continuous_kpi_refresh
712
+ #
713
+ # @app.on_event("startup")
714
+ # async def start_workers():
715
+ # manager = await get_worker_manager()
716
+ # asyncio.create_task(manager.start_listener())
717
+ #
718
+ # # Optional: Start background refresh
719
+ # if os.getenv("ENABLE_AUTO_REFRESH", "0") == "1":
720
+ # asyncio.create_task(continuous_kpi_refresh(manager))
721
+ #
722
+ # @app.on_event("shutdown")
723
+ # async def stop_workers():
724
+ # manager = await get_worker_manager()
725
+ # await manager.shutdown()