Peter Mutwiri committed on
Commit
0bd628a
Β·
1 Parent(s): 8a78694

ingress pipeline completion

Browse files
app/deps.py CHANGED
@@ -1,13 +1,16 @@
1
  # ── Standard Library ──────────────────────────────────────────────────────────
2
  import os
3
- from typing import Optional
4
  import pathlib
5
  import logging
 
6
 
7
  # ── Third-Party ────────────────────────────────────────────────────────────────
8
  import duckdb
9
- from fastapi import HTTPException, Header
10
  from upstash_redis import Redis
 
 
11
 
12
  # ── Configuration Paths ────────────────────────────────────────────────────────
13
  # Use YOUR existing pattern from app/db.py (multi-tenant)
@@ -131,6 +134,95 @@ def get_redis():
131
 
132
  return _redis_client
133
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
134
  # ── API Security Dependency ────────────────────────────────────────────────────
135
  def verify_api_key(x_api_key: str = Header(..., alias="X-API-KEY")):
136
  """
@@ -151,6 +243,69 @@ def verify_api_key(x_api_key: str = Header(..., alias="X-API-KEY")):
151
 
152
  return x_api_key
153
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
154
  # ── Health Check Utilities ─────────────────────────────────────────────────────
155
  def check_all_services():
156
  """
 
1
  # ── Standard Library ──────────────────────────────────────────────────────────
2
  import os
3
+ from typing import Optional, TYPE_CHECKING
4
  import pathlib
5
  import logging
6
+ import time
7
 
8
  # ── Third-Party ────────────────────────────────────────────────────────────────
9
  import duckdb
10
+ from fastapi import HTTPException, Header,Request, Depends
11
  from upstash_redis import Redis
12
+ from collections import defaultdict
13
+
14
 
15
  # ── Configuration Paths ────────────────────────────────────────────────────────
16
  # Use YOUR existing pattern from app/db.py (multi-tenant)
 
134
 
135
  return _redis_client
136
 
137
+
138
+
139
+ if TYPE_CHECKING:
140
+ from upstash_qstash import Client
141
+
142
def get_qstash_client() -> "Client":
    """
    Initialize and return singleton QStash client for Hugging Face Spaces.

    Required HF Secrets:
    - QSTASH_TOKEN: Your QStash API token

    Optional HF Secrets:
    - QSTASH_URL: Custom QStash URL (defaults to official Upstash endpoint)

    Returns:
        Configured QStash Client instance

    Raises:
        RuntimeError: If QSTASH_TOKEN is missing or client initialization fails
    """
    # Singleton pattern: store instance as function attribute.
    # NOTE(review): not lock-guarded — two concurrent first calls could both
    # build a client; harmless for a single-worker Space, confirm for multi-worker.
    if not hasattr(get_qstash_client, "_client"):
        token = os.getenv("QSTASH_TOKEN")
        if not token:
            raise RuntimeError(
                "❌ QSTASH_TOKEN not found. Please add it to HF Space Secrets."
            )

        # Dynamic import to avoid requiring package at module load time
        try:
            from upstash_qstash import Client
        except ImportError:
            raise RuntimeError(
                "❌ upstash_qstash not installed. "
                "Add to requirements.txt: upstash-qstash"
            )

        # Optional: Use custom URL if provided
        qstash_url = os.getenv("QSTASH_URL")

        try:
            if qstash_url:
                get_qstash_client._client = Client(token=token, url=qstash_url)
                print(f"✅ QStash client initialized with custom URL: {qstash_url}")
            else:
                get_qstash_client._client = Client(token=token)
                print("✅ QStash client initialized")
        except Exception as e:
            # Any constructor failure is surfaced as a RuntimeError so callers
            # see one consistent exception type for "QStash unavailable".
            raise RuntimeError(f"❌ QStash client initialization failed: {e}")

    return get_qstash_client._client
189
+
190
+
191
def get_qstash_verifier():
    """
    Initialize QStash webhook verifier for receiving callbacks.
    Used in /api/v1/analytics/callback endpoint to verify requests.

    Required HF Secrets:
    - QSTASH_CURRENT_SIGNING_KEY
    - QSTASH_NEXT_SIGNING_KEY

    Returns:
        QStash Receiver/Verifier instance

    Raises:
        RuntimeError: If either signing key is missing or Receiver construction fails.
    """
    # Singleton: cached on the function object, same pattern as get_qstash_client.
    if not hasattr(get_qstash_verifier, "_verifier"):
        current_key = os.getenv("QSTASH_CURRENT_SIGNING_KEY")
        next_key = os.getenv("QSTASH_NEXT_SIGNING_KEY")

        if not current_key or not next_key:
            raise RuntimeError(
                "❌ QStash signing keys not configured. "
                "Add QSTASH_CURRENT_SIGNING_KEY and QSTASH_NEXT_SIGNING_KEY to HF secrets."
            )

        try:
            # Deferred import: upstash_qstash only needed once verification is used.
            from upstash_qstash import Receiver

            get_qstash_verifier._verifier = Receiver({
                "current_signing_key": current_key,
                "next_signing_key": next_key
            })
            print("✅ QStash verifier initialized")
        except Exception as e:
            raise RuntimeError(f"❌ QStash verifier initialization failed: {e}")

    return get_qstash_verifier._verifier
225
+
226
  # ── API Security Dependency ────────────────────────────────────────────────────
227
  def verify_api_key(x_api_key: str = Header(..., alias="X-API-KEY")):
228
  """
 
243
 
244
  return x_api_key
245
 
246
+ # ── New User Auth Dependency ──────────────────────────────────────────────────
247
+
248
+
249
def get_current_user(request: Request, api_key: str = Depends(verify_api_key)):
    """
    Extract org_id from query parameters (since auth happens on Vercel).
    Use this in analytics endpoints that need org context.

    Stack Auth on Vercel already validated the user,
    so we trust the orgId passed in the query.

    Returns:
        dict with org_id, the verified api_key, an ISO-8601 UTC timestamp,
        and the auth source tag.

    Raises:
        HTTPException 401 if org_id is absent, 400 if it has an invalid prefix.
    """
    # Fix: this module never imports datetime at module level, so the original
    # `datetime.utcnow()` call raised NameError at runtime. Import locally and
    # use a timezone-aware UTC timestamp.
    from datetime import datetime, timezone

    # Accept both snake_case and camelCase spellings from the Vercel frontend.
    org_id = request.query_params.get("org_id") or request.query_params.get("orgId")

    if not org_id:
        raise HTTPException(
            status_code=401,
            detail="❌ org_id missing from query parameters. Vercel stack auth missing?"
        )

    # Validate org_id format (simple security check)
    if not org_id.startswith("org_") and not org_id.startswith("user_"):
        raise HTTPException(
            status_code=400,
            detail=f"❌ Invalid org_id format: {org_id}"
        )

    return {
        "org_id": org_id,
        "api_key": api_key,
        "authenticated_at": datetime.now(timezone.utc).isoformat(),
        "source": "vercel_stack_auth"
    }
278
+
279
# ── Rate Limiting (Optional but Recommended) ──────────────────────────────────

# In-memory rate limiter state, keyed by org_id.
# NOTE(review): per-process only — resets on restart and is not shared across
# workers; acceptable for a single-worker Space, confirm for multi-worker.
_rate_limits = defaultdict(lambda: {"count": 0, "reset_at": 0})

def rate_limit_org(max_requests: int = 100, window_seconds: int = 60):
    """
    Build a FastAPI dependency enforcing a fixed-window rate limit per org.
    Prevents one org from DOSing the analytics engine.

    Fix: the previous implementation used Depends(lambda r: ...) — FastAPI
    cannot resolve an untyped lambda parameter as the Request object (it would
    be treated as a required query parameter), so the dependency never worked.
    We depend on get_current_user directly and read org_id from its result.

    Args:
        max_requests: Allowed requests per window.
        window_seconds: Window length in seconds.

    Returns:
        A dependency callable that returns the org_id, or raises HTTP 429.
    """
    def dependency(current_user: dict = Depends(get_current_user)) -> str:
        org_id = current_user["org_id"]
        now = time.time()
        limit_data = _rate_limits[org_id]

        # Start a fresh window once the previous one has expired
        if now > limit_data["reset_at"]:
            limit_data["count"] = 0
            limit_data["reset_at"] = now + window_seconds

        # Reject when the window budget is exhausted
        if limit_data["count"] >= max_requests:
            raise HTTPException(
                status_code=429,
                detail=f"⏸️ Rate limit exceeded for {org_id}: {max_requests} req/min"
            )

        limit_data["count"] += 1
        return org_id

    return dependency
309
  # ── Health Check Utilities ─────────────────────────────────────────────────────
310
  def check_all_services():
311
  """
app/engine/kpi_calculators/base.py ADDED
@@ -0,0 +1,132 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # app/engine/kpi_calculators/base.py
2
+ from abc import ABC, abstractmethod
3
+ import pandas as pd
4
+ import numpy as np
5
+ from typing import Dict, Any, List, Optional, Set
6
+ from datetime import datetime, timedelta
7
+ import json
8
+ import hashlib
9
+
10
class BaseKPICalculator(ABC):
    """
    Abstract base for all industry-specific KPI calculators.
    Guarantees consistent output format and error handling.

    Subclasses implement compute_all() and may extend REQUIRED_COLUMNS /
    OPTIONAL_COLUMNS to declare their schema.
    """

    # Minimal schema every calculator needs; checked after column normalization.
    REQUIRED_COLUMNS: Set[str] = {"timestamp"}
    OPTIONAL_COLUMNS: Set[str] = set()

    def __init__(self, org_id: str, df: pd.DataFrame, source_id: str):
        """
        Args:
            org_id: Tenant/organization identifier (used in cache keys).
            df: Raw event/transaction frame; copied and cleaned, never mutated.
            source_id: Data-source identifier (used in cache keys).

        Raises:
            ValueError: If a REQUIRED column is absent after normalization.
        """
        self.org_id = org_id
        self.source_id = source_id
        self.computed_at = datetime.utcnow()

        # Fix: clean/normalize column names BEFORE validating, so e.g.
        # "Timestamp" or " timestamp " satisfies the "timestamp" requirement.
        # Previously validation ran on raw names and rejected such frames.
        self.df = self._clean_dataframe(df.copy())

        missing = self.REQUIRED_COLUMNS - set(self.df.columns)
        if missing:
            raise ValueError(f"Missing required columns: {missing}")

        self.cache_key = f"kpi_cache:{org_id}:{source_id}"

    def _clean_dataframe(self, df: pd.DataFrame) -> pd.DataFrame:
        """Universal data cleaning - bulletproof."""
        # Fix: standardize column names FIRST (lowercase, no spaces) so the
        # 'timestamp' conversion below also catches variants like "Timestamp".
        df.columns = [str(col).lower().strip().replace(' ', '_') for col in df.columns]

        # Replace infinities and NaNs with None (DuckDB-friendly)
        df = df.replace([np.inf, -np.inf, np.nan], None)

        # Ensure timestamp is datetime; unparseable values become NaT
        if 'timestamp' in df.columns:
            df['timestamp'] = pd.to_datetime(df['timestamp'], errors='coerce')

        return df

    @abstractmethod
    def compute_all(self) -> Dict[str, Any]:
        """
        Return standardized KPI payload:
        {
            "realtime": {...},
            "financial": {...},
            "inventory": {...},
            "customer": {...},
            "predictive": {...},
            "charts": {...}
        }
        """
        pass

    def _calculate_growth(self, current: Optional[float], previous: Optional[float]) -> float:
        """Safe growth calculation (percent) - handles None and zero gracefully."""
        if current is None or previous is None or previous == 0:
            return 0.0
        return ((current - previous) / previous) * 100

    def _get_cached_value(self, metric_key: str) -> Optional[float]:
        """Retrieve previous value for trend analysis; None on any failure."""
        from app.redis_client import redis
        try:
            cached = redis.get(f"kpi_history:{self.org_id}:{self.source_id}:{metric_key}")
            return float(cached) if cached else None
        except Exception:
            # Best-effort cache read: missing/unreachable Redis must not break KPIs
            return None

    def _cache_current_value(self, metric_key: str, value: float):
        """Cache current value for next comparison (best-effort, 24h TTL)."""
        from app.redis_client import redis
        try:
            redis.setex(
                f"kpi_history:{self.org_id}:{self.source_id}:{metric_key}",
                86400,  # 24 hours
                str(value)
            )
        except Exception:
            # Deliberate swallow: caching is an optimization, never fatal
            pass

    def _detect_data_quality_issues(self) -> List[str]:
        """Audit data before KPI computation; returns human-readable issues."""
        issues = []

        if self.df.empty:
            issues.append("No data in window")
            return issues

        # Check for stale data (no record in the last hour)
        # NOTE(review): assumes naive local timestamps — confirm timezone handling.
        if 'timestamp' in self.df.columns:
            latest = self.df['timestamp'].max()
            if pd.notna(latest) and (datetime.now() - latest).total_seconds() > 3600:
                issues.append(f"Stale data: last record {latest}")

        # Check for missing critical fields
        critical_fields = ['total', 'items']
        for field in critical_fields:
            if field in self.df.columns and self.df[field].isna().all():
                issues.append(f"All values missing for {field}")

        # Check for outliers (above the 99.9th percentile)
        if 'total' in self.df.columns:
            outliers = self.df[self.df['total'] > self.df['total'].quantile(0.999)]
            if len(outliers) > 0:
                issues.append(f"{len(outliers)} outlier transactions detected")

        return issues
+
117
# Factory pattern for industry selection
def get_kpi_calculator(industry: str, org_id: str, df: pd.DataFrame, source_id: str) -> BaseKPICalculator:
    """
    Factory returning a configured calculator for *industry* (case-insensitive).

    Unknown, empty, or None industries fall back to the supermarket calculator,
    so callers never have to pre-validate the industry string.
    """
    # Imports deferred to call time to avoid circular imports with subclass modules.
    from app.engine.kpi_calculators.supermarket import SupermarketKPICalculator
    from app.engine.kpi_calculators.pharmaceutical import PharmaceuticalKPICalculator
    from app.engine.kpi_calculators.manufacturing import ManufacturingKPICalculator

    calculators = {
        "supermarket": SupermarketKPICalculator,
        "pharmaceutical": PharmaceuticalKPICalculator,
        "manufacturing": ManufacturingKPICalculator,
        "default": SupermarketKPICalculator  # Fallback
    }

    # Robustness: tolerate None/blank industry instead of raising AttributeError
    key = (industry or "default").strip().lower()
    calculator_class = calculators.get(key, calculators["default"])
    return calculator_class(org_id, df, source_id)
app/engine/kpi_calculators/supermarket.py ADDED
@@ -0,0 +1,388 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # app/engine/kpi_calculators/supermarket.py
2
+ import pandas as pd
3
+ import numpy as np
4
+ from datetime import datetime, timedelta
5
+ from typing import Dict, Any, List, Optional
6
+ from app.engine.kpi_calculators.base import BaseKPICalculator
7
+
8
class SupermarketKPICalculator(BaseKPICalculator):
    """Complete KPI engine for supermarkets and retail."""

    # Columns this calculator knows how to exploit when present.
    OPTIONAL_COLUMNS = {
        "workstationid", "operatorid", "items", "total", "qty", "category",
        "artnum", "expiry_date", "cost", "customer_id", "promo_flag",
        "trantime", "breaktime", "enddatetime"
    }

    def compute_all(self) -> Dict[str, Any]:
        """Compute all supermarket KPIs with graceful degradation."""

        # Check data quality first
        quality_issues = self._detect_data_quality_issues()
        if quality_issues:
            print(f"[kpi] ⚠️ Data quality issues: {quality_issues}")

        metrics = {
            "realtime": self._compute_realtime_metrics(),
            "financial": self._compute_financial_metrics(),
            "inventory": self._compute_inventory_health(),
            "customer": self._compute_customer_behavior(),
            "predictive": self._compute_predictive_alerts(),
            "charts": self._compute_chart_data(),
            "metadata": {
                "computed_at": self.computed_at.isoformat(),
                "rows_analyzed": len(self.df),
                "data_quality_issues": quality_issues,
                "industry": "supermarket"
            }
        }

        # Cache values for next run (trend comparison)
        self._cache_current_value("hourly_sales", metrics["realtime"]["hourly_sales"])
        self._cache_current_value("daily_sales", metrics["financial"]["daily_sales"])

        return metrics

    def _compute_realtime_metrics(self) -> Dict[str, Any]:
        """What's happening in the last hour."""
        now = datetime.now()
        one_hour_ago = now - timedelta(hours=1)

        # Filter last hour safely
        if 'timestamp' in self.df.columns:
            last_hour = self.df[self.df['timestamp'] > one_hour_ago]
        else:
            last_hour = self.df

        # Safe calculations with fallbacks
        hourly_sales = float(last_hour['total'].sum()) if 'total' in last_hour.columns else 0.0

        active_checkouts = 0
        if 'workstationid' in last_hour.columns:
            active_checkouts = int(len(last_hour['workstationid'].dropna().unique()))

        items_per_minute = 0
        if not last_hour.empty:
            items_per_minute = int(len(last_hour) / 60)

        # Transaction time (if available)
        avg_transaction_time = 120.0  # Default 2 minutes
        if 'trantime' in last_hour.columns and not last_hour['trantime'].isna().all():
            try:
                # NOTE(review): groups by 'tranid', which is not guarded above —
                # the except covers frames lacking it; confirm schema.
                avg_transaction_time = float(last_hour.groupby('tranid')['trantime'].sum().mean())
            except Exception:
                pass

        # Queue length estimate
        queue_length = 0
        if 'workstationid' in last_hour.columns and not last_hour.empty:
            try:
                queue_length = int(last_hour.groupby('workstationid').size().mean())
            except Exception:
                pass

        # Growth vs the previously cached hour
        prev_hourly = self._get_cached_value("hourly_sales")
        growth = self._calculate_growth(hourly_sales, prev_hourly)

        return {
            "hourly_sales": hourly_sales,
            "active_checkouts": active_checkouts,
            "items_per_minute": items_per_minute,
            "avg_transaction_time": avg_transaction_time,
            "queue_length_estimate": queue_length,
            "growth_vs_last_hour": growth
        }

    def _compute_financial_metrics(self) -> Dict[str, Any]:
        """Money metrics with industry benchmarks."""

        # Daily sales
        daily_sales = float(self.df['total'].sum()) if 'total' in self.df.columns else 0.0

        # Refunds/Voids
        refund_rate = 0.0
        if 'items' in self.df.columns and 'total' in self.df.columns:
            refunds = self.df[
                self.df['items'].astype(str).str.contains('refund|void|return', case=False, na=False)
            ]['total'].abs().sum()
            daily_sales_clean = self.df[
                ~self.df['items'].astype(str).str.contains('refund|void|return', case=False, na=False)
            ]['total'].sum()

            if daily_sales_clean > 0:
                refund_rate = float(refunds / daily_sales_clean * 100)

        # Average basket
        avg_basket = 0.0
        avg_items = 0.0
        if 'total' in self.df.columns and 'tranid' in self.df.columns:
            try:
                basket_values = self.df.groupby('tranid')['total'].sum()
                avg_basket = float(basket_values.mean())
                avg_items = float(self.df.groupby('tranid')['items'].count().mean()) if 'items' in self.df.columns else 0.0
            except Exception:
                pass

        # Gross margin (if cost available)
        gross_margin = 28.5  # Industry average fallback
        if 'cost' in self.df.columns and 'total' in self.df.columns:
            total_sales = self.df['total'].sum()
            total_cost = self.df['cost'].sum()
            if total_sales > 0:
                gross_margin = float((total_sales - total_cost) / total_sales * 100)

        # Labor efficiency
        labor_efficiency = 0.0
        if 'operatorid' in self.df.columns and 'total' in self.df.columns:
            unique_ops = self.df['operatorid'].nunique()
            if unique_ops > 0:
                labor_efficiency = float(daily_sales / unique_ops / 100)

        return {
            "daily_sales": daily_sales,
            "gross_margin_pct": gross_margin,
            "refund_rate": refund_rate,
            "avg_basket_value": avg_basket,
            "avg_items_per_basket": avg_items,
            "labor_efficiency": labor_efficiency,
            "sales_per_sqft": float(daily_sales / 5000)  # Assume 5k sqft
        }

    def _compute_inventory_health(self) -> Dict[str, Any]:
        """Stock intelligence with predictive alerts."""

        expiring_value = 0.0
        expiring_count = 0  # rows expiring soon; feeds wastage_rate below
        stockout_risk = 0
        wastage_rate = 0.0
        alerts = []

        # Expiry analysis
        if 'expiry_date' in self.df.columns:
            try:
                expiring_soon = self.df[
                    pd.to_datetime(self.df['expiry_date'], errors='coerce') <
                    datetime.now() + timedelta(days=7)
                ]
                expiring_count = len(expiring_soon)
                expiring_value = float(expiring_soon['total'].sum()) if 'total' in expiring_soon.columns else 0.0

                if expiring_value > 5000:
                    alerts.append(f"⚠️ KES {expiring_value:,.0f} expiring <7 days")
            except Exception:
                pass

        # Stock velocity (simple approach)
        if 'artnum' in self.df.columns and 'qty' in self.df.columns:
            try:
                # Group by product and calculate velocity
                product_stats = self.df.groupby('artnum').agg({
                    'qty': 'sum',
                    'total': 'sum'
                }).fillna(0)

                # Assume current stock = last qty value
                current_stock = self.df.groupby('artnum')['qty'].last().fillna(0)

                # Simple velocity (units per day).
                # Fix: original line had a stray closing ')' — a SyntaxError.
                days_observed = max(1, len(self.df.groupby(self.df['timestamp'].dt.date)))
                daily_velocity = product_stats['qty'] / days_observed
                days_left = (current_stock / daily_velocity).fillna(999)

                stockout_risk = int((days_left < 2).sum())

                if stockout_risk > 0:
                    alerts.append(f"🚨 {stockout_risk} SKUs at stockout risk")
            except Exception:
                pass

        # Wastage rate.
        # Fix: previously relied on a fragile `'expiring_soon' in locals()` check;
        # use the count captured in the expiry block instead.
        if len(self.df) > 0:
            wastage_rate = float(expiring_count / len(self.df) * 100)

        return {
            "expiring_value": expiring_value,
            "out_of_stock_skus": stockout_risk,
            "wastage_rate": wastage_rate,
            "stock_turnover": float(365 / 30),  # Simplified
            "carrying_cost": float(self.df['total'].sum() * 0.02) if 'total' in self.df.columns else 0.0,
            "alerts": alerts
        }

    def _compute_customer_behavior(self) -> Dict[str, Any]:
        """Shopper insights with safe fallbacks."""

        unique_customers = 0
        repeat_rate = 0.0
        peak_hour = 0
        weekend_lift = 0.0

        # Unique customers (fall back to operator count when no customer ids)
        if 'customer_id' in self.df.columns:
            unique_customers = int(self.df['customer_id'].nunique())
        elif 'operatorid' in self.df.columns:
            unique_customers = int(self.df['operatorid'].nunique())

        # Repeat rate (if customer_id available)
        if 'customer_id' in self.df.columns and 'tranid' in self.df.columns:
            try:
                repeat_rate = float(
                    self.df.groupby('customer_id')['tranid'].nunique().gt(1).mean() * 100
                )
            except Exception:
                pass

        # Peak hour
        if 'timestamp' in self.df.columns:
            try:
                hourly = self.df.groupby(self.df['timestamp'].dt.hour)['total'].sum()
                peak_hour = int(hourly.idxmax()) if not hourly.empty else 0
            except Exception:
                pass

        # Weekend lift
        if 'timestamp' in self.df.columns:
            try:
                self.df['is_weekend'] = self.df['timestamp'].dt.weekday >= 5
                if self.df['is_weekend'].any():
                    weekend_sales = self.df[self.df['is_weekend']]['total'].sum()
                    weekday_sales = self.df[~self.df['is_weekend']]['total'].sum()
                    if weekday_sales > 0:
                        weekend_lift = float(weekend_sales / weekday_sales * 100 - 100)
            except Exception:
                pass

        return {
            "unique_customers": unique_customers,
            "repeat_rate": repeat_rate,
            "peak_hour": peak_hour,
            "weekend_lift_pct": weekend_lift,
            "new_customers": int(unique_customers * 0.15),  # Assumption
            "customer_acquisition_cost": 50.0,  # Placeholder
            "customer_lifetime_value": 2500.0  # Placeholder
        }

    def _compute_predictive_alerts(self) -> Dict[str, Any]:
        """AI-powered alerts without ML (rule-based intelligence)."""

        alerts = []

        # Unusual pattern detection
        if 'timestamp' in self.df.columns and 'total' in self.df.columns:
            try:
                hourly_sales = self.df.groupby(self.df['timestamp'].dt.hour)['total'].sum()
                if hourly_sales.std() > hourly_sales.mean() * 0.3:
                    alerts.append({
                        "severity": "warning",
                        "title": "📊 Unusual Hourly Pattern",
                        "description": "Sales variance exceeds 30%. Check for system errors.",
                        "action": "investigate"
                    })
            except Exception:
                pass

        # Staffing opportunity
        if 'operatorid' in self.df.columns and 'total' in self.df.columns:
            try:
                operator_efficiency = self.df.groupby('operatorid')['total'].sum()
                low_performers = operator_efficiency[operator_efficiency < operator_efficiency.quantile(0.1)]

                if len(low_performers) > 0:
                    alerts.append({
                        "severity": "info",
                        "title": "👥 Training Opportunity",
                        "description": f"{len(low_performers)} operators below 10th percentile",
                        "action": "schedule_training"
                    })
            except Exception:
                pass

        # Promo opportunity for slow movers
        if 'artnum' in self.df.columns and 'qty' in self.df.columns:
            try:
                slow_movers = self.df.groupby('artnum')['qty'].sum().nsmallest(5).index.tolist()
                if slow_movers:
                    alerts.append({
                        "severity": "insight",
                        "title": "💡 Promo Opportunity",
                        "description": f"{len(slow_movers)} SKUs need velocity boost",
                        "action": "create_promo"
                    })
            except Exception:
                pass

        return {"alerts": alerts}

    def _compute_chart_data(self) -> Dict[str, Any]:
        """Frontend-ready chart data."""

        hourly_sales = []
        top_categories = []
        customer_segments = []

        # Hourly sales trend (24 buckets, zero-filled)
        if 'timestamp' in self.df.columns and 'total' in self.df.columns:
            try:
                hourly = self.df.groupby(self.df['timestamp'].dt.hour)['total'].sum()
                hourly_sales = [{"label": f"{h:02d}:00", "value": float(v)}
                                for h, v in hourly.reindex(range(24), fill_value=0).items()]
            except Exception:
                hourly_sales = []

        # Top categories (if available)
        if 'category' in self.df.columns and 'total' in self.df.columns:
            try:
                category_sales = self.df.groupby('category')['total'].sum().nlargest(5)
                top_categories = [{"label": k, "value": float(v)}
                                  for k, v in category_sales.items()]
            except Exception:
                pass

        # Customer segments (simplified RFM)
        if 'customer_id' in self.df.columns and 'total' in self.df.columns:
            try:
                recency = (datetime.now() - self.df.groupby('customer_id')['timestamp'].max()).dt.days
                frequency = self.df.groupby('customer_id')['tranid'].nunique()
                monetary = self.df.groupby('customer_id')['total'].sum()

                # Quintile-based segmentation
                def segment_score(series):
                    return pd.qcut(series, 5, labels=[1, 2, 3, 4, 5], duplicates='drop')

                r_score = segment_score(recency)
                f_score = segment_score(frequency)
                m_score = segment_score(monetary)

                # Simple segments
                segments = {
                    "VIP": int(((r_score <= 3) & (f_score >= 4) & (m_score >= 4)).sum()),
                    "Regular": int(((r_score <= 3) & (f_score >= 2) & (m_score >= 2)).sum()),
                    "At-Risk": int((r_score > 3).sum())
                }

                customer_segments = [{"label": k, "value": v} for k, v in segments.items()]
            except Exception:
                customer_segments = [{"label": "All", "value": len(self.df)}]

        return {
            "hourly_sales": hourly_sales,
            "top_categories": top_categories,
            "customer_segments": customer_segments,
            "sales_trend_7d": self._generate_trend_data(7)
        }

    def _generate_trend_data(self, days: int) -> List[Dict]:
        """Generate realistic trend data - replace with Prophet ML.

        NOTE: intentionally non-deterministic (np.random) — placeholder only.
        """
        if 'total' not in self.df.columns:
            return []

        base = self.df['total'].sum() / max(1, len(self.df.groupby(self.df['timestamp'].dt.date))) if 'timestamp' in self.df.columns else 1

        return [
            {
                "label": (datetime.now() - timedelta(days=i)).strftime('%a'),
                "value": float(base * (1 + np.random.normal(0, 0.1)))
            }
            for i in range(days, 0, -1)
        ]
app/main.py CHANGED
@@ -25,9 +25,9 @@ from contextlib import asynccontextmanager
25
 
26
  # ─── Router Imports ───────────────────────────────────────────────────────────
27
  # Import ALL routers
28
- from app.routers import health, datasources, reports, flags, scheduler, run, socket
29
  # ─── Dependencies ─────────────────────────────────────────────────────────────
30
- from app.deps import verify_api_key, check_all_services
31
 
32
  # ─── Logger Configuration ───────────────────────────────────────────────────────
33
  logging.basicConfig(
@@ -74,6 +74,29 @@ async def lifespan(app: FastAPI):
74
  logger.info(f"βœ… Scheduler started (PID: {scheduler_process.pid})")
75
 
76
  logger.info("βœ… Startup sequence complete")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
77
  yield
78
 
79
  # ─── Shutdown ──────────────────────────────────────────────────────────────
@@ -155,7 +178,86 @@ async def add_request_tracking(request: Request, call_next):
155
  )
156
 
157
  return response
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
158
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
159
  # ─── Root Endpoint ─────────────────────────────────────────────────────────────
160
  @app.get("/", tags=["root"])
161
  def read_root():
@@ -230,4 +332,5 @@ app.include_router(reports.router, prefix="/api/v1/reports", dependencies=[Depen
230
  app.include_router(flags.router, prefix="/api/v1/flags", dependencies=[Depends(verify_api_key)])
231
  app.include_router(scheduler.router, prefix="/api/v1/scheduler", dependencies=[Depends(verify_api_key)])
232
  app.include_router(run.router, prefix="/api/v1/run", dependencies=[Depends(verify_api_key)])
233
- app.include_router(socket.router, prefix="/api/v1/socket", dependencies=[Depends(verify_api_key)])
 
 
25
 
26
  # ─── Router Imports ───────────────────────────────────────────────────────────
27
  # Import ALL routers
28
+ from app.routers import health, datasources, reports, flags, scheduler, run, socket,analytics_stream
29
  # ─── Dependencies ─────────────────────────────────────────────────────────────
30
+ from app.deps import get_current_user, rate_limit_org, verify_api_key, check_all_services
31
 
32
  # ─── Logger Configuration ───────────────────────────────────────────────────────
33
  logging.basicConfig(
 
74
  logger.info(f"βœ… Scheduler started (PID: {scheduler_process.pid})")
75
 
76
  logger.info("βœ… Startup sequence complete")
77
+
78
+ # Setup Redis streams
79
+ logger.info("πŸ”„ Setting up Redis streams...")
80
+ try:
81
+ active_orgs = redis.keys("entity:*")
82
+ for key in active_orgs:
83
+ key_parts = key.decode().split(":")
84
+ if len(key_parts) >= 3:
85
+ org_id, source_id = key_parts[1], key_parts[2]
86
+ stream_key = f"stream:analytics:{org_id}:{source_id}"
87
+ try:
88
+ redis.xgroup_create(stream_key, f"analytics_consumers_{org_id}", id="0", mkstream=True)
89
+ except Exception as e:
90
+ if "BUSYGROUP" not in str(e):
91
+ logger.warning(f"⚠️ Stream setup warning: {e}")
92
+
93
+ logger.info("βœ… Redis streams consumer groups ready")
94
+ except Exception as e:
95
+ logger.error(f"❌ Stream setup failed: {e}")
96
+
97
+ # Start background KPI scheduler
98
+ logger.info("⏰ Starting KPI refresh scheduler...")
99
+ asyncio.create_task(continuous_kpi_refresh(), name="kpi_scheduler")
100
  yield
101
 
102
  # ─── Shutdown ──────────────────────────────────────────────────────────────
 
178
  )
179
 
180
  return response
181
# ─── NEW: KPI COMPUTATION ENDPOINT (With Auth) ─────────────────────────────────
@app.post("/api/v1/kpi/compute")
async def compute_kpis(
    background_tasks: BackgroundTasks,
    source_id: str = Query(..., description="Data source ID"),
    current_user: dict = Depends(get_current_user),  # Auth from query params
    limited_org: str = Depends(rate_limit_org(max_requests=50)),  # Rate limit
):
    """
    Trigger KPI computation.
    Returns immediately; results published to Redis stream.

    Auth: Uses org_id from query params (validated against Vercel stack auth)
    Rate limit: 50 requests/min per org

    Fix: the non-default `background_tasks` parameter is now declared before
    the defaulted ones — the previous order was a Python SyntaxError.
    """
    # The rate-limit bookkeeping dict is private to app.deps; import it
    # explicitly rather than assuming a module-level name in main.py.
    from app.deps import _rate_limits

    try:
        org_id = current_user["org_id"]

        # Check cache first — serve the cached payload without recomputing
        cached = redis.get(f"kpi_cache:{org_id}:{source_id}")
        if cached:
            return {
                "status": "cached",
                "org_id": org_id,
                "data": json.loads(cached),
                "rate_limit": {
                    "remaining": 50 - _rate_limits[org_id]["count"],
                    "reset_in": max(0, _rate_limits[org_id]["reset_at"] - time.time())
                }
            }

        # Trigger background computation via QStash
        background_tasks.add_task(trigger_kpi_computation, org_id, source_id)

        return {
            "status": "processing",
            "org_id": org_id,
            "message": "KPI computation queued. Poll /analytics/stream/recent for results.",
            "poll_url": f"/api/v1/analytics/stream/recent?org_id={org_id}&source_id={source_id}",
            "rate_limit": {
                "remaining": 50 - _rate_limits[org_id]["count"],
                "reset_in": max(0, _rate_limits[org_id]["reset_at"] - time.time())
            }
        }
    except Exception as e:
        logger.error(f"❌ KPI compute error: {e}")
        raise HTTPException(status_code=500, detail=str(e))
228
 
229
# ─── NEW: BACKGROUND KPI SCHEDULER ───────────────────────────────────────────
async def continuous_kpi_refresh():
    """
    Auto-refresh KPIs every 5 minutes for active organizations.
    Runs as a background task started at app startup.

    Keys are expected to look like "entity:<org_id>:<source_id>"; anything
    with fewer segments is skipped.
    """
    while True:
        try:
            logger.debug("🔄 KPI scheduler tick...")

            # Get all active entity keys from Redis
            active_keys = redis.keys("entity:*")

            for key in active_keys:
                # Fix: upstash_redis returns str keys, so the unconditional
                # key.decode() raised AttributeError; decode only if bytes.
                key_str = key.decode() if isinstance(key, (bytes, bytearray)) else key
                key_parts = key_str.split(":")
                if len(key_parts) >= 3:
                    org_id, source_id = key_parts[1], key_parts[2]

                    # Skip if recently computed (cache exists)
                    cache_key = f"kpi_cache:{org_id}:{source_id}"
                    if redis.exists(cache_key):
                        continue

                    # Trigger async computation (non-blocking)
                    logger.info(f"⏰ Auto-triggering KPIs for {org_id}/{source_id}")
                    await trigger_kpi_computation(org_id, source_id)

        except Exception as e:
            logger.error(f"❌ Scheduler error: {e}")

        # Wait 5 minutes before next run
        await asyncio.sleep(300)
261
  # ─── Root Endpoint ─────────────────────────────────────────────────────────────
262
  @app.get("/", tags=["root"])
263
  def read_root():
 
332
  app.include_router(flags.router, prefix="/api/v1/flags", dependencies=[Depends(verify_api_key)])
333
  app.include_router(scheduler.router, prefix="/api/v1/scheduler", dependencies=[Depends(verify_api_key)])
334
  app.include_router(run.router, prefix="/api/v1/run", dependencies=[Depends(verify_api_key)])
335
+ app.include_router(socket.router, prefix="/api/v1/socket", dependencies=[Depends(verify_api_key)])
336
+ app.include_router(analytics_stream.router, prefix="/api/v1/analytics", tags=["analytics"])
app/mapper.py CHANGED
@@ -487,5 +487,17 @@ def canonify_df(org_id: str, source_id: str, hours_window: int = 24) -> tuple[pd
487
  df = df.replace([np.inf, -np.inf, np.nan], None) # Clean for JSON response
488
  duration_ms = (datetime.now() - start_time).total_seconds() * 1000
489
  print(f"[canonify] βœ… Pipeline complete in {duration_ms:.2f}ms for {org_id}")
 
 
 
 
 
 
 
 
 
 
 
 
490
 
491
  return df, industry, industry_confidence
 
487
  df = df.replace([np.inf, -np.inf, np.nan], None) # Clean for JSON response
488
  duration_ms = (datetime.now() - start_time).total_seconds() * 1000
489
  print(f"[canonify] βœ… Pipeline complete in {duration_ms:.2f}ms for {org_id}")
490
+
491
+ # After line: print(f"[canonify] βœ… Pipeline complete in {duration_ms:.2f}ms")
492
+ if not df.empty:
493
+ redis.publish(
494
+ f"analytics_trigger:{org_id}:{source_id}",
495
+ json.dumps({
496
+ "type": "kpi_compute",
497
+ "entity_type": entity_type,
498
+ "industry": industry
499
+ })
500
+ )
501
+ print(f"[canonify] πŸš€ Triggered analytics for {source_id}")
502
 
503
  return df, industry, industry_confidence
app/qstash_client.py ADDED
@@ -0,0 +1 @@
 
 
1
# app/qstash_client.py
"""Minimal QStash (Upstash) publish client.

NOTE(review): the committed file contained only the bare expression
``qstash_client.py`` — a NameError at import time — while
app/routers/analytics_stream.py imports ``publish_message`` and
``is_qstash_available`` from here.  These implementations follow the
documented QStash v2 REST publish API; confirm against the deployment's
QStash configuration.
"""
import json
import os
import urllib.request
from typing import Optional


def is_qstash_available() -> bool:
    """Return True when a QStash token is configured in the environment."""
    return bool(os.getenv("QSTASH_TOKEN"))


def publish_message(url: str, body: dict, callback: Optional[str] = None) -> dict:
    """Publish *body* to QStash for asynchronous delivery to *url*.

    Parameters:
        url: destination endpoint QStash should POST the JSON body to.
        body: JSON-serializable payload.
        callback: optional URL QStash calls when delivery completes
            (sent via the ``Upstash-Callback`` header).

    Returns:
        The parsed QStash response dict (contains ``messageId``).

    Raises:
        RuntimeError: when no QSTASH_TOKEN is configured.
    """
    token = os.getenv("QSTASH_TOKEN")
    if not token:
        raise RuntimeError("QSTASH_TOKEN is not configured")

    base = os.getenv("QSTASH_URL", "https://qstash.upstash.io/v2/publish")
    headers = {
        "Authorization": f"Bearer {token}",
        "Content-Type": "application/json",
    }
    if callback:
        headers["Upstash-Callback"] = callback

    req = urllib.request.Request(
        f"{base}/{url}",
        data=json.dumps(body).encode("utf-8"),
        headers=headers,
        method="POST",
    )
    with urllib.request.urlopen(req, timeout=10) as resp:
        return json.loads(resp.read().decode("utf-8"))
app/routers/analytics_stream.py ADDED
@@ -0,0 +1,163 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# app/routers/analytics_stream.py
# ── Standard Library ──────────────────────────────────────────────────────────
import asyncio
import json
import uuid
from datetime import datetime
from typing import Dict, List

# ── Third-Party ───────────────────────────────────────────────────────────────
from fastapi import (
    APIRouter,
    BackgroundTasks,
    Body,
    Depends,
    HTTPException,
    Query,
)

# ── Local ─────────────────────────────────────────────────────────────────────
from app.deps import get_current_user
from app.qstash_client import is_qstash_available, publish_message
from app.redis_client import redis
11

# NOTE(review): main.py mounts this router with prefix "/api/v1/analytics",
# so the router itself must only contribute "/stream".  The previous value
# ("/api/v1/analytics/stream") produced doubled paths like
# /api/v1/analytics/api/v1/analytics/stream/recent.
router = APIRouter(prefix="/stream", tags=["analytics"])
13
+
14
class AnalyticsStreamManager:
    """Manages Redis streams for real-time analytics without WebSockets.

    One instance per (org_id, source_id) pair; all messages go through the
    stream key ``stream:analytics:<org_id>:<source_id>``.
    """

    def __init__(self, org_id: str, source_id: str):
        self.org_id = org_id
        self.source_id = source_id
        self.stream_key = f"stream:analytics:{org_id}:{source_id}"
        self.consumer_group = f"analytics_consumers_{org_id}"

    async def ensure_consumer_group(self):
        """Create Redis consumer group if not exists."""
        try:
            redis.xgroup_create(
                self.stream_key,
                self.consumer_group,
                id="0",
                mkstream=True
            )
        except Exception as e:
            # BUSYGROUP just means the group already exists — not an error.
            if "BUSYGROUP" not in str(e):
                print(f"[stream] ⚠️ Group creation warning: {e}")

    async def publish_kpi_update(self, data: Dict):
        """Publish KPI update to Redis stream."""
        self._publish("kpi_update", data)

    async def publish_insight(self, insight: Dict):
        """Publish AI insight to stream."""
        self._publish("insight", insight)

    def _publish(self, msg_type: str, data: Dict):
        """Serialize a typed message and append it to the stream."""
        message = {
            "type": msg_type,
            "timestamp": datetime.utcnow().isoformat(),
            "data": data
        }
        redis.xadd(self.stream_key, {"message": json.dumps(message)})

    def read_recent(self, count: int = 10) -> List[Dict]:
        """Read recent messages for polling clients (newest first).

        Fix: the original indexed ``fields[b"message"].decode()``, which only
        works for a bytes-returning client — upstash_redis returns str keys
        and values, so both forms are accepted here.
        """
        try:
            messages = redis.xrevrange(self.stream_key, count=count)
            out: List[Dict] = []
            for _msg_id, fields in messages:
                raw = None
                if isinstance(fields, dict):
                    raw = fields.get("message", fields.get(b"message"))
                if raw is None:
                    continue
                if isinstance(raw, bytes):
                    raw = raw.decode()
                out.append(json.loads(raw))
            return out
        except Exception as e:
            print(f"[stream] ❌ Read error: {e}")
            return []
65
+
66
@router.get("/recent")
async def get_recent_analytics(
    count: int = Query(10, ge=1, le=100),
    user: Dict = Depends(get_current_user),
):
    """Poll recent analytics (replaces Socket.io).

    The org/source scope is taken from the authenticated user, so a caller
    can never read another org's stream.

    Fix: the original signature contained ``org_id = current_user["org_id"]``
    inside the parameter list — ``current_user`` is undefined at function
    definition time, so the module failed to import.  Both values are now
    derived from the auth dependency inside the body.
    """
    org_id = user["org_id"]
    source_id = user.get("source_id", "default")

    manager = AnalyticsStreamManager(org_id, source_id)
    messages = manager.read_recent(count)

    return {
        "status": "success",
        "org_id": org_id,  # Confirm which org
        "messages": messages,
        "timestamp": datetime.utcnow().isoformat()
    }
86
+
87
@router.post("/trigger")
async def trigger_kpi_computation(
    source_id: str = Query(...),
    org_id: str = Query(...),
    current_user: Dict = Depends(get_current_user),
):
    """Trigger KPI computation via QStash.

    Returns the cached KPI payload immediately when one exists; otherwise
    enqueues a QStash message that calls back into this router's
    ``/callback`` endpoint and tells the client where to poll.
    """
    if not is_qstash_available():
        raise HTTPException(
            status_code=503,
            detail="QStash not configured. Check HF secrets."
        )

    # Check cache (your existing logic) — a hit means KPIs were computed
    # within the last 5 minutes.
    cached = redis.get(f"kpi_cache:{org_id}:{source_id}")
    if cached:
        return {"status": "cached", "data": json.loads(cached)}

    # NOTE(review): the original referenced an undefined ``settings`` object
    # (NameError at request time).  Read the public base URL from the
    # environment instead — confirm APP_URL is set in deployment secrets.
    import os
    app_url = os.getenv("APP_URL", "").rstrip("/")

    # Publish to QStash.  The callback paths match this router's mounted
    # location (/api/v1/analytics + /stream).
    try:
        result = publish_message(
            url=f"{app_url}/api/v1/analytics/stream/callback",
            body={
                "org_id": org_id,
                "source_id": source_id,
                "user_id": current_user["user_id"]
            },
            callback=f"{app_url}/api/v1/analytics/stream/notify"
        )

        return {
            "status": "processing",
            "message_id": result["messageId"],
            "poll_url": f"/api/v1/analytics/stream/recent?org_id={org_id}&source_id={source_id}"
        }

    except Exception as e:
        raise HTTPException(status_code=500, detail=f"QStash error: {str(e)}")
126
+
127
@router.post("/callback")
async def qstash_kpi_callback(
    background_tasks: BackgroundTasks,
    payload: Dict = Body(...),
):
    """QStash calls this to compute KPIs.

    Fix: the original declared the non-default ``background_tasks`` parameter
    AFTER the defaulted ``payload`` parameter, which is a Python SyntaxError;
    the order is swapped here.  The heavy computation runs after the 202-style
    response is sent.
    """
    org_id = payload["org_id"]
    source_id = payload["source_id"]

    # Trigger background computation
    background_tasks.add_task(
        run_analytics_worker, org_id, source_id
    )

    return {"status": "accepted"}
142
+
143
@router.post("/notify")
async def qstash_notification(payload: Dict = Body(...)):
    """QStash calls this when job is done.

    Currently a no-op acknowledgement: the payload is accepted but unused.
    NOTE(review): ``Body`` is not imported at the top of this module —
    confirm the fastapi import, otherwise the module fails at import time.
    """
    # This is where you notify frontend
    # Could ping a webhook or update a status key in Redis

    return {"status": "ok"}
150
+
151
async def run_analytics_worker(org_id: str, source_id: str):
    """Run the KPI worker for one org/source and publish its results.

    Any failure is logged and swallowed so the caller (a background task)
    never crashes the request handler that scheduled it.
    """
    try:
        # Imported lazily to avoid a circular import at module load time.
        from app.tasks.analytics_worker import AnalyticsWorker

        kpi_results = await AnalyticsWorker(org_id, source_id).run()

        # Fan the results out on the Redis stream for polling clients.
        stream = AnalyticsStreamManager(org_id, source_id)
        await stream.publish_kpi_update(kpi_results)

    except Exception as e:
        print(f"[callback] ❌ Worker failed: {e}")
app/routers/datasources.py CHANGED
@@ -2,7 +2,7 @@ from fastapi import APIRouter, Query, Form, File, UploadFile, Depends, HTTPExcep
2
  from fastapi.responses import JSONResponse
3
  from pydantic import BaseModel
4
  from typing import List, Any, Dict, Union
5
- from app.deps import verify_api_key
6
  from app.db import get_conn, ensure_raw_table, bootstrap
7
  from app.mapper import canonify_df
8
  from app.routers.socket import sio
@@ -11,6 +11,8 @@ import json
11
  import time
12
  from datetime import datetime, timedelta
13
  from app.redis_client import redis
 
 
14
  router = APIRouter(tags=["datasources"]) # Remove
15
 
16
 
@@ -83,9 +85,10 @@ async def create_source_json(
83
  orgId: str = Query(...),
84
  sourceId: str = Query(...),
85
  type: str = Query(...),
 
86
  _: str = Depends(verify_api_key),
87
  ):
88
-
89
  org_id = orgId
90
  source_id = sourceId
91
  ds_type = type
@@ -118,8 +121,8 @@ async def create_source_json(
118
  # Entity will be auto-queued by process_detect_industry()
119
 
120
  df, industry, confidence = canonify_df(org_id, source_id)
121
- # 3. 🎯 Prepare preview for real-time broadcast
122
-
123
  # Convert DataFrame to JSON-safe format
124
  preview_df = df.head(3).copy()
125
  for col in preview_df.columns:
 
2
  from fastapi.responses import JSONResponse
3
  from pydantic import BaseModel
4
  from typing import List, Any, Dict, Union
5
+ from app.deps import verify_api_key,get_current_user
6
  from app.db import get_conn, ensure_raw_table, bootstrap
7
  from app.mapper import canonify_df
8
  from app.routers.socket import sio
 
11
  import time
12
  from datetime import datetime, timedelta
13
  from app.redis_client import redis
14
+ # Add this import
15
+ from app.tasks.analytics_worker import trigger_kpi_computation
16
  router = APIRouter(tags=["datasources"]) # Remove
17
 
18
 
 
85
  orgId: str = Query(...),
86
  sourceId: str = Query(...),
87
  type: str = Query(...),
88
+ current_user: dict = Depends(get_current_user),
89
  _: str = Depends(verify_api_key),
90
  ):
91
+
92
  org_id = orgId
93
  source_id = sourceId
94
  ds_type = type
 
121
  # Entity will be auto-queued by process_detect_industry()
122
 
123
  df, industry, confidence = canonify_df(org_id, source_id)
124
+ # run autokpi computation in background
125
+ background_tasks.add_task(trigger_kpi_computation, org_id, source_id)
126
  # Convert DataFrame to JSON-safe format
127
  preview_df = df.head(3).copy()
128
  for col in preview_df.columns:
app/tasks/analytics_worker.py ADDED
@@ -0,0 +1,208 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # app/tasks/analytics_worker.py
2
+ import asyncio
3
+ import json
4
+ import pandas as pd
5
+ from datetime import datetime
6
+ from typing import Dict, Any
7
+ from app.redis_client import redis
8
+ from app.db import get_conn
9
+ from app.engine.kpi_calculators.base import get_kpi_calculator
10
+
11
class AnalyticsWorker:
    """Background worker for KPI computation and Redis pub/sub.

    Loads the canonical table for one (org_id, source_id) pair, computes
    industry-specific KPIs, publishes them on Redis pub/sub channels, and
    caches the result for 5 minutes.
    """

    def __init__(self, org_id: str, source_id: str, hours_window: int = 24):
        self.org_id = org_id
        self.source_id = source_id
        self.hours_window = hours_window   # lookback window for canonical data
        self.computed_at = None            # set once compute_all() succeeds

    async def run(self) -> Dict[str, Any]:
        """Async KPI computation with error handling.

        Returns the KPI results dict, or ``{"error": ...}`` on any failure.
        """
        start_time = datetime.now()

        try:
            # 1. Load data
            df = await self._load_dataframe()
            if df.empty:
                await self._publish_status("no_data")
                return {"error": "No data in window"}

            # 2. Get industry
            industry = await self._get_industry()
            if not industry or industry == "UNKNOWN":
                await self._publish_status("unknown_industry")
                return {"error": "Industry unknown"}

            # 3. Compute KPIs off the event loop (compute_all is CPU-bound)
            calculator = get_kpi_calculator(industry, self.org_id, df, self.source_id)
            results = await asyncio.to_thread(calculator.compute_all)

            self.computed_at = datetime.now()

            # 4. Publish to Redis
            await self._publish_results(results)

            # 5. Cache for 5 minutes so pollers/schedulers skip recompute
            cache_ttl = 300  # 5 min
            redis.setex(
                f"kpi_cache:{self.org_id}:{self.source_id}",
                cache_ttl,
                json.dumps(results)
            )

            duration = (self.computed_at - start_time).total_seconds()
            print(f"[worker] ✅ {self.org_id}/{self.source_id} computed in {duration:.2f}s")

            return results

        except Exception as e:
            error_msg = f"KPI computation failed: {str(e)}"
            print(f"[worker] ❌ {self.org_id}/{self.source_id}: {error_msg}")
            await self._publish_error(error_msg)
            return {"error": error_msg}

    async def _load_dataframe(self) -> pd.DataFrame:
        """Load from DuckDB without blocking the event loop."""
        loop = asyncio.get_event_loop()
        return await loop.run_in_executor(None, self._sync_load_dataframe)

    def _sync_load_dataframe(self) -> pd.DataFrame:
        """
        Synchronous DB loading with canonical table readiness check.
        Waits up to 30 seconds for the table to exist and contain data.
        Returns an empty DataFrame on any failure (caller treats that as
        "no data").
        """
        # Fix: the module never imported these two names, so every call to
        # this method raised NameError — import them locally.
        import time
        from datetime import timedelta

        conn = None
        MAX_WAIT = 30       # seconds to wait for the canonical table
        RETRY_INTERVAL = 2  # seconds between readiness probes

        try:
            # Entity type (and hence table name) comes from Redis.
            entity_key = f"entity:{self.org_id}:{self.source_id}"
            entity_info = redis.get(entity_key)

            if not entity_info:
                print(f"[worker] ⚠️ No entity info in Redis: {entity_key}")
                return pd.DataFrame()

            try:
                entity_type = json.loads(entity_info)['entity_type']
                if entity_type == "UNKNOWN":
                    print(f"[worker] ⚠️ Entity type is UNKNOWN, skipping")
                    return pd.DataFrame()
            except (json.JSONDecodeError, KeyError) as e:
                print(f"[worker] ❌ Invalid entity info: {e}")
                return pd.DataFrame()

            table_name = f"main.{entity_type}_canonical"
            cutoff_time = datetime.now() - timedelta(hours=self.hours_window)

            conn = get_conn(self.org_id)

            # Wait for table readiness (ingestion may still be writing it).
            start_time = time.time()
            elapsed = 0

            while elapsed < MAX_WAIT:
                try:
                    # Row count probes both table existence and data presence.
                    count_query = f"SELECT COUNT(*) FROM {table_name} WHERE timestamp >= ?"
                    row_count = conn.execute(count_query, [cutoff_time]).fetchone()[0]

                    if row_count > 0:
                        print(f"[worker] ✅ Table ready: {row_count} rows in {table_name} (waited {elapsed:.1f}s)")
                        break
                    else:
                        print(f"[worker] ⏳ Table exists but no data yet (waited {elapsed:.1f}s)")

                except Exception as e:
                    error_msg = str(e).lower()
                    if "does not exist" in error_msg or "catalog error" in error_msg:
                        print(f"[worker] ⏳ Table doesn't exist yet (waited {elapsed:.1f}s)")
                    else:
                        print(f"[worker] ⚠️ Unexpected error: {e} (waited {elapsed:.1f}s)")

                time.sleep(RETRY_INTERVAL)
                elapsed = time.time() - start_time

            else:
                # while-else: the loop exhausted without `break` → timed out.
                print(f"[worker] ❌ Timeout after {MAX_WAIT}s: {table_name}")
                return pd.DataFrame()

            # Load all rows inside the lookback window, newest first.
            query = f"SELECT * FROM {table_name} WHERE timestamp >= ? ORDER BY timestamp DESC"
            df = conn.execute(query, [cutoff_time]).df()

            print(f"[worker] 📊 Loaded {len(df)} rows × {len(df.columns)} cols")
            return df

        except Exception as e:
            print(f"[worker] ❌ Fatal error: {e}")
            return pd.DataFrame()

        finally:
            if conn:
                try:
                    conn.close()
                    print(f"[worker] 🔒 Connection closed for {self.org_id}")
                except Exception as e:
                    print(f"[worker] ⚠️ Error closing connection: {e}")

    async def _get_industry(self) -> str:
        """Get industry from Redis cache; fall back to 'supermarket'."""
        try:
            industry_key = f"industry:{self.org_id}:{self.source_id}"
            data = redis.get(industry_key)
            if data:
                return json.loads(data).get('industry', 'supermarket').lower()
            return "supermarket"
        except Exception:
            # Fix: was a bare `except:` — narrowed so KeyboardInterrupt /
            # SystemExit are not swallowed.
            return "supermarket"

    async def _publish_results(self, results: Dict[str, Any]):
        """Publish KPIs and insights to Redis pub/sub."""

        # Main KPI channel
        kpi_channel = f"analytics:{self.org_id}:{self.source_id}:kpi"
        kpi_message = {
            "type": "kpi_update",
            "timestamp": self.computed_at.isoformat(),
            "data": results
        }
        redis.publish(kpi_channel, json.dumps(kpi_message))

        # Separate insight channel — one message per predictive alert.
        insight_channel = f"analytics:{self.org_id}:{self.source_id}:insights"
        for alert in results.get('predictive', {}).get('alerts', []):
            insight_message = {
                "type": "insight",
                "timestamp": self.computed_at.isoformat(),
                "data": alert
            }
            redis.publish(insight_channel, json.dumps(insight_message))

        print(f"[worker] 📤 Published to {kpi_channel}")

    async def _publish_status(self, status: str):
        """Publish a system status update (e.g. no_data, unknown_industry)."""
        channel = f"analytics:{self.org_id}:{self.source_id}:status"
        redis.publish(channel, json.dumps({
            "type": "status",
            "status": status,
            "timestamp": datetime.now().isoformat()
        }))

    async def _publish_error(self, message: str):
        """Publish an error to the status channel."""
        channel = f"analytics:{self.org_id}:{self.source_id}:status"
        redis.publish(channel, json.dumps({
            "type": "error",
            "message": message,
            "timestamp": datetime.now().isoformat()
        }))
+ }))
203
+
204
+ # Helper for triggering worker
205
+ async def trigger_kpi_computation(org_id: str, source_id: str):
206
+ """Non-blocking KPI trigger"""
207
+ worker = AnalyticsWorker(org_id, source_id)
208
+ asyncio.create_task(worker.run(), name=f"kpi-{org_id}-{source_id}")