Peter Mutwiri committed on
Commit
472833f
·
0 Parent(s):

Clean snapshot

Browse files
.gitattributes ADDED
@@ -0,0 +1 @@
 
 
1
+ *.duckdb filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ node_modules
2
+ client-nextjs/googlecalendar.json
3
+ .env.local
4
+ analytics-service/.env.analytics
.vscode/settings.json ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ {
2
+ "python-envs.defaultEnvManager": "ms-python.python:system",
3
+ "python-envs.pythonProjects": []
4
+ }
Dockerfile ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# ---- 1. base image ---------------------------------------------------------
FROM python:3.11-slim

# ---- 2. system dependencies for binary wheels ------------------------------
RUN apt-get update && apt-get install -y --no-install-recommends \
    build-essential \
    gcc \
    g++ \
    cmake \
    libgomp1 \
    libstdc++6 \
    ca-certificates \
    wget \
    unzip \
    && rm -rf /var/lib/apt/lists/*

# ---- 2½. DuckDB CLI (optional but handy for debugging) --------------------
RUN wget -q https://github.com/duckdb/duckdb/releases/download/v0.10.2/duckdb_cli-linux-amd64.zip && \
    unzip duckdb_cli-linux-amd64.zip -d /usr/local/bin && rm *.zip

# ---- 3. upgrade pip & enable pre-built wheels ------------------------------
RUN pip install --no-cache-dir --upgrade pip setuptools wheel

# ---- 4. install Python deps (+ DuckDB driver) ------------------------------
# duckdb is pinned to the same version as the CLI above.
COPY requirements.txt /tmp/requirements.txt
RUN pip install --no-cache-dir --prefer-binary -r /tmp/requirements.txt && \
    pip install --no-cache-dir duckdb==0.10.2

# ---- 5. copy source --------------------------------------------------------
COPY . /app
WORKDIR /app

# ---- 5½. scheduler loop ----------------------------------------------------
# NOTE: redundant with `COPY . /app` above unless scheduler_loop.py is
# excluded by .dockerignore; kept for safety.
COPY scheduler_loop.py /app/scheduler_loop.py

# ---- 6. runtime env vars ---------------------------------------------------
# SECURITY: this is a development-only default. Override API_KEYS at
# deploy time; never bake production secrets into the image.
ENV API_KEYS=dev-analytics-key-123

# ---- 7. start both services -----------------------------------------------
# The scheduler runs in the background; `exec` makes uvicorn the main
# process so the container stops (and restarts) when the API dies and
# SIGTERM is delivered to uvicorn for a graceful shutdown.
CMD ["sh", "-c", "python /app/scheduler_loop.py & exec python -m uvicorn app.main:app --host 0.0.0.0 --port 8080"]
analytics-data/.schedules.json ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "id": "2bc54229-97ee-4101-b751-ea9f8edfa84d",
4
+ "orgId": "demo",
5
+ "frequency": "daily",
6
+ "analytics": [
7
+ "eda"
8
+ ],
9
+ "nextRun": "2025-10-16T13:50:46.392839"
10
+ },
11
+ {
12
+ "id": "5a73737e-5c28-496f-a941-0e060760ccb3",
13
+ "orgId": "23739e24-d3ae-4ecf-b32f-16e019a561bd",
14
+ "frequency": "daily",
15
+ "analytics": [
16
+ "eda",
17
+ "basket",
18
+ "forecast"
19
+ ],
20
+ "nextRun": "2025-10-16T13:50:46.398193"
21
+ },
22
+ {
23
+ "id": "4f99c560-8ff5-471e-9711-91b92b7be4b5",
24
+ "orgId": "23739e24-d3ae-4ecf-b32f-16e019a561bd",
25
+ "frequency": "daily",
26
+ "analytics": [
27
+ "eda",
28
+ "basket",
29
+ "forecast"
30
+ ],
31
+ "nextRun": "2025-10-16T13:50:46.402940"
32
+ },
33
+ {
34
+ "id": "1a03ea97-a085-4d3c-994a-54be9b8885f6",
35
+ "orgId": "23739e24-d3ae-4ecf-b32f-16e019a561bd",
36
+ "frequency": "daily",
37
+ "analytics": [
38
+ "eda",
39
+ "basket",
40
+ "forecast"
41
+ ],
42
+ "nextRun": "2025-10-16T13:50:46.407630"
43
+ }
44
+ ]
analytics-data/duckdb/23739e24-d3ae-4ecf-b32f-16e019a561bd.duckdb ADDED
Binary file (131 Bytes). View file
 
analytics-data/duckdb/demo.duckdb ADDED
Binary file (131 Bytes). View file
 
app/db.py ADDED
@@ -0,0 +1,127 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import duckdb, os, pathlib, json
2
+ from datetime import datetime
3
+ from typing import Any, Dict, List
4
+
5
+ DB_DIR = pathlib.Path("./data/duckdb")
6
+ DB_DIR.mkdir(parents=True, exist_ok=True)
7
+
8
+
9
def get_conn(org_id: str):
    """Open a read-write DuckDB connection for one organization.

    Each org is isolated in its own database file under DB_DIR.
    """
    path = str(DB_DIR / f"{org_id}.duckdb")
    return duckdb.connect(path, read_only=False)
13
+
14
+
15
+ # ------------------------------------------------------------
16
+ # 🔹 Backward-compatible table for raw JSON ingestion
17
+ # ------------------------------------------------------------
18
def ensure_raw_table(conn):
    """
    Maintains legacy compatibility for ingestion from webhooks / file uploads.

    Creates the `main` schema and the append-only raw_rows landing table
    (ingestion timestamp + raw JSON payload) if they do not already exist.
    """
    statements = (
        "CREATE SCHEMA IF NOT EXISTS main",
        """
        CREATE TABLE IF NOT EXISTS main.raw_rows(
            ingested_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
            row_data JSON
        )
        """,
    )
    for stmt in statements:
        conn.execute(stmt)
29
+
30
+
31
+ # ------------------------------------------------------------
32
+ # 🔹 Flexible dynamic schema table creation
33
+ # ------------------------------------------------------------
34
def ensure_table(conn, table_name: str, sample_record: Dict[str, Any]):
    """
    Ensure a DuckDB table exists with columns inferred from sample_record.

    Creates main.<table_name> with a surrogate id and ingestion timestamp,
    then adds any column present in sample_record that the table lacks.

    Raises:
        ValueError: if the table or a column name is not a plain Python
            identifier. Names are interpolated into SQL text, so this
            guards against SQL injection from attacker-controlled payloads.
    """
    if not table_name.isidentifier():
        raise ValueError(f"Unsafe table name: {table_name!r}")

    conn.execute("CREATE SCHEMA IF NOT EXISTS main")
    conn.execute(
        f"CREATE TABLE IF NOT EXISTS main.{table_name} ("
        "id UUID DEFAULT uuid(), "
        "_ingested_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP)"
    )

    if not sample_record:
        return

    existing_cols = {r[0] for r in conn.execute(f"PRAGMA table_info('main.{table_name}')").fetchall()}

    for col, val in sample_record.items():
        if col in existing_cols:
            continue
        if not col.isidentifier():
            raise ValueError(f"Unsafe column name: {col!r}")
        dtype = infer_duckdb_type(val)
        print(f"[db] ➕ Adding new column '{col}:{dtype}' to main.{table_name}")
        conn.execute(f"ALTER TABLE main.{table_name} ADD COLUMN {col} {dtype}")
57
+
58
+
59
def infer_duckdb_type(value: Any) -> str:
    """Map a Python value to the DuckDB column type used for ingestion.

    bool is tested before int because bool is a subclass of int.
    Anything unrecognized falls back to VARCHAR.
    """
    type_map = (
        (bool, "BOOLEAN"),
        (int, "BIGINT"),
        (float, "DOUBLE"),
        (datetime, "TIMESTAMP"),
        ((dict, list), "JSON"),
    )
    for python_type, duck_type in type_map:
        if isinstance(value, python_type):
            return duck_type
    return "VARCHAR"
72
+
73
+
74
+ # ------------------------------------------------------------
75
+ # 🔹 Insert records with auto-schema
76
+ # ------------------------------------------------------------
77
def insert_records(conn, table_name: str, records: List[Dict[str, Any]]):
    """
    Insert records into main.<table_name>.

    Column order is taken from the first record; later records contribute
    None for keys they lack, and extra keys they carry are ignored.
    Assumes ensure_table() has already been called.

    Raises:
        ValueError: if the table or a column name is not a plain Python
            identifier (names are interpolated into SQL text, so this
            guards against SQL injection).
    """
    if not records:
        return

    cols = list(records[0].keys())
    if not table_name.isidentifier():
        raise ValueError(f"Unsafe table name: {table_name!r}")
    for col in cols:
        if not col.isidentifier():
            raise ValueError(f"Unsafe column name: {col!r}")

    placeholders = ", ".join(["?"] * len(cols))
    col_list = ", ".join(cols)
    insert_sql = f"INSERT INTO main.{table_name} ({col_list}) VALUES ({placeholders})"

    values = [tuple(r.get(c) for c in cols) for r in records]
    conn.executemany(insert_sql, values)
    print(f"[db] ✅ Inserted {len(records)} rows into {table_name}")
93
+
94
+
95
+ # ------------------------------------------------------------
96
+ # 🔹 Unified bootstrap entrypoint
97
+ # ------------------------------------------------------------
98
def bootstrap(org_id: str, payload: Dict[str, Any]):
    """
    Main ingestion entrypoint.

    Accepts either:
      - multi-table mode: {"tables": {name: [rows, ...], ...}}
      - single-table mode: [rows, ...] stored under the default 'sales' table
    The raw payload is always logged to main.raw_rows for lineage tracking.

    The connection is now closed in a finally block so a failure part-way
    through ingestion no longer leaks the DuckDB handle (the original left
    it open on any exception).
    """
    conn = get_conn(org_id)
    try:
        conn.execute("CREATE SCHEMA IF NOT EXISTS main")
        ensure_raw_table(conn)

        # Log raw payload for debugging / lineage
        conn.execute("INSERT INTO main.raw_rows (row_data) VALUES (?)", (json.dumps(payload),))

        if isinstance(payload, dict) and "tables" in payload:
            # multi-table mode
            for table_name, rows in payload["tables"].items():
                if not rows:
                    continue
                ensure_table(conn, table_name, rows[0])
                insert_records(conn, table_name, rows)
        elif isinstance(payload, list):
            # single-table mode (assume 'sales' as default)
            ensure_table(conn, "sales", payload[0])
            insert_records(conn, "sales", payload)
        else:
            print("[db] ⚠️ Unsupported payload shape")
    finally:
        conn.close()
app/deps.py ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
import os
from fastapi import HTTPException, Header

# Comma-separated allow-list; blank entries are dropped so an unset/empty
# API_KEYS env var accepts nothing (the original kept "" in the list).
API_KEYS = [k.strip() for k in os.getenv("API_KEYS", "").split(",") if k.strip()]

def verify_key(x_api_key: str = Header(None, convert_underscores=True)):  # header name is case-insensitive
    """FastAPI dependency requiring a valid X-API-Key header.

    Returns the key on success.

    Raises:
        HTTPException: 401 when the header is missing or not in API_KEYS.
    """
    # SECURITY: do not log the received key -- secrets must never reach
    # log output (the original printed both the key and the allow-list).
    if not x_api_key or x_api_key not in API_KEYS:
        raise HTTPException(status_code=401, detail="Invalid API key")
    return x_api_key
app/engine/analytics.py ADDED
@@ -0,0 +1,1193 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import numpy as np
3
+ from prophet import Prophet
4
+ from datetime import datetime
5
+ import redis
6
+ import json
7
+ from sklearn.cluster import KMeans, DBSCAN
8
+ from sklearn.preprocessing import StandardScaler, MinMaxScaler
9
+ from sklearn.decomposition import PCA
10
+ from sklearn.ensemble import IsolationForest
11
+ from .json_utils import CustomJSONEncoder
12
+ from scipy import stats
13
+ from scipy.stats import pearsonr
14
+ from statsmodels.tsa.seasonal import seasonal_decompose
15
+ from statsmodels.tsa.stattools import adfuller
16
+ import networkx as nx
17
+ from sklearn.metrics import silhouette_score
18
+ from sklearn.feature_extraction.text import TfidfVectorizer
19
+ from .supermarket_metrics import supermarket_insights
20
+ from app.utils.detect_industry import is_supermarket # next snippet
21
+
22
+ class AnalyticsService:
23
+ def __init__(self):
24
+ self.redis_client = redis.Redis(host='localhost', port=6379, db=0)
25
+ self.industry_metrics = {
26
+ 'retail': self._retail_metrics,
27
+ 'wholesale': self._wholesale_metrics,
28
+ 'supermarket': self._supermarket_metrics,
29
+ 'manufacturing': self._manufacturing_metrics,
30
+ 'healthcare': self._healthcare_metrics
31
+ }
32
+ self.cross_industry_analyzers = {
33
+ 'market_dynamics': self._analyze_market_dynamics,
34
+ 'supply_chain': self._analyze_supply_chain,
35
+ 'customer_insights': self._analyze_customer_insights,
36
+ 'operational_efficiency': self._analyze_operational_efficiency,
37
+ 'risk_assessment': self._analyze_risk_patterns,
38
+ 'sustainability': self._analyze_sustainability_metrics
39
+ }
40
+
41
+ def perform_eda(self, data, industry=None):
42
+ """
43
+ Perform enhanced Exploratory Data Analysis with cross-industry insights
44
+ """
45
+ if not data:
46
+ raise ValueError("Empty dataset provided")
47
+
48
+ df = pd.DataFrame(data)
49
+
50
+ if df.empty:
51
+ raise ValueError("Empty dataset provided")
52
+
53
+ # Validate numeric columns
54
+ numeric_cols = df.select_dtypes(include=[np.number]).columns
55
+ if len(numeric_cols) == 0:
56
+ raise ValueError("Non-numeric values found in dataset")
57
+
58
+ # Convert date columns to datetime
59
+ date_columns = []
60
+ for col in df.columns:
61
+ if df[col].dtype == 'object':
62
+ try:
63
+ df[col] = pd.to_datetime(df[col])
64
+ date_columns.append(col)
65
+ except (ValueError, TypeError):
66
+ continue
67
+
68
+ # Get numeric columns excluding dates
69
+ numeric_cols = df.select_dtypes(include=[np.number]).columns
70
+
71
+ # Advanced statistics and AI-ready features
72
+ analysis_results = {
73
+ 'basic_stats': df[numeric_cols].describe().to_dict() if len(numeric_cols) > 0 else {},
74
+ 'missing_values': df.isnull().sum().to_dict(),
75
+ 'columns': list(df.columns),
76
+ 'row_count': len(df),
77
+ 'correlation_matrix': df[numeric_cols].corr().to_dict() if len(numeric_cols) > 0 else {},
78
+ 'skewness': df[numeric_cols].skew().to_dict() if len(numeric_cols) > 0 else {},
79
+ 'kurtosis': df[numeric_cols].kurtosis().to_dict() if len(numeric_cols) > 0 else {},
80
+ 'outliers': self._detect_outliers(df),
81
+ 'distribution_tests': self._perform_distribution_tests(df),
82
+ 'dimensionality_reduction': self._perform_dimensionality_reduction(df),
83
+ 'temporal_patterns': self._analyze_temporal_patterns(df),
84
+ 'anomaly_detection': self._detect_anomalies(df),
85
+ 'feature_importance': self._calculate_feature_importance(df)
86
+ }
87
+ # --- supermarket auto-detection ---
88
+ if is_supermarket(df):
89
+ industry = 'supermarket'
90
+ results['supermarket_kpis'] = supermarket_insights(df)
91
+ # Add industry-specific metrics
92
+ if industry and industry.lower() in self.industry_metrics:
93
+ analysis_results['industry_metrics'] = self.industry_metrics[industry.lower()](df)
94
+
95
+ # Add cross-industry insights
96
+ analysis_results['cross_industry_insights'] = {}
97
+ for analyzer_name, analyzer_func in self.cross_industry_analyzers.items():
98
+ analysis_results['cross_industry_insights'][analyzer_name] = analyzer_func(df)
99
+
100
+ return analysis_results
101
+
102
+ def _detect_outliers(self, df):
103
+ """
104
+ Detect outliers using IQR method for numerical columns
105
+ """
106
+ outliers = {}
107
+ for column in df.select_dtypes(include=[np.number]).columns:
108
+ Q1 = df[column].quantile(0.25)
109
+ Q3 = df[column].quantile(0.75)
110
+ IQR = Q3 - Q1
111
+ outliers[column] = {
112
+ 'count': len(df[(df[column] < (Q1 - 1.5 * IQR)) | (df[column] > (Q3 + 1.5 * IQR))]),
113
+ 'percentage': len(df[(df[column] < (Q1 - 1.5 * IQR)) | (df[column] > (Q3 + 1.5 * IQR))]) / len(df) * 100
114
+ }
115
+ return outliers
116
+
117
+ def _perform_distribution_tests(self, df):
118
+ """
119
+ Perform distribution tests for numerical columns
120
+ """
121
+ tests = {}
122
+ for column in df.select_dtypes(include=[np.number]).columns:
123
+ shapiro_test = stats.shapiro(df[column].dropna())
124
+ tests[column] = {
125
+ 'shapiro_test': {
126
+ 'statistic': float(shapiro_test.statistic),
127
+ 'p_value': float(shapiro_test.pvalue)
128
+ }
129
+ }
130
+ return tests
131
+
132
+ def _perform_dimensionality_reduction(self, df):
133
+ """
134
+ Perform PCA for dimensional insights
135
+ """
136
+ numeric_cols = df.select_dtypes(include=[np.number]).columns
137
+ if len(numeric_cols) < 2:
138
+ return {}
139
+
140
+ scaler = StandardScaler()
141
+ scaled_data = scaler.fit_transform(df[numeric_cols])
142
+ pca = PCA()
143
+ pca_result = pca.fit_transform(scaled_data)
144
+
145
+ return {
146
+ 'explained_variance_ratio': pca.explained_variance_ratio_.tolist(),
147
+ 'cumulative_variance_ratio': np.cumsum(pca.explained_variance_ratio_).tolist(),
148
+ 'n_components_95_variance': np.argmax(np.cumsum(pca.explained_variance_ratio_) >= 0.95) + 1
149
+ }
150
+
151
+ def _analyze_temporal_patterns(self, df):
152
+ """
153
+ Analyze temporal patterns and seasonality
154
+ """
155
+ date_cols = df.select_dtypes(include=['datetime64']).columns
156
+ if len(date_cols) == 0:
157
+ return None
158
+
159
+ patterns = {}
160
+ for date_col in date_cols:
161
+ df['year'] = df[date_col].dt.year
162
+ df['month'] = df[date_col].dt.month
163
+ df['day_of_week'] = df[date_col].dt.dayofweek
164
+
165
+ numeric_cols = df.select_dtypes(include=[np.number]).columns
166
+ for metric in numeric_cols:
167
+ if metric not in ['year', 'month', 'day_of_week']:
168
+ patterns[f"{metric}_by_month"] = df.groupby('month')[metric].mean().to_dict()
169
+ patterns[f"{metric}_by_day_of_week"] = df.groupby('day_of_week')[metric].mean().to_dict()
170
+
171
+ return patterns
172
+
173
+ def _detect_anomalies(self, df):
174
+ """
175
+ Detect anomalies using multiple methods
176
+ """
177
+ numeric_cols = df.select_dtypes(include=[np.number]).columns
178
+ if len(numeric_cols) == 0:
179
+ return None
180
+
181
+ scaler = StandardScaler()
182
+ scaled_data = scaler.fit_transform(df[numeric_cols])
183
+
184
+ isolation_forest = IsolationForest(random_state=42, contamination=0.1)
185
+ anomalies = isolation_forest.fit_predict(scaled_data)
186
+
187
+ return {
188
+ 'anomaly_percentage': float((anomalies == -1).mean() * 100),
189
+ 'anomaly_indices': np.where(anomalies == -1)[0].tolist()
190
+ }
191
+
192
+ def _calculate_feature_importance(self, df):
193
+ """
194
+ Calculate feature importance and relationships
195
+ """
196
+ numeric_cols = df.select_dtypes(include=[np.number]).columns
197
+ if len(numeric_cols) < 2:
198
+ return None
199
+
200
+ importance = {}
201
+ for col in numeric_cols:
202
+ correlations = []
203
+ for other_col in numeric_cols:
204
+ if col != other_col:
205
+ # Check if either column is constant
206
+ if df[col].nunique() <= 1 or df[other_col].nunique() <= 1:
207
+ continue
208
+ try:
209
+ corr, _ = pearsonr(df[col].fillna(0), df[other_col].fillna(0))
210
+ if not np.isnan(corr): # Only add if correlation is valid
211
+ correlations.append((other_col, abs(corr)))
212
+ except ValueError:
213
+ continue # Skip if correlation can't be calculated
214
+
215
+ # Handle empty correlations case
216
+ correlation_values = [abs(c[1]) for c in correlations]
217
+ importance[col] = {
218
+ 'top_correlations': sorted(correlations, key=lambda x: abs(x[1]), reverse=True)[:3],
219
+ 'correlation_strength': float(np.mean(correlation_values)) if correlation_values else 0.0
220
+ }
221
+
222
+ return importance
223
+
224
+ def _retail_metrics(self, df):
225
+
226
+ """Calculate retail-specific metrics"""
227
+ if not all(col in df.columns for col in ['sales', 'inventory', 'customer_satisfaction']):
228
+ # Return default structure if required columns are missing
229
+ return {
230
+ 'sales_performance': {},
231
+ 'customer_behavior': {},
232
+ 'inventory': {}
233
+ }
234
+
235
+ metrics = {
236
+ 'sales_performance': {
237
+ 'total_sales': float(df['sales'].sum()) if 'sales' in df.columns else 0.0,
238
+ 'average_daily_sales': float(df['sales'].mean()) if 'sales' in df.columns else 0.0,
239
+ 'sales_growth': float((df['sales'].iloc[-1] / df['sales'].iloc[0] - 1) * 100) if 'sales' in df.columns else 0.0
240
+ },
241
+ 'inventory_turnover': {
242
+ 'rate': float(df['sales'].sum() / df['inventory'].mean()) if all(col in df.columns for col in ['sales', 'inventory']) else 0.0,
243
+ 'days_of_inventory': float(df['inventory'].mean() / (df['sales'].mean() / 30)) if all(col in df.columns for col in ['sales', 'inventory']) else 0.0
244
+ },
245
+ 'customer_metrics': {
246
+ 'satisfaction_score': float(df['customer_satisfaction'].mean()) if 'customer_satisfaction' in df.columns else 0.0,
247
+ 'satisfaction_trend': df['customer_satisfaction'].rolling(window=7).mean().to_dict() if 'customer_satisfaction' in df.columns else {}
248
+ }
249
+ }
250
+ return metrics
251
+
252
+ def _wholesale_metrics(self, df):
253
+ """
254
+ Calculate wholesale-specific metrics
255
+ """
256
+ metrics = {
257
+ 'order_analytics': {},
258
+ 'supplier_performance': {},
259
+ 'distribution': {}
260
+ }
261
+
262
+ if 'order_value' in df.columns:
263
+ metrics['order_analytics']['average_order_value'] = float(df['order_value'].mean())
264
+ metrics['order_analytics']['order_value_distribution'] = df['order_value'].quantile([0.25, 0.5, 0.75]).to_dict()
265
+
266
+ if 'supplier_id' in df.columns and 'delivery_time' in df.columns:
267
+ supplier_performance = df.groupby('supplier_id')['delivery_time'].agg(['mean', 'std']).to_dict()
268
+ metrics['supplier_performance'] = supplier_performance
269
+
270
+ return metrics
271
+
272
+ def _supermarket_metrics(self, df):
273
+ """
274
+ Calculate supermarket-specific metrics
275
+ """
276
+ metrics = {
277
+ 'category_performance': {},
278
+ 'basket_analysis': {},
279
+ 'promotion_impact': {}
280
+ }
281
+
282
+ if 'category' in df.columns and 'sales_amount' in df.columns:
283
+ category_sales = df.groupby('category')['sales_amount'].sum()
284
+ metrics['category_performance']['top_categories'] = category_sales.nlargest(5).to_dict()
285
+
286
+ if 'transaction_id' in df.columns and 'product_id' in df.columns:
287
+ # Simple basket analysis
288
+ transactions = df.groupby('transaction_id')['product_id'].count()
289
+ metrics['basket_analysis']['average_items_per_transaction'] = float(transactions.mean())
290
+
291
+ if 'promotion_flag' in df.columns and 'sales_amount' in df.columns:
292
+ promo_impact = df.groupby('promotion_flag')['sales_amount'].mean()
293
+ metrics['promotion_impact']['sales_lift'] = float(
294
+ (promo_impact.get(1, 0) - promo_impact.get(0, 0)) / promo_impact.get(0, 1) * 100
295
+ )
296
+
297
+ return metrics
298
+
299
+ def _manufacturing_metrics(self, df):
300
+
301
+
302
+ """Calculate manufacturing-specific metrics"""
303
+ production_col = 'production_volume' if 'production_volume' in df.columns else 'units_produced'
304
+ metrics = {
305
+ 'production_efficiency': {
306
+ 'volume': float(df[production_col].mean()),
307
+ 'trend': df[production_col].rolling(window=7).mean().to_dict()
308
+ },
309
+ 'quality_metrics': {
310
+ 'defect_rate': float(df['defect_rate'].mean()) if 'defect_rate' in df.columns else 0.0,
311
+ 'quality_trend': df['defect_rate'].rolling(window=7).mean().to_dict() if 'defect_rate' in df.columns else {}
312
+ },
313
+ 'quality_control': {
314
+ 'defects_per_unit': float(df['defect_rate'].mean()) if 'defect_rate' in df.columns else 0.0,
315
+ 'defect_trend': df['defect_rate'].rolling(window=7).mean().to_dict() if 'defect_rate' in df.columns else {}
316
+ },
317
+ 'equipment_utilization': {
318
+ 'rate': float((df[production_col] / df[production_col].max()).mean() * 100),
319
+ 'trend': df[production_col].rolling(window=7).mean().to_dict()
320
+ }
321
+ }
322
+ return metrics
323
+
324
+ def _healthcare_metrics(self, df):
325
+
326
+ """Calculate healthcare-specific metrics"""
327
+ metrics = {
328
+ 'patient_outcomes': {
329
+ 'satisfaction': float(df['patient_satisfaction'].mean()),
330
+ 'treatment_success': float(df['treatment_success_rate'].mean())
331
+ },
332
+ 'operational_efficiency': {
333
+ 'avg_wait_time': float(df['order_fulfillment_time'].mean()),
334
+ 'utilization_rate': float(df['production_volume'].mean() / df['production_volume'].max())
335
+ },
336
+ 'quality_of_care': {
337
+ 'satisfaction_trend': df['patient_satisfaction'].rolling(window=7).mean().to_dict(),
338
+ 'success_rate_trend': df['treatment_success_rate'].rolling(window=7).mean().to_dict()
339
+ }
340
+ }
341
+ return metrics
342
+
343
+ def forecast_timeseries(self, data, date_column, value_column):
344
+ """
345
+ Forecast time series data with support for edge cases
346
+ """
347
+ if not data:
348
+ raise ValueError("Empty dataset provided")
349
+
350
+ df = pd.DataFrame(data)
351
+ if date_column not in df.columns:
352
+ raise KeyError(f"Required column '{date_column}' not found")
353
+ if value_column not in df.columns:
354
+ raise KeyError(f"Required column '{value_column}' not found")
355
+
356
+ # Convert to datetime
357
+ try:
358
+ df[date_column] = pd.to_datetime(df[date_column])
359
+ except ValueError as exc:
360
+ raise ValueError("Invalid date format") from exc
361
+
362
+ # Handle missing values
363
+ has_missing = df[value_column].isnull().any()
364
+ if has_missing:
365
+ df[value_column] = df[value_column].interpolate(method='linear')
366
+
367
+ # Detect and handle outliers
368
+ Q1 = df[value_column].quantile(0.25)
369
+ Q3 = df[value_column].quantile(0.75)
370
+ IQR = Q3 - Q1
371
+ outlier_mask = (df[value_column] < (Q1 - 1.5 * IQR)) | (df[value_column] > (Q3 + 1.5 * IQR))
372
+ has_outliers = outlier_mask.any()
373
+
374
+ # Prepare data for Prophet
375
+ prophet_df = df.rename(columns={date_column: 'ds', value_column: 'y'})
376
+ model = Prophet(yearly_seasonality=True, weekly_seasonality=True, daily_seasonality=True)
377
+ model.fit(prophet_df)
378
+
379
+ # Make future dataframe for forecasting
380
+ future = model.make_future_dataframe(periods=30)
381
+ forecast = model.predict(future)
382
+
383
+ result = {
384
+ 'forecast': forecast[['ds', 'yhat', 'yhat_lower', 'yhat_upper']].to_dict('records'),
385
+ 'components': {
386
+ 'trend': forecast['trend'].to_dict(),
387
+ 'yearly': forecast['yearly'].to_dict() if 'yearly' in forecast else {},
388
+ 'weekly': forecast['weekly'].to_dict() if 'weekly' in forecast else {},
389
+ 'daily': forecast['daily'].to_dict() if 'daily' in forecast else {}
390
+ }
391
+ }
392
+
393
+ if has_missing:
394
+ result['handling_missing_values'] = {'filled_indices': df[value_column].isnull().sum()}
395
+
396
+ if has_outliers:
397
+ result['outlier_impact'] = {
398
+ 'outlier_indices': outlier_mask[outlier_mask].index.tolist(),
399
+ 'outlier_values': df.loc[outlier_mask, value_column].tolist()
400
+ }
401
+
402
+ # Detect seasonality
403
+ decomposition = seasonal_decompose(df[value_column], period=7, extrapolate_trend='freq')
404
+ result['seasonality_components'] = {
405
+ 'trend': decomposition.trend.to_dict(),
406
+ 'seasonal': decomposition.seasonal.to_dict(),
407
+ 'residual': decomposition.resid.to_dict()
408
+ }
409
+
410
+
411
+
412
+
413
+ # Cache the forecast with timestamp to ensure freshness
414
+ timestamp = datetime.now().strftime('%Y%m%d%H')
415
+ cache_key = f"forecast_{date_column}_{value_column}_{timestamp}"
416
+ self.redis_client.set(cache_key, json.dumps(result, cls=CustomJSONEncoder))
417
+
418
+ return result
419
+
420
+ def get_cached_forecast(self, date_column, value_column):
421
+ """
422
+ Retrieve cached forecast results
423
+ """
424
+ timestamp = datetime.now().strftime('%Y%m%d%H')
425
+ cache_key = f"forecast_{date_column}_{value_column}_{timestamp}"
426
+ cached = self.redis_client.get(cache_key)
427
+
428
+ if cached:
429
+ return json.loads(cached)
430
+ return None
431
+
432
+ def _analyze_market_dynamics(self, df):
433
+ """
434
+ Analyze market dynamics across industries
435
+ """
436
+ metrics = {
437
+ 'market_trends': {},
438
+ 'competitive_analysis': {},
439
+ 'growth_patterns': {}
440
+ }
441
+
442
+ if 'revenue' in df.columns and 'date' in df.columns:
443
+ # Trend Analysis
444
+ df['month'] = pd.to_datetime(df['date']).dt.to_period('M')
445
+ monthly_revenue = df.groupby('month')['revenue'].sum()
446
+
447
+ # Calculate growth rates
448
+ metrics['growth_patterns']['monthly_growth'] = float(
449
+ ((monthly_revenue.iloc[-1] / monthly_revenue.iloc[0]) ** (1/len(monthly_revenue)) - 1) * 100
450
+ )
451
+
452
+ # Market volatility
453
+ mean_revenue = monthly_revenue.mean()
454
+ if mean_revenue > 0: # Avoid division by zero
455
+ metrics['market_trends']['volatility'] = float(monthly_revenue.std() / mean_revenue)
456
+ else:
457
+ metrics['market_trends']['volatility'] = 0.0
458
+
459
+ if 'competitor_price' in df.columns and 'price' in df.columns:
460
+
461
+ comp_price_mean = df['competitor_price'].mean()
462
+ if comp_price_mean > 0: # Avoid division by zero
463
+ metrics['competitive_analysis']['price_position'] = float(
464
+ (df['price'].mean() / comp_price_mean - 1) * 100
465
+ )
466
+ else:
467
+ metrics['competitive_analysis']['price_position'] = 0.0
468
+
469
+ return metrics
470
+
471
+ def _analyze_supply_chain(self, df):
472
+ """
473
+ Analyze supply chain metrics across industries
474
+ """
475
+ metrics = {
476
+ 'efficiency': {},
477
+ 'reliability': {},
478
+ 'cost_analysis': {}
479
+ }
480
+
481
+ # Supply Chain Network Analysis
482
+ if 'supplier_id' in df.columns and 'delivery_time' in df.columns:
483
+ supplier_performance = df.groupby('supplier_id').agg({
484
+ 'delivery_time': ['mean', 'std'],
485
+ 'order_value': ['sum', 'mean']
486
+ }).round(2)
487
+
488
+ metrics['reliability']['supplier_consistency'] = float(
489
+ 1 - (supplier_performance['delivery_time']['std'] / supplier_performance['delivery_time']['mean']).mean()
490
+ )
491
+
492
+ # Cost and Efficiency Analysis
493
+ if 'transportation_cost' in df.columns and 'order_value' in df.columns:
494
+ metrics['cost_analysis']['logistics_cost_ratio'] = float(
495
+ (df['transportation_cost'].sum() / df['order_value'].sum()) * 100
496
+ )
497
+
498
+ return metrics
499
+
500
+ def _analyze_customer_insights(self, df):
501
+ """
502
+ Cross-industry customer behavior analysis
503
+ """
504
+ insights = {
505
+ 'customer_segments': {},
506
+ 'behavior_patterns': {},
507
+ 'lifetime_value': {}
508
+ }
509
+
510
+ if 'customer_id' in df.columns and 'transaction_amount' in df.columns:
511
+ # Customer Segmentation using DBSCAN for more natural clustering
512
+ customer_features = df.groupby('customer_id').agg({
513
+ 'transaction_amount': ['sum', 'mean', 'count']
514
+ }).values
515
+
516
+ scaler = MinMaxScaler()
517
+ scaled_features = scaler.fit_transform(customer_features)
518
+
519
+ # Find optimal eps parameter for DBSCAN
520
+ dbscan = DBSCAN(eps=0.3, min_samples=5)
521
+ clusters = dbscan.fit_predict(scaled_features)
522
+
523
+ insights['customer_segments']['natural_segments'] = {
524
+ 'n_segments': len(np.unique(clusters[clusters >= 0])),
525
+ 'segment_sizes': pd.Series(clusters).value_counts().to_dict()
526
+ }
527
+
528
+ return insights
529
+
530
+ def _analyze_operational_efficiency(self, df):
531
+ """
532
+ Cross-industry operational efficiency analysis
533
+ """
534
+ metrics = {
535
+ 'process_efficiency': {},
536
+ 'resource_utilization': {},
537
+ 'bottleneck_analysis': {}
538
+ }
539
+
540
+ if 'process_time' in df.columns and 'output_quantity' in df.columns:
541
+ # Process Efficiency Analysis
542
+ metrics['process_efficiency']['throughput_rate'] = float(
543
+ df['output_quantity'].sum() / df['process_time'].sum()
544
+ )
545
+
546
+ # Calculate process stability
547
+ process_stability = 1 - (df['process_time'].std() / df['process_time'].mean())
548
+ metrics['process_efficiency']['stability_score'] = float(process_stability)
549
+
550
+ return metrics
551
+
552
+ def _analyze_risk_patterns(self, df):
553
+ """
554
+ Cross-industry risk pattern analysis
555
+ """
556
+ risk_metrics = {
557
+ 'operational_risk': {},
558
+ 'market_risk': {},
559
+ 'compliance_risk': {}
560
+ }
561
+
562
+ numeric_cols = df.select_dtypes(include=[np.number]).columns
563
+ if len(numeric_cols) > 0:
564
+ # Use Isolation Forest for risk pattern detection
565
+ iso_forest = IsolationForest(contamination=0.1, random_state=42)
566
+ risk_scores = iso_forest.fit_predict(df[numeric_cols])
567
+
568
+ risk_metrics['operational_risk']['anomaly_percentage'] = float(
569
+ (risk_scores == -1).mean() * 100
570
+ )
571
+
572
+ return risk_metrics
573
+
574
+ def _analyze_sustainability_metrics(self, df):
575
+ """
576
+
577
+ Analyze sustainability metrics including environmental impact, resource utilization, and waste management
578
+ """
579
+ if not all(col in df.columns for col in ['energy_consumption', 'water_consumption', 'waste_generated']):
580
+ return {}
581
+
582
+ results = {
583
+ 'environmental_impact': {
584
+ 'carbon_footprint_trend': df['carbon_footprint'].rolling(window=7).mean().to_dict() if 'carbon_footprint' in df.columns else {},
585
+ 'total_emissions': float(df['energy_consumption'].sum() * 0.5)
586
+ },
587
+ 'resource_utilization': {
588
+ 'energy_efficiency': float(df['energy_consumption'].mean()),
589
+ 'water_efficiency': float(df['water_consumption'].mean())
590
+ },
591
+ 'waste_management': {
592
+ 'recycling_performance': float(df['recycling_rate'].mean()) if 'recycling_rate' in df.columns else 0.0,
593
+ 'waste_reduction_trend': df['waste_generated'].rolling(window=7).mean().to_dict()
594
+ }
595
+ }
596
+ return results
597
+
598
+ def prepare_ai_query_interface(self, df):
599
+ """
600
+ Prepare data for natural language analytics queries with enhanced semantic understanding
601
+ """
602
+ query_interface = {
603
+ 'semantic_mappings': {},
604
+ 'entity_relationships': {},
605
+ 'available_metrics': {},
606
+ 'temporal_context': {},
607
+ 'metric_relationships': {},
608
+ 'data_patterns': {},
609
+ 'suggested_queries': []
610
+ }
611
+
612
+ try:
613
+ # Create semantic mappings for textual columns
614
+ text_columns = df.select_dtypes(include=['object']).columns
615
+ vectorizer = TfidfVectorizer(max_features=1000)
616
+
617
+ for col in text_columns:
618
+ if df[col].str.len().mean() > 5: # Only process meaningful text fields
619
+ text_features = vectorizer.fit_transform(df[col].fillna('').astype(str))
620
+ query_interface['semantic_mappings'][col] = {
621
+ 'vocabulary': vectorizer.vocabulary_,
622
+ 'idf_values': vectorizer.idf_.tolist(),
623
+ 'top_terms': dict(zip(
624
+ vectorizer.get_feature_names_out(),
625
+ np.asarray(text_features.sum(axis=0)).ravel()
626
+ ))
627
+ }
628
+
629
+ # Map entity relationships and hierarchies
630
+ entity_columns = [col for col in df.columns if any(entity in col.lower()
631
+ for entity in ['id', 'category', 'type', 'name', 'class', 'group'])]
632
+
633
+ for col in entity_columns:
634
+ if df[col].dtype == 'object':
635
+ value_counts = df[col].value_counts()
636
+ unique_values = df[col].unique().tolist()
637
+
638
+ # Find potential hierarchical relationships
639
+ hierarchy = {}
640
+ if '_' in col or col.lower().endswith('_id'):
641
+ related_cols = [c for c in df.columns if col.split('_')[0] in c and c != col]
642
+ for rel_col in related_cols:
643
+ hierarchy[rel_col] = df.groupby(col)[rel_col].agg(list).to_dict()
644
+
645
+ query_interface['entity_relationships'][col] = {
646
+ 'unique_values': unique_values,
647
+ 'value_counts': value_counts.to_dict(),
648
+ 'hierarchy': hierarchy,
649
+ 'cardinality': len(unique_values)
650
+ }
651
+
652
+ # Document available metrics and their relationships
653
+ numeric_cols = df.select_dtypes(include=[np.number]).columns
654
+ for col in numeric_cols:
655
+ stats = df[col].describe()
656
+ query_interface['available_metrics'][col] = {
657
+ 'min': float(stats['min']),
658
+ 'max': float(stats['max']),
659
+ 'mean': float(stats['mean']),
660
+ 'std': float(stats['std']),
661
+ 'quartiles': {
662
+ '25%': float(stats['25%']),
663
+ '50%': float(stats['50%']),
664
+ '75%': float(stats['75%'])
665
+ }
666
+ }
667
+
668
+ # Analyze metric relationships
669
+ correlations = {}
670
+ for other_col in numeric_cols:
671
+ if col != other_col:
672
+ corr = df[col].corr(df[other_col])
673
+ if abs(corr) > 0.3: # Only store meaningful correlations
674
+ correlations[other_col] = float(corr)
675
+
676
+ query_interface['metric_relationships'][col] = {
677
+ 'correlations': correlations,
678
+ 'trends': self._analyze_metric_trends(df, col)
679
+ }
680
+
681
+ # Add temporal context if available
682
+ date_cols = df.select_dtypes(include=['datetime64']).columns
683
+ if len(date_cols) == 0:
684
+ # Try to convert string columns that might contain dates
685
+ for col in df.columns:
686
+ if df[col].dtype == 'object':
687
+ try:
688
+ pd.to_datetime(df[col])
689
+ date_cols = date_cols.append(col)
690
+ except:
691
+ continue
692
+
693
+ for date_col in date_cols:
694
+ df[date_col] = pd.to_datetime(df[date_col])
695
+ temporal_stats = {
696
+ 'min_date': df[date_col].min().isoformat(),
697
+ 'max_date': df[date_col].max().isoformat(),
698
+ 'frequency': pd.infer_freq(df[date_col]),
699
+ 'temporal_patterns': {}
700
+ }
701
+
702
+ # Analyze temporal patterns
703
+ temporal_stats['temporal_patterns'] = {
704
+ 'daily_pattern': df.groupby(df[date_col].dt.dayofweek).size().to_dict(),
705
+ 'monthly_pattern': df.groupby(df[date_col].dt.month).size().to_dict(),
706
+ 'yearly_pattern': df.groupby(df[date_col].dt.year).size().to_dict()
707
+ }
708
+
709
+ query_interface['temporal_context'][date_col] = temporal_stats
710
+
711
+ # Identify data patterns and anomalies
712
+ query_interface['data_patterns'] = {
713
+ 'missing_patterns': df.isnull().sum().to_dict(),
714
+ 'unique_value_counts': df.nunique().to_dict(),
715
+ 'distribution_types': self._analyze_distributions(df)
716
+ }
717
+
718
+ # Generate suggested queries based on data characteristics
719
+ query_interface['suggested_queries'] = self._generate_suggested_queries(df)
720
+
721
+ # Add metadata about the dataset
722
+ query_interface['metadata'] = {
723
+ 'row_count': len(df),
724
+ 'column_count': len(df.columns),
725
+ 'memory_usage': df.memory_usage(deep=True).sum(),
726
+ 'data_types': df.dtypes.astype(str).to_dict()
727
+ }
728
+
729
+ except Exception as e:
730
+ query_interface['error'] = str(e)
731
+
732
+ return query_interface
733
+
734
+ def _analyze_metric_trends(self, df, column):
735
+ """Helper method to analyze trends in numeric columns"""
736
+ trends = {}
737
+ if 'date' in df.columns:
738
+ df['date'] = pd.to_datetime(df['date'])
739
+ time_series = df.groupby('date')[column].mean()
740
+ if len(time_series) > 2:
741
+ # Calculate trend
742
+ x = np.arange(len(time_series))
743
+ y = time_series.values
744
+ slope, intercept = np.polyfit(x, y, 1)
745
+ trends['slope'] = float(slope)
746
+ trends['trend_direction'] = 'increasing' if slope > 0 else 'decreasing'
747
+ trends['trend_strength'] = float(abs(slope) / time_series.mean())
748
+ return trends
749
+
750
+ def _analyze_distributions(self, df):
751
+ """Helper method to analyze value distributions"""
752
+ distributions = {}
753
+ numeric_cols = df.select_dtypes(include=[np.number]).columns
754
+
755
+ for col in numeric_cols:
756
+ if df[col].nunique() > 5: # Skip columns with too few unique values
757
+ # Test for normality
758
+ _, p_value = stats.normaltest(df[col].dropna())
759
+ skewness = float(df[col].skew())
760
+ kurtosis = float(df[col].kurtosis())
761
+
762
+ distributions[col] = {
763
+ 'distribution_type': 'normal' if p_value > 0.05 else 'non_normal',
764
+ 'skewness': skewness,
765
+ 'kurtosis': kurtosis
766
+ }
767
+ return distributions
768
+
769
+ def _generate_suggested_queries(self, df):
770
+ """Helper method to generate relevant query suggestions"""
771
+ suggestions = []
772
+
773
+ # Add time-based queries if temporal data exists
774
+ if 'date' in df.columns:
775
+ suggestions.extend([
776
+ "Show the trend over time",
777
+ "Compare year-over-year growth",
778
+ "Find seasonal patterns"
779
+ ])
780
+
781
+ # Add metric-based queries
782
+ numeric_cols = df.select_dtypes(include=[np.number]).columns
783
+ if len(numeric_cols) > 0:
784
+ suggestions.extend([
785
+ f"Analyze the distribution of {col}" for col in numeric_cols[:3]
786
+ ])
787
+
788
+ # Add categorical analysis queries
789
+ categorical_cols = df.select_dtypes(include=['object']).columns
790
+ if len(categorical_cols) > 0:
791
+ suggestions.extend([
792
+ f"Break down metrics by {col}" for col in categorical_cols[:3]
793
+ ])
794
+
795
+ return suggestions
796
+
797
+ def enhance_cross_industry_correlations(self, df):
798
+ """
799
+ Enhanced analysis of correlations across different industries
800
+ """
801
+ correlations = {
802
+ 'metric_correlations': {},
803
+ 'industry_patterns': {},
804
+ 'shared_trends': {}
805
+ }
806
+
807
+ if 'industry' in df.columns:
808
+ industries = df['industry'].unique()
809
+ numeric_cols = df.select_dtypes(include=[np.number]).columns
810
+
811
+ # Calculate cross-industry metric correlations
812
+ for ind1 in industries:
813
+ for ind2 in industries:
814
+ if ind1 < ind2: # Avoid duplicate comparisons
815
+ ind1_data = df[df['industry'] == ind1][numeric_cols]
816
+ ind2_data = df[df['industry'] == ind2][numeric_cols]
817
+
818
+ if not ind1_data.empty and not ind2_data.empty:
819
+ common_metrics = set(ind1_data.columns) & set(ind2_data.columns)
820
+ for metric in common_metrics:
821
+ corr, p_value = pearsonr(
822
+ ind1_data[metric].fillna(0),
823
+ ind2_data[metric].fillna(0)
824
+ )
825
+ correlations['metric_correlations'][f"{ind1}_{ind2}_{metric}"] = {
826
+ 'correlation': float(corr),
827
+ 'p_value': float(p_value)
828
+ }
829
+
830
+ # Identify shared trends
831
+ if 'date' in df.columns:
832
+ for metric in numeric_cols:
833
+ industry_trends = {}
834
+ for industry in industries:
835
+ industry_data = df[df['industry'] == industry]
836
+ if not industry_data.empty:
837
+ trend = industry_data.groupby('date')[metric].mean()
838
+ if len(trend) > 0:
839
+ industry_trends[industry] = trend.to_dict()
840
+
841
+ correlations['shared_trends'][metric] = industry_trends
842
+
843
+ return correlations
844
+
845
    def perform_market_basket_analysis(self, df: pd.DataFrame, min_support: float = 0.01,
                                       min_confidence: float = 0.3, min_lift: float = 1.0) -> dict:
        """
        Perform advanced market basket analysis with support for multiple analytics dimensions.

        Args:
            df (pd.DataFrame): Input transaction data with required columns
            min_support (float): Minimum support threshold for frequent itemsets (default: 0.01)
            min_confidence (float): Minimum confidence threshold for rules (default: 0.3)
            min_lift (float): Minimum lift threshold for rules (default: 1.0)

        Returns:
            dict: Dictionary containing:
                - product_associations: Support, confidence, and lift metrics for product pairs
                - temporal_baskets: Time-based purchase patterns
                - product_clusters: Product groupings based on purchase behavior
                - customer_segments: Customer segments based on purchase patterns
                - performance_metrics: Key performance indicators

        Raises:
            ValueError: If required columns are missing or data validation fails
        """
        try:
            # Validate input data
            required_columns = ['transaction_id', 'product_id']
            if not all(col in df.columns for col in required_columns):
                raise ValueError(f"Missing required columns: {set(required_columns) - set(df.columns)}")

            if df.empty:
                raise ValueError("Empty dataframe provided")

            # Work with a copy of the dataframe
            df = df.copy()

            # Convert to basket format with optimization for large datasets
            # (one frozenset of product ids per transaction).
            baskets = (df.groupby('transaction_id')['product_id']
                       .agg(lambda x: frozenset(x.values))  # Using frozenset for better performance
                       .reset_index())

            total_transactions = len(baskets)

            # Calculate product frequencies using vectorized operations.
            # NOTE(review): this counts ROWS per product, which equals
            # per-transaction occurrences only when each product appears at
            # most once per transaction -- confirm inputs are pre-aggregated,
            # since these counts are the confidence/lift denominators below.
            product_freq = df.groupby('product_id').size().to_dict()

            # Generate product pairs efficiently (all unordered pairs within
            # each basket, sorted so (a, b) == (b, a)).
            pairs_data = []
            for products in baskets['product_id']:
                products_list = list(products)  # Convert frozenset to list once
                pairs_data.extend(
                    tuple(sorted([p1, p2]))
                    for i, p1 in enumerate(products_list)
                    for p2 in products_list[i+1:]
                )

            pair_freq = pd.Series(pairs_data).value_counts().to_dict()

            # Calculate association metrics with validation
            product_associations = {
                'support': {},
                'confidence': {},
                'lift': {},
                'metrics_distribution': {
                    'support': {'min': float('inf'), 'max': 0, 'mean': 0},
                    'confidence': {'min': float('inf'), 'max': 0, 'mean': 0},
                    'lift': {'min': float('inf'), 'max': 0, 'mean': 0}
                }
            }

            valid_rules = []
            for pair, freq in pair_freq.items():
                prod1, prod2 = pair
                # Fraction of transactions containing this pair.
                support = freq / total_transactions

                if support >= min_support:
                    # Confidence in both directions; the stronger one is kept.
                    confidence_1_2 = freq / product_freq[prod1]
                    confidence_2_1 = freq / product_freq[prod2]
                    max_confidence = max(confidence_1_2, confidence_2_1)

                    if max_confidence >= min_confidence:
                        # Lift > 1 means the pair co-occurs more than chance.
                        lift = (freq * total_transactions) / (product_freq[prod1] * product_freq[prod2])

                        if lift >= min_lift:
                            valid_rules.append({
                                'pair': pair,
                                'support': support,
                                'confidence': max_confidence,
                                'lift': lift
                            })

                            # Store metrics with string keys for JSON serialization
                            pair_key = f"({prod1}, {prod2})"
                            product_associations['support'][pair_key] = float(support)
                            product_associations['confidence'][pair_key] = float(max_confidence)
                            product_associations['lift'][pair_key] = float(lift)

                            # Update metrics distribution
                            for metric_type, value in [('support', support),
                                                       ('confidence', max_confidence),
                                                       ('lift', lift)]:
                                dist = product_associations['metrics_distribution'][metric_type]
                                dist['min'] = min(dist['min'], value)
                                dist['max'] = max(dist['max'], value)

            # Calculate means for distributions
            for metric_type in ['support', 'confidence', 'lift']:
                values = [rule[metric_type] for rule in valid_rules]
                if values:
                    product_associations['metrics_distribution'][metric_type]['mean'] = float(sum(values) / len(values))
                else:
                    # No rules passed the thresholds: reset min/max/mean.
                    product_associations['metrics_distribution'][metric_type] = {'min': 0, 'max': 0, 'mean': 0}

            # Enhanced temporal analysis (only when timestamps are present).
            temporal_patterns = self._analyze_temporal_patterns(df) if 'timestamp' in df.columns else {}

            # Enhanced product clustering (needs quantity data).
            product_clusters = self._perform_product_clustering(df) if 'quantity' in df.columns else {}

            # Customer segmentation (needs customer ids).
            customer_segments = self._analyze_customer_segments(df) if 'customer_id' in df.columns else {}

            # Performance metrics
            # NOTE(review): with min_lift >= 1.0 every stored rule has
            # lift >= 1, so 'weak_associations' can only count lift == 1.
            performance_metrics = {
                'total_transactions': total_transactions,
                'unique_products': len(product_freq),
                'avg_basket_size': float(df.groupby('transaction_id')['product_id'].count().mean()),
                'total_rules_found': len(valid_rules),
                'rules_distribution': {
                    'strong_associations': len([r for r in valid_rules if r['lift'] > 2]),
                    'moderate_associations': len([r for r in valid_rules if 1 < r['lift'] <= 2]),
                    'weak_associations': len([r for r in valid_rules if r['lift'] <= 1])
                }
            }

            return {
                'product_associations': product_associations,
                'temporal_baskets': temporal_patterns,
                'product_clusters': product_clusters,
                'customer_segments': customer_segments,
                'performance_metrics': performance_metrics
            }

        except Exception as e:
            print(f"Error in market basket analysis: {str(e)}")
            raise ValueError(f"Market basket analysis failed: {str(e)}") from e
+
990
+ def _analyze_temporal_patterns(self, df: pd.DataFrame) -> dict:
991
+ """Analyze temporal patterns in purchase behavior"""
992
+ patterns = {
993
+ 'daily_patterns': {},
994
+ 'weekly_patterns': {},
995
+ 'monthly_patterns': {},
996
+ 'hourly_patterns': {}
997
+ }
998
+
999
+ try:
1000
+ timestamps = pd.to_datetime(df['timestamp'])
1001
+
1002
+ for period, grouper in [
1003
+ ('hourly_patterns', timestamps.dt.hour),
1004
+ ('daily_patterns', timestamps.dt.day),
1005
+ ('weekly_patterns', timestamps.dt.dayofweek),
1006
+ ('monthly_patterns', timestamps.dt.month)
1007
+ ]:
1008
+ pattern_data = df.groupby(grouper).agg({
1009
+ 'product_id': ['count', 'nunique'],
1010
+ 'transaction_id': 'nunique',
1011
+ 'quantity': ['sum', 'mean'] if 'quantity' in df.columns else ['count']
1012
+ }).round(2)
1013
+
1014
+ patterns[period] = {
1015
+ 'transaction_count': pattern_data['transaction_id']['nunique'].to_dict(),
1016
+ 'product_count': pattern_data['product_id']['count'].to_dict(),
1017
+ 'unique_products': pattern_data['product_id']['nunique'].to_dict(),
1018
+ 'total_quantity': pattern_data['quantity']['sum'].to_dict() if 'quantity' in df.columns else {},
1019
+ 'avg_quantity': pattern_data['quantity']['mean'].to_dict() if 'quantity' in df.columns else {}
1020
+ }
1021
+
1022
+ except (ValueError, KeyError) as e:
1023
+ print(f"Error in temporal pattern analysis: {str(e)}")
1024
+ return patterns
1025
+
1026
+ return patterns
1027
+
1028
    def _perform_product_clustering(self, df: pd.DataFrame) -> dict:
        """Perform advanced product clustering analysis.

        Builds per-product demand features, standardizes them, chooses the
        KMeans cluster count (2..5) by silhouette score, and returns cluster
        assignments, per-cluster profiles and evaluation metrics. Returns {}
        when clustering is impossible or a handled error occurs.
        """
        try:
            # Create rich product features (quantity stats + distinct
            # transaction counts, one row per product).
            product_features = df.groupby('product_id').agg({
                'quantity': ['mean', 'std', 'sum', 'count'],
                'transaction_id': 'nunique'
            }).fillna(0)

            # Feature engineering: average units of this product per
            # distinct transaction that contains it.
            product_features['quantity_per_transaction'] = (
                product_features['quantity']['sum'] /
                product_features['transaction_id']['nunique']
            )

            # Prepare features for clustering: flatten MultiIndex column
            # labels into plain strings before scaling.
            features_for_clustering = product_features.copy()
            features_for_clustering.columns = [f"{col[0]}_{col[1]}" if isinstance(col, tuple) else col
                                               for col in features_for_clustering.columns]

            if len(features_for_clustering) > 1:
                # Scale features to zero mean / unit variance.
                scaler = StandardScaler()
                scaled_features = scaler.fit_transform(features_for_clustering)

                # Determine optimal number of clusters by silhouette score.
                # NOTE(review): with exactly 2 products max_clusters is 1,
                # the range is empty and max(scores) raises ValueError
                # (caught below, returning {}) -- confirm that is intended.
                max_clusters = min(5, len(features_for_clustering) - 1)
                scores = []
                for k in range(2, max_clusters + 1):
                    kmeans = KMeans(n_clusters=k, random_state=42)
                    clusters = kmeans.fit_predict(scaled_features)
                    score = silhouette_score(scaled_features, clusters)
                    scores.append((k, score))

                # Use optimal number of clusters (highest silhouette).
                optimal_k = max(scores, key=lambda x: x[1])[0]
                kmeans = KMeans(n_clusters=optimal_k, random_state=42)
                clusters = kmeans.fit_predict(scaled_features)

                # Prepare cluster insights
                cluster_data = {
                    'cluster_assignments': {
                        prod: int(cluster) for prod, cluster in zip(product_features.index, clusters)
                    },
                    'cluster_profiles': {},
                    'evaluation_metrics': {
                        'silhouette_score': float(max(scores, key=lambda x: x[1])[1]),
                        'num_clusters': optimal_k
                    }
                }

                # Generate cluster profiles (aggregates per cluster).
                for cluster_id in range(optimal_k):
                    cluster_mask = clusters == cluster_id
                    cluster_data['cluster_profiles'][str(cluster_id)] = {
                        'size': int(sum(cluster_mask)),
                        'avg_quantity': float(product_features['quantity']['mean'][cluster_mask].mean()),
                        'avg_transactions': float(product_features['transaction_id']['nunique'][cluster_mask].mean()),
                        'total_quantity': float(product_features['quantity']['sum'][cluster_mask].sum()),
                        # Rows per distinct transaction within the cluster.
                        'purchase_frequency': float(
                            (product_features['quantity']['count'][cluster_mask].sum() /
                             product_features['transaction_id']['nunique'][cluster_mask].sum())
                        )
                    }

                return cluster_data

        except np.linalg.LinAlgError as e:
            print(f"Error in clustering computation: {str(e)}")
            return {}
        except (ValueError, KeyError) as e:
            print(f"Error in product clustering: {str(e)}")
            return {}

        # Fewer than two products: nothing to cluster.
        return {}
+
1104
+ def _analyze_customer_segments(self, df: pd.DataFrame) -> dict:
1105
+ """Analyze customer segments based on purchase behavior"""
1106
+ try:
1107
+ if 'customer_id' not in df.columns:
1108
+ return {}
1109
+
1110
+ customer_stats = df.groupby('customer_id').agg({
1111
+ 'transaction_id': 'nunique',
1112
+ 'product_id': ['nunique', 'count'],
1113
+ 'quantity': ['sum', 'mean'] if 'quantity' in df.columns else ['count', 'mean']
1114
+ })
1115
+
1116
+ # Calculate RFM scores
1117
+ if 'timestamp' in df.columns:
1118
+ current_date = pd.to_datetime(df['timestamp']).max()
1119
+ customer_stats['recency'] = df.groupby('customer_id')['timestamp'].max().apply(
1120
+ lambda x: (current_date - pd.to_datetime(x)).days
1121
+ )
1122
+
1123
+ # Segment customers
1124
+ stats_for_clustering = customer_stats.copy()
1125
+ stats_for_clustering.columns = [f"{col[0]}_{col[1]}" if isinstance(col, tuple) else col
1126
+ for col in stats_for_clustering.columns]
1127
+
1128
+ if len(stats_for_clustering) > 1:
1129
+ scaler = StandardScaler()
1130
+ scaled_features = scaler.fit_transform(stats_for_clustering)
1131
+
1132
+ # Use DBSCAN for flexible cluster numbers
1133
+ dbscan = DBSCAN(eps=0.5, min_samples=3)
1134
+ clusters = dbscan.fit_predict(scaled_features)
1135
+
1136
+ return {
1137
+ 'customer_segments': {
1138
+ str(cust): int(cluster) for cust, cluster in zip(customer_stats.index, clusters)
1139
+ },
1140
+ 'segment_profiles': {
1141
+ str(segment): {
1142
+ 'size': int(sum(clusters == segment)),
1143
+ 'avg_transactions': float(customer_stats['transaction_id']['nunique'][clusters == segment].mean()),
1144
+ 'avg_products': float(customer_stats['product_id']['nunique'][clusters == segment].mean())
1145
+ }
1146
+ for segment in set(clusters) if segment != -1
1147
+ },
1148
+ 'segment_statistics': {
1149
+ 'num_segments': len(set(clusters) - {-1}),
1150
+ 'noise_points': int(sum(clusters == -1))
1151
+ }
1152
+ }
1153
+
1154
+ except Exception as e:
1155
+ print(f"Error in customer segmentation: {str(e)}")
1156
+ return {}
1157
+
1158
+ def _calculate_correlations(self, df: pd.DataFrame) -> dict:
1159
+ """Calculate correlations between numeric columns with detailed statistics"""
1160
+ correlations = {}
1161
+
1162
+ try:
1163
+ numeric_cols = df.select_dtypes(include=[np.number]).columns
1164
+ if len(numeric_cols) < 2:
1165
+ return correlations
1166
+
1167
+ # Calculate correlation matrix
1168
+ corr_matrix = df[numeric_cols].corr()
1169
+
1170
+ # Convert correlations to dictionary with additional metadata
1171
+ for col1 in numeric_cols:
1172
+ correlations[col1] = {}
1173
+ for col2 in numeric_cols:
1174
+ if col1 != col2:
1175
+ correlation = corr_matrix.loc[col1, col2]
1176
+ if not np.isnan(correlation):
1177
+ # Calculate p-value using pearsonr
1178
+ coef, p_value = pearsonr(df[col1].fillna(0), df[col2].fillna(0))
1179
+ correlations[col1][col2] = {
1180
+ 'coefficient': float(correlation),
1181
+ 'p_value': float(p_value),
1182
+ 'strength': 'strong' if abs(correlation) > 0.7
1183
+ else 'moderate' if abs(correlation) > 0.3
1184
+ else 'weak',
1185
+ 'direction': 'positive' if correlation > 0 else 'negative',
1186
+ 'sample_size': len(df)
1187
+ }
1188
+
1189
+ except Exception as e:
1190
+ print(f"Error calculating correlations: {str(e)}")
1191
+ return {}
1192
+
1193
+ return correlations
app/engine/json_utils.py ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # analytics-service/app/engine/json_utils.py
2
+ import json
3
+ from datetime import datetime, date
4
+ import numpy as np
5
+
6
class CustomJSONEncoder(json.JSONEncoder):
    """JSON encoder that understands datetimes and NumPy scalar/array types."""

    # (type(s), converter) pairs tried in order for unknown objects.
    _CONVERTERS = (
        ((datetime, date), lambda o: o.isoformat()),   # ISO-8601 strings
        ((np.integer, np.int64), int),                 # NumPy ints -> int
        ((np.floating, np.float64), float),            # NumPy floats -> float
        (np.ndarray, lambda o: o.tolist()),            # arrays -> (nested) lists
    )

    def default(self, obj):
        for types, convert in self._CONVERTERS:
            if isinstance(obj, types):
                return convert(obj)
        # Anything else: defer to the base class, which raises TypeError.
        return super().default(obj)
app/engine/supermarket_metrics.py ADDED
@@ -0,0 +1,129 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Supermarket-specific KPI generator – works with ANY POS export.
3
+ Handles: Square, Lightspeed, Shopify POS, NCR, Oracle MICROS, QuickBooks POS
4
+ """
5
+ import pandas as pd
6
+ import numpy as np
7
+ from datetime import datetime, timedelta
8
+ from typing import Dict, Any
9
+
10
# POS column alias map – covers 99 % of exports
# Maps each canonical field (key) to the column-name substrings that common
# POS vendors use for it. _find_col() scans these case-insensitively against
# the dataframe's columns and the first match wins, so more specific aliases
# should be listed first.
_ALIAS = {
    "sku": ["sku", "barcode", "item_code", "plu", "product_id"],
    "qty": ["qty", "quantity", "units", "stock", "quantity_on_hand"],
    "expiry": ["expiry_date", "exp", "best_before", "use_by", "expiration"],
    "promo": ["promo", "promotion", "discount_code", "campaign", "is_promo"],
    "sales": ["total_line", "net_amount", "line_total", "amount", "sales_amount"],
    "transaction": ["transaction_id", "receipt_no", "ticket_no", "order_id"],
    "store": ["store_id", "branch_code", "location_id", "outlet_id"],
    "category": ["category", "department", "cat", "sub_category"],
    "loss": ["loss_qty", "waste_qty", "shrinkage_qty", "damaged_qty"],
    "customer": ["customer_id", "loyalty_id", "phone"],
    "price": ["unit_price", "price", "sell_price"],
    "cost": ["cost_price", "supply_price", "unit_cost"],
}
25
+
26
+ def _find_col(df: pd.DataFrame, keys):
27
+ """Return first matching column or None."""
28
+ for k in keys:
29
+ for col in df.columns:
30
+ if k.lower() in col.lower():
31
+ return col
32
+ return None
33
+
34
def supermarket_insights(df: pd.DataFrame) -> Dict[str, Any]:
    """Return supermarket KPIs & alerts – zero config.

    Column names are resolved through the `_ALIAS` substring map so the same
    code handles exports from Square, Lightspeed, Shopify POS, etc. Every KPI
    whose source columns are missing degrades to 0 / empty instead of raising.
    """
    df = df.copy()
    df.columns = [c.lower().strip() for c in df.columns]

    # --- resolve columns via alias map ---
    sku_col = _find_col(df, _ALIAS["sku"])
    qty_col = _find_col(df, _ALIAS["qty"])
    expiry_col = _find_col(df, _ALIAS["expiry"])
    promo_col = _find_col(df, _ALIAS["promo"])
    sales_col = _find_col(df, _ALIAS["sales"])
    trans_col = _find_col(df, _ALIAS["transaction"])
    store_col = _find_col(df, _ALIAS["store"])
    cat_col = _find_col(df, _ALIAS["category"])
    loss_col = _find_col(df, _ALIAS["loss"])
    cust_col = _find_col(df, _ALIAS["customer"])
    price_col = _find_col(df, _ALIAS["price"])
    cost_col = _find_col(df, _ALIAS["cost"])

    # 1 STOCK COUNT & SKU BREADTH
    stock = int(df[qty_col].sum()) if qty_col else 0
    unique_sku = int(df[sku_col].nunique()) if sku_col else 0

    # 2 EXPIRY ALERTS -- rows whose expiry is within 7 days. Unparseable
    # dates become NaT and are excluded by the comparison; already-expired
    # rows (negative days) are included in the count.
    expiring_7d = 0
    if expiry_col:
        df[expiry_col] = pd.to_datetime(df[expiry_col], errors='coerce')
        expiring_7d = int((df[expiry_col] - datetime.now()).dt.days.le(7).sum())

    # 3 PROMO LIFT -- mean promo sales vs mean non-promo sales, in percent.
    lift = 0.0
    if promo_col and sales_col:
        base = df[df[promo_col].astype(str).str[0].isin(['0', 'F', 'f'])][sales_col].mean()
        promo = df[df[promo_col].astype(str).str[0].isin(['1', 'T', 't'])][sales_col].mean()
        # Guard NaN means (empty bucket) as well as a zero base; the old
        # `if base` check let NaN through and produced a NaN KPI.
        if pd.notna(base) and pd.notna(promo) and base:
            lift = float((promo - base) / base * 100)

    # 4 BASKET SIZE -- average sales value per transaction.
    avg_basket = 0.0
    if trans_col and sales_col:
        basket = df.groupby(trans_col)[sales_col].sum()
        avg_basket = float(basket.mean())

    # 5 SHRINKAGE % -- guard against a zero stock total (division by zero /
    # inf leak in the old version).
    shrink = 0.0
    if loss_col and qty_col:
        total_qty = df[qty_col].sum()
        if total_qty:
            shrink = float(df[loss_col].sum() / total_qty * 100)

    # 6 FAST MOVERS (top 5 SKUs by units)
    movers = {}
    if sku_col and qty_col:
        movers = (df.groupby(sku_col)[qty_col].sum()
                  .nlargest(5)
                  .to_dict())

    # 7 GROSS-MARGIN BY CATEGORY
    margin = {}
    if cat_col and price_col and cost_col:
        df['margin'] = (df[price_col] - df[cost_col]) / df[price_col] * 100
        margin = (df.groupby(cat_col)['margin'].mean()
                  .round(1)
                  .to_dict())

    # 8 CUSTOMER REACH
    unique_cust = int(df[cust_col].nunique()) if cust_col else 0

    # 9 STORE PERFORMANCE (if multi-outlet)
    store_perf = {}
    if store_col and sales_col:
        store_perf = (df.groupby(store_col)[sales_col].sum()
                      .round(0)
                      .to_dict())

    # 10 ALERTS
    alerts = []
    if expiring_7d:
        alerts.append({"type": "expiry", "severity": "high", "message": f"{expiring_7d} SKUs expire ≤7 days"})
    if shrink > 1:
        alerts.append({"type": "shrinkage", "severity": "med", "message": f"Shrinkage {shrink:.1f} %"})
    if lift < 0:
        alerts.append({"type": "promo", "severity": "low", "message": "Promo discount deeper than lift"})

    return {
        "supermarket_kpis": {
            "stock_on_hand": stock,
            "unique_sku": unique_sku,
            "expiring_next_7_days": expiring_7d,
            "promo_lift_pct": round(lift, 1),
            "avg_basket_kes": round(avg_basket, 2),
            "shrinkage_pct": round(shrink, 2),
            "unique_customers": unique_cust,
        },
        "fast_movers": movers,
        "category_margin_pct": margin,
        "store_sales": store_perf,
        "alerts": alerts,
    }
app/ingest.py ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
import json
from datetime import datetime

from app.db import get_conn, ensure_raw_table


def ingest_dict(org_id: str, payload: dict):
    """Append one raw event row for *org_id* as a JSON blob.

    Opens the org's database connection, ensures the raw_rows table exists,
    inserts the serialized payload, and always closes the connection.
    (The original file used `json`, `get_conn` and `ensure_raw_table`
    without importing them; imports mirror app/mapper.py.)
    """
    conn = get_conn(org_id)
    try:
        ensure_raw_table(conn)
        # Store the whole payload as JSON text; schema mapping happens later.
        conn.execute("INSERT INTO raw_rows(row_data) VALUES (?)", [json.dumps(payload)])
    finally:
        # Release the connection even if the insert fails.
        conn.close()
app/main.py ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from fastapi import FastAPI, Depends
from fastapi.middleware.cors import CORSMiddleware
from fastapi.encoders import jsonable_encoder
from fastapi.responses import JSONResponse
from app.routers import ingress, reports, flags, datasources, scheduler, run, health, socket
from app.tasks.scheduler import start_scheduler
from app.deps import verify_key
from contextlib import asynccontextmanager
import os

# ---------- lifespan ----------
@asynccontextmanager
async def lifespan(app: FastAPI):
    # Start the background job scheduler once at startup; anything placed
    # after `yield` would run at shutdown (nothing needed here).
    start_scheduler()
    yield

# ---------- app init ----------
app = FastAPI(
    title="MutSyncHub Analytics Engine",
    version="2.2",
    lifespan=lifespan
)

# ---------- Socket.IO Mount ----------
app.mount("/socket.io", socket.socket_app)

# ---------- Middleware (fix order) ----------
@app.middleware("http")
async def serialize_all_responses(request, call_next):
    """Ensure all responses are safely JSON-serializable."""
    # NOTE(review): call_next returns a Response object, never a plain dict,
    # so the isinstance branch below looks like dead code -- confirm whether
    # handlers were ever meant to return raw dicts through this path.
    response = await call_next(request)
    if isinstance(response, dict):
        return JSONResponse(content=jsonable_encoder(response))
    return response

# ---------- CORS Configuration ----------
origins = [
    "https://mut-sync-hub.vercel.app",  # live frontend
    "http://localhost:3000",  # local dev
]
app.add_middleware(
    CORSMiddleware,
    allow_origins=origins,
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# ---------- Routers ----------
# All business routers require the API key dependency; health and socket
# routes are registered without it.
app.include_router(health.router)  # public route (no key)
app.include_router(datasources.router, dependencies=[Depends(verify_key)])
app.include_router(reports.router, dependencies=[Depends(verify_key)])
app.include_router(flags.router, dependencies=[Depends(verify_key)])
app.include_router(scheduler.router, dependencies=[Depends(verify_key)])
app.include_router(run.router, dependencies=[Depends(verify_key)])
app.include_router(socket.router)

# ---------- Public Health Endpoint ----------
@app.get("/health")
def health_check():
    # Liveness probe for deployment infrastructure; no auth, no dependencies.
    return {"status": "ok", "service": "analytics-engine"}
app/mapper.py ADDED
@@ -0,0 +1,186 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os, json, duckdb, pandas as pd
2
+ from datetime import datetime
3
+ from app.db import get_conn, ensure_raw_table
4
+ from app.utils.detect_industry import _ALIAS
5
+
6
+
7
+ # ---------------------- Canonical schema base ---------------------- #
8
+ CANONICAL = {
9
+ "timestamp": ["timestamp", "date", "sale_date", "created_at"],
10
+ "product_id": ["sku", "barcode", "plu", "product_id", "item_code"],
11
+ "qty": ["qty", "quantity", "units", "pieces"],
12
+ "total": ["total", "amount", "line_total", "sales_amount"],
13
+ "store_id": ["store_id", "branch", "location", "outlet_id"],
14
+ "category": ["category", "department", "cat", "family"],
15
+ "promo_flag": ["promo", "promotion", "is_promo", "discount_code"],
16
+ "expiry_date":["expiry_date", "best_before", "use_by", "expiration"],
17
+ }
18
+
19
+ ALIAS_FILE = "./db/alias_memory.json"
20
+
21
def safe_str_transform(series: pd.Series) -> pd.Series:
    """Lower-case and strip a pandas Series/Index, but only for string dtypes.

    Non-string inputs (numeric, datetime, …) are returned untouched so
    callers never trip over ``.str`` accessor errors.
    """
    if not pd.api.types.is_string_dtype(series):
        return series
    return series.str.lower().str.strip()
26
# ---------------------- Alias memory helpers ---------------------- #
def load_dynamic_aliases() -> None:
    """Load learned aliases from ALIAS_FILE and merge them into CANONICAL.

    Unknown canonical keys are adopted wholesale; known keys only gain
    aliases they do not already contain. Any failure is logged, never raised.
    """
    if not os.path.exists(ALIAS_FILE):
        return
    try:
        with open(ALIAS_FILE) as fh:
            learned = json.load(fh)
        for canon, aliases in learned.items():
            if canon not in CANONICAL:
                CANONICAL[canon] = aliases
                continue
            known = CANONICAL[canon]
            for alias in aliases:
                if alias not in known:
                    known.append(alias)
    except Exception as exc:
        print(f"[mapper] ⚠️ failed to load alias memory: {exc}")
42
+
43
+
44
def save_dynamic_aliases() -> None:
    """Persist the full (merged) alias table so future runs remember it."""
    target_dir = os.path.dirname(ALIAS_FILE)
    os.makedirs(target_dir, exist_ok=True)
    with open(ALIAS_FILE, "w") as fh:
        json.dump(CANONICAL, fh, indent=2)
49
+
50
+
51
# ---------------------- Schema versioning helpers ---------------------- #
def ensure_schema_version(duck, df: pd.DataFrame) -> str:
    """
    Ensure schema versioning and track evolution.
    Returns the active canonical table name (e.g., main.canonical_v2).

    A schema's signature is the sorted list of DataFrame column names; any
    difference (added/removed/renamed column) starts a new canonical_v<N>.
    """
    duck.execute("CREATE SCHEMA IF NOT EXISTS main")
    duck.execute("""
        CREATE TABLE IF NOT EXISTS main.schema_versions (
            version INTEGER PRIMARY KEY,
            columns JSON,
            created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
        )
    """)

    # Newest recorded schema, if any (row layout: version, columns, created_at).
    latest = duck.execute("SELECT * FROM main.schema_versions ORDER BY version DESC LIMIT 1").fetchone()
    new_signature = sorted(df.columns.tolist())

    if latest:
        latest_cols = sorted(json.loads(latest[1]))
        if latest_cols == new_signature:
            # Unchanged schema → keep appending to the current version table.
            return f"main.canonical_v{latest[0]}"
        else:
            # Schema drift → record and switch to the next version table.
            new_version = latest[0] + 1
            duck.execute("INSERT INTO main.schema_versions (version, columns) VALUES (?, ?)",
                         (new_version, json.dumps(new_signature)))
            print(f"[schema] → new version detected: canonical_v{new_version}")
            return f"main.canonical_v{new_version}"
    else:
        # First ever ingest for this org → version 1.
        duck.execute("INSERT INTO main.schema_versions (version, columns) VALUES (?, ?)",
                     (1, json.dumps(new_signature)))
        print("[schema] → initialized canonical_v1")
        return "main.canonical_v1"
84
+
85
+
86
def reconcile_latest_schema(duck):
    """
    Merge all canonical_v* tables into main.canonical_latest,
    preserving new columns and filling missing values with NULL.

    Fix: a plain ``UNION ALL`` matches columns by position and errors out
    when the versioned tables have different shapes — which is exactly the
    situation this function exists for. DuckDB's ``UNION ALL BY NAME``
    aligns columns by name and NULL-fills the gaps, matching the docstring.
    """
    tables = [r[0] for r in duck.execute("""
        SELECT table_name FROM information_schema.tables
        WHERE table_name LIKE 'canonical_v%'
    """).fetchall()]
    if not tables:
        return

    union_query = " UNION ALL BY NAME ".join(f"SELECT * FROM {t}" for t in tables)
    duck.execute("CREATE OR REPLACE TABLE main.canonical_latest AS " + union_query)
    print(f"[schema] ✅ reconciled {len(tables)} schema versions → canonical_latest")
101
+
102
+
103
+ # ---------------------- Canonify core logic ---------------------- #
104
def canonify_df(org_id: str, hours_window: int = 24) -> pd.DataFrame:
    """
    Normalize, version, and persist canonical data snapshot for org_id.

    Steps:
      1. Pull raw JSON rows from the org's DuckDB (last *hours_window* h;
         falls back to reading all rows when timestamp parsing fails).
      2. Rename columns to the canonical schema via substring alias matching,
         learning new aliases as a side effect (persisted to disk).
      3. Coerce datatypes, append to a versioned snapshot table and rebuild
         main.canonical_latest.

    Returns the canonical DataFrame (empty when there is nothing to process).

    NOTE(review): the connection returned by get_conn() is never closed
    here — confirm whether callers own its lifetime.
    """
    load_dynamic_aliases()
    conn = get_conn(org_id)
    ensure_raw_table(conn)

    # --------------------------
    # ⏱ Safe timestamp filtering
    # --------------------------
    try:
        rows = conn.execute(
            """
            SELECT row_data
            FROM raw_rows
            WHERE strptime(json_extract(row_data, '$.timestamp'), '%Y-%m-%d %H:%M:%S')
                  >= now() - INTERVAL ? HOUR
            """,
            (hours_window,)
        ).fetchall()
    except Exception as e:
        # A single row whose $.timestamp doesn't match the expected format
        # makes the whole query raise — degrade to an unfiltered read.
        print(f"[canonify] ⚠️ fallback to all rows due to timestamp parse error: {e}")
        rows = conn.execute("SELECT row_data FROM raw_rows").fetchall()

    if not rows:
        print("[canonify] no rows to process")
        return pd.DataFrame()

    # --------------------------
    # 🧩 DataFrame normalization
    # --------------------------
    raw = pd.DataFrame([json.loads(r[0]) for r in rows])
    # Lower-case/strip header names (no-op when columns aren't strings).
    raw.columns = safe_str_transform(raw.columns)

    # Flexible alias mapping: the first alias substring-match wins per column.
    mapping = {}
    for canon, aliases in CANONICAL.items():
        for col in raw.columns:
            if any(a in col for a in aliases):
                mapping[col] = canon
                break

    # 🧠 Learn new aliases dynamically: a column embedding a canonical name
    # (e.g. "order_timestamp") is remembered as an alias for future runs.
    for col in raw.columns:
        if col not in sum(CANONICAL.values(), []):
            for canon in CANONICAL.keys():
                if canon in col and col not in CANONICAL[canon]:
                    CANONICAL[canon].append(col)
                    save_dynamic_aliases()

    # Apply canonical renaming; keep only canonical columns when any matched,
    # otherwise pass the frame through untouched.
    renamed = raw.rename(columns=mapping)
    cols = [c for c in CANONICAL.keys() if c in renamed.columns]
    df = renamed[cols].copy() if cols else renamed.copy()

    # 🔢 Normalize datatypes (unparseable values become NaT / False / 0).
    if "timestamp" in df:
        df["timestamp"] = pd.to_datetime(df["timestamp"], errors="coerce")
    if "expiry_date" in df:
        df["expiry_date"] = pd.to_datetime(df["expiry_date"], errors="coerce").dt.date
    if "promo_flag" in df:
        df["promo_flag"] = df["promo_flag"].astype(str).isin({"1", "true", "t", "yes"})
    for col in ("qty", "total"):
        if col in df:
            df[col] = pd.to_numeric(df[col], errors="coerce").fillna(0)

    # --------------------------
    # 🪣 Schema versioning + storage
    # --------------------------
    os.makedirs("./db", exist_ok=True)
    duck = duckdb.connect(f"./db/{org_id}.duckdb")

    table_name = ensure_schema_version(duck, df)
    # DuckDB resolves the local variable `df` through its Python replacement
    # scan. NOTE(review): the INSERT matches columns by position — assumes an
    # existing version table has the same column order as df; confirm.
    duck.execute(f"CREATE TABLE IF NOT EXISTS {table_name} AS SELECT * FROM df LIMIT 0")
    duck.execute(f"INSERT INTO {table_name} SELECT * FROM df")

    # 🧩 Always refresh canonical_latest for unified analytics
    reconcile_latest_schema(duck)
    duck.close()

    print(f"[canonify] ✅ canonical snapshot updated for {org_id}")
    return df
app/redis_pool.py ADDED
@@ -0,0 +1,2 @@
 
 
 
1
# Shared Redis client for the whole service; defaults to the docker-compose
# service name and decodes responses to str.
import redis, os
redis_client = redis.from_url(os.getenv("REDIS_URL", "redis://redis:6379"), decode_responses=True)
app/routers/datasources.py ADDED
@@ -0,0 +1,117 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from fastapi import APIRouter, Query, Form, File, UploadFile, Depends, HTTPException
2
+ from fastapi.responses import JSONResponse
3
+ from pydantic import BaseModel
4
+ from typing import List, Any, Dict, Union
5
+ from app.deps import verify_key
6
+ from app.db import get_conn, ensure_raw_table, bootstrap
7
+ from app.mapper import canonify_df
8
+ from app.utils.detect_industry import detect_industry
9
+ from app.routers.socket import sio
10
+ import pandas as pd
11
+ import json
12
+
13
+ router = APIRouter(prefix="/api/v1", tags=["datasources"])
14
+
15
+
16
# =======================================================================
# 1️⃣ ORIGINAL UPLOAD ENDPOINT – handles CSV, POS plug-in, etc.
# =======================================================================
@router.post("/datasources")
async def create_source(
    orgId: str = Query(...),
    sourceId: str = Query(...),
    type: str = Query(...),          # source kind, e.g. FILE_IMPORT / API / WEBHOOK
    config: str = Form(...),         # JSON-encoded source configuration
    file: UploadFile = File(None),   # only used when type == FILE_IMPORT
    data: str = Form(None),          # JSON rows for non-file sources
    _: str = Depends(verify_key),
):
    """
    Keeps existing behavior – for CSV upload, POS plug-in, API push, etc.

    Inserts incoming rows into raw_rows, re-canonifies the org's data,
    detects the industry, and broadcasts a 3-row preview over Socket.IO.

    NOTE(review): `conn` is only closed on the success path — an exception
    during insert/canonify leaks the handle. `config_dict` is parsed but
    never used.
    """
    conn = get_conn(orgId)
    ensure_raw_table(conn)

    config_dict = json.loads(config)

    if type == "FILE_IMPORT" and file:
        # Stream the CSV in 1000-row chunks; each row is stored as JSON text.
        chunk_size = 1000
        for chunk in pd.read_csv(file.file, chunksize=chunk_size):
            for _, row in chunk.iterrows():
                conn.execute("INSERT INTO raw_rows (row_data) VALUES (?)", (row.to_json(),))
        file.file.seek(0)  # rewind in case the upload is read again downstream
    elif type in ["API", "DATABASE", "WEBHOOK", "POS_SYSTEM", "ERP", "CUSTOM"]:
        if not data:
            raise HTTPException(status_code=400, detail="Data required for non-file sources")
        records = json.loads(data)
        # Accept either a single object or a list of objects.
        records = records if isinstance(records, list) else [records]
        for row in records:
            conn.execute("INSERT INTO raw_rows (row_data) VALUES (?)", (json.dumps(row),))

    # Normalize, detect, and close connection
    df = canonify_df(orgId)
    industry, confidence = detect_industry(df)
    conn.close()

    # Live broadcast sample
    rows = df.head(3).to_dict("records")
    await sio.emit("datasource:new-rows", {"rows": rows}, room=orgId)

    return {
        "id": sourceId,
        "status": "listening" if type != "WEBHOOK" else "received",
        "industry": industry,
        "confidence": confidence,
        "recentRows": rows,
    }
67
+
68
+
69
# =======================================================================
# 2️⃣ SMART JSON ENDPOINT – fully schema-agnostic and multi-table aware
# =======================================================================
class JsonPayload(BaseModel):
    """Request body for POST /datasources/json."""
    # Free-form source configuration (not interpreted by this router).
    config: Dict[str, Any]
    data: Union[List[Any], Dict[str, Any]]  # flexible: list or { "tables": {...} }
75
+
76
+
77
@router.post("/datasources/json")
async def create_source_json(
    payload: JsonPayload,
    orgId: str = Query(...),
    sourceId: str = Query(...),
    type: str = Query(...),
    _: str = Depends(verify_key),
):
    """
    Accepts structured JSON (list or multi-table dict) from n8n, Render jobs, or APIs.
    Automatically evolves schemas, stores data, detects industry, and broadcasts live rows.

    Raises 400 when the payload carries no data, 500 on ingestion failure.
    """
    try:
        if not payload or not payload.data:
            raise HTTPException(status_code=400, detail="Missing payload data")

        # 💾 Flexible insertion – handles one or multiple tables
        bootstrap(orgId, payload.data)

        # 🧭 Canonical normalization (only if “sales” or compatible table exists)
        df = canonify_df(orgId)
        industry, confidence = detect_industry(df)

        # 🎯 Preview last few normalized rows
        rows = df.head(3).to_dict("records") if not df.empty else []
        await sio.emit("datasource:new-rows", {"rows": rows}, room=orgId)

        return JSONResponse(
            content={
                "id": sourceId,
                "status": "processed",
                "industry": industry,
                "confidence": confidence,
                "recentRows": rows,
                "message": "✅ Data ingested successfully",
            }
        )

    except HTTPException:
        # Fix: let intentional HTTP errors (e.g. the 400 above) propagate
        # instead of being swallowed and re-labelled as a 500 below.
        raise
    except Exception as e:
        print(f"[datasources/json] ❌ ingestion error: {e}")
        raise HTTPException(status_code=500, detail=str(e))
app/routers/flags.py ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # app/routers/flags.py
2
+ from fastapi import APIRouter, Depends, HTTPException
3
+ import httpx
4
+ from app.deps import verify_key
5
+ import os
6
+
7
+ router = APIRouter(prefix="/flags", tags=["Feature Flags"])
8
+ NEXT_API = os.getenv("NEXT_API") # never hard-code localhost # internal Docker name (or env var)
9
+
10
@router.get("/{key}")
async def read_flag(key: str, _: str = Depends(verify_key)):
    """Proxy a feature-flag read to the Next.js internal API.

    Fix: the internal API key was hard-coded; it now comes from the
    environment (same default, so behavior is unchanged out of the box),
    and an unset NEXT_API fails fast with 503 instead of requesting
    "None/api/flags/...".
    """
    if not NEXT_API:
        raise HTTPException(503, "NEXT_API is not configured")
    headers = {"x-api-key": os.getenv("ANALYTICS_INTERNAL_KEY", "dev-analytics-key-123")}
    async with httpx.AsyncClient() as c:
        r = await c.get(f"{NEXT_API}/api/flags/{key}", headers=headers)
    if r.status_code == 404:
        raise HTTPException(404, "Flag not found")
    return r.json()
17
+
18
@router.put("/{key}")
async def set_flag(key: str, body: dict, _: str = Depends(verify_key)):
    """Proxy a feature-flag update to the Next.js internal API.

    Fix: internal API key read from the environment (same default as
    before); unset NEXT_API now yields a clear 503.
    """
    if not NEXT_API:
        raise HTTPException(503, "NEXT_API is not configured")
    headers = {"x-api-key": os.getenv("ANALYTICS_INTERNAL_KEY", "dev-analytics-key-123")}
    async with httpx.AsyncClient() as c:
        r = await c.put(f"{NEXT_API}/api/flags/{key}", json=body, headers=headers)
    return r.json()
app/routers/health.py ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ from fastapi import APIRouter
2
+
3
+ router = APIRouter(tags=["health"])
4
+
5
@router.get("/health")
def health():
    """Liveness endpoint; always reports the service as up."""
    return dict(status="ok", service="analytics-engine")
app/routers/ingress.py ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # from fastapi import APIRouter, Depends
2
+ # from pydantic import BaseModel
3
+ # from app.deps import verify_key
4
+
5
+ # router = APIRouter(prefix="/api/v1", tags=["datasources"])
6
+
7
+ # class NewSource(BaseModel):
8
+ # orgId: str
9
+ # sourceId: str
10
+ # type: str
11
+ # config: dict
12
+
13
+ # @router.post("/datasources")
14
+ # def create_source(payload: NewSource, _: str = Depends(verify_key)):
15
+ # print("[analytics] new source", payload)
16
+ # return {"id": payload.sourceId, "status": "sync_queued"}
app/routers/reports.py ADDED
@@ -0,0 +1,117 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Analytics engine routes – DuckDB-backed, any-shape input.
3
+ Also exposes Neon-bridge endpoints so Next.js (Prisma) can store history.
4
+ """
5
+ from fastapi import APIRouter, Query, HTTPException
6
+ from pydantic import BaseModel
7
+ from datetime import datetime
8
+ import json
9
+
10
+ from app.mapper import canonify_df
11
+ from app.engine.analytics import AnalyticsService
12
+ from app.utils.detect_industry import detect_industry
13
+ from app.service.industry_svc import (
14
+ eda, forecast, basket, market_dynamics, supply_chain,
15
+ customer_insights, operational_efficiency, risk_assessment, sustainability
16
+ )
17
+
18
+ router = APIRouter(prefix="/analytics", tags=["Analytics"])
19
+
20
+ analytics = AnalyticsService()
21
+
22
+ # --------------------------------------------------
23
+ # 1 RUN ANALYTIC – real-time, any column names
24
+ # --------------------------------------------------
25
class RunAnalyticIn(BaseModel):
    """Request body for POST /analytics/run."""
    analytic: str                   # which analytic to execute
    dateColumn: str | None = None   # required for "forecast"
    valueColumn: str | None = None  # required for "forecast"
    minSupport: float = 0.01        # basket-analysis thresholds
    minConfidence: float = 0.3
    minLift: float = 1.0
+
33
@router.post("/run")
async def run_analytic(orgId: str, body: RunAnalyticIn):
    """
    1. Canonify last 6 h of raw rows (any shape)
    2. Compute chosen analytic
    3. Return shaped payload

    Raises 404 when no recent data exists; 400 for an unknown analytic or
    a forecast request missing its column names.
    """
    df = canonify_df(orgId)
    if df.empty:
        raise HTTPException(404, "No recent data found – please ingest or stream first.")

    data = df.to_dict("records")
    industry, _ = detect_industry(df)

    # Dispatch to the async service wrapper for the requested analytic.
    match body.analytic:
        case "eda":
            result = await eda(data, industry)
        case "forecast":
            if not body.dateColumn or not body.valueColumn:
                raise HTTPException(400, "dateColumn & valueColumn required")
            result = await forecast(data, body.dateColumn, body.valueColumn)
        case "basket":
            result = await basket(data, body.minSupport, body.minConfidence, body.minLift)
        case "market-dynamics":
            result = await market_dynamics(data)
        case "supply-chain":
            result = await supply_chain(data)
        case "customer-insights":
            result = await customer_insights(data)
        case "operational-efficiency":
            result = await operational_efficiency(data)
        case "risk-assessment":
            result = await risk_assessment(data)
        case "sustainability":
            result = await sustainability(data)
        case _:
            raise HTTPException(400, "Unknown analytic")

    return {"industry": industry, "data": result}
+ return {"industry": industry, "data": result}
72
+
73
+ # --------------------------------------------------
74
+ # 2 NEON BRIDGE – latest report for UI + push endpoint
75
+ # --------------------------------------------------
76
class PushReportIn(BaseModel):
    """Body for POST /analytics/report/push (mirrors the Neon schema)."""
    orgId: str
    type: str          # analytic type, e.g. "eda"
    results: dict      # shaped analytic payload
    lastRun: datetime
81
+
82
@router.get("/report/latest")
def latest_report(orgId: str = Query(...)):
    """
    Returns the newest KPI snapshot we have for this org
    (shape matches Neon schema so Next.js can forward 1-to-1).

    Raises 404 when no report has been logged yet.

    Fix: the DuckDB connection is now closed in a ``finally`` block so a
    failing query no longer leaks the handle.
    """
    from app.db import get_conn

    conn = get_conn(orgId)
    try:
        row = conn.execute("""
            SELECT analytic_type, results, ts
            FROM kpi_log
            WHERE org_id = ?
            ORDER BY ts DESC
            LIMIT 1
        """, [orgId]).fetchone()
    finally:
        conn.close()

    if not row:
        raise HTTPException(404, "No report yet")

    return {
        "orgId": orgId,
        "type": row[0],
        # results may be stored as a JSON string or already-decoded value
        "results": json.loads(row[1]) if isinstance(row[1], str) else row[1],
        "lastRun": row[2].isoformat(),
    }
109
+
110
@router.post("/report/push")
async def push_report(body: PushReportIn):
    """
    Internal endpoint – Next.js (Prisma) calls this to store history in Neon.
    Analytics container itself does **not** touch Prisma.

    Currently an acknowledgement-only stub: the body shape is validated by
    Pydantic and the identifiers are echoed back.
    """
    # optional: validate signature / api-key here if you want
    return {"status": "accepted", "orgId": body.orgId, "type": body.type}
app/routers/run.py ADDED
@@ -0,0 +1,65 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Analytics engine routes – stateless, DuckDB-backed, any-shape input.
3
+ """
4
+ from fastapi import APIRouter, HTTPException
5
+ from pydantic import BaseModel
6
+ import pandas as pd
7
+
8
+ from app.mapper import canonify_df # NEW
9
+ from app.engine.analytics import AnalyticsService
10
+ from app.utils.detect_industry import detect_industry
11
+ from app.service.industry_svc import (
12
+ eda, forecast, basket, market_dynamics, supply_chain,
13
+ customer_insights, operational_efficiency, risk_assessment, sustainability
14
+ )
15
+
16
+ router = APIRouter(prefix="/analytics", tags=["Analytics"])
17
+
18
class RunAnalyticIn(BaseModel):
    """Request body for POST /analytics/run (duplicate of reports.py model)."""
    analytic: str                   # which analytic to execute
    dateColumn: str | None = None   # required for "forecast"
    valueColumn: str | None = None  # required for "forecast"
    minSupport: float = 0.01        # basket-analysis thresholds
    minConfidence: float = 0.3
    minLift: float = 1.0

@router.post("/run")
async def run_analytic(orgId: str, body: RunAnalyticIn):
    """
    1. Pull last 6 h of raw rows (any column names)
    2. Map -> canonical DataFrame
    3. Run chosen analytic
    4. Return shaped result

    NOTE(review): this router duplicates the prefix, model and /run route
    of app/routers/reports.py — main.py registers both, so two handlers
    exist for POST /analytics/run; confirm which one should win.
    """
    df = canonify_df(orgId)  # ← replaces pd.read_parquet
    if df.empty:
        raise HTTPException(404, "No recent data found – please ingest or stream first.")

    industry, _ = detect_industry(df)
    data = df.to_dict("records")

    # Dispatch to the async service wrapper for the requested analytic.
    match body.analytic:
        case "eda":
            result = await eda(data, industry)
        case "forecast":
            if not body.dateColumn or not body.valueColumn:
                raise HTTPException(400, "dateColumn & valueColumn required")
            result = await forecast(data, body.dateColumn, body.valueColumn)
        case "basket":
            result = await basket(data, body.minSupport, body.minConfidence, body.minLift)
        case "market-dynamics":
            result = await market_dynamics(data)
        case "supply-chain":
            result = await supply_chain(data)
        case "customer-insights":
            result = await customer_insights(data)
        case "operational-efficiency":
            result = await operational_efficiency(data)
        case "risk-assessment":
            result = await risk_assessment(data)
        case "sustainability":
            result = await sustainability(data)
        case _:
            raise HTTPException(400, "Unknown analytic")

    return {"industry": industry, "data": result}
app/routers/scheduler.py ADDED
@@ -0,0 +1,90 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ State-less scheduler REST facade.
3
+ Jobs are still executed by APScheduler; this router only
4
+ - persists schedules to /data/.schedules.json
5
+ - keeps APScheduler in sync
6
+ """
7
+ import json, uuid, os
8
+ from datetime import datetime
9
+ from typing import List
10
+ from fastapi import APIRouter, Query, HTTPException
11
+ from pydantic import BaseModel
12
+
13
+ router = APIRouter(prefix="/schedules", tags=["scheduler"])
14
+
15
+ SCHEDULE_FILE = "/data/.schedules.json"
16
+
17
+ # --------------------------------------------------
18
+ # models
19
+ # --------------------------------------------------
20
class ScheduleIn(BaseModel):
    """Client-supplied schedule definition."""
    orgId : str
    frequency: str        # daily | weekly | monthly
    analytics: List[str]  # analytic names to run on each tick

class ScheduleOut(ScheduleIn):
    """Stored schedule, enriched with server-assigned fields."""
    id : str
    nextRun : datetime
28
+
29
+ # --------------------------------------------------
30
+ # helpers
31
+ # --------------------------------------------------
32
def _load() -> List[dict]:
    """Read every persisted schedule; an absent store file means none."""
    try:
        with open(SCHEDULE_FILE) as fh:
            return json.load(fh)
    except FileNotFoundError:
        return []
37
+
38
def _save(obj: List[dict]):
    """Overwrite the schedule store with *obj* (non-JSON values via str)."""
    serialized = json.dumps(obj, indent=2, default=str)
    with open(SCHEDULE_FILE, "w") as fh:
        fh.write(serialized)
41
+
42
+ def _next_run(frequency: str) -> datetime:
43
+ from datetime import timedelta
44
+ now = datetime.utcnow()
45
+ if frequency == "daily": return now + timedelta(days=1)
46
+ if frequency == "weekly": return now + timedelta(weeks=1)
47
+ if frequency == "monthly": return now + timedelta(days=30)
48
+ return now
49
+
50
+ # --------------------------------------------------
51
+ # CRUD
52
+ # --------------------------------------------------
53
# ↓↓↓ ADD THIS LINE ↓↓↓
@router.get("/schedules", response_model=List[ScheduleOut])
def list_schedules_endpoint(orgId: str = Query(...)):
    """Alias route: the router prefix is already /schedules, so this
    answers GET /schedules/schedules. Delegates to list_schedules."""
    return list_schedules(orgId)
57
+
58
@router.get("", response_model=List[ScheduleOut])
def list_schedules(orgId: str = Query(...)):
    """Return every persisted schedule belonging to *orgId*."""
    return [record for record in _load() if record["orgId"] == orgId]
62
+
63
@router.post("", response_model=ScheduleOut)
def create_schedule(payload: ScheduleIn):
    """Persist a new schedule and register it with APScheduler."""
    new_id = str(uuid.uuid4())
    record = {
        "id"       : new_id,
        "orgId"    : payload.orgId,
        "frequency": payload.frequency,
        "analytics": payload.analytics,
        # stored as ISO string so the JSON file stays serializable
        "nextRun"  : _next_run(payload.frequency).isoformat(),
    }
    all_ = _load()
    all_.append(record)
    _save(all_)
    # sync to APScheduler (imported lazily — avoids import-time coupling)
    from app.tasks.scheduler import add_job_to_scheduler
    add_job_to_scheduler(record)
    return ScheduleOut(**record)
80
+
81
@router.delete("/{schedule_id}", status_code=204)
def delete_schedule(schedule_id: str):
    """Remove a schedule from the store and from APScheduler (404 if absent)."""
    existing = _load()
    remaining = [rec for rec in existing if rec["id"] != schedule_id]
    if len(remaining) == len(existing):
        raise HTTPException(404, "Schedule not found")
    _save(remaining)
    # keep the in-memory APScheduler state in sync with the store
    from app.tasks.scheduler import remove_job_from_scheduler
    remove_job_from_scheduler(schedule_id)
app/routers/socket.py ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # app/routers/socket.py
2
+ import socketio
3
+ from fastapi import APIRouter, Depends, Path, Request
4
+ from fastapi.responses import PlainTextResponse
5
+ from app.deps import verify_key # your API-key guard
6
+
7
+ # 1️⃣ Socket.IO server
8
+ sio = socketio.AsyncServer(
9
+ async_mode="asgi",
10
+ cors_allowed_origins=[
11
+ "https://mut-sync-hub.vercel.app",
12
+ "http://localhost:3000",
13
+ ],
14
+ )
15
+
16
+ # 2️⃣ ASGI sub-app (mounted separately in main.py)
17
+ socket_app = socketio.ASGIApp(sio)
18
+
19
+ # 3️⃣ FastAPI router for REST routes (no prefix → /socket-push)
20
+ router = APIRouter(tags=["socket"])
21
+
22
# ---------- POST /socket-push/{org_id} ----------
@router.post("/socket-push/{org_id}")
async def socket_push(
    request: Request,
    org_id: str = Path(...),
    _: str = Depends(verify_key),
):
    """
    Receive top-N rows from n8n workflow and broadcast them
    live to all connected clients in the given org room.

    Fix: ``request`` was declared as ``Request = None``; had it ever
    defaulted, ``request.json()`` would raise AttributeError. It is now a
    required parameter injected by FastAPI, placed first so no
    non-default parameter follows a defaulted one.
    """
    payload = await request.json()
    rows = payload.get("rows", [])
    await sio.emit("datasource:new-rows", {"rows": rows}, room=org_id)
    print(f"[socket] 🔄 broadcasted {len(rows)} rows → room={org_id}")
    return {"status": "ok", "emitted": len(rows)}
38
+
39
# ---------- Health Check ----------
# NOTE(review): this router is included in the main app, which already
# serves GET /health from two other places — confirm the intended handler.
@router.get("/health")
async def health():
    """Plain-text liveness probe for the socket router."""
    return PlainTextResponse("ok")
43
+
44
# ---------- Socket.IO Events ----------
@sio.event
async def connect(sid, environ, auth):
    """On connect: stash the org id in the session and join its room.

    Clients that send no auth payload fall back to the "demo" room.
    """
    org_id = (auth or {}).get("orgId", "demo")
    await sio.save_session(sid, {"orgId": org_id})
    await sio.enter_room(sid, org_id)
    print(f"[socket] ✅ {sid} connected → room={org_id}")

@sio.event
async def disconnect(sid):
    """Log client disconnects."""
    print(f"[socket] ❌ {sid} disconnected")
app/service/industry_svc.py ADDED
@@ -0,0 +1,57 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Pure async wrappers around AnalyticsService – no quota, no DB.
3
+ """
4
+ from typing import Any, Dict, List, Optional
5
+ import pandas as pd
6
+ from app.engine.analytics import AnalyticsService
7
+
8
+ analytics = AnalyticsService()
9
+
10
# ------------------------------------------------------------------
# 1 EDA – full exploratory + industry auto-detect
# ------------------------------------------------------------------
async def eda(data: List[Dict], industry: Optional[str] = None) -> Dict[str, Any]:
    """Run full exploratory analysis; industry may be pre-detected or None."""
    return analytics.perform_eda(data, industry)

# ------------------------------------------------------------------
# 2 FORECAST – Prophet 30-day forward
# ------------------------------------------------------------------
async def forecast(data: List[Dict], date_column: str, value_column: str) -> Dict[str, Any]:
    """Forecast value_column over date_column; both are caller-specified."""
    return analytics.forecast_timeseries(data, date_column, value_column)

# ------------------------------------------------------------------
# 3 BASKET – market basket analysis
# ------------------------------------------------------------------
async def basket(data: List[Dict],
                 min_support: float = 0.01,
                 min_confidence: float = 0.3,
                 min_lift: float = 1.0) -> Dict[str, Any]:
    """Association-rule mining with the given support/confidence/lift floors."""
    df = pd.DataFrame(data)
    return analytics.perform_market_basket_analysis(df, min_support, min_confidence, min_lift)

# ------------------------------------------------------------------
# 4 CROSS-INDUSTRY INSIGHTS – one per endpoint
# ------------------------------------------------------------------
# NOTE(review): these wrappers call underscore-prefixed (private)
# AnalyticsService methods — consider exposing public wrappers on the
# service instead.
async def market_dynamics(data: List[Dict]) -> Dict[str, Any]:
    """Market-dynamics insight over ad-hoc records."""
    df = pd.DataFrame(data)
    return analytics._analyze_market_dynamics(df)

async def supply_chain(data: List[Dict]) -> Dict[str, Any]:
    """Supply-chain insight over ad-hoc records."""
    df = pd.DataFrame(data)
    return analytics._analyze_supply_chain(df)

async def customer_insights(data: List[Dict]) -> Dict[str, Any]:
    """Customer insight over ad-hoc records."""
    df = pd.DataFrame(data)
    return analytics._analyze_customer_insights(df)

async def operational_efficiency(data: List[Dict]) -> Dict[str, Any]:
    """Operational-efficiency insight over ad-hoc records."""
    df = pd.DataFrame(data)
    return analytics._analyze_operational_efficiency(df)

async def risk_assessment(data: List[Dict]) -> Dict[str, Any]:
    """Risk-pattern insight over ad-hoc records."""
    df = pd.DataFrame(data)
    return analytics._analyze_risk_patterns(df)

async def sustainability(data: List[Dict]) -> Dict[str, Any]:
    """Sustainability-metrics insight over ad-hoc records."""
    df = pd.DataFrame(data)
    return analytics._analyze_sustainability_metrics(df)
app/service/live_ingest.py ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json, pandas as pd, redis
2
+ from datetime import datetime
3
+ from app.engine.analytics import AnalyticsService
4
+ from app.redis_pool import redis_client
5
+
6
class LiveIngestService:
    """Buffers live 'sale' events for one org and periodically flushes an
    EDA snapshot into Redis (key ``live:<org_id>``, 300 s TTL)."""

    def __init__(self, org_id: str):
        self.org_id = org_id
        self.buffer: list[dict] = []        # pending raw sale payloads
        self.analytics = AnalyticsService()

    async def handle(self, msg: dict):
        """Accept one event; flush when the batch is large or stale."""
        if msg.get("event") != "sale":
            return
        self.buffer.append(msg["data"])
        if len(self.buffer) >= 100 or self._older_than_3s():
            await self._flush()

    async def _flush(self):
        """Run EDA on the buffered rows and cache the report in Redis."""
        if not self.buffer:
            return
        df = pd.DataFrame(self.buffer)
        # assumes each payload carries a parseable "timestamp" — TODO confirm
        df["timestamp"] = pd.to_datetime(df["timestamp"])
        industry = self._detect_industry(df)
        report = self.analytics.perform_eda(df.to_dict("records"), industry=industry)
        redis_client.setex(f"live:{self.org_id}", 300, json.dumps(report, default=str))
        self.buffer.clear()

    def _older_than_3s(self) -> bool:
        """True when the newest buffered event is more than 3 s old.

        Fix: the original used ``Timedelta.seconds``, which is only the
        seconds-within-a-day remainder (it wraps every 24 h and ignores
        whole days); ``total_seconds()`` measures real elapsed time.
        NOTE(review): assumes event timestamps are tz-aware UTC — a naive
        value would raise on subtraction; confirm the upstream format.
        """
        if not self.buffer:
            return False
        age = pd.Timestamp.utcnow() - pd.to_datetime(self.buffer[-1]["timestamp"])
        return age.total_seconds() > 3

    def _detect_industry(self, df: pd.DataFrame) -> str:
        """Heuristic industry guess from the columns present."""
        cols = set(df.columns)
        if {"product_id", "qty", "price", "total"}.issubset(cols):
            return "supermarket"
        if {"sku", "wholesale_price"}.issubset(cols):
            return "wholesale"
        return "retail"
app/tasks/ingest_worker.py ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import asyncio, json, redis, duckdb
3
+ from app.db import get_conn, ensure_raw_table
4
+ from app.ingest import ingest_dict
5
+
6
+ r = redis.from_url(os.getenv("REDIS_URL"))
7
+ STREAM_KEY = "pos_stream:{org_id}" # one stream per tenant
8
+
9
async def stream_consumer(org_id: str):
    """Continuously drain one tenant's Redis stream into DuckDB raw_rows.

    Fix: the original passed '$' to XREAD on every iteration, which means
    "only entries added after this call starts" — any entry that arrived
    while a previous batch was being processed was silently skipped. We
    now start at '$' once and advance a cursor to the last ingested ID.
    """
    conn = get_conn(org_id)
    ensure_raw_table(conn)
    stream = STREAM_KEY.format(org_id=org_id)
    last_id = "$"  # only entries newer than consumer start-up
    while True:
        msgs = r.xread({stream: last_id}, count=100, block=5000)
        if msgs:
            _, entries = msgs[0]
            for entry_id, data in entries:
                ingest_dict(org_id, json.loads(data[b'row']))
                last_id = entry_id  # resume after the newest processed entry
        await asyncio.sleep(1)  # 1 s micro-batch
app/tasks/kpi_logger.py ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import duckdb
2
+ from app.db import get_conn, ensure_kpi_log
3
+ from app.mapper import canonify_df # gives uniform DF
4
+ from app.engine.analytics import AnalyticsService
5
+ from app.utils.detect_industry import detect_industry
6
+
7
+ analytics = AnalyticsService()
8
+
9
def log_kpis_and_purge(org_id: str) -> None:
    """
    1. Canonify last 6 h of raw rows
    2. Compute KPIs
    3. Insert into kpi_log (history)
    4. Delete raw rows older than 6 h

    Fix: the connection is now closed in a ``finally`` block, so a failure
    in canonify/EDA/insert no longer leaks the DuckDB handle.
    """
    conn = get_conn(org_id)
    try:
        ensure_kpi_log(conn)

        df = canonify_df(org_id)
        if df.empty:
            return

        industry, _ = detect_industry(df)
        # Only the supermarket KPI bundle is logged; other industries yield {}.
        kpis = analytics.perform_eda(df.to_dict("records"), industry).get("supermarket_kpis", {})

        conn.execute(
            """INSERT INTO kpi_log(daily_sales, daily_qty, avg_basket,
                                   shrinkage, promo_lift, stock)
               VALUES (?,?,?,?,?,?)""",
            [
                kpis.get("daily_sales", 0),
                kpis.get("daily_qty", 0),
                kpis.get("avg_basket", 0),
                kpis.get("shrinkage_pct", 0),
                kpis.get("promo_lift_pct", 0),
                kpis.get("stock_on_hand", 0),
            ],
        )

        # purge raw buffer outside the 6 h analytics window
        conn.execute("DELETE FROM raw_rows WHERE ingested_at < now() - INTERVAL 6 HOUR")
        conn.commit()
    finally:
        conn.close()
app/tasks/purge.py ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ from app.db import get_conn, ensure_raw_table
2
def purge_old_raw(org_id: str, hours=6):
    """Delete raw_rows older than *hours* from the tenant's DuckDB buffer.

    The original SQL used ``INTERVAL ? HOURS``, but DuckDB does not accept a
    bind parameter inside an interval literal; ``to_hours(?)`` builds the
    same interval from a plain integer parameter. The connection is also
    closed even when the DELETE raises.
    """
    conn = get_conn(org_id)
    try:
        conn.execute(
            "DELETE FROM raw_rows WHERE ingested_at < now() - to_hours(?)",
            [hours],
        )
        conn.commit()
    finally:
        conn.close()
app/tasks/scheduler.py ADDED
@@ -0,0 +1,137 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ State-less scheduler – caller (Next-js) orchestrates storage & quota.
3
+ Only duty: run analytics on cron, return JSON.
4
+ """
5
+ import asyncio
6
+ import pandas as pd
7
+ from apscheduler.schedulers.asyncio import AsyncIOScheduler
8
+ from app.engine.analytics import AnalyticsService
9
+ from app.service.industry_svc import (eda, forecast, basket, market_dynamics,
10
+ supply_chain, customer_insights,
11
+ operational_efficiency, risk_assessment,
12
+ sustainability)
13
+ from app.utils.detect_industry import detect_industry
14
+ from app.utils.email import send_pdf_email
15
+ import os
16
+ from datetime import datetime
17
+ import aiohttp
18
+
19
+ sched = AsyncIOScheduler()
20
+
21
+ # ------------------------------------------------------------------
22
+ # 1 RUN ONE ANALYTIC – pure logic, no DB
23
+ # ------------------------------------------------------------------
24
+ async def run_analytic_job(org_id: str, analytic_type: str, **kwargs) -> dict:
25
+ """
26
+ 1. Canonify last 6 h of raw rows (any column names) via DuckDB
27
+ 2. Compute chosen analytic
28
+ 3. Log KPIs + purge old raw data
29
+ 4. Return shaped payload
30
+ """
31
+ from app.mapper import canonify_df # NEW: any-shape → canonical
32
+ from app.tasks.kpi_logger import log_kpis_and_purge # NEW: history & tidy
33
+
34
+ df = canonify_df(org_id)
35
+ if df.empty:
36
+ return {"error": "No recent data found"}
37
+
38
+ data = df.to_dict("records")
39
+ industry, _ = detect_industry(df)
40
+
41
+ match analytic_type:
42
+ case "eda":
43
+ result = await eda(data, industry)
44
+ case "forecast":
45
+ result = await forecast(data, kwargs["date_col"], kwargs["value_col"])
46
+ case "basket":
47
+ result = await basket(data, 0.01, 0.3, 1.0)
48
+ case "market-dynamics":
49
+ result = await market_dynamics(data)
50
+ case "supply-chain":
51
+ result = await supply_chain(data)
52
+ case "customer-insights":
53
+ result = await customer_insights(data)
54
+ case "operational-efficiency":
55
+ result = await operational_efficiency(data)
56
+ case "risk-assessment":
57
+ result = await risk_assessment(data)
58
+ case "sustainability":
59
+ result = await sustainability(data)
60
+ case _:
61
+ return {"error": "Unknown analytic"}
62
+
63
+ # ---------- NEW – history + disk tidy ----------
64
+ log_kpis_and_purge(org_id) # inserts KPIs & deletes raw > 6 h
65
+ # -------------------------------------------------
66
+ async with aiohttp.ClientSession() as session:
67
+ await session.post(
68
+ f"{os.getenv('NEXT_PUBLIC_ORIGIN')}/analytics/report/sync",
69
+ json={
70
+ "orgId": org_id,
71
+ "type": analytic_type,
72
+ "results": result,
73
+ "lastRun": datetime.utcnow().isoformat(),
74
+ },
75
+ headers={"x-api-key": os.getenv("ANALYTICS_KEY")},
76
+ )
77
+ # fire-and-forget email (caller decides storage)
78
+ pdf_url = f"{os.getenv('PUBLIC_URL', '')}/api/reports/{org_id}/{analytic_type}.pdf"
79
+ asyncio.create_task(send_pdf_email(org_id, f"{analytic_type} report", {"pdf": pdf_url, "data": result}))
80
+
81
+ return {"orgId": org_id, "analytic": analytic_type, "industry": industry, "results": result, "timestamp": datetime.utcnow().isoformat()}
82
+
83
+ # ------------------------------------------------------------------
84
+ # 2 APScheduler glue – unchanged
85
+ # ------------------------------------------------------------------
86
def add_job_to_scheduler(schedule: dict):
    """Register one APScheduler cron job per analytic in *schedule*.

    Supported frequencies all fire at 06:00: daily, weekly (Monday),
    monthly (1st of the month). Unknown frequencies register nothing.
    """
    triggers = {
        "daily": {"hour": 6, "minute": 0},
        "weekly": {"day_of_week": 0, "hour": 6, "minute": 0},
        "monthly": {"day": 1, "hour": 6, "minute": 0},
    }
    org_id = schedule["orgId"]
    cron = triggers.get(schedule["frequency"])
    for analytic in schedule["analytics"]:
        job_id = f"{schedule['id']}_{analytic}"
        if cron is not None:
            sched.add_job(run_analytic_job, "cron",
                          args=[org_id, analytic], id=job_id, **cron)
101
+
102
def remove_job_from_scheduler(schedule_id: str):
    """Drop every scheduled job whose id belongs to *schedule_id*."""
    stale_ids = [job.id for job in sched.get_jobs()
                 if job.id.startswith(schedule_id)]
    for job_id in stale_ids:
        sched.remove_job(job_id)
106
+
107
+ # ------------------------------------------------------------------
108
+ # 3 ENV-loader – unchanged
109
+ # ------------------------------------------------------------------
110
async def load_schedules():
    """Load schedule definitions from the SCHEDULES env var and register them.

    Delegates registration to ``add_job_to_scheduler`` so the cron wiring
    lives in exactly one place — the two hand-rolled copies had already
    drifted (this one built job ids from org_id, the other from
    ``schedule['id']``; passing ``id=org_id`` keeps the ids identical to
    the original ``f"{org_id}_{analytic}"``).
    """
    import json

    try:
        schedules = json.loads(os.getenv("SCHEDULES", "[]"))
    except Exception:
        schedules = []
    if not isinstance(schedules, list):
        schedules = []  # a bare object/string in SCHEDULES is not a schedule list

    for sch in schedules:
        add_job_to_scheduler({
            "id": sch["orgId"],
            "orgId": sch["orgId"],
            "frequency": sch.get("frequency", "daily"),
            "analytics": sch.get("analytics", ["eda"]),
        })
131
+
132
+ # ------------------------------------------------------------------
133
+ # 4 STARTER
134
+ # ------------------------------------------------------------------
135
def start_scheduler():
    """Schedule env-driven job loading, then start APScheduler.

    Must be called from inside a running event loop —
    ``asyncio.create_task`` requires one.
    """
    loader = load_schedules()
    asyncio.create_task(loader)
    sched.start()
app/utils/detect_industry.py ADDED
@@ -0,0 +1,116 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Enterprise industry detector – POS-schema aware.
3
+ Works with exports from Square, Lightspeed, Shopify POS, NCR, Oracle MICROS,
4
+ QuickBooks POS, Clover, Revel, Toast, etc.
5
+ """
6
+ import pandas as pd
7
+ from typing import Tuple
8
+
9
# ------------------------------------------------------------------
# 1 COLUMN ALIAS MAP – covers 99 % of real-world POS exports
# ------------------------------------------------------------------
# industry -> canonical field -> aliases under which that field appears in
# real exports. detect_industry() scores an industry by the fraction of its
# canonical fields for which at least one alias matches a column name
# (substring match, case-insensitive — see _find_col).
_ALIAS = {
    "supermarket": {
        "sku": ["barcode", "item_code", "plu", "product_id", "product_code", "item_id",
                "sku", "goods_code", "article_number", "artnum", "sale_id", "item_barcode",
                "product_barcode", "item_sku", "goods_id", "inventory_id", "merchandise_code"],
        "qty": ["qty", "quantity", "units", "stock", "quantity_sold", "qty_sold",
                "item_count", "unit_count", "pieces", "pcs", "amount_sold",
                "sold_qty", "sales_qty", "sold_quantity", "transaction_qty"],
        "price": ["unit_price", "price", "sell_price", "unit_sell", "selling_price",
                  "item_price", "product_price", "rate", "unit_cost", "cost_price",
                  "retail_price", "sales_price", "price_each", "unit_rate"],
        "total": ["total", "total_line", "line_total", "net_amount", "amount", "sales_amount",
                  "value", "extended_price", "total_price", "gross_amount", "total_amount",
                  "line_value", "transaction_total", "subtotal", "total_sales"],
        "transaction": ["transaction_id", "receipt_no", "ticket_no", "order_id", "sale_id",
                        "tran_id", "trans_id", "receipt_number", "invoice_no", "bill_no",
                        "ticket_id", "session_id", "pos_transaction_id", "order_number"],
        "store": ["store_id", "branch_code", "location_id", "outlet_id", "shop_id",
                  "branch_id", "terminal_id", "pos_id", "workstation_id", "station_id",
                  "store_code", "site_id", "warehouse_id", "depot_id"],
        "category": ["category", "cat", "department", "class", "sub_category", "group_name",
                     "product_group", "family", "section", "division", "category_name",
                     "item_category", "product_category", "group_code"],
        "expiry": ["expiry_date", "exp", "best_before", "use_by", "expiration_date",
                   "exp_date", "best_before_date", "shelf_life_date", "valid_until",
                   "expires_on", "expiry", "expiration"],
        "promo": ["promo", "promotion", "discount_code", "campaign", "is_promo",
                  "promotion_code", "disc_code", "offer_code", "special_code",
                  "promo_flag", "promotion_flag", "discount_flag", "is_discount"],
        "loss": ["loss_qty", "waste_qty", "shrinkage_qty", "damaged_qty", "spoiled_qty",
                 "expired_qty", "write_off_qty", "shrinkage", "waste", "damaged",
                 "loss", "shrinkage_units", "waste_units", "damaged_units", "spoiled_units"],
    },
    "healthcare": {
        "patient": ["patient_id", "patient_no", "mrn", "medical_record_number"],
        "treatment": ["treatment_cost", "procedure_cost", "bill_amount", "invoice_amount"],
        "diagnosis": ["diagnosis_code", "icd_code", "condition"],
        "drug": ["drug_name", "medication", "prescription"],
    },
    "wholesale": {
        "sku": ["sku", "item_code"],
        "wholesale_price": ["wholesale_price", "bulk_price", "trade_price"],
        "moq": ["moq", "min_order_qty", "minimum_order"],
    },
    "manufacturing": {
        "production": ["production_volume", "units_produced", "output_qty"],
        "defect": ["defect_rate", "rejection_rate", "scrap_qty"],
        "machine": ["machine_id", "line_id", "station_id"],
    },
    "retail": {
        "product": ["product_name", "product_id"],
        "sale": ["sale_date", "sale_amount"],
    },
}
66
+
67
+ # ------------------------------------------------------------------
68
+ # 2 HELPER – find first matching column
69
+ # ------------------------------------------------------------------
70
+ def _find_col(df: pd.DataFrame, keys) -> str | None:
71
+ cols = {c.lower() for c in df.columns}
72
+ for k in keys:
73
+ if any(k.lower() in col for col in cols):
74
+ return k
75
+ return None
76
+
77
+ # ------------------------------------------------------------------
78
+ # 3 SCORER – returns (industry, confidence 0-1)
79
+ # ------------------------------------------------------------------
80
def detect_industry(df: pd.DataFrame) -> Tuple[str, float]:
    """
    Detect industry from any POS / ERP / healthcare CSV.
    Returns (industry, confidence_score in [0, 1]).
    """
    if df.empty:
        return "retail", 0.0

    # fraction of each industry's canonical field groups present in df
    scores = {}
    for industry, groups in _ALIAS.items():
        hits = sum(1 for group_keys in groups.values() if _find_col(df, group_keys))
        scores[industry] = hits / len(groups)  # normalised 0-1

    industry = max(scores, key=scores.get) if scores else "retail"
    confidence = scores.get(industry, 0.0)

    # Tie-breaker: supermarket columns are a strict superset of retail's, so
    # when *retail* wins with a score supermarket matched, prefer supermarket.
    # (The original applied this override unconditionally, demoting any other
    # winner — e.g. healthcare — whenever supermarket and retail tied at all.)
    if industry == "retail" and scores.get("supermarket", 0) == scores.get("retail", 0):
        industry = "supermarket"
        confidence = scores.get("supermarket", confidence)

    return industry, confidence
105
+
106
+ # ------------------------------------------------------------------
107
+ # 4 SINGLE-USE HELPER – supermarket boolean
108
+ # ------------------------------------------------------------------
109
+ def is_supermarket(df: pd.DataFrame) -> bool:
110
+ """
111
+ Fast yes/no wrapper for downstream code that only cares
112
+ whether we treat this as a supermarket data set.
113
+ """
114
+ industry, confidence = detect_industry(df)
115
+ # be conservative: only return True if we are *sure*
116
+ return industry == "supermarket" and confidence >= 0.6
app/utils/email.py ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ # analytics-service/app/utils/email.py
2
+ from typing import Any
3
+
4
async def send_pdf_email(*_: Any, **__: Any) -> None:
    """Stub – replace with real e-mail logic later.

    Declared ``async`` so the scheduler's fire-and-forget
    ``asyncio.create_task(send_pdf_email(...))`` works: ``create_task``
    rejects the plain ``None`` a sync stub would return.
    Accepts and ignores any arguments; always returns None.
    """
    return None
data/duckdb/.gitkeep ADDED
File without changes
data/duckdb/schedules.json ADDED
File without changes
fly.toml ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # fly.toml app configuration file generated for mutsynchub on 2025-11-06T14:44:31Z
2
+ #
3
+ # See https://fly.io/docs/reference/configuration/ for information about how to use this file.
4
+ #
5
+
6
+ app = 'mutsynchub'
7
+ primary_region = 'iad'
8
+
9
+ [build]
10
+
11
+ [http_service]
12
+ internal_port = 8080
13
+ force_https = true
14
+ auto_stop_machines = 'stop'
15
+ auto_start_machines = true
16
+ min_machines_running = 0
17
+ processes = ['app']
18
+
19
+ [[vm]]
20
+ memory = '1gb'
21
+ cpu_kind = 'shared'
22
+ cpus = 1
23
+ memory_mb = 1024
requirements.txt ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Analytics Service dependencies
2
+ apscheduler>=3.10
3
+ pyarrow>=15.0
4
+ redis>=5.0
5
+ pandas>=2.2
6
+ fastapi>=0.111
7
+ uvicorn[standard]>=0.29
8
+ prophet==1.1.5
9
+ numpy>=1.24
10
+ scikit-learn>=1.3
11
+ scipy>=1.10
12
+ statsmodels>=0.14
13
+ networkx>=3.0
14
+ sqlalchemy[asyncio]>=2.0
15
+ asyncpg>=0.29 # async postgres driver
16
+ numpy<2.0
17
+ requests>=2.31
18
+ huggingface_hub>=0.20.0
19
+ aiohttp>=3.9.0
20
+ httpx>=0.27.0
21
+ python-multipart==0.0.6
22
+ pycryptodome==3.20.0
23
+ python-socketio[asyncio]>=5.11.0
scheduler_loop.py ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json, time, os, requests
2
+ from datetime import datetime, timedelta
3
+ from pathlib import Path
4
+
5
+ SCHEDULE_FILE = "/data/.schedules.json"
6
+ RUN_URL = "http://localhost:8000/analytics/run" # inside container
7
+
8
def tick():
    """Run every due schedule once and persist the updated nextRun stamps.

    Improvements: ``requests.post`` gets a timeout (a hung analytics
    endpoint would otherwise block the scheduler loop forever), and each
    analytic run is guarded individually so one failing POST does not
    skip the remaining analytics or the nextRun bump.
    """
    if not Path(SCHEDULE_FILE).exists():
        return
    with open(SCHEDULE_FILE) as f:
        schedules = json.load(f)

    now = datetime.utcnow().isoformat()
    for s in schedules:
        # ISO-8601 strings compare chronologically, so string <= is safe here
        if s["nextRun"] <= now:
            for analytic in s["analytics"]:
                try:
                    # call the same endpoint the UI uses
                    r = requests.post(
                        RUN_URL,
                        json={"analytic": analytic},
                        headers={"X-Data-Path": f"/data/{s['orgId']}/sales.parquet"},
                        timeout=300,
                    )
                    print(f"[scheduler] ran {analytic} for {s['orgId']} -> {r.status_code}")
                except requests.RequestException as e:
                    print(f"[scheduler] {analytic} for {s['orgId']} failed: {e}")
            # bump nextRun even on partial failure so we don't hot-loop
            s["nextRun"] = _next_run(s["frequency"]).isoformat()

    with open(SCHEDULE_FILE, "w") as f:
        json.dump(schedules, f, indent=2)
+
29
+ def _next_run(frequency: str) -> datetime:
30
+ now = datetime.utcnow()
31
+ if frequency == "daily": return now + timedelta(days=1)
32
+ if frequency == "weekly": return now + timedelta(weeks=1)
33
+ if frequency == "monthly": return now + timedelta(days=30)
34
+ return now
35
+
36
# Entry point: poll the schedule file forever at 1-minute granularity.
if __name__ == "__main__":
    while True:
        try:
            tick()
        except Exception as e:
            # Never let one bad tick kill the loop; log and keep polling.
            print("[scheduler] error:", e)
        time.sleep(60)  # 1-minute granularity