Spaces:

petermutwiri
/

analytics-engine

Paused

App Files Files Community

Peter Mutwiri commited on Nov 17, 2025

Commit

88fd951

2 Parent(s): ea24fcc 14651bf

Merge HF Space state

Browse files

Files changed (8) hide show

Dockerfile +1 -1
README.md +1 -0
app/db.py +85 -77
app/entity_detector.py +80 -0
app/main.py +3 -0
app/mapper.py +176 -128
app/routers/datasources.py +76 -19
requirements.txt +28 -13

Dockerfile CHANGED Viewed

@@ -37,4 +37,4 @@ COPY scheduler_loop.py /app/scheduler_loop.py
 ENV API_KEYS=dev-analytics-key-123
 # ---- 7. start both services -----------------------------------------------
-CMD sh -c "python -m uvicorn app.main:app --host 0.0.0.0 --port 8080 & python /app/scheduler_loop.py"

 ENV API_KEYS=dev-analytics-key-123
 # ---- 7. start both services -----------------------------------------------
+CMD sh -c "python -m uvicorn app.main:app --host 0.0.0.0 --port 7860 & python /app/scheduler_loop.py"

README.md CHANGED Viewed

@@ -5,6 +5,7 @@ colorFrom: blue
 colorTo: green
 sdk: docker
 pinned: false
 ---
 FastAPI analytics webhook container.

 colorTo: green
 sdk: docker
 pinned: false
+port: 8080
 ---
 FastAPI analytics webhook container.

app/db.py CHANGED Viewed

@@ -1,24 +1,21 @@
-import duckdb, os, pathlib, json
-from datetime import datetime
 from typing import Any, Dict, List
 DB_DIR = pathlib.Path("./data/duckdb")
 DB_DIR.mkdir(parents=True, exist_ok=True)
 def get_conn(org_id: str):
     """Get or create a DuckDB connection for an organization."""
     db_file = DB_DIR / f"{org_id}.duckdb"
     return duckdb.connect(str(db_file), read_only=False)
-# ------------------------------------------------------------
-# 🔹 Backward-compatible table for raw JSON ingestion
-# ------------------------------------------------------------
 def ensure_raw_table(conn):
-    """
-    Maintains legacy compatibility for ingestion from webhooks / file uploads.
-    """
     conn.execute("CREATE SCHEMA IF NOT EXISTS main")
     conn.execute("""
         CREATE TABLE IF NOT EXISTS main.raw_rows(
@@ -27,101 +24,112 @@ def ensure_raw_table(conn):
         )
     """)
-# ------------------------------------------------------------
-# 🔹 Flexible dynamic schema table creation
-# ------------------------------------------------------------
 def ensure_table(conn, table_name: str, sample_record: Dict[str, Any]):
     """
-    Ensures a DuckDB table exists with columns inferred from sample_record.
-    If new columns appear later, adds them automatically.
     """
     conn.execute("CREATE SCHEMA IF NOT EXISTS main")
     conn.execute(
         f"CREATE TABLE IF NOT EXISTS main.{table_name} ("
         "id UUID DEFAULT uuid(), "
         "_ingested_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP)"
     )
-    if not sample_record:
-        return
-    existing_cols = {r[0] for r in conn.execute(f"PRAGMA table_info('main.{table_name}')").fetchall()}
     for col, val in sample_record.items():
-        if col in existing_cols:
             continue
-        dtype = infer_duckdb_type(val)
-        print(f"[db] ➕ Adding new column '{col}:{dtype}' to main.{table_name}")
-        conn.execute(f"ALTER TABLE main.{table_name} ADD COLUMN {col} {dtype}")
-def infer_duckdb_type(value: Any) -> str:
-    """Infer a DuckDB-compatible column type from a Python value."""
-    if isinstance(value, bool):
-        return "BOOLEAN"
-    if isinstance(value, int):
-        return "BIGINT"
-    if isinstance(value, float):
-        return "DOUBLE"
-    if isinstance(value, datetime):
-        return "TIMESTAMP"
-    if isinstance(value, (dict, list)):
-        return "JSON"
-    return "VARCHAR"
-# ------------------------------------------------------------
-# 🔹 Insert records with auto-schema
-# ------------------------------------------------------------
 def insert_records(conn, table_name: str, records: List[Dict[str, Any]]):
     """
-    Insert records into the specified table.
-    Assumes ensure_table() has already been called.
     """
     if not records:
         return
-    cols = records[0].keys()
     placeholders = ", ".join(["?"] * len(cols))
     col_list = ", ".join(cols)
     insert_sql = f"INSERT INTO main.{table_name} ({col_list}) VALUES ({placeholders})"
-    values = [tuple(r.get(c) for c in cols) for r in records]
-    conn.executemany(insert_sql, values)
-    print(f"[db] ✅ Inserted {len(records)} rows into {table_name}")
-# ------------------------------------------------------------
-# 🔹 Unified bootstrap entrypoint
-# ------------------------------------------------------------
 def bootstrap(org_id: str, payload: Dict[str, Any]):
     """
-    Main entrypoint for ingestion.
-    Detects whether the payload contains:
-      - A single table (list of dicts)
-      - Multiple named tables (dict of lists)
-    Also logs the raw payload in main.raw_rows for lineage tracking.
     """
     conn = get_conn(org_id)
-    conn.execute("CREATE SCHEMA IF NOT EXISTS main")
     ensure_raw_table(conn)
-    # Log raw payload for debugging / lineage
-    conn.execute("INSERT INTO main.raw_rows (row_data) VALUES (?)", (json.dumps(payload),))
-    if isinstance(payload, dict) and "tables" in payload:
-        # multi-table mode
-        for table_name, rows in payload["tables"].items():
-            if not rows:
-                continue
-            ensure_table(conn, table_name, rows[0])
-            insert_records(conn, table_name, rows)
-    elif isinstance(payload, list):
-        # single-table mode (assume 'sales' as default)
-        ensure_table(conn, "sales", payload[0])
-        insert_records(conn, "sales", payload)
-    else:
-        print("[db] ⚠️ Unsupported payload shape")
-    conn.close()

+# app/db.py – BULLETPROOF VERSION
+import os
+import pathlib
+import json
+import duckdb
 from typing import Any, Dict, List
+from datetime import datetime
 DB_DIR = pathlib.Path("./data/duckdb")
 DB_DIR.mkdir(parents=True, exist_ok=True)
 def get_conn(org_id: str):
     """Get or create a DuckDB connection for an organization."""
     db_file = DB_DIR / f"{org_id}.duckdb"
     return duckdb.connect(str(db_file), read_only=False)
 def ensure_raw_table(conn):
+    """Creates audit trail table for raw JSON data."""
     conn.execute("CREATE SCHEMA IF NOT EXISTS main")
     conn.execute("""
         CREATE TABLE IF NOT EXISTS main.raw_rows(
         )
     """)
+def infer_duckdb_type(value: Any) -> str:
+    """Infer DuckDB type from Python value."""
+    if isinstance(value, bool):
+        return "BOOLEAN"
+    if isinstance(value, int):
+        return "BIGINT"
+    if isinstance(value, float):
+        return "DOUBLE"
+    if isinstance(value, datetime):
+        return "TIMESTAMP"
+    return "VARCHAR"
+# ✅ BULLETPROOF: Handles integer column names, None values, etc.
 def ensure_table(conn, table_name: str, sample_record: Dict[str, Any]):
     """
+    Ensures table exists with columns from sample_record.
+    SAFE: Converts int column names to strings, handles missing data.
     """
+    if not sample_record:
+        return
     conn.execute("CREATE SCHEMA IF NOT EXISTS main")
+    # Create base table with UUID and timestamp
     conn.execute(
         f"CREATE TABLE IF NOT EXISTS main.{table_name} ("
         "id UUID DEFAULT uuid(), "
         "_ingested_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP)"
     )
+    # ✅ SAFE: Get existing columns (handle int names from DB)
+    try:
+        existing_cols_raw = conn.execute(f"PRAGMA table_info('main.{table_name}')").fetchall()
+        existing_cols = {str(r[0]).lower() for r in existing_cols_raw}
+    except Exception as e:
+        print(f"[db] ⚠️ Could not get table info: {e}")
+        existing_cols = set()
+    # ✅ BULLETPROOF: Add missing columns with safe name handling
     for col, val in sample_record.items():
+        col_name = str(col).lower().strip()  # ✅ FORCE STRING
+        if col_name in existing_cols:
             continue
+        # Skip None/empty values which can't infer type
+        if val is None:
+            print(f"[db] ⚠️ Skipping column {col_name} (None value)")
+            continue
+        try:
+            dtype = infer_duckdb_type(val)
+            print(f"[db] ➕ Adding column '{col_name}:{dtype}' to main.{table_name}")
+            conn.execute(f"ALTER TABLE main.{table_name} ADD COLUMN {col_name} {dtype}")
+        except Exception as e:
+            print(f"[db] ❌ Failed to add column {col_name}: {e}")
+            # Continue with next column - don't crash
 def insert_records(conn, table_name: str, records: List[Dict[str, Any]]):
     """
+    Insert records into table with safe column handling.
     """
     if not records:
         return
+    # ✅ Get columns from first record (force string names)
+    cols = [str(k) for k in records[0].keys()]
     placeholders = ", ".join(["?"] * len(cols))
     col_list = ", ".join(cols)
     insert_sql = f"INSERT INTO main.{table_name} ({col_list}) VALUES ({placeholders})"
+    # Prepare values (handle missing keys)
+    values = []
+    for record in records:
+        row = []
+        for col in cols:
+            val = record.get(col)
+            # Convert dict/list to JSON string for DuckDB
+            if isinstance(val, (dict, list)):
+                val = json.dumps(val)
+            row.append(val)
+        values.append(tuple(row))
+    try:
+        conn.executemany(insert_sql, values)
+        print(f"[db] ✅ Inserted {len(records)} rows into {table_name}")
+    except Exception as e:
+        print(f"[db] ❌ Insert failed: {e}")
+        raise
+# ✅ PURE AUDIT FUNCTION - Does NOT create tables
 def bootstrap(org_id: str, payload: Dict[str, Any]):
     """
+    **ENTERPRISE-GRADE**: Only stores raw JSON for audit.
+    Does NOT create any tables. Schema evolution is canonify_df's job.
     """
     conn = get_conn(org_id)
     ensure_raw_table(conn)
+    try:
+        raw_json = json.dumps(payload) if not isinstance(payload, str) else payload
+        if raw_json and raw_json not in ("null", "[]", "{}"):
+            conn.execute("INSERT INTO main.raw_rows (row_data) VALUES (?)", (raw_json,))
+            print(f"[bootstrap] ✅ Audit stored: {len(raw_json)} bytes")
+    except Exception as e:
+        print(f"[bootstrap] ⚠️ Audit failed: {e}")
+    conn.close()

app/entity_detector.py ADDED Viewed

	@@ -0,0 +1,80 @@

+# app/entity_detector.py
+import pandas as pd
+from typing import Tuple
+# Entity-specific canonical schemas
+ENTITY_SCHEMAS = {
+    "sales": {
+        "indicators": ["timestamp", "total", "amount", "qty", "quantity", "sale_date", "transaction_id"],
+        "required_matches": 2,
+        "aliases": {
+            "timestamp": ["timestamp", "date", "sale_date", "created_at", "transaction_time"],
+            "product_id": ["sku", "barcode", "plu", "product_id", "item_code"],
+            "qty": ["qty", "quantity", "units", "pieces", "item_count"],
+            "total": ["total", "amount", "line_total", "sales_amount", "price"],
+            "store_id": ["store_id", "branch", "location", "outlet_id", "branch_code"],
+        }
+    },
+    "inventory": {
+        "indicators": ["stock", "quantity_on_hand", "reorder", "inventory", "current_stock", "warehouse_qty"],
+        "required_matches": 2,
+        "aliases": {
+            "product_id": ["sku", "barcode", "plu", "product_id", "item_code"],
+            "current_stock": ["stock", "quantity_on_hand", "qty_available", "current_quantity"],
+            "reorder_point": ["reorder_level", "min_stock", "reorder_point", "threshold"],
+            "supplier_id": ["supplier", "supplier_id", "vendor", "vendor_code"],
+            "last_stock_date": ["last_stock_date", "last_receipt", "last_updated"],
+        }
+    },
+    "customer": {
+        "indicators": ["customer_id", "email", "phone", "customer_name", "client_id", "loyalty_number"],
+        "required_matches": 2,
+        "aliases": {
+            "customer_id": ["customer_id", "client_id", "member_id", "loyalty_number", "phone"],
+            "full_name": ["customer_name", "full_name", "name", "client_name"],
+            "email": ["email", "email_address", "e_mail"],
+            "phone": ["phone", "phone_number", "mobile", "contact"],
+        }
+    },
+    "product": {
+        "indicators": ["product_name", "product_id", "sku", "category", "price", "cost", "unit_of_measure"],
+        "required_matches": 2,
+        "aliases": {
+            "product_id": ["sku", "barcode", "plu", "product_id", "item_code"],
+            "product_name": ["product_name", "name", "description", "item_name"],
+            "category": ["category", "department", "cat", "family", "classification"],
+            "unit_price": ["price", "unit_price", "selling_price", "retail_price"],
+            "cost_price": ["cost", "cost_price", "purchase_price", "wholesale_price"],
+        }
+    }
+}
+def detect_entity_type(df: pd.DataFrame) -> Tuple[str, float]:
+    """
+    AUTO-DETECT entity type from DataFrame columns.
+    Returns: (entity_type, confidence_score)
+    """
+    columns = {str(col).lower().strip() for col in df.columns}
+    scores = {}
+    for entity_type, config in ENTITY_SCHEMAS.items():
+        # Count matches between DataFrame columns and entity indicators
+        matches = sum(
+            1 for indicator in config["indicators"]
+            if any(indicator in col for col in columns)
+        )
+        # Calculate confidence (0.0 to 1.0)
+        confidence = min(matches / config["required_matches"], 1.0)
+        scores[entity_type] = confidence
+    # Return best match if confident enough
+    if scores:
+        best_entity = max(scores, key=scores.get)
+        confidence = scores[best_entity]
+        if confidence > 0.3:  # 30% threshold
+            return best_entity, confidence
+    # Default to sales if uncertain (most common)
+    return "sales", 0.0

app/main.py CHANGED Viewed

@@ -21,6 +21,9 @@ app = FastAPI(
     lifespan=lifespan
 )
 # ----------  Socket.IO Mount  ----------
 app.mount("/socket.io", socket.socket_app)

     lifespan=lifespan
 )
+@app.get("/")
+def read_root():
+    return {"status": "ok", "service": "analytics-engine"}
 # ----------  Socket.IO Mount  ----------
 app.mount("/socket.io", socket.socket_app)

app/mapper.py CHANGED Viewed

@@ -1,8 +1,11 @@
-import os, json, duckdb, pandas as pd
 from datetime import datetime, timedelta
 from app.db import get_conn, ensure_raw_table
-from app.utils.detect_industry import _ALIAS
 # ----------------------  Canonical schema base  ---------------------- #
 CANONICAL = {
@@ -18,171 +21,216 @@ CANONICAL = {
 ALIAS_FILE = "./db/alias_memory.json"
-def safe_str_transform(series: pd.Series) -> pd.Series:
-    """Apply .str.lower() & .str.strip() only if dtype is object/string."""
-    if pd.api.types.is_string_dtype(series):
-        return series.str.lower().str.strip()
-    return series
-# ----------------------  Alias memory helpers  ---------------------- #
 def load_dynamic_aliases() -> None:
-    """Load learned aliases and merge into CANONICAL."""
     if os.path.exists(ALIAS_FILE):
         try:
             with open(ALIAS_FILE) as f:
                 dynamic_aliases = json.load(f)
             for k, v in dynamic_aliases.items():
                 if k in CANONICAL:
-                    for alias in v:
-                        if alias not in CANONICAL[k]:
-                            CANONICAL[k].append(alias)
                 else:
                     CANONICAL[k] = v
         except Exception as e:
             print(f"[mapper] ⚠️ failed to load alias memory: {e}")
 def save_dynamic_aliases() -> None:
-    """Persist learned aliases for next runs."""
     os.makedirs(os.path.dirname(ALIAS_FILE), exist_ok=True)
     with open(ALIAS_FILE, "w") as f:
         json.dump(CANONICAL, f, indent=2)
-# ----------------------  Schema versioning helpers  ---------------------- #
-def ensure_schema_version(duck, df: pd.DataFrame) -> str:
     """
-    Ensure schema versioning and track evolution.
-    Returns the active canonical table name (e.g., main.canonical_v2).
-    """
-    duck.execute("CREATE SCHEMA IF NOT EXISTS main")
-    duck.execute("""
-        CREATE TABLE IF NOT EXISTS main.schema_versions (
-            version INTEGER PRIMARY KEY,
-            columns JSON,
-            created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
-        )
-    """)
-    latest = duck.execute("SELECT * FROM main.schema_versions ORDER BY version DESC LIMIT 1").fetchone()
-    new_signature = sorted(df.columns.tolist())
-    if latest:
-        latest_cols = sorted(json.loads(latest[1]))
-        if latest_cols == new_signature:
-            return f"main.canonical_v{latest[0]}"
-        else:
-            new_version = latest[0] + 1
-            duck.execute("INSERT INTO main.schema_versions (version, columns) VALUES (?, ?)",
-                         (new_version, json.dumps(new_signature)))
-            print(f"[schema] → new version detected: canonical_v{new_version}")
-            return f"main.canonical_v{new_version}"
-    else:
-        duck.execute("INSERT INTO main.schema_versions (version, columns) VALUES (?, ?)",
-                     (1, json.dumps(new_signature)))
-        print("[schema] → initialized canonical_v1")
-        return "main.canonical_v1"
-def reconcile_latest_schema(duck):
-    """
-    Merge all canonical_v* tables into main.canonical_latest
-    preserving new columns and filling missing values with NULL.
-    """
-    tables = [r[0] for r in duck.execute("""
-        SELECT table_name FROM information_schema.tables
-        WHERE table_name LIKE 'canonical_v%'
-    """).fetchall()]
-    if not tables:
-        return
-    union_query = " UNION ALL ".join([f"SELECT * FROM {t}" for t in tables])
-    duck.execute("CREATE OR REPLACE TABLE main.canonical_latest AS " + union_query)
-    print(f"[schema] ✅ reconciled {len(tables)} schema versions → canonical_latest")
-# ----------------------  Canonify core logic  ---------------------- #
-def canonify_df(org_id: str, hours_window: int = 24) -> pd.DataFrame:
-    """
-    Normalize, version, and persist canonical data snapshot for org_id.
     """
     load_dynamic_aliases()
     conn = get_conn(org_id)
     ensure_raw_table(conn)
-    # --------------------------
-    # ⏱  Safe timestamp filtering
-    # --------------------------
     try:
-        # Compute cutoff in Python to avoid parameter placeholders inside INTERVAL
-        cutoff = datetime.now() - timedelta(hours=hours_window)
-        cutoff_str = cutoff.strftime("%Y-%m-%d %H:%M:%S")
-        rows = conn.execute(
-            f"""
-            SELECT row_data
-            FROM raw_rows
-            WHERE strptime(json_extract(row_data, '$.timestamp'), '%Y-%m-%d %H:%M:%S')
-                  >= TIMESTAMP '{cutoff_str}'
-            """
-        ).fetchall()
     except Exception as e:
-        print(f"[canonify] ⚠️ fallback to all rows due to timestamp parse error: {e}")
-        rows = conn.execute("SELECT row_data FROM raw_rows").fetchall()
     if not rows:
-        print("[canonify] no rows to process")
-        return pd.DataFrame()
-    # --------------------------
-    # 🧩 DataFrame normalization
-    # --------------------------
-    raw = pd.DataFrame([json.loads(r[0]) for r in rows])
-    raw.columns = safe_str_transform(raw.columns)
-    # Flexible alias mapping
     mapping = {}
     for canon, aliases in CANONICAL.items():
-        for col in raw.columns:
-            if any(a in col for a in aliases):
                 mapping[col] = canon
                 break
-    # ���� Learn new aliases dynamically
-    for col in raw.columns:
-        if col not in sum(CANONICAL.values(), []):
-            for canon in CANONICAL.keys():
-                if canon in col and col not in CANONICAL[canon]:
-                    CANONICAL[canon].append(col)
     save_dynamic_aliases()
-    # Apply canonical renaming
-    renamed = raw.rename(columns=mapping)
     cols = [c for c in CANONICAL.keys() if c in renamed.columns]
     df = renamed[cols].copy() if cols else renamed.copy()
-    # 🔢 Normalize datatypes
-    if "timestamp" in df:
-        df["timestamp"] = pd.to_datetime(df["timestamp"], errors="coerce")
-    if "expiry_date" in df:
-        df["expiry_date"] = pd.to_datetime(df["expiry_date"], errors="coerce").dt.date
-    if "promo_flag" in df:
-        df["promo_flag"] = df["promo_flag"].astype(str).isin({"1", "true", "t", "yes"})
-    for col in ("qty", "total"):
-        if col in df:
-            df[col] = pd.to_numeric(df[col], errors="coerce").fillna(0)
-    # --------------------------
-    # 🪣 Schema versioning + storage
-    # --------------------------
     os.makedirs("./db", exist_ok=True)
     duck = duckdb.connect(f"./db/{org_id}.duckdb")
-    table_name = ensure_schema_version(duck, df)
-    duck.execute(f"CREATE TABLE IF NOT EXISTS {table_name} AS SELECT * FROM df LIMIT 0")
-    duck.execute(f"INSERT INTO {table_name} SELECT * FROM df")
-    # 🧩 Always refresh canonical_latest for unified analytics
-    reconcile_latest_schema(duck)
     duck.close()
-    print(f"[canonify] ✅ canonical snapshot updated for {org_id}")
-    return df

+# app/mapper.py – BULLETPROOF VERSION
+import os
+import json
+import duckdb
+import pandas as pd
 from datetime import datetime, timedelta
 from app.db import get_conn, ensure_raw_table
+from app.utils.detect_industry import _ALIAS, detect_industry
 # ----------------------  Canonical schema base  ---------------------- #
 CANONICAL = {
 ALIAS_FILE = "./db/alias_memory.json"
+def map_pandas_to_duck(col: str, series: pd.Series) -> str:
+    if pd.api.types.is_bool_dtype(series):     return "BOOLEAN"
+    if pd.api.types.is_integer_dtype(series):  return "BIGINT"
+    if pd.api.types.is_float_dtype(series):    return "DOUBLE"
+    if pd.api.types.is_datetime64_any_dtype(series): return "TIMESTAMP"
+    return "VARCHAR"
+# ----------  INDUSTRY DETECTION (uses centralized detect_industry) ---------- #
+def ensure_canonical_table(duck: duckdb.DuckDBPyConnection, df: pd.DataFrame) -> str:
+    """
+    Creates single canonical table and adds missing columns dynamically.
+    BULLETPROOF: Handles int column names, missing columns, race conditions.
+    """
+    table_name = "main.canonical"
+    # Create base table if doesn't exist
+    duck.execute(f"""
+        CREATE TABLE IF NOT EXISTS {table_name} (
+            id UUID DEFAULT uuid(),
+            _ingested_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
+        )
+    """)
+    # Get existing columns (lowercase for comparison)
+    existing_cols_raw = duck.execute(f"PRAGMA table_info('{table_name}')").fetchall()
+    existing_cols = {str(r[0]).lower() for r in existing_cols_raw}
+    # ✅ BULLETPROOF: Add missing columns with safe name handling
+    for col in df.columns:
+        col_name = str(col).lower().strip()  # ✅ FORCE STRING
+        if col_name not in existing_cols:
+            try:
+                dtype = map_pandas_to_duck(col_name, df[col])
+                print(f"[mapper] ➕ Adding column '{col_name}:{dtype}'")
+                duck.execute(f"ALTER TABLE {table_name} ADD COLUMN {col_name} {dtype}")
+            except Exception as e:
+                print(f"[mapper] ⚠️ Skipping column {col_name}: {e}")
+    return table_name
+# ----------  Alias Memory ---------- #
 def load_dynamic_aliases() -> None:
     if os.path.exists(ALIAS_FILE):
         try:
             with open(ALIAS_FILE) as f:
                 dynamic_aliases = json.load(f)
             for k, v in dynamic_aliases.items():
                 if k in CANONICAL:
+                    CANONICAL[k].extend([a for a in v if a not in CANONICAL[k]])
                 else:
                     CANONICAL[k] = v
         except Exception as e:
             print(f"[mapper] ⚠️ failed to load alias memory: {e}")
 def save_dynamic_aliases() -> None:
     os.makedirs(os.path.dirname(ALIAS_FILE), exist_ok=True)
     with open(ALIAS_FILE, "w") as f:
         json.dump(CANONICAL, f, indent=2)
+# ----------  Main Canonify Function (ENTERPRISE-GRADE) ---------- #
+def canonify_df(org_id: str, hours_window: int = 24) -> tuple[pd.DataFrame, str, float]:
     """
+    Enterprise ingestion pipeline:
+    - Accepts ANY raw data shape
+    - Forces safe column names (handles int, None, etc.)
+    - Auto-detects industry
+    - Dynamically evolves schema
+    - Returns (df, industry, confidence)
     """
     load_dynamic_aliases()
     conn = get_conn(org_id)
     ensure_raw_table(conn)
+    # 1) Pull raw audit data
     try:
+        rows = conn.execute("""
+            SELECT row_data FROM main.raw_rows
+            WHERE row_data IS NOT NULL
+            AND LENGTH(CAST(row_data AS TEXT)) > 0
+        """).fetchall()
     except Exception as e:
+        print(f"[canonify] SQL read error: {e}")
+        rows = []
     if not rows:
+        print("[canonify] no audit rows found")
+        return pd.DataFrame(), "unknown", 0.0
+    # 2) Parse JSON safely (handles both string and parsed objects)
+    parsed = []
+    malformed_count = 0
+    for r in rows:
+        raw = r[0]
+        if not raw:
+            malformed_count += 1
+            continue
+        try:
+            # ✅ Handle pre-parsed objects from Redis
+            if isinstance(raw, (dict, list)):
+                obj = raw
+            else:
+                # ✅ Parse string JSON
+                obj = json.loads(str(raw))
+        except Exception:
+            malformed_count += 1
+            continue
+        # ✅ Extract rows from various payload formats
+        if isinstance(obj, dict):
+            if "rows" in obj and isinstance(obj["rows"], list):
+                parsed.extend(obj["rows"])
+            elif "data" in obj and isinstance(obj["data"], list):
+                parsed.extend(obj["data"])
+            elif "tables" in obj and isinstance(obj["tables"], dict):
+                for table_rows in obj["tables"].values():
+                    if isinstance(table_rows, list):
+                        parsed.extend(table_rows)
+            else:
+                parsed.append(obj)
+        elif isinstance(obj, list):
+            parsed.extend(obj)
+        else:
+            malformed_count += 1
+    if malformed_count:
+        print(f"[canonify] skipped {malformed_count} malformed rows")
+    if not parsed:
+        print("[canonify] no valid data after parsing")
+        return pd.DataFrame(), "unknown", 0.0
+    # 3) ✅ BULLETPROOF: Force all column names to strings
+    df = pd.DataFrame(parsed)
+    df.columns = [str(col).lower().strip() for col in df.columns]
+    # ✅ Remove duplicate columns (can happen with messy data)
+    df = df.loc[:, ~df.columns.duplicated()]
+    # 4) Map to canonical schema
     mapping = {}
     for canon, aliases in CANONICAL.items():
+        for col in df.columns:
+            # ✅ SAFE: Ensure aliases are strings
+            if any(str(alias).lower() in str(col).lower() for alias in aliases):
                 mapping[col] = canon
                 break
+    # ✅ Learn new aliases
+    for col in df.columns:
+        for canon in CANONICAL.keys():
+            if str(canon).lower() in str(col).lower() and col not in CANONICAL[canon]:
+                CANONICAL[canon].append(col)
     save_dynamic_aliases()
+    renamed = df.rename(columns=mapping)
     cols = [c for c in CANONICAL.keys() if c in renamed.columns]
     df = renamed[cols].copy() if cols else renamed.copy()
+    # 5) Type conversions (best effort)
+    try:
+        if "timestamp" in df:
+            df["timestamp"] = pd.to_datetime(df["timestamp"], errors="coerce")
+        if "expiry_date" in df:
+            df["expiry_date"] = pd.to_datetime(df["expiry_date"], errors="coerce").dt.date
+        if "promo_flag" in df:
+            df["promo_flag"] = df["promo_flag"].astype(str).isin({"1", "true", "t", "yes"})
+        for col in ("qty", "total"):
+            if col in df:
+                df[col] = pd.to_numeric(df[col], errors="coerce").fillna(0)
+    except Exception as e:
+        print(f"[canonify] Type conversion warning (non-critical): {e}")
+    # 6) ✅ Industry detection
+    industry, confidence = detect_industry(df)
+    print(f"[canonify] 🎯 Industry: {industry} ({confidence:.1%} confidence)")
+    # 7) Dynamic schema evolution
     os.makedirs("./db", exist_ok=True)
     duck = duckdb.connect(f"./db/{org_id}.duckdb")
+    table_name = ensure_canonical_table(duck, df)
+    # ✅ SAFE INSERT: Match columns explicitly
+    if not df.empty:
+        # Get current table columns
+        table_info = duck.execute(f"PRAGMA table_info('{table_name}')").fetchall()
+        table_cols = [str(r[0]) for r in table_info]  # ✅ FORCE STRING
+        # Only insert columns that exist in table
+        df_to_insert = df[[col for col in df.columns if col in table_cols]]
+        if not df_to_insert.empty:
+            cols_str = ", ".join(df_to_insert.columns)
+            placeholders = ", ".join(["?"] * len(df_to_insert.columns))
+            try:
+                duck.executemany(
+                    f"INSERT INTO {table_name} ({cols_str}) VALUES ({placeholders})",
+                    df_to_insert.values.tolist()
+                )
+                print(f"[canonify] ✅ Inserted {len(df_to_insert)} rows")
+            except Exception as e:
+                print(f"[canonify] ❌ Insert failed: {e}")
+                # Continue anyway - data quality issues shouldn't crash pipeline
     duck.close()
+    print(f"[canonify] ✅ Pipeline complete for {org_id}")
+    return df, industry, confidence

app/routers/datasources.py CHANGED Viewed

@@ -69,10 +69,11 @@ async def create_source(
 # =======================================================================
 # 2️⃣  SMART JSON ENDPOINT – fully schema-agnostic and multi-table aware
 # =======================================================================
 class JsonPayload(BaseModel):
     config: Dict[str, Any]
-    data: Union[List[Any], Dict[str, Any]]  # flexible: list or { "tables": {...} }
 @router.post("/datasources/json")
 async def create_source_json(
@@ -83,35 +84,91 @@ async def create_source_json(
     _: str = Depends(verify_key),
 ):
     """
-    Accepts structured JSON (list or multi-table dict) from n8n, Render jobs, or APIs.
-    Automatically evolves schemas, stores data, detects industry, and broadcasts live rows.
     """
     try:
         if not payload or not payload.data:
-            raise HTTPException(status_code=400, detail="Missing payload data")
-        # 💾 Flexible insertion – handles one or multiple tables
         bootstrap(orgId, payload.data)
-        # 🧭 Canonical normalization (only if “sales” or compatible table exists)
-        df = canonify_df(orgId)
-        industry, confidence = detect_industry(df)
-        # 🎯 Preview last few normalized rows
-        rows = df.head(3).to_dict("records") if not df.empty else []
-        await sio.emit("datasource:new-rows", {"rows": rows}, room=orgId)
         return JSONResponse(
             content={
                 "id": sourceId,
                 "status": "processed",
                 "industry": industry,
-                "confidence": confidence,
-                "recentRows": rows,
-                "message": "✅ Data ingested successfully",
             }
         )
     except Exception as e:
-        print(f"[datasources/json] ❌ ingestion error: {e}")
-        raise HTTPException(status_code=500, detail=str(e))

 # =======================================================================
 # 2️⃣  SMART JSON ENDPOINT – fully schema-agnostic and multi-table aware
 # =======================================================================
+# app/routers/datasources.py
 class JsonPayload(BaseModel):
     config: Dict[str, Any]
+    data: Union[List[Any], Dict[str, Any]]  # Flexible: list or { "tables": {...} }
 @router.post("/datasources/json")
 async def create_source_json(
     _: str = Depends(verify_key),
 ):
     """
+    Enterprise ingestion endpoint:
+    - Stores raw audit trail
+    - Normalizes to canonical schema
+    - Auto-detects industry
+    - Broadcasts real-time updates
+    - Returns comprehensive metadata
     """
     try:
+        # ✅ Validate payload
         if not payload or not payload.data:
+            raise HTTPException(
+                status_code=400,
+                detail="Missing payload.data. Expected list or dict."
+            )
+        # 1. 💾 Store raw data for audit & lineage
         bootstrap(orgId, payload.data)
+        print(f"[api/json] ✅ Raw data stored for org: {orgId}")
+        # 2. 🧭 Normalize schema + auto-detect industry (single pass)
+        #    Returns: (normalized_df, industry_name, confidence_score)
+        df, industry, confidence = canonify_df(orgId)
+        print(f"[api/json] 🎯 Industry detected: {industry} ({confidence:.1%})")
+        # 3. 🎯 Prepare preview for real-time broadcast
+        # Convert DataFrame to JSON-safe format
+        preview_df = df.head(3).copy()
+        for col in preview_df.columns:
+           if pd.api.types.is_datetime64_any_dtype(preview_df[col]):
+                preview_df[col] = preview_df[col].dt.strftime('%Y-%m-%d %H:%M:%S')
+           elif pd.api.types.is_timedelta64_dtype(preview_df[col]):
+               preview_df[col] = preview_df[col].astype(str)
+        preview_rows = preview_df.to_dict("records") if not preview_df.empty else []
+        # 4. 📡 Broadcast to connected dashboards
+        await sio.emit(
+            "datasource:new-rows",
+            {
+                "rows": preview_rows,
+                "industry": industry,
+                "confidence": confidence,
+                "totalRows": len(df),
+                "datasourceId": sourceId,
+            },
+            room=orgId
+        )
+        # 5. ✅ Return comprehensive response
         return JSONResponse(
+            status_code=200,
             content={
                 "id": sourceId,
                 "status": "processed",
                 "industry": industry,
+                "confidence": round(confidence, 4),
+                "recentRows": preview_rows,
+                "message": "✅ Data ingested and normalized successfully",
+                "rowsProcessed": len(df),
+                "schemaColumns": list(df.columns) if not df.empty else [],
+                "processingTimeMs": 0, # You can add timing if needed
             }
         )
+    except HTTPException:
+        raise  # Re-raise FastAPI errors as-is
+    except pd.errors.EmptyDataError:
+        print(f"[api/json] ⚠️ Empty data for org: {orgId}")
+        return JSONResponse(
+            status_code=200,  # Not an error - just no data
+            content={
+                "id": sourceId,
+                "status": "no_data",
+                "industry": "unknown",
+                "confidence": 0.0,
+                "message": "⚠️ No valid data rows found",
+                "rowsProcessed": 0,
+            }
+        )
     except Exception as e:
+        print(f"[api/json] ❌ Unexpected error: {e}")
+        raise HTTPException(
+            status_code=500,
+            detail=f"Ingestion pipeline failed: {str(e)}"
+        )

requirements.txt CHANGED Viewed

@@ -1,23 +1,38 @@
-# Analytics Service dependencies
-apscheduler>=3.10
-pyarrow>=15.0
-redis>=5.0
-pandas>=2.2
 fastapi>=0.111
 uvicorn[standard]>=0.29
-prophet==1.1.5
-numpy>=1.24
-scikit-learn>=1.3
 scipy>=1.10
 statsmodels>=0.14
 networkx>=3.0
-sqlalchemy[asyncio]>=2.0
-asyncpg>=0.29          # async postgres driver
-numpy<2.0
 requests>=2.31
-huggingface_hub>=0.20.0
 aiohttp>=3.9.0
 httpx>=0.27.0
 python-multipart==0.0.6
 pycryptodome==3.20.0
-python-socketio[asyncio]>=5.11.0

+# Core API
 fastapi>=0.111
 uvicorn[standard]>=0.29
+# Data Processing & Analytics
+duckdb>=0.10.3
+pandas>=2.2
+pyarrow>=15.0
+numpy>=1.24,<2.0
 scipy>=1.10
+scikit-learn>=1.3
 statsmodels>=0.14
 networkx>=3.0
+prophet>=1.1.5
+# Local LLM (Free GPU)
+torch==2.2.0
+transformers==4.40.0
+accelerate==0.28.0
+sentence-transformers==2.7.0
+# Redis Bridge (Upstash)
+redis==5.0.0
+qstash>=2.0.0,<3.0.0  # <-- ADDED VERSION PIN
+# HTTP Clients
 requests>=2.31
 aiohttp>=3.9.0
 httpx>=0.27.0
+# Utilities
+huggingface_hub>=0.20.0
 python-multipart==0.0.6
 pycryptodome==3.20.0
+python-socketio[asyncio]>=5.11.0
+asyncpg>=0.29
+apscheduler>=3.10
+sqlalchemy[asyncio]>=2.0