petermutwiri committed on
Commit
1703bb3
·
verified ·
1 Parent(s): f87a9e1

Update app/mapper.py

Browse files
Files changed (1) hide show
  1. app/mapper.py +71 -113
app/mapper.py CHANGED
@@ -1,4 +1,4 @@
1
- # app/mapper.py – FIXED WITH INDUSTRY DETECTION
2
  import os
3
  import json
4
  import duckdb
@@ -28,44 +28,39 @@ def map_pandas_to_duck(col: str, series: pd.Series) -> str:
28
  if pd.api.types.is_datetime64_any_dtype(series): return "TIMESTAMP"
29
  return "VARCHAR"
30
 
31
- def safe_str_transform(s: pd.Series) -> pd.Series:
32
- if pd.api.types.is_string_dtype(s):
33
- return s.str.lower().str.strip()
34
- return s
35
-
36
- def sql(conn, stmt: str, *args):
37
- """Centralised parameter binding β†’ no more int-vs-tuple mistakes."""
38
- return conn.execute(stmt, args).fetchall()
39
-
40
- def add_column_if_not_exists(duck: duckdb.DuckDBPyConnection, table: str, col: str, dtype: str) -> None:
41
- existing = {r[0].lower() for r in duck.execute(f"PRAGMA table_info('{table}')").fetchall()}
42
- if col.lower() not in existing:
43
- duck.execute(f"ALTER TABLE {table} ADD COLUMN {col} {dtype}")
44
- print(f"[schema] βž• added {col}:{dtype} to {table}")
45
-
46
- # ---------- INDUSTRY DETECTION INTEGRATION ---------- #
47
- def detect_industry_from_df(df: pd.DataFrame) -> tuple[str, float]:
48
  """
49
- Auto-detect industry based on column patterns using _ALIAS.
50
- Returns: (industry_name, confidence_score)
51
  """
52
- if df.empty:
53
- return "unknown", 0.0
54
 
55
- cols = set(df.columns.str.lower())
56
- scores = {}
 
 
 
 
 
57
 
58
- for industry, aliases in _ALIAS.items():
59
- matches = sum(1 for alias in aliases if any(alias in col for col in cols))
60
- scores[industry] = min(matches / len(aliases), 1.0) if aliases else 0
61
 
62
- # Get best match
63
- best_industry = max(scores, key=scores.get)
64
- confidence = scores[best_industry]
 
 
 
 
 
 
 
65
 
66
- return best_industry, confidence
67
 
68
- # ---------- Alias Memory (no changes) ---------- #
69
  def load_dynamic_aliases() -> None:
70
  if os.path.exists(ALIAS_FILE):
71
  try:
@@ -84,42 +79,12 @@ def save_dynamic_aliases() -> None:
84
  with open(ALIAS_FILE, "w") as f:
85
  json.dump(CANONICAL, f, indent=2)
86
 
87
- # ---------- Dynamic Schema Evolution ---------- #
88
- def ensure_canonical_table(duck: duckdb.DuckDBPyConnection, df: pd.DataFrame) -> str:
89
- """
90
- Single canonical table that evolves dynamically.
91
- Adds missing columns on-the-fly without creating new versions.
92
- """
93
- table_name = "main.canonical"
94
-
95
- # Ensure base table exists
96
- duck.execute(f"""
97
- CREATE TABLE IF NOT EXISTS {table_name} (
98
- id UUID DEFAULT uuid(),
99
- _ingested_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
100
- )
101
- """)
102
-
103
- # Get existing columns
104
- existing_cols = {r[0].lower() for r in duck.execute(f"PRAGMA table_info('{table_name}')").fetchall()}
105
-
106
- # Add missing columns dynamically
107
- for col in df.columns:
108
- if col.lower() not in existing_cols:
109
- dtype = map_pandas_to_duck(col, df[col])
110
- print(f"[mapper] βž• Adding new column '{col}:{dtype}' to {table_name}")
111
- duck.execute(f"ALTER TABLE {table_name} ADD COLUMN {col} {dtype}")
112
-
113
- return table_name
114
-
115
- # ---------- Main Canonify Function (WITH INDUSTRY DETECTION) ---------- #
116
- # app/mapper.py - FIX with bulletproof error handling
117
-
118
  def canonify_df(org_id: str, hours_window: int = 24) -> tuple[pd.DataFrame, str, float]:
119
  """
120
- Enterprise-grade normalization:
121
- - Pulls raw audit data
122
- - Safely parses JSON
123
  - Auto-detects industry
124
  - Dynamically evolves schema
125
  - Returns (df, industry, confidence)
@@ -128,7 +93,7 @@ def canonify_df(org_id: str, hours_window: int = 24) -> tuple[pd.DataFrame, str,
128
  conn = get_conn(org_id)
129
  ensure_raw_table(conn)
130
 
131
- # βœ… SAFE: Handle both string and parsed objects from Redis
132
  try:
133
  rows = conn.execute("""
134
  SELECT row_data FROM main.raw_rows
@@ -143,42 +108,39 @@ def canonify_df(org_id: str, hours_window: int = 24) -> tuple[pd.DataFrame, str,
143
  print("[canonify] no audit rows found")
144
  return pd.DataFrame(), "unknown", 0.0
145
 
146
- # βœ… SAFE: Parse JSON with type checking
147
  parsed = []
148
  malformed_count = 0
149
 
150
  for r in rows:
151
  raw = r[0]
152
 
153
- # Handle both string and parsed object
154
  if not raw:
155
  malformed_count += 1
156
  continue
157
 
158
  try:
159
- # If it's already parsed (object), use it directly
160
  if isinstance(raw, (dict, list)):
161
  obj = raw
162
  else:
163
- # If it's a string, parse it
164
  obj = json.loads(str(raw))
165
  except Exception:
166
  malformed_count += 1
167
  continue
168
 
169
- # βœ… SAFE: Extract data from various payload shapes
170
  if isinstance(obj, dict):
171
  if "rows" in obj and isinstance(obj["rows"], list):
172
  parsed.extend(obj["rows"])
173
  elif "data" in obj and isinstance(obj["data"], list):
174
  parsed.extend(obj["data"])
175
  elif "tables" in obj and isinstance(obj["tables"], dict):
176
- # Flatten multi-table into single list for canonical
177
  for table_rows in obj["tables"].values():
178
  if isinstance(table_rows, list):
179
  parsed.extend(table_rows)
180
  else:
181
- # Single record dict
182
  parsed.append(obj)
183
  elif isinstance(obj, list):
184
  parsed.extend(obj)
@@ -186,38 +148,33 @@ def canonify_df(org_id: str, hours_window: int = 24) -> tuple[pd.DataFrame, str,
186
  malformed_count += 1
187
 
188
  if malformed_count:
189
- print(f"[canonify] skipped {malformed_count} malformed audit rows")
190
 
191
  if not parsed:
192
  print("[canonify] no valid data after parsing")
193
  return pd.DataFrame(), "unknown", 0.0
194
 
195
- # βœ… Create DataFrame with SAFE column names
196
  df = pd.DataFrame(parsed)
197
-
198
- # Handle empty DataFrame
199
- if df.empty:
200
- print("[canonify] DataFrame is empty")
201
- return pd.DataFrame(), "unknown", 0.0
202
 
203
- # βœ… FIX: Ensure all column names are strings
204
- df.columns = df.columns.astype(str).str.lower().str.strip()
205
 
206
- # βœ… Map columns to canonical names
207
  mapping = {}
208
  for canon, aliases in CANONICAL.items():
209
  for col in df.columns:
210
- # SAFE alias matching
211
  if any(str(alias).lower() in str(col).lower() for alias in aliases):
212
  mapping[col] = canon
213
  break
214
 
215
- # βœ… Learn new aliases dynamically
216
  for col in df.columns:
217
- if col not in [str(a).lower() for sublist in CANONICAL.values() for a in sublist]:
218
- for canon in CANONICAL.keys():
219
- if str(canon).lower() in str(col).lower() and col not in CANONICAL[canon]:
220
- CANONICAL[canon].append(col)
221
 
222
  save_dynamic_aliases()
223
 
@@ -225,7 +182,7 @@ def canonify_df(org_id: str, hours_window: int = 24) -> tuple[pd.DataFrame, str,
225
  cols = [c for c in CANONICAL.keys() if c in renamed.columns]
226
  df = renamed[cols].copy() if cols else renamed.copy()
227
 
228
- # βœ… Type conversions with error handling
229
  try:
230
  if "timestamp" in df:
231
  df["timestamp"] = pd.to_datetime(df["timestamp"], errors="coerce")
@@ -237,41 +194,42 @@ def canonify_df(org_id: str, hours_window: int = 24) -> tuple[pd.DataFrame, str,
237
  if col in df:
238
  df[col] = pd.to_numeric(df[col], errors="coerce").fillna(0)
239
  except Exception as e:
240
- print(f"[canonify] Type conversion warning: {e}")
241
 
242
- # βœ… FIX: Industry detection with actual column matching
243
  industry, confidence = detect_industry(df)
244
-
245
- # βœ… Dynamic schema evolution (NO MORE versioning)
 
246
  os.makedirs("./db", exist_ok=True)
247
  duck = duckdb.connect(f"./db/{org_id}.duckdb")
248
 
249
  table_name = ensure_canonical_table(duck, df)
250
 
251
- # βœ… SAFE: Explicit column matching to avoid order issues
252
  if not df.empty:
253
- # Get actual columns from table
254
  table_info = duck.execute(f"PRAGMA table_info('{table_name}')").fetchall()
255
  table_cols = [r[0] for r in table_info]
256
 
257
- # Reorder df to match table
258
- df = df.reindex(columns=[c for c in table_cols if c in df.columns], fill_value=None)
259
-
260
- # Insert
261
- cols_str = ", ".join(df.columns)
262
- placeholders = ", ".join(["?"] * len(df.columns))
263
 
264
- try:
265
- duck.executemany(
266
- f"INSERT INTO {table_name} ({cols_str}) VALUES ({placeholders})",
267
- df.values.tolist()
268
- )
269
- print(f"[canonify] βœ… Inserted {len(df)} rows into {table_name}")
270
- except Exception as e:
271
- print(f"[canonify] ❌ Insert failed: {e}")
272
- raise
 
 
 
 
273
 
274
  duck.close()
275
- print(f"[canonify] βœ… Complete for {org_id}: {len(df)} rows, {industry} ({confidence:.1%})")
276
 
277
  return df, industry, confidence
 
1
+ # app/mapper.py – BULLETPROOF VERSION
2
  import os
3
  import json
4
  import duckdb
 
28
  if pd.api.types.is_datetime64_any_dtype(series): return "TIMESTAMP"
29
  return "VARCHAR"
30
 
31
# ---------- Dynamic Schema Evolution ---------- #
def _quote_ident(name: str) -> str:
    """Return *name* as a double-quoted SQL identifier, escaping embedded quotes."""
    return '"' + name.replace('"', '""') + '"'

def ensure_canonical_table(duck: duckdb.DuckDBPyConnection, df: pd.DataFrame) -> str:
    """
    Ensure the single canonical table exists and evolve its schema in place.

    Creates ``main.canonical`` if missing, then adds one column per DataFrame
    column that the table does not yet have. Column names are forced to
    lower-cased, stripped strings (handles int/None column labels) and are
    quoted before being spliced into DDL so odd characters cannot break the
    statement.

    Parameters
    ----------
    duck : duckdb.DuckDBPyConnection
        Open connection to the per-org DuckDB database.
    df : pd.DataFrame
        Incoming data whose columns drive the schema evolution.

    Returns
    -------
    str
        The canonical table name (``"main.canonical"``).
    """
    table_name = "main.canonical"

    # Base table: surrogate key + ingestion timestamp. Idempotent.
    duck.execute(f"""
        CREATE TABLE IF NOT EXISTS {table_name} (
            id UUID DEFAULT uuid(),
            _ingested_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
        )
    """)

    # Existing columns, lower-cased for case-insensitive comparison.
    existing_cols = {
        row[0].lower()
        for row in duck.execute(f"PRAGMA table_info('{table_name}')").fetchall()
    }

    for col in df.columns:
        col_name = str(col).lower().strip()  # force a safe string name
        if col_name in existing_cols:
            continue
        try:
            dtype = map_pandas_to_duck(col_name, df[col])
            print(f"[mapper] ➕ Adding column '{col_name}:{dtype}'")
            # Quote the identifier: column names come from untrusted input
            # and may contain spaces, quotes, or SQL keywords.
            duck.execute(
                f"ALTER TABLE {table_name} ADD COLUMN {_quote_ident(col_name)} {dtype}"
            )
            # Track the addition so duplicate lower-cased names aren't re-added.
            existing_cols.add(col_name)
        except Exception as e:
            # Best effort: one bad column must not abort the whole ingestion.
            print(f"[mapper] ⚠️ Skipping column {col_name}: {e}")

    return table_name
62
 
63
+ # ---------- Alias Memory ---------- #
64
  def load_dynamic_aliases() -> None:
65
  if os.path.exists(ALIAS_FILE):
66
  try:
 
79
  with open(ALIAS_FILE, "w") as f:
80
  json.dump(CANONICAL, f, indent=2)
81
 
82
+ # ---------- Main Canonify Function (ENTERPRISE-GRADE) ---------- #
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
83
  def canonify_df(org_id: str, hours_window: int = 24) -> tuple[pd.DataFrame, str, float]:
84
  """
85
+ Enterprise ingestion pipeline:
86
+ - Accepts ANY raw data shape
87
+ - Forces safe column names (handles int, None, etc.)
88
  - Auto-detects industry
89
  - Dynamically evolves schema
90
  - Returns (df, industry, confidence)
 
93
  conn = get_conn(org_id)
94
  ensure_raw_table(conn)
95
 
96
+ # 1) Pull raw audit data
97
  try:
98
  rows = conn.execute("""
99
  SELECT row_data FROM main.raw_rows
 
108
  print("[canonify] no audit rows found")
109
  return pd.DataFrame(), "unknown", 0.0
110
 
111
+ # 2) Parse JSON safely (handles both string and parsed objects)
112
  parsed = []
113
  malformed_count = 0
114
 
115
  for r in rows:
116
  raw = r[0]
117
 
 
118
  if not raw:
119
  malformed_count += 1
120
  continue
121
 
122
  try:
123
+ # βœ… Handle pre-parsed objects from Redis
124
  if isinstance(raw, (dict, list)):
125
  obj = raw
126
  else:
127
+ # βœ… Parse string JSON
128
  obj = json.loads(str(raw))
129
  except Exception:
130
  malformed_count += 1
131
  continue
132
 
133
+ # βœ… Extract rows from various payload formats
134
  if isinstance(obj, dict):
135
  if "rows" in obj and isinstance(obj["rows"], list):
136
  parsed.extend(obj["rows"])
137
  elif "data" in obj and isinstance(obj["data"], list):
138
  parsed.extend(obj["data"])
139
  elif "tables" in obj and isinstance(obj["tables"], dict):
 
140
  for table_rows in obj["tables"].values():
141
  if isinstance(table_rows, list):
142
  parsed.extend(table_rows)
143
  else:
 
144
  parsed.append(obj)
145
  elif isinstance(obj, list):
146
  parsed.extend(obj)
 
148
  malformed_count += 1
149
 
150
  if malformed_count:
151
+ print(f"[canonify] skipped {malformed_count} malformed rows")
152
 
153
  if not parsed:
154
  print("[canonify] no valid data after parsing")
155
  return pd.DataFrame(), "unknown", 0.0
156
 
157
+ # 3) βœ… BULLETPROOF: Force all column names to strings
158
  df = pd.DataFrame(parsed)
159
+ df.columns = [str(col).lower().strip() for col in df.columns]
 
 
 
 
160
 
161
+ # βœ… Remove duplicate columns (can happen with messy data)
162
+ df = df.loc[:, ~df.columns.duplicated()]
163
 
164
+ # 4) Map to canonical schema
165
  mapping = {}
166
  for canon, aliases in CANONICAL.items():
167
  for col in df.columns:
168
+ # βœ… SAFE: Ensure aliases are strings
169
  if any(str(alias).lower() in str(col).lower() for alias in aliases):
170
  mapping[col] = canon
171
  break
172
 
173
+ # βœ… Learn new aliases
174
  for col in df.columns:
175
+ for canon in CANONICAL.keys():
176
+ if str(canon).lower() in str(col).lower() and col not in CANONICAL[canon]:
177
+ CANONICAL[canon].append(col)
 
178
 
179
  save_dynamic_aliases()
180
 
 
182
  cols = [c for c in CANONICAL.keys() if c in renamed.columns]
183
  df = renamed[cols].copy() if cols else renamed.copy()
184
 
185
+ # 5) Type conversions (best effort)
186
  try:
187
  if "timestamp" in df:
188
  df["timestamp"] = pd.to_datetime(df["timestamp"], errors="coerce")
 
194
  if col in df:
195
  df[col] = pd.to_numeric(df[col], errors="coerce").fillna(0)
196
  except Exception as e:
197
+ print(f"[canonify] Type conversion warning (non-critical): {e}")
198
 
199
+ # 6) βœ… Industry detection
200
  industry, confidence = detect_industry(df)
201
+ print(f"[canonify] 🎯 Industry: {industry} ({confidence:.1%} confidence)")
202
+
203
+ # 7) Dynamic schema evolution
204
  os.makedirs("./db", exist_ok=True)
205
  duck = duckdb.connect(f"./db/{org_id}.duckdb")
206
 
207
  table_name = ensure_canonical_table(duck, df)
208
 
209
+ # βœ… SAFE INSERT: Match columns explicitly
210
  if not df.empty:
211
+ # Get current table columns
212
  table_info = duck.execute(f"PRAGMA table_info('{table_name}')").fetchall()
213
  table_cols = [r[0] for r in table_info]
214
 
215
+ # Only insert columns that exist in table
216
+ df_to_insert = df[[col for col in df.columns if col in table_cols]]
 
 
 
 
217
 
218
+ if not df_to_insert.empty:
219
+ cols_str = ", ".join(df_to_insert.columns)
220
+ placeholders = ", ".join(["?"] * len(df_to_insert.columns))
221
+
222
+ try:
223
+ duck.executemany(
224
+ f"INSERT INTO {table_name} ({cols_str}) VALUES ({placeholders})",
225
+ df_to_insert.values.tolist()
226
+ )
227
+ print(f"[canonify] βœ… Inserted {len(df_to_insert)} rows")
228
+ except Exception as e:
229
+ print(f"[canonify] ❌ Insert failed: {e}")
230
+ # Continue anyway - data quality issues shouldn't crash pipeline
231
 
232
  duck.close()
233
+ print(f"[canonify] βœ… Pipeline complete for {org_id}")
234
 
235
  return df, industry, confidence