Spaces:

petermutwiri
/

analytics-engine

Paused

App Files Files Community

petermutwiri committed on Nov 13, 2025

Commit

4f1cf86

verified ·

1 Parent(s): e41d979

Update app/mapper.py

Browse files

Files changed (1) hide show

app/mapper.py +95 -56

app/mapper.py CHANGED Viewed

@@ -113,126 +113,165 @@ def ensure_canonical_table(duck: duckdb.DuckDBPyConnection, df: pd.DataFrame) ->
     return table_name
 # ----------  Main Canonify Function (WITH INDUSTRY DETECTION) ---------- #
 def canonify_df(org_id: str, hours_window: int = 24) -> tuple[pd.DataFrame, str, float]:
     """
-    Normalize, evolve schema dynamically, persist to canonical table, AND detect industry.
-    Returns: (dataframe, industry_name, confidence_score)
     """
     load_dynamic_aliases()
     conn = get_conn(org_id)
     ensure_raw_table(conn)
-    # 1) Pull raw rows safely
     try:
-        rows = conn.execute("SELECT row_data FROM main.raw_rows WHERE row_data IS NOT NULL AND LENGTH(row_data) > 0").fetchall()
     except Exception as e:
         print(f"[canonify] SQL read error: {e}")
         rows = []
     if not rows:
-        print("[canonify] no rows to process")
         return pd.DataFrame(), "unknown", 0.0
-    # 2) Parse JSON safely
     parsed = []
     malformed_count = 0
     for r in rows:
         raw = r[0]
-        if not raw or not isinstance(raw, str):
             malformed_count += 1
             continue
         try:
-            obj = json.loads(raw)
         except Exception:
             malformed_count += 1
             continue
         if isinstance(obj, dict):
             if "rows" in obj and isinstance(obj["rows"], list):
                 parsed.extend(obj["rows"])
             elif "data" in obj and isinstance(obj["data"], list):
                 parsed.extend(obj["data"])
             elif "tables" in obj and isinstance(obj["tables"], dict):
-                for t_rows in obj["tables"].values():
-                    if isinstance(t_rows, list):
-                        parsed.extend(t_rows)
             else:
                 parsed.append(obj)
         elif isinstance(obj, list):
             parsed.extend(obj)
         else:
             malformed_count += 1
-            continue
     if malformed_count:
-        print(f"[canonify] skipped {malformed_count} malformed rows")
     if not parsed:
-        print("[canonify] no valid parsed rows")
         return pd.DataFrame(), "unknown", 0.0
-    # 3) Build DataFrame and normalize
-    raw_df = pd.DataFrame(parsed)
-    if raw_df.empty:
-        print("[canonify] dataframe empty after parse")
         return pd.DataFrame(), "unknown", 0.0
-    raw_df.columns = raw_df.columns.str.lower().str.strip()
     mapping = {}
     for canon, aliases in CANONICAL.items():
-        for col in raw_df.columns:
-            if any(a in col for a in aliases):
                 mapping[col] = canon
                 break
-    # Learn dynamic aliases
-    for col in raw_df.columns:
-        if col not in sum(CANONICAL.values(), []):
             for canon in CANONICAL.keys():
-                if canon in col and col not in CANONICAL[canon]:
                     CANONICAL[canon].append(col)
     save_dynamic_aliases()
-    renamed = raw_df.rename(columns=mapping)
     cols = [c for c in CANONICAL.keys() if c in renamed.columns]
     df = renamed[cols].copy() if cols else renamed.copy()
-    # 4) Type conversions
-    if "timestamp" in df:
-        df["timestamp"] = pd.to_datetime(df["timestamp"], errors="coerce")
-    if "expiry_date" in df:
-        df["expiry_date"] = pd.to_datetime(df["expiry_date"], errors="coerce").dt.date
-    if "promo_flag" in df:
-        df["promo_flag"] = df["promo_flag"].astype(str).isin({"1", "true", "t", "yes"})
-    for col in ("qty", "total"):
-        if col in df:
-            df[col] = pd.to_numeric(df[col], errors="coerce").fillna(0)
-    # 5) ✅ DETECT INDUSTRY BEFORE INSERTING
-    industry, confidence = detect_industry_from_df(df)
-    print(f"[canonify] 🎯 Detected industry: {industry} (confidence: {confidence:.2%})")
-    # 6) Dynamic schema evolution
     os.makedirs("./db", exist_ok=True)
     duck = duckdb.connect(f"./db/{org_id}.duckdb")
     table_name = ensure_canonical_table(duck, df)
-    # ✅ Safe insert with explicit column matching
-    cols_str = ", ".join(df.columns)
-    placeholders = ", ".join(["?"] * len(df.columns))
-    try:
-        duck.executemany(
-            f"INSERT INTO {table_name} ({cols_str}) VALUES ({placeholders})",
-            df.values.tolist()
-        )
-        print(f"[canonify] ✅ Inserted {len(df)} rows into {table_name}")
-    except Exception as e:
-        print(f"[canonify] ❌ Insert failed: {e}")
-        raise
     duck.close()
-    print(f"[canonify] ✅ canonical snapshot updated for {org_id}")
-    return df, industry, confidence  # ✅ Return all three values

     return table_name
 # ----------  Main Canonify Function (WITH INDUSTRY DETECTION) ---------- #
+# app/mapper.py - FIX with bulletproof error handling
 def canonify_df(org_id: str, hours_window: int = 24) -> tuple[pd.DataFrame, str, float]:
     """
+    Enterprise-grade normalization:
+    - Pulls raw audit data
+    - Safely parses JSON
+    - Auto-detects industry
+    - Dynamically evolves schema
+    - Returns (df, industry, confidence)
     """
     load_dynamic_aliases()
     conn = get_conn(org_id)
     ensure_raw_table(conn)
+    # ✅ SAFE: Handle both string and parsed objects from Redis
     try:
+        rows = conn.execute("""
+            SELECT row_data FROM main.raw_rows
+            WHERE row_data IS NOT NULL
+            AND LENGTH(CAST(row_data AS TEXT)) > 0
+        """).fetchall()
     except Exception as e:
         print(f"[canonify] SQL read error: {e}")
         rows = []
     if not rows:
+        print("[canonify] no audit rows found")
         return pd.DataFrame(), "unknown", 0.0
+    # ✅ SAFE: Parse JSON with type checking
     parsed = []
     malformed_count = 0
     for r in rows:
         raw = r[0]
+        # Handle both string and parsed object
+        if not raw:
             malformed_count += 1
             continue
         try:
+            # If it's already parsed (object), use it directly
+            if isinstance(raw, (dict, list)):
+                obj = raw
+            else:
+                # If it's a string, parse it
+                obj = json.loads(str(raw))
         except Exception:
             malformed_count += 1
             continue
+        # ✅ SAFE: Extract data from various payload shapes
         if isinstance(obj, dict):
             if "rows" in obj and isinstance(obj["rows"], list):
                 parsed.extend(obj["rows"])
             elif "data" in obj and isinstance(obj["data"], list):
                 parsed.extend(obj["data"])
             elif "tables" in obj and isinstance(obj["tables"], dict):
+                # Flatten multi-table into single list for canonical
+                for table_rows in obj["tables"].values():
+                    if isinstance(table_rows, list):
+                        parsed.extend(table_rows)
             else:
+                # Single record dict
                 parsed.append(obj)
         elif isinstance(obj, list):
             parsed.extend(obj)
         else:
             malformed_count += 1
     if malformed_count:
+        print(f"[canonify] skipped {malformed_count} malformed audit rows")
     if not parsed:
+        print("[canonify] no valid data after parsing")
         return pd.DataFrame(), "unknown", 0.0
+    # ✅ Create DataFrame with SAFE column names
+    df = pd.DataFrame(parsed)
+    # Handle empty DataFrame
+    if df.empty:
+        print("[canonify] DataFrame is empty")
         return pd.DataFrame(), "unknown", 0.0
+    # ✅ FIX: Ensure all column names are strings
+    df.columns = df.columns.astype(str).str.lower().str.strip()
+    # ✅ Map columns to canonical names
     mapping = {}
     for canon, aliases in CANONICAL.items():
+        for col in df.columns:
+            # SAFE alias matching
+            if any(str(alias).lower() in str(col).lower() for alias in aliases):
                 mapping[col] = canon
                 break
+    # ✅ Learn new aliases dynamically
+    for col in df.columns:
+        if col not in [str(a).lower() for sublist in CANONICAL.values() for a in sublist]:
             for canon in CANONICAL.keys():
+                if str(canon).lower() in str(col).lower() and col not in CANONICAL[canon]:
                     CANONICAL[canon].append(col)
     save_dynamic_aliases()
+    renamed = df.rename(columns=mapping)
     cols = [c for c in CANONICAL.keys() if c in renamed.columns]
     df = renamed[cols].copy() if cols else renamed.copy()
+    # ✅ Type conversions with error handling
+    try:
+        if "timestamp" in df:
+            df["timestamp"] = pd.to_datetime(df["timestamp"], errors="coerce")
+        if "expiry_date" in df:
+            df["expiry_date"] = pd.to_datetime(df["expiry_date"], errors="coerce").dt.date
+        if "promo_flag" in df:
+            df["promo_flag"] = df["promo_flag"].astype(str).isin({"1", "true", "t", "yes"})
+        for col in ("qty", "total"):
+            if col in df:
+                df[col] = pd.to_numeric(df[col], errors="coerce").fillna(0)
+    except Exception as e:
+        print(f"[canonify] Type conversion warning: {e}")
+    # ✅ FIX: Industry detection with actual column matching
+    industry, confidence = detect_industry(df)
+    # ✅ Dynamic schema evolution (NO MORE versioning)
     os.makedirs("./db", exist_ok=True)
     duck = duckdb.connect(f"./db/{org_id}.duckdb")
     table_name = ensure_canonical_table(duck, df)
+    # ✅ SAFE: Explicit column matching to avoid order issues
+    if not df.empty:
+        # Get actual columns from table
+        table_info = duck.execute(f"PRAGMA table_info('{table_name}')").fetchall()
+        table_cols = [r[0] for r in table_info]
+        # Reorder df to match table
+        df = df.reindex(columns=[c for c in table_cols if c in df.columns], fill_value=None)
+        # Insert
+        cols_str = ", ".join(df.columns)
+        placeholders = ", ".join(["?"] * len(df.columns))
+        try:
+            duck.executemany(
+                f"INSERT INTO {table_name} ({cols_str}) VALUES ({placeholders})",
+                df.values.tolist()
+            )
+            print(f"[canonify] ✅ Inserted {len(df)} rows into {table_name}")
+        except Exception as e:
+            print(f"[canonify] ❌ Insert failed: {e}")
+            raise
     duck.close()
+    print(f"[canonify] ✅ Complete for {org_id}: {len(df)} rows, {industry} ({confidence:.1%})")
+    return df, industry, confidence