Spaces:

petermutwiri
/

analytics-engine

Paused

App Files Files Community

petermutwiri committed on Nov 13, 2025

Commit

8b47729

verified ·

1 Parent(s): 085e727

Update app/mapper.py

Browse files

Files changed (1) hide show

app/mapper.py +88 -100

app/mapper.py CHANGED Viewed

@@ -1,11 +1,11 @@
-# mapper.py  –  production-hardened
 import os
 import json
 import duckdb
 import pandas as pd
 from datetime import datetime, timedelta
 from app.db import get_conn, ensure_raw_table
-from app.utils.detect_industry import _ALIAS
 # ----------------------  Canonical schema base  ---------------------- #
 CANONICAL = {
@@ -18,6 +18,7 @@ CANONICAL = {
     "promo_flag": ["promo", "promotion", "is_promo", "discount_code"],
     "expiry_date":["expiry_date", "best_before", "use_by", "expiration"],
 }
 ALIAS_FILE = "./db/alias_memory.json"
 def map_pandas_to_duck(col: str, series: pd.Series) -> str:
@@ -26,7 +27,7 @@ def map_pandas_to_duck(col: str, series: pd.Series) -> str:
     if pd.api.types.is_float_dtype(series):    return "DOUBLE"
     if pd.api.types.is_datetime64_any_dtype(series): return "TIMESTAMP"
     return "VARCHAR"
-# ----------  helpers  ---------- #
 def safe_str_transform(s: pd.Series) -> pd.Series:
     if pd.api.types.is_string_dtype(s):
         return s.str.lower().str.strip()
@@ -42,8 +43,29 @@ def add_column_if_not_exists(duck: duckdb.DuckDBPyConnection, table: str, col: s
         duck.execute(f"ALTER TABLE {table} ADD COLUMN {col} {dtype}")
         print(f"[schema] ➕ added {col}:{dtype} to {table}")
-# ----------  alias memory  ---------- #
 def load_dynamic_aliases() -> None:
     if os.path.exists(ALIAS_FILE):
         try:
@@ -62,75 +84,45 @@ def save_dynamic_aliases() -> None:
     with open(ALIAS_FILE, "w") as f:
         json.dump(CANONICAL, f, indent=2)
-# ----------  schema versioning  ---------- #
-def ensure_schema_version(duck: duckdb.DuckDBPyConnection, df: pd.DataFrame) -> str:
-    duck.execute("CREATE SCHEMA IF NOT EXISTS main")
-    # versioning metadata
-    duck.execute("""
-        CREATE TABLE IF NOT EXISTS main.schema_versions (
-            version INTEGER PRIMARY KEY,
-            columns JSON,
-            created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
         )
     """)
-    new_signature = sorted(df.columns.tolist())
-    latest = duck.execute(
-        "SELECT version, columns FROM main.schema_versions ORDER BY version DESC LIMIT 1"
-    ).fetchone()
-    if latest is None:
-        version = 1
-    else:
-        latest_cols = sorted(json.loads(latest[1]))
-        if latest_cols == new_signature:
-            # ✅ schema unchanged → reuse existing table
-            return f"main.canonical_v{latest[0]}"
-        version = latest[0] + 1
-    # ✅ record new schema version
-    duck.execute(
-        "INSERT INTO main.schema_versions (version, columns) VALUES (?, ?)",
-        (version, json.dumps(new_signature))
-    )
-    table = f"main.canonical_v{version}"
-    # ✅ create new table with ALL columns directly (safe)
-    col_defs = []
     for col in df.columns:
-        dtype = map_pandas_to_duck(col, df[col])
-        col_defs.append(f"{col} {dtype}")
-    duck.execute(f"CREATE TABLE {table} ({', '.join(col_defs)})")
-    print(f"[schema] ✅ created {table}")
-    return table
-def reconcile_latest_schema(duck: duckdb.DuckDBPyConnection) -> None:
-    tables = [r[0] for r in duck.execute("""
-        SELECT table_name FROM information_schema.tables
-        WHERE table_name LIKE 'canonical_v%'
-    """).fetchall()]
-    if not tables:
-        return
-    union_query = " UNION ALL ".join([f"SELECT * FROM {t}" for t in tables])
-    duck.execute("CREATE OR REPLACE TABLE main.canonical_latest AS " + union_query)
-    print(f"[schema] ✅ reconciled {len(tables)} versions → canonical_latest")
-def canonify_df(org_id: str, hours_window: int = 24) -> pd.DataFrame:
     """
-    Normalize, version, and persist canonical data snapshot for org_id.
-    This version pulls raw_rows as raw strings and parses JSON in Python so
-    malformed raw_rows don't crash the pipeline.
     """
     load_dynamic_aliases()
     conn = get_conn(org_id)
     ensure_raw_table(conn)
-    # 1) pull raw strings from DB (no JSON parsing in SQL)
     try:
         rows = conn.execute("SELECT row_data FROM main.raw_rows WHERE row_data IS NOT NULL AND LENGTH(row_data) > 0").fetchall()
     except Exception as e:
@@ -139,9 +131,9 @@ def canonify_df(org_id: str, hours_window: int = 24) -> pd.DataFrame:
     if not rows:
         print("[canonify] no rows to process")
-        return pd.DataFrame()
-    # 2) parse json strings safely in Python, skip bad ones
     parsed = []
     malformed_count = 0
     for r in rows:
@@ -152,45 +144,38 @@ def canonify_df(org_id: str, hours_window: int = 24) -> pd.DataFrame:
         try:
             obj = json.loads(raw)
         except Exception:
-            # Maybe raw is a single-object (not list) or legacy shape;
-            # attempt best-effort: ignore empty or malformed
             malformed_count += 1
             continue
-        # If this is a wrapper like {"rows": [...]} or {"data": [...]} or {"tables": {...}}
         if isinstance(obj, dict):
-            # prefer list under rows, data, or fallback to tables flatten
             if "rows" in obj and isinstance(obj["rows"], list):
                 parsed.extend(obj["rows"])
             elif "data" in obj and isinstance(obj["data"], list):
                 parsed.extend(obj["data"])
             elif "tables" in obj and isinstance(obj["tables"], dict):
-                # flatten: append all rows from all tables (optional)
                 for t_rows in obj["tables"].values():
                     if isinstance(t_rows, list):
                         parsed.extend(t_rows)
             else:
-                # maybe the dict itself represents a single record
                 parsed.append(obj)
         elif isinstance(obj, list):
             parsed.extend(obj)
         else:
-            # unknown shape — skip
             malformed_count += 1
             continue
     if malformed_count:
-        print(f"[canonify] skipped {malformed_count} malformed/unsupported raw_rows")
     if not parsed:
-        print("[canonify] no valid parsed rows after filtering")
-        return pd.DataFrame()
-    # 3) build DataFrame and normalize column names
     raw_df = pd.DataFrame(parsed)
     if raw_df.empty:
         print("[canonify] dataframe empty after parse")
-        return pd.DataFrame()
     raw_df.columns = raw_df.columns.str.lower().str.strip()
     mapping = {}
@@ -200,7 +185,7 @@ def canonify_df(org_id: str, hours_window: int = 24) -> pd.DataFrame:
                 mapping[col] = canon
                 break
-    # learn dynamic aliases
     for col in raw_df.columns:
         if col not in sum(CANONICAL.values(), []):
             for canon in CANONICAL.keys():
@@ -212,7 +197,7 @@ def canonify_df(org_id: str, hours_window: int = 24) -> pd.DataFrame:
     cols = [c for c in CANONICAL.keys() if c in renamed.columns]
     df = renamed[cols].copy() if cols else renamed.copy()
-    # datatype conversions
     if "timestamp" in df:
         df["timestamp"] = pd.to_datetime(df["timestamp"], errors="coerce")
     if "expiry_date" in df:
@@ -223,28 +208,31 @@ def canonify_df(org_id: str, hours_window: int = 24) -> pd.DataFrame:
         if col in df:
             df[col] = pd.to_numeric(df[col], errors="coerce").fillna(0)
-    # 4) persist canonical snapshot (use safe schema-versioning)
     os.makedirs("./db", exist_ok=True)
     duck = duckdb.connect(f"./db/{org_id}.duckdb")
-    table_name = ensure_schema_version(duck, df)
-    # create table if not exists with the columns of df
-    duck.execute(f"CREATE TABLE IF NOT EXISTS {table_name} AS SELECT * FROM df LIMIT 0")
-    # if table created above has no columns (rare), fallback to explicit column creation handled in ensure_schema_version
     try:
-        duck.execute(f"INSERT INTO {table_name} SELECT * FROM df")
     except Exception as e:
-        print(f"[canonify] insert error, retrying with explicit column checks: {e}")
-        # ensure columns exist individually
-        existing_cols = {r[0].lower() for r in duck.execute(f"PRAGMA table_info('{table_name}')").fetchall()}
-        for col in df.columns:
-            if col.lower() not in existing_cols:
-                dtype = map_pandas_to_duck(col, df[col])
-                duck.execute(f"ALTER TABLE {table_name} ADD COLUMN {col} {dtype}")
-        duck.execute(f"INSERT INTO {table_name} SELECT * FROM df")
-    reconcile_latest_schema(duck)
-    duck.close()
     print(f"[canonify] ✅ canonical snapshot updated for {org_id}")
-    return df

+# app/mapper.py – FIXED WITH INDUSTRY DETECTION
 import os
 import json
 import duckdb
 import pandas as pd
 from datetime import datetime, timedelta
 from app.db import get_conn, ensure_raw_table
+from app.utils.detect_industry import _ALIAS  # ✅ RESTORED
 # ----------------------  Canonical schema base  ---------------------- #
 CANONICAL = {
     "promo_flag": ["promo", "promotion", "is_promo", "discount_code"],
     "expiry_date":["expiry_date", "best_before", "use_by", "expiration"],
 }
 ALIAS_FILE = "./db/alias_memory.json"
 def map_pandas_to_duck(col: str, series: pd.Series) -> str:
     if pd.api.types.is_float_dtype(series):    return "DOUBLE"
     if pd.api.types.is_datetime64_any_dtype(series): return "TIMESTAMP"
     return "VARCHAR"
 def safe_str_transform(s: pd.Series) -> pd.Series:
     if pd.api.types.is_string_dtype(s):
         return s.str.lower().str.strip()
         duck.execute(f"ALTER TABLE {table} ADD COLUMN {col} {dtype}")
         print(f"[schema] ➕ added {col}:{dtype} to {table}")
+# ----------  INDUSTRY DETECTION INTEGRATION  ---------- #
def detect_industry_from_df(
    df: pd.DataFrame,
    aliases: "dict[str, list[str]] | None" = None,
) -> tuple[str, float]:
    """
    Auto-detect the industry of a DataFrame from its column names.

    Each industry is scored as the fraction of its alias strings that occur
    as a substring of at least one lower-cased column name; the industry with
    the highest score wins (ties go to the first industry in iteration order).

    Args:
        df: Frame whose column labels are inspected. Labels are converted
            with ``str()`` first, so non-string labels (e.g. integers) do not
            raise.
        aliases: Mapping of industry name -> list of alias substrings.
            Defaults to the module-level ``_ALIAS`` table when omitted.

    Returns:
        ``(industry_name, confidence_score)`` with the score in ``[0.0, 1.0]``.
        ``("unknown", 0.0)`` is returned when ``df`` is empty or when the
        alias table has no industries.
    """
    if df.empty:
        return "unknown", 0.0

    if aliases is None:
        aliases = _ALIAS
    if not aliases:
        # Nothing to score against; max() on an empty dict would raise.
        return "unknown", 0.0

    # str() guards against non-string labels, which the `.str` accessor rejects.
    cols = {str(col).lower() for col in df.columns}

    scores = {}
    for industry, industry_aliases in aliases.items():
        if not industry_aliases:
            scores[industry] = 0.0
            continue
        matches = sum(
            1 for alias in industry_aliases if any(alias in col for col in cols)
        )
        scores[industry] = min(matches / len(industry_aliases), 1.0)

    best_industry = max(scores, key=scores.get)
    return best_industry, scores[best_industry]
+# ----------  Alias Memory (no changes) ---------- #
 def load_dynamic_aliases() -> None:
     if os.path.exists(ALIAS_FILE):
         try:
     with open(ALIAS_FILE, "w") as f:
         json.dump(CANONICAL, f, indent=2)
+# ----------  Dynamic Schema Evolution ---------- #
def ensure_canonical_table(duck: duckdb.DuckDBPyConnection, df: pd.DataFrame) -> str:
    """
    Ensure the single canonical table exists and has a column for every
    column of ``df``, adding missing columns on the fly instead of creating
    new table versions.

    Args:
        duck: Open DuckDB connection the DDL statements are executed on.
        df: Frame whose columns must be present in the table. Column types
            are derived with ``map_pandas_to_duck``.

    Returns:
        The fully qualified table name, ``"main.canonical"``.
    """
    table_name = "main.canonical"

    # Ensure base table exists
    duck.execute(f"""
        CREATE TABLE IF NOT EXISTS {table_name} (
            id UUID DEFAULT uuid(),
            _ingested_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
        )
    """)

    # PRAGMA table_info rows are (cid, name, type, notnull, dflt_value, pk):
    # the column name is at index 1. Index 0 is the integer cid, on which
    # .lower() would raise.
    existing_cols = {
        str(r[1]).lower()
        for r in duck.execute(f"PRAGMA table_info('{table_name}')").fetchall()
    }

    # Add missing columns dynamically
    for col in df.columns:
        col_key = str(col).lower()
        if col_key in existing_cols:
            continue
        dtype = map_pandas_to_duck(col, df[col])
        print(f"[mapper] ➕ Adding new column '{col}:{dtype}' to {table_name}")
        # Quote the identifier so names with spaces, punctuation or reserved
        # words are valid SQL; embedded double quotes are doubled.
        quoted_col = '"' + str(col).replace('"', '""') + '"'
        duck.execute(f"ALTER TABLE {table_name} ADD COLUMN {quoted_col} {dtype}")
        # Track the addition so a later label differing only in case does not
        # trigger a duplicate ADD COLUMN.
        existing_cols.add(col_key)

    return table_name
+# ----------  Main Canonify Function (WITH INDUSTRY DETECTION) ---------- #
+def canonify_df(org_id: str, hours_window: int = 24) -> tuple[pd.DataFrame, str, float]:
     """
+    Normalize, evolve schema dynamically, persist to canonical table, AND detect industry.
+    Returns: (dataframe, industry_name, confidence_score)
     """
     load_dynamic_aliases()
     conn = get_conn(org_id)
     ensure_raw_table(conn)
+    # 1) Pull raw rows safely
     try:
         rows = conn.execute("SELECT row_data FROM main.raw_rows WHERE row_data IS NOT NULL AND LENGTH(row_data) > 0").fetchall()
     except Exception as e:
     if not rows:
         print("[canonify] no rows to process")
+        return pd.DataFrame(), "unknown", 0.0
+    # 2) Parse JSON safely
     parsed = []
     malformed_count = 0
     for r in rows:
         try:
             obj = json.loads(raw)
         except Exception:
             malformed_count += 1
             continue
         if isinstance(obj, dict):
             if "rows" in obj and isinstance(obj["rows"], list):
                 parsed.extend(obj["rows"])
             elif "data" in obj and isinstance(obj["data"], list):
                 parsed.extend(obj["data"])
             elif "tables" in obj and isinstance(obj["tables"], dict):
                 for t_rows in obj["tables"].values():
                     if isinstance(t_rows, list):
                         parsed.extend(t_rows)
             else:
                 parsed.append(obj)
         elif isinstance(obj, list):
             parsed.extend(obj)
         else:
             malformed_count += 1
             continue
     if malformed_count:
+        print(f"[canonify] skipped {malformed_count} malformed rows")
     if not parsed:
+        print("[canonify] no valid parsed rows")
+        return pd.DataFrame(), "unknown", 0.0
+    # 3) Build DataFrame and normalize
     raw_df = pd.DataFrame(parsed)
     if raw_df.empty:
         print("[canonify] dataframe empty after parse")
+        return pd.DataFrame(), "unknown", 0.0
     raw_df.columns = raw_df.columns.str.lower().str.strip()
     mapping = {}
                 mapping[col] = canon
                 break
+    # Learn dynamic aliases
     for col in raw_df.columns:
         if col not in sum(CANONICAL.values(), []):
             for canon in CANONICAL.keys():
     cols = [c for c in CANONICAL.keys() if c in renamed.columns]
     df = renamed[cols].copy() if cols else renamed.copy()
+    # 4) Type conversions
     if "timestamp" in df:
         df["timestamp"] = pd.to_datetime(df["timestamp"], errors="coerce")
     if "expiry_date" in df:
         if col in df:
             df[col] = pd.to_numeric(df[col], errors="coerce").fillna(0)
+    # 5) ✅ DETECT INDUSTRY BEFORE INSERTING
+    industry, confidence = detect_industry_from_df(df)
+    print(f"[canonify] 🎯 Detected industry: {industry} (confidence: {confidence:.2%})")
+    # 6) Dynamic schema evolution
     os.makedirs("./db", exist_ok=True)
     duck = duckdb.connect(f"./db/{org_id}.duckdb")
+    table_name = ensure_canonical_table(duck, df)
+    # ✅ Safe insert with explicit column matching
+    cols_str = ", ".join(df.columns)
+    placeholders = ", ".join(["?"] * len(df.columns))
     try:
+        duck.executemany(
+            f"INSERT INTO {table_name} ({cols_str}) VALUES ({placeholders})",
+            df.values.tolist()
+        )
+        print(f"[canonify] ✅ Inserted {len(df)} rows into {table_name}")
     except Exception as e:
+        print(f"[canonify] ❌ Insert failed: {e}")
+        raise
+    duck.close()
     print(f"[canonify] ✅ canonical snapshot updated for {org_id}")
+    return df, industry, confidence  # ✅ Return all three values