Spaces:

petermutwiri
/

analytics-engine

Paused

App Files Files Community

shaliz-kong committed on Nov 30, 2025

Commit

ee959f2

1 Parent(s): 0ddcd64

created schema fallback

Browse files

Files changed (2) hide show

app/schemas/org_schema.py +60 -22
app/tasks/analytics_worker.py +32 -17

app/schemas/org_schema.py CHANGED Viewed

@@ -2,10 +2,11 @@
 from typing import Dict, Optional, List, Tuple
 import json
 import logging
 from app.core.event_hub import event_hub
 from app.service.llm_service import LocalLLMService
-from app.service.vector_service import VectorService  # Your existing vector service
-import duckdb
 logger = logging.getLogger(__name__)
@@ -31,21 +32,22 @@ class OrgSchema:
                      "trans_date", "sale_time", "order_date"],
     }
-    def __init__(self, org_id: str):
         self.org_id = org_id
-        self.cache_key = f"schema:{org_id}:ai:v3"
         self.stats_key = f"schema:stats:{org_id}"
         self.llm = LocalLLMService()
-        self.vector = VectorService(org_id)
     def get_mapping(self) -> Dict[str, str]:
         """Autonomous mapping with AI fallback for unmatched columns"""
         try:
             if cached := event_hub.get_key(self.cache_key):
-                logger.info(f"[Schema] Cache hit for org {self.org_id}")
                 return json.loads(cached)
-            logger.info(f"[Schema] Starting AI discovery for org {self.org_id}")
             mapping = self._discover_schema()
             self.save_mapping(mapping)
             return mapping
@@ -56,15 +58,19 @@ class OrgSchema:
     def _discover_schema(self) -> Dict[str, str]:
         """Three-tier discovery: Rule-based → Vector similarity → LLM reasoning"""
-        conn = duckdb.connect("md:?motherduck_token=")
-        # Get column metadata
         columns_info = conn.execute(f"""
             SELECT column_name, data_type, is_nullable
             FROM information_schema.columns
-            WHERE table_name = 'transactions_{self.org_id}'
         """).fetchall()
         columns = {row[0]: row[1] for row in columns_info}
         mapping = {}
@@ -84,6 +90,7 @@ class OrgSchema:
                 mapping[semantic] = match
                 continue
         return mapping
     def _exact_match(self, semantic: str, columns: Dict[str, str]) -> Optional[str]:
@@ -91,17 +98,16 @@ class OrgSchema:
         patterns = self.PATTERN_VECTORS.get(semantic, [])
         for col in columns.keys():
             if any(pattern in col.lower().replace("_", "") for pattern in patterns):
                 return col
         return None
     def _vector_match(self, semantic: str, column_names: List[str]) -> Optional[str]:
         """Semantic similarity via embeddings"""
         try:
-            # Embed semantic field and candidate columns
             semantic_emb = self.vector.embed(semantic)
             column_embs = [self.vector.embed(name) for name in column_names]
-            # Find best match above threshold
             best_match, score = self.vector.find_best_match(semantic_emb, column_embs, column_names)
             if score > 0.85:  # High confidence threshold
@@ -115,23 +121,54 @@ class OrgSchema:
     def _llm_match(self, semantic: str, columns: Dict[str, str]) -> Optional[str]:
         """LLM reasoning with schema context"""
         try:
-            prompt = f"""
-            You are a data schema expert. Map this semantic field to the most likely column.
             Semantic Field: `{semantic}`
             Available Columns: {list(columns.keys())}
             Data Types: {columns}
             Return ONLY the matching column name or "NONE" if no match.
-            Consider: naming conventions, business context, data types.
-            """
             response = self.llm.generate(prompt, max_tokens=20).strip()
-            return response if response != "NONE" else None
         except Exception as e:
             logger.warning(f"[LLM] Matching failed: {e}")
             return None
     def get_column(self, semantic: str) -> Optional[str]:
         """Safely get column name with audit logging"""
         mapping = self.get_mapping()
@@ -139,8 +176,6 @@ class OrgSchema:
         if not actual:
             logger.warning(f"[Schema] Missing semantic field: {semantic}")
-            self._log_missing_field(semantic)
         return actual
     def build_dynamic_query(self, required_fields: List[str]) -> Tuple[str, List[str]]:
@@ -150,9 +185,12 @@ class OrgSchema:
         for field in required_fields:
             if actual := mapping.get(field):
-                available.append(f"{actual} AS {field}")  # Alias to semantic name
         if not available:
-            raise ValueError(f"No required fields available: {required_fields}")
-        return f"SELECT {', '.join(available)} FROM transactions_{self.org_id}", available

 from typing import Dict, Optional, List, Tuple
 import json
 import logging
+from datetime import datetime
 from app.core.event_hub import event_hub
 from app.service.llm_service import LocalLLMService
+from app.service.vector_service import VectorService
+from app.db import get_conn
 logger = logging.getLogger(__name__)
                      "trans_date", "sale_time", "order_date"],
     }
+    def __init__(self, org_id: str, entity_type: str):
         self.org_id = org_id
+        self._entity_type = entity_type
+        self.cache_key = f"schema:{org_id}:{entity_type}:v3"
         self.stats_key = f"schema:stats:{org_id}"
         self.llm = LocalLLMService()
+        self.vector = VectorService(org_id)
     def get_mapping(self) -> Dict[str, str]:
         """Autonomous mapping with AI fallback for unmatched columns"""
         try:
             if cached := event_hub.get_key(self.cache_key):
+                logger.info(f"[Schema] Cache hit for org {self.org_id}/{self._entity_type}")
                 return json.loads(cached)
+            logger.info(f"[Schema] Starting AI discovery for org {self.org_id}/{self._entity_type}")
             mapping = self._discover_schema()
             self.save_mapping(mapping)
             return mapping
     def _discover_schema(self) -> Dict[str, str]:
         """Three-tier discovery: Rule-based → Vector similarity → LLM reasoning"""
+        conn = get_conn(self.org_id)
+        # Get columns from actual canonical table
         columns_info = conn.execute(f"""
             SELECT column_name, data_type, is_nullable
             FROM information_schema.columns
+            WHERE table_schema = 'main'
+              AND table_name = '{self._entity_type}_canonical'
         """).fetchall()
+        if not columns_info:
+            raise ValueError(f"No schema found for {self._entity_type}_canonical")
         columns = {row[0]: row[1] for row in columns_info}
         mapping = {}
                 mapping[semantic] = match
                 continue
+        logger.info(f"[Schema] AI discovery complete: {len(mapping)} fields mapped")
         return mapping
     def _exact_match(self, semantic: str, columns: Dict[str, str]) -> Optional[str]:
         """Return the first column whose name contains a pattern for `semantic`.

         Both the column name and each pattern are lower-cased and stripped of
         underscores before the substring test. Normalising only the column
         would make any pattern that itself contains an underscore (for
         example "order_date") impossible to match.

         Args:
             semantic: Semantic field name used to look up PATTERN_VECTORS.
             columns: Mapping of column name to data type; only the keys are
                 inspected.

         Returns:
             The first matching column name in iteration order, or None when
             no pattern matches.
         """
         def _flatten(name: str) -> str:
             return name.lower().replace("_", "")

         # Normalise the patterns once per call. Patterns that collapse to ""
         # are dropped because the empty string is a substring of every name.
         patterns = [
             flat
             for flat in map(_flatten, self.PATTERN_VECTORS.get(semantic, []))
             if flat
         ]
         for col in columns:
             flat_col = _flatten(col)
             if any(pattern in flat_col for pattern in patterns):
                 logger.info(f"[Rule] Matched '{semantic}' → '{col}' (pattern)")
                 return col
         return None
     def _vector_match(self, semantic: str, column_names: List[str]) -> Optional[str]:
         """Semantic similarity via embeddings"""
         try:
             semantic_emb = self.vector.embed(semantic)
             column_embs = [self.vector.embed(name) for name in column_names]
             best_match, score = self.vector.find_best_match(semantic_emb, column_embs, column_names)
             if score > 0.85:  # High confidence threshold
     def _llm_match(self, semantic: str, columns: Dict[str, str]) -> Optional[str]:
         """Ask the LLM which column represents `semantic`, accepting only real columns.

         The model's reply is free text, so it is cleaned of surrounding
         quotes/backticks and then checked against `columns`. A reply that is
         not an available column (after a case-insensitive retry) is rejected
         rather than stored in the mapping.

         Args:
             semantic: Semantic field name to resolve.
             columns: Mapping of available column name to data type.

         Returns:
             The matching key of `columns`, or None when the model answers
             "NONE", answers with an unknown name, or the call fails.
         """
         try:
             prompt = f"""You are a data schema expert. Map this semantic field to the most likely column.
             Semantic Field: `{semantic}`
             Available Columns: {list(columns.keys())}
             Data Types: {columns}
             Return ONLY the matching column name or "NONE" if no match.
             Consider: naming conventions, business context, data types."""
             response = self.llm.generate(prompt, max_tokens=20).strip()
             # Models frequently wrap the answer in quotes or backticks.
             answer = response.strip("`'\"").strip()
             if not answer or answer.upper() == "NONE":
                 return None

             matched = answer if answer in columns else None
             if matched is None:
                 # Tolerate a case difference, but still return the real key.
                 lowered = answer.lower()
                 matched = next(
                     (name for name in columns if name.lower() == lowered), None
                 )
             if matched is None:
                 logger.warning(
                     f"[LLM] Ignoring '{answer}' for '{semantic}': not an available column"
                 )
                 return None

             logger.info(f"[LLM] Matched '{semantic}' → '{matched}'")
             return matched
         except Exception as e:
             # Best-effort tier: any LLM failure just means "no match".
             logger.warning(f"[LLM] Matching failed: {e}")
             return None
+    def save_mapping(self, mapping: Dict[str, str]) -> None:
+        """Persist mapping with TTL and stats"""
+        try:
+            event_hub.redis.setex(self.cache_key, 3600, json.dumps(mapping))
+            stats = {
+                "timestamp": datetime.now().isoformat(),
+                "fields_mapped": len(mapping),
+                "entity_type": self._entity_type
+            }
+            event_hub.redis.setex(self.stats_key, 3600, json.dumps(stats))
+        except Exception as e:
+            logger.warning(f"[Schema] Failed to save mapping: {e}")
+    def _get_fallback_mapping(self) -> Dict[str, str]:
+        """
+        🚀 EMERGENCY FALLBACK: Map columns to themselves
+        Ensures SaaS flexibility for any schema
+        """
+        logger.warning(f"[Schema] 🚨 EMERGENCY FALLBACK for {self.org_id}/{self._entity_type}")
+        conn = get_conn(self.org_id)
+        columns_info = conn.execute(f"""
+            SELECT column_name FROM information_schema.columns
+            WHERE table_schema = 'main' AND table_name = '{self._entity_type}_canonical'
+        """).fetchall()
+        # Map every column to itself - works for ANY schema
+        return {row[0]: row[0] for row in columns_info}
     def get_column(self, semantic: str) -> Optional[str]:
         """Safely get column name with audit logging"""
         mapping = self.get_mapping()
         if not actual:
             logger.warning(f"[Schema] Missing semantic field: {semantic}")
         return actual
     def build_dynamic_query(self, required_fields: List[str]) -> Tuple[str, List[str]]:
         for field in required_fields:
             if actual := mapping.get(field):
+                available.append(f"{actual} AS {field}")
         if not available:
+            # Return all columns if no semantic matches
+            conn = get_conn(self.org_id)
+            columns = conn.execute(f"PRAGMA table_info('{self._entity_type}_canonical')").fetchall()
+            available = [f"{c[1]} AS {c[1]}" for c in columns]
+        return f"SELECT {', '.join(available)} FROM {self._entity_type}_canonical", available

app/tasks/analytics_worker.py CHANGED Viewed

@@ -360,31 +360,46 @@ class AnalyticsWorker:
     # ==================== SCHEMA & EMBEDDING ====================
-    async def _discover_schema(self, df: pd.DataFrame) -> Dict[str, str]:
-        """🧠 Einstein's discovery engine with caching"""
         try:
-            cache_key = f"schema:mapping:{self.org_id}"
-            if cached := event_hub.get_key(cache_key):
-                logger.info("[SCHEMA] 💾 Cache hit")
-                return json.loads(cached)
             logger.info("[SCHEMA] 🧠 Cache miss, discovering...")
             mapping = self.schema.get_mapping()
             if not mapping:
-                logger.error("[SCHEMA] Discovery returned empty")
-                return {}
-            # Cache for 24h
             event_hub.setex(cache_key, 86400, json.dumps(mapping))
-            logger.info(f"[SCHEMA] ✅ Discovered {len(mapping)} mappings")
             return mapping
         except Exception as e:
-            logger.error(f"[SCHEMA] ❌ Discovery failed: {e}", exc_info=True)
-            return {}
     def _alias_columns(self, df: pd.DataFrame, mapping: Dict[str, str]) -> pd.DataFrame:
         """🔀 Renames columns to semantic names"""

     # ==================== SCHEMA & EMBEDDING ====================
+    # app/tasks/analytics_worker.py - Replace your _discover_schema method
+    def _discover_schema(self):
+        """Schema discovery with proper caching and error handling"""
         try:
             logger.info("[SCHEMA] 🧠 Cache miss, discovering...")
+            from app.schemas.org_schema import OrgSchema
+            # Initialize schema discoverer with entity context
+            self.schema = OrgSchema(self.org_id, self._entity_type)
             mapping = self.schema.get_mapping()
             if not mapping:
+                raise ValueError("Empty mapping returned")
+            # ✅ FIX: Define cache_key BEFORE using it
+            cache_key = f"schema:{self.org_id}:{self._entity_type}:worker_cache"
+            # ✅ FIX: Save to Redis with proper TTL
             event_hub.setex(cache_key, 86400, json.dumps(mapping))
+            logger.info(f"[SCHEMA] 💾 Cached mapping for 24h: {cache_key}")
+            self._schema_cache = mapping
+            logger.info(f"[SCHEMA] ✅ Discovery complete: {len(mapping)} columns")
             return mapping
         except Exception as e:
+            logger.error(f"[SCHEMA] ❌ Discovery failed: {e}")
+            # 🚀 EMERGENCY FALLBACK: Map columns to themselves (SaaS-ready)
+            logger.warning("[SCHEMA] 🚨 Using fallback - mapping columns as-is")
+            stealth_mapping = {col: col for col in self.df.columns}
+            # ✅ Cache the fallback too
+            cache_key = f"schema:{self.org_id}:{self._entity_type}:worker_cache:fallback"
+            event_hub.setex(cache_key, 3600, json.dumps(stealth_mapping))
+            self._schema_cache = stealth_mapping
+            return stealth_mapping
     def _alias_columns(self, df: pd.DataFrame, mapping: Dict[str, str]) -> pd.DataFrame:
         """🔀 Renames columns to semantic names"""