Spaces:

rairo
/

smart-w

Running

App Files Community

rairo committed on Jul 23, 2025

Commit

0bffbcb

verified ·

1 Parent(s): f357ad5

Update utility.py

Browse files

Files changed (1) hide show

utility.py +35 -88

utility.py CHANGED Viewed

@@ -6,10 +6,11 @@ from datetime import datetime, timezone
 from typing import List, Dict, Union, Optional, Any
 from google.cloud import firestore
 import pandas as pd
-import inflect  # <-- ADDED for robust pluralization
 from pandasai import SmartDatalake
 from pandasai.responses.response_parser import ResponseParser
-from pandasai.exceptions import NoCodeFoundError # <-- ADDED for specific error handling
 from langchain_google_genai import ChatGoogleGenerativeAI
 import google.generativeai as genai
 import re
@@ -128,51 +129,6 @@ Each transaction object MUST have the following keys:
 - **Rule for Queries:** For "read" intents or general questions, set `transaction_type` to "query" and the `details` object MUST contain a single key `"query"` with the user's full, original question as the value.
 - **Rule for Multiple Items:** If the user's request contains multiple distinct transactions (e.g., recording an expense AND an asset), create a separate JSON object for each one within the main list.
 - **Rule for Expense Normalization:** For "create" intents with `transaction_type` "expense", analyze the `description`. If it contains common keywords, normalize it to a single word. For example, if the description is "paid for fuel for the delivery truck", the normalized `description` in the JSON should be "fuel". If it's "office electricity bill", normalize it to "electricity".
-**5. Examples:**
-**Example 1: Simple Query**
-- **Input:** "what are my assets?"
-- **Output:**
-  [
-    {
-      "intent": "read",
-      "transaction_type": "query",
-      "details": {
-        "query": "what are my assets?"
-      }
-    }
-  ]
-**Example 2: Creating a Normalized Expense**
-- **Input:** "I paid R250 for fuel for work"
-- **Output:**
-  [
-    {
-      "intent": "create",
-      "transaction_type": "expense",
-      "details": {
-        "description": "fuel",
-        "amount": 250,
-        "currency": "R"
-      }
-    }
-  ]
-**Example 3: Creating an Asset**
-- **Input:** "just bought a new company laptop for $1500"
-- **Output:**
-  [
-    {
-      "intent": "create",
-      "transaction_type": "asset",
-      "details": {
-        "name": "new company laptop",
-        "value": 1500,
-        "currency": "$"
-      }
-    }
-  ]
     """
     try:
         full_prompt = [system_prompt, prompt]
@@ -213,39 +169,36 @@ def add_timestamp(transaction: Dict) -> Dict:
 def _get_canonical_info(user_phone: str, item_name: str) -> Dict[str, Any]:
     """
-    Finds the canonical version of an item, handling plurals robustly.
     """
     inventory_ref = db.collection("users").document(user_phone).collection("inventory_and_services")
     name_lower = item_name.lower().strip()
-    # --- CHANGE 3: Use inflect for robust singularization ---
     singular = p.singular_noun(name_lower)
-    if not singular:  # If inflect returns False (e.g., for already singular words)
         singular = name_lower
-    plural = p.plural(singular)
-    # --- END OF CHANGE 3 ---
-    doc_singular = inventory_ref.document(singular).get()
-    doc_plural = inventory_ref.document(plural).get()
-    exists_singular = doc_singular.exists
-    exists_plural = doc_plural.exists
-    if exists_singular and exists_plural:
-        data_s = doc_singular.to_dict()
-        data_p = doc_plural.to_dict()
-        time_s = data_s.get('last_updated', '')
-        time_p = data_p.get('last_updated', '')
-        if time_p > time_s:
-            return {'doc': doc_plural, 'name': plural}
-        return {'doc': doc_singular, 'name': singular}
-    elif exists_singular:
-        return {'doc': doc_singular, 'name': singular}
-    elif exists_plural:
-        return {'doc': doc_plural, 'name': plural}
-    else:
-        return {'doc': None, 'name': singular}
 def create_or_update_inventory_or_service_offering(user_phone: str, transaction_data: List[Dict]) -> tuple[bool, str]:
@@ -300,7 +253,7 @@ def create_or_update_inventory_or_service_offering(user_phone: str, transaction_
 def create_sale(user_phone: str, transaction_data: List[Dict]) -> tuple[bool, str]:
     """
-    Process sales with user price override, name normalization, and service bypass.
     """
     feedback_messages = []
     any_success = False
@@ -332,7 +285,6 @@ def create_sale(user_phone: str, transaction_data: List[Dict]) -> tuple[bool, st
             @firestore.transactional
             def process_one_sale(transaction, sale_details):
-                # --- CHANGE 2: Implement Price Override Logic ---
                 user_price = sale_details.get('price') or sale_details.get('unit_price')
                 if user_price is not None:
@@ -343,7 +295,6 @@ def create_sale(user_phone: str, transaction_data: List[Dict]) -> tuple[bool, st
                     logger.info(f"Using last known price for '{canonical_name}': {selling_price}")
                 else:
                     return f"Sale failed for new item '{canonical_name}': You must specify a price for the first sale."
-                # --- END OF CHANGE 2 ---
                 if not isinstance(selling_price, (int, float)): selling_price = 0
@@ -518,10 +469,12 @@ def _validate_dataframe(df: pd.DataFrame) -> pd.DataFrame:
     if df.empty:
         return df
-    # 1. Validate and convert timestamp columns
     for col in ['timestamp', 'created_at', 'last_updated', 'acquisition_date', 'due_date']:
         if col in df.columns:
-            df[col] = pd.to_datetime(df[col], errors='coerce')
     # 2. Validate and convert numeric columns
     numeric_cols = ['price', 'unit_price', 'quantity', 'amount', 'value', 'cost', 'hours', 'units_available']
@@ -534,13 +487,13 @@ def _validate_dataframe(df: pd.DataFrame) -> pd.DataFrame:
         df[col] = df[col].fillna('Unknown')
     return df
 def _fetch_all_collections_as_dfs(user_phone: str) -> List[pd.DataFrame]:
     """
     Fetches all user data, splits inventory/services, validates, and returns DataFrames.
     """
-    # --- CHANGE 4 & 5: Split inventory/services and validate DataFrames ---
     collections = ['sales', 'expenses', 'assets', 'liabilities']
     all_dfs = []
@@ -560,12 +513,12 @@ def _fetch_all_collections_as_dfs(user_phone: str) -> List[pd.DataFrame]:
     if inventory_data:
         inventory_df = pd.DataFrame(inventory_data)
-        inventory_df.name = "inventory" # Name the dataframe for PandasAI
         all_dfs.append(_validate_dataframe(inventory_df))
     if services_data:
         services_df = pd.DataFrame(services_data)
-        services_df.name = "services" # Name the dataframe for PandasAI
         all_dfs.append(_validate_dataframe(services_df))
     # Handle other collections
@@ -583,18 +536,16 @@ def _fetch_all_collections_as_dfs(user_phone: str) -> List[pd.DataFrame]:
         if data:
             df = pd.DataFrame(data)
-            df.name = coll_name # Name the dataframe for PandasAI
             all_dfs.append(_validate_dataframe(df))
     return all_dfs
-    # --- END OF CHANGE 4 & 5 ---
 def read_datalake(user_phone: str, query: str) -> str:
     """
     Handles queries with temporal awareness, robust error handling, and recall logic.
     """
-    # --- CHANGE 1 & 6: Temporal Awareness and Advanced Error Handling ---
     try:
         all_dfs = _fetch_all_collections_as_dfs(user_phone)
         if not all_dfs:
@@ -605,7 +556,6 @@ def read_datalake(user_phone: str, query: str) -> str:
             "save_charts_path": user_defined_path, "enable_cache": False,
         })
-        # 1. Add temporal context to the query
         today_str = datetime.now(timezone.utc).strftime('%Y-%m-%d')
         contextual_query = (
             f"For context, today's date is {today_str}. "
@@ -614,12 +564,10 @@ def read_datalake(user_phone: str, query: str) -> str:
         logger.info(f"Contextual query for PandasAI: {contextual_query}")
         try:
-            # First attempt
             response = lake.chat(contextual_query)
             return str(response)
         except NoCodeFoundError:
             logger.warning(f"PandasAI failed on first attempt (NoCodeFoundError) for query: '{query}'. Retrying with simplification.")
-            # 6. Recall with a simplified prompt
             simplified_query = (
                 f"The previous attempt to answer the user's query failed. "
                 f"Try again with a simpler approach. Instead of complex analysis, "
@@ -636,7 +584,6 @@ def read_datalake(user_phone: str, query: str) -> str:
     except Exception as e:
         logger.error(f"Data query failed for user {user_phone}, query '{query}': {e}", exc_info=True)
         return "Sorry, I encountered an error while analyzing your data. There might be an issue with the records. Please check your recent transactions."
-    # --- END OF CHANGE 1 & 6 ---
 def _find_document_by_details(user_phone: str, collection_name: str, details: Dict) -> Optional[Any]:

 from typing import List, Dict, Union, Optional, Any
 from google.cloud import firestore
 import pandas as pd
+import inflect  # For robust pluralization
+from thefuzz import process as fuzzy_process  # For fuzzy string matching
 from pandasai import SmartDatalake
 from pandasai.responses.response_parser import ResponseParser
+from pandasai.exceptions import NoCodeFoundError # For specific error handling
 from langchain_google_genai import ChatGoogleGenerativeAI
 import google.generativeai as genai
 import re
 - **Rule for Queries:** For "read" intents or general questions, set `transaction_type` to "query" and the `details` object MUST contain a single key `"query"` with the user's full, original question as the value.
 - **Rule for Multiple Items:** If the user's request contains multiple distinct transactions (e.g., recording an expense AND an asset), create a separate JSON object for each one within the main list.
 - **Rule for Expense Normalization:** For "create" intents with `transaction_type` "expense", analyze the `description`. If it contains common keywords, normalize it to a single word. For example, if the description is "paid for fuel for the delivery truck", the normalized `description` in the JSON should be "fuel". If it's "office electricity bill", normalize it to "electricity".
     """
     try:
         full_prompt = [system_prompt, prompt]
 def _get_canonical_info(user_phone: str, item_name: str) -> Dict[str, Any]:
     """
+    Finds the canonical version of an item using fuzzy matching for existing items
+    and inflect for new ones.
     """
+    # --- CHANGE 1: Fuzzy Search and Robust Pluralization ---
     inventory_ref = db.collection("users").document(user_phone).collection("inventory_and_services")
     name_lower = item_name.lower().strip()
+    # 1. Fetch all existing item names for fuzzy matching
+    all_item_docs = list(inventory_ref.stream())
+    all_item_names = [doc.id for doc in all_item_docs]
+    if all_item_names:
+        # 2. Find the best match using fuzzy logic
+        best_match = fuzzy_process.extractOne(name_lower, all_item_names)
+        # 3. Apply a strict threshold
+        if best_match and best_match[1] >= 90:
+            matched_name = best_match[0]
+            # Find the corresponding document
+            for doc in all_item_docs:
+                if doc.id == matched_name:
+                    return {'doc': doc, 'name': matched_name}
+    # 4. If no good match is found, create a clean singular name for a new item
     singular = p.singular_noun(name_lower)
+    if not singular:
         singular = name_lower
+    return {'doc': None, 'name': singular}
+    # --- END OF CHANGE 1 ---
 def create_or_update_inventory_or_service_offering(user_phone: str, transaction_data: List[Dict]) -> tuple[bool, str]:
 def create_sale(user_phone: str, transaction_data: List[Dict]) -> tuple[bool, str]:
     """
+    Process sales with fuzzy name matching, user price override, and service bypass.
     """
     feedback_messages = []
     any_success = False
             @firestore.transactional
             def process_one_sale(transaction, sale_details):
                 user_price = sale_details.get('price') or sale_details.get('unit_price')
                 if user_price is not None:
                     logger.info(f"Using last known price for '{canonical_name}': {selling_price}")
                 else:
                     return f"Sale failed for new item '{canonical_name}': You must specify a price for the first sale."
                 if not isinstance(selling_price, (int, float)): selling_price = 0
     if df.empty:
         return df
+    # --- CHANGE 2: Robust Data Validation ---
+    # 1. Validate and convert timestamp columns to a consistent UTC format
     for col in ['timestamp', 'created_at', 'last_updated', 'acquisition_date', 'due_date']:
         if col in df.columns:
+            # The key fix: utc=True handles mixed timezone-aware/naive data
+            df[col] = pd.to_datetime(df[col], errors='coerce', utc=True)
     # 2. Validate and convert numeric columns
     numeric_cols = ['price', 'unit_price', 'quantity', 'amount', 'value', 'cost', 'hours', 'units_available']
         df[col] = df[col].fillna('Unknown')
     return df
+    # --- END OF CHANGE 2 ---
 def _fetch_all_collections_as_dfs(user_phone: str) -> List[pd.DataFrame]:
     """
     Fetches all user data, splits inventory/services, validates, and returns DataFrames.
     """
     collections = ['sales', 'expenses', 'assets', 'liabilities']
     all_dfs = []
     if inventory_data:
         inventory_df = pd.DataFrame(inventory_data)
+        inventory_df.name = "inventory"
         all_dfs.append(_validate_dataframe(inventory_df))
     if services_data:
         services_df = pd.DataFrame(services_data)
+        services_df.name = "services"
         all_dfs.append(_validate_dataframe(services_df))
     # Handle other collections
         if data:
             df = pd.DataFrame(data)
+            df.name = coll_name
             all_dfs.append(_validate_dataframe(df))
     return all_dfs
 def read_datalake(user_phone: str, query: str) -> str:
     """
     Handles queries with temporal awareness, robust error handling, and recall logic.
     """
     try:
         all_dfs = _fetch_all_collections_as_dfs(user_phone)
         if not all_dfs:
             "save_charts_path": user_defined_path, "enable_cache": False,
         })
         today_str = datetime.now(timezone.utc).strftime('%Y-%m-%d')
         contextual_query = (
             f"For context, today's date is {today_str}. "
         logger.info(f"Contextual query for PandasAI: {contextual_query}")
         try:
             response = lake.chat(contextual_query)
             return str(response)
         except NoCodeFoundError:
             logger.warning(f"PandasAI failed on first attempt (NoCodeFoundError) for query: '{query}'. Retrying with simplification.")
             simplified_query = (
                 f"The previous attempt to answer the user's query failed. "
                 f"Try again with a simpler approach. Instead of complex analysis, "
     except Exception as e:
         logger.error(f"Data query failed for user {user_phone}, query '{query}': {e}", exc_info=True)
         return "Sorry, I encountered an error while analyzing your data. There might be an issue with the records. Please check your recent transactions."
 def _find_document_by_details(user_phone: str, collection_name: str, details: Dict) -> Optional[Any]: