Spaces:

AshenH
/

ALM_LLM

Sleeping

App Files Files Community

AshenH committed on Oct 24

Commit

2a4b15b

verified ·

1 Parent(s): 2de875c

Update tools/sql_tool.py

Browse files

Files changed (1) hide show

tools/sql_tool.py +82 -228

tools/sql_tool.py CHANGED Viewed

@@ -1,236 +1,90 @@
-# tools/sql_tool.py
 import os
-import re
-from typing import Optional, Tuple, List
 import duckdb
 import pandas as pd
-# ------------------------------------------------------------
-# Connection config
-# ------------------------------------------------------------
-DUCKDB_PATH = os.getenv("DUCKDB_PATH", "alm.duckdb")
-# If you need to attach a catalog (e.g., MotherDuck), put the full ATTACH here.
-# Example:
-DUCKDB_ATTACH_SQL=ATTACH 'md:my_db' AS my_db;
-# Preferred identifiers (we will fall back automatically if they don't exist)
-PREF_CATALOG = os.getenv("SQL_DEFAULT_DB", "my_db")      # catalog (optional)
-PREF_SCHEMA  = os.getenv("SQL_DEFAULT_SCHEMA", "main")   # schema
-PREF_TABLE   = os.getenv("SQL_DEFAULT_TABLE", "masterdataset_v")  # table
-class SQLTool:
     """
-    NL→SQL helper for DuckDB with:
-      - optional pre-attach SQL (DUCKDB_ATTACH_SQL)
-      - robust table path resolution (tries 3-part → 2-part → 1-part → information_schema scan)
     """
-    def __init__(self, db_path: Optional[str] = None):
-        self.db_path = db_path or DUCKDB_PATH
-        self.con = duckdb.connect(self.db_path)
-        # Optional: run user-supplied ATTACH (safe no-op if empty)
-        if DUCKDB_ATTACH_SQL:
-            try:
-                self.con.execute(DUCKDB_ATTACH_SQL)
-            except Exception as e:
-                # Don't crash the app on attach issues; we still try local tables
-                print(f"[WARN] DUCKDB_ATTACH_SQL failed: {e}")
-        self.full_table = self._resolve_full_table(PREF_CATALOG, PREF_SCHEMA, PREF_TABLE)
-    # ------------------------------------------------------------
-    # Resolution helpers
-    # ------------------------------------------------------------
-    def _try_probe(self, path: str) -> bool:
-        """Return True if SELECT * FROM <path> LIMIT 1 succeeds."""
-        try:
-            self.con.execute(f"SELECT * FROM {path} LIMIT 1")
-            return True
-        except Exception:
-            return False
-    def _scan_information_schema(self, table_name: str) -> Optional[str]:
-        """
-        Look for <schema>.<table> (and <catalog>.<schema>.<table> if available)
-        in information_schema. Return a best guess path string or None.
-        """
-        q = """
-        SELECT table_catalog, table_schema, table_name
-        FROM information_schema.tables
-        WHERE lower(table_name) = ?
-        ORDER BY table_catalog, table_schema
-        """
-        rows = self.con.execute(q, [table_name.lower()]).fetchall()
-        if not rows:
-            return None
-        # Prefer matches in preferred schema/catalog when possible
-        # 1) exact catalog+schema
-        for cat, sch, t in rows:
-            if (cat or "").lower() == (PREF_CATALOG or "").lower() and sch.lower() == PREF_SCHEMA.lower():
-                candidate = f"{cat}.{sch}.{t}" if cat else f"{sch}.{t}"
-                if self._try_probe(candidate):
-                    return candidate
-        # 2) exact schema (2-part)
-        for cat, sch, t in rows:
-            if sch.lower() == PREF_SCHEMA.lower():
-                candidate = f"{sch}.{t}"
-                if self._try_probe(candidate):
-                    return candidate
-        # 3) first working row (prefer 3-part if catalog present)
-        for cat, sch, t in rows:
-            candidate = f"{cat}.{sch}.{t}" if cat else f"{sch}.{t}"
-            if self._try_probe(candidate):
-                return candidate
-        return None
-    def _resolve_full_table(self, catalog: Optional[str], schema: Optional[str], table: str) -> str:
-        """
-        Return a working fully qualified path for the table by trying:
-          - <catalog>.<schema>.<table> (3-part)
-          - <schema>.<table>        (2-part)
-          - <table>                  (1-part)
-          - information_schema scan (best effort)
-        """
-        candidates: List[str] = []
-        if catalog:
-            candidates.append(f"{catalog}.{schema}.{table}")
-        if schema:
-            candidates.append(f"{schema}.{table}")
-        candidates.append(table)
-        for path in candidates:
-            if self._try_probe(path):
-                print(f"[INFO] Using table path: {path}")
-                return path
-        # Fallback: scan information_schema
-        scanned = self._scan_information_schema(table)
-        if scanned:
-            print(f"[INFO] Using table path (scanned): {scanned}")
-            return scanned
-        # Last resort: keep preferred 3-part (will raise on first query)
-        fallback = f"{catalog}.{schema}.{table}" if catalog else f"{schema}.{table}"
-        print(f"[WARN] Could not resolve table path; falling back to: {fallback}")
-        return fallback
-    # ------------------------------------------------------------
-    # Run SQL directly
-    # ------------------------------------------------------------
-    def run_sql(self, sql: str) -> pd.DataFrame:
-        return self.con.execute(sql).df()
-    # ------------------------------------------------------------
-    # NL → SQL
-    # ------------------------------------------------------------
-    def _nl_to_sql(self, message: str) -> Tuple[str, str]:
-        full_table = self.full_table
-        m = (message or "").strip().lower()
-        def has_any(txt, words):
-            return any(w in txt for w in words)
-        # Extract "top N"
-        limit = None
-        m_top = re.search(r"\btop\s+(\d+)", m)
-        if m_top:
-            limit = int(m_top.group(1))
-        # 1. Top N FDs
-        if has_any(m, ["fd", "fixed deposit", "deposits"]) and has_any(
-            m, ["top", "largest", "biggest"]
-        ) and has_any(m, ["portfolio value", "portfolio_value"]):
-            n = limit or 10
-            sql = f"""
-            SELECT contract_number, Portfolio_value, Interest_rate, currency, segments
-            FROM {full_table}
-            WHERE lower(product) = 'fd'
-            ORDER BY Portfolio_value DESC
-            LIMIT {n};
-            """
-            why = f"Top {n} fixed deposits by Portfolio_value from {full_table}"
-            return sql, why
-        # 2. Top N Assets
-        if has_any(m, ["asset", "loan", "advances"]) and has_any(
-            m, ["top", "largest", "biggest"]
-        ) and has_any(m, ["portfolio value", "portfolio_value"]):
-            n = limit or 10
-            sql = f"""
-            SELECT contract_number, Portfolio_value, Interest_rate, currency, segments
-            FROM {full_table}
-            WHERE lower(product) = 'assets'
-            ORDER BY Portfolio_value DESC
-            LIMIT {n};
-            """
-            why = f"Top {n} assets by Portfolio_value from {full_table}"
-            return sql, why
-        # 3. Aggregate by segment/currency
-        if has_any(m, ["sum", "total", "avg", "average"]) and has_any(
-            m, ["segment", "currency"]
-        ):
-            agg = "SUM" if has_any(m, ["sum", "total"]) else "AVG"
-            dim = "segments" if "segment" in m else "currency"
-            sql = f"""
-            SELECT {dim}, {agg}(Portfolio_value) AS {agg.lower()}_Portfolio_value
-            FROM {full_table}
-            GROUP BY 1
-            ORDER BY 2 DESC;
-            """
-            why = f"{agg} Portfolio_value grouped by {dim} from {full_table}"
-            return sql, why
-        # 4. Generic filters
-        product = None
-        if "fd" in m or "deposit" in m:
-            product = "fd"
-        elif "asset" in m or "loan" in m or "advance" in m:
-            product = "assets"
-        parts = [f"SELECT * FROM {full_table} WHERE 1=1"]
-        why_parts = [f"Filtered rows from {full_table}"]
-        if product:
-            parts.append(f"AND lower(product) = '{product}'")
-            why_parts.append(f"product = {product}")
-        cur_match = re.search(r"\b(currency|in)\s+([a-z]{3})\b", m)
-        if cur_match:
-            cur = cur_match.group(2).upper()
-            parts.append(f"AND upper(currency) = '{cur}'")
-            why_parts.append(f"currency = {cur}")
-        seg_match = re.search(r"(segment|for)\s+([a-z0-9_\- ]+)", m)
-        if seg_match:
-            seg = seg_match.group(2).strip()
-            if seg:
-                parts.append(f"AND lower(segments) LIKE '%{seg.lower()}%'")
-                why_parts.append(f"segments like '{seg}'")
-        if limit:
-            parts.append(f"LIMIT {limit}")
-        fallback_sql = " ".join(parts) + ";"
-        fallback_why = "; ".join(why_parts)
-        return fallback_sql, fallback_why
-    # ------------------------------------------------------------
-    # Public wrappers
-    # ------------------------------------------------------------
-    def query_from_nl(self, message: str):
-        sql, why = self._nl_to_sql(message)
-        df = self.run_sql(sql)
-        return df, sql, why
-    def get_full_table_path(self) -> str:
-        return self.full_table

+from langchain_core.tools import tool
 import os
 import duckdb
 import pandas as pd
+import warnings
+# Suppress warnings that might clutter the output.
+# NOTE(review): with no category or module argument this ignores every
+# warning raised anywhere in the process, deprecation notices included —
+# consider narrowing it to the specific warnings that are actually noisy.
+warnings.filterwarnings("ignore")
+# --- Database Connection Setup ---
+def get_md_connection() -> duckdb.DuckDBPyConnection:
     """
+    Establishes a connection to MotherDuck using the MOTHERDUCK_TOKEN environment variable.
     """
+    # 1. Get the connection token
+    token = os.environ.get('MOTHERDUCK_TOKEN')
+    if not token:
+        raise ConnectionError(
+            "MOTHERDUCK_TOKEN environment variable is not set. "
+            "Please ensure it is configured in your secrets to connect to the database."
+        )
+    # 2. Connect to the MotherDuck service
+    # Note: Replace 'my_db' with your actual MotherDuck database name if necessary,
+    # otherwise it connects to the default MotherDuck endpoint.
+    conn = duckdb.connect(f'md:?motherduck_token={token}')
+    return conn
+# --- SQL Tools ---
+@tool
+def run_duckdb_query(query: str) -> str:
+    """
+    Runs a read-only SQL query against the connected MotherDuck database and returns the results as a string.
+    The query must be valid DuckDB SQL. This tool only supports SELECT queries.
+    """
+    try:
+        conn = get_md_connection()
+        # Enforce read-only constraint
+        if not query.strip().lower().startswith('select'):
+            return "Error: Only read-only SELECT queries are allowed."
+        # Execute the query and fetch the results into a pandas DataFrame
+        result_df = conn.execute(query).fetchdf()
+        if result_df.empty:
+            return "Query executed successfully, but no rows were returned."
+        # Return the DataFrame as a string
+        return result_df.to_string(index=False)
+    except ConnectionError as e:
+        return f"Connection Error: {e}"
+    except Exception as e:
+        return f"DuckDB Query Error: {e}"
+    finally:
+        # Always close the connection
+        if 'conn' in locals() and conn:
+            conn.close()
+@tool
+def get_table_schema(table_name: str = "positions") -> str:
+    """
+    Returns the schema (column names and data types) for the specified table in the MotherDuck database.
+    Defaults to the 'positions' table.
+    """
+    try:
+        conn = get_md_connection()
+        # Use PRAGMA table_info to get the schema details dynamically
+        query = f"PRAGMA table_info('{table_name}')"
+        schema_df = conn.execute(query).fetchdf()
+        if schema_df.empty:
+            return f"Error: Table '{table_name}' not found in the MotherDuck database."
+        # Format the schema into a simple string: name TYPE, name TYPE, ...
+        schema_parts = [f"{row['name']} {row['type']}" for index, row in schema_df.iterrows()]
+        return ", ".join(schema_parts)
+    except ConnectionError as e:
+        return f"Connection Error: {e}"
+    except Exception as e:
+        return f"DuckDB Schema Error: {e}"
+    finally:
+        # Always close the connection
+        if 'conn' in locals() and conn:
+            conn.close()