Spaces:

AshenH
/

ALM_LLM

Sleeping

App Files Files Community

AshenH committed on Oct 14

Commit

0d9239a

verified ·

1 Parent(s): 47a613b

Update tools/sql_tool.py

Browse files

Files changed (1) hide show

tools/sql_tool.py +148 -110

tools/sql_tool.py CHANGED Viewed

@@ -1,117 +1,155 @@
-# app.py
 import os
-import pandas as pd
-import gradio as gr
-from tools.sql_tool import SQLTool
-from tools.ts_preprocess import build_timeseries
-# ==========================================================
-#   CONFIG
-# ==========================================================
 DUCKDB_PATH = os.getenv("DUCKDB_PATH", "alm.duckdb")
 DEFAULT_SCHEMA = os.getenv("SQL_DEFAULT_SCHEMA", "my_db")
 DEFAULT_TABLE  = os.getenv("SQL_DEFAULT_TABLE",  "masterdataset_v")
-sql_tool = SQLTool(DUCKDB_PATH)
-INTRO = f"""
-### ALM LLM — Demo
-Connected to **DuckDB** at `{DUCKDB_PATH}` using table **{DEFAULT_SCHEMA}.{DEFAULT_TABLE}**.
-**Try:**
-- *"show me the top 10 fds by portfolio value"*
-- *"top 10 assets by portfolio value"*
-- *"sum portfolio value by currency"*
-"""
-# ==========================================================
-#   BACKEND HANDLERS
-# ==========================================================
-def run_nl(nl_query: str):
-    """Handle natural-language queries."""
-    if not nl_query or not nl_query.strip():
-        return pd.DataFrame(), "", "Please enter a query.", pd.DataFrame(), pd.DataFrame()
-    try:
-        df, sql, why = sql_tool.query_from_nl(nl_query)
-    except Exception as e:
-        return pd.DataFrame(), "", f"Error: {e}", pd.DataFrame(), pd.DataFrame()
-    try:
-        cf, gap = build_timeseries(df)
-    except Exception:
-        cf, gap = pd.DataFrame(), pd.DataFrame()
-    return df, sql.strip(), why, cf, gap
-def run_sql(sql_text: str):
-    """Handle raw SQL execution."""
-    if not sql_text or not sql_text.strip():
-        return pd.DataFrame(), "Please paste a SQL statement.", pd.DataFrame(), pd.DataFrame()
-    try:
-        df = sql_tool.run_sql(sql_text)
-    except Exception as e:
-        return pd.DataFrame(), f"Error: {e}", pd.DataFrame(), pd.DataFrame()
-    try:
-        cf, gap = build_timeseries(df)
-    except Exception:
-        cf, gap = pd.DataFrame(), pd.DataFrame()
-    return df, "OK", cf, gap
-# ==========================================================
-#   GRADIO UI
-# ==========================================================
-with gr.Blocks(title="ALM LLM") as demo:
-    gr.Markdown(INTRO)
-    # ---- Tab 1: Natural language ----
-    with gr.Tab("Ask in Natural Language"):
-        nl = gr.Textbox(
-            label="Ask a question",
-            placeholder="e.g., show me the top 10 fds by portfolio value",
-            lines=2,
-        )
-        btn = gr.Button("Run")
-        sql_out = gr.Textbox(label="Generated SQL", interactive=False)
-        why_out = gr.Textbox(label="Reasoning", interactive=False)
-        df_out  = gr.Dataframe(label="Query Result", interactive=True)
-        cf_out  = gr.Dataframe(label="Projected Cash-Flows (if applicable)", interactive=True)
-        gap_out = gr.Dataframe(label="Liquidity Gap (monthly)", interactive=True)
-        btn.click(
-            fn=run_nl,
-            inputs=[nl],
-            outputs=[df_out, sql_out, why_out, cf_out, gap_out],
-        )
-    # ---- Tab 2: Raw SQL ----
-    with gr.Tab("Run Raw SQL"):
-        sql_in = gr.Code(
-            label="SQL",
-            language="sql",
-            value=f"SELECT * FROM {DEFAULT_SCHEMA}.{DEFAULT_TABLE} LIMIT 20;",
-        )
-        btn2 = gr.Button("Execute")
-        df2   = gr.Dataframe(label="Result", interactive=True)
-        status = gr.Textbox(label="Status", interactive=False)
-        cf2    = gr.Dataframe(label="Projected Cash-Flows (if applicable)", interactive=True)
-        gap2   = gr.Dataframe(label="Liquidity Gap (monthly)", interactive=True)
-        btn2.click(
-            fn=run_sql,
-            inputs=[sql_in],
-            outputs=[df2, status, cf2, gap2],
-        )
-# ==========================================================
-#   LAUNCH
-# ==========================================================
-if __name__ == "__main__":
-    demo.launch(server_name="0.0.0.0", server_port=int(os.environ.get("PORT", 7860)))

+# tools/sql_tool.py
 import os
+import re
+from typing import Optional, Tuple
+import duckdb
+# DuckDB file path (can be overridden in Space settings)
 DUCKDB_PATH = os.getenv("DUCKDB_PATH", "alm.duckdb")
+# Default schema/table; with no overrides, queries target my_db.masterdataset_v
 DEFAULT_SCHEMA = os.getenv("SQL_DEFAULT_SCHEMA", "my_db")
 DEFAULT_TABLE  = os.getenv("SQL_DEFAULT_TABLE",  "masterdataset_v")
def _full_table(schema: Optional[str] = None, table: Optional[str] = None) -> str:
    """Return the dotted ``schema.table`` name used in generated SQL.

    Args:
        schema: Schema name; a falsy value (``None`` or ``""``) selects
            ``DEFAULT_SCHEMA``.
        table: Table name; a falsy value (``None`` or ``""``) selects
            ``DEFAULT_TABLE``.

    Returns:
        The two resolved names joined by a single dot. No quoting or
        validation is applied to either part.
    """
    resolved_schema = schema if schema else DEFAULT_SCHEMA
    resolved_table = table if table else DEFAULT_TABLE
    return "{}.{}".format(resolved_schema, resolved_table)
class SQLTool:
    """
    Minimal NL→SQL helper wired to my_db.masterdataset_v with a DuckDB runner.

    A DuckDB connection is opened when the object is constructed and kept on
    ``self.con``. Call :meth:`close` (or use the instance as a context manager)
    to release it.
    """

    # "top N" anywhere in the message sets the row limit.
    _TOP_RE = re.compile(r"\btop\s+(\d+)")
    # Currency filter like: "in lkr", "currency usd".
    _CURRENCY_RE = re.compile(r"\b(currency|in)\s+([a-z]{3})\b")
    # Segment filter like: "segment retail" or "for corporate".
    _SEGMENT_RE = re.compile(r"\b(?:segment|for)\s+([a-z0-9_\- ]+)")
    # Clauses that the parser consumes on its own (currency filter, top N).
    # When one of them follows a segment name it marks where the name ends.
    _TRAILING_CLAUSE_RE = re.compile(r"\b(?:currency|in)\s+[a-z]{3}\b|\btop\s+\d+")

    def __init__(self, db_path: Optional[str] = None):
        """Open a DuckDB connection.

        Args:
            db_path: Database file path; a falsy value selects ``DUCKDB_PATH``.
        """
        self.db_path = db_path or DUCKDB_PATH
        self.con = duckdb.connect(self.db_path)

    def close(self) -> None:
        """Close the underlying DuckDB connection."""
        self.con.close()

    def __enter__(self) -> "SQLTool":
        return self

    def __exit__(self, exc_type, exc, tb) -> None:
        self.close()

    # -------------------------
    # SQL Runner
    # -------------------------
    def run_sql(self, sql: str):
        """Execute ``sql`` on the connection and return the result as a DataFrame.

        NOTE(review): the statement is executed exactly as given, with no
        validation or restriction; callers that pass user-supplied SQL are
        responsible for deciding what is allowed.
        """
        return self.con.execute(sql).df()

    # -------------------------
    # NL → SQL
    # -------------------------
    @staticmethod
    def _has_any(txt: str, words) -> bool:
        """Return True when any of ``words`` occurs as a substring of ``txt``."""
        return any(w in txt for w in words)

    @staticmethod
    def _top_n_sql(full_table: str, product: str, n: int) -> str:
        """Build the "top N rows of one product by Portfolio_value" query."""
        return f"""
            SELECT contract_number, Portfolio_value, Interest_rate, currency, segments
            FROM {full_table}
            WHERE lower(product) = '{product}'
            ORDER BY Portfolio_value DESC
            LIMIT {n};
            """

    @classmethod
    def _extract_segment(cls, text: str) -> Optional[str]:
        """Return the segment name mentioned in lower-cased ``text``, or None.

        The name is the text following "segment" or "for", cut off at the
        first currency clause ("in usd", "currency lkr") or "top N" clause so
        that a message such as "fds for retail in usd" yields "retail" rather
        than "retail in usd".
        """
        found = cls._SEGMENT_RE.search(text)
        if not found:
            return None
        captured = found.group(1)
        trailing = cls._TRAILING_CLAUSE_RE.search(captured)
        if trailing:
            captured = captured[: trailing.start()]
        return captured.strip() or None

    def _nl_to_sql(
        self, message: str, schema: Optional[str] = None, table: Optional[str] = None
    ) -> Tuple[str, str]:
        """
        Returns (sql, rationale). Small template library covering common queries.
        Falls back to a filtered SELECT or a sample.

        Args:
            message: Free-text request; ``None`` is treated as an empty string.
            schema: Optional schema override passed to ``_full_table``.
            table: Optional table override passed to ``_full_table``.
        """
        full_table = _full_table(schema, table)
        m = (message or "").strip().lower()
        has_any = self._has_any

        # Extract "top N"
        limit = None
        m_top = self._TOP_RE.search(m)
        if m_top:
            limit = int(m_top.group(1))

        # 1) / 2) Top N FDs or Assets by Portfolio_value (FDs take precedence)
        wants_top = has_any(m, ["top", "largest", "biggest"])
        by_value = has_any(m, ["portfolio value", "portfolio_value"])
        if wants_top and by_value:
            n = limit or 10
            if has_any(m, ["fd", "fixed deposit", "deposits"]):
                why = f"Top {n} fixed deposits by Portfolio_value from {full_table}"
                return self._top_n_sql(full_table, "fd", n), why
            if has_any(m, ["asset", "loan", "advances"]):
                why = f"Top {n} assets by Portfolio_value from {full_table}"
                return self._top_n_sql(full_table, "assets", n), why

        # 3) Aggregate (SUM/AVG) by segment or currency
        if has_any(m, ["sum", "total", "avg", "average"]) and has_any(
            m, ["segment", "currency"]
        ):
            agg = "SUM" if has_any(m, ["sum", "total"]) else "AVG"
            dim = "segments" if "segment" in m else "currency"
            sql = f"""
            SELECT {dim}, {agg}(Portfolio_value) AS {agg.lower()}_Portfolio_value
            FROM {full_table}
            GROUP BY 1
            ORDER BY 2 DESC;
            """
            why = f"{agg} Portfolio_value grouped by {dim} from {full_table}"
            return sql, why

        # 4) Generic filters
        product = None
        if "fd" in m or "deposit" in m:
            product = "fd"
        elif "asset" in m or "loan" in m or "advance" in m:
            product = "assets"

        # "WHERE 1=1" lets every optional filter be appended as "AND ...".
        parts = [f"SELECT * FROM {full_table} WHERE 1=1"]
        why_parts = [f"Filtered rows from {full_table}"]
        if product:
            parts.append(f"AND lower(product) = '{product}'")
            why_parts.append(f"product = {product}")

        cur_match = self._CURRENCY_RE.search(m)
        if cur_match:
            cur = cur_match.group(2).upper()
            parts.append(f"AND upper(currency) = '{cur}'")
            why_parts.append(f"currency = {cur}")

        # The segment pattern only admits [a-z0-9_- ], so the value cannot
        # contain a quote and break out of the string literal below.
        seg = self._extract_segment(m)
        if seg:
            parts.append(f"AND lower(segments) LIKE '%{seg.lower()}%'")
            why_parts.append(f"segments like '{seg}'")

        if limit:
            parts.append(f"LIMIT {limit}")

        fallback_sql = " ".join(parts) + ";"
        fallback_why = "; ".join(why_parts)
        return fallback_sql, fallback_why

    # Public helpers
    def query_from_nl(self, message: str):
        """Translate ``message`` to SQL, run it, and return ``(df, sql, why)``."""
        sql, why = self._nl_to_sql(message)
        df = self.run_sql(sql)
        return df, sql, why

    def table_exists(self, schema: Optional[str] = None, table: Optional[str] = None) -> bool:
        """Return True when ``schema.table`` is listed in information_schema.tables.

        Falsy arguments select ``DEFAULT_SCHEMA`` / ``DEFAULT_TABLE``. The names
        are passed as bound parameters rather than interpolated into the SQL
        text, so quotes in either value cannot alter the statement.
        """
        schema = schema or DEFAULT_SCHEMA
        table = table or DEFAULT_TABLE
        q = """
        SELECT COUNT(*) AS n
        FROM information_schema.tables
        WHERE table_schema = ? AND table_name = ?;
        """
        n = self.con.execute(q, [schema, table]).fetchone()[0]
        return n > 0