Agentic-Service-Data-Eyond-Catalog

Sleeping

App Files Files Community

Rifqi Hafizuddin committed on 11 days ago

Commit

2e601ec

1 Parent(s): c4754f3

[KM-564] fix render and sql error

Browse files

Files changed (3) hide show

src/catalog/render.py +6 -6
src/config/prompts/chatbot_system.md +1 -0
src/query/executor/db.py +1 -1

src/catalog/render.py CHANGED Viewed

@@ -8,10 +8,9 @@ from .models import Source
 def render_source(source: Source) -> str:
     """Render a Source as the canonical text block consumed by the planner.
-    Identifiers (source_id / table_id / column_id) are intentionally NOT
-    rendered — the LLM references things by name, and the IR resolver maps
-    names back to stable IDs before validation. This saves ~10% input tokens
-    per planner call.
     Columns show data type, sample values (or `PII (suppressed)`), and
     populated stats only (min/max suppressed for string/bool, where they're
@@ -20,6 +19,7 @@ def render_source(source: Source) -> str:
     """
     lines: list[str] = [
         f"Source: {source.name} ({source.source_type})",
         "",
         "Tables:",
     ]
@@ -33,7 +33,7 @@ def render_source(source: Source) -> str:
         rc = table.row_count
         rc_str = f" ({rc:,} rows)" if rc is not None else ""
         lines.append("")
-        lines.append(f"  Table: {table.name}{rc_str}")
         lines.append("  Columns:")
         for col in table.columns:
             samples = "PII (suppressed)" if col.pii_flag else (col.sample_values or [])
@@ -53,7 +53,7 @@ def render_source(source: Source) -> str:
                     stats_parts.append(f"top={col.stats.top_values}")
             stats_str = (", " + ", ".join(stats_parts)) if stats_parts else ""
             lines.append(
-                f"    - {col.name} [{col.data_type}]: samples={samples}{stats_str}"
             )
         if table.foreign_keys:
             lines.append("  Foreign keys:")

 def render_source(source: Source) -> str:
     """Render a Source as the canonical text block consumed by the planner.
+    Stable identifiers (source_id / table_id / column_id) are rendered
+    alongside names. The planner must copy these verbatim into the IR;
+    the IRValidator does a literal ID lookup, so anything else fails.
     Columns show data type, sample values (or `PII (suppressed)`), and
     populated stats only (min/max suppressed for string/bool, where they're
     """
     lines: list[str] = [
         f"Source: {source.name} ({source.source_type})",
+        f"Source ID: {source.source_id}",
         "",
         "Tables:",
     ]
         rc = table.row_count
         rc_str = f" ({rc:,} rows)" if rc is not None else ""
         lines.append("")
+        lines.append(f"  Table: {table.name}{rc_str} — id={table.table_id}")
         lines.append("  Columns:")
         for col in table.columns:
             samples = "PII (suppressed)" if col.pii_flag else (col.sample_values or [])
                     stats_parts.append(f"top={col.stats.top_values}")
             stats_str = (", " + ", ".join(stats_parts)) if stats_parts else ""
             lines.append(
+                f"    - {col.name} [{col.data_type}]: samples={samples}{stats_str} — id={col.column_id}"
             )
         if table.foreign_keys:
             lines.append("  Foreign keys:")

src/config/prompts/chatbot_system.md CHANGED Viewed

@@ -6,6 +6,7 @@ You are a friendly, precise data assistant for a user who has registered databas
 2. **Be concise.** Default to 1–4 sentences. Bullet lists when comparing items. A small table when more than ~5 rows of data carry the answer.
 3. **Use the user's terms when possible.** Mirror the column / table names they care about, but feel free to humanize ("revenue" instead of "total_cents", "last month" instead of "2026-04 timestamps").
 4. **Reference the source.** When you cite a number from a query result, mention the source briefly (e.g., "from your prod_db `orders` table"). When you quote a document, cite the filename and page if available.
 5. **Stream coherently.** You are streaming token-by-token; don't backtrack or self-correct mid-answer. Plan the structure mentally before the first token.
 6. **Markdown is OK** for emphasis and small tables, but avoid heavy formatting (code fences, headers) unless the question genuinely calls for it.

 2. **Be concise.** Default to 1–4 sentences. Bullet lists when comparing items. A small table when more than ~5 rows of data carry the answer.
 3. **Use the user's terms when possible.** Mirror the column / table names they care about, but feel free to humanize ("revenue" instead of "total_cents", "last month" instead of "2026-04 timestamps").
 4. **Reference the source.** When you cite a number from a query result, mention the source briefly (e.g., "from your prod_db `orders` table"). When you quote a document, cite the filename and page if available.
+   - **Never expose internal identifiers** (UUIDs, `source_id`, `client_id`, `room_id`, primary-key integers, etc.) to the user. Refer to sources by their human-readable name (database name, table name, filename). If a row has both an id and a name column, use the name.
 5. **Stream coherently.** You are streaming token-by-token; don't backtrack or self-correct mid-answer. Plan the structure mentally before the first token.
 6. **Markdown is OK** for emphasis and small tables, but avoid heavy formatting (code fences, headers) unless the question genuinely calls for it.

src/query/executor/db.py CHANGED Viewed

@@ -169,7 +169,7 @@ class DbExecutor(BaseExecutor):
             raise ValueError(
                 f"compiled SQL is not a SELECT (got {type(parsed).__name__})"
             )
-        forbidden = (exp.Insert, exp.Update, exp.Delete, exp.Drop, exp.AlterTable)
         for node in parsed.find_all(forbidden):
             raise ValueError(
                 f"compiled SQL contains forbidden DML/DDL: {type(node).__name__}"

             raise ValueError(
                 f"compiled SQL is not a SELECT (got {type(parsed).__name__})"
             )
+        forbidden = (exp.Insert, exp.Update, exp.Delete, exp.Drop, exp.Alter)
         for node in parsed.find_all(forbidden):
             raise ValueError(
                 f"compiled SQL contains forbidden DML/DDL: {type(node).__name__}"