Rifqi Hafizuddin committed on
Commit ·
2e601ec
1
Parent(s): c4754f3
[KM-564] fix render and sql error
Browse files
src/catalog/render.py
CHANGED
|
@@ -8,10 +8,9 @@ from .models import Source
|
|
| 8 |
def render_source(source: Source) -> str:
|
| 9 |
"""Render a Source as the canonical text block consumed by the planner.
|
| 10 |
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
per planner call.
|
| 15 |
|
| 16 |
Columns show data type, sample values (or `PII (suppressed)`), and
|
| 17 |
populated stats only (min/max suppressed for string/bool, where they're
|
|
@@ -20,6 +19,7 @@ def render_source(source: Source) -> str:
|
|
| 20 |
"""
|
| 21 |
lines: list[str] = [
|
| 22 |
f"Source: {source.name} ({source.source_type})",
|
|
|
|
| 23 |
"",
|
| 24 |
"Tables:",
|
| 25 |
]
|
|
@@ -33,7 +33,7 @@ def render_source(source: Source) -> str:
|
|
| 33 |
rc = table.row_count
|
| 34 |
rc_str = f" ({rc:,} rows)" if rc is not None else ""
|
| 35 |
lines.append("")
|
| 36 |
-
lines.append(f" Table: {table.name}{rc_str}")
|
| 37 |
lines.append(" Columns:")
|
| 38 |
for col in table.columns:
|
| 39 |
samples = "PII (suppressed)" if col.pii_flag else (col.sample_values or [])
|
|
@@ -53,7 +53,7 @@ def render_source(source: Source) -> str:
|
|
| 53 |
stats_parts.append(f"top={col.stats.top_values}")
|
| 54 |
stats_str = (", " + ", ".join(stats_parts)) if stats_parts else ""
|
| 55 |
lines.append(
|
| 56 |
-
f" - {col.name} [{col.data_type}]: samples={samples}{stats_str}"
|
| 57 |
)
|
| 58 |
if table.foreign_keys:
|
| 59 |
lines.append(" Foreign keys:")
|
|
|
|
| 8 |
def render_source(source: Source) -> str:
|
| 9 |
"""Render a Source as the canonical text block consumed by the planner.
|
| 10 |
|
| 11 |
+
Stable identifiers (source_id / table_id / column_id) are rendered
|
| 12 |
+
alongside names. The planner must copy these verbatim into the IR;
|
| 13 |
+
the IRValidator does a literal ID lookup, so anything else fails.
|
|
|
|
| 14 |
|
| 15 |
Columns show data type, sample values (or `PII (suppressed)`), and
|
| 16 |
populated stats only (min/max suppressed for string/bool, where they're
|
|
|
|
| 19 |
"""
|
| 20 |
lines: list[str] = [
|
| 21 |
f"Source: {source.name} ({source.source_type})",
|
| 22 |
+
f"Source ID: {source.source_id}",
|
| 23 |
"",
|
| 24 |
"Tables:",
|
| 25 |
]
|
|
|
|
| 33 |
rc = table.row_count
|
| 34 |
rc_str = f" ({rc:,} rows)" if rc is not None else ""
|
| 35 |
lines.append("")
|
| 36 |
+
lines.append(f" Table: {table.name}{rc_str} — id={table.table_id}")
|
| 37 |
lines.append(" Columns:")
|
| 38 |
for col in table.columns:
|
| 39 |
samples = "PII (suppressed)" if col.pii_flag else (col.sample_values or [])
|
|
|
|
| 53 |
stats_parts.append(f"top={col.stats.top_values}")
|
| 54 |
stats_str = (", " + ", ".join(stats_parts)) if stats_parts else ""
|
| 55 |
lines.append(
|
| 56 |
+
f" - {col.name} [{col.data_type}]: samples={samples}{stats_str} — id={col.column_id}"
|
| 57 |
)
|
| 58 |
if table.foreign_keys:
|
| 59 |
lines.append(" Foreign keys:")
|
src/config/prompts/chatbot_system.md
CHANGED
|
@@ -6,6 +6,7 @@ You are a friendly, precise data assistant for a user who has registered databas
|
|
| 6 |
2. **Be concise.** Default to 1–4 sentences. Bullet lists when comparing items. A small table when more than ~5 rows of data carry the answer.
|
| 7 |
3. **Use the user's terms when possible.** Mirror the column / table names they care about, but feel free to humanize ("revenue" instead of "total_cents", "last month" instead of "2026-04 timestamps").
|
| 8 |
4. **Reference the source.** When you cite a number from a query result, mention the source briefly (e.g., "from your prod_db `orders` table"). When you quote a document, cite the filename and page if available.
|
|
|
|
| 9 |
5. **Stream coherently.** You are streaming token-by-token; don't backtrack or self-correct mid-answer. Plan the structure mentally before the first token.
|
| 10 |
6. **Markdown is OK** for emphasis and small tables, but avoid heavy formatting (code fences, headers) unless the question genuinely calls for it.
|
| 11 |
|
|
|
|
| 6 |
2. **Be concise.** Default to 1–4 sentences. Bullet lists when comparing items. A small table when more than ~5 rows of data carry the answer.
|
| 7 |
3. **Use the user's terms when possible.** Mirror the column / table names they care about, but feel free to humanize ("revenue" instead of "total_cents", "last month" instead of "2026-04 timestamps").
|
| 8 |
4. **Reference the source.** When you cite a number from a query result, mention the source briefly (e.g., "from your prod_db `orders` table"). When you quote a document, cite the filename and page if available.
|
| 9 |
+
- **Never expose internal identifiers** (UUIDs, `source_id`, `client_id`, `room_id`, primary-key integers, etc.) to the user. Refer to sources by their human-readable name (database name, table name, filename). If a row has both an id and a name column, use the name.
|
| 10 |
5. **Stream coherently.** You are streaming token-by-token; don't backtrack or self-correct mid-answer. Plan the structure mentally before the first token.
|
| 11 |
6. **Markdown is OK** for emphasis and small tables, but avoid heavy formatting (code fences, headers) unless the question genuinely calls for it.
|
| 12 |
|
src/query/executor/db.py
CHANGED
|
@@ -169,7 +169,7 @@ class DbExecutor(BaseExecutor):
|
|
| 169 |
raise ValueError(
|
| 170 |
f"compiled SQL is not a SELECT (got {type(parsed).__name__})"
|
| 171 |
)
|
| 172 |
-
forbidden = (exp.Insert, exp.Update, exp.Delete, exp.Drop, exp.
|
| 173 |
for node in parsed.find_all(forbidden):
|
| 174 |
raise ValueError(
|
| 175 |
f"compiled SQL contains forbidden DML/DDL: {type(node).__name__}"
|
|
|
|
| 169 |
raise ValueError(
|
| 170 |
f"compiled SQL is not a SELECT (got {type(parsed).__name__})"
|
| 171 |
)
|
| 172 |
+
forbidden = (exp.Insert, exp.Update, exp.Delete, exp.Drop, exp.Alter)
|
| 173 |
for node in parsed.find_all(forbidden):
|
| 174 |
raise ValueError(
|
| 175 |
f"compiled SQL contains forbidden DML/DDL: {type(node).__name__}"
|