Rifqi Hafizuddin committed on
Commit
2e601ec
·
1 Parent(s): c4754f3

[KM-564] fix render and sql error

Browse files
src/catalog/render.py CHANGED
@@ -8,10 +8,9 @@ from .models import Source
8
  def render_source(source: Source) -> str:
9
  """Render a Source as the canonical text block consumed by the planner.
10
 
11
- Identifiers (source_id / table_id / column_id) are intentionally NOT
12
- rendered — the LLM references things by name, and the IR resolver maps
13
- names back to stable IDs before validation. This saves ~10% input tokens
14
- per planner call.
15
 
16
  Columns show data type, sample values (or `PII (suppressed)`), and
17
  populated stats only (min/max suppressed for string/bool, where they're
@@ -20,6 +19,7 @@ def render_source(source: Source) -> str:
20
  """
21
  lines: list[str] = [
22
  f"Source: {source.name} ({source.source_type})",
 
23
  "",
24
  "Tables:",
25
  ]
@@ -33,7 +33,7 @@ def render_source(source: Source) -> str:
33
  rc = table.row_count
34
  rc_str = f" ({rc:,} rows)" if rc is not None else ""
35
  lines.append("")
36
- lines.append(f" Table: {table.name}{rc_str}")
37
  lines.append(" Columns:")
38
  for col in table.columns:
39
  samples = "PII (suppressed)" if col.pii_flag else (col.sample_values or [])
@@ -53,7 +53,7 @@ def render_source(source: Source) -> str:
53
  stats_parts.append(f"top={col.stats.top_values}")
54
  stats_str = (", " + ", ".join(stats_parts)) if stats_parts else ""
55
  lines.append(
56
- f" - {col.name} [{col.data_type}]: samples={samples}{stats_str}"
57
  )
58
  if table.foreign_keys:
59
  lines.append(" Foreign keys:")
 
8
  def render_source(source: Source) -> str:
9
  """Render a Source as the canonical text block consumed by the planner.
10
 
11
+ Stable identifiers (source_id / table_id / column_id) are rendered
12
+ alongside names. The planner must copy these verbatim into the IR;
13
+ the IRValidator does a literal ID lookup, so anything else fails.
 
14
 
15
  Columns show data type, sample values (or `PII (suppressed)`), and
16
  populated stats only (min/max suppressed for string/bool, where they're
 
19
  """
20
  lines: list[str] = [
21
  f"Source: {source.name} ({source.source_type})",
22
+ f"Source ID: {source.source_id}",
23
  "",
24
  "Tables:",
25
  ]
 
33
  rc = table.row_count
34
  rc_str = f" ({rc:,} rows)" if rc is not None else ""
35
  lines.append("")
36
+ lines.append(f" Table: {table.name}{rc_str} — id={table.table_id}")
37
  lines.append(" Columns:")
38
  for col in table.columns:
39
  samples = "PII (suppressed)" if col.pii_flag else (col.sample_values or [])
 
53
  stats_parts.append(f"top={col.stats.top_values}")
54
  stats_str = (", " + ", ".join(stats_parts)) if stats_parts else ""
55
  lines.append(
56
+ f" - {col.name} [{col.data_type}]: samples={samples}{stats_str} — id={col.column_id}"
57
  )
58
  if table.foreign_keys:
59
  lines.append(" Foreign keys:")
src/config/prompts/chatbot_system.md CHANGED
@@ -6,6 +6,7 @@ You are a friendly, precise data assistant for a user who has registered databas
6
  2. **Be concise.** Default to 1–4 sentences. Bullet lists when comparing items. A small table when more than ~5 rows of data carry the answer.
7
  3. **Use the user's terms when possible.** Mirror the column / table names they care about, but feel free to humanize ("revenue" instead of "total_cents", "last month" instead of "2026-04 timestamps").
8
  4. **Reference the source.** When you cite a number from a query result, mention the source briefly (e.g., "from your prod_db `orders` table"). When you quote a document, cite the filename and page if available.
 
9
  5. **Stream coherently.** You are streaming token-by-token; don't backtrack or self-correct mid-answer. Plan the structure mentally before the first token.
10
  6. **Markdown is OK** for emphasis and small tables, but avoid heavy formatting (code fences, headers) unless the question genuinely calls for it.
11
 
 
6
  2. **Be concise.** Default to 1–4 sentences. Bullet lists when comparing items. A small table when more than ~5 rows of data carry the answer.
7
  3. **Use the user's terms when possible.** Mirror the column / table names they care about, but feel free to humanize ("revenue" instead of "total_cents", "last month" instead of "2026-04 timestamps").
8
  4. **Reference the source.** When you cite a number from a query result, mention the source briefly (e.g., "from your prod_db `orders` table"). When you quote a document, cite the filename and page if available.
9
+ - **Never expose internal identifiers** (UUIDs, `source_id`, `client_id`, `room_id`, primary-key integers, etc.) to the user. Refer to sources by their human-readable name (database name, table name, filename). If a row has both an id and a name column, use the name.
10
  5. **Stream coherently.** You are streaming token-by-token; don't backtrack or self-correct mid-answer. Plan the structure mentally before the first token.
11
  6. **Markdown is OK** for emphasis and small tables, but avoid heavy formatting (code fences, headers) unless the question genuinely calls for it.
12
 
src/query/executor/db.py CHANGED
@@ -169,7 +169,7 @@ class DbExecutor(BaseExecutor):
169
  raise ValueError(
170
  f"compiled SQL is not a SELECT (got {type(parsed).__name__})"
171
  )
172
- forbidden = (exp.Insert, exp.Update, exp.Delete, exp.Drop, exp.AlterTable)
173
  for node in parsed.find_all(forbidden):
174
  raise ValueError(
175
  f"compiled SQL contains forbidden DML/DDL: {type(node).__name__}"
 
169
  raise ValueError(
170
  f"compiled SQL is not a SELECT (got {type(parsed).__name__})"
171
  )
172
+ forbidden = (exp.Insert, exp.Update, exp.Delete, exp.Drop, exp.Alter)
173
  for node in parsed.find_all(forbidden):
174
  raise ValueError(
175
  f"compiled SQL contains forbidden DML/DDL: {type(node).__name__}"