| """Render `TableInfo` records into embed-ready `SchemaChunk`s. |
| |
| Per docs/02_architecture_v2.md §4: ONE chunk per table. The chunk's `text` is |
| what we embed; `metadata` carries structured fields the retriever needs at |
| filter time (db_id, table name, FK targets) without re-parsing the rendered |
| text. Business-term hints are intentionally a thin stub — populated later from |
| a glossary file once we have one (see `03_eval_methodology.md` §7). |
| """ |
|
|
| from __future__ import annotations |
|
|
| from dataclasses import dataclass, field |
| from typing import Any |
|
|
| from nl_sql.schema_index.introspector import TableInfo |
|
|
| BusinessHints = dict[str, list[str]] |
| """Map ``table_name → list[hint]``. Each hint is a short ≤80-char phrase |
| ('active customer = invoice in last 90 days') merged into the chunk text.""" |
|
|
|
|
| @dataclass(frozen=True, slots=True) |
| class SchemaChunk: |
| """One embedded record per table. |
| |
| `chunk_id` is stable across re-indexing runs (same db + table → same id), |
| so Chroma upserts replace stale chunks instead of duplicating them. |
| """ |
|
|
| chunk_id: str |
| db_id: str |
| table_name: str |
| text: str |
| fk_targets: tuple[str, ...] |
| metadata: dict[str, Any] = field(default_factory=dict) |
|
|
|
|
| def to_chunks( |
| tables: list[TableInfo], |
| db_id: str, |
| *, |
| business_hints: BusinessHints | None = None, |
| ) -> list[SchemaChunk]: |
| """Render one chunk per table. |
| |
| `business_hints` (optional) attaches 1-2 domain phrases per table — kept |
| out of the chunk if missing rather than guessed by the LLM. Same hints |
| appear verbatim in chunk text and in metadata for downstream display. |
| """ |
| hints_map = business_hints or {} |
| return [_chunk_for_table(t, db_id=db_id, hints=hints_map.get(t.name, [])) for t in tables] |
|
|
|
|
| def _chunk_for_table(table: TableInfo, *, db_id: str, hints: list[str]) -> SchemaChunk: |
| text = _render_table_text(table, hints=hints) |
| fk_targets = tuple(sorted({fk.referred_table for fk in table.foreign_keys})) |
| metadata = { |
| "db_id": db_id, |
| "table_name": table.name, |
| "row_count": table.row_count, |
| "column_count": len(table.columns), |
| "primary_key": ",".join(table.primary_key_columns), |
| "fk_targets": ",".join(fk_targets), |
| "business_hints": " | ".join(hints), |
| } |
| return SchemaChunk( |
| chunk_id=f"{db_id}::{table.name}", |
| db_id=db_id, |
| table_name=table.name, |
| text=text, |
| fk_targets=fk_targets, |
| metadata=metadata, |
| ) |
|
|
|
|
| def _render_table_text(table: TableInfo, *, hints: list[str]) -> str: |
| """Pretty multi-line description used as the embedded text body. |
| |
| Layout (stable for snapshot-style tests and for prompt rendering downstream): |
| |
| Table: <name> (rows=<n>) |
| Primary key: <cols> |
| Columns: |
| - <col>: <type> [PK] [NULL?] | nulls=<n>, distinct=<n> | samples: v1, v2, v3 |
| ... |
| Foreign keys: |
| - (<col>, ...) -> <other_table>(<col>, ...) |
| Business hints: |
| - <hint> |
| """ |
| lines: list[str] = [f"Table: {table.name} (rows={table.row_count})"] |
| if table.primary_key_columns: |
| lines.append(f"Primary key: {', '.join(table.primary_key_columns)}") |
|
|
| lines.append("Columns:") |
| for col in table.columns: |
| flags: list[str] = [] |
| if col.is_primary_key: |
| flags.append("PK") |
| flags.append("NULL" if col.nullable else "NOT NULL") |
| null_pct = _null_pct(col.null_count, table.row_count) |
| stats = f"nulls={col.null_count} ({null_pct})" if table.row_count else "empty" |
| stats += f", distinct={col.distinct_count}" |
| samples = _format_samples(col.sample_values) |
| suffix = f" | samples: {samples}" if samples else "" |
| lines.append(f" - {col.name}: {col.type} [{' '.join(flags)}] | {stats}{suffix}") |
|
|
| if table.foreign_keys: |
| lines.append("Foreign keys:") |
| for fk in table.foreign_keys: |
| local = ", ".join(fk.columns) |
| remote = ", ".join(fk.referred_columns) |
| lines.append(f" - ({local}) -> {fk.referred_table}({remote})") |
|
|
| if hints: |
| lines.append("Business hints:") |
| lines.extend(f" - {h}" for h in hints) |
|
|
| return "\n".join(lines) |
|
|
|
|
| def _null_pct(null_count: int, row_count: int) -> str: |
| if row_count <= 0: |
| return "0%" |
| pct = 100.0 * null_count / row_count |
| return f"{pct:.0f}%" if pct >= 1 else f"{pct:.1f}%" |
|
|
|
|
| def _format_samples(values: tuple[Any, ...]) -> str: |
| if not values: |
| return "" |
| rendered = [_format_one(v) for v in values] |
| return ", ".join(rendered) |
|
|
|
|
| def _format_one(value: Any) -> str: |
| if value is None: |
| return "NULL" |
| if isinstance(value, str): |
| return repr(value) |
| return str(value) |
|
|