"""Render `TableInfo` records into embed-ready `SchemaChunk`s. Per docs/02_architecture_v2.md §4: ONE chunk per table. The chunk's `text` is what we embed; `metadata` carries structured fields the retriever needs at filter time (db_id, table name, FK targets) without re-parsing the rendered text. Business-term hints are intentionally a thin stub — populated later from a glossary file once we have one (see `03_eval_methodology.md` §7). """ from __future__ import annotations from dataclasses import dataclass, field from typing import Any from nl_sql.schema_index.introspector import TableInfo BusinessHints = dict[str, list[str]] """Map ``table_name → list[hint]``. Each hint is a short ≤80-char phrase ('active customer = invoice in last 90 days') merged into the chunk text.""" @dataclass(frozen=True, slots=True) class SchemaChunk: """One embedded record per table. `chunk_id` is stable across re-indexing runs (same db + table → same id), so Chroma upserts replace stale chunks instead of duplicating them. """ chunk_id: str db_id: str table_name: str text: str fk_targets: tuple[str, ...] metadata: dict[str, Any] = field(default_factory=dict) def to_chunks( tables: list[TableInfo], db_id: str, *, business_hints: BusinessHints | None = None, ) -> list[SchemaChunk]: """Render one chunk per table. `business_hints` (optional) attaches 1-2 domain phrases per table — kept out of the chunk if missing rather than guessed by the LLM. Same hints appear verbatim in chunk text and in metadata for downstream display. """ hints_map = business_hints or {} return [_chunk_for_table(t, db_id=db_id, hints=hints_map.get(t.name, [])) for t in tables] def _chunk_for_table(table: TableInfo, *, db_id: str, hints: list[str]) -> SchemaChunk: text = _render_table_text(table, hints=hints) fk_targets = tuple(sorted({fk.referred_table for fk in table.foreign_keys})) metadata = { "db_id": db_id, "table_name": table.name, "row_count": table.row_count, "column_count": len(table.columns), "primary_key": ",".join(table.primary_key_columns), "fk_targets": ",".join(fk_targets), "business_hints": " | ".join(hints), } return SchemaChunk( chunk_id=f"{db_id}::{table.name}", db_id=db_id, table_name=table.name, text=text, fk_targets=fk_targets, metadata=metadata, ) def _render_table_text(table: TableInfo, *, hints: list[str]) -> str: """Pretty multi-line description used as the embedded text body. Layout (stable for snapshot-style tests and for prompt rendering downstream): Table: (rows=) Primary key: Columns: - : [PK] [NULL?] | nulls=, distinct= | samples: v1, v2, v3 ... Foreign keys: - (, ...) -> (, ...) Business hints: - """ lines: list[str] = [f"Table: {table.name} (rows={table.row_count})"] if table.primary_key_columns: lines.append(f"Primary key: {', '.join(table.primary_key_columns)}") lines.append("Columns:") for col in table.columns: flags: list[str] = [] if col.is_primary_key: flags.append("PK") flags.append("NULL" if col.nullable else "NOT NULL") null_pct = _null_pct(col.null_count, table.row_count) stats = f"nulls={col.null_count} ({null_pct})" if table.row_count else "empty" stats += f", distinct={col.distinct_count}" samples = _format_samples(col.sample_values) suffix = f" | samples: {samples}" if samples else "" lines.append(f" - {col.name}: {col.type} [{' '.join(flags)}] | {stats}{suffix}") if table.foreign_keys: lines.append("Foreign keys:") for fk in table.foreign_keys: local = ", ".join(fk.columns) remote = ", ".join(fk.referred_columns) lines.append(f" - ({local}) -> {fk.referred_table}({remote})") if hints: lines.append("Business hints:") lines.extend(f" - {h}" for h in hints) return "\n".join(lines) def _null_pct(null_count: int, row_count: int) -> str: if row_count <= 0: return "0%" pct = 100.0 * null_count / row_count return f"{pct:.0f}%" if pct >= 1 else f"{pct:.1f}%" def _format_samples(values: tuple[Any, ...]) -> str: if not values: return "" rendered = [_format_one(v) for v in values] return ", ".join(rendered) def _format_one(value: Any) -> str: if value is None: return "NULL" if isinstance(value, str): return repr(value) return str(value)