File size: 3,165 Bytes
6bff5d9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
"""Render a `Source` into the canonical text block consumed by the planner."""

from __future__ import annotations

from .models import Source


def _render_column_stats(col) -> list[str]:
    """Return the ``key=value`` stat fragments to render for one column.

    Only populated stats are emitted, in the fixed order
    min, max, mean, median, distinct, top.

    For PII-flagged columns every value-derived stat is withheld and only
    the distinct count is kept: min/max/median/top values are (or can be)
    verbatim cell contents, so rendering them would defeat the
    ``PII (suppressed)`` marker shown in place of the samples.
    """
    stats = col.stats
    if not stats:
        return []

    parts: list[str] = []
    if not col.pii_flag:
        if stats.min is not None:
            parts.append(f"min={stats.min}")
        if stats.max is not None:
            parts.append(f"max={stats.max}")
        if stats.mean is not None:
            parts.append(f"mean={stats.mean:.4g}")
        if stats.median is not None:
            parts.append(f"median={stats.median:.4g}")
    # Cardinality carries no raw values, so it is safe for PII columns too.
    if stats.distinct_count is not None:
        parts.append(f"distinct={stats.distinct_count}")
    if stats.top_values and not col.pii_flag:
        parts.append(f"top={stats.top_values}")
    return parts


def render_source(source: Source) -> str:
    """Render a Source as the canonical text block consumed by the planner.

    Stable identifiers (source_id / table_id / column_id) are rendered
    alongside names. The planner must copy these verbatim into the IR;
    the IRValidator does a literal ID lookup, so anything else fails.

    Each column line shows the data type, sample values and whichever stats
    are populated (a stat that is ``None`` is omitted; this function does no
    type-based filtering itself -- NOTE(review): min/max for string/bool
    columns are presumably left unset upstream; confirm). Top values are
    listed when available.

    PII-flagged columns show ``PII (suppressed)`` instead of samples, and
    of their stats only the distinct count is rendered, so no raw cell
    value reaches the planner text.

    Foreign keys are resolved to ``column -> table.column`` names; an ID
    that cannot be resolved within this source is rendered as the raw ID.

    Args:
        source: The source whose tables, columns and foreign keys to render.

    Returns:
        The rendered block as a single newline-joined string (no trailing
        newline).
    """
    lines: list[str] = [
        f"Source: {source.name} ({source.source_type})",
        f"Source ID: {source.source_id}",
        "",
        "Tables:",
    ]

    # Lookup maps for resolving foreign-key targets across tables.
    tables_by_id = {t.table_id: t for t in source.tables}
    col_names_by_id = {
        t.table_id: {c.column_id: c.name for c in t.columns} for t in source.tables
    }

    for table in source.tables:
        rc = table.row_count
        rc_str = f" ({rc:,} rows)" if rc is not None else ""
        lines.append("")
        lines.append(f"  Table: {table.name}{rc_str} — id={table.table_id}")
        lines.append("  Columns:")
        for col in table.columns:
            samples = "PII (suppressed)" if col.pii_flag else (col.sample_values or [])
            stats_parts = _render_column_stats(col)
            stats_str = (", " + ", ".join(stats_parts)) if stats_parts else ""
            lines.append(
                f"    - {col.name} [{col.data_type}]: samples={samples}{stats_str} — id={col.column_id}"
            )
        if table.foreign_keys:
            lines.append("  Foreign keys:")
            cols_in_this_table = {c.column_id: c.name for c in table.columns}
            for fk in table.foreign_keys:
                # Fall back to the raw ID whenever a name cannot be resolved,
                # so a dangling reference is still visible in the output.
                src_col_name = cols_in_this_table.get(fk.column_id, fk.column_id)
                tgt_table = tables_by_id.get(fk.target_table_id)
                tgt_table_name = tgt_table.name if tgt_table else fk.target_table_id
                tgt_col_name = col_names_by_id.get(fk.target_table_id, {}).get(
                    fk.target_column_id, fk.target_column_id
                )
                lines.append(f"    - {src_col_name} -> {tgt_table_name}.{tgt_col_name}")
    return "\n".join(lines)