"""Pydantic models for the per-user data catalog (Cs + Ct).

See ARCHITECTURE.md §6 for the full schema definition.

Source.location_ref URI scheme
------------------------------
A `Source` is uniquely addressable by `location_ref`; introspectors and
executors parse it to find the underlying data:

    schema sources   → "dbclient://{database_client_id}"
                       Resolves via `database_client_service.get(...)` which
                       returns a `DatabaseClient` row whose Fernet-encrypted
                       credentials are decrypted at runtime.

    tabular sources  → "az_blob://{user_id}/{document_id}"
                       The Source aggregates one or more sheets as Tables;
                       each per-sheet Parquet blob is named via
                       `parquet_service.parquet_blob_name(user_id, document_id, sheet_name)`,
                       so executors derive the per-Table blob path from
                       `Source.location_ref` plus `Table.name`.

    unstructured     → reserved (deferred — see ARCHITECTURE.md §10 q2).
"""
|
|
| from datetime import datetime |
| from typing import Any, Literal |
|
|
| from pydantic import BaseModel, Field |
|
|
# Closed set of values accepted by `Source.source_type`.
SourceType = Literal["schema", "tabular", "unstructured"]
# Closed set of values accepted by `Column.data_type`.
DataType = Literal["int", "decimal", "string", "datetime", "date", "bool", "json"]
|
|
|
|
class ColumnStats(BaseModel):
    """Optional summary statistics attached to a `Column`.

    Every field defaults to ``None``, so an instance may be empty or only
    partially populated.
    """

    # `min` / `max` are typed `Any`: no particular value type is enforced here.
    min: Any | None = None
    max: Any | None = None
    mean: float | None = None
    median: float | None = None
    distinct_count: int | None = None
    top_values: list[Any] | None = None
|
|
|
|
class Column(BaseModel):
    """One column of a `Table`.

    `column_id`, `name`, `data_type` and `nullable` are required;
    `pii_flag` defaults to ``False`` and `sample_values` / `stats`
    default to ``None``.
    """

    column_id: str
    name: str
    # Restricted to the literals listed in `DataType`.
    data_type: DataType
    nullable: bool
    pii_flag: bool = False
    sample_values: list[Any] | None = None
    stats: ColumnStats | None = None
|
|
|
|
class ForeignKey(BaseModel):
    """A FK edge from one column in this table to a column in another table.

    All references use stable IDs derived from source/table/column names so
    edges survive renames at the `name` level. The target table must belong
    to the SAME `Source` — cross-source FKs are not modeled in v1.
    """

    # NOTE(review): the same-Source constraint described above is not
    # enforced by this model (no validator is defined here); presumably the
    # code that builds the catalog guarantees it — confirm.
    column_id: str
    target_table_id: str
    target_column_id: str
|
|
|
|
class Table(BaseModel):
    """A table within a `Source`: its columns plus outgoing foreign keys.

    `row_count` defaults to ``None``; `columns` is required.
    """

    table_id: str
    name: str
    row_count: int | None = None
    columns: list[Column]
    # default_factory gives every instance its own empty list.
    foreign_keys: list[ForeignKey] = Field(default_factory=list)
|
|
|
|
class Source(BaseModel):
    """A data source in the catalog, holding zero or more `Table` entries.

    `location_ref` is the URI described in the module docstring
    ("Source.location_ref URI scheme"). This model stores it as a plain
    string and does not validate the scheme.
    """

    source_id: str
    # Restricted to the literals listed in `SourceType`.
    source_type: SourceType
    name: str
    location_ref: str
    updated_at: datetime
    # default_factory gives every instance its own empty list.
    tables: list[Table] = Field(default_factory=list)
|
|
|
|
class Catalog(BaseModel):
    """Top-level catalog document for a single user.

    `schema_version` defaults to ``"1.0"``; `sources` defaults to an empty
    list.
    """

    user_id: str
    schema_version: str = "1.0"
    generated_at: datetime
    # default_factory gives every instance its own empty list.
    sources: list[Source] = Field(default_factory=list)
|
|