"""Pydantic models for the per-user data catalog (Cs + Ct).
See ARCHITECTURE.md §6 for the full schema definition.

Source.location_ref URI scheme
------------------------------
A `Source` is uniquely addressable by `location_ref`; introspectors and
executors parse it to find the underlying data:

  schema sources  → "dbclient://{database_client_id}"
                    Resolves via `database_client_service.get(...)` which
                    returns a `DatabaseClient` row whose Fernet-encrypted
                    credentials are decrypted at runtime.
  tabular sources → "az_blob://{user_id}/{document_id}"
                    The Source aggregates one or more sheets as Tables;
                    each per-sheet Parquet blob is named via
                    `parquet_service.parquet_blob_name(user_id, document_id, sheet_name)`,
                    so executors derive the per-Table blob path from
                    `Source.location_ref` plus `Table.name`.
  unstructured    → reserved (deferred — see ARCHITECTURE.md §10 q2).
"""
from datetime import datetime
from typing import Any, Literal
from pydantic import BaseModel, Field
# Closed set of source kinds accepted by `Source.source_type`.
SourceType = Literal[
    "schema",
    "tabular",
    "unstructured",
]

# Closed set of logical column types accepted by `Column.data_type`.
DataType = Literal[
    "int",
    "decimal",
    "string",
    "datetime",
    "date",
    "bool",
    "json",
]
class ColumnStats(BaseModel):
    """Optional summary statistics attached to a `Column`.

    Every field defaults to ``None``, so an instance may carry any subset
    of the statistics.
    """

    # Extremes are typed `Any`: no particular value type is enforced here.
    min: Any | None = None
    max: Any | None = None

    # Numeric aggregates, stored as floats when present.
    mean: float | None = None
    median: float | None = None

    distinct_count: int | None = None

    # NOTE(review): presumably the most frequent values — confirm against
    # whatever code populates this model.
    top_values: list[Any] | None = None
class Column(BaseModel):
    """One column of a `Table`.

    `column_id`, `name`, `data_type` and `nullable` are required; the
    remaining fields are optional enrichments with defaults.
    """

    column_id: str
    name: str
    data_type: DataType  # must be one of the `DataType` literals
    nullable: bool

    # NOTE(review): presumably marks columns holding personally identifiable
    # information — confirm with the code that sets it. Off unless set.
    pii_flag: bool = False

    # Untyped example values and aggregate statistics; both may be absent.
    sample_values: list[Any] | None = None
    stats: ColumnStats | None = None
class ForeignKey(BaseModel):
    """Foreign-key edge from a column of this table to a column elsewhere.

    Every reference is a stable ID derived from source/table/column names,
    so an edge survives a rename at the `name` level. The referenced table
    must live in the SAME `Source`: cross-source FKs are not modeled in v1.
    """

    # Column in this table that holds the FK.
    column_id: str

    # `table_id` of the referenced table, within the same Source.
    target_table_id: str

    # `column_id` of the referenced column.
    target_column_id: str
class Table(BaseModel):
    """A table belonging to a `Source`, with its columns and FK edges."""

    table_id: str
    name: str

    # Optional; stays None when no row count was supplied.
    row_count: int | None = Field(default=None)

    # Required: a Table must be constructed with its column list.
    columns: list[Column]

    # Outgoing foreign-key edges; a fresh empty list per instance by default.
    foreign_keys: list[ForeignKey] = Field(default_factory=list)
class Source(BaseModel):
    """A data source in the catalog, together with the tables it exposes.

    `location_ref` is the URI that addresses the underlying data; the
    module docstring describes the scheme used for each `source_type`.
    """

    source_id: str
    source_type: SourceType  # must be one of the `SourceType` literals
    name: str
    location_ref: str
    updated_at: datetime

    # Tables under this source; a fresh empty list per instance by default.
    tables: list[Table] = Field(default_factory=list)
class Catalog(BaseModel):
    """Top-level catalog document: every `Source` known for one user."""

    user_id: str

    # Schema version string; "1.0" unless the caller overrides it.
    schema_version: str = Field(default="1.0")

    generated_at: datetime

    # Sources in this catalog; a fresh empty list per instance by default.
    sources: list[Source] = Field(default_factory=list)
|