File size: 2,817 Bytes
6bff5d9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
"""Pydantic models for the per-user data catalog (Cs + Ct).

See ARCHITECTURE.md §6 for the full schema definition.

Source.location_ref URI scheme
------------------------------
A `Source` is uniquely addressable by `location_ref`; introspectors and
executors parse it to find the underlying data:

  schema sources   → "dbclient://{database_client_id}"
                     Resolves via `database_client_service.get(...)` which
                     returns a `DatabaseClient` row whose Fernet-encrypted
                     credentials are decrypted at runtime.

  tabular sources  → "az_blob://{user_id}/{document_id}"
                     The Source aggregates one or more sheets as Tables;
                     each per-sheet Parquet blob is named via
                     `parquet_service.parquet_blob_name(user_id, document_id, sheet_name)`,
                     so executors derive the per-Table blob path from
                     `Source.location_ref` plus `Table.name`.

  unstructured     → reserved (deferred — see ARCHITECTURE.md §10 q2).
"""

from datetime import datetime
from typing import Any, Literal

from pydantic import BaseModel, Field

# Closed set of source kinds accepted by `Source.source_type`. The module
# docstring marks "unstructured" as reserved/deferred.
SourceType = Literal["schema", "tabular", "unstructured"]
# Closed set of logical column types accepted by `Column.data_type`.
DataType = Literal["int", "decimal", "string", "datetime", "date", "bool", "json"]


class ColumnStats(BaseModel):
    """Optional summary statistics attached to a `Column`.

    Every field is optional and defaults to `None`; nothing in this model
    requires any particular statistic to be present.
    """
    # Typed `Any` so the bounds can hold whatever value type the column has.
    min: Any | None = None
    max: Any | None = None
    # Numeric aggregates, stored as floats.
    mean: float | None = None
    median: float | None = None
    # Presumably the number of distinct values in the column — confirm
    # against the code that populates it.
    distinct_count: int | None = None
    # Presumably the most frequent values; ordering and length are not
    # specified by this model — TODO confirm with the producer.
    top_values: list[Any] | None = None


class Column(BaseModel):
    """Description of one column within a `Table`.

    `column_id`, `name`, `data_type` and `nullable` are required; the
    remaining fields have defaults.
    """
    # Identifier that `ForeignKey.column_id` / `target_column_id` refer to.
    column_id: str
    name: str
    # Must be one of the literals listed in `DataType`.
    data_type: DataType
    nullable: bool
    # Defaults to False, i.e. a column is not flagged as PII unless set.
    pii_flag: bool = False
    # Optional example values; `None` when none are recorded.
    sample_values: list[Any] | None = None
    # Optional summary statistics; `None` when none are recorded.
    stats: ColumnStats | None = None


class ForeignKey(BaseModel):
    """A FK edge from one column in this table to a column in another table.

    All references use stable IDs derived from source/table/column names so
    edges survive renames at the `name` level. The target table must belong
    to the SAME `Source` — cross-source FKs are not modeled in v1.

    All three fields are required strings. This class declares no
    validators, so the same-Source rule and the existence of the referenced
    IDs are not checked here; they must be upheld by whatever builds the
    model.
    """
    column_id: str           # the column in this table that holds the FK
    target_table_id: str     # referenced table_id, within the same Source
    target_column_id: str    # referenced column_id


class Table(BaseModel):
    """One table within a `Source`: its columns and outgoing foreign keys.

    `table_id`, `name` and `columns` are required.
    """
    # Identifier that `ForeignKey.target_table_id` refers to.
    table_id: str
    name: str
    # Optional; `None` when no row count is recorded.
    row_count: int | None = None
    columns: list[Column]
    # Defaults to a fresh empty list per instance (via `default_factory`).
    foreign_keys: list[ForeignKey] = Field(default_factory=list)


class Source(BaseModel):
    """One data source in the catalog, holding zero or more `Table`s.

    See the module docstring for the `location_ref` URI scheme used by each
    `source_type`.
    """
    source_id: str
    # Must be one of the literals listed in `SourceType`.
    source_type: SourceType
    name: str
    # URI-style locator; typed as a plain string, so its format is not
    # validated by this model.
    location_ref: str
    updated_at: datetime
    # Defaults to a fresh empty list per instance (via `default_factory`).
    tables: list[Table] = Field(default_factory=list)


class Catalog(BaseModel):
    """Top-level per-user catalog: the list of `Source`s for one user.

    `user_id` and `generated_at` are required.
    """
    user_id: str
    # Version tag for this schema; defaults to "1.0".
    schema_version: str = "1.0"
    generated_at: datetime
    # Defaults to a fresh empty list per instance (via `default_factory`).
    sources: list[Source] = Field(default_factory=list)