agentic-rag / docs /Database Design.md
vksepm
TruLens Azure judges, reranked context for metrics, feedback timeout; extend API proxy timeouts
1499b8f
|
Raw
History Blame Contribute Delete
8.54 kB

Entity-Relationship Diagram (ERD) & Table Definitions

1. Entity-Relationship Diagram (Logical)

erDiagram
    USER ||--o{ USER_ROLE : "assigned"
    ROLE ||--o{ USER_ROLE : "assigned to"
    ROLE ||--o{ ROLE_DOCUMENT_ACCESS : "has access to"
    DOCUMENT ||--o{ ROLE_DOCUMENT_ACCESS : "is accessed by"
    DOCUMENT ||--o{ CHUNK : "contains"
    USER ||--o{ QUERY_LOG : "submits"
    QUERY_LOG ||--o| EVALUATION_RESULT : "evaluated by"

    USER {
        uuid id PK
        string email
        string password_hash
        string display_name
        boolean is_active
        timestamp created_at
        timestamp last_login
    }
    ROLE {
        uuid id PK
        string name "e.g., 'researcher', 'guest'"
    }
    USER_ROLE {
        uuid user_id FK
        uuid role_id FK
    }
    DOCUMENT {
        uuid id PK
        string arxiv_id "UNIQUE, from dataset"
        string title
        string source_url "Link to ArXiv"
        string full_citation
        jsonb metadata
        timestamp created_at
    }
    CHUNK {
        uuid id PK
        uuid document_id FK
        text content
        vector embedding "vector(384)"
        int chunk_index
        jsonb metadata "page/section refs"
    }
    ROLE_DOCUMENT_ACCESS {
        uuid role_id FK
        uuid document_id FK
    }
    QUERY_LOG {
        uuid id PK
        uuid user_id FK
        text query_text
        text response_text
        string model_provider
        int agent_steps
        timestamp created_at
    }
    EVALUATION_RESULT {
        uuid id PK
        uuid query_log_id FK
        float relevance_score
        float groundedness_score
        float answer_relevance_score
        string trulens_record_id "Cross-ref to TruLens DB"
        timestamp evaluated_at
    }

2. Table Definitions (SQL)

2.1 Core Tables

-- Enable pgvector extension
CREATE EXTENSION IF NOT EXISTS vector;

-- Users Table
CREATE TABLE users (
    id            UUID PRIMARY KEY DEFAULT gen_random_uuid(),
    email         TEXT UNIQUE NOT NULL,
    password_hash TEXT NOT NULL,
    display_name  TEXT,
    is_active     BOOLEAN NOT NULL DEFAULT TRUE,
    created_at    TIMESTAMP WITH TIME ZONE DEFAULT NOW(),
    last_login    TIMESTAMP WITH TIME ZONE
);

-- Roles Table
CREATE TABLE roles (
    id   UUID PRIMARY KEY DEFAULT gen_random_uuid(),
    name TEXT UNIQUE NOT NULL  -- e.g., 'researcher', 'guest'
);

-- User-Role Junction Table (role-level RBAC only β€” no per-user document grants)
CREATE TABLE user_roles (
    user_id UUID NOT NULL REFERENCES users(id) ON DELETE CASCADE,
    role_id UUID NOT NULL REFERENCES roles(id) ON DELETE CASCADE,
    PRIMARY KEY (user_id, role_id)
);

-- Documents Table
CREATE TABLE documents (
    id            UUID PRIMARY KEY DEFAULT gen_random_uuid(),
    arxiv_id      TEXT UNIQUE NOT NULL,
    title         TEXT,
    source_url    TEXT,        -- Direct link for user verification
    full_citation TEXT,        -- Bibliographic reference
    metadata      JSONB,
    created_at    TIMESTAMP WITH TIME ZONE DEFAULT NOW()
);

-- Chunks Table (each chunk belongs to exactly one document)
CREATE TABLE chunks (
    id          UUID PRIMARY KEY DEFAULT gen_random_uuid(),
    document_id UUID NOT NULL REFERENCES documents(id) ON DELETE CASCADE,
    content     TEXT NOT NULL,
    embedding   VECTOR(384),
    chunk_index INT NOT NULL,
    metadata    JSONB          -- Stores page numbers, section headers, etc.
);

-- Role-Document Access Control
CREATE TABLE role_document_access (
    role_id     UUID NOT NULL REFERENCES roles(id) ON DELETE CASCADE,
    document_id UUID NOT NULL REFERENCES documents(id) ON DELETE CASCADE,
    PRIMARY KEY (role_id, document_id)
);

2.2 Observability Tables

-- Query Log: persists every agentic query and its final response
CREATE TABLE query_logs (
    id             UUID PRIMARY KEY DEFAULT gen_random_uuid(),
    user_id        UUID REFERENCES users(id) ON DELETE SET NULL,
    query_text     TEXT NOT NULL,
    response_text  TEXT,
    model_provider TEXT,   -- 'openai' | 'azure_openai' | 'gemini'
    agent_steps    INT,    -- SmolAgents steps taken (max 5)
    created_at     TIMESTAMP WITH TIME ZONE DEFAULT NOW()
);

-- Evaluation Results: RAG Triad scores, cross-referenced to TruLens
CREATE TABLE evaluation_results (
    id                    UUID PRIMARY KEY DEFAULT gen_random_uuid(),
    query_log_id          UUID NOT NULL REFERENCES query_logs(id) ON DELETE CASCADE,
    relevance_score       NUMERIC(4,3),  -- Query vs. Reranked Context  (0–1)
    groundedness_score    NUMERIC(4,3),  -- Answer vs. Reranked Context (0–1)
    answer_relevance_score NUMERIC(4,3), -- Query vs. Final Answer      (0–1)
    trulens_record_id     TEXT,          -- Maps to TruLens Record.record_id
    evaluated_at          TIMESTAMP WITH TIME ZONE DEFAULT NOW()
);

3. Indexing Recommendations

3.1 Vector Index β€” HNSW (384-dim, Cosine Distance)

HNSW is preferred over IVFFlat: better recall at query time, no need to pre-specify cluster counts, and no index rebuild required after inserts. Parameters are tuned for 384-dimensional sentence-transformer embeddings.

-- Cosine distance is standard for sentence-transformer embeddings
CREATE INDEX idx_chunks_embedding_hnsw
ON chunks USING hnsw (embedding vector_cosine_ops)
WITH (m = 16, ef_construction = 64);

-- Tune recall vs. latency at query time (default: 40):
-- SET hnsw.ef_search = 100;
Parameter Value Rationale
m 16 Balanced graph connectivity; increase to 32 for higher recall at cost of memory
ef_construction 64 Build-time quality; increase to 128 for finer index at cost of build time
ef_search 40–100 Set per-session; 40 for <50 ms, 100 for maximum recall
Distance op vector_cosine_ops Standard for normalized sentence-transformer output

3.2 Access Control & Foreign Key Indexes

-- Speed up RBAC JOIN: "WHERE document_id IN (authorized_list)"
CREATE INDEX idx_chunks_document_id ON chunks(document_id);

-- Speed up role resolution for a given user
CREATE INDEX idx_user_roles_user_id ON user_roles(user_id);

-- Speed up document permission checks
CREATE INDEX idx_role_doc_access_doc_id ON role_document_access(document_id);

3.3 Observability Indexes

-- Time-ordered query history per user (pagination, monitoring dashboards)
CREATE INDEX idx_query_logs_user_id_created ON query_logs(user_id, created_at DESC);

-- Evaluation lookup by query
CREATE INDEX idx_eval_results_query_log_id ON evaluation_results(query_log_id);

-- TruLens cross-reference (partial β€” only rows that have been evaluated)
CREATE INDEX idx_eval_results_trulens_id
ON evaluation_results(trulens_record_id)
WHERE trulens_record_id IS NOT NULL;

4. Example Access-Aware Query with Citations

When the Agent performs a search, it retrieves source_url, title, and full_citation so it can cite its sources in the final response.

SELECT
    c.content,
    d.title,
    d.source_url,
    d.full_citation,
    c.metadata->>'page'                      AS page_number,
    c.embedding <=> '[query_vector]'::vector AS distance
FROM chunks c
JOIN documents d             ON c.document_id  = d.id
JOIN role_document_access rda ON d.id           = rda.document_id
JOIN user_roles ur            ON rda.role_id    = ur.role_id
WHERE ur.user_id = 'current_user_id'
ORDER BY distance
LIMIT 20;

5. TruLens Integration Strategy

TruLens maintains its own internal schema for detailed trace data. The evaluation_results table here serves as a lightweight application-side mirror for:

  • Querying aggregate RAG Triad scores without launching the TruLens dashboard.
  • Joining scores directly to your own users and query_logs data for reporting.
  • Setting up alerting when scores fall below SLA thresholds (relevance > 0.85, groundedness > 0.9).

The trulens_record_id column maps to TruLens's Record.record_id, enabling a full round-trip lookup when deeper trace or span inspection is needed.

Application evaluation code waits for feedback completion (TRULENS_FEEDBACK_TIMEOUT, retrieve_feedback_results in backend/app/evaluation/trulens_eval.py) before writing evaluation_results. The context embedded in the TruLens replay for Context Relevance and Groundedness is aligned with the last pgvector_retriever formatted output when available (reranked_context), not only LLM JSON.