Spaces:

build-small-hackathon
/

MyPal

Sleeping

File size: 10,792 Bytes

-- 9XAIPal Database Schema

CREATE EXTENSION IF NOT EXISTS vector;
CREATE EXTENSION IF NOT EXISTS "uuid-ossp";

-- Documents table
CREATE TABLE IF NOT EXISTS documents (
    id UUID PRIMARY KEY DEFAULT uuid_generate_v4(),
    filename TEXT NOT NULL,
    original_filename TEXT NOT NULL,
    file_size_bytes BIGINT,
    page_count INTEGER,
    status TEXT NOT NULL DEFAULT 'queued',
    error_message TEXT,
    created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
    updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),

    -- LLM-corrected reading order for complex layouts (two-column papers, etc.)
    -- Stores an array of original sequence_ids in the correct logical reading order.
    reading_order JSONB,
    reading_order_model TEXT,
    reading_order_updated_at TIMESTAMPTZ,

    -- Which extractor produced this document's chunks ("mineru" or "pymupdf_fallback").
    -- Surfaced in the UI so users can see whether they got high-fidelity MinerU
    -- output (typed equations, page_footnotes, table structure) or the degraded
    -- text-only fallback.
    extractor TEXT,

    -- Whether this document is a "book" (chapter-by-chapter reading navigation)
    -- or a "paper" (linear reading). Chosen by the user at upload time.
    doc_kind TEXT NOT NULL DEFAULT 'paper'
);

COMMENT ON COLUMN documents.reading_order IS 'Array of original chunk sequence_ids in LLM-corrected logical reading order. Used to fix two-column and complex layout extraction issues.';

-- Persistent raw PDF storage.
-- The HF Space container disk is ephemeral, so the original uploaded bytes are
-- kept in Postgres (external Supabase) alongside metadata. The filesystem copy
-- in assets/ is used as a fast local cache when present.
CREATE TABLE IF NOT EXISTS document_raw_files (
    document_id UUID PRIMARY KEY REFERENCES documents(id) ON DELETE CASCADE,
    content BYTEA NOT NULL,
    created_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
);

-- Chunks table with physical ordering
CREATE TABLE IF NOT EXISTS chunks (
    id UUID PRIMARY KEY DEFAULT uuid_generate_v4(),
    document_id UUID NOT NULL REFERENCES documents(id) ON DELETE CASCADE,
    sequence_id INTEGER NOT NULL,
    parent_sequence_id INTEGER,
    chunk_type TEXT NOT NULL DEFAULT 'text',
    heading_path TEXT[],
    markdown TEXT NOT NULL,
    plain_text TEXT NOT NULL,
    page_start INTEGER,
    page_end INTEGER,
    bbox_json JSONB,
    token_count INTEGER,
    created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
    UNIQUE(document_id, sequence_id)
);

CREATE INDEX IF NOT EXISTS idx_chunks_document_sequence
    ON chunks(document_id, sequence_id);

-- Chunk embeddings with pgvector
CREATE TABLE IF NOT EXISTS chunk_embeddings (
    chunk_id UUID PRIMARY KEY REFERENCES chunks(id) ON DELETE CASCADE,
    embedding vector(1024) NOT NULL,  -- dimension is substituted from VECTOR_DIMENSION at migration time
    embedding_model TEXT NOT NULL,
    created_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
);

-- Chunk assets (images, figures, tables)
CREATE TABLE IF NOT EXISTS chunk_assets (
    id UUID PRIMARY KEY DEFAULT uuid_generate_v4(),
    chunk_id UUID NOT NULL REFERENCES chunks(id) ON DELETE CASCADE,
    asset_type TEXT NOT NULL,
    file_path TEXT NOT NULL,
    mime_type TEXT,
    width INTEGER,
    height INTEGER,
    caption TEXT,
    created_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
);

CREATE INDEX IF NOT EXISTS idx_chunk_assets_chunk_id ON chunk_assets(chunk_id);

-- Persistent extracted image/figure storage.
-- The HF Space container disk is ephemeral, so extracted figure images are kept
-- in Postgres (external Supabase) keyed by their chunk_assets row. The filesystem
-- copy under images/ is used as a fast local cache when present.
CREATE TABLE IF NOT EXISTS chunk_asset_files (
    asset_id UUID PRIMARY KEY REFERENCES chunk_assets(id) ON DELETE CASCADE,
    content BYTEA NOT NULL,
    created_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
);

-- Conversation turns
-- Supports the nested sub-thread feature (tangents without polluting the main paper chat).
-- Main linear chat turns have parent_turn_id IS NULL.
-- Sub-thread turns have parent_turn_id pointing to their parent turn in the tree
-- (the branching user turn for the first continuation, or the previous turn for follow-ups).
CREATE TABLE IF NOT EXISTS conversation_turns (
    id UUID PRIMARY KEY DEFAULT uuid_generate_v4(),
    conversation_id UUID NOT NULL,
    document_id UUID REFERENCES documents(id) ON DELETE SET NULL,
    role TEXT NOT NULL,
    content TEXT NOT NULL,
    context_type TEXT,
    router_reason TEXT,
    model TEXT,
    citations JSONB,
    created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),

    -- NULL for all turns that belong to the main linear chat for a conversation_id.
    -- Non-NULL points to the parent turn this message is a reply to (supports
    -- arbitrary-depth nesting of tangents). The root of a sub-thread is the
    -- original user message that started the tangent (even though that user
    -- message itself has parent_turn_id = NULL so it stays visible in main chat).
    parent_turn_id UUID REFERENCES conversation_turns(id) ON DELETE CASCADE
);

CREATE INDEX IF NOT EXISTS idx_conversation_turns_conversation
    ON conversation_turns(conversation_id, created_at);

-- Fast lookup for "does this turn have any children in a sub-thread?"
-- and for recursive subtree loading.
CREATE INDEX IF NOT EXISTS idx_conversation_turns_parent
    ON conversation_turns(parent_turn_id);

-- Ask traces for debugging
CREATE TABLE IF NOT EXISTS ask_traces (
    id UUID PRIMARY KEY DEFAULT uuid_generate_v4(),
    conversation_turn_id UUID REFERENCES conversation_turns(id) ON DELETE CASCADE,
    context_type TEXT NOT NULL,
    router_reason TEXT,
    retrieved_chunk_ids UUID[],
    model TEXT,
    prompt_tokens INTEGER,
    completion_tokens INTEGER,
    latency_ms INTEGER,
    created_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
);

-- Ingestion jobs
CREATE TABLE IF NOT EXISTS ingestion_jobs (
    id UUID PRIMARY KEY DEFAULT uuid_generate_v4(),
    document_id UUID NOT NULL REFERENCES documents(id) ON DELETE CASCADE,
    status TEXT NOT NULL DEFAULT 'queued',
    error_message TEXT,
    started_at TIMESTAMPTZ,
    completed_at TIMESTAMPTZ,
    created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),

    -- Celery task id of the currently running step, so the watchdog can
    -- force-terminate a stuck task and re-dispatch the next attempt.
    celery_task_id TEXT,

    -- Number of times the watchdog has recovered this job. Prevents infinite
    -- recovery loops; after the cap the job is marked failed.
    recovery_attempts INTEGER NOT NULL DEFAULT 0
);

CREATE INDEX IF NOT EXISTS idx_ingestion_jobs_status ON ingestion_jobs(status);

-- ============================================================================
-- Section Summaries: Pre-computed hierarchical overviews for high-quality
-- "What is this paper about?" / "Summarize the paper" experiences.
--
-- Design goals (personal use, quality-first):
--   * Separate table (does NOT pollute the source-of-truth `chunks` table)
--   * Stores rich attribution (source_chunk_ids) so answers can cite original
--     sequence_ids / pages even when using the overview path.
--   * Model + prompt_hash for future invalidation / regeneration when you
--     change models or improve prompts.
--   * Supports both per-section (H1/H2) and whole-paper executive summary.
-- ============================================================================

CREATE TABLE IF NOT EXISTS section_summaries (
    id UUID PRIMARY KEY DEFAULT uuid_generate_v4(),
    document_id UUID NOT NULL REFERENCES documents(id) ON DELETE CASCADE,

    -- Stable identifier for the section within the document (e.g. "h1-03-introduction")
    section_id TEXT NOT NULL,

    -- 0 = whole-paper executive summary, 1 = H1, 2 = H2
    level INTEGER NOT NULL CHECK (level IN (0, 1, 2)),

    -- Full heading path at the time of summarization (e.g. ["Introduction", "Motivation"])
    heading_path TEXT[] NOT NULL,

    -- Inclusive range of source sequence_ids that contributed to this summary
    sequence_start INTEGER,
    sequence_end INTEGER,

    summary_markdown TEXT NOT NULL,
    summary_plain TEXT NOT NULL,

    -- Strong grounding: the exact chunk IDs whose content was fed to the LLM
    source_chunk_ids UUID[] NOT NULL,

    model TEXT NOT NULL,
    prompt_hash TEXT NOT NULL,           -- hash of the prompt template + version

    created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),

    UNIQUE (document_id, section_id, model)
);

-- ============================================================================
-- Rich Extraction Enhancements (Quality-First for Tables, Figures & Architectures)
--
-- These additions support much deeper interaction with complex paper elements.
-- The author accepts heavy processing at ingestion time for superior chat quality.
-- ============================================================================

-- Add structured table data to existing chunks (only populated for table-type chunks)
-- This allows the model to query tables intelligently ("what was the F1 score for the 7B variant in Table 4?")
ALTER TABLE chunks
    ADD COLUMN IF NOT EXISTS table_json JSONB;

-- Figure / Diagram / Architecture Descriptions
-- Generated at ingestion time using VLM (gemma4:26b vision or equivalent).
-- Stored separately so they can be retrieved by GLOBAL search, OVERVIEW, or targeted tools,
-- while the original image remains in chunk_assets.
CREATE TABLE IF NOT EXISTS figure_descriptions (
    id UUID PRIMARY KEY DEFAULT uuid_generate_v4(),
    document_id UUID NOT NULL REFERENCES documents(id) ON DELETE CASCADE,
    chunk_id UUID NOT NULL REFERENCES chunks(id) ON DELETE CASCADE,

    -- Original image reference
    image_path TEXT NOT NULL,           -- relative path under images/ or original MinerU name

    -- Rich VLM-generated description (technical, precise, good for architectures)
    description_markdown TEXT NOT NULL,
    description_plain TEXT NOT NULL,

    -- Attribution for grounding/citations
    source_sequence_start INTEGER,
    source_sequence_end INTEGER,
    referenced_by_chunk_ids UUID[],     -- text chunks that mention this figure

    model TEXT NOT NULL,
    prompt_hash TEXT NOT NULL,

    created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),

    UNIQUE (chunk_id, model)
);

CREATE INDEX IF NOT EXISTS idx_figure_descriptions_document
    ON figure_descriptions(document_id, created_at DESC);

CREATE INDEX IF NOT EXISTS idx_figure_descriptions_chunk
    ON figure_descriptions(chunk_id);
CREATE INDEX IF NOT EXISTS idx_section_summaries_document
    ON section_summaries(document_id, level, sequence_start);

CREATE INDEX IF NOT EXISTS idx_section_summaries_document_created
    ON section_summaries(document_id, created_at DESC);