-- 9XAIPal Database Schema CREATE EXTENSION IF NOT EXISTS vector; CREATE EXTENSION IF NOT EXISTS "uuid-ossp"; -- Documents table CREATE TABLE IF NOT EXISTS documents ( id UUID PRIMARY KEY DEFAULT uuid_generate_v4(), filename TEXT NOT NULL, original_filename TEXT NOT NULL, file_size_bytes BIGINT, page_count INTEGER, status TEXT NOT NULL DEFAULT 'queued', error_message TEXT, created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(), updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW(), -- LLM-corrected reading order for complex layouts (two-column papers, etc.) -- Stores an array of original sequence_ids in the correct logical reading order. reading_order JSONB, reading_order_model TEXT, reading_order_updated_at TIMESTAMPTZ, -- Which extractor produced this document's chunks ("mineru" or "pymupdf_fallback"). -- Surfaced in the UI so users can see whether they got high-fidelity MinerU -- output (typed equations, page_footnotes, table structure) or the degraded -- text-only fallback. extractor TEXT, -- Whether this document is a "book" (chapter-by-chapter reading navigation) -- or a "paper" (linear reading). Chosen by the user at upload time. doc_kind TEXT NOT NULL DEFAULT 'paper' ); COMMENT ON COLUMN documents.reading_order IS 'Array of original chunk sequence_ids in LLM-corrected logical reading order. Used to fix two-column and complex layout extraction issues.'; -- Persistent raw PDF storage. -- The HF Space container disk is ephemeral, so the original uploaded bytes are -- kept in Postgres (external Supabase) alongside metadata. The filesystem copy -- in assets/ is used as a fast local cache when present. CREATE TABLE IF NOT EXISTS document_raw_files ( document_id UUID PRIMARY KEY REFERENCES documents(id) ON DELETE CASCADE, content BYTEA NOT NULL, created_at TIMESTAMPTZ NOT NULL DEFAULT NOW() ); -- Chunks table with physical ordering CREATE TABLE IF NOT EXISTS chunks ( id UUID PRIMARY KEY DEFAULT uuid_generate_v4(), document_id UUID NOT NULL REFERENCES documents(id) ON DELETE CASCADE, sequence_id INTEGER NOT NULL, parent_sequence_id INTEGER, chunk_type TEXT NOT NULL DEFAULT 'text', heading_path TEXT[], markdown TEXT NOT NULL, plain_text TEXT NOT NULL, page_start INTEGER, page_end INTEGER, bbox_json JSONB, token_count INTEGER, created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(), UNIQUE(document_id, sequence_id) ); CREATE INDEX IF NOT EXISTS idx_chunks_document_sequence ON chunks(document_id, sequence_id); -- Chunk embeddings with pgvector CREATE TABLE IF NOT EXISTS chunk_embeddings ( chunk_id UUID PRIMARY KEY REFERENCES chunks(id) ON DELETE CASCADE, embedding vector(1024) NOT NULL, -- dimension is substituted from VECTOR_DIMENSION at migration time embedding_model TEXT NOT NULL, created_at TIMESTAMPTZ NOT NULL DEFAULT NOW() ); -- Chunk assets (images, figures, tables) CREATE TABLE IF NOT EXISTS chunk_assets ( id UUID PRIMARY KEY DEFAULT uuid_generate_v4(), chunk_id UUID NOT NULL REFERENCES chunks(id) ON DELETE CASCADE, asset_type TEXT NOT NULL, file_path TEXT NOT NULL, mime_type TEXT, width INTEGER, height INTEGER, caption TEXT, created_at TIMESTAMPTZ NOT NULL DEFAULT NOW() ); CREATE INDEX IF NOT EXISTS idx_chunk_assets_chunk_id ON chunk_assets(chunk_id); -- Persistent extracted image/figure storage. -- The HF Space container disk is ephemeral, so extracted figure images are kept -- in Postgres (external Supabase) keyed by their chunk_assets row. The filesystem -- copy under images/ is used as a fast local cache when present. CREATE TABLE IF NOT EXISTS chunk_asset_files ( asset_id UUID PRIMARY KEY REFERENCES chunk_assets(id) ON DELETE CASCADE, content BYTEA NOT NULL, created_at TIMESTAMPTZ NOT NULL DEFAULT NOW() ); -- Conversation turns -- Supports the nested sub-thread feature (tangents without polluting the main paper chat). -- Main linear chat turns have parent_turn_id IS NULL. -- Sub-thread turns have parent_turn_id pointing to their parent turn in the tree -- (the branching user turn for the first continuation, or the previous turn for follow-ups). CREATE TABLE IF NOT EXISTS conversation_turns ( id UUID PRIMARY KEY DEFAULT uuid_generate_v4(), conversation_id UUID NOT NULL, document_id UUID REFERENCES documents(id) ON DELETE SET NULL, role TEXT NOT NULL, content TEXT NOT NULL, context_type TEXT, router_reason TEXT, model TEXT, citations JSONB, created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(), -- NULL for all turns that belong to the main linear chat for a conversation_id. -- Non-NULL points to the parent turn this message is a reply to (supports -- arbitrary-depth nesting of tangents). The root of a sub-thread is the -- original user message that started the tangent (even though that user -- message itself has parent_turn_id = NULL so it stays visible in main chat). parent_turn_id UUID REFERENCES conversation_turns(id) ON DELETE CASCADE ); CREATE INDEX IF NOT EXISTS idx_conversation_turns_conversation ON conversation_turns(conversation_id, created_at); -- Fast lookup for "does this turn have any children in a sub-thread?" -- and for recursive subtree loading. CREATE INDEX IF NOT EXISTS idx_conversation_turns_parent ON conversation_turns(parent_turn_id); -- Ask traces for debugging CREATE TABLE IF NOT EXISTS ask_traces ( id UUID PRIMARY KEY DEFAULT uuid_generate_v4(), conversation_turn_id UUID REFERENCES conversation_turns(id) ON DELETE CASCADE, context_type TEXT NOT NULL, router_reason TEXT, retrieved_chunk_ids UUID[], model TEXT, prompt_tokens INTEGER, completion_tokens INTEGER, latency_ms INTEGER, created_at TIMESTAMPTZ NOT NULL DEFAULT NOW() ); -- Ingestion jobs CREATE TABLE IF NOT EXISTS ingestion_jobs ( id UUID PRIMARY KEY DEFAULT uuid_generate_v4(), document_id UUID NOT NULL REFERENCES documents(id) ON DELETE CASCADE, status TEXT NOT NULL DEFAULT 'queued', error_message TEXT, started_at TIMESTAMPTZ, completed_at TIMESTAMPTZ, created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(), -- Celery task id of the currently running step, so the watchdog can -- force-terminate a stuck task and re-dispatch the next attempt. celery_task_id TEXT, -- Number of times the watchdog has recovered this job. Prevents infinite -- recovery loops; after the cap the job is marked failed. recovery_attempts INTEGER NOT NULL DEFAULT 0 ); CREATE INDEX IF NOT EXISTS idx_ingestion_jobs_status ON ingestion_jobs(status); -- ============================================================================ -- Section Summaries: Pre-computed hierarchical overviews for high-quality -- "What is this paper about?" / "Summarize the paper" experiences. -- -- Design goals (personal use, quality-first): -- * Separate table (does NOT pollute the source-of-truth `chunks` table) -- * Stores rich attribution (source_chunk_ids) so answers can cite original -- sequence_ids / pages even when using the overview path. -- * Model + prompt_hash for future invalidation / regeneration when you -- change models or improve prompts. -- * Supports both per-section (H1/H2) and whole-paper executive summary. -- ============================================================================ CREATE TABLE IF NOT EXISTS section_summaries ( id UUID PRIMARY KEY DEFAULT uuid_generate_v4(), document_id UUID NOT NULL REFERENCES documents(id) ON DELETE CASCADE, -- Stable identifier for the section within the document (e.g. "h1-03-introduction") section_id TEXT NOT NULL, -- 0 = whole-paper executive summary, 1 = H1, 2 = H2 level INTEGER NOT NULL CHECK (level IN (0, 1, 2)), -- Full heading path at the time of summarization (e.g. ["Introduction", "Motivation"]) heading_path TEXT[] NOT NULL, -- Inclusive range of source sequence_ids that contributed to this summary sequence_start INTEGER, sequence_end INTEGER, summary_markdown TEXT NOT NULL, summary_plain TEXT NOT NULL, -- Strong grounding: the exact chunk IDs whose content was fed to the LLM source_chunk_ids UUID[] NOT NULL, model TEXT NOT NULL, prompt_hash TEXT NOT NULL, -- hash of the prompt template + version created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(), UNIQUE (document_id, section_id, model) ); -- ============================================================================ -- Rich Extraction Enhancements (Quality-First for Tables, Figures & Architectures) -- -- These additions support much deeper interaction with complex paper elements. -- The author accepts heavy processing at ingestion time for superior chat quality. -- ============================================================================ -- Add structured table data to existing chunks (only populated for table-type chunks) -- This allows the model to query tables intelligently ("what was the F1 score for the 7B variant in Table 4?") ALTER TABLE chunks ADD COLUMN IF NOT EXISTS table_json JSONB; -- Figure / Diagram / Architecture Descriptions -- Generated at ingestion time using VLM (gemma4:26b vision or equivalent). -- Stored separately so they can be retrieved by GLOBAL search, OVERVIEW, or targeted tools, -- while the original image remains in chunk_assets. CREATE TABLE IF NOT EXISTS figure_descriptions ( id UUID PRIMARY KEY DEFAULT uuid_generate_v4(), document_id UUID NOT NULL REFERENCES documents(id) ON DELETE CASCADE, chunk_id UUID NOT NULL REFERENCES chunks(id) ON DELETE CASCADE, -- Original image reference image_path TEXT NOT NULL, -- relative path under images/ or original MinerU name -- Rich VLM-generated description (technical, precise, good for architectures) description_markdown TEXT NOT NULL, description_plain TEXT NOT NULL, -- Attribution for grounding/citations source_sequence_start INTEGER, source_sequence_end INTEGER, referenced_by_chunk_ids UUID[], -- text chunks that mention this figure model TEXT NOT NULL, prompt_hash TEXT NOT NULL, created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(), UNIQUE (chunk_id, model) ); CREATE INDEX IF NOT EXISTS idx_figure_descriptions_document ON figure_descriptions(document_id, created_at DESC); CREATE INDEX IF NOT EXISTS idx_figure_descriptions_chunk ON figure_descriptions(chunk_id); CREATE INDEX IF NOT EXISTS idx_section_summaries_document ON section_summaries(document_id, level, sequence_start); CREATE INDEX IF NOT EXISTS idx_section_summaries_document_created ON section_summaries(document_id, created_at DESC);