Spaces:
Sleeping
Sleeping
| -- 9XAIPal Database Schema | |
| CREATE EXTENSION IF NOT EXISTS vector; | |
| CREATE EXTENSION IF NOT EXISTS "uuid-ossp"; | |
| -- Documents table | |
| CREATE TABLE IF NOT EXISTS documents ( | |
| id UUID PRIMARY KEY DEFAULT uuid_generate_v4(), | |
| filename TEXT NOT NULL, | |
| original_filename TEXT NOT NULL, | |
| file_size_bytes BIGINT, | |
| page_count INTEGER, | |
| status TEXT NOT NULL DEFAULT 'queued', | |
| error_message TEXT, | |
| created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(), | |
| updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW(), | |
| -- LLM-corrected reading order for complex layouts (two-column papers, etc.) | |
| -- Stores an array of original sequence_ids in the correct logical reading order. | |
| reading_order JSONB, | |
| reading_order_model TEXT, | |
| reading_order_updated_at TIMESTAMPTZ, | |
| -- Which extractor produced this document's chunks ("mineru" or "pymupdf_fallback"). | |
| -- Surfaced in the UI so users can see whether they got high-fidelity MinerU | |
| -- output (typed equations, page_footnotes, table structure) or the degraded | |
| -- text-only fallback. | |
| extractor TEXT, | |
| -- Whether this document is a "book" (chapter-by-chapter reading navigation) | |
| -- or a "paper" (linear reading). Chosen by the user at upload time. | |
| doc_kind TEXT NOT NULL DEFAULT 'paper' | |
| ); | |
| COMMENT ON COLUMN documents.reading_order IS 'Array of original chunk sequence_ids in LLM-corrected logical reading order. Used to fix two-column and complex layout extraction issues.'; | |
| -- Persistent raw PDF storage. | |
| -- The HF Space container disk is ephemeral, so the original uploaded bytes are | |
| -- kept in Postgres (external Supabase) alongside metadata. The filesystem copy | |
| -- in assets/ is used as a fast local cache when present. | |
| CREATE TABLE IF NOT EXISTS document_raw_files ( | |
| document_id UUID PRIMARY KEY REFERENCES documents(id) ON DELETE CASCADE, | |
| content BYTEA NOT NULL, | |
| created_at TIMESTAMPTZ NOT NULL DEFAULT NOW() | |
| ); | |
| -- Chunks table with physical ordering | |
| CREATE TABLE IF NOT EXISTS chunks ( | |
| id UUID PRIMARY KEY DEFAULT uuid_generate_v4(), | |
| document_id UUID NOT NULL REFERENCES documents(id) ON DELETE CASCADE, | |
| sequence_id INTEGER NOT NULL, | |
| parent_sequence_id INTEGER, | |
| chunk_type TEXT NOT NULL DEFAULT 'text', | |
| heading_path TEXT[], | |
| markdown TEXT NOT NULL, | |
| plain_text TEXT NOT NULL, | |
| page_start INTEGER, | |
| page_end INTEGER, | |
| bbox_json JSONB, | |
| token_count INTEGER, | |
| created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(), | |
| UNIQUE(document_id, sequence_id) | |
| ); | |
| CREATE INDEX IF NOT EXISTS idx_chunks_document_sequence | |
| ON chunks(document_id, sequence_id); | |
| -- Chunk embeddings with pgvector | |
| CREATE TABLE IF NOT EXISTS chunk_embeddings ( | |
| chunk_id UUID PRIMARY KEY REFERENCES chunks(id) ON DELETE CASCADE, | |
| embedding vector(1024) NOT NULL, -- dimension is substituted from VECTOR_DIMENSION at migration time | |
| embedding_model TEXT NOT NULL, | |
| created_at TIMESTAMPTZ NOT NULL DEFAULT NOW() | |
| ); | |
| -- Chunk assets (images, figures, tables) | |
| CREATE TABLE IF NOT EXISTS chunk_assets ( | |
| id UUID PRIMARY KEY DEFAULT uuid_generate_v4(), | |
| chunk_id UUID NOT NULL REFERENCES chunks(id) ON DELETE CASCADE, | |
| asset_type TEXT NOT NULL, | |
| file_path TEXT NOT NULL, | |
| mime_type TEXT, | |
| width INTEGER, | |
| height INTEGER, | |
| caption TEXT, | |
| created_at TIMESTAMPTZ NOT NULL DEFAULT NOW() | |
| ); | |
| CREATE INDEX IF NOT EXISTS idx_chunk_assets_chunk_id ON chunk_assets(chunk_id); | |
| -- Persistent extracted image/figure storage. | |
| -- The HF Space container disk is ephemeral, so extracted figure images are kept | |
| -- in Postgres (external Supabase) keyed by their chunk_assets row. The filesystem | |
| -- copy under images/ is used as a fast local cache when present. | |
| CREATE TABLE IF NOT EXISTS chunk_asset_files ( | |
| asset_id UUID PRIMARY KEY REFERENCES chunk_assets(id) ON DELETE CASCADE, | |
| content BYTEA NOT NULL, | |
| created_at TIMESTAMPTZ NOT NULL DEFAULT NOW() | |
| ); | |
| -- Conversation turns | |
| -- Supports the nested sub-thread feature (tangents without polluting the main paper chat). | |
| -- Main linear chat turns have parent_turn_id IS NULL. | |
| -- Sub-thread turns have parent_turn_id pointing to their parent turn in the tree | |
| -- (the branching user turn for the first continuation, or the previous turn for follow-ups). | |
| CREATE TABLE IF NOT EXISTS conversation_turns ( | |
| id UUID PRIMARY KEY DEFAULT uuid_generate_v4(), | |
| conversation_id UUID NOT NULL, | |
| document_id UUID REFERENCES documents(id) ON DELETE SET NULL, | |
| role TEXT NOT NULL, | |
| content TEXT NOT NULL, | |
| context_type TEXT, | |
| router_reason TEXT, | |
| model TEXT, | |
| citations JSONB, | |
| created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(), | |
| -- NULL for all turns that belong to the main linear chat for a conversation_id. | |
| -- Non-NULL points to the parent turn this message is a reply to (supports | |
| -- arbitrary-depth nesting of tangents). The root of a sub-thread is the | |
| -- original user message that started the tangent (even though that user | |
| -- message itself has parent_turn_id = NULL so it stays visible in main chat). | |
| parent_turn_id UUID REFERENCES conversation_turns(id) ON DELETE CASCADE | |
| ); | |
| CREATE INDEX IF NOT EXISTS idx_conversation_turns_conversation | |
| ON conversation_turns(conversation_id, created_at); | |
| -- Fast lookup for "does this turn have any children in a sub-thread?" | |
| -- and for recursive subtree loading. | |
| CREATE INDEX IF NOT EXISTS idx_conversation_turns_parent | |
| ON conversation_turns(parent_turn_id); | |
| -- Ask traces for debugging | |
| CREATE TABLE IF NOT EXISTS ask_traces ( | |
| id UUID PRIMARY KEY DEFAULT uuid_generate_v4(), | |
| conversation_turn_id UUID REFERENCES conversation_turns(id) ON DELETE CASCADE, | |
| context_type TEXT NOT NULL, | |
| router_reason TEXT, | |
| retrieved_chunk_ids UUID[], | |
| model TEXT, | |
| prompt_tokens INTEGER, | |
| completion_tokens INTEGER, | |
| latency_ms INTEGER, | |
| created_at TIMESTAMPTZ NOT NULL DEFAULT NOW() | |
| ); | |
| -- Ingestion jobs | |
| CREATE TABLE IF NOT EXISTS ingestion_jobs ( | |
| id UUID PRIMARY KEY DEFAULT uuid_generate_v4(), | |
| document_id UUID NOT NULL REFERENCES documents(id) ON DELETE CASCADE, | |
| status TEXT NOT NULL DEFAULT 'queued', | |
| error_message TEXT, | |
| started_at TIMESTAMPTZ, | |
| completed_at TIMESTAMPTZ, | |
| created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(), | |
| -- Celery task id of the currently running step, so the watchdog can | |
| -- force-terminate a stuck task and re-dispatch the next attempt. | |
| celery_task_id TEXT, | |
| -- Number of times the watchdog has recovered this job. Prevents infinite | |
| -- recovery loops; after the cap the job is marked failed. | |
| recovery_attempts INTEGER NOT NULL DEFAULT 0 | |
| ); | |
| CREATE INDEX IF NOT EXISTS idx_ingestion_jobs_status ON ingestion_jobs(status); | |
| -- ============================================================================ | |
| -- Section Summaries: Pre-computed hierarchical overviews for high-quality | |
| -- "What is this paper about?" / "Summarize the paper" experiences. | |
| -- | |
| -- Design goals (personal use, quality-first): | |
| -- * Separate table (does NOT pollute the source-of-truth `chunks` table) | |
| -- * Stores rich attribution (source_chunk_ids) so answers can cite original | |
| -- sequence_ids / pages even when using the overview path. | |
| -- * Model + prompt_hash for future invalidation / regeneration when you | |
| -- change models or improve prompts. | |
| -- * Supports both per-section (H1/H2) and whole-paper executive summary. | |
| -- ============================================================================ | |
| CREATE TABLE IF NOT EXISTS section_summaries ( | |
| id UUID PRIMARY KEY DEFAULT uuid_generate_v4(), | |
| document_id UUID NOT NULL REFERENCES documents(id) ON DELETE CASCADE, | |
| -- Stable identifier for the section within the document (e.g. "h1-03-introduction") | |
| section_id TEXT NOT NULL, | |
| -- 0 = whole-paper executive summary, 1 = H1, 2 = H2 | |
| level INTEGER NOT NULL CHECK (level IN (0, 1, 2)), | |
| -- Full heading path at the time of summarization (e.g. ["Introduction", "Motivation"]) | |
| heading_path TEXT[] NOT NULL, | |
| -- Inclusive range of source sequence_ids that contributed to this summary | |
| sequence_start INTEGER, | |
| sequence_end INTEGER, | |
| summary_markdown TEXT NOT NULL, | |
| summary_plain TEXT NOT NULL, | |
| -- Strong grounding: the exact chunk IDs whose content was fed to the LLM | |
| source_chunk_ids UUID[] NOT NULL, | |
| model TEXT NOT NULL, | |
| prompt_hash TEXT NOT NULL, -- hash of the prompt template + version | |
| created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(), | |
| UNIQUE (document_id, section_id, model) | |
| ); | |
| -- ============================================================================ | |
| -- Rich Extraction Enhancements (Quality-First for Tables, Figures & Architectures) | |
| -- | |
| -- These additions support much deeper interaction with complex paper elements. | |
| -- The author accepts heavy processing at ingestion time for superior chat quality. | |
| -- ============================================================================ | |
| -- Add structured table data to existing chunks (only populated for table-type chunks) | |
| -- This allows the model to query tables intelligently ("what was the F1 score for the 7B variant in Table 4?") | |
| ALTER TABLE chunks | |
| ADD COLUMN IF NOT EXISTS table_json JSONB; | |
| -- Figure / Diagram / Architecture Descriptions | |
| -- Generated at ingestion time using VLM (gemma4:26b vision or equivalent). | |
| -- Stored separately so they can be retrieved by GLOBAL search, OVERVIEW, or targeted tools, | |
| -- while the original image remains in chunk_assets. | |
| CREATE TABLE IF NOT EXISTS figure_descriptions ( | |
| id UUID PRIMARY KEY DEFAULT uuid_generate_v4(), | |
| document_id UUID NOT NULL REFERENCES documents(id) ON DELETE CASCADE, | |
| chunk_id UUID NOT NULL REFERENCES chunks(id) ON DELETE CASCADE, | |
| -- Original image reference | |
| image_path TEXT NOT NULL, -- relative path under images/ or original MinerU name | |
| -- Rich VLM-generated description (technical, precise, good for architectures) | |
| description_markdown TEXT NOT NULL, | |
| description_plain TEXT NOT NULL, | |
| -- Attribution for grounding/citations | |
| source_sequence_start INTEGER, | |
| source_sequence_end INTEGER, | |
| referenced_by_chunk_ids UUID[], -- text chunks that mention this figure | |
| model TEXT NOT NULL, | |
| prompt_hash TEXT NOT NULL, | |
| created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(), | |
| UNIQUE (chunk_id, model) | |
| ); | |
| CREATE INDEX IF NOT EXISTS idx_figure_descriptions_document | |
| ON figure_descriptions(document_id, created_at DESC); | |
| CREATE INDEX IF NOT EXISTS idx_figure_descriptions_chunk | |
| ON figure_descriptions(chunk_id); | |
| CREATE INDEX IF NOT EXISTS idx_section_summaries_document | |
| ON section_summaries(document_id, level, sequence_start); | |
| CREATE INDEX IF NOT EXISTS idx_section_summaries_document_created | |
| ON section_summaries(document_id, created_at DESC); | |