Spaces:
Sleeping
Sleeping
File size: 10,792 Bytes
1086e43 1da98b1 1086e43 da28452 1086e43 1da98b1 1086e43 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 | -- 9XAIPal Database Schema
CREATE EXTENSION IF NOT EXISTS vector;
CREATE EXTENSION IF NOT EXISTS "uuid-ossp";
-- Documents table
CREATE TABLE IF NOT EXISTS documents (
id UUID PRIMARY KEY DEFAULT uuid_generate_v4(),
filename TEXT NOT NULL,
original_filename TEXT NOT NULL,
file_size_bytes BIGINT,
page_count INTEGER,
status TEXT NOT NULL DEFAULT 'queued',
error_message TEXT,
created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
-- LLM-corrected reading order for complex layouts (two-column papers, etc.)
-- Stores an array of original sequence_ids in the correct logical reading order.
reading_order JSONB,
reading_order_model TEXT,
reading_order_updated_at TIMESTAMPTZ,
-- Which extractor produced this document's chunks ("mineru" or "pymupdf_fallback").
-- Surfaced in the UI so users can see whether they got high-fidelity MinerU
-- output (typed equations, page_footnotes, table structure) or the degraded
-- text-only fallback.
extractor TEXT,
-- Whether this document is a "book" (chapter-by-chapter reading navigation)
-- or a "paper" (linear reading). Chosen by the user at upload time.
doc_kind TEXT NOT NULL DEFAULT 'paper'
);
COMMENT ON COLUMN documents.reading_order IS 'Array of original chunk sequence_ids in LLM-corrected logical reading order. Used to fix two-column and complex layout extraction issues.';
-- Persistent raw PDF storage.
-- The HF Space container disk is ephemeral, so the original uploaded bytes are
-- kept in Postgres (external Supabase) alongside metadata. The filesystem copy
-- in assets/ is used as a fast local cache when present.
CREATE TABLE IF NOT EXISTS document_raw_files (
document_id UUID PRIMARY KEY REFERENCES documents(id) ON DELETE CASCADE,
content BYTEA NOT NULL,
created_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
);
-- Chunks table with physical ordering
CREATE TABLE IF NOT EXISTS chunks (
id UUID PRIMARY KEY DEFAULT uuid_generate_v4(),
document_id UUID NOT NULL REFERENCES documents(id) ON DELETE CASCADE,
sequence_id INTEGER NOT NULL,
parent_sequence_id INTEGER,
chunk_type TEXT NOT NULL DEFAULT 'text',
heading_path TEXT[],
markdown TEXT NOT NULL,
plain_text TEXT NOT NULL,
page_start INTEGER,
page_end INTEGER,
bbox_json JSONB,
token_count INTEGER,
created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
UNIQUE(document_id, sequence_id)
);
CREATE INDEX IF NOT EXISTS idx_chunks_document_sequence
ON chunks(document_id, sequence_id);
-- Chunk embeddings with pgvector
CREATE TABLE IF NOT EXISTS chunk_embeddings (
chunk_id UUID PRIMARY KEY REFERENCES chunks(id) ON DELETE CASCADE,
embedding vector(1024) NOT NULL, -- dimension is substituted from VECTOR_DIMENSION at migration time
embedding_model TEXT NOT NULL,
created_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
);
-- Chunk assets (images, figures, tables)
CREATE TABLE IF NOT EXISTS chunk_assets (
id UUID PRIMARY KEY DEFAULT uuid_generate_v4(),
chunk_id UUID NOT NULL REFERENCES chunks(id) ON DELETE CASCADE,
asset_type TEXT NOT NULL,
file_path TEXT NOT NULL,
mime_type TEXT,
width INTEGER,
height INTEGER,
caption TEXT,
created_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
);
CREATE INDEX IF NOT EXISTS idx_chunk_assets_chunk_id ON chunk_assets(chunk_id);
-- Persistent extracted image/figure storage.
-- The HF Space container disk is ephemeral, so extracted figure images are kept
-- in Postgres (external Supabase) keyed by their chunk_assets row. The filesystem
-- copy under images/ is used as a fast local cache when present.
CREATE TABLE IF NOT EXISTS chunk_asset_files (
asset_id UUID PRIMARY KEY REFERENCES chunk_assets(id) ON DELETE CASCADE,
content BYTEA NOT NULL,
created_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
);
-- Conversation turns
-- Supports the nested sub-thread feature (tangents without polluting the main paper chat).
-- Main linear chat turns have parent_turn_id IS NULL.
-- Sub-thread turns have parent_turn_id pointing to their parent turn in the tree
-- (the branching user turn for the first continuation, or the previous turn for follow-ups).
CREATE TABLE IF NOT EXISTS conversation_turns (
id UUID PRIMARY KEY DEFAULT uuid_generate_v4(),
conversation_id UUID NOT NULL,
document_id UUID REFERENCES documents(id) ON DELETE SET NULL,
role TEXT NOT NULL,
content TEXT NOT NULL,
context_type TEXT,
router_reason TEXT,
model TEXT,
citations JSONB,
created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
-- NULL for all turns that belong to the main linear chat for a conversation_id.
-- Non-NULL points to the parent turn this message is a reply to (supports
-- arbitrary-depth nesting of tangents). The root of a sub-thread is the
-- original user message that started the tangent (even though that user
-- message itself has parent_turn_id = NULL so it stays visible in main chat).
parent_turn_id UUID REFERENCES conversation_turns(id) ON DELETE CASCADE
);
CREATE INDEX IF NOT EXISTS idx_conversation_turns_conversation
ON conversation_turns(conversation_id, created_at);
-- Fast lookup for "does this turn have any children in a sub-thread?"
-- and for recursive subtree loading.
CREATE INDEX IF NOT EXISTS idx_conversation_turns_parent
ON conversation_turns(parent_turn_id);
-- Ask traces for debugging
CREATE TABLE IF NOT EXISTS ask_traces (
id UUID PRIMARY KEY DEFAULT uuid_generate_v4(),
conversation_turn_id UUID REFERENCES conversation_turns(id) ON DELETE CASCADE,
context_type TEXT NOT NULL,
router_reason TEXT,
retrieved_chunk_ids UUID[],
model TEXT,
prompt_tokens INTEGER,
completion_tokens INTEGER,
latency_ms INTEGER,
created_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
);
-- Ingestion jobs
CREATE TABLE IF NOT EXISTS ingestion_jobs (
id UUID PRIMARY KEY DEFAULT uuid_generate_v4(),
document_id UUID NOT NULL REFERENCES documents(id) ON DELETE CASCADE,
status TEXT NOT NULL DEFAULT 'queued',
error_message TEXT,
started_at TIMESTAMPTZ,
completed_at TIMESTAMPTZ,
created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
-- Celery task id of the currently running step, so the watchdog can
-- force-terminate a stuck task and re-dispatch the next attempt.
celery_task_id TEXT,
-- Number of times the watchdog has recovered this job. Prevents infinite
-- recovery loops; after the cap the job is marked failed.
recovery_attempts INTEGER NOT NULL DEFAULT 0
);
CREATE INDEX IF NOT EXISTS idx_ingestion_jobs_status ON ingestion_jobs(status);
-- ============================================================================
-- Section Summaries: Pre-computed hierarchical overviews for high-quality
-- "What is this paper about?" / "Summarize the paper" experiences.
--
-- Design goals (personal use, quality-first):
-- * Separate table (does NOT pollute the source-of-truth `chunks` table)
-- * Stores rich attribution (source_chunk_ids) so answers can cite original
-- sequence_ids / pages even when using the overview path.
-- * Model + prompt_hash for future invalidation / regeneration when you
-- change models or improve prompts.
-- * Supports both per-section (H1/H2) and whole-paper executive summary.
-- ============================================================================
CREATE TABLE IF NOT EXISTS section_summaries (
id UUID PRIMARY KEY DEFAULT uuid_generate_v4(),
document_id UUID NOT NULL REFERENCES documents(id) ON DELETE CASCADE,
-- Stable identifier for the section within the document (e.g. "h1-03-introduction")
section_id TEXT NOT NULL,
-- 0 = whole-paper executive summary, 1 = H1, 2 = H2
level INTEGER NOT NULL CHECK (level IN (0, 1, 2)),
-- Full heading path at the time of summarization (e.g. ["Introduction", "Motivation"])
heading_path TEXT[] NOT NULL,
-- Inclusive range of source sequence_ids that contributed to this summary
sequence_start INTEGER,
sequence_end INTEGER,
summary_markdown TEXT NOT NULL,
summary_plain TEXT NOT NULL,
-- Strong grounding: the exact chunk IDs whose content was fed to the LLM
source_chunk_ids UUID[] NOT NULL,
model TEXT NOT NULL,
prompt_hash TEXT NOT NULL, -- hash of the prompt template + version
created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
UNIQUE (document_id, section_id, model)
);
-- ============================================================================
-- Rich Extraction Enhancements (Quality-First for Tables, Figures & Architectures)
--
-- These additions support much deeper interaction with complex paper elements.
-- The author accepts heavy processing at ingestion time for superior chat quality.
-- ============================================================================
-- Add structured table data to existing chunks (only populated for table-type chunks)
-- This allows the model to query tables intelligently ("what was the F1 score for the 7B variant in Table 4?")
ALTER TABLE chunks
ADD COLUMN IF NOT EXISTS table_json JSONB;
-- Figure / Diagram / Architecture Descriptions
-- Generated at ingestion time using VLM (gemma4:26b vision or equivalent).
-- Stored separately so they can be retrieved by GLOBAL search, OVERVIEW, or targeted tools,
-- while the original image remains in chunk_assets.
CREATE TABLE IF NOT EXISTS figure_descriptions (
id UUID PRIMARY KEY DEFAULT uuid_generate_v4(),
document_id UUID NOT NULL REFERENCES documents(id) ON DELETE CASCADE,
chunk_id UUID NOT NULL REFERENCES chunks(id) ON DELETE CASCADE,
-- Original image reference
image_path TEXT NOT NULL, -- relative path under images/ or original MinerU name
-- Rich VLM-generated description (technical, precise, good for architectures)
description_markdown TEXT NOT NULL,
description_plain TEXT NOT NULL,
-- Attribution for grounding/citations
source_sequence_start INTEGER,
source_sequence_end INTEGER,
referenced_by_chunk_ids UUID[], -- text chunks that mention this figure
model TEXT NOT NULL,
prompt_hash TEXT NOT NULL,
created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
UNIQUE (chunk_id, model)
);
CREATE INDEX IF NOT EXISTS idx_figure_descriptions_document
ON figure_descriptions(document_id, created_at DESC);
CREATE INDEX IF NOT EXISTS idx_figure_descriptions_chunk
ON figure_descriptions(chunk_id);
CREATE INDEX IF NOT EXISTS idx_section_summaries_document
ON section_summaries(document_id, level, sequence_start);
CREATE INDEX IF NOT EXISTS idx_section_summaries_document_created
ON section_summaries(document_id, created_at DESC);
|