File size: 10,792 Bytes
1086e43
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1da98b1
 
 
 
 
 
 
 
 
 
1086e43
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
da28452
 
 
 
 
 
 
 
 
 
1086e43
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1da98b1
 
 
 
 
 
 
 
 
1086e43
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
-- 9XAIPal Database Schema

CREATE EXTENSION IF NOT EXISTS vector;
CREATE EXTENSION IF NOT EXISTS "uuid-ossp";

-- Documents table
CREATE TABLE IF NOT EXISTS documents (
    id UUID PRIMARY KEY DEFAULT uuid_generate_v4(),
    filename TEXT NOT NULL,
    original_filename TEXT NOT NULL,
    file_size_bytes BIGINT,
    page_count INTEGER,
    status TEXT NOT NULL DEFAULT 'queued',
    error_message TEXT,
    created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
    updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),

    -- LLM-corrected reading order for complex layouts (two-column papers, etc.)
    -- Stores an array of original sequence_ids in the correct logical reading order.
    reading_order JSONB,
    reading_order_model TEXT,
    reading_order_updated_at TIMESTAMPTZ,

    -- Which extractor produced this document's chunks ("mineru" or "pymupdf_fallback").
    -- Surfaced in the UI so users can see whether they got high-fidelity MinerU
    -- output (typed equations, page_footnotes, table structure) or the degraded
    -- text-only fallback.
    extractor TEXT,

    -- Whether this document is a "book" (chapter-by-chapter reading navigation)
    -- or a "paper" (linear reading). Chosen by the user at upload time.
    doc_kind TEXT NOT NULL DEFAULT 'paper'
);

COMMENT ON COLUMN documents.reading_order IS 'Array of original chunk sequence_ids in LLM-corrected logical reading order. Used to fix two-column and complex layout extraction issues.';

-- Persistent raw PDF storage.
-- The HF Space container disk is ephemeral, so the original uploaded bytes are
-- kept in Postgres (external Supabase) alongside metadata. The filesystem copy
-- in assets/ is used as a fast local cache when present.
CREATE TABLE IF NOT EXISTS document_raw_files (
    document_id UUID PRIMARY KEY REFERENCES documents(id) ON DELETE CASCADE,
    content BYTEA NOT NULL,
    created_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
);

-- Chunks table with physical ordering
CREATE TABLE IF NOT EXISTS chunks (
    id UUID PRIMARY KEY DEFAULT uuid_generate_v4(),
    document_id UUID NOT NULL REFERENCES documents(id) ON DELETE CASCADE,
    sequence_id INTEGER NOT NULL,
    parent_sequence_id INTEGER,
    chunk_type TEXT NOT NULL DEFAULT 'text',
    heading_path TEXT[],
    markdown TEXT NOT NULL,
    plain_text TEXT NOT NULL,
    page_start INTEGER,
    page_end INTEGER,
    bbox_json JSONB,
    token_count INTEGER,
    created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
    UNIQUE(document_id, sequence_id)
);

CREATE INDEX IF NOT EXISTS idx_chunks_document_sequence
    ON chunks(document_id, sequence_id);

-- Chunk embeddings with pgvector
CREATE TABLE IF NOT EXISTS chunk_embeddings (
    chunk_id UUID PRIMARY KEY REFERENCES chunks(id) ON DELETE CASCADE,
    embedding vector(1024) NOT NULL,  -- dimension is substituted from VECTOR_DIMENSION at migration time
    embedding_model TEXT NOT NULL,
    created_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
);

-- Chunk assets (images, figures, tables)
CREATE TABLE IF NOT EXISTS chunk_assets (
    id UUID PRIMARY KEY DEFAULT uuid_generate_v4(),
    chunk_id UUID NOT NULL REFERENCES chunks(id) ON DELETE CASCADE,
    asset_type TEXT NOT NULL,
    file_path TEXT NOT NULL,
    mime_type TEXT,
    width INTEGER,
    height INTEGER,
    caption TEXT,
    created_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
);

CREATE INDEX IF NOT EXISTS idx_chunk_assets_chunk_id ON chunk_assets(chunk_id);

-- Persistent extracted image/figure storage.
-- The HF Space container disk is ephemeral, so extracted figure images are kept
-- in Postgres (external Supabase) keyed by their chunk_assets row. The filesystem
-- copy under images/ is used as a fast local cache when present.
CREATE TABLE IF NOT EXISTS chunk_asset_files (
    asset_id UUID PRIMARY KEY REFERENCES chunk_assets(id) ON DELETE CASCADE,
    content BYTEA NOT NULL,
    created_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
);

-- Conversation turns
-- Supports the nested sub-thread feature (tangents without polluting the main paper chat).
-- Main linear chat turns have parent_turn_id IS NULL.
-- Sub-thread turns have parent_turn_id pointing to their parent turn in the tree
-- (the branching user turn for the first continuation, or the previous turn for follow-ups).
CREATE TABLE IF NOT EXISTS conversation_turns (
    id UUID PRIMARY KEY DEFAULT uuid_generate_v4(),
    conversation_id UUID NOT NULL,
    document_id UUID REFERENCES documents(id) ON DELETE SET NULL,
    role TEXT NOT NULL,
    content TEXT NOT NULL,
    context_type TEXT,
    router_reason TEXT,
    model TEXT,
    citations JSONB,
    created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),

    -- NULL for all turns that belong to the main linear chat for a conversation_id.
    -- Non-NULL points to the parent turn this message is a reply to (supports
    -- arbitrary-depth nesting of tangents). The root of a sub-thread is the
    -- original user message that started the tangent (even though that user
    -- message itself has parent_turn_id = NULL so it stays visible in main chat).
    parent_turn_id UUID REFERENCES conversation_turns(id) ON DELETE CASCADE
);

CREATE INDEX IF NOT EXISTS idx_conversation_turns_conversation
    ON conversation_turns(conversation_id, created_at);

-- Fast lookup for "does this turn have any children in a sub-thread?"
-- and for recursive subtree loading.
CREATE INDEX IF NOT EXISTS idx_conversation_turns_parent
    ON conversation_turns(parent_turn_id);

-- Ask traces for debugging
CREATE TABLE IF NOT EXISTS ask_traces (
    id UUID PRIMARY KEY DEFAULT uuid_generate_v4(),
    conversation_turn_id UUID REFERENCES conversation_turns(id) ON DELETE CASCADE,
    context_type TEXT NOT NULL,
    router_reason TEXT,
    retrieved_chunk_ids UUID[],
    model TEXT,
    prompt_tokens INTEGER,
    completion_tokens INTEGER,
    latency_ms INTEGER,
    created_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
);

-- Ingestion jobs
CREATE TABLE IF NOT EXISTS ingestion_jobs (
    id UUID PRIMARY KEY DEFAULT uuid_generate_v4(),
    document_id UUID NOT NULL REFERENCES documents(id) ON DELETE CASCADE,
    status TEXT NOT NULL DEFAULT 'queued',
    error_message TEXT,
    started_at TIMESTAMPTZ,
    completed_at TIMESTAMPTZ,
    created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),

    -- Celery task id of the currently running step, so the watchdog can
    -- force-terminate a stuck task and re-dispatch the next attempt.
    celery_task_id TEXT,

    -- Number of times the watchdog has recovered this job. Prevents infinite
    -- recovery loops; after the cap the job is marked failed.
    recovery_attempts INTEGER NOT NULL DEFAULT 0
);

CREATE INDEX IF NOT EXISTS idx_ingestion_jobs_status ON ingestion_jobs(status);

-- ============================================================================
-- Section Summaries: Pre-computed hierarchical overviews for high-quality
-- "What is this paper about?" / "Summarize the paper" experiences.
--
-- Design goals (personal use, quality-first):
--   * Separate table (does NOT pollute the source-of-truth `chunks` table)
--   * Stores rich attribution (source_chunk_ids) so answers can cite original
--     sequence_ids / pages even when using the overview path.
--   * Model + prompt_hash for future invalidation / regeneration when you
--     change models or improve prompts.
--   * Supports both per-section (H1/H2) and whole-paper executive summary.
-- ============================================================================

CREATE TABLE IF NOT EXISTS section_summaries (
    id UUID PRIMARY KEY DEFAULT uuid_generate_v4(),
    document_id UUID NOT NULL REFERENCES documents(id) ON DELETE CASCADE,

    -- Stable identifier for the section within the document (e.g. "h1-03-introduction")
    section_id TEXT NOT NULL,

    -- 0 = whole-paper executive summary, 1 = H1, 2 = H2
    level INTEGER NOT NULL CHECK (level IN (0, 1, 2)),

    -- Full heading path at the time of summarization (e.g. ["Introduction", "Motivation"])
    heading_path TEXT[] NOT NULL,

    -- Inclusive range of source sequence_ids that contributed to this summary
    sequence_start INTEGER,
    sequence_end INTEGER,

    summary_markdown TEXT NOT NULL,
    summary_plain TEXT NOT NULL,

    -- Strong grounding: the exact chunk IDs whose content was fed to the LLM
    source_chunk_ids UUID[] NOT NULL,

    model TEXT NOT NULL,
    prompt_hash TEXT NOT NULL,           -- hash of the prompt template + version

    created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),

    UNIQUE (document_id, section_id, model)
);

-- ============================================================================
-- Rich Extraction Enhancements (Quality-First for Tables, Figures & Architectures)
--
-- These additions support much deeper interaction with complex paper elements.
-- The author accepts heavy processing at ingestion time for superior chat quality.
-- ============================================================================

-- Add structured table data to existing chunks (only populated for table-type chunks)
-- This allows the model to query tables intelligently ("what was the F1 score for the 7B variant in Table 4?")
ALTER TABLE chunks
    ADD COLUMN IF NOT EXISTS table_json JSONB;

-- Figure / Diagram / Architecture Descriptions
-- Generated at ingestion time using VLM (gemma4:26b vision or equivalent).
-- Stored separately so they can be retrieved by GLOBAL search, OVERVIEW, or targeted tools,
-- while the original image remains in chunk_assets.
CREATE TABLE IF NOT EXISTS figure_descriptions (
    id UUID PRIMARY KEY DEFAULT uuid_generate_v4(),
    document_id UUID NOT NULL REFERENCES documents(id) ON DELETE CASCADE,
    chunk_id UUID NOT NULL REFERENCES chunks(id) ON DELETE CASCADE,

    -- Original image reference
    image_path TEXT NOT NULL,           -- relative path under images/ or original MinerU name

    -- Rich VLM-generated description (technical, precise, good for architectures)
    description_markdown TEXT NOT NULL,
    description_plain TEXT NOT NULL,

    -- Attribution for grounding/citations
    source_sequence_start INTEGER,
    source_sequence_end INTEGER,
    referenced_by_chunk_ids UUID[],     -- text chunks that mention this figure

    model TEXT NOT NULL,
    prompt_hash TEXT NOT NULL,

    created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),

    UNIQUE (chunk_id, model)
);

CREATE INDEX IF NOT EXISTS idx_figure_descriptions_document
    ON figure_descriptions(document_id, created_at DESC);

CREATE INDEX IF NOT EXISTS idx_figure_descriptions_chunk
    ON figure_descriptions(chunk_id);
CREATE INDEX IF NOT EXISTS idx_section_summaries_document
    ON section_summaries(document_id, level, sequence_start);

CREATE INDEX IF NOT EXISTS idx_section_summaries_document_created
    ON section_summaries(document_id, created_at DESC);