Bio-AI / database /schema.py
binduchinnam26
Add BioLitAI-X project
05dc1d3
"""
SQLite schema definitions for BioLitAI-X.
All tables are created via plain DDL executed against the SQLite connection.
"""
SCHEMA_SQL = """
PRAGMA journal_mode=WAL;
PRAGMA foreign_keys=ON;
-- ─────────────────────────────────────────────────────────────────────────────
-- Core paper table
-- ─────────────────────────────────────────────────────────────────────────────
CREATE TABLE IF NOT EXISTS papers (
pmid TEXT PRIMARY KEY,
title TEXT,
abstract TEXT,
authors TEXT, -- JSON-serialised list of author dicts
keywords TEXT, -- JSON-serialised list (author keywords only)
mesh_terms TEXT, -- JSON-serialised list of MeSH descriptor dicts
mesh_qualifiers TEXT, -- JSON-serialised list of qualifier strings
chemical_terms TEXT, -- JSON-serialised list of chemical dicts
publication_types TEXT, -- JSON-serialised list of strings
grant_info TEXT, -- JSON-serialised list of grant dicts
journal TEXT,
volume TEXT,
issue TEXT,
pages TEXT,
pub_date TEXT,
doi TEXT,
language TEXT,
query_used TEXT,
fetched_at TEXT
);
-- ─────────────────────────────────────────────────────────────────────────────
-- Normalised author table
-- ─────────────────────────────────────────────────────────────────────────────
CREATE TABLE IF NOT EXISTS authors (
id INTEGER PRIMARY KEY AUTOINCREMENT,
name TEXT NOT NULL,
normalized_name TEXT,
affiliation TEXT
);
CREATE TABLE IF NOT EXISTS paper_authors (
paper_pmid TEXT NOT NULL REFERENCES papers(pmid) ON DELETE CASCADE,
author_id INTEGER NOT NULL REFERENCES authors(id) ON DELETE CASCADE,
author_position INTEGER,
PRIMARY KEY (paper_pmid, author_id)
);
-- ─────────────────────────────────────────────────────────────────────────────
-- Keywords (author-supplied keywords only)
-- ─────────────────────────────────────────────────────────────────────────────
CREATE TABLE IF NOT EXISTS keywords (
id INTEGER PRIMARY KEY AUTOINCREMENT,
keyword TEXT NOT NULL,
normalized_keyword TEXT,
keyword_type TEXT DEFAULT 'author_keyword'
);
CREATE TABLE IF NOT EXISTS paper_keywords (
paper_pmid TEXT NOT NULL REFERENCES papers(pmid) ON DELETE CASCADE,
keyword_id INTEGER NOT NULL REFERENCES keywords(id) ON DELETE CASCADE,
PRIMARY KEY (paper_pmid, keyword_id)
);
-- ─────────────────────────────────────────────────────────────────────────────
-- MeSH terms (descriptor + qualifier stored separately)
-- ─────────────────────────────────────────────────────────────────────────────
CREATE TABLE IF NOT EXISTS mesh_terms (
id INTEGER PRIMARY KEY AUTOINCREMENT,
descriptor TEXT NOT NULL,
qualifier TEXT,
is_major_topic INTEGER DEFAULT 0 -- 1 = major topic, 0 = minor
);
CREATE TABLE IF NOT EXISTS paper_mesh (
paper_pmid TEXT NOT NULL REFERENCES papers(pmid) ON DELETE CASCADE,
mesh_term_id INTEGER NOT NULL REFERENCES mesh_terms(id) ON DELETE CASCADE,
PRIMARY KEY (paper_pmid, mesh_term_id)
);
-- ─────────────────────────────────────────────────────────────────────────────
-- Chemical / substance terms
-- ─────────────────────────────────────────────────────────────────────────────
CREATE TABLE IF NOT EXISTS chemical_terms (
id INTEGER PRIMARY KEY AUTOINCREMENT,
name TEXT NOT NULL,
registry_number TEXT
);
CREATE TABLE IF NOT EXISTS paper_chemicals (
paper_pmid TEXT NOT NULL REFERENCES papers(pmid) ON DELETE CASCADE,
chemical_term_id INTEGER NOT NULL REFERENCES chemical_terms(id) ON DELETE CASCADE,
PRIMARY KEY (paper_pmid, chemical_term_id)
);
-- ─────────────────────────────────────────────────────────────────────────────
-- Publication types
-- ─────────────────────────────────────────────────────────────────────────────
CREATE TABLE IF NOT EXISTS publication_types (
id INTEGER PRIMARY KEY AUTOINCREMENT,
publication_type TEXT NOT NULL UNIQUE
);
CREATE TABLE IF NOT EXISTS paper_publication_types (
paper_pmid TEXT NOT NULL REFERENCES papers(pmid) ON DELETE CASCADE,
publication_type_id INTEGER NOT NULL REFERENCES publication_types(id) ON DELETE CASCADE,
PRIMARY KEY (paper_pmid, publication_type_id)
);
-- ─────────────────────────────────────────────────────────────────────────────
-- BERTopic topics
-- ─────────────────────────────────────────────────────────────────────────────
CREATE TABLE IF NOT EXISTS topics (
id INTEGER PRIMARY KEY AUTOINCREMENT,
topic_label TEXT,
top_words TEXT, -- JSON-serialised list of (word, score) tuples
paper_count INTEGER DEFAULT 0,
query_used TEXT
);
CREATE TABLE IF NOT EXISTS paper_topics (
paper_pmid TEXT NOT NULL REFERENCES papers(pmid) ON DELETE CASCADE,
topic_id INTEGER NOT NULL REFERENCES topics(id) ON DELETE CASCADE,
probability REAL DEFAULT 0.0,
PRIMARY KEY (paper_pmid, topic_id)
);
-- ─────────────────────────────────────────────────────────────────────────────
-- NER entities
-- ─────────────────────────────────────────────────────────────────────────────
CREATE TABLE IF NOT EXISTS entities (
id INTEGER PRIMARY KEY AUTOINCREMENT,
name TEXT NOT NULL,
entity_type TEXT,
umls_id TEXT
);
CREATE TABLE IF NOT EXISTS paper_entities (
paper_pmid TEXT NOT NULL REFERENCES papers(pmid) ON DELETE CASCADE,
entity_id INTEGER NOT NULL REFERENCES entities(id) ON DELETE CASCADE,
sentence_context TEXT,
PRIMARY KEY (paper_pmid, entity_id)
);
-- ─────────────────────────────────────────────────────────────────────────────
-- Entity relationships
-- ─────────────────────────────────────────────────────────────────────────────
CREATE TABLE IF NOT EXISTS relationships (
id INTEGER PRIMARY KEY AUTOINCREMENT,
source_entity_id INTEGER NOT NULL REFERENCES entities(id) ON DELETE CASCADE,
target_entity_id INTEGER NOT NULL REFERENCES entities(id) ON DELETE CASCADE,
relationship_type TEXT,
evidence_pmid TEXT,
confidence_score REAL DEFAULT 0.0
);
-- ─────────────────────────────────────────────────────────────────────────────
-- Hypotheses
-- ─────────────────────────────────────────────────────────────────────────────
CREATE TABLE IF NOT EXISTS hypotheses (
id INTEGER PRIMARY KEY AUTOINCREMENT,
concept_a TEXT,
concept_b TEXT,
hypothesis_text TEXT,
evidence_pmids TEXT, -- JSON-serialised list
confidence_score REAL DEFAULT 0.0,
created_at TEXT,
query_used TEXT,
raw_response TEXT
);
-- ─────────────────────────────────────────────────────────────────────────────
-- Embedding metadata
-- ─────────────────────────────────────────────────────────────────────────────
CREATE TABLE IF NOT EXISTS embeddings_meta (
pmid TEXT PRIMARY KEY REFERENCES papers(pmid) ON DELETE CASCADE,
embedding_path TEXT,
model_used TEXT,
created_at TEXT
);
-- ─────────────────────────────────────────────────────────────────────────────
-- Query sessions
-- ─────────────────────────────────────────────────────────────────────────────
CREATE TABLE IF NOT EXISTS query_sessions (
id INTEGER PRIMARY KEY AUTOINCREMENT,
query_text TEXT NOT NULL,
max_results INTEGER,
papers_fetched INTEGER DEFAULT 0,
pipeline_status TEXT DEFAULT 'pending',
created_at TEXT
);
-- ─────────────────────────────────────────────────────────────────────────────
-- Indexes for common look-ups
-- ─────────────────────────────────────────────────────────────────────────────
CREATE INDEX IF NOT EXISTS idx_papers_query ON papers(query_used);
CREATE INDEX IF NOT EXISTS idx_papers_pubdate ON papers(pub_date);
CREATE INDEX IF NOT EXISTS idx_authors_norm ON authors(normalized_name);
CREATE INDEX IF NOT EXISTS idx_entities_type ON entities(entity_type);
CREATE INDEX IF NOT EXISTS idx_entities_name ON entities(name);
CREATE INDEX IF NOT EXISTS idx_rels_source ON relationships(source_entity_id);
CREATE INDEX IF NOT EXISTS idx_rels_target ON relationships(target_entity_id);
CREATE INDEX IF NOT EXISTS idx_hyp_query ON hypotheses(query_used);
CREATE INDEX IF NOT EXISTS idx_sessions_query ON query_sessions(query_text);
"""