File size: 7,031 Bytes
a99d4dc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
-- Telegram Chat Indexing Schema (Optimized)
-- SQLite with FTS5 for full-text search + performance optimizations

-- ============================================
-- PRAGMA OPTIMIZATIONS
-- ============================================
PRAGMA journal_mode = WAL;           -- Write-Ahead Logging for better concurrency
PRAGMA synchronous = NORMAL;         -- Balance between safety and speed
PRAGMA cache_size = -64000;          -- 64MB cache
PRAGMA temp_store = MEMORY;          -- Store temp tables in memory
PRAGMA mmap_size = 268435456;        -- 256MB memory-mapped I/O

-- ============================================
-- MAIN TABLES
-- ============================================

-- Main messages table
CREATE TABLE IF NOT EXISTS messages (
    id INTEGER PRIMARY KEY,
    type TEXT DEFAULT 'message',
    date TEXT,
    date_unixtime INTEGER NOT NULL,
    from_name TEXT,
    from_id TEXT NOT NULL,
    reply_to_message_id INTEGER,
    forwarded_from TEXT,
    forwarded_from_id TEXT,
    text_plain TEXT,
    text_length INTEGER DEFAULT 0,
    has_media INTEGER DEFAULT 0,
    has_photo INTEGER DEFAULT 0,
    has_links INTEGER DEFAULT 0,
    has_mentions INTEGER DEFAULT 0,
    is_edited INTEGER DEFAULT 0,
    edited_unixtime INTEGER,
    photo_file_size INTEGER,
    photo_width INTEGER,
    photo_height INTEGER,
    raw_json TEXT
);

-- Users table (extracted from messages)
CREATE TABLE IF NOT EXISTS users (
    user_id TEXT PRIMARY KEY,
    display_name TEXT,
    first_seen INTEGER,
    last_seen INTEGER,
    message_count INTEGER DEFAULT 0
);

-- Entities table (links, mentions, etc.)
CREATE TABLE IF NOT EXISTS entities (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    message_id INTEGER NOT NULL,
    type TEXT NOT NULL,
    value TEXT NOT NULL,
    FOREIGN KEY (message_id) REFERENCES messages(id) ON DELETE CASCADE
);

-- ============================================
-- GRAPH STRUCTURE FOR REPLY THREADS
-- ============================================

-- Pre-computed reply graph edges for fast traversal
CREATE TABLE IF NOT EXISTS reply_graph (
    parent_id INTEGER NOT NULL,
    child_id INTEGER NOT NULL,
    depth INTEGER DEFAULT 1,
    PRIMARY KEY (parent_id, child_id)
);

-- Conversation threads (connected components)
CREATE TABLE IF NOT EXISTS threads (
    thread_id INTEGER PRIMARY KEY AUTOINCREMENT,
    root_message_id INTEGER UNIQUE,
    message_count INTEGER DEFAULT 0,
    first_message_time INTEGER,
    last_message_time INTEGER,
    participant_count INTEGER DEFAULT 0
);

-- Message to thread mapping
CREATE TABLE IF NOT EXISTS message_threads (
    message_id INTEGER PRIMARY KEY,
    thread_id INTEGER NOT NULL,
    depth INTEGER DEFAULT 0,
    FOREIGN KEY (thread_id) REFERENCES threads(thread_id)
);

-- ============================================
-- TRIGRAM INDEX FOR FUZZY SEARCH
-- ============================================

-- Trigrams for fuzzy/approximate string matching
CREATE TABLE IF NOT EXISTS trigrams (
    trigram TEXT NOT NULL,
    message_id INTEGER NOT NULL,
    position INTEGER NOT NULL,
    PRIMARY KEY (trigram, message_id, position)
);

-- ============================================
-- FTS5 FULL-TEXT SEARCH (OPTIMIZED)
-- ============================================

-- Full-text search with prefix index for autocomplete
CREATE VIRTUAL TABLE IF NOT EXISTS messages_fts USING fts5(
    text_plain,
    from_name,
    content='messages',
    content_rowid='id',
    tokenize='unicode61 remove_diacritics 2',
    prefix='2 3 4'  -- Enable prefix queries for autocomplete
);

-- Triggers to keep FTS in sync
CREATE TRIGGER IF NOT EXISTS messages_ai AFTER INSERT ON messages BEGIN
    INSERT INTO messages_fts(rowid, text_plain, from_name)
    VALUES (new.id, new.text_plain, new.from_name);
END;

CREATE TRIGGER IF NOT EXISTS messages_ad AFTER DELETE ON messages BEGIN
    INSERT INTO messages_fts(messages_fts, rowid, text_plain, from_name)
    VALUES ('delete', old.id, old.text_plain, old.from_name);
END;

CREATE TRIGGER IF NOT EXISTS messages_au AFTER UPDATE ON messages BEGIN
    INSERT INTO messages_fts(messages_fts, rowid, text_plain, from_name)
    VALUES ('delete', old.id, old.text_plain, old.from_name);
    INSERT INTO messages_fts(rowid, text_plain, from_name)
    VALUES (new.id, new.text_plain, new.from_name);
END;

-- ============================================
-- OPTIMIZED INDEXES
-- ============================================

-- Composite indexes for common query patterns
CREATE INDEX IF NOT EXISTS idx_messages_date ON messages(date_unixtime);
CREATE INDEX IF NOT EXISTS idx_messages_from ON messages(from_id);
CREATE INDEX IF NOT EXISTS idx_messages_from_date ON messages(from_id, date_unixtime);
CREATE INDEX IF NOT EXISTS idx_messages_reply ON messages(reply_to_message_id) WHERE reply_to_message_id IS NOT NULL;
CREATE INDEX IF NOT EXISTS idx_messages_forwarded ON messages(forwarded_from_id) WHERE forwarded_from_id IS NOT NULL;
CREATE INDEX IF NOT EXISTS idx_messages_has_links ON messages(has_links) WHERE has_links = 1;
CREATE INDEX IF NOT EXISTS idx_messages_has_media ON messages(has_media) WHERE has_media = 1;

-- Entity indexes
CREATE INDEX IF NOT EXISTS idx_entities_message ON entities(message_id);
CREATE INDEX IF NOT EXISTS idx_entities_type_value ON entities(type, value);
CREATE INDEX IF NOT EXISTS idx_entities_value ON entities(value);

-- Graph indexes
CREATE INDEX IF NOT EXISTS idx_reply_graph_child ON reply_graph(child_id);
CREATE INDEX IF NOT EXISTS idx_message_threads_thread ON message_threads(thread_id);

-- Trigram index
CREATE INDEX IF NOT EXISTS idx_trigrams_trigram ON trigrams(trigram);

-- ============================================
-- PARTICIPANTS TABLE (from Telethon API)
-- ============================================

CREATE TABLE IF NOT EXISTS participants (
    user_id TEXT PRIMARY KEY,
    first_name TEXT,
    last_name TEXT,
    username TEXT,
    phone TEXT,
    is_bot INTEGER DEFAULT 0,
    is_admin INTEGER DEFAULT 0,
    is_creator INTEGER DEFAULT 0,
    is_premium INTEGER DEFAULT 0,
    join_date INTEGER,
    last_status TEXT DEFAULT 'unknown',
    last_online INTEGER,
    about TEXT,
    updated_at INTEGER
);

-- ============================================
-- STATISTICS TABLE FOR FAST AGGREGATIONS
-- ============================================

CREATE TABLE IF NOT EXISTS stats_cache (
    key TEXT PRIMARY KEY,
    value TEXT,
    updated_at INTEGER
);

-- ============================================
-- VECTOR EMBEDDINGS TABLE (OPTIONAL)
-- ============================================

-- For semantic search with FAISS
CREATE TABLE IF NOT EXISTS embeddings (
    message_id INTEGER PRIMARY KEY,
    embedding BLOB,  -- Serialized numpy array
    model_name TEXT DEFAULT 'default',
    FOREIGN KEY (message_id) REFERENCES messages(id) ON DELETE CASCADE
);