rottg commited on
Commit
4a21e7e
·
1 Parent(s): a99d4dc

Upload folder using huggingface_hub

Browse files
analyzer.py ADDED
@@ -0,0 +1,881 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Telegram Chat Analytics (Enhanced with Course Algorithms)
4
+
5
+ Features:
6
+ - LCS-based similar message detection
7
+ - Heap-based Top-K (O(n log k) instead of O(n log n))
8
+ - Selection algorithm for O(n) median/percentiles
9
+ - Rank Tree for order statistics queries
10
+ - Bucket Sort for time-based histograms
11
+
12
+ Usage:
13
+ python analyzer.py --db telegram.db [options]
14
+ python analyzer.py --stats
15
+ python analyzer.py --top-users
16
+ python analyzer.py --similar # NEW: Find similar messages
17
+ python analyzer.py --percentiles # NEW: Message length percentiles
18
+ python analyzer.py --user-rank USER # NEW: Get user's rank
19
+ """
20
+
21
+ import sqlite3
22
+ import argparse
23
+ import json
24
+ from collections import Counter
25
+ from datetime import datetime
26
+ from typing import Optional
27
+ import re
28
+
29
+ # Import course algorithms
30
+ from algorithms import (
31
+ # LCS
32
+ lcs_similarity, find_similar_messages,
33
+ # Top-K
34
+ TopK, top_k_frequent, top_k_by_field,
35
+ # Selection
36
+ find_median, find_percentile,
37
+ # Rank Tree
38
+ RankTree,
39
+ # Bucket Sort
40
+ bucket_sort_by_time, time_histogram, hourly_distribution,
41
+ # Combined
42
+ RankedTimeIndex
43
+ )
44
+
45
+
46
class TelegramAnalyzer:
    """
    Analytics interface for indexed Telegram messages.

    Enhanced with efficient algorithms:
    - Top-K queries: O(n log k) using heap
    - Percentiles: O(n) using selection algorithm
    - Rank queries: O(log n) using rank tree
    - Similar messages: LCS-based detection

    Usable as a context manager; the SQLite connection is closed on exit.
    """

    def __init__(self, db_path: str = 'telegram.db'):
        self.db_path = db_path
        self.conn = sqlite3.connect(db_path)
        # Named-column access on result rows (row['from_id'] etc.).
        self.conn.row_factory = sqlite3.Row

        # Lazy-loaded data structures (built on first use).
        self._rank_tree: Optional[RankTree] = None
        self._time_index: Optional[RankedTimeIndex] = None

    def close(self):
        """Close the underlying SQLite connection."""
        self.conn.close()

    def __enter__(self):
        return self

    def __exit__(self, *args):
        self.close()

    # ==========================================
    # ORIGINAL METHODS (kept for compatibility)
    # ==========================================

    def get_stats(self) -> dict:
        """Get general statistics about the indexed data."""
        stats = {}

        cursor = self.conn.execute('SELECT COUNT(*) FROM messages')
        stats['total_messages'] = cursor.fetchone()[0]

        cursor = self.conn.execute('SELECT COUNT(DISTINCT from_id) FROM messages')
        stats['total_users'] = cursor.fetchone()[0]

        cursor = self.conn.execute('''
            SELECT MIN(date_unixtime), MAX(date_unixtime) FROM messages
            WHERE date_unixtime IS NOT NULL
        ''')
        row = cursor.fetchone()
        if row[0] and row[1]:
            stats['first_message'] = datetime.fromtimestamp(row[0]).isoformat()
            stats['last_message'] = datetime.fromtimestamp(row[1]).isoformat()
            stats['days_span'] = (row[1] - row[0]) // 86400

        cursor = self.conn.execute('SELECT COUNT(*) FROM messages WHERE has_media = 1')
        stats['messages_with_media'] = cursor.fetchone()[0]

        cursor = self.conn.execute('SELECT COUNT(*) FROM messages WHERE has_links = 1')
        stats['messages_with_links'] = cursor.fetchone()[0]

        cursor = self.conn.execute('SELECT COUNT(*) FROM messages WHERE has_mentions = 1')
        stats['messages_with_mentions'] = cursor.fetchone()[0]

        cursor = self.conn.execute('SELECT COUNT(*) FROM messages WHERE forwarded_from IS NOT NULL')
        stats['forwarded_messages'] = cursor.fetchone()[0]

        cursor = self.conn.execute('SELECT COUNT(*) FROM messages WHERE reply_to_message_id IS NOT NULL')
        stats['reply_messages'] = cursor.fetchone()[0]

        cursor = self.conn.execute('SELECT COUNT(*) FROM messages WHERE is_edited = 1')
        stats['edited_messages'] = cursor.fetchone()[0]

        cursor = self.conn.execute('SELECT type, COUNT(*) FROM entities GROUP BY type')
        stats['entities'] = {row[0]: row[1] for row in cursor.fetchall()}

        # Percentile stats using the O(n) selection algorithm.
        lengths = self._get_message_lengths()
        if lengths:
            stats['median_message_length'] = find_median(lengths)
            stats['p90_message_length'] = find_percentile(lengths, 90)

        return stats

    def _get_message_lengths(self) -> list[int]:
        """Get all message lengths for statistical analysis."""
        cursor = self.conn.execute(
            'SELECT length(text_plain) FROM messages WHERE text_plain IS NOT NULL'
        )
        # Drop NULL/zero lengths so percentile math only sees real text.
        return [row[0] for row in cursor.fetchall() if row[0]]

    # ==========================================
    # ENHANCED TOP-K METHODS (using Heap)
    # ==========================================

    def get_top_users(self, limit: int = 20) -> list[dict]:
        """
        Get most active users by message count.

        Uses Heap-based Top-K: O(n log k) instead of O(n log n)
        """
        cursor = self.conn.execute('''
            SELECT
                from_id,
                from_name,
                COUNT(*) as message_count,
                SUM(has_links) as links_shared,
                SUM(has_media) as media_shared,
                MIN(date_unixtime) as first_message,
                MAX(date_unixtime) as last_message
            FROM messages
            WHERE from_id IS NOT NULL AND from_id != ''
            GROUP BY from_id
        ''')

        # Use heap-based Top-K
        top = TopK(limit, key=lambda x: x['message_count'])
        for row in cursor.fetchall():
            top.push(dict(row))

        return top.get_top()

    def get_top_words_heap(self, limit: int = 50, min_length: int = 3) -> list[tuple[str, int]]:
        """
        Get most frequent words using Heap-based Top-K.

        O(n + m log k) where n=total words, m=unique words, k=limit
        """
        cursor = self.conn.execute('SELECT text_plain FROM messages WHERE text_plain IS NOT NULL')

        # Hebrew block (U+0590-U+05FF) plus Latin letters.
        word_pattern = re.compile(r'[\u0590-\u05FFa-zA-Z]+')
        words = []

        for row in cursor.fetchall():
            text = row[0]
            for word in word_pattern.findall(text.lower()):
                if len(word) >= min_length:
                    words.append(word)

        return top_k_frequent(words, limit)

    def get_top_domains_heap(self, limit: int = 20) -> list[tuple[str, int]]:
        """Get most shared domains using Heap-based Top-K."""
        cursor = self.conn.execute("SELECT value FROM entities WHERE type = 'link'")

        domain_pattern = re.compile(r'https?://(?:www\.)?([^/]+)')
        domains = []

        for row in cursor.fetchall():
            match = domain_pattern.match(row[0])
            if match:
                domains.append(match.group(1))

        return top_k_frequent(domains, limit)

    # ==========================================
    # LCS-BASED SIMILAR MESSAGE DETECTION
    # ==========================================

    def find_similar_messages(
        self,
        threshold: float = 0.7,
        min_length: int = 30,
        limit: int = 100,
        sample_size: int = 1000
    ) -> list[tuple[int, int, float, str, str]]:
        """
        Find similar/duplicate messages using LCS algorithm.

        Args:
            threshold: Minimum similarity (0-1)
            min_length: Minimum message length to consider
            limit: Maximum pairs to return
            sample_size: Sample size for large datasets

        Returns:
            List of (id1, id2, similarity, text1, text2) tuples
        """
        cursor = self.conn.execute('''
            SELECT id, text_plain FROM messages
            WHERE text_plain IS NOT NULL AND length(text_plain) >= ?
            ORDER BY RANDOM()
            LIMIT ?
        ''', (min_length, sample_size))

        messages = [(row[0], row[1]) for row in cursor.fetchall()]

        # Delegates to the module-level find_similar_messages() imported
        # from algorithms (this method only shares its name).
        similar_pairs = find_similar_messages(messages, threshold, min_length)

        # Fetch full text for results
        results = []
        for id1, id2, sim in similar_pairs[:limit]:
            cursor = self.conn.execute(
                'SELECT text_plain FROM messages WHERE id IN (?, ?)',
                (id1, id2)
            )
            rows = cursor.fetchall()
            if len(rows) == 2:
                results.append((id1, id2, sim, rows[0][0][:100], rows[1][0][:100]))

        return results

    def find_reposts(self, threshold: float = 0.9) -> list[dict]:
        """
        Find potential reposts (very similar messages from different users).

        NOTE: O(n^2) pairwise LCS over the 500 most recent messages; the
        LIMIT keeps the cost bounded.
        """
        cursor = self.conn.execute('''
            SELECT id, from_id, text_plain FROM messages
            WHERE text_plain IS NOT NULL AND length(text_plain) >= 50
            ORDER BY date_unixtime DESC
            LIMIT 500
        ''')

        messages = [(row[0], row[1], row[2]) for row in cursor.fetchall()]
        reposts = []

        for i in range(len(messages)):
            for j in range(i + 1, len(messages)):
                id1, user1, text1 = messages[i]
                id2, user2, text2 = messages[j]

                # Only consider different users
                if user1 == user2:
                    continue

                sim = lcs_similarity(text1, text2)
                if sim >= threshold:
                    reposts.append({
                        'message_id_1': id1,
                        'message_id_2': id2,
                        'user_1': user1,
                        'user_2': user2,
                        'similarity': sim,
                        'text_preview': text1[:80]
                    })

        return sorted(reposts, key=lambda x: x['similarity'], reverse=True)

    # ==========================================
    # SELECTION ALGORITHM (PERCENTILES)
    # ==========================================

    def get_message_length_stats(self) -> dict:
        """
        Get message length statistics using O(n) Selection algorithm.

        Much faster than sorting for percentile calculations.
        Returns an empty dict when there are no text messages.
        """
        lengths = self._get_message_lengths()

        if not lengths:
            return {}

        return {
            'count': len(lengths),
            'min': min(lengths),
            'max': max(lengths),
            'median': find_median(lengths),
            'p25': find_percentile(lengths, 25),
            'p75': find_percentile(lengths, 75),
            'p90': find_percentile(lengths, 90),
            'p95': find_percentile(lengths, 95),
            'p99': find_percentile(lengths, 99),
        }

    def get_response_time_percentiles(self) -> dict:
        """
        Calculate response time percentiles for replies.

        Uses Selection algorithm for O(n) percentile calculation.
        Returns an empty dict when there are no replies.
        """
        cursor = self.conn.execute('''
            SELECT
                m1.date_unixtime - m2.date_unixtime as response_time
            FROM messages m1
            JOIN messages m2 ON m1.reply_to_message_id = m2.id
            WHERE m1.date_unixtime > m2.date_unixtime
        ''')

        times = [row[0] for row in cursor.fetchall() if row[0] and row[0] > 0]

        if not times:
            return {}

        return {
            'count': len(times),
            'median_seconds': find_median(times),
            'p75_seconds': find_percentile(times, 75),
            'p90_seconds': find_percentile(times, 90),
            'p95_seconds': find_percentile(times, 95),
        }

    # ==========================================
    # RANK TREE (ORDER STATISTICS)
    # ==========================================

    def _build_user_rank_tree(self) -> RankTree:
        """Build (and cache) the rank tree for user activity ranking."""
        if self._rank_tree is not None:
            return self._rank_tree

        self._rank_tree = RankTree()

        cursor = self.conn.execute('''
            SELECT from_id, from_name, COUNT(*) as msg_count
            FROM messages
            WHERE from_id IS NOT NULL AND from_id != ''
            GROUP BY from_id
        ''')

        for row in cursor.fetchall():
            self._rank_tree.insert(
                row['msg_count'],
                {'user_id': row['from_id'], 'name': row['from_name'], 'count': row['msg_count']}
            )

        return self._rank_tree

    def get_user_rank(self, user_id: str) -> dict:
        """
        Get a user's rank among all users.

        Uses Rank Tree: O(log n) instead of O(n log n)
        Returns {'error': ...} when the user has no messages.
        """
        tree = self._build_user_rank_tree()

        # Get user's message count
        cursor = self.conn.execute(
            'SELECT COUNT(*) FROM messages WHERE from_id = ?',
            (user_id,)
        )
        count = cursor.fetchone()[0]

        if count == 0:
            return {'error': 'User not found'}

        rank = tree.rank(count)
        total_users = len(tree)

        return {
            'user_id': user_id,
            'message_count': count,
            'rank': total_users - rank + 1,  # Reverse for "top" ranking
            'total_users': total_users,
            'percentile': ((total_users - rank) / total_users) * 100
        }

    def get_user_by_rank(self, rank: int) -> Optional[dict]:
        """
        Get the user at a specific rank (1 = most active).

        Uses Rank Tree select(): O(log n)
        Returns None for an out-of-range rank.
        """
        tree = self._build_user_rank_tree()
        total = len(tree)

        if rank < 1 or rank > total:
            return None

        # Convert to tree rank (reverse order for "top")
        tree_rank = total - rank + 1
        return tree.select(tree_rank)

    # ==========================================
    # BUCKET SORT (TIME-BASED HISTOGRAMS)
    # ==========================================

    def get_activity_histogram(
        self,
        bucket_size: int = 86400,  # 1 day default
        start_time: Optional[int] = None,
        end_time: Optional[int] = None
    ) -> list[tuple[str, int]]:
        """
        Get activity histogram using Bucket Sort.

        O(n + k) where k = number of buckets

        Args:
            bucket_size: Bucket size in seconds (default: 1 day)
            start_time: Inclusive start timestamp (default: earliest message)
            end_time: Inclusive end timestamp (default: latest message)

        Returns:
            List of (date_string, count) tuples
        """
        cursor = self.conn.execute(
            'SELECT id, date_unixtime FROM messages WHERE date_unixtime IS NOT NULL'
        )
        records = [{'id': row[0], 'date_unixtime': row[1]} for row in cursor.fetchall()]

        # BUG FIX: start_time/end_time were previously accepted but ignored.
        if start_time is not None:
            records = [r for r in records if r['date_unixtime'] >= start_time]
        if end_time is not None:
            records = [r for r in records if r['date_unixtime'] <= end_time]

        if not records:
            return []

        hist = time_histogram(records, 'date_unixtime', bucket_size)

        # Format timestamps as dates
        return [
            (datetime.fromtimestamp(ts).strftime('%Y-%m-%d %H:%M'), count)
            for ts, count in hist
        ]

    def get_hourly_distribution(self) -> dict[int, int]:
        """
        Get message distribution by hour of day.

        Uses Bucket Sort: O(n)
        """
        cursor = self.conn.execute(
            'SELECT id, date_unixtime FROM messages WHERE date_unixtime IS NOT NULL'
        )
        records = [{'id': row[0], 'date_unixtime': row[1]} for row in cursor.fetchall()]

        return hourly_distribution(records, 'date_unixtime')

    # ==========================================
    # ORIGINAL METHODS (kept for compatibility)
    # ==========================================

    def get_hourly_activity(self) -> dict[int, int]:
        """Get message count by hour of day (SQL GROUP BY variant)."""
        sql = '''
            SELECT
                CAST(strftime('%H', datetime(date_unixtime, 'unixepoch')) AS INTEGER) as hour,
                COUNT(*) as count
            FROM messages
            WHERE date_unixtime IS NOT NULL
            GROUP BY hour
            ORDER BY hour
        '''
        cursor = self.conn.execute(sql)
        return {row[0]: row[1] for row in cursor.fetchall()}

    def get_daily_activity(self) -> dict[str, int]:
        """Get message count by day of week."""
        # strftime('%w') yields 0 = Sunday ... 6 = Saturday.
        days = ['Sunday', 'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday']
        sql = '''
            SELECT
                CAST(strftime('%w', datetime(date_unixtime, 'unixepoch')) AS INTEGER) as day,
                COUNT(*) as count
            FROM messages
            WHERE date_unixtime IS NOT NULL
            GROUP BY day
            ORDER BY day
        '''
        cursor = self.conn.execute(sql)
        return {days[row[0]]: row[1] for row in cursor.fetchall()}

    def get_monthly_activity(self) -> dict[str, int]:
        """Get message count by month (keys formatted as 'YYYY-MM')."""
        sql = '''
            SELECT
                strftime('%Y-%m', datetime(date_unixtime, 'unixepoch')) as month,
                COUNT(*) as count
            FROM messages
            WHERE date_unixtime IS NOT NULL
            GROUP BY month
            ORDER BY month
        '''
        cursor = self.conn.execute(sql)
        return {row[0]: row[1] for row in cursor.fetchall()}

    def get_top_domains(self, limit: int = 20) -> list[tuple[str, int]]:
        """Get most shared domains from links."""
        return self.get_top_domains_heap(limit)

    def get_top_mentioned(self, limit: int = 20) -> list[tuple[str, int]]:
        """Get most mentioned users/channels."""
        sql = '''
            SELECT value, COUNT(*) as count
            FROM entities
            WHERE type = 'mention'
            GROUP BY value
            ORDER BY count DESC
            LIMIT ?
        '''
        cursor = self.conn.execute(sql, (limit,))
        return [(row[0], row[1]) for row in cursor.fetchall()]

    def get_forwarded_sources(self, limit: int = 20) -> list[dict]:
        """Get top sources of forwarded messages."""
        sql = '''
            SELECT
                forwarded_from,
                forwarded_from_id,
                COUNT(*) as count
            FROM messages
            WHERE forwarded_from IS NOT NULL
            GROUP BY forwarded_from_id
            ORDER BY count DESC
            LIMIT ?
        '''
        cursor = self.conn.execute(sql, (limit,))
        return [dict(row) for row in cursor.fetchall()]

    def get_word_frequency(self, limit: int = 50, min_length: int = 3) -> list[tuple[str, int]]:
        """Get most frequent words using Heap-based Top-K."""
        return self.get_top_words_heap(limit, min_length)

    def get_reply_network(self, limit: int = 100) -> list[dict]:
        """Get reply relationships between users (who replies to whom)."""
        sql = '''
            SELECT
                m1.from_id as replier_id,
                m1.from_name as replier_name,
                m2.from_id as replied_to_id,
                m2.from_name as replied_to_name,
                COUNT(*) as reply_count
            FROM messages m1
            JOIN messages m2 ON m1.reply_to_message_id = m2.id
            WHERE m1.reply_to_message_id IS NOT NULL
            GROUP BY m1.from_id, m2.from_id
            ORDER BY reply_count DESC
            LIMIT ?
        '''
        cursor = self.conn.execute(sql, (limit,))
        return [dict(row) for row in cursor.fetchall()]

    def get_user_stats(self, user_id: str) -> dict:
        """Get detailed statistics for a specific user."""
        stats = {}

        cursor = self.conn.execute('''
            SELECT
                COUNT(*) as total,
                SUM(has_links) as links,
                SUM(has_media) as media,
                SUM(has_mentions) as mentions,
                SUM(is_edited) as edited,
                MIN(date_unixtime) as first_msg,
                MAX(date_unixtime) as last_msg
            FROM messages WHERE from_id = ?
        ''', (user_id,))
        row = cursor.fetchone()
        stats.update(dict(row))

        cursor = self.conn.execute('''
            SELECT COUNT(*) FROM messages m1
            JOIN messages m2 ON m1.reply_to_message_id = m2.id
            WHERE m2.from_id = ?
        ''', (user_id,))
        stats['replies_received'] = cursor.fetchone()[0]

        cursor = self.conn.execute('''
            SELECT COUNT(*) FROM messages
            WHERE from_id = ? AND reply_to_message_id IS NOT NULL
        ''', (user_id,))
        stats['replies_sent'] = cursor.fetchone()[0]

        # Add rank info using Rank Tree; .get() keeps this None-safe
        # when get_user_rank() returned an error dict.
        rank_info = self.get_user_rank(user_id)
        stats['rank'] = rank_info.get('rank')
        stats['percentile'] = rank_info.get('percentile')

        return stats
600
+
601
+
602
def print_bar(value: int, max_value: int, width: int = 40) -> str:
    """Render *value* scaled against *max_value* as a fixed-width ASCII bar.

    Returns an empty string when max_value is 0 (nothing to scale against);
    otherwise the bar is always exactly *width* characters wide.
    """
    if max_value == 0:
        return ''
    filled = int(value / max_value * width)
    empty = width - filled
    return '█' * filled + '░' * empty
608
+
609
+
610
def main():
    """CLI entry point: parse arguments and dispatch to one analyzer query.

    Exactly one query option is honored per invocation (first match wins);
    with no option, the argparse help text is printed.
    """
    parser = argparse.ArgumentParser(description='Analyze indexed Telegram messages (Enhanced)')
    parser.add_argument('--db', default='telegram.db', help='Database path')

    # Original options
    parser.add_argument('--stats', action='store_true', help='Show general statistics')
    parser.add_argument('--top-users', action='store_true', help='Show top users')
    parser.add_argument('--hourly', action='store_true', help='Show hourly activity')
    parser.add_argument('--daily', action='store_true', help='Show daily activity')
    parser.add_argument('--monthly', action='store_true', help='Show monthly activity')
    parser.add_argument('--domains', action='store_true', help='Show top shared domains')
    parser.add_argument('--mentions', action='store_true', help='Show top mentions')
    parser.add_argument('--words', action='store_true', help='Show word frequency')
    parser.add_argument('--sources', action='store_true', help='Show forwarded message sources')
    parser.add_argument('--replies', action='store_true', help='Show reply network')
    parser.add_argument('--user', help='Show stats for specific user ID')

    # Enhanced options
    parser.add_argument('--similar', action='store_true', help='Find similar messages (LCS)')
    parser.add_argument('--reposts', action='store_true', help='Find potential reposts')
    parser.add_argument('--percentiles', action='store_true', help='Show message length percentiles')
    parser.add_argument('--response-times', action='store_true', help='Show response time percentiles')
    parser.add_argument('--user-rank', help='Get rank of specific user')
    parser.add_argument('--rank', type=int, help='Get user at specific rank')
    parser.add_argument('--histogram', action='store_true', help='Show activity histogram')
    parser.add_argument('--bucket-size', type=int, default=86400, help='Histogram bucket size in seconds')

    parser.add_argument('--limit', type=int, default=20, help='Limit results')
    parser.add_argument('--json', action='store_true', help='Output as JSON')
    parser.add_argument('--threshold', type=float, default=0.7, help='Similarity threshold')

    args = parser.parse_args()

    with TelegramAnalyzer(args.db) as analyzer:
        # === ORIGINAL OPTIONS ===
        if args.stats:
            stats = analyzer.get_stats()
            if args.json:
                print(json.dumps(stats, indent=2, ensure_ascii=False))
            else:
                print("=== General Statistics ===\n")
                print(f"Total messages: {stats['total_messages']:,}")
                print(f"Total users: {stats['total_users']:,}")
                print(f"First message: {stats.get('first_message', 'N/A')}")
                print(f"Last message: {stats.get('last_message', 'N/A')}")
                print(f"Days span: {stats.get('days_span', 'N/A')}")
                print(f"Messages with media: {stats['messages_with_media']:,}")
                print(f"Messages with links: {stats['messages_with_links']:,}")
                print(f"Forwarded messages: {stats['forwarded_messages']:,}")
                print(f"Reply messages: {stats['reply_messages']:,}")
                if 'median_message_length' in stats:
                    print(f"\nMedian msg length: {stats['median_message_length']:.0f} chars")
                    print(f"90th percentile: {stats['p90_message_length']:.0f} chars")
                print(f"\nEntities: {stats.get('entities', {})}")
            return

        if args.top_users:
            users = analyzer.get_top_users(args.limit)
            if args.json:
                print(json.dumps(users, indent=2, ensure_ascii=False))
            else:
                print("=== Top Users by Message Count (Heap-based Top-K) ===\n")
                max_count = users[0]['message_count'] if users else 0
                for i, user in enumerate(users, 1):
                    bar = print_bar(user['message_count'], max_count, 30)
                    # BUG FIX: from_name can be NULL in the DB; don't crash.
                    name = user['from_name'] or 'Unknown'
                    print(f"{i:2}. {name[:20]:20} {bar} {user['message_count']:,}")
            return

        if args.hourly:
            hourly = analyzer.get_hourly_activity()
            if args.json:
                print(json.dumps(hourly, indent=2))
            else:
                print("=== Hourly Activity ===\n")
                max_count = max(hourly.values()) if hourly else 0
                for hour in range(24):
                    count = hourly.get(hour, 0)
                    bar = print_bar(count, max_count, 40)
                    print(f"{hour:02}:00 {bar} {count:,}")
            return

        if args.daily:
            daily = analyzer.get_daily_activity()
            if args.json:
                print(json.dumps(daily, indent=2))
            else:
                print("=== Daily Activity ===\n")
                max_count = max(daily.values()) if daily else 0
                for day, count in daily.items():
                    bar = print_bar(count, max_count, 40)
                    print(f"{day:10} {bar} {count:,}")
            return

        if args.monthly:
            monthly = analyzer.get_monthly_activity()
            if args.json:
                print(json.dumps(monthly, indent=2))
            else:
                print("=== Monthly Activity ===\n")
                max_count = max(monthly.values()) if monthly else 0
                for month, count in monthly.items():
                    bar = print_bar(count, max_count, 40)
                    print(f"{month} {bar} {count:,}")
            return

        if args.domains:
            domains = analyzer.get_top_domains(args.limit)
            if args.json:
                print(json.dumps(dict(domains), indent=2))
            else:
                print("=== Top Shared Domains (Heap-based Top-K) ===\n")
                max_count = domains[0][1] if domains else 0
                for domain, count in domains:
                    bar = print_bar(count, max_count, 30)
                    print(f"{domain[:30]:30} {bar} {count:,}")
            return

        if args.mentions:
            mentions = analyzer.get_top_mentioned(args.limit)
            if args.json:
                print(json.dumps(dict(mentions), indent=2))
            else:
                print("=== Top Mentioned Users ===\n")
                max_count = mentions[0][1] if mentions else 0
                for mention, count in mentions:
                    bar = print_bar(count, max_count, 30)
                    print(f"{mention:20} {bar} {count:,}")
            return

        if args.words:
            words = analyzer.get_word_frequency(args.limit)
            if args.json:
                print(json.dumps(dict(words), indent=2, ensure_ascii=False))
            else:
                print("=== Top Words (Heap-based Top-K) ===\n")
                max_count = words[0][1] if words else 0
                for word, count in words:
                    bar = print_bar(count, max_count, 30)
                    print(f"{word:20} {bar} {count:,}")
            return

        if args.sources:
            sources = analyzer.get_forwarded_sources(args.limit)
            if args.json:
                print(json.dumps(sources, indent=2, ensure_ascii=False))
            else:
                print("=== Top Forwarded Sources ===\n")
                max_count = sources[0]['count'] if sources else 0
                for src in sources:
                    bar = print_bar(src['count'], max_count, 30)
                    name = src['forwarded_from'] or 'Unknown'
                    print(f"{name[:30]:30} {bar} {src['count']:,}")
            return

        if args.replies:
            replies = analyzer.get_reply_network(args.limit)
            if args.json:
                print(json.dumps(replies, indent=2, ensure_ascii=False))
            else:
                print("=== Reply Network ===\n")
                for r in replies:
                    print(f"{r['replier_name']} → {r['replied_to_name']}: {r['reply_count']} replies")
            return

        if args.user:
            user_stats = analyzer.get_user_stats(args.user)
            if args.json:
                print(json.dumps(user_stats, indent=2))
            else:
                print(f"=== Stats for {args.user} ===\n")
                for key, value in user_stats.items():
                    print(f"{key}: {value}")
            return

        # === ENHANCED OPTIONS ===
        # BUG FIX: headers used to be printed before the --json check,
        # which corrupted JSON output; they now live in the text branch.

        if args.similar:
            similar = analyzer.find_similar_messages(
                threshold=args.threshold,
                limit=args.limit
            )
            if args.json:
                print(json.dumps(similar, indent=2, ensure_ascii=False))
            else:
                print(f"=== Similar Messages (LCS, threshold={args.threshold}) ===\n")
                for id1, id2, sim, text1, text2 in similar:
                    print(f"Similarity: {sim:.1%}")
                    print(f"  [{id1}] {text1}...")
                    print(f"  [{id2}] {text2}...")
                    print()
            return

        if args.reposts:
            reposts = analyzer.find_reposts(threshold=args.threshold)
            if args.json:
                print(json.dumps(reposts, indent=2, ensure_ascii=False))
            else:
                print("=== Potential Reposts (LCS-based) ===\n")
                for r in reposts[:args.limit]:
                    print(f"Similarity: {r['similarity']:.1%}")
                    print(f"  User 1: {r['user_1']}")
                    print(f"  User 2: {r['user_2']}")
                    print(f"  Text: {r['text_preview']}...")
                    print()
            return

        if args.percentiles:
            stats = analyzer.get_message_length_stats()
            if args.json:
                print(json.dumps(stats, indent=2))
            else:
                print("=== Message Length Percentiles (Selection Algorithm) ===\n")
                for key, value in stats.items():
                    print(f"{key:15}: {value:,.0f}")
            return

        if args.response_times:
            stats = analyzer.get_response_time_percentiles()
            if args.json:
                print(json.dumps(stats, indent=2))
            else:
                print("=== Response Time Percentiles (Selection Algorithm) ===\n")
                for key, value in stats.items():
                    if 'seconds' in key:
                        print(f"{key:15}: {value:,.0f}s ({value/60:.1f}m)")
                    else:
                        print(f"{key:15}: {value:,}")
            return

        if args.user_rank:
            rank_info = analyzer.get_user_rank(args.user_rank)
            if args.json:
                print(json.dumps(rank_info, indent=2))
            elif 'error' in rank_info:
                # BUG FIX: unknown users used to crash the ':,' formatting.
                print(rank_info['error'])
            else:
                print("=== User Rank (Rank Tree O(log n)) ===\n")
                print(f"User ID: {rank_info.get('user_id')}")
                print(f"Message count: {rank_info.get('message_count'):,}")
                print(f"Rank: #{rank_info.get('rank')} of {rank_info.get('total_users')}")
                print(f"Percentile: Top {rank_info.get('percentile'):.1f}%")
            return

        # BUG FIX: `if args.rank:` treated 0 as "flag absent"; be explicit.
        if args.rank is not None:
            user = analyzer.get_user_by_rank(args.rank)
            if args.json:
                print(json.dumps(user, indent=2, ensure_ascii=False))
            elif user:
                print(f"=== User at Rank #{args.rank} (Rank Tree O(log n)) ===\n")
                print(f"Name: {user.get('name')}")
                print(f"User ID: {user.get('user_id')}")
                print(f"Message count: {user.get('count'):,}")
            else:
                print(f"No user at rank {args.rank}")
            return

        if args.histogram:
            hist = analyzer.get_activity_histogram(bucket_size=args.bucket_size)
            if args.json:
                print(json.dumps(hist, indent=2))
            else:
                print(f"=== Activity Histogram (Bucket Sort, bucket={args.bucket_size}s) ===\n")
                max_count = max(c for _, c in hist) if hist else 0
                for date_str, count in hist[-args.limit:]:
                    bar = print_bar(count, max_count, 40)
                    print(f"{date_str} {bar} {count:,}")
            return

        # Default: show help
        parser.print_help()
878
+
879
+
880
# Script entry point (e.g. `python analyzer.py --stats`).
if __name__ == '__main__':
    main()
check_db.py ADDED
@@ -0,0 +1,49 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env python3
"""Quick script to check database contents."""
import os
import sqlite3
import sys

DB_PATH = 'telegram.db'


def main(db_path: str = DB_PATH) -> None:
    """Print a summary of the messages table.

    Shows the total row count, the date range, and the 50 newest
    messages (each truncated to 80 chars, newlines flattened).

    Args:
        db_path: Path to the SQLite database file (default: telegram.db).
    """
    if not os.path.exists(db_path):
        print(f"Database not found: {db_path}")
        # sys.exit instead of the site-module exit() so the script also
        # works under -S / frozen interpreters.
        sys.exit(1)

    try:
        conn = sqlite3.connect(db_path)
        conn.row_factory = sqlite3.Row  # allow column access by name
        try:
            # Get total count
            total = conn.execute("SELECT COUNT(*) FROM messages").fetchone()[0]
            print(f"Total messages in database: {total}")

            # Get date range
            date_range = conn.execute("""
                SELECT MIN(date) as earliest, MAX(date) as latest
                FROM messages
            """).fetchone()
            print(f"Date range: {date_range['earliest']} to {date_range['latest']}")
            print()

            # Show 50 newest messages
            print("=" * 60)
            print("50 NEWEST MESSAGES:")
            print("=" * 60)

            rows = conn.execute("""
                SELECT date, from_name, text_plain
                FROM messages
                ORDER BY date DESC
                LIMIT 50
            """).fetchall()

            for row in rows:
                text = (row['text_plain'] or '')[:80].replace('\n', ' ')
                name = row['from_name'] or 'Unknown'
                print(f"{row['date']} | {name}: {text}")
        finally:
            # Always release the connection, even if a query fails
            # (the original leaked it on exceptions).
            conn.close()
    except Exception as e:
        print(f"Error: {e}")
        import traceback
        traceback.print_exc()


if __name__ == '__main__':
    main()
daily_sync.py ADDED
@@ -0,0 +1,664 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Daily Telegram Sync Script
4
+ ===========================
5
+ Automatically syncs new messages from Telegram group to the analytics system.
6
+
7
+ What it does:
8
+ 1. Connects to Telegram via Telethon
9
+ 2. Fetches messages from the last 36 hours (12h overlap for safety)
10
+ 3. Adds new messages to telegram.db (duplicates ignored)
11
+ 4. Generates embeddings for new messages locally
12
+ 5. Adds embeddings to embeddings.db (duplicates ignored)
13
+
14
+ Usage:
15
+ First time: python daily_sync.py --setup
16
+ Daily run: python daily_sync.py
17
+ Custom hours: python daily_sync.py --hours 48
18
+
19
+ Automation:
20
+ Windows Task Scheduler: python daily_sync.py
21
+ Linux cron: 0 3 * * * cd /path/to/telegram && python daily_sync.py >> sync.log 2>&1
22
+ """
23
+
24
+ import os
25
+ import sys
26
+ import json
27
+ import time
28
+ import sqlite3
29
+ import asyncio
30
+ import argparse
31
+ import logging
32
+ from datetime import datetime, timedelta, timezone
33
+ from pathlib import Path
34
+
35
+ # Setup logging
36
+ LOG_FILE = Path(__file__).parent / 'sync.log'
37
+ logging.basicConfig(
38
+ level=logging.INFO,
39
+ format='%(asctime)s [%(levelname)s] %(message)s',
40
+ handlers=[
41
+ logging.StreamHandler(),
42
+ logging.FileHandler(LOG_FILE, encoding='utf-8')
43
+ ]
44
+ )
45
+ log = logging.getLogger('daily_sync')
46
+
47
+ # Paths
48
+ BASE_DIR = Path(__file__).parent
49
+ DB_PATH = BASE_DIR / 'telegram.db'
50
+ EMBEDDINGS_DB_PATH = BASE_DIR / 'embeddings.db'
51
+ SESSION_FILE = BASE_DIR / 'telegram_session'
52
+ CONFIG_FILE = BASE_DIR / 'sync_config.json'
53
+
54
+ # ==========================================
55
+ # CONFIGURATION
56
+ # ==========================================
57
+
58
def load_config() -> dict:
    """Return the saved sync configuration, or an empty dict if none exists."""
    if not CONFIG_FILE.exists():
        return {}
    with CONFIG_FILE.open('r') as fh:
        return json.load(fh)
64
+
65
+
66
def save_config(config: dict):
    """Write *config* to sync_config.json, pretty-printed with 2-space indent."""
    CONFIG_FILE.write_text(json.dumps(config, indent=2))
70
+
71
+
72
def setup_config():
    """Interactively collect Telegram API credentials on first run.

    Prompts for the API ID, API hash and target group, writes the
    resulting configuration to sync_config.json, and returns it.
    """
    banner = "=" * 50
    for line in (banner,
                 " Telegram Daily Sync - Setup",
                 banner,
                 "",
                 "You need Telegram API credentials.",
                 "Get them from: https://my.telegram.org/apps",
                 ""):
        print(line)

    # Dict literals evaluate values in source order, so the prompts
    # appear in the same order as before.
    config = {
        'api_id': int(input("API ID: ").strip()),
        'api_hash': input("API Hash: ").strip(),
        'group': input("Group username or ID (e.g., @mygroup or -1001234567890): ").strip(),
        'hours': 36,
    }

    save_config(config)
    print(f"\nConfiguration saved to {CONFIG_FILE}")
    print("Now run: python daily_sync.py")
    print("(First run will ask you to log in to Telegram)")
    return config
98
+
99
+
100
+ # ==========================================
101
+ # TELETHON: FETCH MESSAGES
102
+ # ==========================================
103
+
104
def telethon_message_to_json(message) -> dict | None:
    """
    Convert a Telethon message object to Telegram Desktop export JSON format.
    This ensures compatibility with the existing parse_message() in indexer.py.

    Args:
        message: A Telethon Message object (telethon.tl.custom.Message).

    Returns:
        A dict in Telegram Desktop export format, or None for messages
        with no usable text (media-only, service messages, etc.).
    """
    # Imported lazily so the module can be loaded without Telethon installed.
    from telethon.tl.types import (
        MessageEntityUrl, MessageEntityTextUrl,
        MessageEntityMention, MessageEntityMentionName,
        MessageEntityBold, MessageEntityItalic,
        MessageEntityCode, MessageEntityPre,
        MessageEntityHashtag, MessageEntityEmail,
        MessageEntityPhone, MessageEntityBotCommand,
    )

    if message.text is None and message.raw_text is None:
        return None

    text = message.raw_text or ''
    if not text.strip():
        # Skip empty messages (media-only, service messages, etc.)
        return None

    # Build text_entities in Telegram Desktop export format.
    # NOTE(review): Telegram entity offsets/lengths are defined in UTF-16
    # code units; slicing the Python str by them can mis-slice text that
    # contains astral characters (emoji) — TODO confirm/fix if it matters.
    text_entities = []
    if message.entities:
        for entity in message.entities:
            start = entity.offset
            end = entity.offset + entity.length
            entity_text = text[start:end]

            # Map Telethon entity classes onto the export format's type names;
            # anything unmapped stays 'plain' and is dropped below.
            entity_type = 'plain'
            if isinstance(entity, (MessageEntityUrl,)):
                entity_type = 'link'
            elif isinstance(entity, MessageEntityTextUrl):
                entity_type = 'text_link'
                # For text links the export stores the target URL, not the label.
                entity_text = entity.url
            elif isinstance(entity, (MessageEntityMention, MessageEntityMentionName)):
                entity_type = 'mention'
            elif isinstance(entity, MessageEntityBold):
                entity_type = 'bold'
            elif isinstance(entity, MessageEntityItalic):
                entity_type = 'italic'
            elif isinstance(entity, (MessageEntityCode, MessageEntityPre)):
                entity_type = 'code'
            elif isinstance(entity, MessageEntityHashtag):
                entity_type = 'hashtag'
            elif isinstance(entity, MessageEntityEmail):
                entity_type = 'email'
            elif isinstance(entity, MessageEntityPhone):
                entity_type = 'phone'
            elif isinstance(entity, MessageEntityBotCommand):
                entity_type = 'bot_command'

            if entity_type != 'plain':
                text_entities.append({
                    'type': entity_type,
                    'text': entity_text
                })

    # Get sender info: users expose first_name/last_name, channels a title.
    sender = message.sender
    from_name = ''
    from_id = ''

    if sender:
        if hasattr(sender, 'first_name'):
            # User
            parts = [sender.first_name or '']
            if sender.last_name:
                parts.append(sender.last_name)
            from_name = ' '.join(parts).strip()
            from_id = f'user{sender.id}'
        elif hasattr(sender, 'title'):
            # Channel/Group
            from_name = sender.title or ''
            from_id = f'channel{sender.id}'

    # Handle forwarded messages (best-effort: any attribute error is ignored).
    forwarded_from = None
    forwarded_from_id = None
    if message.forward:
        fwd = message.forward
        try:
            if fwd.sender:
                if hasattr(fwd.sender, 'first_name'):
                    parts = [fwd.sender.first_name or '']
                    if fwd.sender.last_name:
                        parts.append(fwd.sender.last_name)
                    forwarded_from = ' '.join(parts).strip()
                    forwarded_from_id = f'user{fwd.sender.id}'
                elif hasattr(fwd.sender, 'title'):
                    forwarded_from = fwd.sender.title
                    forwarded_from_id = f'channel{fwd.sender.id}'
            elif getattr(fwd, 'sender_name', None):
                forwarded_from = fwd.sender_name
            elif getattr(fwd, 'from_name', None):
                forwarded_from = fwd.from_name
        except Exception:
            pass  # Skip forward info if any attribute is missing

    # Photo info: record dimensions of the largest available size.
    photo_info = {}
    has_photo = False
    has_media = False

    if message.photo:
        has_photo = True
        has_media = True
        if hasattr(message.photo, 'sizes') and message.photo.sizes:
            largest = message.photo.sizes[-1]
            if hasattr(largest, 'w'):
                photo_info['width'] = largest.w
                photo_info['height'] = largest.h
    elif message.document or message.video or message.audio:
        has_media = True

    # Reply info
    reply_to = None
    if message.reply_to:
        reply_to = message.reply_to.reply_to_msg_id

    # Date handling: strip tzinfo so strftime matches the export's naive format.
    # NOTE(review): datetime.utcnow() is deprecated in Python 3.12+;
    # datetime.now(timezone.utc).replace(tzinfo=None) is the modern equivalent.
    msg_date = message.date.replace(tzinfo=None) if message.date else datetime.utcnow()

    # Build the JSON in Telegram Desktop export format
    msg_json = {
        'id': message.id,
        'type': 'message',
        'date': msg_date.strftime('%Y-%m-%dT%H:%M:%S'),
        'date_unixtime': str(int(msg_date.replace(tzinfo=timezone.utc).timestamp())),
        'from': from_name,
        'from_id': from_id,
        'text': text,
        'text_entities': text_entities,
    }

    # Optional fields are only present when set, mirroring the export format.
    if reply_to:
        msg_json['reply_to_message_id'] = reply_to
    if forwarded_from:
        msg_json['forwarded_from'] = forwarded_from
    if forwarded_from_id:
        msg_json['forwarded_from_id'] = forwarded_from_id
    if has_photo:
        msg_json['photo'] = '(photo)'
        msg_json.update(photo_info)
    if has_media and not has_photo:
        msg_json['media_type'] = 'document'
    if message.edit_date:
        edit_date = message.edit_date.replace(tzinfo=None)
        msg_json['edited'] = edit_date.strftime('%Y-%m-%dT%H:%M:%S')
        msg_json['edited_unixtime'] = str(int(edit_date.replace(tzinfo=timezone.utc).timestamp()))

    return msg_json
257
+
258
+
259
async def fetch_messages(config: dict, hours: int = 36) -> list[dict]:
    """
    Fetch messages from the last N hours using Telethon.
    Returns messages in Telegram Desktop export JSON format.

    Args:
        config: Sync configuration with 'api_id', 'api_hash' and 'group'.
        hours: How far back to fetch, in hours.

    Returns:
        List of message dicts (only messages with text content).
    """
    from telethon import TelegramClient

    api_id = config['api_id']
    api_hash = config['api_hash']
    group = config['group']

    client = TelegramClient(str(SESSION_FILE), api_id, api_hash)
    await client.start()

    log.info(f"Connected to Telegram")

    # Resolve group - handle numeric IDs properly
    if isinstance(group, str) and group.lstrip('-').isdigit():
        group = int(group)
    if isinstance(group, int) and group < 0:
        # Convert Telegram Web format to Telethon PeerChannel
        # -100XXXXXXXXXX → channel_id = XXXXXXXXXX
        # NOTE(review): str.replace removes any '-100' occurrence; safe here
        # only because '-' can appear solely at the start of the ID.
        from telethon.tl.types import PeerChannel
        channel_id = int(str(group).replace('-100', ''))
        entity = await client.get_entity(PeerChannel(channel_id))
    else:
        entity = await client.get_entity(group)
    log.info(f"Fetching from: {getattr(entity, 'title', group)}")

    # Calculate time window
    since = datetime.now(timezone.utc) - timedelta(hours=hours)
    log.info(f"Fetching messages from last {hours} hours (since {since.strftime('%Y-%m-%d %H:%M')} UTC)")

    # Fetch messages, newest first (Telethon default order).
    messages_json = []
    count = 0

    async for message in client.iter_messages(entity, offset_date=None, reverse=False):
        # Stop when we've gone past our time window
        if message.date < since:
            break

        count += 1
        # Convert to export JSON; returns None for text-less messages.
        msg_json = telethon_message_to_json(message)
        if msg_json:
            messages_json.append(msg_json)

    await client.disconnect()

    log.info(f"Fetched {count} messages, {len(messages_json)} with text content")
    return messages_json
310
+
311
+
312
async def fetch_participants(config: dict) -> list[dict]:
    """
    Fetch all group participants with metadata using Telethon.
    Returns participant info: name, username, status, join date, admin, etc.

    Args:
        config: Sync configuration with 'api_id', 'api_hash' and 'group'.

    Returns:
        List of participant dicts matching the 'participants' table schema.
    """
    from telethon import TelegramClient
    from telethon.tl.types import (
        UserStatusOnline, UserStatusOffline, UserStatusRecently,
        UserStatusLastWeek, UserStatusLastMonth,
        ChannelParticipantAdmin, ChannelParticipantCreator,
    )

    api_id = config['api_id']
    api_hash = config['api_hash']
    group = config['group']

    client = TelegramClient(str(SESSION_FILE), api_id, api_hash)
    await client.start()

    # Resolve group (same numeric-ID handling as fetch_messages).
    if isinstance(group, str) and group.lstrip('-').isdigit():
        group = int(group)
    if isinstance(group, int) and group < 0:
        from telethon.tl.types import PeerChannel
        channel_id = int(str(group).replace('-100', ''))
        entity = await client.get_entity(PeerChannel(channel_id))
    else:
        entity = await client.get_entity(group)

    log.info(f"Fetching participants from: {getattr(entity, 'title', group)}")

    participants = []
    now_ts = int(datetime.now(timezone.utc).timestamp())

    async for user in client.iter_participants(entity):
        # Determine online status; Telegram hides exact times for some users
        # (UserStatusRecently / LastWeek / LastMonth buckets).
        status = 'unknown'
        last_online = None

        if isinstance(user.status, UserStatusOnline):
            status = 'online'
            last_online = now_ts
        elif isinstance(user.status, UserStatusOffline):
            status = 'offline'
            if user.status.was_online:
                last_online = int(user.status.was_online.timestamp())
        elif isinstance(user.status, UserStatusRecently):
            status = 'recently'
        elif isinstance(user.status, UserStatusLastWeek):
            status = 'last_week'
        elif isinstance(user.status, UserStatusLastMonth):
            status = 'last_month'

        # Determine role (creator implies admin) and join date when available.
        is_admin = False
        is_creator = False
        join_date = None

        if hasattr(user, 'participant'):
            p = user.participant
            if isinstance(p, ChannelParticipantCreator):
                is_creator = True
                is_admin = True
            elif isinstance(p, ChannelParticipantAdmin):
                is_admin = True
            if hasattr(p, 'date') and p.date:
                join_date = int(p.date.timestamp())

        participants.append({
            'user_id': f'user{user.id}',
            'first_name': user.first_name or '',
            'last_name': user.last_name or '',
            'username': user.username or '',
            'phone': user.phone or '',
            'is_bot': 1 if user.bot else 0,
            'is_admin': 1 if is_admin else 0,
            'is_creator': 1 if is_creator else 0,
            'is_premium': 1 if getattr(user, 'premium', False) else 0,
            'join_date': join_date,
            'last_status': status,
            'last_online': last_online,
            'about': '',  # Requires separate API call per user
            'updated_at': now_ts,
        })

    await client.disconnect()

    log.info(f"Fetched {len(participants)} participants")
    return participants
401
+
402
+
403
def sync_participants(participants: list[dict]) -> dict:
    """Save participants to telegram.db.

    Creates the 'participants' table on first use and upserts every row
    (INSERT OR REPLACE keyed on user_id).

    Args:
        participants: Rows as produced by fetch_participants().

    Returns:
        dict with 'synced': number of rows written.
    """
    if not participants:
        return {'synced': 0}

    conn = sqlite3.connect(str(DB_PATH))
    try:
        # Create table if not exists
        conn.execute("""
            CREATE TABLE IF NOT EXISTS participants (
                user_id TEXT PRIMARY KEY,
                first_name TEXT,
                last_name TEXT,
                username TEXT,
                phone TEXT,
                is_bot INTEGER DEFAULT 0,
                is_admin INTEGER DEFAULT 0,
                is_creator INTEGER DEFAULT 0,
                is_premium INTEGER DEFAULT 0,
                join_date INTEGER,
                last_status TEXT DEFAULT 'unknown',
                last_online INTEGER,
                about TEXT,
                updated_at INTEGER
            )
        """)

        # Upsert participants
        conn.executemany("""
            INSERT OR REPLACE INTO participants
            (user_id, first_name, last_name, username, phone, is_bot, is_admin,
             is_creator, is_premium, join_date, last_status, last_online, about, updated_at)
            VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
        """, [
            (p['user_id'], p['first_name'], p['last_name'], p['username'],
             p['phone'], p['is_bot'], p['is_admin'], p['is_creator'],
             p['is_premium'], p['join_date'], p['last_status'],
             p['last_online'], p['about'], p['updated_at'])
            for p in participants
        ])

        conn.commit()
    finally:
        # Always release the connection, even if a statement fails
        # (the original leaked it on exceptions).
        conn.close()

    log.info(f"Synced {len(participants)} participants to DB")
    return {'synced': len(participants)}
449
+
450
+
451
+ # ==========================================
452
+ # DATABASE: INDEX NEW MESSAGES
453
+ # ==========================================
454
+
455
def index_messages(messages_json: list[dict]) -> dict:
    """
    Add new messages to telegram.db using IncrementalIndexer.
    Duplicates are automatically ignored.

    Args:
        messages_json: Messages in Telegram Desktop export JSON format.

    Returns:
        Indexer stats dict with at least 'new_messages' and 'duplicates'.
    """
    if not messages_json:
        return {'new_messages': 0, 'duplicates': 0}

    from indexer import IncrementalIndexer

    log.info(f"Indexing {len(messages_json)} messages into telegram.db...")

    indexer = IncrementalIndexer(str(DB_PATH))
    try:
        stats = indexer.update_from_json_data({'messages': messages_json}, show_progress=True)
    finally:
        # Release the DB handle even if indexing raises partway through
        # (the original leaked it on exceptions).
        indexer.close()

    log.info(f"Indexing done: {stats['new_messages']} new, {stats['duplicates']} duplicates")
    return stats
473
+
474
+
475
+ # ==========================================
476
+ # EMBEDDINGS: GENERATE FOR NEW MESSAGES
477
+ # ==========================================
478
+
479
def generate_embeddings(messages_json: list[dict]) -> dict:
    """
    Generate embeddings for new messages and add to embeddings.db.
    Only processes messages that don't already have embeddings.

    Args:
        messages_json: Messages in Telegram Desktop export JSON format.

    Returns:
        Stats dict: 'new_embeddings' plus either 'skipped' (nothing to do)
        or 'total_embeddings' (after insert).
    """
    if not os.path.exists(EMBEDDINGS_DB_PATH):
        log.warning(f"embeddings.db not found at {EMBEDDINGS_DB_PATH}. Skipping embeddings.")
        return {'new_embeddings': 0, 'skipped': 0}

    import numpy as np

    emb_conn = sqlite3.connect(str(EMBEDDINGS_DB_PATH))
    try:
        # Message IDs that already have embeddings (skipped below).
        existing_ids = {row[0] for row in emb_conn.execute("SELECT message_id FROM embeddings")}

        # Filter to messages that:
        # 1. Have text content
        # 2. Text is longer than 10 chars
        # 3. Don't already have embeddings
        new_messages = []
        for msg in messages_json:
            msg_id = msg.get('id')
            text = msg.get('text', '')
            if isinstance(text, list):
                # Handle complex text format (list of plain strings and
                # entity dicts, as in Telegram Desktop exports).
                text = ''.join(
                    part if isinstance(part, str) else part.get('text', '')
                    for part in text
                )

            if msg_id and msg_id not in existing_ids and text and len(text.strip()) > 10:
                new_messages.append({
                    'id': msg_id,
                    'from_name': msg.get('from', ''),
                    'text': text
                })

        if not new_messages:
            log.info("No new messages need embeddings")
            return {'new_embeddings': 0, 'skipped': len(messages_json)}

        log.info(f"Generating embeddings for {len(new_messages)} new messages...")

        # Load model lazily (heavy import; only needed when there is work).
        from sentence_transformers import SentenceTransformer
        model = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')

        # Generate embeddings (max 500 chars per message)
        texts = [m['text'][:500] for m in new_messages]
        embeddings = model.encode(texts, show_progress_bar=True, convert_to_numpy=True,
                                  batch_size=64)

        # Insert into embeddings.db; float32 blobs keep storage compact.
        data = [
            (msg['id'],
             msg['from_name'],
             msg['text'][:100],  # Preview
             embeddings[i].astype(np.float32).tobytes())
            for i, msg in enumerate(new_messages)
        ]

        emb_conn.executemany(
            "INSERT OR IGNORE INTO embeddings (message_id, from_name, text_preview, embedding) VALUES (?, ?, ?, ?)",
            data
        )
        emb_conn.commit()

        # Verify
        total = emb_conn.execute("SELECT COUNT(*) FROM embeddings").fetchone()[0]
    finally:
        # Close the connection on every exit path (the original leaked it
        # whenever model loading, encoding or the insert raised).
        emb_conn.close()

    log.info(f"Added {len(data)} new embeddings. Total embeddings: {total:,}")
    return {'new_embeddings': len(data), 'total_embeddings': total}
557
+
558
+
559
+ # ==========================================
560
+ # MAIN
561
+ # ==========================================
562
+
563
def run_sync(hours: int = 36, skip_embeddings: bool = False):
    """Run the full sync pipeline.

    Steps: (1) fetch messages from Telegram, (2) index them into
    telegram.db, (3) sync participants, (4) generate embeddings, then
    notify a locally running server (if any) to refresh its caches.

    Args:
        hours: Look-back window for message fetching.
        skip_embeddings: If True, skip the embedding-generation step.
    """
    start_time = time.time()

    log.info("=" * 50)
    log.info("Starting daily sync")
    log.info("=" * 50)

    # Load config
    config = load_config()
    if not config:
        log.error("No configuration found. Run: python daily_sync.py --setup")
        sys.exit(1)

    # Step 1: Fetch messages from Telegram
    log.info("[1/4] Fetching messages from Telegram...")
    messages_json = asyncio.run(fetch_messages(config, hours=hours))

    if not messages_json:
        log.info("No messages found in the time window.")

    # Step 2: Index into telegram.db (skipped when nothing was fetched)
    index_stats = {'new_messages': 0, 'duplicates': 0}
    if messages_json:
        log.info("[2/4] Indexing new messages...")
        index_stats = index_messages(messages_json)

    # Step 3: Sync participants (best-effort: a failure here must not
    # abort the rest of the pipeline)
    log.info("[3/4] Syncing participants...")
    try:
        participants = asyncio.run(fetch_participants(config))
        part_stats = sync_participants(participants)
    except Exception as e:
        log.warning(f"Failed to sync participants: {e}")
        part_stats = {'synced': 0}

    # Step 4: Generate embeddings
    if skip_embeddings or not messages_json:
        log.info("[4/4] Skipping embeddings")
        emb_stats = {'new_embeddings': 0}
    else:
        log.info("[4/4] Generating embeddings for new messages...")
        emb_stats = generate_embeddings(messages_json)

    # Notify running server to invalidate caches and reload embeddings.
    # Best-effort: if the server is down, caches refresh on next restart.
    has_changes = (index_stats.get('new_messages', 0) > 0
                   or part_stats.get('synced', 0) > 0
                   or emb_stats.get('new_embeddings', 0) > 0)
    if has_changes:
        try:
            import urllib.request
            urllib.request.urlopen('http://localhost:5000/api/cache/invalidate', timeout=5)
            log.info("Server caches invalidated")
            if emb_stats.get('new_embeddings', 0) > 0:
                urllib.request.urlopen('http://localhost:5000/api/embeddings/reload', timeout=5)
                log.info("Server notified to reload embeddings")
        except Exception:
            log.info("Server not running or unreachable - caches will refresh on next restart")

    # Summary
    elapsed = time.time() - start_time
    log.info("=" * 50)
    log.info("Sync complete!")
    log.info(f"  Messages fetched: {len(messages_json) if messages_json else 0}")
    log.info(f"  New to DB: {index_stats.get('new_messages', 0)}")
    log.info(f"  Duplicates skipped: {index_stats.get('duplicates', 0)}")
    log.info(f"  Participants synced: {part_stats.get('synced', 0)}")
    log.info(f"  New embeddings: {emb_stats.get('new_embeddings', 0)}")
    log.info(f"  Time: {elapsed:.1f}s")
    log.info("=" * 50)
633
+
634
+
635
def main():
    """CLI entry point: parse flags and dispatch to setup, fetch-only, or full sync."""
    parser = argparse.ArgumentParser(description='Daily Telegram Sync')
    for flag, spec in (
        ('--setup', dict(action='store_true', help='First time setup')),
        ('--hours', dict(type=int, default=36, help='Hours to look back (default: 36)')),
        ('--skip-embeddings', dict(action='store_true', help='Skip embedding generation')),
        ('--fetch-only', dict(action='store_true', help='Only fetch, do not index')),
    ):
        parser.add_argument(flag, **spec)
    opts = parser.parse_args()

    if opts.setup:
        setup_config()
        return

    if opts.fetch_only:
        config = load_config()
        if not config:
            log.error("No configuration found. Run: python daily_sync.py --setup")
            sys.exit(1)
        messages = asyncio.run(fetch_messages(config, hours=opts.hours))
        # Dump the raw fetch to disk so it can be inspected manually.
        output = BASE_DIR / 'fetched_messages.json'
        with open(output, 'w', encoding='utf-8') as f:
            json.dump(messages, f, ensure_ascii=False, indent=2)
        log.info(f"Saved {len(messages)} messages to {output}")
        return

    run_sync(hours=opts.hours, skip_embeddings=opts.skip_embeddings)


if __name__ == '__main__':
    main()
static/css/style.css CHANGED
@@ -67,6 +67,11 @@ body {
67
  flex-direction: column;
68
  border-right: 1px solid var(--border-color);
69
  z-index: 100;
 
 
 
 
 
70
  }
71
 
72
  .logo {
@@ -821,8 +826,54 @@ input:focus {
821
  }
822
 
823
  @media (max-width: 768px) {
824
- .stats-grid {
825
- grid-template-columns: repeat(2, 1fr);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
826
  }
827
 
828
  .header {
@@ -831,9 +882,93 @@ input:focus {
831
  align-items: flex-start;
832
  }
833
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
834
  .user-stats-grid {
835
  grid-template-columns: repeat(2, 1fr);
836
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
837
  }
838
 
839
  /* ==========================================
 
67
  flex-direction: column;
68
  border-right: 1px solid var(--border-color);
69
  z-index: 100;
70
+ transition: width 0.3s ease;
71
+ }
72
+
73
+ .mobile-menu-btn {
74
+ display: none;
75
  }
76
 
77
  .logo {
 
826
  }
827
 
828
  @media (max-width: 768px) {
829
+ .sidebar {
830
+ width: 0;
831
+ overflow: hidden;
832
+ transition: width 0.3s ease;
833
+ }
834
+
835
+ .sidebar.open {
836
+ width: 250px;
837
+ }
838
+
839
+ .sidebar-overlay {
840
+ display: none;
841
+ position: fixed;
842
+ top: 0;
843
+ left: 0;
844
+ right: 0;
845
+ bottom: 0;
846
+ background: rgba(0,0,0,0.5);
847
+ z-index: 99;
848
+ }
849
+
850
+ .sidebar-overlay.active {
851
+ display: block;
852
+ }
853
+
854
+ .mobile-menu-btn {
855
+ display: flex !important;
856
+ align-items: center;
857
+ justify-content: center;
858
+ width: 40px;
859
+ height: 40px;
860
+ background: var(--bg-card);
861
+ border: 1px solid var(--border-color);
862
+ border-radius: var(--radius-md);
863
+ color: var(--text-primary);
864
+ font-size: 1.5rem;
865
+ cursor: pointer;
866
+ position: fixed;
867
+ top: var(--spacing-md);
868
+ left: var(--spacing-md);
869
+ z-index: 98;
870
+ }
871
+
872
+ .main-content {
873
+ margin-left: 0;
874
+ max-width: 100vw;
875
+ padding: var(--spacing-md);
876
+ padding-top: 60px;
877
  }
878
 
879
  .header {
 
882
  align-items: flex-start;
883
  }
884
 
885
+ .header h1 {
886
+ font-size: 1.25rem;
887
+ }
888
+
889
+ .header-controls {
890
+ width: 100%;
891
+ flex-wrap: wrap;
892
+ }
893
+
894
+ .stats-grid {
895
+ grid-template-columns: repeat(2, 1fr);
896
+ }
897
+
898
+ .stats-grid .stat-card {
899
+ padding: var(--spacing-md);
900
+ }
901
+
902
+ .charts-row {
903
+ grid-template-columns: 1fr;
904
+ }
905
+
906
+ .chart-card.full-width,
907
+ .chart-card.large {
908
+ grid-column: span 1;
909
+ }
910
+
911
+ .lists-row {
912
+ grid-template-columns: 1fr;
913
+ }
914
+
915
  .user-stats-grid {
916
  grid-template-columns: repeat(2, 1fr);
917
  }
918
+
919
+ .heatmap-table th,
920
+ .heatmap-table td {
921
+ min-width: 25px;
922
+ font-size: 0.65rem;
923
+ padding: 2px;
924
+ }
925
+
926
+ .heatmap-cell {
927
+ width: 22px;
928
+ height: 22px;
929
+ }
930
+
931
+ /* Chat messages */
932
+ .message {
933
+ max-width: 95% !important;
934
+ }
935
+
936
+ /* Search */
937
+ .search-box {
938
+ flex-direction: column;
939
+ }
940
+
941
+ .search-box input {
942
+ width: 100%;
943
+ }
944
+
945
+ /* Tables */
946
+ .data-table {
947
+ font-size: 0.8rem;
948
+ }
949
+
950
+ .data-table th,
951
+ .data-table td {
952
+ padding: var(--spacing-sm);
953
+ }
954
+ }
955
+
956
+ @media (max-width: 480px) {
957
+ .stats-grid {
958
+ grid-template-columns: 1fr;
959
+ }
960
+
961
+ .user-stats-grid {
962
+ grid-template-columns: 1fr;
963
+ }
964
+
965
+ .header h1 {
966
+ font-size: 1.1rem;
967
+ }
968
+
969
+ .chart-container {
970
+ height: 200px;
971
+ }
972
  }
973
 
974
  /* ==========================================
static/js/dashboard.js CHANGED
@@ -9,6 +9,31 @@
9
  * - Export functionality
10
  */
11
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12
  // ==========================================
13
  // GLOBAL STATE
14
  // ==========================================
 
9
  * - Export functionality
10
  */
11
 
12
// ==========================================
// MOBILE MENU
// ==========================================

/** Toggle the slide-out sidebar and its dimming overlay (mobile layout). */
function toggleMobileMenu() {
    const nav = document.querySelector('.sidebar');
    const shade = document.querySelector('.sidebar-overlay');
    nav.classList.toggle('open');
    if (shade) shade.classList.toggle('active');
}

// Auto-close the menu after a navigation tap on small screens.
document.addEventListener('DOMContentLoaded', () => {
    document.querySelectorAll('.nav-link').forEach((link) => {
        link.addEventListener('click', () => {
            if (window.innerWidth > 768) return;
            const nav = document.querySelector('.sidebar');
            const shade = document.querySelector('.sidebar-overlay');
            nav.classList.remove('open');
            if (shade) shade.classList.remove('active');
        });
    });
});
36
+
37
  // ==========================================
38
  // GLOBAL STATE
39
  // ==========================================
templates/index.html CHANGED
@@ -8,6 +8,8 @@
8
  <script src="https://cdn.jsdelivr.net/npm/chart.js"></script>
9
  </head>
10
  <body>
 
 
11
  <!-- Sidebar -->
12
  <nav class="sidebar">
13
  <div class="logo">
 
8
  <script src="https://cdn.jsdelivr.net/npm/chart.js"></script>
9
  </head>
10
  <body>
11
+ <button class="mobile-menu-btn" onclick="toggleMobileMenu()">&#9776;</button>
12
+ <div class="sidebar-overlay" onclick="toggleMobileMenu()"></div>
13
  <!-- Sidebar -->
14
  <nav class="sidebar">
15
  <div class="logo">
templates/moderation.html CHANGED
@@ -8,6 +8,8 @@
8
  <script src="https://cdn.jsdelivr.net/npm/chart.js"></script>
9
  </head>
10
  <body>
 
 
11
  <!-- Sidebar -->
12
  <nav class="sidebar">
13
  <div class="logo">
 
8
  <script src="https://cdn.jsdelivr.net/npm/chart.js"></script>
9
  </head>
10
  <body>
11
+ <button class="mobile-menu-btn" onclick="toggleMobileMenu()">&#9776;</button>
12
+ <div class="sidebar-overlay" onclick="toggleMobileMenu()"></div>
13
  <!-- Sidebar -->
14
  <nav class="sidebar">
15
  <div class="logo">
templates/search.html CHANGED
@@ -7,6 +7,8 @@
7
  <link rel="stylesheet" href="/static/css/style.css">
8
  </head>
9
  <body>
 
 
10
  <!-- Sidebar -->
11
  <nav class="sidebar">
12
  <div class="logo">
 
7
  <link rel="stylesheet" href="/static/css/style.css">
8
  </head>
9
  <body>
10
+ <button class="mobile-menu-btn" onclick="toggleMobileMenu()">&#9776;</button>
11
+ <div class="sidebar-overlay" onclick="toggleMobileMenu()"></div>
12
  <!-- Sidebar -->
13
  <nav class="sidebar">
14
  <div class="logo">
templates/settings.html CHANGED
@@ -167,6 +167,8 @@
167
  </style>
168
  </head>
169
  <body>
 
 
170
  <!-- Sidebar -->
171
  <nav class="sidebar">
172
  <div class="logo">
 
167
  </style>
168
  </head>
169
  <body>
170
+ <button class="mobile-menu-btn" onclick="toggleMobileMenu()">&#9776;</button>
171
+ <div class="sidebar-overlay" onclick="toggleMobileMenu()"></div>
172
  <!-- Sidebar -->
173
  <nav class="sidebar">
174
  <div class="logo">
templates/user_profile.html CHANGED
@@ -253,6 +253,8 @@
253
  </style>
254
  </head>
255
  <body>
 
 
256
  <!-- Sidebar -->
257
  <nav class="sidebar">
258
  <div class="logo">
 
253
  </style>
254
  </head>
255
  <body>
256
+ <button class="mobile-menu-btn" onclick="toggleMobileMenu()">&#9776;</button>
257
+ <div class="sidebar-overlay" onclick="toggleMobileMenu()"></div>
258
  <!-- Sidebar -->
259
  <nav class="sidebar">
260
  <div class="logo">
templates/users.html CHANGED
@@ -8,6 +8,8 @@
8
  <script src="https://cdn.jsdelivr.net/npm/chart.js"></script>
9
  </head>
10
  <body>
 
 
11
  <!-- Sidebar -->
12
  <nav class="sidebar">
13
  <div class="logo">
 
8
  <script src="https://cdn.jsdelivr.net/npm/chart.js"></script>
9
  </head>
10
  <body>
11
+ <button class="mobile-menu-btn" onclick="toggleMobileMenu()">&#9776;</button>
12
+ <div class="sidebar-overlay" onclick="toggleMobileMenu()"></div>
13
  <!-- Sidebar -->
14
  <nav class="sidebar">
15
  <div class="logo">
tests.py ADDED
@@ -0,0 +1,474 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Integration tests for Telegram Analytics: indexer, search, and dashboard endpoints.
4
+
5
+ Run with: python -m pytest tests.py -v
6
+ Or: python tests.py
7
+ """
8
+
9
import json
import os
import shutil
import sqlite3
import tempfile
import time
import unittest

from datetime import datetime, timezone
from pathlib import Path
17
+
18
+
19
+ # ---------------------------------------------------------------------------
20
+ # Helpers
21
+ # ---------------------------------------------------------------------------
22
+
23
+ def _sample_messages(n: int = 5) -> list[dict]:
24
+ """Generate N realistic Telegram-format messages."""
25
+ base_ts = 1700000000
26
+ users = [
27
+ ("user1", "Alice"),
28
+ ("user2", "Bob"),
29
+ ("user3", "Carol"),
30
+ ]
31
+ msgs = []
32
+ for i in range(1, n + 1):
33
+ uid, name = users[i % len(users)]
34
+ msgs.append({
35
+ "id": 1000 + i,
36
+ "type": "message",
37
+ "date": f"2024-01-{(i % 28) + 1:02d}T10:00:00",
38
+ "date_unixtime": str(base_ts + i * 3600),
39
+ "from": name,
40
+ "from_id": uid,
41
+ "text": f"Test message number {i} from {name}",
42
+ "text_entities": [
43
+ {"type": "plain", "text": f"Test message number {i} from {name}"}
44
+ ],
45
+ "reply_to_message_id": (1000 + i - 1) if i > 1 else None,
46
+ })
47
+ return msgs
48
+
49
+
50
+ def _write_json(path: str, messages: list[dict]):
51
+ """Write messages in Telegram export JSON format."""
52
+ with open(path, "w", encoding="utf-8") as f:
53
+ json.dump({"messages": messages}, f, ensure_ascii=False)
54
+
55
+
56
+ # ---------------------------------------------------------------------------
57
+ # 1. Indexer Tests
58
+ # ---------------------------------------------------------------------------
59
+
60
class TestIndexer(unittest.TestCase):
    """Integration tests for OptimizedIndexer and IncrementalIndexer."""

    def setUp(self):
        """Create a temp dir containing a 10-message Telegram export JSON."""
        self.tmpdir = tempfile.mkdtemp()
        self.db_path = os.path.join(self.tmpdir, "test.db")
        self.json_path = os.path.join(self.tmpdir, "messages.json")
        self.messages = _sample_messages(10)
        _write_json(self.json_path, self.messages)

    def tearDown(self):
        # shutil is imported at module level (was a function-local import).
        shutil.rmtree(self.tmpdir, ignore_errors=True)

    def test_optimized_indexer_indexes_messages(self):
        """Indexing stores every message and reports an accurate count."""
        from indexer import OptimizedIndexer
        indexer = OptimizedIndexer(self.db_path, build_trigrams=False, build_graph=False)
        stats = indexer.index_file(self.json_path, show_progress=False)

        self.assertGreater(stats["messages"], 0)

        conn = sqlite3.connect(self.db_path)
        count = conn.execute("SELECT COUNT(*) FROM messages").fetchone()[0]
        conn.close()
        self.assertEqual(count, stats["messages"])

    def test_incremental_indexer_deduplication(self):
        """Re-feeding identical data yields only duplicates, no new rows."""
        from indexer import OptimizedIndexer, IncrementalIndexer

        # First: create DB with OptimizedIndexer.
        opt = OptimizedIndexer(self.db_path, build_trigrams=False, build_graph=False)
        opt.index_file(self.json_path, show_progress=False)

        # Same data through IncrementalIndexer -- everything is a duplicate.
        idx = IncrementalIndexer(self.db_path)
        stats = idx.update_from_json(self.json_path, show_progress=False)
        idx.close()

        self.assertEqual(stats["new_messages"], 0)
        self.assertGreater(stats["duplicates"], 0)

    def test_incremental_indexer_adds_new(self):
        """Incremental update counts overlapping and new messages correctly."""
        from indexer import OptimizedIndexer, IncrementalIndexer

        # Seed the DB with the first 5 messages only.
        msgs5 = _sample_messages(5)
        _write_json(self.json_path, msgs5)
        opt = OptimizedIndexer(self.db_path, build_trigrams=False, build_graph=False)
        opt.index_file(self.json_path, show_progress=False)

        # Feed a superset: 5 already indexed + 5 brand new.
        msgs10 = _sample_messages(10)
        json2 = os.path.join(self.tmpdir, "messages2.json")
        _write_json(json2, msgs10)

        idx = IncrementalIndexer(self.db_path)
        stats = idx.update_from_json(json2, show_progress=False)
        idx.close()

        self.assertEqual(stats["new_messages"], 5)
        self.assertEqual(stats["duplicates"], 5)

    def test_incremental_indexer_from_json_data(self):
        """update_from_json_data accepts an in-memory message list."""
        from indexer import OptimizedIndexer, IncrementalIndexer

        # Init DB first.
        opt = OptimizedIndexer(self.db_path, build_trigrams=False, build_graph=False)
        opt.index_file(self.json_path, show_progress=False)

        # 10 already indexed + 5 new, supplied directly as a list.
        new_msgs = _sample_messages(15)
        idx = IncrementalIndexer(self.db_path)
        stats = idx.update_from_json_data(new_msgs, show_progress=False)
        idx.close()

        self.assertEqual(stats["new_messages"], 5)

    def test_fts5_search_works(self):
        """The FTS5 shadow table is populated and matches indexed text."""
        from indexer import OptimizedIndexer
        indexer = OptimizedIndexer(self.db_path, build_trigrams=False, build_graph=False)
        indexer.index_file(self.json_path, show_progress=False)

        conn = sqlite3.connect(self.db_path)
        cursor = conn.execute(
            "SELECT COUNT(*) FROM messages_fts WHERE messages_fts MATCH 'message'"
        )
        count = cursor.fetchone()[0]
        conn.close()

        self.assertGreater(count, 0, "FTS5 search should find messages with 'message'")

    def test_streaming_load_json_messages(self):
        """load_json_messages streams all messages and adds text_plain."""
        from indexer import load_json_messages
        msgs = list(load_json_messages(self.json_path))
        self.assertEqual(len(msgs), 10)
        self.assertIn("text_plain", msgs[0])

    def test_entities_extracted(self):
        """Messages with links/mentions in text_entities should have entities stored."""
        msgs = [
            {
                "id": 9001,
                "type": "message",
                "date": "2024-01-01T10:00:00",
                "date_unixtime": "1700000000",
                "from": "Alice",
                "from_id": "user1",
                "text": "Check https://example.com and @bob",
                "text_entities": [
                    {"type": "plain", "text": "Check "},
                    {"type": "link", "text": "https://example.com"},
                    {"type": "plain", "text": " and "},
                    {"type": "mention", "text": "@bob"},
                ],
            }
        ]
        _write_json(self.json_path, msgs)

        from indexer import OptimizedIndexer
        indexer = OptimizedIndexer(self.db_path, build_trigrams=False, build_graph=False)
        indexer.index_file(self.json_path, show_progress=False)

        conn = sqlite3.connect(self.db_path)
        entities = conn.execute(
            "SELECT type, value FROM entities WHERE message_id = 9001"
        ).fetchall()
        conn.close()

        types = [e[0] for e in entities]
        self.assertIn("link", types)
        self.assertIn("mention", types)
190
+
191
+ # ---------------------------------------------------------------------------
192
+ # 2. Search Tests
193
+ # ---------------------------------------------------------------------------
194
+
195
class TestSearch(unittest.TestCase):
    """Tests for FTS search over an indexed 20-message corpus."""

    def setUp(self):
        """Index a fresh 20-message corpus for every test."""
        self.tmpdir = tempfile.mkdtemp()
        self.db_path = os.path.join(self.tmpdir, "test.db")
        self.json_path = os.path.join(self.tmpdir, "messages.json")
        _write_json(self.json_path, _sample_messages(20))

        from indexer import OptimizedIndexer
        indexer = OptimizedIndexer(self.db_path, build_trigrams=False, build_graph=False)
        indexer.index_file(self.json_path, show_progress=False)

    def tearDown(self):
        # shutil is imported at module level (was a function-local import).
        shutil.rmtree(self.tmpdir, ignore_errors=True)

    def test_fts_match_query(self):
        """FTS matches return only rows whose text contains the term."""
        conn = sqlite3.connect(self.db_path)
        conn.row_factory = sqlite3.Row
        rows = conn.execute(
            "SELECT id, text_plain FROM messages WHERE id IN "
            "(SELECT rowid FROM messages_fts WHERE messages_fts MATCH 'Alice')"
        ).fetchall()
        conn.close()
        self.assertGreater(len(rows), 0)
        for r in rows:
            self.assertIn("Alice", r["text_plain"])

    def test_fts_returns_no_results_for_nonsense(self):
        """A token that never occurs must produce zero matches."""
        conn = sqlite3.connect(self.db_path)
        rows = conn.execute(
            "SELECT COUNT(*) FROM messages_fts WHERE messages_fts MATCH 'xyzzyplugh'"
        ).fetchone()[0]
        conn.close()
        self.assertEqual(rows, 0)
231
+
232
+
233
+ # ---------------------------------------------------------------------------
234
+ # 3. SemanticSearch Empty Embeddings
235
+ # ---------------------------------------------------------------------------
236
+
237
try:
    import numpy as np
    HAS_NUMPY = True
except ImportError:
    HAS_NUMPY = False


@unittest.skipUnless(HAS_NUMPY, "numpy not installed")
class TestSemanticSearchEmpty(unittest.TestCase):
    """SemanticSearch must handle missing/empty embeddings gracefully."""

    def _make_empty_embeddings_db(self) -> str:
        """Create an embeddings DB with the right schema but zero rows.

        The temp dir is removed via addCleanup even if an assertion fails
        (the original fixture ran rmtree after the asserts and leaked the
        directory on failure); the creation code was also duplicated in
        three tests.
        """
        tmpdir = tempfile.mkdtemp()
        self.addCleanup(shutil.rmtree, tmpdir, ignore_errors=True)
        db_path = os.path.join(tmpdir, "empty_emb.db")

        conn = sqlite3.connect(db_path)
        conn.execute(
            "CREATE TABLE embeddings (message_id INTEGER PRIMARY KEY, "
            "from_name TEXT, text_preview TEXT, embedding BLOB)"
        )
        conn.commit()
        conn.close()
        return db_path

    def test_is_available_missing_db(self):
        """A nonexistent embeddings file means semantic search is unavailable."""
        from semantic_search import SemanticSearch
        ss = SemanticSearch(embeddings_db="/tmp/nonexistent_embeddings_12345.db")
        self.assertFalse(ss.is_available())

    def test_is_available_empty_db(self):
        """An embeddings DB with zero rows is reported as unavailable."""
        from semantic_search import SemanticSearch
        ss = SemanticSearch(embeddings_db=self._make_empty_embeddings_db())
        self.assertFalse(ss.is_available())

    def test_load_empty_embeddings_no_crash(self):
        """Loading an empty table must not raise and must leave empty state."""
        from semantic_search import SemanticSearch
        ss = SemanticSearch(embeddings_db=self._make_empty_embeddings_db())
        ss._load_embeddings()  # Should not crash
        self.assertTrue(ss.embeddings_loaded)
        self.assertEqual(len(ss.message_ids), 0)

    def test_stats_empty_db(self):
        """stats() reports available=True (file + table exist) but count=0."""
        from semantic_search import SemanticSearch
        ss = SemanticSearch(embeddings_db=self._make_empty_embeddings_db())
        s = ss.stats()
        self.assertTrue(s["available"])  # File exists and table exists
        self.assertEqual(s["count"], 0)
314
+
315
+
316
+ # ---------------------------------------------------------------------------
317
+ # 4. Dashboard Endpoint Tests
318
+ # ---------------------------------------------------------------------------
319
+
320
try:
    import flask
    HAS_FLASK = True
except ImportError:
    HAS_FLASK = False


@unittest.skipUnless(HAS_FLASK, "flask not installed")
class TestDashboardEndpoints(unittest.TestCase):
    """Test Flask dashboard API endpoints against a 50-message test DB."""

    @classmethod
    def setUpClass(cls):
        """Index a 50-message corpus once and point the app at it."""
        cls.tmpdir = tempfile.mkdtemp()
        cls.db_path = os.path.join(cls.tmpdir, "test.db")
        cls.json_path = os.path.join(cls.tmpdir, "messages.json")

        _write_json(cls.json_path, _sample_messages(50))

        from indexer import OptimizedIndexer
        indexer = OptimizedIndexer(cls.db_path, build_trigrams=False, build_graph=False)
        indexer.index_file(cls.json_path, show_progress=False)

        # Repoint the dashboard module at the test DB before creating a client.
        import dashboard
        dashboard.DB_PATH = cls.db_path
        dashboard.app.config["TESTING"] = True
        cls.client = dashboard.app.test_client()

    @classmethod
    def tearDownClass(cls):
        # shutil is imported at module level (was a function-local import).
        shutil.rmtree(cls.tmpdir, ignore_errors=True)

    def test_overview_endpoint(self):
        """Overview returns a positive total_messages."""
        resp = self.client.get("/api/overview?timeframe=all")
        self.assertEqual(resp.status_code, 200)
        data = resp.get_json()
        self.assertIn("total_messages", data)
        self.assertGreater(data["total_messages"], 0)

    def test_users_endpoint(self):
        """Users list is non-empty and rows carry the expected fields."""
        resp = self.client.get("/api/users?timeframe=all&limit=10")
        self.assertEqual(resp.status_code, 200)
        data = resp.get_json()
        self.assertIn("users", data)
        self.assertGreater(len(data["users"]), 0)
        user = data["users"][0]
        for field in ("user_id", "name", "messages", "percentage"):
            self.assertIn(field, user)

    def test_users_include_inactive(self):
        """With include_inactive=0 every returned user has messages."""
        resp = self.client.get("/api/users?timeframe=all&include_inactive=0")
        self.assertEqual(resp.status_code, 200)
        data = resp.get_json()
        for user in data["users"]:
            self.assertGreater(user["messages"], 0)

    def test_search_fts_endpoint(self):
        """FTS search endpoint responds with a results list."""
        resp = self.client.get("/api/search?q=message&mode=fts&limit=5")
        self.assertEqual(resp.status_code, 200)
        data = resp.get_json()
        self.assertIn("results", data)

    def test_chart_hourly_endpoint(self):
        """Hourly chart returns exactly 24 buckets."""
        resp = self.client.get("/api/chart/hourly?timeframe=all")
        self.assertEqual(resp.status_code, 200)
        data = resp.get_json()
        self.assertIsInstance(data, list)
        self.assertEqual(len(data), 24)

    def test_chart_daily_endpoint(self):
        """Daily chart returns a list."""
        resp = self.client.get("/api/chart/daily?timeframe=all")
        self.assertEqual(resp.status_code, 200)
        data = resp.get_json()
        self.assertIsInstance(data, list)

    def test_cache_invalidate_endpoint(self):
        """Cache invalidation reports its status."""
        resp = self.client.get("/api/cache/invalidate")
        self.assertEqual(resp.status_code, 200)
        data = resp.get_json()
        self.assertEqual(data["status"], "invalidated")

    def test_page_routes_return_200(self):
        """All page routes should return 200."""
        for route in ("/", "/users", "/search", "/chat", "/moderation", "/settings"):
            resp = self.client.get(route)
            self.assertEqual(resp.status_code, 200, f"Route {route} failed")

    def test_user_profile_endpoint(self):
        """Per-user profile carries totals and hourly activity."""
        resp = self.client.get("/api/users?timeframe=all&limit=1")
        data = resp.get_json()
        if data["users"]:
            uid = data["users"][0]["user_id"]
            resp2 = self.client.get(f"/api/user/{uid}/profile")
            self.assertEqual(resp2.status_code, 200)
            profile = resp2.get_json()
            self.assertIn("total_messages", profile)
            self.assertIn("hourly_activity", profile)

    def test_overview_has_expected_keys(self):
        """Overview payload carries all summary keys the UI expects."""
        resp = self.client.get("/api/overview?timeframe=all")
        data = resp.get_json()
        for key in ("total_messages", "total_users", "links_count", "media_count"):
            self.assertIn(key, data, f"Missing key: {key}")
425
+
426
+
427
+ # ---------------------------------------------------------------------------
428
+ # 5. AI Search Schema Test
429
+ # ---------------------------------------------------------------------------
430
+
431
class TestAISearchSchema(unittest.TestCase):
    """Test that AI search schema generation matches the actual DB."""

    def test_dynamic_schema_includes_real_columns(self):
        """The generated schema lists real columns and omits stale names."""
        tmpdir = tempfile.mkdtemp()
        # Clean up even if an assertion fails: the original ran rmtree after
        # the asserts, leaking the temp dir on any failure.
        self.addCleanup(shutil.rmtree, tmpdir, ignore_errors=True)
        db_path = os.path.join(tmpdir, "test.db")

        # Initialize DB with the real project schema.
        from indexer import init_database
        conn = init_database(db_path)
        conn.close()

        from ai_search import AISearchEngine
        # Bypass __init__ so no LLM provider connection is attempted.
        engine = AISearchEngine.__new__(AISearchEngine)
        engine.db_path = db_path

        schema = engine._get_db_schema()

        # Real column names must be present.
        self.assertIn("text_plain", schema)
        self.assertIn("date_unixtime", schema)
        self.assertIn("has_links", schema)
        self.assertIn("has_media", schema)
        self.assertIn("from_id", schema)
        self.assertIn("participants", schema)

        # Old wrong column names must NOT appear in the dynamic output.
        self.assertNotIn("char_count", schema)
        # "media_type" should not be a column name (has_media is the real one).
        self.assertNotIn("media_type (", schema.lower())


# ---------------------------------------------------------------------------
# Runner
# ---------------------------------------------------------------------------

if __name__ == "__main__":
    unittest.main(verbosity=2)
update_hf.py ADDED
@@ -0,0 +1,132 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Automated sync + deploy to Hugging Face Spaces.
4
+
5
+ Usage:
6
+ python update_hf.py # sync + upload everything
7
+ python update_hf.py --db-only # just upload DB (skip sync)
8
+ python update_hf.py --full # upload all files + DB (first time or after code changes)
9
+ """
10
+
11
+ import subprocess
12
+ import sys
13
+ import os
14
+
15
# === CONFIGURATION ===
REPO_ID = "rottg/telegram-analytics"
PROJECT_DIR = os.path.dirname(os.path.abspath(__file__))
DB_PATH = os.path.join(PROJECT_DIR, "telegram.db")

# Files to upload (code + config)
CODE_FILES = [
    "dashboard.py", "ai_search.py", "algorithms.py", "data_structures.py",
    "indexer.py", "search.py", "semantic_search.py", "schema.sql",
    "Dockerfile", "requirements.txt", "README.md",
]
FOLDERS = ["static", "templates"]

# Token resolution: the HF_TOKEN env var wins; otherwise fall back to a
# .hf_token file next to this script.
HF_TOKEN = os.environ.get("HF_TOKEN")
if not HF_TOKEN:
    token_file = os.path.join(PROJECT_DIR, ".hf_token")
    if os.path.exists(token_file):
        with open(token_file) as f:
            HF_TOKEN = f.read().strip()
# Validate the resolved value, not just the file's presence: the original
# accepted an empty/whitespace-only .hf_token file and failed later at upload.
if not HF_TOKEN:
    print("ERROR: Set HF_TOKEN env var or create .hf_token file with your token")
    sys.exit(1)
+
39
+
40
def run_sync():
    """Run daily_sync.py in a subprocess; abort the program if it fails."""
    print("\n=== Step 1: Running daily sync ===")
    script = os.path.join(PROJECT_DIR, "daily_sync.py")
    rc = subprocess.run([sys.executable, script], cwd=PROJECT_DIR).returncode
    if rc != 0:
        print("ERROR: Sync failed!")
        sys.exit(1)
    print("Sync complete.")
49
+
50
+
51
def upload_to_hf(full=False):
    """Upload files to the HF Space using the Hub API (no git needed).

    Args:
        full: If True, upload all code files, static/template folders and
            the DB. If False, only replace telegram.db on the Space.
    """
    from huggingface_hub import HfApi

    api = HfApi(token=HF_TOKEN)

    if full:
        # Upload all code files + folders + DB.
        print("\n=== Uploading all files to HF ===")

        # Collect (local_path, path_in_repo) pairs.
        upload_files = []
        for fname in CODE_FILES:
            path = os.path.join(PROJECT_DIR, fname)
            if os.path.exists(path):
                upload_files.append((path, fname))

        for folder in FOLDERS:
            folder_path = os.path.join(PROJECT_DIR, folder)
            if os.path.exists(folder_path):
                for root, _dirs, files in os.walk(folder_path):
                    for name in files:
                        full_path = os.path.join(root, name)
                        rel_path = os.path.relpath(full_path, PROJECT_DIR)
                        upload_files.append((full_path, rel_path.replace("\\", "/")))

        # Include the DB only if it exists: the original appended it
        # unconditionally and then crashed in os.path.getsize below when
        # telegram.db was missing (code files *were* existence-checked).
        if os.path.exists(DB_PATH):
            upload_files.append((DB_PATH, "telegram.db"))

        print(f"Uploading {len(upload_files)} files...")
        for local_path, repo_path in upload_files:
            size_mb = os.path.getsize(local_path) / (1024 * 1024)
            if size_mb > 1:
                print(f"  {repo_path} ({size_mb:.0f} MB)...")
            else:
                print(f"  {repo_path}")

        # One commit for everything; allow_patterns restricts the folder
        # upload to exactly the files collected above.
        api.upload_folder(
            folder_path=PROJECT_DIR,
            repo_id=REPO_ID,
            repo_type="space",
            allow_patterns=[f for _, f in upload_files],
        )
    else:
        # DB only - delete old, upload new.
        print("\n=== Removing old DB from HF ===")
        try:
            api.delete_file("telegram.db", repo_id=REPO_ID, repo_type="space")
            print("Old DB removed.")
        except Exception as e:
            # Best-effort: a missing remote file is not an error.
            print(f"No old DB to remove ({e})")

        print("\n=== Uploading new DB to HF ===")
        db_size_mb = os.path.getsize(DB_PATH) / (1024 * 1024)
        print(f"Uploading {db_size_mb:.0f} MB...")

        api.upload_file(
            path_or_fileobj=DB_PATH,
            path_in_repo="telegram.db",
            repo_id=REPO_ID,
            repo_type="space",
        )

    print("Upload complete!")
    print("\nSite will rebuild at: https://rottg-telegram-analytics.hf.space")
116
+
117
+
118
def main():
    """CLI entry point: parse flags, optionally sync, then upload.

    Flags:
        --db-only   Skip the sync step and upload only the DB.
        --full      Upload all code files and folders as well as the DB.
    """
    db_only = "--db-only" in sys.argv
    full = "--full" in sys.argv

    if db_only:
        print("Skipping sync (--db-only)")
    else:
        run_sync()

    # The original expression `full or not db_only and "--db-only" not in
    # sys.argv` reduces to `full or not db_only` (the second term repeated
    # the db_only check): any run without --db-only does a full upload,
    # matching the module docstring's "sync + upload everything" default.
    upload_to_hf(full=full or not db_only)
    print("\n=== Done! ===")


if __name__ == "__main__":
    main()
vector_search.py ADDED
@@ -0,0 +1,448 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Vector Search Module for Semantic Similarity
4
+
5
+ Optional module that adds semantic search capabilities using:
6
+ - Sentence embeddings (sentence-transformers)
7
+ - FAISS for efficient similarity search
8
+
9
+ Dependencies (optional, install with):
10
+ pip install sentence-transformers faiss-cpu numpy
11
+
12
+ If dependencies are not installed, the module gracefully degrades.
13
+ """
14
+
15
+ import sqlite3
16
+ import pickle
17
+ from pathlib import Path
18
+ from typing import Optional
19
+
20
+ # Try importing optional dependencies
21
+ VECTOR_SEARCH_AVAILABLE = False
22
+ try:
23
+ import numpy as np
24
+ NUMPY_AVAILABLE = True
25
+ except ImportError:
26
+ NUMPY_AVAILABLE = False
27
+ np = None
28
+
29
+ try:
30
+ import faiss
31
+ FAISS_AVAILABLE = True
32
+ except ImportError:
33
+ FAISS_AVAILABLE = False
34
+ faiss = None
35
+
36
+ try:
37
+ from sentence_transformers import SentenceTransformer
38
+ SENTENCE_TRANSFORMERS_AVAILABLE = True
39
+ except ImportError:
40
+ SENTENCE_TRANSFORMERS_AVAILABLE = False
41
+ SentenceTransformer = None
42
+
43
+ VECTOR_SEARCH_AVAILABLE = all([NUMPY_AVAILABLE, FAISS_AVAILABLE, SENTENCE_TRANSFORMERS_AVAILABLE])
44
+
45
+
46
class VectorSearchUnavailable:
    """Stand-in used when the optional vector-search dependencies are absent.

    Construction always succeeds with any arguments; calling any method
    raises RuntimeError with installation instructions.
    """

    def __init__(self, *args, **kwargs):
        pass

    def __getattr__(self, name):
        def _raise(*args, **kwargs):
            raise RuntimeError(
                "Vector search requires additional dependencies. Install with:\n"
                "pip install sentence-transformers faiss-cpu numpy"
            )
        return _raise
59
+
60
+
61
+ class VectorSearch:
62
+ """
63
+ Semantic search using sentence embeddings and FAISS.
64
+
65
+ Features:
66
+ - Generate embeddings for messages
67
+ - Build FAISS index for fast similarity search
68
+ - Find semantically similar messages (not just keyword match)
69
+ - Supports Hebrew and multilingual text
70
+
71
+ Example:
72
+ vs = VectorSearch(db_path='telegram.db')
73
+ vs.build_index() # One-time, can take a while
74
+
75
+ # Find similar messages
76
+ results = vs.search("מה קורה היום?", limit=10)
77
+ for msg_id, score, text in results:
78
+ print(f"{score:.3f}: {text[:50]}")
79
+ """
80
+
81
+ # Recommended models for multilingual/Hebrew support
82
+ MODELS = {
83
+ 'fast': 'paraphrase-multilingual-MiniLM-L12-v2', # Fast, good multilingual
84
+ 'accurate': 'paraphrase-multilingual-mpnet-base-v2', # More accurate
85
+ 'small': 'all-MiniLM-L6-v2', # Smallest, English-focused
86
+ }
87
+
88
+ def __init__(
89
+ self,
90
+ db_path: str = 'telegram.db',
91
+ model_name: str = 'fast',
92
+ index_path: Optional[str] = None
93
+ ):
94
+ """
95
+ Initialize vector search.
96
+
97
+ Args:
98
+ db_path: Path to SQLite database
99
+ model_name: Model preset ('fast', 'accurate', 'small') or full model name
100
+ index_path: Path to save/load FAISS index (default: db_path + '.faiss')
101
+ """
102
+ if not VECTOR_SEARCH_AVAILABLE:
103
+ raise RuntimeError(
104
+ "Vector search requires additional dependencies. Install with:\n"
105
+ "pip install sentence-transformers faiss-cpu numpy"
106
+ )
107
+
108
+ self.db_path = db_path
109
+ self.index_path = index_path or f"{db_path}.faiss"
110
+ self.id_map_path = f"{self.index_path}.ids"
111
+
112
+ # Load model
113
+ model_id = self.MODELS.get(model_name, model_name)
114
+ print(f"Loading embedding model: {model_id}")
115
+ self.model = SentenceTransformer(model_id)
116
+ self.dimension = self.model.get_sentence_embedding_dimension()
117
+
118
+ # Initialize FAISS index
119
+ self.index = None
120
+ self.id_map: list[int] = [] # Maps FAISS index position to message_id
121
+
122
+ # Try to load existing index
123
+ if Path(self.index_path).exists():
124
+ self.load_index()
125
+
126
+ def _get_connection(self) -> sqlite3.Connection:
127
+ """Get database connection."""
128
+ conn = sqlite3.connect(self.db_path)
129
+ conn.row_factory = sqlite3.Row
130
+ return conn
131
+
132
+ def encode(self, texts: list[str], batch_size: int = 32, show_progress: bool = True) -> 'np.ndarray':
133
+ """
134
+ Encode texts to embeddings.
135
+
136
+ Args:
137
+ texts: List of texts to encode
138
+ batch_size: Batch size for encoding
139
+ show_progress: Show progress bar
140
+
141
+ Returns:
142
+ numpy array of shape (n_texts, dimension)
143
+ """
144
+ return self.model.encode(
145
+ texts,
146
+ batch_size=batch_size,
147
+ show_progress_bar=show_progress,
148
+ convert_to_numpy=True,
149
+ normalize_embeddings=True # For cosine similarity
150
+ )
151
+
152
+ def build_index(
153
+ self,
154
+ batch_size: int = 1000,
155
+ min_text_length: int = 10,
156
+ use_gpu: bool = False
157
+ ) -> None:
158
+ """
159
+ Build FAISS index from all messages in database.
160
+
161
+ Args:
162
+ batch_size: Number of messages to process at once
163
+ min_text_length: Minimum text length to index
164
+ use_gpu: Use GPU acceleration if available
165
+ """
166
+ conn = self._get_connection()
167
+
168
+ # Count messages
169
+ cursor = conn.execute(
170
+ 'SELECT COUNT(*) FROM messages WHERE length(text_plain) >= ?',
171
+ (min_text_length,)
172
+ )
173
+ total = cursor.fetchone()[0]
174
+ print(f"Building index for {total} messages...")
175
+
176
+ # Create FAISS index
177
+ # Using IndexFlatIP (Inner Product) since we normalize embeddings
178
+ self.index = faiss.IndexFlatIP(self.dimension)
179
+
180
+ if use_gpu and faiss.get_num_gpus() > 0:
181
+ print("Using GPU acceleration")
182
+ self.index = faiss.index_cpu_to_gpu(
183
+ faiss.StandardGpuResources(),
184
+ 0,
185
+ self.index
186
+ )
187
+
188
+ self.id_map = []
189
+
190
+ # Process in batches
191
+ offset = 0
192
+ while offset < total:
193
+ cursor = conn.execute(
194
+ '''
195
+ SELECT id, text_plain FROM messages
196
+ WHERE length(text_plain) >= ?
197
+ ORDER BY id
198
+ LIMIT ? OFFSET ?
199
+ ''',
200
+ (min_text_length, batch_size, offset)
201
+ )
202
+
203
+ rows = cursor.fetchall()
204
+ if not rows:
205
+ break
206
+
207
+ ids = [row['id'] for row in rows]
208
+ texts = [row['text_plain'] for row in rows]
209
+
210
+ # Encode batch
211
+ embeddings = self.encode(texts, show_progress=False)
212
+
213
+ # Add to index
214
+ self.index.add(embeddings)
215
+ self.id_map.extend(ids)
216
+
217
+ offset += len(rows)
218
+ print(f"Indexed {offset}/{total} messages ({100*offset/total:.1f}%)")
219
+
220
+ conn.close()
221
+
222
+ # Save index
223
+ self.save_index()
224
+ print(f"Index built: {self.index.ntotal} vectors")
225
+
226
+ def save_index(self) -> None:
227
+ """Save FAISS index and ID map to disk."""
228
+ if self.index is None:
229
+ return
230
+
231
+ # Convert GPU index to CPU for saving
232
+ if hasattr(faiss, 'index_gpu_to_cpu'):
233
+ try:
234
+ cpu_index = faiss.index_gpu_to_cpu(self.index)
235
+ except:
236
+ cpu_index = self.index
237
+ else:
238
+ cpu_index = self.index
239
+
240
+ faiss.write_index(cpu_index, self.index_path)
241
+
242
+ with open(self.id_map_path, 'wb') as f:
243
+ pickle.dump(self.id_map, f)
244
+
245
+ print(f"Index saved to {self.index_path}")
246
+
247
+ def load_index(self) -> bool:
248
+ """Load FAISS index from disk."""
249
+ try:
250
+ self.index = faiss.read_index(self.index_path)
251
+ with open(self.id_map_path, 'rb') as f:
252
+ self.id_map = pickle.load(f)
253
+ print(f"Loaded index with {self.index.ntotal} vectors")
254
+ return True
255
+ except Exception as e:
256
+ print(f"Could not load index: {e}")
257
+ return False
258
+
259
+ def search(
260
+ self,
261
+ query: str,
262
+ limit: int = 10,
263
+ min_score: float = 0.0
264
+ ) -> list[tuple[int, float, str]]:
265
+ """
266
+ Search for semantically similar messages.
267
+
268
+ Args:
269
+ query: Search query text
270
+ limit: Maximum results to return
271
+ min_score: Minimum similarity score (0-1)
272
+
273
+ Returns:
274
+ List of (message_id, score, text) tuples
275
+ """
276
+ if self.index is None or self.index.ntotal == 0:
277
+ raise RuntimeError("Index not built. Call build_index() first.")
278
+
279
+ # Encode query
280
+ query_vector = self.encode([query], show_progress=False)
281
+
282
+ # Search FAISS
283
+ scores, indices = self.index.search(query_vector, limit)
284
+
285
+ # Get message texts from DB
286
+ conn = self._get_connection()
287
+ results = []
288
+
289
+ for score, idx in zip(scores[0], indices[0]):
290
+ if idx == -1 or score < min_score:
291
+ continue
292
+
293
+ message_id = self.id_map[idx]
294
+ cursor = conn.execute(
295
+ 'SELECT text_plain FROM messages WHERE id = ?',
296
+ (message_id,)
297
+ )
298
+ row = cursor.fetchone()
299
+ if row:
300
+ results.append((message_id, float(score), row['text_plain']))
301
+
302
+ conn.close()
303
+ return results
304
+
305
+ def find_similar(
306
+ self,
307
+ message_id: int,
308
+ limit: int = 10,
309
+ exclude_same_user: bool = False
310
+ ) -> list[tuple[int, float, str]]:
311
+ """
312
+ Find messages similar to a specific message.
313
+
314
+ Args:
315
+ message_id: ID of the reference message
316
+ limit: Maximum results to return
317
+ exclude_same_user: Exclude messages from same user
318
+
319
+ Returns:
320
+ List of (message_id, score, text) tuples
321
+ """
322
+ conn = self._get_connection()
323
+
324
+ # Get the reference message
325
+ cursor = conn.execute(
326
+ 'SELECT text_plain, from_id FROM messages WHERE id = ?',
327
+ (message_id,)
328
+ )
329
+ row = cursor.fetchone()
330
+ if not row:
331
+ conn.close()
332
+ return []
333
+
334
+ reference_text = row['text_plain']
335
+ reference_user = row['from_id']
336
+ conn.close()
337
+
338
+ # Search
339
+ results = self.search(reference_text, limit=limit * 2)
340
+
341
+ # Filter
342
+ filtered = []
343
+ for msg_id, score, text in results:
344
+ if msg_id == message_id:
345
+ continue
346
+ if exclude_same_user:
347
+ conn = self._get_connection()
348
+ cursor = conn.execute(
349
+ 'SELECT from_id FROM messages WHERE id = ?',
350
+ (msg_id,)
351
+ )
352
+ msg_row = cursor.fetchone()
353
+ conn.close()
354
+ if msg_row and msg_row['from_id'] == reference_user:
355
+ continue
356
+ filtered.append((msg_id, score, text))
357
+ if len(filtered) >= limit:
358
+ break
359
+
360
+ return filtered
361
+
362
+ def cluster_messages(
363
+ self,
364
+ n_clusters: int = 10,
365
+ sample_size: Optional[int] = None
366
+ ) -> dict[int, list[int]]:
367
+ """
368
+ Cluster messages by semantic similarity using K-means.
369
+
370
+ Args:
371
+ n_clusters: Number of clusters
372
+ sample_size: Number of messages to sample (None = all)
373
+
374
+ Returns:
375
+ Dict mapping cluster_id to list of message_ids
376
+ """
377
+ if self.index is None or self.index.ntotal == 0:
378
+ raise RuntimeError("Index not built. Call build_index() first.")
379
+
380
+ # Get vectors
381
+ n_vectors = self.index.ntotal
382
+ if sample_size and sample_size < n_vectors:
383
+ indices = np.random.choice(n_vectors, sample_size, replace=False)
384
+ vectors = np.array([self.index.reconstruct(int(i)) for i in indices])
385
+ ids = [self.id_map[i] for i in indices]
386
+ else:
387
+ vectors = np.array([self.index.reconstruct(i) for i in range(n_vectors)])
388
+ ids = self.id_map
389
+
390
+ # K-means clustering
391
+ kmeans = faiss.Kmeans(self.dimension, n_clusters, niter=20, verbose=True)
392
+ kmeans.train(vectors)
393
+
394
+ # Assign clusters
395
+ _, assignments = kmeans.index.search(vectors, 1)
396
+
397
+ # Group by cluster
398
+ clusters: dict[int, list[int]] = {i: [] for i in range(n_clusters)}
399
+ for msg_id, cluster_id in zip(ids, assignments.flatten()):
400
+ clusters[int(cluster_id)].append(msg_id)
401
+
402
+ return clusters
403
+
404
    @property
    def stats(self) -> dict:
        """Get index statistics.

        Returns a dict with:
            available: module-level flag for optional dependencies
            model: see NOTE below
            dimension: embedding dimensionality
            index_size: number of vectors currently in the FAISS index
            index_path: file path where the index is persisted
        """
        return {
            'available': VECTOR_SEARCH_AVAILABLE,
            # NOTE(review): this reports get_sentence_embedding_dimension(),
            # i.e. the embedding size — duplicating 'dimension' below.
            # Presumably the intent was the model *name*; confirm before
            # changing, as callers may rely on the current value.
            'model': self.model.get_sentence_embedding_dimension() if self.model else None,
            'dimension': self.dimension,
            'index_size': self.index.ntotal if self.index else 0,
            'index_path': self.index_path
        }
414
+
415
+
416
# Public alias: resolve to the real implementation when the optional
# dependencies are installed, otherwise to the unavailable stub.
SemanticSearch = VectorSearch if VECTOR_SEARCH_AVAILABLE else VectorSearchUnavailable
421
+
422
+
423
def check_dependencies() -> dict:
    """Report which optional dependencies are available.

    Returns:
        Dict mapping each dependency name, plus the combined
        'vector_search_available' flag, to a boolean.
    """
    flags = {
        'numpy': NUMPY_AVAILABLE,
        'faiss': FAISS_AVAILABLE,
        'sentence_transformers': SENTENCE_TRANSFORMERS_AVAILABLE,
    }
    flags['vector_search_available'] = VECTOR_SEARCH_AVAILABLE
    return flags
431
+
432
+
433
if __name__ == '__main__':
    # Quick self-report: show which optional dependencies are installed.
    print("=== Vector Search Dependencies ===")
    for name, available in check_dependencies().items():
        marker = "✓" if available else "✗"
        print(f" {marker} {name}")

    if not VECTOR_SEARCH_AVAILABLE:
        print("\nTo enable vector search, install dependencies:")
        print(" pip install sentence-transformers faiss-cpu numpy")
    else:
        print("\nVector search is available!")
        print("Usage:")
        print(" vs = VectorSearch('telegram.db')")
        print(" vs.build_index() # One-time indexing")
        print(" results = vs.search('מה קורה?')")