Spaces:
Sleeping
Sleeping
Upload folder using huggingface_hub
Browse files- analyzer.py +881 -0
- check_db.py +49 -0
- daily_sync.py +664 -0
- static/css/style.css +137 -2
- static/js/dashboard.js +25 -0
- templates/index.html +2 -0
- templates/moderation.html +2 -0
- templates/search.html +2 -0
- templates/settings.html +2 -0
- templates/user_profile.html +2 -0
- templates/users.html +2 -0
- tests.py +474 -0
- update_hf.py +132 -0
- vector_search.py +448 -0
analyzer.py
ADDED
|
@@ -0,0 +1,881 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
Telegram Chat Analytics (Enhanced with Course Algorithms)
|
| 4 |
+
|
| 5 |
+
Features:
|
| 6 |
+
- LCS-based similar message detection
|
| 7 |
+
- Heap-based Top-K (O(n log k) instead of O(n log n))
|
| 8 |
+
- Selection algorithm for O(n) median/percentiles
|
| 9 |
+
- Rank Tree for order statistics queries
|
| 10 |
+
- Bucket Sort for time-based histograms
|
| 11 |
+
|
| 12 |
+
Usage:
|
| 13 |
+
python analyzer.py --db telegram.db [options]
|
| 14 |
+
python analyzer.py --stats
|
| 15 |
+
python analyzer.py --top-users
|
| 16 |
+
python analyzer.py --similar # NEW: Find similar messages
|
| 17 |
+
python analyzer.py --percentiles # NEW: Message length percentiles
|
| 18 |
+
python analyzer.py --user-rank USER # NEW: Get user's rank
|
| 19 |
+
"""
|
| 20 |
+
|
| 21 |
+
import sqlite3
|
| 22 |
+
import argparse
|
| 23 |
+
import json
|
| 24 |
+
from collections import Counter
|
| 25 |
+
from datetime import datetime
|
| 26 |
+
from typing import Optional
|
| 27 |
+
import re
|
| 28 |
+
|
| 29 |
+
# Import course algorithms
|
| 30 |
+
from algorithms import (
|
| 31 |
+
# LCS
|
| 32 |
+
lcs_similarity, find_similar_messages,
|
| 33 |
+
# Top-K
|
| 34 |
+
TopK, top_k_frequent, top_k_by_field,
|
| 35 |
+
# Selection
|
| 36 |
+
find_median, find_percentile,
|
| 37 |
+
# Rank Tree
|
| 38 |
+
RankTree,
|
| 39 |
+
# Bucket Sort
|
| 40 |
+
bucket_sort_by_time, time_histogram, hourly_distribution,
|
| 41 |
+
# Combined
|
| 42 |
+
RankedTimeIndex
|
| 43 |
+
)
|
| 44 |
+
|
| 45 |
+
|
| 46 |
+
class TelegramAnalyzer:
    """
    Analytics interface for indexed Telegram messages.

    Enhanced with efficient algorithms:
    - Top-K queries: O(n log k) using a heap
    - Percentiles: O(n) using a selection algorithm
    - Rank queries: O(log n) using a rank tree
    - Similar messages: LCS-based detection
    """

    def __init__(self, db_path: str = 'telegram.db'):
        self.db_path = db_path
        self.conn = sqlite3.connect(db_path)
        self.conn.row_factory = sqlite3.Row

        # Lazily built index structures (see _build_user_rank_tree).
        self._rank_tree: Optional[RankTree] = None
        self._time_index: Optional[RankedTimeIndex] = None

    def close(self):
        """Close the underlying SQLite connection."""
        self.conn.close()

    def __enter__(self):
        return self

    def __exit__(self, *args):
        self.close()

    # ==========================================
    # ORIGINAL METHODS (kept for compatibility)
    # ==========================================

    def get_stats(self) -> dict:
        """Get general statistics about the indexed data.

        Returns:
            Dict with message/user totals, date span, flag-based counters,
            entity counts, and length percentiles (median / p90).
        """
        stats = {}

        cursor = self.conn.execute('SELECT COUNT(*) FROM messages')
        stats['total_messages'] = cursor.fetchone()[0]

        cursor = self.conn.execute('SELECT COUNT(DISTINCT from_id) FROM messages')
        stats['total_users'] = cursor.fetchone()[0]

        cursor = self.conn.execute('''
            SELECT MIN(date_unixtime), MAX(date_unixtime) FROM messages
            WHERE date_unixtime IS NOT NULL
        ''')
        row = cursor.fetchone()
        if row[0] and row[1]:
            # NOTE(review): fromtimestamp() yields *local* time — confirm intended.
            stats['first_message'] = datetime.fromtimestamp(row[0]).isoformat()
            stats['last_message'] = datetime.fromtimestamp(row[1]).isoformat()
            stats['days_span'] = (row[1] - row[0]) // 86400

        # Flag-based counters share the same query shape; the WHERE fragments
        # are constants, never user input, so the f-string here is safe.
        for key, condition in (
            ('messages_with_media', 'has_media = 1'),
            ('messages_with_links', 'has_links = 1'),
            ('messages_with_mentions', 'has_mentions = 1'),
            ('forwarded_messages', 'forwarded_from IS NOT NULL'),
            ('reply_messages', 'reply_to_message_id IS NOT NULL'),
            ('edited_messages', 'is_edited = 1'),
        ):
            cursor = self.conn.execute(f'SELECT COUNT(*) FROM messages WHERE {condition}')
            stats[key] = cursor.fetchone()[0]

        cursor = self.conn.execute('SELECT type, COUNT(*) FROM entities GROUP BY type')
        stats['entities'] = {row[0]: row[1] for row in cursor.fetchall()}

        # Percentile stats using the O(n) Selection algorithm.
        lengths = self._get_message_lengths()
        if lengths:
            stats['median_message_length'] = find_median(lengths)
            stats['p90_message_length'] = find_percentile(lengths, 90)

        return stats

    def _get_message_lengths(self) -> list[int]:
        """Get all non-empty message lengths for statistical analysis."""
        cursor = self.conn.execute(
            'SELECT length(text_plain) FROM messages WHERE text_plain IS NOT NULL'
        )
        # Filter zero/NULL lengths so percentile math only sees real text.
        return [row[0] for row in cursor.fetchall() if row[0]]

    # ==========================================
    # ENHANCED TOP-K METHODS (using Heap)
    # ==========================================

    def get_top_users(self, limit: int = 20) -> list[dict]:
        """
        Get most active users by message count.

        Uses Heap-based Top-K: O(n log k) instead of O(n log n).
        """
        cursor = self.conn.execute('''
            SELECT
                from_id,
                from_name,
                COUNT(*) as message_count,
                SUM(has_links) as links_shared,
                SUM(has_media) as media_shared,
                MIN(date_unixtime) as first_message,
                MAX(date_unixtime) as last_message
            FROM messages
            WHERE from_id IS NOT NULL AND from_id != ''
            GROUP BY from_id
        ''')

        # Keep only the k largest by message_count while streaming rows.
        top = TopK(limit, key=lambda x: x['message_count'])
        for row in cursor.fetchall():
            top.push(dict(row))

        return top.get_top()

    def get_top_words_heap(self, limit: int = 50, min_length: int = 3) -> list[tuple[str, int]]:
        """
        Get most frequent words using Heap-based Top-K.

        O(n + m log k) where n=total words, m=unique words, k=limit.

        Args:
            limit: Number of words to return.
            min_length: Minimum word length to count.
        """
        cursor = self.conn.execute('SELECT text_plain FROM messages WHERE text_plain IS NOT NULL')

        # Hebrew block + Latin letters; everything else is a separator.
        word_pattern = re.compile(r'[\u0590-\u05FFa-zA-Z]+')
        words = []

        for row in cursor.fetchall():
            text = row[0]
            for word in word_pattern.findall(text.lower()):
                if len(word) >= min_length:
                    words.append(word)

        return top_k_frequent(words, limit)

    def get_top_domains_heap(self, limit: int = 20) -> list[tuple[str, int]]:
        """Get most shared domains using Heap-based Top-K."""
        cursor = self.conn.execute("SELECT value FROM entities WHERE type = 'link'")

        # Strip scheme and optional "www." down to the bare host.
        domain_pattern = re.compile(r'https?://(?:www\.)?([^/]+)')
        domains = []

        for row in cursor.fetchall():
            match = domain_pattern.match(row[0])
            if match:
                domains.append(match.group(1))

        return top_k_frequent(domains, limit)

    # ==========================================
    # LCS-BASED SIMILAR MESSAGE DETECTION
    # ==========================================

    def find_similar_messages(
        self,
        threshold: float = 0.7,
        min_length: int = 30,
        limit: int = 100,
        sample_size: int = 1000
    ) -> list[tuple[int, int, float, str, str]]:
        """
        Find similar/duplicate messages using the LCS algorithm.

        Args:
            threshold: Minimum similarity (0-1)
            min_length: Minimum message length to consider
            limit: Maximum pairs to return
            sample_size: Sample size for large datasets

        Returns:
            List of (id1, id2, similarity, text1, text2) tuples; the texts
            are truncated to 100 characters.
        """
        cursor = self.conn.execute('''
            SELECT id, text_plain FROM messages
            WHERE text_plain IS NOT NULL AND length(text_plain) >= ?
            ORDER BY RANDOM()
            LIMIT ?
        ''', (min_length, sample_size))

        messages = [(row[0], row[1]) for row in cursor.fetchall()]

        # FIX: the previous version re-queried the database once per similar
        # pair (N+1 queries) even though every sampled text is already in
        # `messages`. Reuse the fetched texts via a lookup table instead.
        text_by_id = dict(messages)

        # Find similar pairs using LCS (module-level helper from `algorithms`).
        similar_pairs = find_similar_messages(messages, threshold, min_length)

        return [
            (id1, id2, sim, text_by_id[id1][:100], text_by_id[id2][:100])
            for id1, id2, sim in similar_pairs[:limit]
        ]

    def find_reposts(self, threshold: float = 0.9) -> list[dict]:
        """
        Find potential reposts (very similar messages from different users).

        O(n^2) LCS comparisons over the 500 most recent long messages, so the
        sample is deliberately capped.
        """
        cursor = self.conn.execute('''
            SELECT id, from_id, text_plain FROM messages
            WHERE text_plain IS NOT NULL AND length(text_plain) >= 50
            ORDER BY date_unixtime DESC
            LIMIT 500
        ''')

        messages = [(row[0], row[1], row[2]) for row in cursor.fetchall()]
        reposts = []

        for i in range(len(messages)):
            for j in range(i + 1, len(messages)):
                id1, user1, text1 = messages[i]
                id2, user2, text2 = messages[j]

                # Only consider different users — same-user repeats are not reposts.
                if user1 == user2:
                    continue

                sim = lcs_similarity(text1, text2)
                if sim >= threshold:
                    reposts.append({
                        'message_id_1': id1,
                        'message_id_2': id2,
                        'user_1': user1,
                        'user_2': user2,
                        'similarity': sim,
                        'text_preview': text1[:80]
                    })

        return sorted(reposts, key=lambda x: x['similarity'], reverse=True)

    # ==========================================
    # SELECTION ALGORITHM (PERCENTILES)
    # ==========================================

    def get_message_length_stats(self) -> dict:
        """
        Get message length statistics using the O(n) Selection algorithm.

        Much faster than sorting for percentile calculations.

        Returns:
            Empty dict when there are no messages with text.
        """
        lengths = self._get_message_lengths()

        if not lengths:
            return {}

        return {
            'count': len(lengths),
            'min': min(lengths),
            'max': max(lengths),
            'median': find_median(lengths),
            'p25': find_percentile(lengths, 25),
            'p75': find_percentile(lengths, 75),
            'p90': find_percentile(lengths, 90),
            'p95': find_percentile(lengths, 95),
            'p99': find_percentile(lengths, 99),
        }

    def get_response_time_percentiles(self) -> dict:
        """
        Calculate response time percentiles for replies.

        Uses the Selection algorithm for O(n) percentile calculation.

        Returns:
            Empty dict when no positive response times exist.
        """
        cursor = self.conn.execute('''
            SELECT
                m1.date_unixtime - m2.date_unixtime as response_time
            FROM messages m1
            JOIN messages m2 ON m1.reply_to_message_id = m2.id
            WHERE m1.date_unixtime > m2.date_unixtime
        ''')

        # Guard against NULLs and non-positive deltas.
        times = [row[0] for row in cursor.fetchall() if row[0] and row[0] > 0]

        if not times:
            return {}

        return {
            'count': len(times),
            'median_seconds': find_median(times),
            'p75_seconds': find_percentile(times, 75),
            'p90_seconds': find_percentile(times, 90),
            'p95_seconds': find_percentile(times, 95),
        }

    # ==========================================
    # RANK TREE (ORDER STATISTICS)
    # ==========================================

    def _build_user_rank_tree(self) -> RankTree:
        """Build (once) and cache the rank tree keyed by per-user message count."""
        if self._rank_tree is not None:
            return self._rank_tree

        self._rank_tree = RankTree()

        cursor = self.conn.execute('''
            SELECT from_id, from_name, COUNT(*) as msg_count
            FROM messages
            WHERE from_id IS NOT NULL AND from_id != ''
            GROUP BY from_id
        ''')

        for row in cursor.fetchall():
            self._rank_tree.insert(
                row['msg_count'],
                {'user_id': row['from_id'], 'name': row['from_name'], 'count': row['msg_count']}
            )

        return self._rank_tree

    def get_user_rank(self, user_id: str) -> dict:
        """
        Get a user's rank among all users.

        Uses Rank Tree: O(log n) instead of O(n log n).

        Returns:
            Dict with rank (1 = most active), total_users and percentile,
            or {'error': ...} when the user has no messages.
        """
        tree = self._build_user_rank_tree()

        # Get the user's message count directly from the DB.
        cursor = self.conn.execute(
            'SELECT COUNT(*) FROM messages WHERE from_id = ?',
            (user_id,)
        )
        count = cursor.fetchone()[0]

        if count == 0:
            return {'error': 'User not found'}

        # Presumably tree.rank(count) is the 1-based ascending rank of this
        # count in the tree — verify against algorithms.RankTree.
        rank = tree.rank(count)
        total_users = len(tree)

        return {
            'user_id': user_id,
            'message_count': count,
            'rank': total_users - rank + 1,  # Reverse for "top" ranking
            'total_users': total_users,
            'percentile': ((total_users - rank) / total_users) * 100
        }

    def get_user_by_rank(self, rank: int) -> Optional[dict]:
        """
        Get the user at a specific rank (1 = most active).

        Uses Rank Tree select(): O(log n). Returns None for out-of-range ranks.
        """
        tree = self._build_user_rank_tree()
        total = len(tree)

        if rank < 1 or rank > total:
            return None

        # Convert "top" rank to the tree's ascending rank.
        tree_rank = total - rank + 1
        return tree.select(tree_rank)

    # ==========================================
    # BUCKET SORT (TIME-BASED HISTOGRAMS)
    # ==========================================

    def get_activity_histogram(
        self,
        bucket_size: int = 86400,  # 1 day default
        start_time: Optional[int] = None,
        end_time: Optional[int] = None
    ) -> list[tuple[str, int]]:
        """
        Get activity histogram using Bucket Sort.

        O(n + k) where k = number of buckets.

        Args:
            bucket_size: Bucket size in seconds (default: 1 day)
            start_time: Inclusive start timestamp (default: earliest message)
            end_time: Inclusive end timestamp (default: latest message)

        Returns:
            List of (date_string, count) tuples.
        """
        # FIX: start_time/end_time were previously accepted but never used;
        # apply them as SQL filters so callers actually get a windowed view.
        sql = 'SELECT id, date_unixtime FROM messages WHERE date_unixtime IS NOT NULL'
        params: list[int] = []
        if start_time is not None:
            sql += ' AND date_unixtime >= ?'
            params.append(start_time)
        if end_time is not None:
            sql += ' AND date_unixtime <= ?'
            params.append(end_time)

        cursor = self.conn.execute(sql, params)
        records = [{'id': row[0], 'date_unixtime': row[1]} for row in cursor.fetchall()]

        if not records:
            return []

        hist = time_histogram(records, 'date_unixtime', bucket_size)

        # Format bucket timestamps as human-readable dates.
        return [
            (datetime.fromtimestamp(ts).strftime('%Y-%m-%d %H:%M'), count)
            for ts, count in hist
        ]

    def get_hourly_distribution(self) -> dict[int, int]:
        """
        Get message distribution by hour of day.

        Uses Bucket Sort: O(n).
        """
        cursor = self.conn.execute(
            'SELECT id, date_unixtime FROM messages WHERE date_unixtime IS NOT NULL'
        )
        records = [{'id': row[0], 'date_unixtime': row[1]} for row in cursor.fetchall()]

        return hourly_distribution(records, 'date_unixtime')

    # ==========================================
    # ORIGINAL METHODS (kept for compatibility)
    # ==========================================

    def get_hourly_activity(self) -> dict[int, int]:
        """Get message count by hour of day (SQL-side aggregation)."""
        sql = '''
            SELECT
                CAST(strftime('%H', datetime(date_unixtime, 'unixepoch')) AS INTEGER) as hour,
                COUNT(*) as count
            FROM messages
            WHERE date_unixtime IS NOT NULL
            GROUP BY hour
            ORDER BY hour
        '''
        cursor = self.conn.execute(sql)
        return {row[0]: row[1] for row in cursor.fetchall()}

    def get_daily_activity(self) -> dict[str, int]:
        """Get message count by day of week (keys are English day names)."""
        # strftime('%w') yields 0=Sunday .. 6=Saturday.
        days = ['Sunday', 'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday']
        sql = '''
            SELECT
                CAST(strftime('%w', datetime(date_unixtime, 'unixepoch')) AS INTEGER) as day,
                COUNT(*) as count
            FROM messages
            WHERE date_unixtime IS NOT NULL
            GROUP BY day
            ORDER BY day
        '''
        cursor = self.conn.execute(sql)
        return {days[row[0]]: row[1] for row in cursor.fetchall()}

    def get_monthly_activity(self) -> dict[str, int]:
        """Get message count by month ('YYYY-MM' keys)."""
        sql = '''
            SELECT
                strftime('%Y-%m', datetime(date_unixtime, 'unixepoch')) as month,
                COUNT(*) as count
            FROM messages
            WHERE date_unixtime IS NOT NULL
            GROUP BY month
            ORDER BY month
        '''
        cursor = self.conn.execute(sql)
        return {row[0]: row[1] for row in cursor.fetchall()}

    def get_top_domains(self, limit: int = 20) -> list[tuple[str, int]]:
        """Get most shared domains from links (delegates to heap version)."""
        return self.get_top_domains_heap(limit)

    def get_top_mentioned(self, limit: int = 20) -> list[tuple[str, int]]:
        """Get most mentioned users/channels."""
        sql = '''
            SELECT value, COUNT(*) as count
            FROM entities
            WHERE type = 'mention'
            GROUP BY value
            ORDER BY count DESC
            LIMIT ?
        '''
        cursor = self.conn.execute(sql, (limit,))
        return [(row[0], row[1]) for row in cursor.fetchall()]

    def get_forwarded_sources(self, limit: int = 20) -> list[dict]:
        """Get top sources of forwarded messages."""
        sql = '''
            SELECT
                forwarded_from,
                forwarded_from_id,
                COUNT(*) as count
            FROM messages
            WHERE forwarded_from IS NOT NULL
            GROUP BY forwarded_from_id
            ORDER BY count DESC
            LIMIT ?
        '''
        cursor = self.conn.execute(sql, (limit,))
        return [dict(row) for row in cursor.fetchall()]

    def get_word_frequency(self, limit: int = 50, min_length: int = 3) -> list[tuple[str, int]]:
        """Get most frequent words (delegates to heap-based Top-K)."""
        return self.get_top_words_heap(limit, min_length)

    def get_reply_network(self, limit: int = 100) -> list[dict]:
        """Get reply relationships between users (who replies to whom)."""
        sql = '''
            SELECT
                m1.from_id as replier_id,
                m1.from_name as replier_name,
                m2.from_id as replied_to_id,
                m2.from_name as replied_to_name,
                COUNT(*) as reply_count
            FROM messages m1
            JOIN messages m2 ON m1.reply_to_message_id = m2.id
            WHERE m1.reply_to_message_id IS NOT NULL
            GROUP BY m1.from_id, m2.from_id
            ORDER BY reply_count DESC
            LIMIT ?
        '''
        cursor = self.conn.execute(sql, (limit,))
        return [dict(row) for row in cursor.fetchall()]

    def get_user_stats(self, user_id: str) -> dict:
        """Get detailed statistics for a specific user, including rank info."""
        stats = {}

        cursor = self.conn.execute('''
            SELECT
                COUNT(*) as total,
                SUM(has_links) as links,
                SUM(has_media) as media,
                SUM(has_mentions) as mentions,
                SUM(is_edited) as edited,
                MIN(date_unixtime) as first_msg,
                MAX(date_unixtime) as last_msg
            FROM messages WHERE from_id = ?
        ''', (user_id,))
        row = cursor.fetchone()
        stats.update(dict(row))

        cursor = self.conn.execute('''
            SELECT COUNT(*) FROM messages m1
            JOIN messages m2 ON m1.reply_to_message_id = m2.id
            WHERE m2.from_id = ?
        ''', (user_id,))
        stats['replies_received'] = cursor.fetchone()[0]

        cursor = self.conn.execute('''
            SELECT COUNT(*) FROM messages
            WHERE from_id = ? AND reply_to_message_id IS NOT NULL
        ''', (user_id,))
        stats['replies_sent'] = cursor.fetchone()[0]

        # Add rank info using the Rank Tree; .get() tolerates the
        # {'error': ...} shape returned for unknown users.
        rank_info = self.get_user_rank(user_id)
        stats['rank'] = rank_info.get('rank')
        stats['percentile'] = rank_info.get('percentile')

        return stats
|
| 600 |
+
|
| 601 |
+
|
| 602 |
+
def print_bar(value: int, max_value: int, width: int = 40) -> str:
    """Render a fixed-width ASCII bar for terminal output.

    Args:
        value: Current value (expected in 0..max_value).
        max_value: Scale maximum; if 0, an empty string is returned.
        width: Total bar width in characters.

    Returns:
        A string of filled (█) and empty (░) cells, always exactly
        `width` characters long when max_value > 0.
    """
    if max_value == 0:
        return ''
    # FIX: clamp so out-of-range values (value < 0 or value > max_value)
    # can no longer overflow the bar or make the padding count negative.
    filled = max(0, min(width, int((value / max_value) * width)))
    return '█' * filled + '░' * (width - filled)
|
| 608 |
+
|
| 609 |
+
|
| 610 |
+
def main():
    """CLI entry point: parse flags and dispatch to exactly one analyzer command.

    Each option is handled by an early-return branch, so only the first
    matching flag (in the order checked below) runs per invocation. Most
    branches support a ``--json`` machine-readable mode alongside the
    default bar-chart/text rendering.
    """
    parser = argparse.ArgumentParser(description='Analyze indexed Telegram messages (Enhanced)')
    parser.add_argument('--db', default='telegram.db', help='Database path')

    # Original options
    parser.add_argument('--stats', action='store_true', help='Show general statistics')
    parser.add_argument('--top-users', action='store_true', help='Show top users')
    parser.add_argument('--hourly', action='store_true', help='Show hourly activity')
    parser.add_argument('--daily', action='store_true', help='Show daily activity')
    parser.add_argument('--monthly', action='store_true', help='Show monthly activity')
    parser.add_argument('--domains', action='store_true', help='Show top shared domains')
    parser.add_argument('--mentions', action='store_true', help='Show top mentions')
    parser.add_argument('--words', action='store_true', help='Show word frequency')
    parser.add_argument('--sources', action='store_true', help='Show forwarded message sources')
    parser.add_argument('--replies', action='store_true', help='Show reply network')
    parser.add_argument('--user', help='Show stats for specific user ID')

    # NEW: Enhanced options
    parser.add_argument('--similar', action='store_true', help='Find similar messages (LCS)')
    parser.add_argument('--reposts', action='store_true', help='Find potential reposts')
    parser.add_argument('--percentiles', action='store_true', help='Show message length percentiles')
    parser.add_argument('--response-times', action='store_true', help='Show response time percentiles')
    parser.add_argument('--user-rank', help='Get rank of specific user')
    parser.add_argument('--rank', type=int, help='Get user at specific rank')
    parser.add_argument('--histogram', action='store_true', help='Show activity histogram')
    parser.add_argument('--bucket-size', type=int, default=86400, help='Histogram bucket size in seconds')

    parser.add_argument('--limit', type=int, default=20, help='Limit results')
    parser.add_argument('--json', action='store_true', help='Output as JSON')
    parser.add_argument('--threshold', type=float, default=0.7, help='Similarity threshold')

    args = parser.parse_args()

    # Analyzer is a context manager: the DB connection is closed on exit
    # for every branch because each `return` leaves the `with` block.
    with TelegramAnalyzer(args.db) as analyzer:
        # === ORIGINAL OPTIONS ===
        if args.stats:
            stats = analyzer.get_stats()
            if args.json:
                print(json.dumps(stats, indent=2, ensure_ascii=False))
            else:
                print("=== General Statistics ===\n")
                print(f"Total messages: {stats['total_messages']:,}")
                print(f"Total users: {stats['total_users']:,}")
                print(f"First message: {stats.get('first_message', 'N/A')}")
                print(f"Last message: {stats.get('last_message', 'N/A')}")
                print(f"Days span: {stats.get('days_span', 'N/A')}")
                print(f"Messages with media: {stats['messages_with_media']:,}")
                print(f"Messages with links: {stats['messages_with_links']:,}")
                print(f"Forwarded messages: {stats['forwarded_messages']:,}")
                print(f"Reply messages: {stats['reply_messages']:,}")
                # Length percentiles are optional in the stats dict.
                if 'median_message_length' in stats:
                    print(f"\nMedian msg length: {stats['median_message_length']:.0f} chars")
                    print(f"90th percentile: {stats['p90_message_length']:.0f} chars")
                print(f"\nEntities: {stats.get('entities', {})}")
            return

        if args.top_users:
            users = analyzer.get_top_users(args.limit)
            if args.json:
                print(json.dumps(users, indent=2, ensure_ascii=False))
            else:
                print("=== Top Users by Message Count (Heap-based Top-K) ===\n")
                # Scale all bars against the busiest user.
                max_count = users[0]['message_count'] if users else 0
                for i, user in enumerate(users, 1):
                    bar = print_bar(user['message_count'], max_count, 30)
                    print(f"{i:2}. {user['from_name'][:20]:20} {bar} {user['message_count']:,}")
            return

        if args.hourly:
            hourly = analyzer.get_hourly_activity()
            if args.json:
                print(json.dumps(hourly, indent=2))
            else:
                print("=== Hourly Activity ===\n")
                max_count = max(hourly.values()) if hourly else 0
                # Iterate all 24 hours so quiet hours still show a 0 row.
                for hour in range(24):
                    count = hourly.get(hour, 0)
                    bar = print_bar(count, max_count, 40)
                    print(f"{hour:02}:00 {bar} {count:,}")
            return

        if args.daily:
            daily = analyzer.get_daily_activity()
            if args.json:
                print(json.dumps(daily, indent=2))
            else:
                print("=== Daily Activity ===\n")
                max_count = max(daily.values()) if daily else 0
                for day, count in daily.items():
                    bar = print_bar(count, max_count, 40)
                    print(f"{day:10} {bar} {count:,}")
            return

        if args.monthly:
            monthly = analyzer.get_monthly_activity()
            if args.json:
                print(json.dumps(monthly, indent=2))
            else:
                print("=== Monthly Activity ===\n")
                max_count = max(monthly.values()) if monthly else 0
                for month, count in monthly.items():
                    bar = print_bar(count, max_count, 40)
                    print(f"{month} {bar} {count:,}")
            return

        if args.domains:
            # domains is a list of (domain, count) pairs, sorted descending.
            domains = analyzer.get_top_domains(args.limit)
            if args.json:
                print(json.dumps(dict(domains), indent=2))
            else:
                print("=== Top Shared Domains (Heap-based Top-K) ===\n")
                max_count = domains[0][1] if domains else 0
                for domain, count in domains:
                    bar = print_bar(count, max_count, 30)
                    print(f"{domain[:30]:30} {bar} {count:,}")
            return

        if args.mentions:
            mentions = analyzer.get_top_mentioned(args.limit)
            if args.json:
                print(json.dumps(dict(mentions), indent=2))
            else:
                print("=== Top Mentioned Users ===\n")
                max_count = mentions[0][1] if mentions else 0
                for mention, count in mentions:
                    bar = print_bar(count, max_count, 30)
                    print(f"{mention:20} {bar} {count:,}")
            return

        if args.words:
            words = analyzer.get_word_frequency(args.limit)
            if args.json:
                print(json.dumps(dict(words), indent=2, ensure_ascii=False))
            else:
                print("=== Top Words (Heap-based Top-K) ===\n")
                max_count = words[0][1] if words else 0
                for word, count in words:
                    bar = print_bar(count, max_count, 30)
                    print(f"{word:20} {bar} {count:,}")
            return

        if args.sources:
            sources = analyzer.get_forwarded_sources(args.limit)
            if args.json:
                print(json.dumps(sources, indent=2, ensure_ascii=False))
            else:
                print("=== Top Forwarded Sources ===\n")
                max_count = sources[0]['count'] if sources else 0
                for src in sources:
                    bar = print_bar(src['count'], max_count, 30)
                    name = src['forwarded_from'] or 'Unknown'
                    print(f"{name[:30]:30} {bar} {src['count']:,}")
            return

        if args.replies:
            replies = analyzer.get_reply_network(args.limit)
            if args.json:
                print(json.dumps(replies, indent=2, ensure_ascii=False))
            else:
                print("=== Reply Network ===\n")
                for r in replies:
                    print(f"{r['replier_name']} → {r['replied_to_name']}: {r['reply_count']} replies")
            return

        if args.user:
            user_stats = analyzer.get_user_stats(args.user)
            if args.json:
                print(json.dumps(user_stats, indent=2))
            else:
                print(f"=== Stats for {args.user} ===\n")
                for key, value in user_stats.items():
                    print(f"{key}: {value}")
            return

        # === NEW ENHANCED OPTIONS ===

        if args.similar:
            # NOTE: the header is printed before JSON output too, which
            # makes --similar --json output non-parseable as pure JSON.
            print(f"=== Similar Messages (LCS, threshold={args.threshold}) ===\n")
            similar = analyzer.find_similar_messages(
                threshold=args.threshold,
                limit=args.limit
            )
            if args.json:
                print(json.dumps(similar, indent=2, ensure_ascii=False))
            else:
                for id1, id2, sim, text1, text2 in similar:
                    print(f"Similarity: {sim:.1%}")
                    print(f"  [{id1}] {text1}...")
                    print(f"  [{id2}] {text2}...")
                    print()
            return

        if args.reposts:
            print("=== Potential Reposts (LCS-based) ===\n")
            reposts = analyzer.find_reposts(threshold=args.threshold)
            if args.json:
                print(json.dumps(reposts, indent=2, ensure_ascii=False))
            else:
                for r in reposts[:args.limit]:
                    print(f"Similarity: {r['similarity']:.1%}")
                    print(f"  User 1: {r['user_1']}")
                    print(f"  User 2: {r['user_2']}")
                    print(f"  Text: {r['text_preview']}...")
                    print()
            return

        if args.percentiles:
            print("=== Message Length Percentiles (Selection Algorithm) ===\n")
            stats = analyzer.get_message_length_stats()
            if args.json:
                print(json.dumps(stats, indent=2))
            else:
                for key, value in stats.items():
                    print(f"{key:15}: {value:,.0f}")
            return

        if args.response_times:
            print("=== Response Time Percentiles (Selection Algorithm) ===\n")
            stats = analyzer.get_response_time_percentiles()
            if args.json:
                print(json.dumps(stats, indent=2))
            else:
                for key, value in stats.items():
                    # Duration keys get a seconds + minutes rendering.
                    if 'seconds' in key:
                        print(f"{key:15}: {value:,.0f}s ({value/60:.1f}m)")
                    else:
                        print(f"{key:15}: {value:,}")
            return

        if args.user_rank:
            print(f"=== User Rank (Rank Tree O(log n)) ===\n")
            rank_info = analyzer.get_user_rank(args.user_rank)
            if args.json:
                print(json.dumps(rank_info, indent=2))
            else:
                print(f"User ID: {rank_info.get('user_id')}")
                print(f"Message count: {rank_info.get('message_count'):,}")
                print(f"Rank: #{rank_info.get('rank')} of {rank_info.get('total_users')}")
                print(f"Percentile: Top {rank_info.get('percentile'):.1f}%")
            return

        # NOTE(review): truthiness test means `--rank 0` is ignored;
        # presumably ranks start at 1 — confirm against get_user_by_rank.
        if args.rank:
            print(f"=== User at Rank #{args.rank} (Rank Tree O(log n)) ===\n")
            user = analyzer.get_user_by_rank(args.rank)
            if args.json:
                print(json.dumps(user, indent=2, ensure_ascii=False))
            elif user:
                print(f"Name: {user.get('name')}")
                print(f"User ID: {user.get('user_id')}")
                print(f"Message count: {user.get('count'):,}")
            else:
                print(f"No user at rank {args.rank}")
            return

        if args.histogram:
            print(f"=== Activity Histogram (Bucket Sort, bucket={args.bucket_size}s) ===\n")
            hist = analyzer.get_activity_histogram(bucket_size=args.bucket_size)
            if args.json:
                print(json.dumps(hist, indent=2))
            else:
                max_count = max(c for _, c in hist) if hist else 0
                # Only the most recent `--limit` buckets are rendered.
                for date_str, count in hist[-args.limit:]:
                    bar = print_bar(count, max_count, 40)
                    print(f"{date_str} {bar} {count:,}")
            return

        # Default: show help
        parser.print_help()
|
| 878 |
+
|
| 879 |
+
|
| 880 |
+
# Script entry point: only run the CLI when executed directly,
# not when imported as a module.
if __name__ == '__main__':
    main()
|
check_db.py
ADDED
|
@@ -0,0 +1,49 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
"""Quick script to check database contents.

Prints the total message count, the covered date range, and the 50 newest
messages from ``telegram.db`` in the current working directory.
"""
import sqlite3
import sys
import os

DB_PATH = 'telegram.db'

if not os.path.exists(DB_PATH):
    print(f"Database not found: {DB_PATH}")
    # sys.exit() instead of the bare exit() builtin: exit() is provided by
    # the site module and is absent under `python -S` or in frozen builds.
    sys.exit(1)

try:
    conn = sqlite3.connect(DB_PATH)
    try:
        conn.row_factory = sqlite3.Row

        # Get total count
        total = conn.execute("SELECT COUNT(*) FROM messages").fetchone()[0]
        print(f"Total messages in database: {total}")

        # Get date range
        date_range = conn.execute("""
            SELECT MIN(date) as earliest, MAX(date) as latest
            FROM messages
        """).fetchone()
        print(f"Date range: {date_range['earliest']} to {date_range['latest']}")
        print()

        # Show 50 newest messages
        print("=" * 60)
        print("50 NEWEST MESSAGES:")
        print("=" * 60)

        rows = conn.execute("""
            SELECT date, from_name, text_plain
            FROM messages
            ORDER BY date DESC
            LIMIT 50
        """).fetchall()

        for row in rows:
            # Truncate to 80 chars and flatten newlines for one-line output.
            text = (row['text_plain'] or '')[:80].replace('\n', ' ')
            name = row['from_name'] or 'Unknown'
            print(f"{row['date']} | {name}: {text}")
    finally:
        # Close even when a query raises; previously the connection leaked
        # on any exception between connect() and close().
        conn.close()
except Exception as e:
    print(f"Error: {e}")
    import traceback
    traceback.print_exc()
|
daily_sync.py
ADDED
|
@@ -0,0 +1,664 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
Daily Telegram Sync Script
|
| 4 |
+
===========================
|
| 5 |
+
Automatically syncs new messages from Telegram group to the analytics system.
|
| 6 |
+
|
| 7 |
+
What it does:
|
| 8 |
+
1. Connects to Telegram via Telethon
|
| 9 |
+
2. Fetches messages from the last 36 hours (12h overlap for safety)
|
| 10 |
+
3. Adds new messages to telegram.db (duplicates ignored)
|
| 11 |
+
4. Generates embeddings for new messages locally
|
| 12 |
+
5. Adds embeddings to embeddings.db (duplicates ignored)
|
| 13 |
+
|
| 14 |
+
Usage:
|
| 15 |
+
First time: python daily_sync.py --setup
|
| 16 |
+
Daily run: python daily_sync.py
|
| 17 |
+
Custom hours: python daily_sync.py --hours 48
|
| 18 |
+
|
| 19 |
+
Automation:
|
| 20 |
+
Windows Task Scheduler: python daily_sync.py
|
| 21 |
+
Linux cron: 0 3 * * * cd /path/to/telegram && python daily_sync.py >> sync.log 2>&1
|
| 22 |
+
"""
|
| 23 |
+
|
| 24 |
+
import os
|
| 25 |
+
import sys
|
| 26 |
+
import json
|
| 27 |
+
import time
|
| 28 |
+
import sqlite3
|
| 29 |
+
import asyncio
|
| 30 |
+
import argparse
|
| 31 |
+
import logging
|
| 32 |
+
from datetime import datetime, timedelta, timezone
|
| 33 |
+
from pathlib import Path
|
| 34 |
+
|
| 35 |
+
# Setup logging
# Messages go both to the console and to sync.log next to this script,
# so scheduled (cron / Task Scheduler) runs leave a persistent trace.
LOG_FILE = Path(__file__).parent / 'sync.log'
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s [%(levelname)s] %(message)s',
    handlers=[
        logging.StreamHandler(),
        # Explicit UTF-8 so non-ASCII names in messages don't break logging.
        logging.FileHandler(LOG_FILE, encoding='utf-8')
    ]
)
log = logging.getLogger('daily_sync')

# Paths
BASE_DIR = Path(__file__).parent  # directory containing this script
DB_PATH = BASE_DIR / 'telegram.db'  # message database (indexer schema)
EMBEDDINGS_DB_PATH = BASE_DIR / 'embeddings.db'  # vector embeddings store
SESSION_FILE = BASE_DIR / 'telegram_session'  # Telethon session file
CONFIG_FILE = BASE_DIR / 'sync_config.json'  # API credentials + target group
|
| 53 |
+
|
| 54 |
+
# ==========================================
|
| 55 |
+
# CONFIGURATION
|
| 56 |
+
# ==========================================
|
| 57 |
+
|
| 58 |
+
def load_config() -> dict:
    """Load configuration from sync_config.json.

    Returns:
        The parsed config dict, or an empty dict when the file does not
        exist yet (i.e. before ``--setup`` has been run).
    """
    if CONFIG_FILE.exists():
        # Explicit UTF-8: the group name may contain non-ASCII characters,
        # and the platform default encoding (e.g. cp1252 on Windows) could
        # fail to decode what save_config() wrote.
        with open(CONFIG_FILE, 'r', encoding='utf-8') as f:
            return json.load(f)
    return {}
|
| 64 |
+
|
| 65 |
+
|
| 66 |
+
def save_config(config: dict):
    """Save configuration to sync_config.json.

    Args:
        config: JSON-serializable configuration dict.
    """
    # Explicit UTF-8 keeps the file readable regardless of the platform
    # default encoding and must match the encoding used by load_config().
    with open(CONFIG_FILE, 'w', encoding='utf-8') as f:
        json.dump(config, f, indent=2)
|
| 70 |
+
|
| 71 |
+
|
| 72 |
+
def setup_config():
    """Interactively collect Telegram API credentials and persist them.

    Prompts for API ID, API hash and the target group, writes the result
    to sync_config.json via save_config(), and returns the config dict.
    """
    banner = "=" * 50
    intro_lines = (
        banner,
        " Telegram Daily Sync - Setup",
        banner,
        "",
        "You need Telegram API credentials.",
        "Get them from: https://my.telegram.org/apps",
        "",
    )
    for line in intro_lines:
        print(line)

    api_id = input("API ID: ").strip()
    api_hash = input("API Hash: ").strip()
    group = input("Group username or ID (e.g., @mygroup or -1001234567890): ").strip()

    # 36h default window gives a 12h overlap with a daily run for safety.
    config = {
        'hours': 36,
        'group': group,
        'api_hash': api_hash,
        'api_id': int(api_id),
    }
    save_config(config)

    print(f"\nConfiguration saved to {CONFIG_FILE}")
    print("Now run: python daily_sync.py")
    print("(First run will ask you to log in to Telegram)")
    return config
|
| 98 |
+
|
| 99 |
+
|
| 100 |
+
# ==========================================
|
| 101 |
+
# TELETHON: FETCH MESSAGES
|
| 102 |
+
# ==========================================
|
| 103 |
+
|
| 104 |
+
def telethon_message_to_json(message) -> dict | None:
    """
    Convert a Telethon message object to Telegram Desktop export JSON format.
    This ensures compatibility with the existing parse_message() in indexer.py.

    Returns None for messages without usable text (media-only, service
    messages, empty text), so callers can simply filter falsy results.
    """
    # Imported lazily so the module can be loaded without telethon installed
    # (e.g. for --setup or offline tooling).
    from telethon.tl.types import (
        MessageEntityUrl, MessageEntityTextUrl,
        MessageEntityMention, MessageEntityMentionName,
        MessageEntityBold, MessageEntityItalic,
        MessageEntityCode, MessageEntityPre,
        MessageEntityHashtag, MessageEntityEmail,
        MessageEntityPhone, MessageEntityBotCommand,
    )

    if message.text is None and message.raw_text is None:
        return None

    text = message.raw_text or ''
    if not text.strip():
        # Skip empty messages (media-only, service messages, etc.)
        return None

    # Build text_entities in Telegram Desktop export format
    # NOTE(review): Telegram entity offsets/lengths are in UTF-16 code
    # units; slicing the Python str directly assumes no characters outside
    # the BMP precede the entity — confirm if emoji-heavy chats matter.
    text_entities = []
    if message.entities:
        for entity in message.entities:
            start = entity.offset
            end = entity.offset + entity.length
            entity_text = text[start:end]

            # Map Telethon entity classes onto the export format's type
            # names; anything unrecognized stays 'plain' and is dropped.
            entity_type = 'plain'
            if isinstance(entity, (MessageEntityUrl,)):
                entity_type = 'link'
            elif isinstance(entity, MessageEntityTextUrl):
                entity_type = 'text_link'
                # For text links the export stores the URL, not the label.
                entity_text = entity.url
            elif isinstance(entity, (MessageEntityMention, MessageEntityMentionName)):
                entity_type = 'mention'
            elif isinstance(entity, MessageEntityBold):
                entity_type = 'bold'
            elif isinstance(entity, MessageEntityItalic):
                entity_type = 'italic'
            elif isinstance(entity, (MessageEntityCode, MessageEntityPre)):
                entity_type = 'code'
            elif isinstance(entity, MessageEntityHashtag):
                entity_type = 'hashtag'
            elif isinstance(entity, MessageEntityEmail):
                entity_type = 'email'
            elif isinstance(entity, MessageEntityPhone):
                entity_type = 'phone'
            elif isinstance(entity, MessageEntityBotCommand):
                entity_type = 'bot_command'

            if entity_type != 'plain':
                text_entities.append({
                    'type': entity_type,
                    'text': entity_text
                })

    # Get sender info: users are identified by first/last name, channels
    # and groups by their title, matching the Desktop export's id scheme.
    sender = message.sender
    from_name = ''
    from_id = ''

    if sender:
        if hasattr(sender, 'first_name'):
            # User
            parts = [sender.first_name or '']
            if sender.last_name:
                parts.append(sender.last_name)
            from_name = ' '.join(parts).strip()
            from_id = f'user{sender.id}'
        elif hasattr(sender, 'title'):
            # Channel/Group
            from_name = sender.title or ''
            from_id = f'channel{sender.id}'

    # Handle forwarded messages: resolve the origin to a user, a channel,
    # or fall back to the bare name strings Telethon exposes when the
    # original sender hides their account.
    forwarded_from = None
    forwarded_from_id = None
    if message.forward:
        fwd = message.forward
        try:
            if fwd.sender:
                if hasattr(fwd.sender, 'first_name'):
                    parts = [fwd.sender.first_name or '']
                    if fwd.sender.last_name:
                        parts.append(fwd.sender.last_name)
                    forwarded_from = ' '.join(parts).strip()
                    forwarded_from_id = f'user{fwd.sender.id}'
                elif hasattr(fwd.sender, 'title'):
                    forwarded_from = fwd.sender.title
                    forwarded_from_id = f'channel{fwd.sender.id}'
            elif getattr(fwd, 'sender_name', None):
                forwarded_from = fwd.sender_name
            elif getattr(fwd, 'from_name', None):
                forwarded_from = fwd.from_name
        except Exception:
            pass  # Skip forward info if any attribute is missing

    # Photo info: record dimensions of the largest size when available;
    # other media (documents, video, audio) only sets the media flag.
    photo_info = {}
    has_photo = False
    has_media = False

    if message.photo:
        has_photo = True
        has_media = True
        if hasattr(message.photo, 'sizes') and message.photo.sizes:
            # presumably sizes[] is ordered smallest-to-largest — confirm
            # against Telethon's Photo docs.
            largest = message.photo.sizes[-1]
            if hasattr(largest, 'w'):
                photo_info['width'] = largest.w
                photo_info['height'] = largest.h
    elif message.document or message.video or message.audio:
        has_media = True

    # Reply info
    reply_to = None
    if message.reply_to:
        reply_to = message.reply_to.reply_to_msg_id

    # Date handling: Telethon dates are UTC-aware; tzinfo is stripped so
    # strftime matches the naive timestamps in Desktop exports.
    msg_date = message.date.replace(tzinfo=None) if message.date else datetime.utcnow()

    # Build the JSON in Telegram Desktop export format
    msg_json = {
        'id': message.id,
        'type': 'message',
        'date': msg_date.strftime('%Y-%m-%dT%H:%M:%S'),
        'date_unixtime': str(int(msg_date.replace(tzinfo=timezone.utc).timestamp())),
        'from': from_name,
        'from_id': from_id,
        'text': text,
        'text_entities': text_entities,
    }

    # Optional keys are added only when present, mirroring the export
    # format which omits absent fields rather than writing nulls.
    if reply_to:
        msg_json['reply_to_message_id'] = reply_to
    if forwarded_from:
        msg_json['forwarded_from'] = forwarded_from
    if forwarded_from_id:
        msg_json['forwarded_from_id'] = forwarded_from_id
    if has_photo:
        msg_json['photo'] = '(photo)'
        msg_json.update(photo_info)
    if has_media and not has_photo:
        msg_json['media_type'] = 'document'
    if message.edit_date:
        edit_date = message.edit_date.replace(tzinfo=None)
        msg_json['edited'] = edit_date.strftime('%Y-%m-%dT%H:%M:%S')
        msg_json['edited_unixtime'] = str(int(edit_date.replace(tzinfo=timezone.utc).timestamp()))

    return msg_json
|
| 257 |
+
|
| 258 |
+
|
| 259 |
+
async def fetch_messages(config: dict, hours: int = 36) -> list[dict]:
    """
    Fetch messages from the last N hours using Telethon.
    Returns messages in Telegram Desktop export JSON format.

    Args:
        config: dict with 'api_id', 'api_hash' and 'group' keys.
        hours: size of the lookback window; the default 36h gives a 12h
               overlap with a daily run so no messages are missed.
    """
    # Lazy import keeps telethon optional until a sync actually runs.
    from telethon import TelegramClient

    api_id = config['api_id']
    api_hash = config['api_hash']
    group = config['group']

    # start() reuses the saved session file; the first run prompts for login.
    client = TelegramClient(str(SESSION_FILE), api_id, api_hash)
    await client.start()

    log.info(f"Connected to Telegram")

    # Resolve group - handle numeric IDs properly
    if isinstance(group, str) and group.lstrip('-').isdigit():
        group = int(group)
    if isinstance(group, int) and group < 0:
        # Convert Telegram Web format to Telethon PeerChannel
        # -100XXXXXXXXXX → channel_id = XXXXXXXXXX
        # NOTE(review): this assumes the ID carries the supergroup '-100'
        # prefix; a legacy chat ID like -12345 would pass through unchanged
        # and produce a negative channel_id — confirm only supergroups are
        # configured.
        from telethon.tl.types import PeerChannel
        channel_id = int(str(group).replace('-100', ''))
        entity = await client.get_entity(PeerChannel(channel_id))
    else:
        entity = await client.get_entity(group)
    log.info(f"Fetching from: {getattr(entity, 'title', group)}")

    # Calculate time window
    since = datetime.now(timezone.utc) - timedelta(hours=hours)
    log.info(f"Fetching messages from last {hours} hours (since {since.strftime('%Y-%m-%d %H:%M')} UTC)")

    # Fetch messages
    messages_json = []
    count = 0

    # iter_messages yields newest-first (reverse=False), so the first
    # message older than `since` ends the scan.
    async for message in client.iter_messages(entity, offset_date=None, reverse=False):
        # Stop when we've gone past our time window
        if message.date < since:
            break

        count += 1
        msg_json = telethon_message_to_json(message)
        if msg_json:
            # Only messages with text content survive conversion.
            messages_json.append(msg_json)

    await client.disconnect()

    log.info(f"Fetched {count} messages, {len(messages_json)} with text content")
    return messages_json
|
| 310 |
+
|
| 311 |
+
|
| 312 |
+
async def fetch_participants(config: dict) -> list[dict]:
    """
    Fetch all group participants with metadata using Telethon.
    Returns participant info: name, username, status, join date, admin, etc.
    """
    from telethon import TelegramClient
    from telethon.tl.types import (
        UserStatusOnline, UserStatusOffline, UserStatusRecently,
        UserStatusLastWeek, UserStatusLastMonth,
        ChannelParticipantAdmin, ChannelParticipantCreator,
    )

    client = TelegramClient(str(SESSION_FILE), config['api_id'], config['api_hash'])
    await client.start()

    # Resolve the target group: numeric strings become ints, and negative
    # ints are treated as supergroup/channel ids (the "-100" prefix form).
    target = config['group']
    if isinstance(target, str) and target.lstrip('-').isdigit():
        target = int(target)
    if isinstance(target, int) and target < 0:
        from telethon.tl.types import PeerChannel
        raw_channel_id = int(str(target).replace('-100', ''))
        entity = await client.get_entity(PeerChannel(raw_channel_id))
    else:
        entity = await client.get_entity(target)

    log.info(f"Fetching participants from: {getattr(entity, 'title', target)}")

    results = []
    fetched_at = int(datetime.now(timezone.utc).timestamp())

    async for member in client.iter_participants(entity):
        # Map Telethon status objects onto our string labels.
        presence = 'unknown'
        seen_at = None
        if isinstance(member.status, UserStatusOnline):
            presence = 'online'
            seen_at = fetched_at
        elif isinstance(member.status, UserStatusOffline):
            presence = 'offline'
            if member.status.was_online:
                seen_at = int(member.status.was_online.timestamp())
        elif isinstance(member.status, UserStatusRecently):
            presence = 'recently'
        elif isinstance(member.status, UserStatusLastWeek):
            presence = 'last_week'
        elif isinstance(member.status, UserStatusLastMonth):
            presence = 'last_month'

        # Role and join date come from the channel-participant record, if any.
        admin = False
        creator = False
        joined = None
        if hasattr(member, 'participant'):
            record = member.participant
            if isinstance(record, ChannelParticipantCreator):
                creator = True
                admin = True
            elif isinstance(record, ChannelParticipantAdmin):
                admin = True
            if hasattr(record, 'date') and record.date:
                joined = int(record.date.timestamp())

        results.append({
            'user_id': f'user{member.id}',
            'first_name': member.first_name or '',
            'last_name': member.last_name or '',
            'username': member.username or '',
            'phone': member.phone or '',
            'is_bot': 1 if member.bot else 0,
            'is_admin': 1 if admin else 0,
            'is_creator': 1 if creator else 0,
            'is_premium': 1 if getattr(member, 'premium', False) else 0,
            'join_date': joined,
            'last_status': presence,
            'last_online': seen_at,
            'about': '',  # Requires separate API call per user
            'updated_at': fetched_at,
        })

    await client.disconnect()

    log.info(f"Fetched {len(results)} participants")
    return results
|
| 401 |
+
|
| 402 |
+
|
| 403 |
+
def sync_participants(participants: list[dict]) -> dict:
    """
    Upsert a batch of participant records into the ``participants`` table
    of telegram.db, creating the table on first use.

    Args:
        participants: dicts as produced by ``fetch_participants``; each must
            carry the 14 keys referenced in the INSERT statement below.

    Returns:
        ``{'synced': <number of rows written>}``.
    """
    if not participants:
        return {'synced': 0}

    conn = sqlite3.connect(str(DB_PATH))
    # Fix: close the connection even if table creation or the upsert raises,
    # so a failed sync does not leak the SQLite handle / file lock.
    try:
        # Create table if not exists
        conn.execute("""
            CREATE TABLE IF NOT EXISTS participants (
                user_id TEXT PRIMARY KEY,
                first_name TEXT,
                last_name TEXT,
                username TEXT,
                phone TEXT,
                is_bot INTEGER DEFAULT 0,
                is_admin INTEGER DEFAULT 0,
                is_creator INTEGER DEFAULT 0,
                is_premium INTEGER DEFAULT 0,
                join_date INTEGER,
                last_status TEXT DEFAULT 'unknown',
                last_online INTEGER,
                about TEXT,
                updated_at INTEGER
            )
        """)

        # Upsert participants (PRIMARY KEY on user_id makes REPLACE idempotent)
        conn.executemany("""
            INSERT OR REPLACE INTO participants
            (user_id, first_name, last_name, username, phone, is_bot, is_admin,
             is_creator, is_premium, join_date, last_status, last_online, about, updated_at)
            VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
        """, [
            (p['user_id'], p['first_name'], p['last_name'], p['username'],
             p['phone'], p['is_bot'], p['is_admin'], p['is_creator'],
             p['is_premium'], p['join_date'], p['last_status'],
             p['last_online'], p['about'], p['updated_at'])
            for p in participants
        ])

        conn.commit()
    finally:
        conn.close()

    log.info(f"Synced {len(participants)} participants to DB")
    return {'synced': len(participants)}
|
| 449 |
+
|
| 450 |
+
|
| 451 |
+
# ==========================================
|
| 452 |
+
# DATABASE: INDEX NEW MESSAGES
|
| 453 |
+
# ==========================================
|
| 454 |
+
|
| 455 |
+
def index_messages(messages_json: list[dict]) -> dict:
    """
    Insert a batch of Telegram-export messages into telegram.db via
    IncrementalIndexer; already-indexed ids are counted as duplicates.
    """
    if not messages_json:
        return {'new_messages': 0, 'duplicates': 0}

    from indexer import IncrementalIndexer

    log.info(f"Indexing {len(messages_json)} messages into telegram.db...")

    idx = IncrementalIndexer(str(DB_PATH))
    result = idx.update_from_json_data({'messages': messages_json}, show_progress=True)
    idx.close()

    log.info(f"Indexing done: {result['new_messages']} new, {result['duplicates']} duplicates")
    return result
|
| 473 |
+
|
| 474 |
+
|
| 475 |
+
# ==========================================
|
| 476 |
+
# EMBEDDINGS: GENERATE FOR NEW MESSAGES
|
| 477 |
+
# ==========================================
|
| 478 |
+
|
| 479 |
+
def generate_embeddings(messages_json: list[dict]) -> dict:
    """
    Generate embeddings for new messages and add to embeddings.db.

    Only messages that have an id, usable text (> 10 chars after stripping)
    and no existing embedding row are processed.

    Returns:
        ``{'new_embeddings': N, 'total_embeddings': T}`` on the write path,
        or ``{'new_embeddings': 0, 'skipped': ...}`` when nothing is done.
    """
    if not os.path.exists(EMBEDDINGS_DB_PATH):
        log.warning(f"embeddings.db not found at {EMBEDDINGS_DB_PATH}. Skipping embeddings.")
        return {'new_embeddings': 0, 'skipped': 0}

    import numpy as np

    emb_conn = sqlite3.connect(str(EMBEDDINGS_DB_PATH))
    # Fix: close the connection on every path, including exceptions raised by
    # model loading / encoding / SQL — the original leaked emb_conn on error.
    try:
        # Find which messages don't have embeddings yet
        existing_ids = {row[0] for row in emb_conn.execute("SELECT message_id FROM embeddings")}

        # Filter to messages that:
        # 1. Have text content
        # 2. Text is longer than 10 chars
        # 3. Don't already have embeddings
        new_messages = []
        for msg in messages_json:
            msg_id = msg.get('id')
            text = msg.get('text', '')
            if isinstance(text, list):
                # Telegram rich-text format: a list of plain strings and
                # entity dicts — flatten to one string.
                text = ''.join(
                    part if isinstance(part, str) else part.get('text', '')
                    for part in text
                )

            if msg_id and msg_id not in existing_ids and text and len(text.strip()) > 10:
                new_messages.append({
                    'id': msg_id,
                    'from_name': msg.get('from', ''),
                    'text': text
                })

        if not new_messages:
            log.info("No new messages need embeddings")
            return {'new_embeddings': 0, 'skipped': len(messages_json)}

        log.info(f"Generating embeddings for {len(new_messages)} new messages...")

        # Load model (heavyweight import deferred until actually needed)
        from sentence_transformers import SentenceTransformer
        model = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')

        # Generate embeddings
        texts = [m['text'][:500] for m in new_messages]  # Max 500 chars per message
        embeddings = model.encode(texts, show_progress_bar=True, convert_to_numpy=True,
                                  batch_size=64)

        # Insert into embeddings.db
        data = [
            (m['id'],
             m['from_name'],
             m['text'][:100],  # Preview
             embeddings[i].astype(np.float32).tobytes())
            for i, m in enumerate(new_messages)
        ]

        emb_conn.executemany(
            "INSERT OR IGNORE INTO embeddings (message_id, from_name, text_preview, embedding) VALUES (?, ?, ?, ?)",
            data
        )
        emb_conn.commit()

        # Verify
        total = emb_conn.execute("SELECT COUNT(*) FROM embeddings").fetchone()[0]
    finally:
        emb_conn.close()

    log.info(f"Added {len(data)} new embeddings. Total embeddings: {total:,}")
    return {'new_embeddings': len(data), 'total_embeddings': total}
|
| 557 |
+
|
| 558 |
+
|
| 559 |
+
# ==========================================
|
| 560 |
+
# MAIN
|
| 561 |
+
# ==========================================
|
| 562 |
+
|
| 563 |
+
def run_sync(hours: int = 36, skip_embeddings: bool = False):
    """
    Run the full sync pipeline: fetch -> index -> participants -> embeddings,
    then notify a locally running server to refresh its caches.

    Args:
        hours: look-back window passed to ``fetch_messages``.
        skip_embeddings: when True, step 4 is skipped entirely.
    """
    start_time = time.time()

    log.info("=" * 50)
    log.info("Starting daily sync")
    log.info("=" * 50)

    # Load config
    config = load_config()
    if not config:
        log.error("No configuration found. Run: python daily_sync.py --setup")
        sys.exit(1)

    # Step 1: Fetch messages from Telegram
    log.info("[1/4] Fetching messages from Telegram...")
    messages_json = asyncio.run(fetch_messages(config, hours=hours))

    if not messages_json:
        log.info("No messages found in the time window.")

    # Step 2: Index into telegram.db
    index_stats = {'new_messages': 0, 'duplicates': 0}
    if messages_json:
        log.info("[2/4] Indexing new messages...")
        index_stats = index_messages(messages_json)

    # Step 3: Sync participants (best-effort — a failure here must not
    # abort the rest of the pipeline)
    log.info("[3/4] Syncing participants...")
    try:
        participants = asyncio.run(fetch_participants(config))
        part_stats = sync_participants(participants)
    except Exception as e:
        log.warning(f"Failed to sync participants: {e}")
        part_stats = {'synced': 0}

    # Step 4: Generate embeddings
    if skip_embeddings or not messages_json:
        log.info("[4/4] Skipping embeddings")
        emb_stats = {'new_embeddings': 0}
    else:
        log.info("[4/4] Generating embeddings for new messages...")
        emb_stats = generate_embeddings(messages_json)

    # Notify running server to invalidate caches and reload embeddings
    has_changes = (index_stats.get('new_messages', 0) > 0
                   or part_stats.get('synced', 0) > 0
                   or emb_stats.get('new_embeddings', 0) > 0)
    if has_changes:
        try:
            import urllib.request
            # Fix: close the HTTP responses (the original leaked sockets by
            # never closing what urlopen returned).
            with urllib.request.urlopen('http://localhost:5000/api/cache/invalidate', timeout=5):
                pass
            log.info("Server caches invalidated")
            if emb_stats.get('new_embeddings', 0) > 0:
                with urllib.request.urlopen('http://localhost:5000/api/embeddings/reload', timeout=5):
                    pass
                log.info("Server notified to reload embeddings")
        except Exception:
            # Best-effort notification only; the server refreshes on restart.
            log.info("Server not running or unreachable - caches will refresh on next restart")

    # Summary
    elapsed = time.time() - start_time
    log.info("=" * 50)
    log.info("Sync complete!")
    log.info(f" Messages fetched: {len(messages_json) if messages_json else 0}")
    log.info(f" New to DB: {index_stats.get('new_messages', 0)}")
    log.info(f" Duplicates skipped: {index_stats.get('duplicates', 0)}")
    log.info(f" Participants synced: {part_stats.get('synced', 0)}")
    log.info(f" New embeddings: {emb_stats.get('new_embeddings', 0)}")
    log.info(f" Time: {elapsed:.1f}s")
    log.info("=" * 50)
|
| 633 |
+
|
| 634 |
+
|
| 635 |
+
def main():
    """CLI entry: dispatch to setup, fetch-only inspection, or a full sync."""
    ap = argparse.ArgumentParser(description='Daily Telegram Sync')
    ap.add_argument('--setup', action='store_true', help='First time setup')
    ap.add_argument('--hours', type=int, default=36, help='Hours to look back (default: 36)')
    ap.add_argument('--skip-embeddings', action='store_true', help='Skip embedding generation')
    ap.add_argument('--fetch-only', action='store_true', help='Only fetch, do not index')
    opts = ap.parse_args()

    if opts.setup:
        setup_config()
        return

    if opts.fetch_only:
        cfg = load_config()
        if not cfg:
            log.error("No configuration found. Run: python daily_sync.py --setup")
            sys.exit(1)
        fetched = asyncio.run(fetch_messages(cfg, hours=opts.hours))
        # Dump the raw payload to disk so it can be inspected by hand.
        out_path = BASE_DIR / 'fetched_messages.json'
        with open(out_path, 'w', encoding='utf-8') as f:
            json.dump(fetched, f, ensure_ascii=False, indent=2)
        log.info(f"Saved {len(fetched)} messages to {out_path}")
        return

    run_sync(hours=opts.hours, skip_embeddings=opts.skip_embeddings)
|
| 661 |
+
|
| 662 |
+
|
| 663 |
+
# Script entry point: run the CLI dispatcher when executed directly.
if __name__ == '__main__':
    main()
|
static/css/style.css
CHANGED
|
@@ -67,6 +67,11 @@ body {
|
|
| 67 |
flex-direction: column;
|
| 68 |
border-right: 1px solid var(--border-color);
|
| 69 |
z-index: 100;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 70 |
}
|
| 71 |
|
| 72 |
.logo {
|
|
@@ -821,8 +826,54 @@ input:focus {
|
|
| 821 |
}
|
| 822 |
|
| 823 |
@media (max-width: 768px) {
|
| 824 |
-
.
|
| 825 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 826 |
}
|
| 827 |
|
| 828 |
.header {
|
|
@@ -831,9 +882,93 @@ input:focus {
|
|
| 831 |
align-items: flex-start;
|
| 832 |
}
|
| 833 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 834 |
.user-stats-grid {
|
| 835 |
grid-template-columns: repeat(2, 1fr);
|
| 836 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 837 |
}
|
| 838 |
|
| 839 |
/* ==========================================
|
|
|
|
| 67 |
flex-direction: column;
|
| 68 |
border-right: 1px solid var(--border-color);
|
| 69 |
z-index: 100;
|
| 70 |
+
transition: width 0.3s ease;
|
| 71 |
+
}
|
| 72 |
+
|
| 73 |
+
.mobile-menu-btn {
|
| 74 |
+
display: none;
|
| 75 |
}
|
| 76 |
|
| 77 |
.logo {
|
|
|
|
| 826 |
}
|
| 827 |
|
| 828 |
@media (max-width: 768px) {
|
| 829 |
+
.sidebar {
|
| 830 |
+
width: 0;
|
| 831 |
+
overflow: hidden;
|
| 832 |
+
transition: width 0.3s ease;
|
| 833 |
+
}
|
| 834 |
+
|
| 835 |
+
.sidebar.open {
|
| 836 |
+
width: 250px;
|
| 837 |
+
}
|
| 838 |
+
|
| 839 |
+
.sidebar-overlay {
|
| 840 |
+
display: none;
|
| 841 |
+
position: fixed;
|
| 842 |
+
top: 0;
|
| 843 |
+
left: 0;
|
| 844 |
+
right: 0;
|
| 845 |
+
bottom: 0;
|
| 846 |
+
background: rgba(0,0,0,0.5);
|
| 847 |
+
z-index: 99;
|
| 848 |
+
}
|
| 849 |
+
|
| 850 |
+
.sidebar-overlay.active {
|
| 851 |
+
display: block;
|
| 852 |
+
}
|
| 853 |
+
|
| 854 |
+
.mobile-menu-btn {
|
| 855 |
+
display: flex !important;
|
| 856 |
+
align-items: center;
|
| 857 |
+
justify-content: center;
|
| 858 |
+
width: 40px;
|
| 859 |
+
height: 40px;
|
| 860 |
+
background: var(--bg-card);
|
| 861 |
+
border: 1px solid var(--border-color);
|
| 862 |
+
border-radius: var(--radius-md);
|
| 863 |
+
color: var(--text-primary);
|
| 864 |
+
font-size: 1.5rem;
|
| 865 |
+
cursor: pointer;
|
| 866 |
+
position: fixed;
|
| 867 |
+
top: var(--spacing-md);
|
| 868 |
+
left: var(--spacing-md);
|
| 869 |
+
z-index: 98;
|
| 870 |
+
}
|
| 871 |
+
|
| 872 |
+
.main-content {
|
| 873 |
+
margin-left: 0;
|
| 874 |
+
max-width: 100vw;
|
| 875 |
+
padding: var(--spacing-md);
|
| 876 |
+
padding-top: 60px;
|
| 877 |
}
|
| 878 |
|
| 879 |
.header {
|
|
|
|
| 882 |
align-items: flex-start;
|
| 883 |
}
|
| 884 |
|
| 885 |
+
.header h1 {
|
| 886 |
+
font-size: 1.25rem;
|
| 887 |
+
}
|
| 888 |
+
|
| 889 |
+
.header-controls {
|
| 890 |
+
width: 100%;
|
| 891 |
+
flex-wrap: wrap;
|
| 892 |
+
}
|
| 893 |
+
|
| 894 |
+
.stats-grid {
|
| 895 |
+
grid-template-columns: repeat(2, 1fr);
|
| 896 |
+
}
|
| 897 |
+
|
| 898 |
+
.stats-grid .stat-card {
|
| 899 |
+
padding: var(--spacing-md);
|
| 900 |
+
}
|
| 901 |
+
|
| 902 |
+
.charts-row {
|
| 903 |
+
grid-template-columns: 1fr;
|
| 904 |
+
}
|
| 905 |
+
|
| 906 |
+
.chart-card.full-width,
|
| 907 |
+
.chart-card.large {
|
| 908 |
+
grid-column: span 1;
|
| 909 |
+
}
|
| 910 |
+
|
| 911 |
+
.lists-row {
|
| 912 |
+
grid-template-columns: 1fr;
|
| 913 |
+
}
|
| 914 |
+
|
| 915 |
.user-stats-grid {
|
| 916 |
grid-template-columns: repeat(2, 1fr);
|
| 917 |
}
|
| 918 |
+
|
| 919 |
+
.heatmap-table th,
|
| 920 |
+
.heatmap-table td {
|
| 921 |
+
min-width: 25px;
|
| 922 |
+
font-size: 0.65rem;
|
| 923 |
+
padding: 2px;
|
| 924 |
+
}
|
| 925 |
+
|
| 926 |
+
.heatmap-cell {
|
| 927 |
+
width: 22px;
|
| 928 |
+
height: 22px;
|
| 929 |
+
}
|
| 930 |
+
|
| 931 |
+
/* Chat messages */
|
| 932 |
+
.message {
|
| 933 |
+
max-width: 95% !important;
|
| 934 |
+
}
|
| 935 |
+
|
| 936 |
+
/* Search */
|
| 937 |
+
.search-box {
|
| 938 |
+
flex-direction: column;
|
| 939 |
+
}
|
| 940 |
+
|
| 941 |
+
.search-box input {
|
| 942 |
+
width: 100%;
|
| 943 |
+
}
|
| 944 |
+
|
| 945 |
+
/* Tables */
|
| 946 |
+
.data-table {
|
| 947 |
+
font-size: 0.8rem;
|
| 948 |
+
}
|
| 949 |
+
|
| 950 |
+
.data-table th,
|
| 951 |
+
.data-table td {
|
| 952 |
+
padding: var(--spacing-sm);
|
| 953 |
+
}
|
| 954 |
+
}
|
| 955 |
+
|
| 956 |
+
@media (max-width: 480px) {
|
| 957 |
+
.stats-grid {
|
| 958 |
+
grid-template-columns: 1fr;
|
| 959 |
+
}
|
| 960 |
+
|
| 961 |
+
.user-stats-grid {
|
| 962 |
+
grid-template-columns: 1fr;
|
| 963 |
+
}
|
| 964 |
+
|
| 965 |
+
.header h1 {
|
| 966 |
+
font-size: 1.1rem;
|
| 967 |
+
}
|
| 968 |
+
|
| 969 |
+
.chart-container {
|
| 970 |
+
height: 200px;
|
| 971 |
+
}
|
| 972 |
}
|
| 973 |
|
| 974 |
/* ==========================================
|
static/js/dashboard.js
CHANGED
|
@@ -9,6 +9,31 @@
|
|
| 9 |
* - Export functionality
|
| 10 |
*/
|
| 11 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 12 |
// ==========================================
|
| 13 |
// GLOBAL STATE
|
| 14 |
// ==========================================
|
|
|
|
| 9 |
* - Export functionality
|
| 10 |
*/
|
| 11 |
|
| 12 |
+
// ==========================================
|
| 13 |
+
// MOBILE MENU
|
| 14 |
+
// ==========================================
|
| 15 |
+
|
| 16 |
+
// Toggle the slide-out sidebar (and its dimming backdrop, when present)
// on small screens. Wired to the hamburger button and the overlay itself.
function toggleMobileMenu() {
    document.querySelector('.sidebar').classList.toggle('open');
    const backdrop = document.querySelector('.sidebar-overlay');
    if (backdrop) {
        backdrop.classList.toggle('active');
    }
}
|
| 22 |
+
|
| 23 |
+
// On narrow viewports, dismiss the sidebar whenever a nav link is followed.
document.addEventListener('DOMContentLoaded', function() {
    const links = document.querySelectorAll('.nav-link');
    links.forEach(function(link) {
        link.addEventListener('click', function() {
            if (window.innerWidth > 768) {
                return; // desktop layout: sidebar stays put
            }
            document.querySelector('.sidebar').classList.remove('open');
            const backdrop = document.querySelector('.sidebar-overlay');
            if (backdrop) backdrop.classList.remove('active');
        });
    });
});
|
| 36 |
+
|
| 37 |
// ==========================================
|
| 38 |
// GLOBAL STATE
|
| 39 |
// ==========================================
|
templates/index.html
CHANGED
|
@@ -8,6 +8,8 @@
|
|
| 8 |
<script src="https://cdn.jsdelivr.net/npm/chart.js"></script>
|
| 9 |
</head>
|
| 10 |
<body>
|
|
|
|
|
|
|
| 11 |
<!-- Sidebar -->
|
| 12 |
<nav class="sidebar">
|
| 13 |
<div class="logo">
|
|
|
|
| 8 |
<script src="https://cdn.jsdelivr.net/npm/chart.js"></script>
|
| 9 |
</head>
|
| 10 |
<body>
|
| 11 |
+
<button class="mobile-menu-btn" onclick="toggleMobileMenu()">☰</button>
|
| 12 |
+
<div class="sidebar-overlay" onclick="toggleMobileMenu()"></div>
|
| 13 |
<!-- Sidebar -->
|
| 14 |
<nav class="sidebar">
|
| 15 |
<div class="logo">
|
templates/moderation.html
CHANGED
|
@@ -8,6 +8,8 @@
|
|
| 8 |
<script src="https://cdn.jsdelivr.net/npm/chart.js"></script>
|
| 9 |
</head>
|
| 10 |
<body>
|
|
|
|
|
|
|
| 11 |
<!-- Sidebar -->
|
| 12 |
<nav class="sidebar">
|
| 13 |
<div class="logo">
|
|
|
|
| 8 |
<script src="https://cdn.jsdelivr.net/npm/chart.js"></script>
|
| 9 |
</head>
|
| 10 |
<body>
|
| 11 |
+
<button class="mobile-menu-btn" onclick="toggleMobileMenu()">☰</button>
|
| 12 |
+
<div class="sidebar-overlay" onclick="toggleMobileMenu()"></div>
|
| 13 |
<!-- Sidebar -->
|
| 14 |
<nav class="sidebar">
|
| 15 |
<div class="logo">
|
templates/search.html
CHANGED
|
@@ -7,6 +7,8 @@
|
|
| 7 |
<link rel="stylesheet" href="/static/css/style.css">
|
| 8 |
</head>
|
| 9 |
<body>
|
|
|
|
|
|
|
| 10 |
<!-- Sidebar -->
|
| 11 |
<nav class="sidebar">
|
| 12 |
<div class="logo">
|
|
|
|
| 7 |
<link rel="stylesheet" href="/static/css/style.css">
|
| 8 |
</head>
|
| 9 |
<body>
|
| 10 |
+
<button class="mobile-menu-btn" onclick="toggleMobileMenu()">☰</button>
|
| 11 |
+
<div class="sidebar-overlay" onclick="toggleMobileMenu()"></div>
|
| 12 |
<!-- Sidebar -->
|
| 13 |
<nav class="sidebar">
|
| 14 |
<div class="logo">
|
templates/settings.html
CHANGED
|
@@ -167,6 +167,8 @@
|
|
| 167 |
</style>
|
| 168 |
</head>
|
| 169 |
<body>
|
|
|
|
|
|
|
| 170 |
<!-- Sidebar -->
|
| 171 |
<nav class="sidebar">
|
| 172 |
<div class="logo">
|
|
|
|
| 167 |
</style>
|
| 168 |
</head>
|
| 169 |
<body>
|
| 170 |
+
<button class="mobile-menu-btn" onclick="toggleMobileMenu()">☰</button>
|
| 171 |
+
<div class="sidebar-overlay" onclick="toggleMobileMenu()"></div>
|
| 172 |
<!-- Sidebar -->
|
| 173 |
<nav class="sidebar">
|
| 174 |
<div class="logo">
|
templates/user_profile.html
CHANGED
|
@@ -253,6 +253,8 @@
|
|
| 253 |
</style>
|
| 254 |
</head>
|
| 255 |
<body>
|
|
|
|
|
|
|
| 256 |
<!-- Sidebar -->
|
| 257 |
<nav class="sidebar">
|
| 258 |
<div class="logo">
|
|
|
|
| 253 |
</style>
|
| 254 |
</head>
|
| 255 |
<body>
|
| 256 |
+
<button class="mobile-menu-btn" onclick="toggleMobileMenu()">☰</button>
|
| 257 |
+
<div class="sidebar-overlay" onclick="toggleMobileMenu()"></div>
|
| 258 |
<!-- Sidebar -->
|
| 259 |
<nav class="sidebar">
|
| 260 |
<div class="logo">
|
templates/users.html
CHANGED
|
@@ -8,6 +8,8 @@
|
|
| 8 |
<script src="https://cdn.jsdelivr.net/npm/chart.js"></script>
|
| 9 |
</head>
|
| 10 |
<body>
|
|
|
|
|
|
|
| 11 |
<!-- Sidebar -->
|
| 12 |
<nav class="sidebar">
|
| 13 |
<div class="logo">
|
|
|
|
| 8 |
<script src="https://cdn.jsdelivr.net/npm/chart.js"></script>
|
| 9 |
</head>
|
| 10 |
<body>
|
| 11 |
+
<button class="mobile-menu-btn" onclick="toggleMobileMenu()">☰</button>
|
| 12 |
+
<div class="sidebar-overlay" onclick="toggleMobileMenu()"></div>
|
| 13 |
<!-- Sidebar -->
|
| 14 |
<nav class="sidebar">
|
| 15 |
<div class="logo">
|
tests.py
ADDED
|
@@ -0,0 +1,474 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
Integration tests for Telegram Analytics: indexer, search, and dashboard endpoints.
|
| 4 |
+
|
| 5 |
+
Run with: python -m pytest tests.py -v
|
| 6 |
+
Or: python tests.py
|
| 7 |
+
"""
|
| 8 |
+
|
| 9 |
+
import json
|
| 10 |
+
import os
|
| 11 |
+
import sqlite3
|
| 12 |
+
import tempfile
|
| 13 |
+
import time
|
| 14 |
+
import unittest
|
| 15 |
+
|
| 16 |
+
from pathlib import Path
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
# ---------------------------------------------------------------------------
|
| 20 |
+
# Helpers
|
| 21 |
+
# ---------------------------------------------------------------------------
|
| 22 |
+
|
| 23 |
+
def _sample_messages(n: int = 5) -> list[dict]:
|
| 24 |
+
"""Generate N realistic Telegram-format messages."""
|
| 25 |
+
base_ts = 1700000000
|
| 26 |
+
users = [
|
| 27 |
+
("user1", "Alice"),
|
| 28 |
+
("user2", "Bob"),
|
| 29 |
+
("user3", "Carol"),
|
| 30 |
+
]
|
| 31 |
+
msgs = []
|
| 32 |
+
for i in range(1, n + 1):
|
| 33 |
+
uid, name = users[i % len(users)]
|
| 34 |
+
msgs.append({
|
| 35 |
+
"id": 1000 + i,
|
| 36 |
+
"type": "message",
|
| 37 |
+
"date": f"2024-01-{(i % 28) + 1:02d}T10:00:00",
|
| 38 |
+
"date_unixtime": str(base_ts + i * 3600),
|
| 39 |
+
"from": name,
|
| 40 |
+
"from_id": uid,
|
| 41 |
+
"text": f"Test message number {i} from {name}",
|
| 42 |
+
"text_entities": [
|
| 43 |
+
{"type": "plain", "text": f"Test message number {i} from {name}"}
|
| 44 |
+
],
|
| 45 |
+
"reply_to_message_id": (1000 + i - 1) if i > 1 else None,
|
| 46 |
+
})
|
| 47 |
+
return msgs
|
| 48 |
+
|
| 49 |
+
|
| 50 |
+
def _write_json(path: str, messages: list[dict]):
|
| 51 |
+
"""Write messages in Telegram export JSON format."""
|
| 52 |
+
with open(path, "w", encoding="utf-8") as f:
|
| 53 |
+
json.dump({"messages": messages}, f, ensure_ascii=False)
|
| 54 |
+
|
| 55 |
+
|
| 56 |
+
# ---------------------------------------------------------------------------
|
| 57 |
+
# 1. Indexer Tests
|
| 58 |
+
# ---------------------------------------------------------------------------
|
| 59 |
+
|
| 60 |
+
class TestIndexer(unittest.TestCase):
|
| 61 |
+
"""Tests for OptimizedIndexer and IncrementalIndexer."""
|
| 62 |
+
|
| 63 |
+
def setUp(self):
|
| 64 |
+
self.tmpdir = tempfile.mkdtemp()
|
| 65 |
+
self.db_path = os.path.join(self.tmpdir, "test.db")
|
| 66 |
+
self.json_path = os.path.join(self.tmpdir, "messages.json")
|
| 67 |
+
self.messages = _sample_messages(10)
|
| 68 |
+
_write_json(self.json_path, self.messages)
|
| 69 |
+
|
| 70 |
+
def tearDown(self):
|
| 71 |
+
import shutil
|
| 72 |
+
shutil.rmtree(self.tmpdir, ignore_errors=True)
|
| 73 |
+
|
| 74 |
+
def test_optimized_indexer_indexes_messages(self):
|
| 75 |
+
from indexer import OptimizedIndexer
|
| 76 |
+
indexer = OptimizedIndexer(self.db_path, build_trigrams=False, build_graph=False)
|
| 77 |
+
stats = indexer.index_file(self.json_path, show_progress=False)
|
| 78 |
+
|
| 79 |
+
self.assertGreater(stats["messages"], 0)
|
| 80 |
+
|
| 81 |
+
conn = sqlite3.connect(self.db_path)
|
| 82 |
+
count = conn.execute("SELECT COUNT(*) FROM messages").fetchone()[0]
|
| 83 |
+
conn.close()
|
| 84 |
+
self.assertEqual(count, stats["messages"])
|
| 85 |
+
|
| 86 |
+
def test_incremental_indexer_deduplication(self):
|
| 87 |
+
from indexer import OptimizedIndexer, IncrementalIndexer
|
| 88 |
+
|
| 89 |
+
# First: create DB with OptimizedIndexer
|
| 90 |
+
opt = OptimizedIndexer(self.db_path, build_trigrams=False, build_graph=False)
|
| 91 |
+
opt.index_file(self.json_path, show_progress=False)
|
| 92 |
+
|
| 93 |
+
# Now use IncrementalIndexer – same data, should all be duplicates
|
| 94 |
+
idx = IncrementalIndexer(self.db_path)
|
| 95 |
+
stats = idx.update_from_json(self.json_path, show_progress=False)
|
| 96 |
+
idx.close()
|
| 97 |
+
|
| 98 |
+
self.assertEqual(stats["new_messages"], 0)
|
| 99 |
+
self.assertGreater(stats["duplicates"], 0)
|
| 100 |
+
|
| 101 |
+
def test_incremental_indexer_adds_new(self):
|
| 102 |
+
from indexer import OptimizedIndexer, IncrementalIndexer
|
| 103 |
+
|
| 104 |
+
# Create DB with 5 messages
|
| 105 |
+
msgs5 = _sample_messages(5)
|
| 106 |
+
_write_json(self.json_path, msgs5)
|
| 107 |
+
opt = OptimizedIndexer(self.db_path, build_trigrams=False, build_graph=False)
|
| 108 |
+
opt.index_file(self.json_path, show_progress=False)
|
| 109 |
+
|
| 110 |
+
# Now add 10 messages (5 old + 5 new)
|
| 111 |
+
msgs10 = _sample_messages(10)
|
| 112 |
+
json2 = os.path.join(self.tmpdir, "messages2.json")
|
| 113 |
+
_write_json(json2, msgs10)
|
| 114 |
+
|
| 115 |
+
idx = IncrementalIndexer(self.db_path)
|
| 116 |
+
stats = idx.update_from_json(json2, show_progress=False)
|
| 117 |
+
idx.close()
|
| 118 |
+
|
| 119 |
+
self.assertEqual(stats["new_messages"], 5)
|
| 120 |
+
self.assertEqual(stats["duplicates"], 5)
|
| 121 |
+
|
| 122 |
+
def test_incremental_indexer_from_json_data(self):
|
| 123 |
+
from indexer import OptimizedIndexer, IncrementalIndexer
|
| 124 |
+
|
| 125 |
+
# Init DB first
|
| 126 |
+
opt = OptimizedIndexer(self.db_path, build_trigrams=False, build_graph=False)
|
| 127 |
+
opt.index_file(self.json_path, show_progress=False)
|
| 128 |
+
|
| 129 |
+
# Add new messages via json_data
|
| 130 |
+
new_msgs = _sample_messages(15) # 10 old + 5 new
|
| 131 |
+
idx = IncrementalIndexer(self.db_path)
|
| 132 |
+
stats = idx.update_from_json_data(new_msgs, show_progress=False)
|
| 133 |
+
idx.close()
|
| 134 |
+
|
| 135 |
+
self.assertEqual(stats["new_messages"], 5)
|
| 136 |
+
|
| 137 |
+
def test_fts5_search_works(self):
    """The FTS5 shadow table is populated and answers MATCH queries."""
    from indexer import OptimizedIndexer
    OptimizedIndexer(self.db_path, build_trigrams=False, build_graph=False).index_file(
        self.json_path, show_progress=False
    )

    conn = sqlite3.connect(self.db_path)
    hits = conn.execute(
        "SELECT COUNT(*) FROM messages_fts WHERE messages_fts MATCH 'message'"
    ).fetchone()[0]
    conn.close()

    self.assertGreater(hits, 0, "FTS5 search should find messages with 'message'")
|
| 150 |
+
|
| 151 |
+
def test_streaming_load_json_messages(self):
    """load_json_messages streams every message record from the export."""
    from indexer import load_json_messages
    loaded = [record for record in load_json_messages(self.json_path)]
    self.assertEqual(len(loaded), 10)
    self.assertIn("text_plain", loaded[0])
|
| 156 |
+
|
| 157 |
+
def test_entities_extracted(self):
    """Messages with links/mentions in text_entities should have entities stored."""
    message = {
        "id": 9001,
        "type": "message",
        "date": "2024-01-01T10:00:00",
        "date_unixtime": "1700000000",
        "from": "Alice",
        "from_id": "user1",
        "text": "Check https://example.com and @bob",
        "text_entities": [
            {"type": "plain", "text": "Check "},
            {"type": "link", "text": "https://example.com"},
            {"type": "plain", "text": " and "},
            {"type": "mention", "text": "@bob"},
        ],
    }
    _write_json(self.json_path, [message])

    from indexer import OptimizedIndexer
    OptimizedIndexer(self.db_path, build_trigrams=False, build_graph=False).index_file(
        self.json_path, show_progress=False
    )

    conn = sqlite3.connect(self.db_path)
    stored = conn.execute(
        "SELECT type, value FROM entities WHERE message_id = 9001"
    ).fetchall()
    conn.close()

    # Both non-plain entity kinds must have been extracted and persisted.
    found_types = {row[0] for row in stored}
    self.assertIn("link", found_types)
    self.assertIn("mention", found_types)
|
| 189 |
+
|
| 190 |
+
|
| 191 |
+
# ---------------------------------------------------------------------------
|
| 192 |
+
# 2. Search Tests
|
| 193 |
+
# ---------------------------------------------------------------------------
|
| 194 |
+
|
| 195 |
+
class TestSearch(unittest.TestCase):
    """Tests for FTS search."""

    def setUp(self):
        # Fresh temp database indexed with 20 sample messages per test.
        self.tmpdir = tempfile.mkdtemp()
        self.db_path = os.path.join(self.tmpdir, "test.db")
        self.json_path = os.path.join(self.tmpdir, "messages.json")
        _write_json(self.json_path, _sample_messages(20))

        from indexer import OptimizedIndexer
        OptimizedIndexer(self.db_path, build_trigrams=False, build_graph=False).index_file(
            self.json_path, show_progress=False
        )

    def tearDown(self):
        import shutil
        shutil.rmtree(self.tmpdir, ignore_errors=True)

    def test_fts_match_query(self):
        """Every FTS hit for 'Alice' actually contains the term."""
        conn = sqlite3.connect(self.db_path)
        conn.row_factory = sqlite3.Row
        hits = conn.execute(
            "SELECT id, text_plain FROM messages WHERE id IN "
            "(SELECT rowid FROM messages_fts WHERE messages_fts MATCH 'Alice')"
        ).fetchall()
        conn.close()
        self.assertGreater(len(hits), 0)
        for hit in hits:
            self.assertIn("Alice", hit["text_plain"])

    def test_fts_returns_no_results_for_nonsense(self):
        """A term that never occurs in the corpus yields zero matches."""
        conn = sqlite3.connect(self.db_path)
        count = conn.execute(
            "SELECT COUNT(*) FROM messages_fts WHERE messages_fts MATCH 'xyzzyplugh'"
        ).fetchone()[0]
        conn.close()
        self.assertEqual(count, 0)
|
| 231 |
+
|
| 232 |
+
|
| 233 |
+
# ---------------------------------------------------------------------------
|
| 234 |
+
# 3. SemanticSearch Empty Embeddings
|
| 235 |
+
# ---------------------------------------------------------------------------
|
| 236 |
+
|
| 237 |
+
try:
|
| 238 |
+
import numpy as np
|
| 239 |
+
HAS_NUMPY = True
|
| 240 |
+
except ImportError:
|
| 241 |
+
HAS_NUMPY = False
|
| 242 |
+
|
| 243 |
+
|
| 244 |
+
@unittest.skipUnless(HAS_NUMPY, "numpy not installed")
class TestSemanticSearchEmpty(unittest.TestCase):
    """Test that SemanticSearch handles missing/empty embeddings gracefully.

    The schema-only embeddings DB fixture was previously duplicated in three
    tests and the temp dir leaked whenever an assertion failed before the
    manual rmtree; setUp/tearDown now create and always clean it up.
    """

    def setUp(self):
        """Create an embeddings DB with the expected schema but zero rows."""
        self.tmpdir = tempfile.mkdtemp()
        self.db_path = os.path.join(self.tmpdir, "empty_emb.db")

        conn = sqlite3.connect(self.db_path)
        conn.execute(
            "CREATE TABLE embeddings (message_id INTEGER PRIMARY KEY, "
            "from_name TEXT, text_preview TEXT, embedding BLOB)"
        )
        conn.commit()
        conn.close()

    def tearDown(self):
        import shutil
        shutil.rmtree(self.tmpdir, ignore_errors=True)

    def test_is_available_missing_db(self):
        from semantic_search import SemanticSearch
        ss = SemanticSearch(embeddings_db="/tmp/nonexistent_embeddings_12345.db")
        self.assertFalse(ss.is_available())

    def test_is_available_empty_db(self):
        from semantic_search import SemanticSearch
        ss = SemanticSearch(embeddings_db=self.db_path)
        self.assertFalse(ss.is_available())

    def test_load_empty_embeddings_no_crash(self):
        from semantic_search import SemanticSearch
        ss = SemanticSearch(embeddings_db=self.db_path)
        ss._load_embeddings()  # Should not crash

        self.assertTrue(ss.embeddings_loaded)
        self.assertEqual(len(ss.message_ids), 0)

    def test_stats_empty_db(self):
        from semantic_search import SemanticSearch
        ss = SemanticSearch(embeddings_db=self.db_path)
        s = ss.stats()
        self.assertTrue(s["available"])  # File exists and table exists
        self.assertEqual(s["count"], 0)
|
| 314 |
+
|
| 315 |
+
|
| 316 |
+
# ---------------------------------------------------------------------------
|
| 317 |
+
# 4. Dashboard Endpoint Tests
|
| 318 |
+
# ---------------------------------------------------------------------------
|
| 319 |
+
|
| 320 |
+
try:
|
| 321 |
+
import flask
|
| 322 |
+
HAS_FLASK = True
|
| 323 |
+
except ImportError:
|
| 324 |
+
HAS_FLASK = False
|
| 325 |
+
|
| 326 |
+
|
| 327 |
+
@unittest.skipUnless(HAS_FLASK, "flask not installed")
class TestDashboardEndpoints(unittest.TestCase):
    """Test Flask dashboard API endpoints."""

    @classmethod
    def setUpClass(cls):
        """Create a test DB and configure Flask test client."""
        # One shared fixture for the whole class: indexing 50 messages per
        # test would be needlessly slow, and the endpoints are read-only.
        cls.tmpdir = tempfile.mkdtemp()
        cls.db_path = os.path.join(cls.tmpdir, "test.db")
        cls.json_path = os.path.join(cls.tmpdir, "messages.json")

        _write_json(cls.json_path, _sample_messages(50))

        from indexer import OptimizedIndexer
        indexer = OptimizedIndexer(cls.db_path, build_trigrams=False, build_graph=False)
        indexer.index_file(cls.json_path, show_progress=False)

        # Point the dashboard module at the fixture DB before creating
        # the test client, so every request reads the test data.
        import dashboard
        dashboard.DB_PATH = cls.db_path
        dashboard.app.config["TESTING"] = True
        cls.client = dashboard.app.test_client()

    @classmethod
    def tearDownClass(cls):
        import shutil
        shutil.rmtree(cls.tmpdir, ignore_errors=True)

    def test_overview_endpoint(self):
        # Basic smoke test: the overview aggregates exist and are non-empty.
        resp = self.client.get("/api/overview?timeframe=all")
        self.assertEqual(resp.status_code, 200)
        data = resp.get_json()
        self.assertIn("total_messages", data)
        self.assertGreater(data["total_messages"], 0)

    def test_users_endpoint(self):
        resp = self.client.get("/api/users?timeframe=all&limit=10")
        self.assertEqual(resp.status_code, 200)
        data = resp.get_json()
        self.assertIn("users", data)
        self.assertGreater(len(data["users"]), 0)
        # Each user entry must carry the fields the UI renders.
        user = data["users"][0]
        for field in ("user_id", "name", "messages", "percentage"):
            self.assertIn(field, user)

    def test_users_include_inactive(self):
        # With include_inactive=0 the endpoint must filter zero-message users.
        resp = self.client.get("/api/users?timeframe=all&include_inactive=0")
        self.assertEqual(resp.status_code, 200)
        data = resp.get_json()
        for user in data["users"]:
            self.assertGreater(user["messages"], 0)

    def test_search_fts_endpoint(self):
        resp = self.client.get("/api/search?q=message&mode=fts&limit=5")
        self.assertEqual(resp.status_code, 200)
        data = resp.get_json()
        self.assertIn("results", data)

    def test_chart_hourly_endpoint(self):
        # Hourly chart always returns one bucket per hour of the day.
        resp = self.client.get("/api/chart/hourly?timeframe=all")
        self.assertEqual(resp.status_code, 200)
        data = resp.get_json()
        self.assertIsInstance(data, list)
        self.assertEqual(len(data), 24)

    def test_chart_daily_endpoint(self):
        resp = self.client.get("/api/chart/daily?timeframe=all")
        self.assertEqual(resp.status_code, 200)
        data = resp.get_json()
        self.assertIsInstance(data, list)

    def test_cache_invalidate_endpoint(self):
        resp = self.client.get("/api/cache/invalidate")
        self.assertEqual(resp.status_code, 200)
        data = resp.get_json()
        self.assertEqual(data["status"], "invalidated")

    def test_page_routes_return_200(self):
        """All page routes should return 200."""
        for route in ("/", "/users", "/search", "/chat", "/moderation", "/settings"):
            resp = self.client.get(route)
            self.assertEqual(resp.status_code, 200, f"Route {route} failed")

    def test_user_profile_endpoint(self):
        # Fetch one real user id first, then request that user's profile.
        resp = self.client.get("/api/users?timeframe=all&limit=1")
        data = resp.get_json()
        if data["users"]:
            uid = data["users"][0]["user_id"]
            resp2 = self.client.get(f"/api/user/{uid}/profile")
            self.assertEqual(resp2.status_code, 200)
            profile = resp2.get_json()
            self.assertIn("total_messages", profile)
            self.assertIn("hourly_activity", profile)

    def test_overview_has_expected_keys(self):
        resp = self.client.get("/api/overview?timeframe=all")
        data = resp.get_json()
        for key in ("total_messages", "total_users", "links_count", "media_count"):
            self.assertIn(key, data, f"Missing key: {key}")
|
| 425 |
+
|
| 426 |
+
|
| 427 |
+
# ---------------------------------------------------------------------------
|
| 428 |
+
# 5. AI Search Schema Test
|
| 429 |
+
# ---------------------------------------------------------------------------
|
| 430 |
+
|
| 431 |
+
class TestAISearchSchema(unittest.TestCase):
    """Test that AI search schema generation matches actual DB.

    Uses ``addCleanup`` for the temp dir so it is removed even when an
    assertion fails (previously the manual rmtree at the end leaked it).
    """

    def test_dynamic_schema_includes_real_columns(self):
        import shutil
        tmpdir = tempfile.mkdtemp()
        self.addCleanup(shutil.rmtree, tmpdir, ignore_errors=True)
        db_path = os.path.join(tmpdir, "test.db")

        # Initialize DB with real schema
        from indexer import init_database
        conn = init_database(db_path)
        conn.close()

        from ai_search import AISearchEngine
        # Create instance without connecting to a provider (skip __init__).
        engine = AISearchEngine.__new__(AISearchEngine)
        engine.db_path = db_path

        schema = engine._get_db_schema()

        # Verify real column names are present
        for column in ("text_plain", "date_unixtime", "has_links", "has_media",
                       "from_id", "participants"):
            self.assertIn(column, schema)

        # Verify old wrong column names are NOT in the dynamic output
        self.assertNotIn("char_count", schema)
        # "media_type" should not be a column name (has_media is the real one)
        lines_lower = schema.lower()
        self.assertNotIn("media_type (", lines_lower)
|
| 467 |
+
|
| 468 |
+
|
| 469 |
+
# ---------------------------------------------------------------------------
|
| 470 |
+
# Runner
|
| 471 |
+
# ---------------------------------------------------------------------------
|
| 472 |
+
|
| 473 |
+
# Run the full suite with per-test verbose output when executed directly.
if __name__ == "__main__":
    unittest.main(verbosity=2)
|
update_hf.py
ADDED
|
@@ -0,0 +1,132 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
"""
Automated sync + deploy to Hugging Face Spaces.

Usage:
    python update_hf.py            # sync + upload everything
    python update_hf.py --db-only  # just upload DB (skip sync)
    python update_hf.py --full     # upload all files + DB (first time or after code changes)
"""

import subprocess
import sys
import os

# === CONFIGURATION ===
# Target Space on the Hub; every API call below uses repo_type="space".
REPO_ID = "rottg/telegram-analytics"
# Paths are resolved relative to this script's own directory so the tool
# works regardless of the current working directory.
PROJECT_DIR = os.path.dirname(os.path.abspath(__file__))
DB_PATH = os.path.join(PROJECT_DIR, "telegram.db")

# Files to upload (code + config)
CODE_FILES = [
    "dashboard.py", "ai_search.py", "algorithms.py", "data_structures.py",
    "indexer.py", "search.py", "semantic_search.py", "schema.sql",
    "Dockerfile", "requirements.txt", "README.md",
]
FOLDERS = ["static", "templates"]

# Token is read from environment variable or .hf_token file.
# NOTE: this runs at import time and exits the process if no credential is
# found — nothing below can work without one.
HF_TOKEN = os.environ.get("HF_TOKEN")
if not HF_TOKEN:
    token_file = os.path.join(PROJECT_DIR, ".hf_token")
    if os.path.exists(token_file):
        with open(token_file) as f:
            HF_TOKEN = f.read().strip()
    else:
        print("ERROR: Set HF_TOKEN env var or create .hf_token file with your token")
        sys.exit(1)
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
def run_sync():
    """Run daily_sync.py as a child process; exit the program if it fails."""
    script = os.path.join(PROJECT_DIR, "daily_sync.py")
    print("\n=== Step 1: Running daily sync ===")
    completed = subprocess.run([sys.executable, script], cwd=PROJECT_DIR)
    if completed.returncode:
        print("ERROR: Sync failed!")
        sys.exit(1)
    print("Sync complete.")
|
| 49 |
+
|
| 50 |
+
|
| 51 |
+
def upload_to_hf(full=False):
    """Upload files to HF Space using the API (no git needed).

    Args:
        full: If True, upload code files, static/template folders and the DB
              in one commit. If False, replace only telegram.db on the Space.

    Fixes vs. previous version: the full branch no longer crashes with an
    unhandled OSError when telegram.db is missing (it warns and skips it),
    and the final print dropped a pointless f-string prefix.
    """
    from huggingface_hub import HfApi

    api = HfApi(token=HF_TOKEN)

    if full:
        # Upload all code files + folders + DB
        print("\n=== Uploading all files to HF ===")

        # Collect all files to upload as (local_path, path_in_repo) pairs.
        upload_files = []
        for f in CODE_FILES:
            path = os.path.join(PROJECT_DIR, f)
            if os.path.exists(path):
                upload_files.append((path, f))

        for folder in FOLDERS:
            folder_path = os.path.join(PROJECT_DIR, folder)
            if os.path.exists(folder_path):
                for root, dirs, files in os.walk(folder_path):
                    for fname in files:
                        full_path = os.path.join(root, fname)
                        rel_path = os.path.relpath(full_path, PROJECT_DIR)
                        # Repo paths always use forward slashes, even on Windows.
                        upload_files.append((full_path, rel_path.replace("\\", "/")))

        # Add DB only if it actually exists; getsize() below would otherwise
        # raise on a missing file.
        if os.path.exists(DB_PATH):
            upload_files.append((DB_PATH, "telegram.db"))
        else:
            print(f"WARNING: {DB_PATH} not found - skipping DB upload")

        print(f"Uploading {len(upload_files)} files...")
        for local_path, repo_path in upload_files:
            size_mb = os.path.getsize(local_path) / (1024 * 1024)
            if size_mb > 1:
                print(f"  {repo_path} ({size_mb:.0f} MB)...")
            else:
                print(f"  {repo_path}")

        # One commit containing exactly the collected paths.
        api.upload_folder(
            folder_path=PROJECT_DIR,
            repo_id=REPO_ID,
            repo_type="space",
            allow_patterns=[f for _, f in upload_files],
        )
    else:
        # DB only - delete old, upload new
        print("\n=== Removing old DB from HF ===")
        try:
            api.delete_file("telegram.db", repo_id=REPO_ID, repo_type="space")
            print("Old DB removed.")
        except Exception as e:
            # Best-effort: the file may simply not exist on the Space yet.
            print(f"No old DB to remove ({e})")

        print("\n=== Uploading new DB to HF ===")
        db_size_mb = os.path.getsize(DB_PATH) / (1024 * 1024)
        print(f"Uploading {db_size_mb:.0f} MB...")

        api.upload_file(
            path_or_fileobj=DB_PATH,
            path_in_repo="telegram.db",
            repo_id=REPO_ID,
            repo_type="space",
        )

    print("Upload complete!")
    print("\nSite will rebuild at: https://rottg-telegram-analytics.hf.space")
|
| 116 |
+
|
| 117 |
+
|
| 118 |
+
def main():
    """CLI entry point: parse flags, optionally sync, then upload.

    Flags:
        --db-only: skip the local sync step and upload only the database.
        --full:    upload code files and folders in addition to the database.
    """
    db_only = "--db-only" in sys.argv
    full = "--full" in sys.argv

    if not db_only:
        run_sync()
    else:
        print("Skipping sync (--db-only)")

    # The previous expression `full or not db_only and "--db-only" not in
    # sys.argv` was redundant: db_only already equals ("--db-only" in
    # sys.argv), so it reduces to `full or not db_only`. Behavior unchanged:
    # a plain run performs a full upload; --db-only uploads just the DB.
    upload_to_hf(full=full or not db_only)
    print("\n=== Done! ===")


if __name__ == "__main__":
    main()
|
vector_search.py
ADDED
|
@@ -0,0 +1,448 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
"""
Vector Search Module for Semantic Similarity

Optional module that adds semantic search capabilities using:
- Sentence embeddings (sentence-transformers)
- FAISS for efficient similarity search

Dependencies (optional, install with):
    pip install sentence-transformers faiss-cpu numpy

If dependencies are not installed, the module gracefully degrades.
"""

import sqlite3
import pickle
from pathlib import Path
from typing import Optional

# Try importing optional dependencies
# Default to unavailable; recomputed below once all imports were attempted.
VECTOR_SEARCH_AVAILABLE = False
try:
    import numpy as np
    NUMPY_AVAILABLE = True
except ImportError:
    NUMPY_AVAILABLE = False
    np = None  # placeholder so the name exists; any use fails explicitly

try:
    import faiss
    FAISS_AVAILABLE = True
except ImportError:
    FAISS_AVAILABLE = False
    faiss = None

try:
    from sentence_transformers import SentenceTransformer
    SENTENCE_TRANSFORMERS_AVAILABLE = True
except ImportError:
    SENTENCE_TRANSFORMERS_AVAILABLE = False
    SentenceTransformer = None

# True only when every optional dependency imported successfully.
VECTOR_SEARCH_AVAILABLE = all([NUMPY_AVAILABLE, FAISS_AVAILABLE, SENTENCE_TRANSFORMERS_AVAILABLE])
|
| 44 |
+
|
| 45 |
+
|
| 46 |
+
class VectorSearchUnavailable:
    """Stand-in used when the optional vector-search dependencies are absent.

    Accepts any constructor arguments, and every attribute access yields a
    callable that raises RuntimeError with installation instructions.
    """

    def __init__(self, *args, **kwargs):
        pass

    def __getattr__(self, name):
        # Any unknown attribute resolves to a function that always fails,
        # so call sites get a clear error instead of an AttributeError.
        def _unavailable(*args, **kwargs):
            raise RuntimeError(
                "Vector search requires additional dependencies. Install with:\n"
                "pip install sentence-transformers faiss-cpu numpy"
            )
        return _unavailable
|
| 59 |
+
|
| 60 |
+
|
| 61 |
+
class VectorSearch:
|
| 62 |
+
"""
|
| 63 |
+
Semantic search using sentence embeddings and FAISS.
|
| 64 |
+
|
| 65 |
+
Features:
|
| 66 |
+
- Generate embeddings for messages
|
| 67 |
+
- Build FAISS index for fast similarity search
|
| 68 |
+
- Find semantically similar messages (not just keyword match)
|
| 69 |
+
- Supports Hebrew and multilingual text
|
| 70 |
+
|
| 71 |
+
Example:
|
| 72 |
+
vs = VectorSearch(db_path='telegram.db')
|
| 73 |
+
vs.build_index() # One-time, can take a while
|
| 74 |
+
|
| 75 |
+
# Find similar messages
|
| 76 |
+
results = vs.search("מה קורה היום?", limit=10)
|
| 77 |
+
for msg_id, score, text in results:
|
| 78 |
+
print(f"{score:.3f}: {text[:50]}")
|
| 79 |
+
"""
|
| 80 |
+
|
| 81 |
+
# Recommended models for multilingual/Hebrew support
|
| 82 |
+
MODELS = {
|
| 83 |
+
'fast': 'paraphrase-multilingual-MiniLM-L12-v2', # Fast, good multilingual
|
| 84 |
+
'accurate': 'paraphrase-multilingual-mpnet-base-v2', # More accurate
|
| 85 |
+
'small': 'all-MiniLM-L6-v2', # Smallest, English-focused
|
| 86 |
+
}
|
| 87 |
+
|
| 88 |
+
def __init__(
    self,
    db_path: str = 'telegram.db',
    model_name: str = 'fast',
    index_path: Optional[str] = None
):
    """
    Initialize vector search.

    Args:
        db_path: Path to SQLite database
        model_name: Model preset ('fast', 'accurate', 'small') or full model name
        index_path: Path to save/load FAISS index (default: db_path + '.faiss')

    Raises:
        RuntimeError: if numpy/faiss/sentence-transformers are not installed.

    Note: loading the sentence-transformers model happens here, so
    construction can take seconds and may download model weights on first
    use (behavior of SentenceTransformer — confirm in its docs).
    """
    if not VECTOR_SEARCH_AVAILABLE:
        raise RuntimeError(
            "Vector search requires additional dependencies. Install with:\n"
            "pip install sentence-transformers faiss-cpu numpy"
        )

    self.db_path = db_path
    self.index_path = index_path or f"{db_path}.faiss"
    # The FAISS-position -> message-id map is persisted next to the index.
    self.id_map_path = f"{self.index_path}.ids"

    # Load model: presets resolve via MODELS; unknown names pass through
    # unchanged so any full sentence-transformers model id can be used.
    model_id = self.MODELS.get(model_name, model_name)
    print(f"Loading embedding model: {model_id}")
    self.model = SentenceTransformer(model_id)
    self.dimension = self.model.get_sentence_embedding_dimension()

    # Initialize FAISS index
    self.index = None
    self.id_map: list[int] = []  # Maps FAISS index position to message_id

    # Try to load existing index
    if Path(self.index_path).exists():
        self.load_index()
|
| 125 |
+
|
| 126 |
+
def _get_connection(self) -> sqlite3.Connection:
|
| 127 |
+
"""Get database connection."""
|
| 128 |
+
conn = sqlite3.connect(self.db_path)
|
| 129 |
+
conn.row_factory = sqlite3.Row
|
| 130 |
+
return conn
|
| 131 |
+
|
| 132 |
+
def encode(self, texts: list[str], batch_size: int = 32, show_progress: bool = True) -> 'np.ndarray':
    """
    Encode texts to embeddings.

    Args:
        texts: List of texts to encode
        batch_size: Batch size for encoding
        show_progress: Show progress bar

    Returns:
        numpy array of shape (n_texts, dimension)
    """
    # normalize_embeddings=True makes the inner product of two embeddings
    # equal to their cosine similarity, which is what the IndexFlatIP
    # index built in build_index() computes.
    return self.model.encode(
        texts,
        batch_size=batch_size,
        show_progress_bar=show_progress,
        convert_to_numpy=True,
        normalize_embeddings=True  # For cosine similarity
    )
|
| 151 |
+
|
| 152 |
+
def build_index(
    self,
    batch_size: int = 1000,
    min_text_length: int = 10,
    use_gpu: bool = False
) -> None:
    """
    Build FAISS index from all messages in database.

    Replaces any existing in-memory index, encodes every message whose
    text is at least min_text_length characters, and persists the result
    via save_index().

    Args:
        batch_size: Number of messages to process at once
        min_text_length: Minimum text length to index
        use_gpu: Use GPU acceleration if available
    """
    conn = self._get_connection()

    # Count messages
    cursor = conn.execute(
        'SELECT COUNT(*) FROM messages WHERE length(text_plain) >= ?',
        (min_text_length,)
    )
    total = cursor.fetchone()[0]
    print(f"Building index for {total} messages...")

    # Create FAISS index
    # Using IndexFlatIP (Inner Product) since we normalize embeddings
    self.index = faiss.IndexFlatIP(self.dimension)

    if use_gpu and faiss.get_num_gpus() > 0:
        print("Using GPU acceleration")
        self.index = faiss.index_cpu_to_gpu(
            faiss.StandardGpuResources(),
            0,
            self.index
        )

    # Rebuilt from scratch: position i in the FAISS index corresponds
    # to message id self.id_map[i].
    self.id_map = []

    # Process in batches
    # NOTE(review): LIMIT/OFFSET pagination rescans from the start of the
    # result set on each page; fine for moderate tables, consider keyset
    # pagination (WHERE id > last_id) if this becomes slow on large DBs.
    offset = 0
    while offset < total:
        cursor = conn.execute(
            '''
            SELECT id, text_plain FROM messages
            WHERE length(text_plain) >= ?
            ORDER BY id
            LIMIT ? OFFSET ?
            ''',
            (min_text_length, batch_size, offset)
        )

        rows = cursor.fetchall()
        if not rows:
            break

        ids = [row['id'] for row in rows]
        texts = [row['text_plain'] for row in rows]

        # Encode batch
        embeddings = self.encode(texts, show_progress=False)

        # Add to index
        self.index.add(embeddings)
        self.id_map.extend(ids)

        offset += len(rows)
        print(f"Indexed {offset}/{total} messages ({100*offset/total:.1f}%)")

    conn.close()

    # Save index
    self.save_index()
    print(f"Index built: {self.index.ntotal} vectors")
|
| 225 |
+
|
| 226 |
+
def save_index(self) -> None:
    """Save FAISS index and ID map to disk.

    No-op when no index has been built yet. The index is written to
    self.index_path and the position->message_id map is pickled next to
    it at self.id_map_path.
    """
    if self.index is None:
        return

    # Convert GPU index to CPU for saving (GPU indexes cannot be
    # serialized directly by write_index).
    if hasattr(faiss, 'index_gpu_to_cpu'):
        try:
            cpu_index = faiss.index_gpu_to_cpu(self.index)
        except Exception:
            # Narrowed from a bare `except:` which also swallowed
            # KeyboardInterrupt/SystemExit. Conversion fails when the
            # index is already a CPU index — use it as-is.
            cpu_index = self.index
    else:
        cpu_index = self.index

    faiss.write_index(cpu_index, self.index_path)

    with open(self.id_map_path, 'wb') as f:
        pickle.dump(self.id_map, f)

    print(f"Index saved to {self.index_path}")
|
| 246 |
+
|
| 247 |
+
def load_index(self) -> bool:
    """Load FAISS index and ID map from disk.

    Returns:
        True on success, False if either file is missing or unreadable.

    Robustness fix: the index and id map are now loaded into locals and
    assigned together, so a failure reading the pickled id map no longer
    leaves self.index populated while self.id_map is stale/empty.
    """
    try:
        index = faiss.read_index(self.index_path)
        with open(self.id_map_path, 'rb') as f:
            id_map = pickle.load(f)
    except Exception as e:
        print(f"Could not load index: {e}")
        return False

    # Commit both pieces atomically only after both loads succeeded.
    self.index = index
    self.id_map = id_map
    print(f"Loaded index with {self.index.ntotal} vectors")
    return True
|
| 258 |
+
|
| 259 |
+
def search(
    self,
    query: str,
    limit: int = 10,
    min_score: float = 0.0
) -> list[tuple[int, float, str]]:
    """
    Search for semantically similar messages.

    Args:
        query: Search query text
        limit: Maximum results to return
        min_score: Minimum similarity score (0-1)

    Returns:
        List of (message_id, score, text) tuples, ordered by descending
        similarity. Messages whose rows have since disappeared from the
        DB are silently dropped.

    Raises:
        RuntimeError: If no index has been built or loaded yet.
    """
    if self.index is None or self.index.ntotal == 0:
        raise RuntimeError("Index not built. Call build_index() first.")

    # Encode the query with the same model used for indexing.
    query_vector = self.encode([query], show_progress=False)

    # FAISS returns parallel arrays of scores and row positions; -1 marks
    # "no result" slots when fewer than `limit` vectors exist.
    scores, indices = self.index.search(query_vector, limit)

    # Translate FAISS row positions to message IDs, applying the score floor.
    hits = [
        (self.id_map[idx], float(score))
        for score, idx in zip(scores[0], indices[0])
        if idx != -1 and score >= min_score
    ]
    if not hits:
        return []

    conn = self._get_connection()
    try:
        # FIX: fetch all hit texts in one parameterized IN query instead of
        # one SELECT per result (N+1 pattern). Placeholders keep it safe.
        ids = [msg_id for msg_id, _ in hits]
        placeholders = ','.join('?' * len(ids))
        cursor = conn.execute(
            f'SELECT id, text_plain FROM messages WHERE id IN ({placeholders})',
            ids
        )
        texts = {row['id']: row['text_plain'] for row in cursor.fetchall()}
    finally:
        # try/finally so an execute/fetch error cannot leak the connection.
        conn.close()

    # Preserve FAISS's descending-score order when assembling results.
    return [
        (msg_id, score, texts[msg_id])
        for msg_id, score in hits
        if msg_id in texts
    ]
def find_similar(
    self,
    message_id: int,
    limit: int = 10,
    exclude_same_user: bool = False
) -> list[tuple[int, float, str]]:
    """
    Find messages similar to a specific message.

    Args:
        message_id: ID of the reference message
        limit: Maximum results to return
        exclude_same_user: Exclude messages from same user

    Returns:
        List of (message_id, score, text) tuples. Empty when the
        reference message does not exist.
    """
    conn = self._get_connection()
    try:
        # Look up the reference message's text (search key) and author
        # (needed for the same-user filter below).
        cursor = conn.execute(
            'SELECT text_plain, from_id FROM messages WHERE id = ?',
            (message_id,)
        )
        row = cursor.fetchone()
    finally:
        conn.close()

    if not row:
        return []

    reference_text = row['text_plain']
    reference_user = row['from_id']

    # Over-fetch (2x) so enough candidates survive the filtering below.
    candidates = self.search(reference_text, limit=limit * 2)

    filtered: list[tuple[int, float, str]] = []
    conn = self._get_connection()
    try:
        for msg_id, score, text in candidates:
            # The reference message is its own best match; skip it.
            if msg_id == message_id:
                continue
            if exclude_same_user:
                # FIX: reuse one connection for the whole loop instead of
                # opening and closing a fresh connection per candidate.
                cursor = conn.execute(
                    'SELECT from_id FROM messages WHERE id = ?',
                    (msg_id,)
                )
                msg_row = cursor.fetchone()
                if msg_row and msg_row['from_id'] == reference_user:
                    continue
            filtered.append((msg_id, score, text))
            if len(filtered) >= limit:
                break
    finally:
        conn.close()

    return filtered
def cluster_messages(
    self,
    n_clusters: int = 10,
    sample_size: Optional[int] = None
) -> dict[int, list[int]]:
    """
    Group indexed messages into semantic clusters via FAISS K-means.

    Args:
        n_clusters: Number of clusters
        sample_size: Number of messages to sample (None = all)

    Returns:
        Dict mapping cluster_id to list of message_ids

    Raises:
        RuntimeError: If no index has been built or loaded yet.
    """
    if self.index is None or self.index.ntotal == 0:
        raise RuntimeError("Index not built. Call build_index() first.")

    total = self.index.ntotal

    # Reconstruct the stored vectors, optionally on a random subsample.
    if sample_size and sample_size < total:
        chosen = np.random.choice(total, sample_size, replace=False)
        matrix = np.array([self.index.reconstruct(int(pos)) for pos in chosen])
        member_ids = [self.id_map[pos] for pos in chosen]
    else:
        matrix = np.array([self.index.reconstruct(pos) for pos in range(total)])
        member_ids = self.id_map

    # Train K-means on the (sampled) embeddings.
    kmeans = faiss.Kmeans(self.dimension, n_clusters, niter=20, verbose=True)
    kmeans.train(matrix)

    # Nearest-centroid lookup yields each vector's cluster label.
    _, labels = kmeans.index.search(matrix, 1)

    # Bucket message IDs by their assigned cluster.
    buckets: dict[int, list[int]] = {c: [] for c in range(n_clusters)}
    for msg_id, label in zip(member_ids, labels.flatten()):
        buckets[int(label)].append(msg_id)

    return buckets
@property
def stats(self) -> dict:
    """Get index statistics.

    Returns a dict with dependency availability, model/embedding info,
    the number of indexed vectors, and the on-disk index path.
    """
    return {
        'available': VECTOR_SEARCH_AVAILABLE,
        # NOTE(review): this key is named 'model' but returns the
        # embedding dimension, duplicating 'dimension' below — looks
        # like it was meant to report the model name; confirm intent.
        'model': self.model.get_sentence_embedding_dimension() if self.model else None,
        'dimension': self.dimension,
        'index_size': self.index.ntotal if self.index else 0,
        'index_path': self.index_path
    }
| 416 |
+
# Public alias: callers import SemanticSearch and get the real implementation
# when the optional dependencies are installed, or the stub otherwise.
SemanticSearch = VectorSearch if VECTOR_SEARCH_AVAILABLE else VectorSearchUnavailable
def check_dependencies() -> dict:
    """Report which optional dependencies are importable.

    Returns a mapping of dependency name to availability flag, plus the
    combined 'vector_search_available' flag.
    """
    return dict(
        numpy=NUMPY_AVAILABLE,
        faiss=FAISS_AVAILABLE,
        sentence_transformers=SENTENCE_TRANSFORMERS_AVAILABLE,
        vector_search_available=VECTOR_SEARCH_AVAILABLE,
    )
if __name__ == '__main__':
    # Self-check: print dependency availability and a short usage hint.
    print("=== Vector Search Dependencies ===")
    for name, available in check_dependencies().items():
        status = "✓" if available else "✗"
        print(f" {status} {name}")

    if not VECTOR_SEARCH_AVAILABLE:
        print("\nTo enable vector search, install dependencies:")
        print(" pip install sentence-transformers faiss-cpu numpy")
    else:
        print("\nVector search is available!")
        print("Usage:")
        print(" vs = VectorSearch('telegram.db')")
        print(" vs.build_index() # One-time indexing")
        print(" results = vs.search('מה קורה?')")