rottg commited on
Commit
4a21e7e
·
1 Parent(s): a99d4dc

Upload folder using huggingface_hub

Browse files
analyzer.py ADDED
@@ -0,0 +1,881 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Telegram Chat Analytics (Enhanced with Course Algorithms)
4
+
5
+ Features:
6
+ - LCS-based similar message detection
7
+ - Heap-based Top-K (O(n log k) instead of O(n log n))
8
+ - Selection algorithm for O(n) median/percentiles
9
+ - Rank Tree for order statistics queries
10
+ - Bucket Sort for time-based histograms
11
+
12
+ Usage:
13
+ python analyzer.py --db telegram.db [options]
14
+ python analyzer.py --stats
15
+ python analyzer.py --top-users
16
+ python analyzer.py --similar # NEW: Find similar messages
17
+ python analyzer.py --percentiles # NEW: Message length percentiles
18
+ python analyzer.py --user-rank USER # NEW: Get user's rank
19
+ """
20
+
21
+ import sqlite3
22
+ import argparse
23
+ import json
24
+ from collections import Counter
25
+ from datetime import datetime
26
+ from typing import Optional
27
+ import re
28
+
29
+ # Import course algorithms
30
+ from algorithms import (
31
+ # LCS
32
+ lcs_similarity, find_similar_messages,
33
+ # Top-K
34
+ TopK, top_k_frequent, top_k_by_field,
35
+ # Selection
36
+ find_median, find_percentile,
37
+ # Rank Tree
38
+ RankTree,
39
+ # Bucket Sort
40
+ bucket_sort_by_time, time_histogram, hourly_distribution,
41
+ # Combined
42
+ RankedTimeIndex
43
+ )
44
+
45
+
46
class TelegramAnalyzer:
    """
    Analytics interface for indexed Telegram messages.

    Enhanced with efficient algorithms:
    - Top-K queries: O(n log k) using heap
    - Percentiles: O(n) using selection algorithm
    - Rank queries: O(log n) using rank tree
    - Similar messages: LCS-based detection

    Usable as a context manager; the SQLite connection is closed on exit.
    """

    def __init__(self, db_path: str = 'telegram.db'):
        self.db_path = db_path
        self.conn = sqlite3.connect(db_path)
        # Named-column access on result rows (row['from_id'] etc.).
        self.conn.row_factory = sqlite3.Row

        # Lazy-loaded data structures (built on first use).
        self._rank_tree: Optional[RankTree] = None
        self._time_index: Optional[RankedTimeIndex] = None

    def close(self):
        """Close the underlying SQLite connection."""
        self.conn.close()

    def __enter__(self):
        return self

    def __exit__(self, *args):
        self.close()

    # ==========================================
    # ORIGINAL METHODS (kept for compatibility)
    # ==========================================

    def get_stats(self) -> dict:
        """Get general statistics about the indexed data."""
        stats = {}

        cursor = self.conn.execute('SELECT COUNT(*) FROM messages')
        stats['total_messages'] = cursor.fetchone()[0]

        cursor = self.conn.execute('SELECT COUNT(DISTINCT from_id) FROM messages')
        stats['total_users'] = cursor.fetchone()[0]

        cursor = self.conn.execute('''
            SELECT MIN(date_unixtime), MAX(date_unixtime) FROM messages
            WHERE date_unixtime IS NOT NULL
        ''')
        row = cursor.fetchone()
        if row[0] and row[1]:
            stats['first_message'] = datetime.fromtimestamp(row[0]).isoformat()
            stats['last_message'] = datetime.fromtimestamp(row[1]).isoformat()
            stats['days_span'] = (row[1] - row[0]) // 86400

        cursor = self.conn.execute('SELECT COUNT(*) FROM messages WHERE has_media = 1')
        stats['messages_with_media'] = cursor.fetchone()[0]

        cursor = self.conn.execute('SELECT COUNT(*) FROM messages WHERE has_links = 1')
        stats['messages_with_links'] = cursor.fetchone()[0]

        cursor = self.conn.execute('SELECT COUNT(*) FROM messages WHERE has_mentions = 1')
        stats['messages_with_mentions'] = cursor.fetchone()[0]

        cursor = self.conn.execute('SELECT COUNT(*) FROM messages WHERE forwarded_from IS NOT NULL')
        stats['forwarded_messages'] = cursor.fetchone()[0]

        cursor = self.conn.execute('SELECT COUNT(*) FROM messages WHERE reply_to_message_id IS NOT NULL')
        stats['reply_messages'] = cursor.fetchone()[0]

        cursor = self.conn.execute('SELECT COUNT(*) FROM messages WHERE is_edited = 1')
        stats['edited_messages'] = cursor.fetchone()[0]

        cursor = self.conn.execute('SELECT type, COUNT(*) FROM entities GROUP BY type')
        stats['entities'] = {row[0]: row[1] for row in cursor.fetchall()}

        # Percentile stats using the O(n) selection algorithm.
        lengths = self._get_message_lengths()
        if lengths:
            stats['median_message_length'] = find_median(lengths)
            stats['p90_message_length'] = find_percentile(lengths, 90)

        return stats

    def _get_message_lengths(self) -> list[int]:
        """Get all message lengths for statistical analysis."""
        cursor = self.conn.execute(
            'SELECT length(text_plain) FROM messages WHERE text_plain IS NOT NULL'
        )
        # Drop NULL/zero lengths so percentile math only sees real text.
        return [row[0] for row in cursor.fetchall() if row[0]]

    # ==========================================
    # ENHANCED TOP-K METHODS (using Heap)
    # ==========================================

    def get_top_users(self, limit: int = 20) -> list[dict]:
        """
        Get most active users by message count.

        Uses Heap-based Top-K: O(n log k) instead of O(n log n)
        """
        cursor = self.conn.execute('''
            SELECT
                from_id,
                from_name,
                COUNT(*) as message_count,
                SUM(has_links) as links_shared,
                SUM(has_media) as media_shared,
                MIN(date_unixtime) as first_message,
                MAX(date_unixtime) as last_message
            FROM messages
            WHERE from_id IS NOT NULL AND from_id != ''
            GROUP BY from_id
        ''')

        # Use heap-based Top-K
        top = TopK(limit, key=lambda x: x['message_count'])
        for row in cursor.fetchall():
            top.push(dict(row))

        return top.get_top()

    def get_top_words_heap(self, limit: int = 50, min_length: int = 3) -> list[tuple[str, int]]:
        """
        Get most frequent words using Heap-based Top-K.

        O(n + m log k) where n=total words, m=unique words, k=limit
        """
        cursor = self.conn.execute('SELECT text_plain FROM messages WHERE text_plain IS NOT NULL')

        # Hebrew block (U+0590-U+05FF) plus Latin letters.
        word_pattern = re.compile(r'[\u0590-\u05FFa-zA-Z]+')
        words = []

        for row in cursor.fetchall():
            text = row[0]
            for word in word_pattern.findall(text.lower()):
                if len(word) >= min_length:
                    words.append(word)

        return top_k_frequent(words, limit)

    def get_top_domains_heap(self, limit: int = 20) -> list[tuple[str, int]]:
        """Get most shared domains using Heap-based Top-K."""
        cursor = self.conn.execute("SELECT value FROM entities WHERE type = 'link'")

        domain_pattern = re.compile(r'https?://(?:www\.)?([^/]+)')
        domains = []

        for row in cursor.fetchall():
            match = domain_pattern.match(row[0])
            if match:
                domains.append(match.group(1))

        return top_k_frequent(domains, limit)

    # ==========================================
    # LCS-BASED SIMILAR MESSAGE DETECTION
    # ==========================================

    def find_similar_messages(
        self,
        threshold: float = 0.7,
        min_length: int = 30,
        limit: int = 100,
        sample_size: int = 1000
    ) -> list[tuple[int, int, float, str, str]]:
        """
        Find similar/duplicate messages using LCS algorithm.

        Args:
            threshold: Minimum similarity (0-1)
            min_length: Minimum message length to consider
            limit: Maximum pairs to return
            sample_size: Sample size for large datasets

        Returns:
            List of (id1, id2, similarity, text1, text2) tuples
        """
        cursor = self.conn.execute('''
            SELECT id, text_plain FROM messages
            WHERE text_plain IS NOT NULL AND length(text_plain) >= ?
            ORDER BY RANDOM()
            LIMIT ?
        ''', (min_length, sample_size))

        messages = [(row[0], row[1]) for row in cursor.fetchall()]

        # Delegates to the module-level find_similar_messages() imported
        # from algorithms (this method only shares its name).
        similar_pairs = find_similar_messages(messages, threshold, min_length)

        # Fetch full text for results
        results = []
        for id1, id2, sim in similar_pairs[:limit]:
            cursor = self.conn.execute(
                'SELECT text_plain FROM messages WHERE id IN (?, ?)',
                (id1, id2)
            )
            rows = cursor.fetchall()
            if len(rows) == 2:
                results.append((id1, id2, sim, rows[0][0][:100], rows[1][0][:100]))

        return results

    def find_reposts(self, threshold: float = 0.9) -> list[dict]:
        """
        Find potential reposts (very similar messages from different users).

        NOTE: O(n^2) pairwise LCS over the 500 most recent messages; the
        LIMIT keeps the cost bounded.
        """
        cursor = self.conn.execute('''
            SELECT id, from_id, text_plain FROM messages
            WHERE text_plain IS NOT NULL AND length(text_plain) >= 50
            ORDER BY date_unixtime DESC
            LIMIT 500
        ''')

        messages = [(row[0], row[1], row[2]) for row in cursor.fetchall()]
        reposts = []

        for i in range(len(messages)):
            for j in range(i + 1, len(messages)):
                id1, user1, text1 = messages[i]
                id2, user2, text2 = messages[j]

                # Only consider different users
                if user1 == user2:
                    continue

                sim = lcs_similarity(text1, text2)
                if sim >= threshold:
                    reposts.append({
                        'message_id_1': id1,
                        'message_id_2': id2,
                        'user_1': user1,
                        'user_2': user2,
                        'similarity': sim,
                        'text_preview': text1[:80]
                    })

        return sorted(reposts, key=lambda x: x['similarity'], reverse=True)

    # ==========================================
    # SELECTION ALGORITHM (PERCENTILES)
    # ==========================================

    def get_message_length_stats(self) -> dict:
        """
        Get message length statistics using O(n) Selection algorithm.

        Much faster than sorting for percentile calculations.
        Returns an empty dict when there are no text messages.
        """
        lengths = self._get_message_lengths()

        if not lengths:
            return {}

        return {
            'count': len(lengths),
            'min': min(lengths),
            'max': max(lengths),
            'median': find_median(lengths),
            'p25': find_percentile(lengths, 25),
            'p75': find_percentile(lengths, 75),
            'p90': find_percentile(lengths, 90),
            'p95': find_percentile(lengths, 95),
            'p99': find_percentile(lengths, 99),
        }

    def get_response_time_percentiles(self) -> dict:
        """
        Calculate response time percentiles for replies.

        Uses Selection algorithm for O(n) percentile calculation.
        Returns an empty dict when there are no replies.
        """
        cursor = self.conn.execute('''
            SELECT
                m1.date_unixtime - m2.date_unixtime as response_time
            FROM messages m1
            JOIN messages m2 ON m1.reply_to_message_id = m2.id
            WHERE m1.date_unixtime > m2.date_unixtime
        ''')

        times = [row[0] for row in cursor.fetchall() if row[0] and row[0] > 0]

        if not times:
            return {}

        return {
            'count': len(times),
            'median_seconds': find_median(times),
            'p75_seconds': find_percentile(times, 75),
            'p90_seconds': find_percentile(times, 90),
            'p95_seconds': find_percentile(times, 95),
        }

    # ==========================================
    # RANK TREE (ORDER STATISTICS)
    # ==========================================

    def _build_user_rank_tree(self) -> RankTree:
        """Build (and cache) the rank tree for user activity ranking."""
        if self._rank_tree is not None:
            return self._rank_tree

        self._rank_tree = RankTree()

        cursor = self.conn.execute('''
            SELECT from_id, from_name, COUNT(*) as msg_count
            FROM messages
            WHERE from_id IS NOT NULL AND from_id != ''
            GROUP BY from_id
        ''')

        for row in cursor.fetchall():
            self._rank_tree.insert(
                row['msg_count'],
                {'user_id': row['from_id'], 'name': row['from_name'], 'count': row['msg_count']}
            )

        return self._rank_tree

    def get_user_rank(self, user_id: str) -> dict:
        """
        Get a user's rank among all users.

        Uses Rank Tree: O(log n) instead of O(n log n)
        Returns {'error': ...} when the user has no messages.
        """
        tree = self._build_user_rank_tree()

        # Get user's message count
        cursor = self.conn.execute(
            'SELECT COUNT(*) FROM messages WHERE from_id = ?',
            (user_id,)
        )
        count = cursor.fetchone()[0]

        if count == 0:
            return {'error': 'User not found'}

        rank = tree.rank(count)
        total_users = len(tree)

        return {
            'user_id': user_id,
            'message_count': count,
            'rank': total_users - rank + 1,  # Reverse for "top" ranking
            'total_users': total_users,
            'percentile': ((total_users - rank) / total_users) * 100
        }

    def get_user_by_rank(self, rank: int) -> Optional[dict]:
        """
        Get the user at a specific rank (1 = most active).

        Uses Rank Tree select(): O(log n)
        Returns None for an out-of-range rank.
        """
        tree = self._build_user_rank_tree()
        total = len(tree)

        if rank < 1 or rank > total:
            return None

        # Convert to tree rank (reverse order for "top")
        tree_rank = total - rank + 1
        return tree.select(tree_rank)

    # ==========================================
    # BUCKET SORT (TIME-BASED HISTOGRAMS)
    # ==========================================

    def get_activity_histogram(
        self,
        bucket_size: int = 86400,  # 1 day default
        start_time: Optional[int] = None,
        end_time: Optional[int] = None
    ) -> list[tuple[str, int]]:
        """
        Get activity histogram using Bucket Sort.

        O(n + k) where k = number of buckets

        Args:
            bucket_size: Bucket size in seconds (default: 1 day)
            start_time: Inclusive start timestamp (default: earliest message)
            end_time: Inclusive end timestamp (default: latest message)

        Returns:
            List of (date_string, count) tuples
        """
        cursor = self.conn.execute(
            'SELECT id, date_unixtime FROM messages WHERE date_unixtime IS NOT NULL'
        )
        records = [{'id': row[0], 'date_unixtime': row[1]} for row in cursor.fetchall()]

        # BUG FIX: start_time/end_time were previously accepted but ignored.
        if start_time is not None:
            records = [r for r in records if r['date_unixtime'] >= start_time]
        if end_time is not None:
            records = [r for r in records if r['date_unixtime'] <= end_time]

        if not records:
            return []

        hist = time_histogram(records, 'date_unixtime', bucket_size)

        # Format timestamps as dates
        return [
            (datetime.fromtimestamp(ts).strftime('%Y-%m-%d %H:%M'), count)
            for ts, count in hist
        ]

    def get_hourly_distribution(self) -> dict[int, int]:
        """
        Get message distribution by hour of day.

        Uses Bucket Sort: O(n)
        """
        cursor = self.conn.execute(
            'SELECT id, date_unixtime FROM messages WHERE date_unixtime IS NOT NULL'
        )
        records = [{'id': row[0], 'date_unixtime': row[1]} for row in cursor.fetchall()]

        return hourly_distribution(records, 'date_unixtime')

    # ==========================================
    # ORIGINAL METHODS (kept for compatibility)
    # ==========================================

    def get_hourly_activity(self) -> dict[int, int]:
        """Get message count by hour of day (SQL GROUP BY variant)."""
        sql = '''
            SELECT
                CAST(strftime('%H', datetime(date_unixtime, 'unixepoch')) AS INTEGER) as hour,
                COUNT(*) as count
            FROM messages
            WHERE date_unixtime IS NOT NULL
            GROUP BY hour
            ORDER BY hour
        '''
        cursor = self.conn.execute(sql)
        return {row[0]: row[1] for row in cursor.fetchall()}

    def get_daily_activity(self) -> dict[str, int]:
        """Get message count by day of week."""
        # strftime('%w') yields 0 = Sunday ... 6 = Saturday.
        days = ['Sunday', 'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday']
        sql = '''
            SELECT
                CAST(strftime('%w', datetime(date_unixtime, 'unixepoch')) AS INTEGER) as day,
                COUNT(*) as count
            FROM messages
            WHERE date_unixtime IS NOT NULL
            GROUP BY day
            ORDER BY day
        '''
        cursor = self.conn.execute(sql)
        return {days[row[0]]: row[1] for row in cursor.fetchall()}

    def get_monthly_activity(self) -> dict[str, int]:
        """Get message count by month (keys formatted as 'YYYY-MM')."""
        sql = '''
            SELECT
                strftime('%Y-%m', datetime(date_unixtime, 'unixepoch')) as month,
                COUNT(*) as count
            FROM messages
            WHERE date_unixtime IS NOT NULL
            GROUP BY month
            ORDER BY month
        '''
        cursor = self.conn.execute(sql)
        return {row[0]: row[1] for row in cursor.fetchall()}

    def get_top_domains(self, limit: int = 20) -> list[tuple[str, int]]:
        """Get most shared domains from links."""
        return self.get_top_domains_heap(limit)

    def get_top_mentioned(self, limit: int = 20) -> list[tuple[str, int]]:
        """Get most mentioned users/channels."""
        sql = '''
            SELECT value, COUNT(*) as count
            FROM entities
            WHERE type = 'mention'
            GROUP BY value
            ORDER BY count DESC
            LIMIT ?
        '''
        cursor = self.conn.execute(sql, (limit,))
        return [(row[0], row[1]) for row in cursor.fetchall()]

    def get_forwarded_sources(self, limit: int = 20) -> list[dict]:
        """Get top sources of forwarded messages."""
        sql = '''
            SELECT
                forwarded_from,
                forwarded_from_id,
                COUNT(*) as count
            FROM messages
            WHERE forwarded_from IS NOT NULL
            GROUP BY forwarded_from_id
            ORDER BY count DESC
            LIMIT ?
        '''
        cursor = self.conn.execute(sql, (limit,))
        return [dict(row) for row in cursor.fetchall()]

    def get_word_frequency(self, limit: int = 50, min_length: int = 3) -> list[tuple[str, int]]:
        """Get most frequent words using Heap-based Top-K."""
        return self.get_top_words_heap(limit, min_length)

    def get_reply_network(self, limit: int = 100) -> list[dict]:
        """Get reply relationships between users (who replies to whom)."""
        sql = '''
            SELECT
                m1.from_id as replier_id,
                m1.from_name as replier_name,
                m2.from_id as replied_to_id,
                m2.from_name as replied_to_name,
                COUNT(*) as reply_count
            FROM messages m1
            JOIN messages m2 ON m1.reply_to_message_id = m2.id
            WHERE m1.reply_to_message_id IS NOT NULL
            GROUP BY m1.from_id, m2.from_id
            ORDER BY reply_count DESC
            LIMIT ?
        '''
        cursor = self.conn.execute(sql, (limit,))
        return [dict(row) for row in cursor.fetchall()]

    def get_user_stats(self, user_id: str) -> dict:
        """Get detailed statistics for a specific user."""
        stats = {}

        cursor = self.conn.execute('''
            SELECT
                COUNT(*) as total,
                SUM(has_links) as links,
                SUM(has_media) as media,
                SUM(has_mentions) as mentions,
                SUM(is_edited) as edited,
                MIN(date_unixtime) as first_msg,
                MAX(date_unixtime) as last_msg
            FROM messages WHERE from_id = ?
        ''', (user_id,))
        row = cursor.fetchone()
        stats.update(dict(row))

        cursor = self.conn.execute('''
            SELECT COUNT(*) FROM messages m1
            JOIN messages m2 ON m1.reply_to_message_id = m2.id
            WHERE m2.from_id = ?
        ''', (user_id,))
        stats['replies_received'] = cursor.fetchone()[0]

        cursor = self.conn.execute('''
            SELECT COUNT(*) FROM messages
            WHERE from_id = ? AND reply_to_message_id IS NOT NULL
        ''', (user_id,))
        stats['replies_sent'] = cursor.fetchone()[0]

        # Add rank info using Rank Tree; .get() keeps this None-safe
        # when get_user_rank() returned an error dict.
        rank_info = self.get_user_rank(user_id)
        stats['rank'] = rank_info.get('rank')
        stats['percentile'] = rank_info.get('percentile')

        return stats
600
+
601
+
602
def print_bar(value: int, max_value: int, width: int = 40) -> str:
    """Render *value* scaled against *max_value* as a fixed-width ASCII bar.

    Returns an empty string when max_value is 0 (nothing to scale against);
    otherwise the bar is always exactly *width* characters wide.
    """
    if max_value == 0:
        return ''
    filled = int(value / max_value * width)
    empty = width - filled
    return '█' * filled + '░' * empty
608
+
609
+
610
def main():
    """CLI entry point: parse arguments and dispatch to one analyzer query.

    Exactly one query option is honored per invocation (first match wins);
    with no option, the argparse help text is printed.
    """
    parser = argparse.ArgumentParser(description='Analyze indexed Telegram messages (Enhanced)')
    parser.add_argument('--db', default='telegram.db', help='Database path')

    # Original options
    parser.add_argument('--stats', action='store_true', help='Show general statistics')
    parser.add_argument('--top-users', action='store_true', help='Show top users')
    parser.add_argument('--hourly', action='store_true', help='Show hourly activity')
    parser.add_argument('--daily', action='store_true', help='Show daily activity')
    parser.add_argument('--monthly', action='store_true', help='Show monthly activity')
    parser.add_argument('--domains', action='store_true', help='Show top shared domains')
    parser.add_argument('--mentions', action='store_true', help='Show top mentions')
    parser.add_argument('--words', action='store_true', help='Show word frequency')
    parser.add_argument('--sources', action='store_true', help='Show forwarded message sources')
    parser.add_argument('--replies', action='store_true', help='Show reply network')
    parser.add_argument('--user', help='Show stats for specific user ID')

    # Enhanced options
    parser.add_argument('--similar', action='store_true', help='Find similar messages (LCS)')
    parser.add_argument('--reposts', action='store_true', help='Find potential reposts')
    parser.add_argument('--percentiles', action='store_true', help='Show message length percentiles')
    parser.add_argument('--response-times', action='store_true', help='Show response time percentiles')
    parser.add_argument('--user-rank', help='Get rank of specific user')
    parser.add_argument('--rank', type=int, help='Get user at specific rank')
    parser.add_argument('--histogram', action='store_true', help='Show activity histogram')
    parser.add_argument('--bucket-size', type=int, default=86400, help='Histogram bucket size in seconds')

    parser.add_argument('--limit', type=int, default=20, help='Limit results')
    parser.add_argument('--json', action='store_true', help='Output as JSON')
    parser.add_argument('--threshold', type=float, default=0.7, help='Similarity threshold')

    args = parser.parse_args()

    with TelegramAnalyzer(args.db) as analyzer:
        # === ORIGINAL OPTIONS ===
        if args.stats:
            stats = analyzer.get_stats()
            if args.json:
                print(json.dumps(stats, indent=2, ensure_ascii=False))
            else:
                print("=== General Statistics ===\n")
                print(f"Total messages: {stats['total_messages']:,}")
                print(f"Total users: {stats['total_users']:,}")
                print(f"First message: {stats.get('first_message', 'N/A')}")
                print(f"Last message: {stats.get('last_message', 'N/A')}")
                print(f"Days span: {stats.get('days_span', 'N/A')}")
                print(f"Messages with media: {stats['messages_with_media']:,}")
                print(f"Messages with links: {stats['messages_with_links']:,}")
                print(f"Forwarded messages: {stats['forwarded_messages']:,}")
                print(f"Reply messages: {stats['reply_messages']:,}")
                if 'median_message_length' in stats:
                    print(f"\nMedian msg length: {stats['median_message_length']:.0f} chars")
                    print(f"90th percentile: {stats['p90_message_length']:.0f} chars")
                print(f"\nEntities: {stats.get('entities', {})}")
            return

        if args.top_users:
            users = analyzer.get_top_users(args.limit)
            if args.json:
                print(json.dumps(users, indent=2, ensure_ascii=False))
            else:
                print("=== Top Users by Message Count (Heap-based Top-K) ===\n")
                max_count = users[0]['message_count'] if users else 0
                for i, user in enumerate(users, 1):
                    bar = print_bar(user['message_count'], max_count, 30)
                    # BUG FIX: from_name can be NULL in the DB; don't crash.
                    name = user['from_name'] or 'Unknown'
                    print(f"{i:2}. {name[:20]:20} {bar} {user['message_count']:,}")
            return

        if args.hourly:
            hourly = analyzer.get_hourly_activity()
            if args.json:
                print(json.dumps(hourly, indent=2))
            else:
                print("=== Hourly Activity ===\n")
                max_count = max(hourly.values()) if hourly else 0
                for hour in range(24):
                    count = hourly.get(hour, 0)
                    bar = print_bar(count, max_count, 40)
                    print(f"{hour:02}:00 {bar} {count:,}")
            return

        if args.daily:
            daily = analyzer.get_daily_activity()
            if args.json:
                print(json.dumps(daily, indent=2))
            else:
                print("=== Daily Activity ===\n")
                max_count = max(daily.values()) if daily else 0
                for day, count in daily.items():
                    bar = print_bar(count, max_count, 40)
                    print(f"{day:10} {bar} {count:,}")
            return

        if args.monthly:
            monthly = analyzer.get_monthly_activity()
            if args.json:
                print(json.dumps(monthly, indent=2))
            else:
                print("=== Monthly Activity ===\n")
                max_count = max(monthly.values()) if monthly else 0
                for month, count in monthly.items():
                    bar = print_bar(count, max_count, 40)
                    print(f"{month} {bar} {count:,}")
            return

        if args.domains:
            domains = analyzer.get_top_domains(args.limit)
            if args.json:
                print(json.dumps(dict(domains), indent=2))
            else:
                print("=== Top Shared Domains (Heap-based Top-K) ===\n")
                max_count = domains[0][1] if domains else 0
                for domain, count in domains:
                    bar = print_bar(count, max_count, 30)
                    print(f"{domain[:30]:30} {bar} {count:,}")
            return

        if args.mentions:
            mentions = analyzer.get_top_mentioned(args.limit)
            if args.json:
                print(json.dumps(dict(mentions), indent=2))
            else:
                print("=== Top Mentioned Users ===\n")
                max_count = mentions[0][1] if mentions else 0
                for mention, count in mentions:
                    bar = print_bar(count, max_count, 30)
                    print(f"{mention:20} {bar} {count:,}")
            return

        if args.words:
            words = analyzer.get_word_frequency(args.limit)
            if args.json:
                print(json.dumps(dict(words), indent=2, ensure_ascii=False))
            else:
                print("=== Top Words (Heap-based Top-K) ===\n")
                max_count = words[0][1] if words else 0
                for word, count in words:
                    bar = print_bar(count, max_count, 30)
                    print(f"{word:20} {bar} {count:,}")
            return

        if args.sources:
            sources = analyzer.get_forwarded_sources(args.limit)
            if args.json:
                print(json.dumps(sources, indent=2, ensure_ascii=False))
            else:
                print("=== Top Forwarded Sources ===\n")
                max_count = sources[0]['count'] if sources else 0
                for src in sources:
                    bar = print_bar(src['count'], max_count, 30)
                    name = src['forwarded_from'] or 'Unknown'
                    print(f"{name[:30]:30} {bar} {src['count']:,}")
            return

        if args.replies:
            replies = analyzer.get_reply_network(args.limit)
            if args.json:
                print(json.dumps(replies, indent=2, ensure_ascii=False))
            else:
                print("=== Reply Network ===\n")
                for r in replies:
                    print(f"{r['replier_name']} → {r['replied_to_name']}: {r['reply_count']} replies")
            return

        if args.user:
            user_stats = analyzer.get_user_stats(args.user)
            if args.json:
                print(json.dumps(user_stats, indent=2))
            else:
                print(f"=== Stats for {args.user} ===\n")
                for key, value in user_stats.items():
                    print(f"{key}: {value}")
            return

        # === ENHANCED OPTIONS ===
        # BUG FIX: headers used to be printed before the --json check,
        # which corrupted JSON output; they now live in the text branch.

        if args.similar:
            similar = analyzer.find_similar_messages(
                threshold=args.threshold,
                limit=args.limit
            )
            if args.json:
                print(json.dumps(similar, indent=2, ensure_ascii=False))
            else:
                print(f"=== Similar Messages (LCS, threshold={args.threshold}) ===\n")
                for id1, id2, sim, text1, text2 in similar:
                    print(f"Similarity: {sim:.1%}")
                    print(f"  [{id1}] {text1}...")
                    print(f"  [{id2}] {text2}...")
                    print()
            return

        if args.reposts:
            reposts = analyzer.find_reposts(threshold=args.threshold)
            if args.json:
                print(json.dumps(reposts, indent=2, ensure_ascii=False))
            else:
                print("=== Potential Reposts (LCS-based) ===\n")
                for r in reposts[:args.limit]:
                    print(f"Similarity: {r['similarity']:.1%}")
                    print(f"  User 1: {r['user_1']}")
                    print(f"  User 2: {r['user_2']}")
                    print(f"  Text: {r['text_preview']}...")
                    print()
            return

        if args.percentiles:
            stats = analyzer.get_message_length_stats()
            if args.json:
                print(json.dumps(stats, indent=2))
            else:
                print("=== Message Length Percentiles (Selection Algorithm) ===\n")
                for key, value in stats.items():
                    print(f"{key:15}: {value:,.0f}")
            return

        if args.response_times:
            stats = analyzer.get_response_time_percentiles()
            if args.json:
                print(json.dumps(stats, indent=2))
            else:
                print("=== Response Time Percentiles (Selection Algorithm) ===\n")
                for key, value in stats.items():
                    if 'seconds' in key:
                        print(f"{key:15}: {value:,.0f}s ({value/60:.1f}m)")
                    else:
                        print(f"{key:15}: {value:,}")
            return

        if args.user_rank:
            rank_info = analyzer.get_user_rank(args.user_rank)
            if args.json:
                print(json.dumps(rank_info, indent=2))
            elif 'error' in rank_info:
                # BUG FIX: unknown users used to crash the ':,' formatting.
                print(rank_info['error'])
            else:
                print("=== User Rank (Rank Tree O(log n)) ===\n")
                print(f"User ID: {rank_info.get('user_id')}")
                print(f"Message count: {rank_info.get('message_count'):,}")
                print(f"Rank: #{rank_info.get('rank')} of {rank_info.get('total_users')}")
                print(f"Percentile: Top {rank_info.get('percentile'):.1f}%")
            return

        # BUG FIX: `if args.rank:` treated 0 as "flag absent"; be explicit.
        if args.rank is not None:
            user = analyzer.get_user_by_rank(args.rank)
            if args.json:
                print(json.dumps(user, indent=2, ensure_ascii=False))
            elif user:
                print(f"=== User at Rank #{args.rank} (Rank Tree O(log n)) ===\n")
                print(f"Name: {user.get('name')}")
                print(f"User ID: {user.get('user_id')}")
                print(f"Message count: {user.get('count'):,}")
            else:
                print(f"No user at rank {args.rank}")
            return

        if args.histogram:
            hist = analyzer.get_activity_histogram(bucket_size=args.bucket_size)
            if args.json:
                print(json.dumps(hist, indent=2))
            else:
                print(f"=== Activity Histogram (Bucket Sort, bucket={args.bucket_size}s) ===\n")
                max_count = max(c for _, c in hist) if hist else 0
                for date_str, count in hist[-args.limit:]:
                    bar = print_bar(count, max_count, 40)
                    print(f"{date_str} {bar} {count:,}")
            return

        # Default: show help
        parser.print_help()
878
+
879
+
880
# Script entry point (e.g. `python analyzer.py --stats`).
if __name__ == '__main__':
    main()
check_db.py ADDED
@@ -0,0 +1,49 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env python3
"""Quick script to check database contents."""
import os
import sqlite3
import sys

DB_PATH = 'telegram.db'


def main(db_path: str = DB_PATH) -> None:
    """Print a summary of the messages table.

    Shows the total row count, the date range, and the 50 newest
    messages (each truncated to 80 chars, newlines flattened).

    Args:
        db_path: Path to the SQLite database file (default: telegram.db).
    """
    if not os.path.exists(db_path):
        print(f"Database not found: {db_path}")
        # sys.exit instead of the site-module exit() so the script also
        # works under -S / frozen interpreters.
        sys.exit(1)

    try:
        conn = sqlite3.connect(db_path)
        conn.row_factory = sqlite3.Row  # allow column access by name
        try:
            # Get total count
            total = conn.execute("SELECT COUNT(*) FROM messages").fetchone()[0]
            print(f"Total messages in database: {total}")

            # Get date range
            date_range = conn.execute("""
                SELECT MIN(date) as earliest, MAX(date) as latest
                FROM messages
            """).fetchone()
            print(f"Date range: {date_range['earliest']} to {date_range['latest']}")
            print()

            # Show 50 newest messages
            print("=" * 60)
            print("50 NEWEST MESSAGES:")
            print("=" * 60)

            rows = conn.execute("""
                SELECT date, from_name, text_plain
                FROM messages
                ORDER BY date DESC
                LIMIT 50
            """).fetchall()

            for row in rows:
                text = (row['text_plain'] or '')[:80].replace('\n', ' ')
                name = row['from_name'] or 'Unknown'
                print(f"{row['date']} | {name}: {text}")
        finally:
            # Always release the connection, even if a query fails
            # (the original leaked it on exceptions).
            conn.close()
    except Exception as e:
        print(f"Error: {e}")
        import traceback
        traceback.print_exc()


if __name__ == '__main__':
    main()
daily_sync.py ADDED
@@ -0,0 +1,664 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Daily Telegram Sync Script
4
+ ===========================
5
+ Automatically syncs new messages from Telegram group to the analytics system.
6
+
7
+ What it does:
8
+ 1. Connects to Telegram via Telethon
9
+ 2. Fetches messages from the last 36 hours (12h overlap for safety)
10
+ 3. Adds new messages to telegram.db (duplicates ignored)
11
+ 4. Generates embeddings for new messages locally
12
+ 5. Adds embeddings to embeddings.db (duplicates ignored)
13
+
14
+ Usage:
15
+ First time: python daily_sync.py --setup
16
+ Daily run: python daily_sync.py
17
+ Custom hours: python daily_sync.py --hours 48
18
+
19
+ Automation:
20
+ Windows Task Scheduler: python daily_sync.py
21
+ Linux cron: 0 3 * * * cd /path/to/telegram && python daily_sync.py >> sync.log 2>&1
22
+ """
23
+
24
+ import os
25
+ import sys
26
+ import json
27
+ import time
28
+ import sqlite3
29
+ import asyncio
30
+ import argparse
31
+ import logging
32
+ from datetime import datetime, timedelta, timezone
33
+ from pathlib import Path
34
+
35
+ # Setup logging
36
+ LOG_FILE = Path(__file__).parent / 'sync.log'
37
+ logging.basicConfig(
38
+ level=logging.INFO,
39
+ format='%(asctime)s [%(levelname)s] %(message)s',
40
+ handlers=[
41
+ logging.StreamHandler(),
42
+ logging.FileHandler(LOG_FILE, encoding='utf-8')
43
+ ]
44
+ )
45
+ log = logging.getLogger('daily_sync')
46
+
47
+ # Paths
48
+ BASE_DIR = Path(__file__).parent
49
+ DB_PATH = BASE_DIR / 'telegram.db'
50
+ EMBEDDINGS_DB_PATH = BASE_DIR / 'embeddings.db'
51
+ SESSION_FILE = BASE_DIR / 'telegram_session'
52
+ CONFIG_FILE = BASE_DIR / 'sync_config.json'
53
+
54
+ # ==========================================
55
+ # CONFIGURATION
56
+ # ==========================================
57
+
58
def load_config() -> dict:
    """Return the saved sync configuration, or an empty dict if none exists."""
    if not CONFIG_FILE.exists():
        return {}
    with CONFIG_FILE.open('r') as fh:
        return json.load(fh)
64
+
65
+
66
def save_config(config: dict):
    """Write *config* to sync_config.json, pretty-printed with 2-space indent."""
    CONFIG_FILE.write_text(json.dumps(config, indent=2))
70
+
71
+
72
def setup_config():
    """Interactively collect Telegram API credentials on first run.

    Prompts for the API ID, API hash and target group, writes the
    resulting configuration to sync_config.json, and returns it.
    """
    banner = "=" * 50
    for line in (banner,
                 " Telegram Daily Sync - Setup",
                 banner,
                 "",
                 "You need Telegram API credentials.",
                 "Get them from: https://my.telegram.org/apps",
                 ""):
        print(line)

    # Dict literals evaluate values in source order, so the prompts
    # appear in the same order as before.
    config = {
        'api_id': int(input("API ID: ").strip()),
        'api_hash': input("API Hash: ").strip(),
        'group': input("Group username or ID (e.g., @mygroup or -1001234567890): ").strip(),
        'hours': 36,
    }

    save_config(config)
    print(f"\nConfiguration saved to {CONFIG_FILE}")
    print("Now run: python daily_sync.py")
    print("(First run will ask you to log in to Telegram)")
    return config
98
+
99
+
100
+ # ==========================================
101
+ # TELETHON: FETCH MESSAGES
102
+ # ==========================================
103
+
104
def telethon_message_to_json(message) -> dict | None:
    """
    Convert a Telethon message object to Telegram Desktop export JSON format.
    This ensures compatibility with the existing parse_message() in indexer.py.

    Args:
        message: A Telethon Message object (telethon.tl.custom.Message).

    Returns:
        A dict in Telegram Desktop export format, or None for messages
        with no usable text (media-only, service messages, etc.).
    """
    # Imported lazily so the module can be loaded without Telethon installed.
    from telethon.tl.types import (
        MessageEntityUrl, MessageEntityTextUrl,
        MessageEntityMention, MessageEntityMentionName,
        MessageEntityBold, MessageEntityItalic,
        MessageEntityCode, MessageEntityPre,
        MessageEntityHashtag, MessageEntityEmail,
        MessageEntityPhone, MessageEntityBotCommand,
    )

    if message.text is None and message.raw_text is None:
        return None

    text = message.raw_text or ''
    if not text.strip():
        # Skip empty messages (media-only, service messages, etc.)
        return None

    # Build text_entities in Telegram Desktop export format.
    # NOTE(review): Telegram entity offsets/lengths are defined in UTF-16
    # code units; slicing the Python str by them can mis-slice text that
    # contains astral characters (emoji) — TODO confirm/fix if it matters.
    text_entities = []
    if message.entities:
        for entity in message.entities:
            start = entity.offset
            end = entity.offset + entity.length
            entity_text = text[start:end]

            # Map Telethon entity classes onto the export format's type names;
            # anything unmapped stays 'plain' and is dropped below.
            entity_type = 'plain'
            if isinstance(entity, (MessageEntityUrl,)):
                entity_type = 'link'
            elif isinstance(entity, MessageEntityTextUrl):
                entity_type = 'text_link'
                # For text links the export stores the target URL, not the label.
                entity_text = entity.url
            elif isinstance(entity, (MessageEntityMention, MessageEntityMentionName)):
                entity_type = 'mention'
            elif isinstance(entity, MessageEntityBold):
                entity_type = 'bold'
            elif isinstance(entity, MessageEntityItalic):
                entity_type = 'italic'
            elif isinstance(entity, (MessageEntityCode, MessageEntityPre)):
                entity_type = 'code'
            elif isinstance(entity, MessageEntityHashtag):
                entity_type = 'hashtag'
            elif isinstance(entity, MessageEntityEmail):
                entity_type = 'email'
            elif isinstance(entity, MessageEntityPhone):
                entity_type = 'phone'
            elif isinstance(entity, MessageEntityBotCommand):
                entity_type = 'bot_command'

            if entity_type != 'plain':
                text_entities.append({
                    'type': entity_type,
                    'text': entity_text
                })

    # Get sender info: users expose first_name/last_name, channels a title.
    sender = message.sender
    from_name = ''
    from_id = ''

    if sender:
        if hasattr(sender, 'first_name'):
            # User
            parts = [sender.first_name or '']
            if sender.last_name:
                parts.append(sender.last_name)
            from_name = ' '.join(parts).strip()
            from_id = f'user{sender.id}'
        elif hasattr(sender, 'title'):
            # Channel/Group
            from_name = sender.title or ''
            from_id = f'channel{sender.id}'

    # Handle forwarded messages (best-effort: any attribute error is ignored).
    forwarded_from = None
    forwarded_from_id = None
    if message.forward:
        fwd = message.forward
        try:
            if fwd.sender:
                if hasattr(fwd.sender, 'first_name'):
                    parts = [fwd.sender.first_name or '']
                    if fwd.sender.last_name:
                        parts.append(fwd.sender.last_name)
                    forwarded_from = ' '.join(parts).strip()
                    forwarded_from_id = f'user{fwd.sender.id}'
                elif hasattr(fwd.sender, 'title'):
                    forwarded_from = fwd.sender.title
                    forwarded_from_id = f'channel{fwd.sender.id}'
            elif getattr(fwd, 'sender_name', None):
                forwarded_from = fwd.sender_name
            elif getattr(fwd, 'from_name', None):
                forwarded_from = fwd.from_name
        except Exception:
            pass  # Skip forward info if any attribute is missing

    # Photo info: record dimensions of the largest available size.
    photo_info = {}
    has_photo = False
    has_media = False

    if message.photo:
        has_photo = True
        has_media = True
        if hasattr(message.photo, 'sizes') and message.photo.sizes:
            largest = message.photo.sizes[-1]
            if hasattr(largest, 'w'):
                photo_info['width'] = largest.w
                photo_info['height'] = largest.h
    elif message.document or message.video or message.audio:
        has_media = True

    # Reply info
    reply_to = None
    if message.reply_to:
        reply_to = message.reply_to.reply_to_msg_id

    # Date handling: strip tzinfo so strftime matches the export's naive format.
    # NOTE(review): datetime.utcnow() is deprecated in Python 3.12+;
    # datetime.now(timezone.utc).replace(tzinfo=None) is the modern equivalent.
    msg_date = message.date.replace(tzinfo=None) if message.date else datetime.utcnow()

    # Build the JSON in Telegram Desktop export format
    msg_json = {
        'id': message.id,
        'type': 'message',
        'date': msg_date.strftime('%Y-%m-%dT%H:%M:%S'),
        'date_unixtime': str(int(msg_date.replace(tzinfo=timezone.utc).timestamp())),
        'from': from_name,
        'from_id': from_id,
        'text': text,
        'text_entities': text_entities,
    }

    # Optional fields are only present when set, mirroring the export format.
    if reply_to:
        msg_json['reply_to_message_id'] = reply_to
    if forwarded_from:
        msg_json['forwarded_from'] = forwarded_from
    if forwarded_from_id:
        msg_json['forwarded_from_id'] = forwarded_from_id
    if has_photo:
        msg_json['photo'] = '(photo)'
        msg_json.update(photo_info)
    if has_media and not has_photo:
        msg_json['media_type'] = 'document'
    if message.edit_date:
        edit_date = message.edit_date.replace(tzinfo=None)
        msg_json['edited'] = edit_date.strftime('%Y-%m-%dT%H:%M:%S')
        msg_json['edited_unixtime'] = str(int(edit_date.replace(tzinfo=timezone.utc).timestamp()))

    return msg_json
257
+
258
+
259
async def fetch_messages(config: dict, hours: int = 36) -> list[dict]:
    """
    Fetch messages from the last N hours using Telethon.
    Returns messages in Telegram Desktop export JSON format.

    Args:
        config: Sync configuration with 'api_id', 'api_hash' and 'group'.
        hours: How far back to fetch, in hours.

    Returns:
        List of message dicts (only messages with text content).
    """
    from telethon import TelegramClient

    api_id = config['api_id']
    api_hash = config['api_hash']
    group = config['group']

    client = TelegramClient(str(SESSION_FILE), api_id, api_hash)
    await client.start()

    log.info(f"Connected to Telegram")

    # Resolve group - handle numeric IDs properly
    if isinstance(group, str) and group.lstrip('-').isdigit():
        group = int(group)
    if isinstance(group, int) and group < 0:
        # Convert Telegram Web format to Telethon PeerChannel
        # -100XXXXXXXXXX → channel_id = XXXXXXXXXX
        # NOTE(review): str.replace removes any '-100' occurrence; safe here
        # only because '-' can appear solely at the start of the ID.
        from telethon.tl.types import PeerChannel
        channel_id = int(str(group).replace('-100', ''))
        entity = await client.get_entity(PeerChannel(channel_id))
    else:
        entity = await client.get_entity(group)
    log.info(f"Fetching from: {getattr(entity, 'title', group)}")

    # Calculate time window
    since = datetime.now(timezone.utc) - timedelta(hours=hours)
    log.info(f"Fetching messages from last {hours} hours (since {since.strftime('%Y-%m-%d %H:%M')} UTC)")

    # Fetch messages, newest first (Telethon default order).
    messages_json = []
    count = 0

    async for message in client.iter_messages(entity, offset_date=None, reverse=False):
        # Stop when we've gone past our time window
        if message.date < since:
            break

        count += 1
        # Convert to export JSON; returns None for text-less messages.
        msg_json = telethon_message_to_json(message)
        if msg_json:
            messages_json.append(msg_json)

    await client.disconnect()

    log.info(f"Fetched {count} messages, {len(messages_json)} with text content")
    return messages_json
310
+
311
+
312
async def fetch_participants(config: dict) -> list[dict]:
    """
    Fetch all group participants with metadata using Telethon.
    Returns participant info: name, username, status, join date, admin, etc.

    Args:
        config: Sync configuration with 'api_id', 'api_hash' and 'group'.

    Returns:
        List of participant dicts matching the 'participants' table schema.
    """
    from telethon import TelegramClient
    from telethon.tl.types import (
        UserStatusOnline, UserStatusOffline, UserStatusRecently,
        UserStatusLastWeek, UserStatusLastMonth,
        ChannelParticipantAdmin, ChannelParticipantCreator,
    )

    api_id = config['api_id']
    api_hash = config['api_hash']
    group = config['group']

    client = TelegramClient(str(SESSION_FILE), api_id, api_hash)
    await client.start()

    # Resolve group (same numeric-ID handling as fetch_messages).
    if isinstance(group, str) and group.lstrip('-').isdigit():
        group = int(group)
    if isinstance(group, int) and group < 0:
        from telethon.tl.types import PeerChannel
        channel_id = int(str(group).replace('-100', ''))
        entity = await client.get_entity(PeerChannel(channel_id))
    else:
        entity = await client.get_entity(group)

    log.info(f"Fetching participants from: {getattr(entity, 'title', group)}")

    participants = []
    now_ts = int(datetime.now(timezone.utc).timestamp())

    async for user in client.iter_participants(entity):
        # Determine online status; Telegram hides exact times for some users
        # (UserStatusRecently / LastWeek / LastMonth buckets).
        status = 'unknown'
        last_online = None

        if isinstance(user.status, UserStatusOnline):
            status = 'online'
            last_online = now_ts
        elif isinstance(user.status, UserStatusOffline):
            status = 'offline'
            if user.status.was_online:
                last_online = int(user.status.was_online.timestamp())
        elif isinstance(user.status, UserStatusRecently):
            status = 'recently'
        elif isinstance(user.status, UserStatusLastWeek):
            status = 'last_week'
        elif isinstance(user.status, UserStatusLastMonth):
            status = 'last_month'

        # Determine role (creator implies admin) and join date when available.
        is_admin = False
        is_creator = False
        join_date = None

        if hasattr(user, 'participant'):
            p = user.participant
            if isinstance(p, ChannelParticipantCreator):
                is_creator = True
                is_admin = True
            elif isinstance(p, ChannelParticipantAdmin):
                is_admin = True
            if hasattr(p, 'date') and p.date:
                join_date = int(p.date.timestamp())

        participants.append({
            'user_id': f'user{user.id}',
            'first_name': user.first_name or '',
            'last_name': user.last_name or '',
            'username': user.username or '',
            'phone': user.phone or '',
            'is_bot': 1 if user.bot else 0,
            'is_admin': 1 if is_admin else 0,
            'is_creator': 1 if is_creator else 0,
            'is_premium': 1 if getattr(user, 'premium', False) else 0,
            'join_date': join_date,
            'last_status': status,
            'last_online': last_online,
            'about': '',  # Requires separate API call per user
            'updated_at': now_ts,
        })

    await client.disconnect()

    log.info(f"Fetched {len(participants)} participants")
    return participants
401
+
402
+
403
def sync_participants(participants: list[dict]) -> dict:
    """Save participants to telegram.db.

    Creates the 'participants' table on first use and upserts every row
    (INSERT OR REPLACE keyed on user_id).

    Args:
        participants: Rows as produced by fetch_participants().

    Returns:
        dict with 'synced': number of rows written.
    """
    if not participants:
        return {'synced': 0}

    conn = sqlite3.connect(str(DB_PATH))
    try:
        # Create table if not exists
        conn.execute("""
            CREATE TABLE IF NOT EXISTS participants (
                user_id TEXT PRIMARY KEY,
                first_name TEXT,
                last_name TEXT,
                username TEXT,
                phone TEXT,
                is_bot INTEGER DEFAULT 0,
                is_admin INTEGER DEFAULT 0,
                is_creator INTEGER DEFAULT 0,
                is_premium INTEGER DEFAULT 0,
                join_date INTEGER,
                last_status TEXT DEFAULT 'unknown',
                last_online INTEGER,
                about TEXT,
                updated_at INTEGER
            )
        """)

        # Upsert participants
        conn.executemany("""
            INSERT OR REPLACE INTO participants
            (user_id, first_name, last_name, username, phone, is_bot, is_admin,
             is_creator, is_premium, join_date, last_status, last_online, about, updated_at)
            VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
        """, [
            (p['user_id'], p['first_name'], p['last_name'], p['username'],
             p['phone'], p['is_bot'], p['is_admin'], p['is_creator'],
             p['is_premium'], p['join_date'], p['last_status'],
             p['last_online'], p['about'], p['updated_at'])
            for p in participants
        ])

        conn.commit()
    finally:
        # Always release the connection, even if a statement fails
        # (the original leaked it on exceptions).
        conn.close()

    log.info(f"Synced {len(participants)} participants to DB")
    return {'synced': len(participants)}
449
+
450
+
451
+ # ==========================================
452
+ # DATABASE: INDEX NEW MESSAGES
453
+ # ==========================================
454
+
455
def index_messages(messages_json: list[dict]) -> dict:
    """
    Add new messages to telegram.db using IncrementalIndexer.
    Duplicates are automatically ignored.

    Args:
        messages_json: Messages in Telegram Desktop export JSON format.

    Returns:
        Indexer stats dict with at least 'new_messages' and 'duplicates'.
    """
    if not messages_json:
        return {'new_messages': 0, 'duplicates': 0}

    from indexer import IncrementalIndexer

    log.info(f"Indexing {len(messages_json)} messages into telegram.db...")

    indexer = IncrementalIndexer(str(DB_PATH))
    try:
        stats = indexer.update_from_json_data({'messages': messages_json}, show_progress=True)
    finally:
        # Release the DB handle even if indexing raises partway through
        # (the original leaked it on exceptions).
        indexer.close()

    log.info(f"Indexing done: {stats['new_messages']} new, {stats['duplicates']} duplicates")
    return stats
473
+
474
+
475
+ # ==========================================
476
+ # EMBEDDINGS: GENERATE FOR NEW MESSAGES
477
+ # ==========================================
478
+
479
def generate_embeddings(messages_json: list[dict]) -> dict:
    """
    Generate embeddings for new messages and add to embeddings.db.
    Only processes messages that don't already have embeddings.

    Args:
        messages_json: Messages in Telegram Desktop export JSON format.

    Returns:
        Stats dict: 'new_embeddings' plus either 'skipped' (nothing to do)
        or 'total_embeddings' (after insert).
    """
    if not os.path.exists(EMBEDDINGS_DB_PATH):
        log.warning(f"embeddings.db not found at {EMBEDDINGS_DB_PATH}. Skipping embeddings.")
        return {'new_embeddings': 0, 'skipped': 0}

    import numpy as np

    emb_conn = sqlite3.connect(str(EMBEDDINGS_DB_PATH))
    try:
        # Message IDs that already have embeddings (skipped below).
        existing_ids = {row[0] for row in emb_conn.execute("SELECT message_id FROM embeddings")}

        # Filter to messages that:
        # 1. Have text content
        # 2. Text is longer than 10 chars
        # 3. Don't already have embeddings
        new_messages = []
        for msg in messages_json:
            msg_id = msg.get('id')
            text = msg.get('text', '')
            if isinstance(text, list):
                # Handle complex text format (list of plain strings and
                # entity dicts, as in Telegram Desktop exports).
                text = ''.join(
                    part if isinstance(part, str) else part.get('text', '')
                    for part in text
                )

            if msg_id and msg_id not in existing_ids and text and len(text.strip()) > 10:
                new_messages.append({
                    'id': msg_id,
                    'from_name': msg.get('from', ''),
                    'text': text
                })

        if not new_messages:
            log.info("No new messages need embeddings")
            return {'new_embeddings': 0, 'skipped': len(messages_json)}

        log.info(f"Generating embeddings for {len(new_messages)} new messages...")

        # Load model lazily (heavy import; only needed when there is work).
        from sentence_transformers import SentenceTransformer
        model = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')

        # Generate embeddings (max 500 chars per message)
        texts = [m['text'][:500] for m in new_messages]
        embeddings = model.encode(texts, show_progress_bar=True, convert_to_numpy=True,
                                  batch_size=64)

        # Insert into embeddings.db; float32 blobs keep storage compact.
        data = [
            (msg['id'],
             msg['from_name'],
             msg['text'][:100],  # Preview
             embeddings[i].astype(np.float32).tobytes())
            for i, msg in enumerate(new_messages)
        ]

        emb_conn.executemany(
            "INSERT OR IGNORE INTO embeddings (message_id, from_name, text_preview, embedding) VALUES (?, ?, ?, ?)",
            data
        )
        emb_conn.commit()

        # Verify
        total = emb_conn.execute("SELECT COUNT(*) FROM embeddings").fetchone()[0]
    finally:
        # Close the connection on every exit path (the original leaked it
        # whenever model loading, encoding or the insert raised).
        emb_conn.close()

    log.info(f"Added {len(data)} new embeddings. Total embeddings: {total:,}")
    return {'new_embeddings': len(data), 'total_embeddings': total}
557
+
558
+
559
+ # ==========================================
560
+ # MAIN
561
+ # ==========================================
562
+
563
def run_sync(hours: int = 36, skip_embeddings: bool = False):
    """Run the full sync pipeline.

    Steps: (1) fetch messages from Telegram, (2) index them into
    telegram.db, (3) sync participants, (4) generate embeddings, then
    notify a locally running server (if any) to refresh its caches.

    Args:
        hours: Look-back window for message fetching.
        skip_embeddings: If True, skip the embedding-generation step.
    """
    start_time = time.time()

    log.info("=" * 50)
    log.info("Starting daily sync")
    log.info("=" * 50)

    # Load config
    config = load_config()
    if not config:
        log.error("No configuration found. Run: python daily_sync.py --setup")
        sys.exit(1)

    # Step 1: Fetch messages from Telegram
    log.info("[1/4] Fetching messages from Telegram...")
    messages_json = asyncio.run(fetch_messages(config, hours=hours))

    if not messages_json:
        log.info("No messages found in the time window.")

    # Step 2: Index into telegram.db (skipped when nothing was fetched)
    index_stats = {'new_messages': 0, 'duplicates': 0}
    if messages_json:
        log.info("[2/4] Indexing new messages...")
        index_stats = index_messages(messages_json)

    # Step 3: Sync participants (best-effort: a failure here must not
    # abort the rest of the pipeline)
    log.info("[3/4] Syncing participants...")
    try:
        participants = asyncio.run(fetch_participants(config))
        part_stats = sync_participants(participants)
    except Exception as e:
        log.warning(f"Failed to sync participants: {e}")
        part_stats = {'synced': 0}

    # Step 4: Generate embeddings
    if skip_embeddings or not messages_json:
        log.info("[4/4] Skipping embeddings")
        emb_stats = {'new_embeddings': 0}
    else:
        log.info("[4/4] Generating embeddings for new messages...")
        emb_stats = generate_embeddings(messages_json)

    # Notify running server to invalidate caches and reload embeddings.
    # Best-effort: if the server is down, caches refresh on next restart.
    has_changes = (index_stats.get('new_messages', 0) > 0
                   or part_stats.get('synced', 0) > 0
                   or emb_stats.get('new_embeddings', 0) > 0)
    if has_changes:
        try:
            import urllib.request
            urllib.request.urlopen('http://localhost:5000/api/cache/invalidate', timeout=5)
            log.info("Server caches invalidated")
            if emb_stats.get('new_embeddings', 0) > 0:
                urllib.request.urlopen('http://localhost:5000/api/embeddings/reload', timeout=5)
                log.info("Server notified to reload embeddings")
        except Exception:
            log.info("Server not running or unreachable - caches will refresh on next restart")

    # Summary
    elapsed = time.time() - start_time
    log.info("=" * 50)
    log.info("Sync complete!")
    log.info(f"  Messages fetched: {len(messages_json) if messages_json else 0}")
    log.info(f"  New to DB: {index_stats.get('new_messages', 0)}")
    log.info(f"  Duplicates skipped: {index_stats.get('duplicates', 0)}")
    log.info(f"  Participants synced: {part_stats.get('synced', 0)}")
    log.info(f"  New embeddings: {emb_stats.get('new_embeddings', 0)}")
    log.info(f"  Time: {elapsed:.1f}s")
    log.info("=" * 50)
633
+
634
+
635
def main():
    """CLI entry point: parse flags and dispatch to setup, fetch-only, or full sync."""
    parser = argparse.ArgumentParser(description='Daily Telegram Sync')
    for flag, spec in (
        ('--setup', dict(action='store_true', help='First time setup')),
        ('--hours', dict(type=int, default=36, help='Hours to look back (default: 36)')),
        ('--skip-embeddings', dict(action='store_true', help='Skip embedding generation')),
        ('--fetch-only', dict(action='store_true', help='Only fetch, do not index')),
    ):
        parser.add_argument(flag, **spec)
    opts = parser.parse_args()

    if opts.setup:
        setup_config()
        return

    if opts.fetch_only:
        config = load_config()
        if not config:
            log.error("No configuration found. Run: python daily_sync.py --setup")
            sys.exit(1)
        messages = asyncio.run(fetch_messages(config, hours=opts.hours))
        # Dump the raw fetch to disk so it can be inspected manually.
        output = BASE_DIR / 'fetched_messages.json'
        with open(output, 'w', encoding='utf-8') as f:
            json.dump(messages, f, ensure_ascii=False, indent=2)
        log.info(f"Saved {len(messages)} messages to {output}")
        return

    run_sync(hours=opts.hours, skip_embeddings=opts.skip_embeddings)


if __name__ == '__main__':
    main()
static/css/style.css CHANGED
@@ -67,6 +67,11 @@ body {
67
  flex-direction: column;
68
  border-right: 1px solid var(--border-color);
69
  z-index: 100;
 
 
 
 
 
70
  }
71
 
72
  .logo {
@@ -821,8 +826,54 @@ input:focus {
821
  }
822
 
823
  @media (max-width: 768px) {
824
- .stats-grid {
825
- grid-template-columns: repeat(2, 1fr);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
826
  }
827
 
828
  .header {
@@ -831,9 +882,93 @@ input:focus {
831
  align-items: flex-start;
832
  }
833
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
834
  .user-stats-grid {
835
  grid-template-columns: repeat(2, 1fr);
836
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
837
  }
838
 
839
  /* ==========================================
 
67
  flex-direction: column;
68
  border-right: 1px solid var(--border-color);
69
  z-index: 100;
70
+ transition: width 0.3s ease;
71
+ }
72
+
73
+ .mobile-menu-btn {
74
+ display: none;
75
  }
76
 
77
  .logo {
 
826
  }
827
 
828
  @media (max-width: 768px) {
829
+ .sidebar {
830
+ width: 0;
831
+ overflow: hidden;
832
+ transition: width 0.3s ease;
833
+ }
834
+
835
+ .sidebar.open {
836
+ width: 250px;
837
+ }
838
+
839
+ .sidebar-overlay {
840
+ display: none;
841
+ position: fixed;
842
+ top: 0;
843
+ left: 0;
844
+ right: 0;
845
+ bottom: 0;
846
+ background: rgba(0,0,0,0.5);
847
+ z-index: 99;
848
+ }
849
+
850
+ .sidebar-overlay.active {
851
+ display: block;
852
+ }
853
+
854
+ .mobile-menu-btn {
855
+ display: flex !important;
856
+ align-items: center;
857
+ justify-content: center;
858
+ width: 40px;
859
+ height: 40px;
860
+ background: var(--bg-card);
861
+ border: 1px solid var(--border-color);
862
+ border-radius: var(--radius-md);
863
+ color: var(--text-primary);
864
+ font-size: 1.5rem;
865
+ cursor: pointer;
866
+ position: fixed;
867
+ top: var(--spacing-md);
868
+ left: var(--spacing-md);
869
+ z-index: 98;
870
+ }
871
+
872
+ .main-content {
873
+ margin-left: 0;
874
+ max-width: 100vw;
875
+ padding: var(--spacing-md);
876
+ padding-top: 60px;
877
  }
878
 
879
  .header {
 
882
  align-items: flex-start;
883
  }
884
 
885
+ .header h1 {
886
+ font-size: 1.25rem;
887
+ }
888
+
889
+ .header-controls {
890
+ width: 100%;
891
+ flex-wrap: wrap;
892
+ }
893
+
894
+ .stats-grid {
895
+ grid-template-columns: repeat(2, 1fr);
896
+ }
897
+
898
+ .stats-grid .stat-card {
899
+ padding: var(--spacing-md);
900
+ }
901
+
902
+ .charts-row {
903
+ grid-template-columns: 1fr;
904
+ }
905
+
906
+ .chart-card.full-width,
907
+ .chart-card.large {
908
+ grid-column: span 1;
909
+ }
910
+
911
+ .lists-row {
912
+ grid-template-columns: 1fr;
913
+ }
914
+
915
  .user-stats-grid {
916
  grid-template-columns: repeat(2, 1fr);
917
  }
918
+
919
+ .heatmap-table th,
920
+ .heatmap-table td {
921
+ min-width: 25px;
922
+ font-size: 0.65rem;
923
+ padding: 2px;
924
+ }
925
+
926
+ .heatmap-cell {
927
+ width: 22px;
928
+ height: 22px;
929
+ }
930
+
931
+ /* Chat messages */
932
+ .message {
933
+ max-width: 95% !important;
934
+ }
935
+
936
+ /* Search */
937
+ .search-box {
938
+ flex-direction: column;
939
+ }
940
+
941
+ .search-box input {
942
+ width: 100%;
943
+ }
944
+
945
+ /* Tables */
946
+ .data-table {
947
+ font-size: 0.8rem;
948
+ }
949
+
950
+ .data-table th,
951
+ .data-table td {
952
+ padding: var(--spacing-sm);
953
+ }
954
+ }
955
+
956
+ @media (max-width: 480px) {
957
+ .stats-grid {
958
+ grid-template-columns: 1fr;
959
+ }
960
+
961
+ .user-stats-grid {
962
+ grid-template-columns: 1fr;
963
+ }
964
+
965
+ .header h1 {
966
+ font-size: 1.1rem;
967
+ }
968
+
969
+ .chart-container {
970
+ height: 200px;
971
+ }
972
  }
973
 
974
  /* ==========================================
static/js/dashboard.js CHANGED
@@ -9,6 +9,31 @@
9
  * - Export functionality
10
  */
11
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12
  // ==========================================
13
  // GLOBAL STATE
14
  // ==========================================
 
9
  * - Export functionality
10
  */
11
 
12
// ==========================================
// MOBILE MENU
// ==========================================

/** Toggle the slide-out sidebar and its dimming overlay (mobile layout). */
function toggleMobileMenu() {
    const nav = document.querySelector('.sidebar');
    const shade = document.querySelector('.sidebar-overlay');
    nav.classList.toggle('open');
    if (shade) shade.classList.toggle('active');
}

// Auto-close the menu after a navigation tap on small screens.
document.addEventListener('DOMContentLoaded', () => {
    document.querySelectorAll('.nav-link').forEach((link) => {
        link.addEventListener('click', () => {
            if (window.innerWidth > 768) return;
            const nav = document.querySelector('.sidebar');
            const shade = document.querySelector('.sidebar-overlay');
            nav.classList.remove('open');
            if (shade) shade.classList.remove('active');
        });
    });
});
36
+
37
  // ==========================================
38
  // GLOBAL STATE
39
  // ==========================================
templates/index.html CHANGED
@@ -8,6 +8,8 @@
8
  <script src="https://cdn.jsdelivr.net/npm/chart.js"></script>
9
  </head>
10
  <body>
 
 
11
  <!-- Sidebar -->
12
  <nav class="sidebar">
13
  <div class="logo">
 
8
  <script src="https://cdn.jsdelivr.net/npm/chart.js"></script>
9
  </head>
10
  <body>
11
+ <button class="mobile-menu-btn" onclick="toggleMobileMenu()">&#9776;</button>
12
+ <div class="sidebar-overlay" onclick="toggleMobileMenu()"></div>
13
  <!-- Sidebar -->
14
  <nav class="sidebar">
15
  <div class="logo">
templates/moderation.html CHANGED
@@ -8,6 +8,8 @@
8
  <script src="https://cdn.jsdelivr.net/npm/chart.js"></script>
9
  </head>
10
  <body>
 
 
11
  <!-- Sidebar -->
12
  <nav class="sidebar">
13
  <div class="logo">
 
8
  <script src="https://cdn.jsdelivr.net/npm/chart.js"></script>
9
  </head>
10
  <body>
11
+ <button class="mobile-menu-btn" onclick="toggleMobileMenu()">&#9776;</button>
12
+ <div class="sidebar-overlay" onclick="toggleMobileMenu()"></div>
13
  <!-- Sidebar -->
14
  <nav class="sidebar">
15
  <div class="logo">
templates/search.html CHANGED
@@ -7,6 +7,8 @@
7
  <link rel="stylesheet" href="/static/css/style.css">
8
  </head>
9
  <body>
 
 
10
  <!-- Sidebar -->
11
  <nav class="sidebar">
12
  <div class="logo">
 
7
  <link rel="stylesheet" href="/static/css/style.css">
8
  </head>
9
  <body>
10
+ <button class="mobile-menu-btn" onclick="toggleMobileMenu()">&#9776;</button>
11
+ <div class="sidebar-overlay" onclick="toggleMobileMenu()"></div>
12
  <!-- Sidebar -->
13
  <nav class="sidebar">
14
  <div class="logo">
templates/settings.html CHANGED
@@ -167,6 +167,8 @@
167
  </style>
168
  </head>
169
  <body>
 
 
170
  <!-- Sidebar -->
171
  <nav class="sidebar">
172
  <div class="logo">
 
167
  </style>
168
  </head>
169
  <body>
170
+ <button class="mobile-menu-btn" onclick="toggleMobileMenu()">&#9776;</button>
171
+ <div class="sidebar-overlay" onclick="toggleMobileMenu()"></div>
172
  <!-- Sidebar -->
173
  <nav class="sidebar">
174
  <div class="logo">
templates/user_profile.html CHANGED
@@ -253,6 +253,8 @@
253
  </style>
254
  </head>
255
  <body>
 
 
256
  <!-- Sidebar -->
257
  <nav class="sidebar">
258
  <div class="logo">
 
253
  </style>
254
  </head>
255
  <body>
256
+ <button class="mobile-menu-btn" onclick="toggleMobileMenu()">&#9776;</button>
257
+ <div class="sidebar-overlay" onclick="toggleMobileMenu()"></div>
258
  <!-- Sidebar -->
259
  <nav class="sidebar">
260
  <div class="logo">
templates/users.html CHANGED
@@ -8,6 +8,8 @@
8
  <script src="https://cdn.jsdelivr.net/npm/chart.js"></script>
9
  </head>
10
  <body>
 
 
11
  <!-- Sidebar -->
12
  <nav class="sidebar">
13
  <div class="logo">
 
8
  <script src="https://cdn.jsdelivr.net/npm/chart.js"></script>
9
  </head>
10
  <body>
11
+ <button class="mobile-menu-btn" onclick="toggleMobileMenu()">&#9776;</button>
12
+ <div class="sidebar-overlay" onclick="toggleMobileMenu()"></div>
13
  <!-- Sidebar -->
14
  <nav class="sidebar">
15
  <div class="logo">
tests.py ADDED
@@ -0,0 +1,474 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Integration tests for Telegram Analytics: indexer, search, and dashboard endpoints.
4
+
5
+ Run with: python -m pytest tests.py -v
6
+ Or: python tests.py
7
+ """
8
+
9
import json
import os
import shutil
import sqlite3
import tempfile
import time
import unittest

from datetime import datetime, timezone
from pathlib import Path
17
+
18
+
19
+ # ---------------------------------------------------------------------------
20
+ # Helpers
21
+ # ---------------------------------------------------------------------------
22
+
23
+ def _sample_messages(n: int = 5) -> list[dict]:
24
+ """Generate N realistic Telegram-format messages."""
25
+ base_ts = 1700000000
26
+ users = [
27
+ ("user1", "Alice"),
28
+ ("user2", "Bob"),
29
+ ("user3", "Carol"),
30
+ ]
31
+ msgs = []
32
+ for i in range(1, n + 1):
33
+ uid, name = users[i % len(users)]
34
+ msgs.append({
35
+ "id": 1000 + i,
36
+ "type": "message",
37
+ "date": f"2024-01-{(i % 28) + 1:02d}T10:00:00",
38
+ "date_unixtime": str(base_ts + i * 3600),
39
+ "from": name,
40
+ "from_id": uid,
41
+ "text": f"Test message number {i} from {name}",
42
+ "text_entities": [
43
+ {"type": "plain", "text": f"Test message number {i} from {name}"}
44
+ ],
45
+ "reply_to_message_id": (1000 + i - 1) if i > 1 else None,
46
+ })
47
+ return msgs
48
+
49
+
50
+ def _write_json(path: str, messages: list[dict]):
51
+ """Write messages in Telegram export JSON format."""
52
+ with open(path, "w", encoding="utf-8") as f:
53
+ json.dump({"messages": messages}, f, ensure_ascii=False)
54
+
55
+
56
+ # ---------------------------------------------------------------------------
57
+ # 1. Indexer Tests
58
+ # ---------------------------------------------------------------------------
59
+
60
class TestIndexer(unittest.TestCase):
    """Integration tests for OptimizedIndexer and IncrementalIndexer."""

    def setUp(self):
        """Create a temp dir containing a 10-message Telegram export JSON."""
        self.tmpdir = tempfile.mkdtemp()
        self.db_path = os.path.join(self.tmpdir, "test.db")
        self.json_path = os.path.join(self.tmpdir, "messages.json")
        self.messages = _sample_messages(10)
        _write_json(self.json_path, self.messages)

    def tearDown(self):
        # shutil is imported at module level (was a function-local import).
        shutil.rmtree(self.tmpdir, ignore_errors=True)

    def test_optimized_indexer_indexes_messages(self):
        """Indexing stores every message and reports an accurate count."""
        from indexer import OptimizedIndexer
        indexer = OptimizedIndexer(self.db_path, build_trigrams=False, build_graph=False)
        stats = indexer.index_file(self.json_path, show_progress=False)

        self.assertGreater(stats["messages"], 0)

        conn = sqlite3.connect(self.db_path)
        count = conn.execute("SELECT COUNT(*) FROM messages").fetchone()[0]
        conn.close()
        self.assertEqual(count, stats["messages"])

    def test_incremental_indexer_deduplication(self):
        """Re-feeding identical data yields only duplicates, no new rows."""
        from indexer import OptimizedIndexer, IncrementalIndexer

        # First: create DB with OptimizedIndexer.
        opt = OptimizedIndexer(self.db_path, build_trigrams=False, build_graph=False)
        opt.index_file(self.json_path, show_progress=False)

        # Same data through IncrementalIndexer -- everything is a duplicate.
        idx = IncrementalIndexer(self.db_path)
        stats = idx.update_from_json(self.json_path, show_progress=False)
        idx.close()

        self.assertEqual(stats["new_messages"], 0)
        self.assertGreater(stats["duplicates"], 0)

    def test_incremental_indexer_adds_new(self):
        """Incremental update counts overlapping and new messages correctly."""
        from indexer import OptimizedIndexer, IncrementalIndexer

        # Seed the DB with the first 5 messages only.
        msgs5 = _sample_messages(5)
        _write_json(self.json_path, msgs5)
        opt = OptimizedIndexer(self.db_path, build_trigrams=False, build_graph=False)
        opt.index_file(self.json_path, show_progress=False)

        # Feed a superset: 5 already indexed + 5 brand new.
        msgs10 = _sample_messages(10)
        json2 = os.path.join(self.tmpdir, "messages2.json")
        _write_json(json2, msgs10)

        idx = IncrementalIndexer(self.db_path)
        stats = idx.update_from_json(json2, show_progress=False)
        idx.close()

        self.assertEqual(stats["new_messages"], 5)
        self.assertEqual(stats["duplicates"], 5)

    def test_incremental_indexer_from_json_data(self):
        """update_from_json_data accepts an in-memory message list."""
        from indexer import OptimizedIndexer, IncrementalIndexer

        # Init DB first.
        opt = OptimizedIndexer(self.db_path, build_trigrams=False, build_graph=False)
        opt.index_file(self.json_path, show_progress=False)

        # 10 already indexed + 5 new, supplied directly as a list.
        new_msgs = _sample_messages(15)
        idx = IncrementalIndexer(self.db_path)
        stats = idx.update_from_json_data(new_msgs, show_progress=False)
        idx.close()

        self.assertEqual(stats["new_messages"], 5)

    def test_fts5_search_works(self):
        """The FTS5 shadow table is populated and matches indexed text."""
        from indexer import OptimizedIndexer
        indexer = OptimizedIndexer(self.db_path, build_trigrams=False, build_graph=False)
        indexer.index_file(self.json_path, show_progress=False)

        conn = sqlite3.connect(self.db_path)
        cursor = conn.execute(
            "SELECT COUNT(*) FROM messages_fts WHERE messages_fts MATCH 'message'"
        )
        count = cursor.fetchone()[0]
        conn.close()

        self.assertGreater(count, 0, "FTS5 search should find messages with 'message'")

    def test_streaming_load_json_messages(self):
        """load_json_messages streams all messages and adds text_plain."""
        from indexer import load_json_messages
        msgs = list(load_json_messages(self.json_path))
        self.assertEqual(len(msgs), 10)
        self.assertIn("text_plain", msgs[0])

    def test_entities_extracted(self):
        """Messages with links/mentions in text_entities should have entities stored."""
        msgs = [
            {
                "id": 9001,
                "type": "message",
                "date": "2024-01-01T10:00:00",
                "date_unixtime": "1700000000",
                "from": "Alice",
                "from_id": "user1",
                "text": "Check https://example.com and @bob",
                "text_entities": [
                    {"type": "plain", "text": "Check "},
                    {"type": "link", "text": "https://example.com"},
                    {"type": "plain", "text": " and "},
                    {"type": "mention", "text": "@bob"},
                ],
            }
        ]
        _write_json(self.json_path, msgs)

        from indexer import OptimizedIndexer
        indexer = OptimizedIndexer(self.db_path, build_trigrams=False, build_graph=False)
        indexer.index_file(self.json_path, show_progress=False)

        conn = sqlite3.connect(self.db_path)
        entities = conn.execute(
            "SELECT type, value FROM entities WHERE message_id = 9001"
        ).fetchall()
        conn.close()

        types = [e[0] for e in entities]
        self.assertIn("link", types)
        self.assertIn("mention", types)
190
+
191
+ # ---------------------------------------------------------------------------
192
+ # 2. Search Tests
193
+ # ---------------------------------------------------------------------------
194
+
195
class TestSearch(unittest.TestCase):
    """Tests for FTS search over an indexed 20-message corpus."""

    def setUp(self):
        """Index a fresh 20-message corpus for every test."""
        self.tmpdir = tempfile.mkdtemp()
        self.db_path = os.path.join(self.tmpdir, "test.db")
        self.json_path = os.path.join(self.tmpdir, "messages.json")
        _write_json(self.json_path, _sample_messages(20))

        from indexer import OptimizedIndexer
        indexer = OptimizedIndexer(self.db_path, build_trigrams=False, build_graph=False)
        indexer.index_file(self.json_path, show_progress=False)

    def tearDown(self):
        # shutil is imported at module level (was a function-local import).
        shutil.rmtree(self.tmpdir, ignore_errors=True)

    def test_fts_match_query(self):
        """FTS matches return only rows whose text contains the term."""
        conn = sqlite3.connect(self.db_path)
        conn.row_factory = sqlite3.Row
        rows = conn.execute(
            "SELECT id, text_plain FROM messages WHERE id IN "
            "(SELECT rowid FROM messages_fts WHERE messages_fts MATCH 'Alice')"
        ).fetchall()
        conn.close()
        self.assertGreater(len(rows), 0)
        for r in rows:
            self.assertIn("Alice", r["text_plain"])

    def test_fts_returns_no_results_for_nonsense(self):
        """A token that never occurs must produce zero matches."""
        conn = sqlite3.connect(self.db_path)
        rows = conn.execute(
            "SELECT COUNT(*) FROM messages_fts WHERE messages_fts MATCH 'xyzzyplugh'"
        ).fetchone()[0]
        conn.close()
        self.assertEqual(rows, 0)
231
+
232
+
233
+ # ---------------------------------------------------------------------------
234
+ # 3. SemanticSearch Empty Embeddings
235
+ # ---------------------------------------------------------------------------
236
+
237
try:
    import numpy as np
    HAS_NUMPY = True
except ImportError:
    HAS_NUMPY = False


@unittest.skipUnless(HAS_NUMPY, "numpy not installed")
class TestSemanticSearchEmpty(unittest.TestCase):
    """SemanticSearch must handle missing/empty embeddings gracefully."""

    def _make_empty_embeddings_db(self) -> str:
        """Create an embeddings DB with the right schema but zero rows.

        The temp dir is removed via addCleanup even if an assertion fails
        (the original fixture ran rmtree after the asserts and leaked the
        directory on failure); the creation code was also duplicated in
        three tests.
        """
        tmpdir = tempfile.mkdtemp()
        self.addCleanup(shutil.rmtree, tmpdir, ignore_errors=True)
        db_path = os.path.join(tmpdir, "empty_emb.db")

        conn = sqlite3.connect(db_path)
        conn.execute(
            "CREATE TABLE embeddings (message_id INTEGER PRIMARY KEY, "
            "from_name TEXT, text_preview TEXT, embedding BLOB)"
        )
        conn.commit()
        conn.close()
        return db_path

    def test_is_available_missing_db(self):
        """A nonexistent embeddings file means semantic search is unavailable."""
        from semantic_search import SemanticSearch
        ss = SemanticSearch(embeddings_db="/tmp/nonexistent_embeddings_12345.db")
        self.assertFalse(ss.is_available())

    def test_is_available_empty_db(self):
        """An embeddings DB with zero rows is reported as unavailable."""
        from semantic_search import SemanticSearch
        ss = SemanticSearch(embeddings_db=self._make_empty_embeddings_db())
        self.assertFalse(ss.is_available())

    def test_load_empty_embeddings_no_crash(self):
        """Loading an empty table must not raise and must leave empty state."""
        from semantic_search import SemanticSearch
        ss = SemanticSearch(embeddings_db=self._make_empty_embeddings_db())
        ss._load_embeddings()  # Should not crash
        self.assertTrue(ss.embeddings_loaded)
        self.assertEqual(len(ss.message_ids), 0)

    def test_stats_empty_db(self):
        """stats() reports available=True (file + table exist) but count=0."""
        from semantic_search import SemanticSearch
        ss = SemanticSearch(embeddings_db=self._make_empty_embeddings_db())
        s = ss.stats()
        self.assertTrue(s["available"])  # File exists and table exists
        self.assertEqual(s["count"], 0)
314
+
315
+
316
+ # ---------------------------------------------------------------------------
317
+ # 4. Dashboard Endpoint Tests
318
+ # ---------------------------------------------------------------------------
319
+
320
try:
    import flask
    HAS_FLASK = True
except ImportError:
    HAS_FLASK = False


@unittest.skipUnless(HAS_FLASK, "flask not installed")
class TestDashboardEndpoints(unittest.TestCase):
    """Test Flask dashboard API endpoints against a 50-message test DB."""

    @classmethod
    def setUpClass(cls):
        """Index a 50-message corpus once and point the app at it."""
        cls.tmpdir = tempfile.mkdtemp()
        cls.db_path = os.path.join(cls.tmpdir, "test.db")
        cls.json_path = os.path.join(cls.tmpdir, "messages.json")

        _write_json(cls.json_path, _sample_messages(50))

        from indexer import OptimizedIndexer
        indexer = OptimizedIndexer(cls.db_path, build_trigrams=False, build_graph=False)
        indexer.index_file(cls.json_path, show_progress=False)

        # Repoint the dashboard module at the test DB before creating a client.
        import dashboard
        dashboard.DB_PATH = cls.db_path
        dashboard.app.config["TESTING"] = True
        cls.client = dashboard.app.test_client()

    @classmethod
    def tearDownClass(cls):
        # shutil is imported at module level (was a function-local import).
        shutil.rmtree(cls.tmpdir, ignore_errors=True)

    def test_overview_endpoint(self):
        """Overview returns a positive total_messages."""
        resp = self.client.get("/api/overview?timeframe=all")
        self.assertEqual(resp.status_code, 200)
        data = resp.get_json()
        self.assertIn("total_messages", data)
        self.assertGreater(data["total_messages"], 0)

    def test_users_endpoint(self):
        """Users list is non-empty and rows carry the expected fields."""
        resp = self.client.get("/api/users?timeframe=all&limit=10")
        self.assertEqual(resp.status_code, 200)
        data = resp.get_json()
        self.assertIn("users", data)
        self.assertGreater(len(data["users"]), 0)
        user = data["users"][0]
        for field in ("user_id", "name", "messages", "percentage"):
            self.assertIn(field, user)

    def test_users_include_inactive(self):
        """With include_inactive=0 every returned user has messages."""
        resp = self.client.get("/api/users?timeframe=all&include_inactive=0")
        self.assertEqual(resp.status_code, 200)
        data = resp.get_json()
        for user in data["users"]:
            self.assertGreater(user["messages"], 0)

    def test_search_fts_endpoint(self):
        """FTS search endpoint responds with a results list."""
        resp = self.client.get("/api/search?q=message&mode=fts&limit=5")
        self.assertEqual(resp.status_code, 200)
        data = resp.get_json()
        self.assertIn("results", data)

    def test_chart_hourly_endpoint(self):
        """Hourly chart returns exactly 24 buckets."""
        resp = self.client.get("/api/chart/hourly?timeframe=all")
        self.assertEqual(resp.status_code, 200)
        data = resp.get_json()
        self.assertIsInstance(data, list)
        self.assertEqual(len(data), 24)

    def test_chart_daily_endpoint(self):
        """Daily chart returns a list."""
        resp = self.client.get("/api/chart/daily?timeframe=all")
        self.assertEqual(resp.status_code, 200)
        data = resp.get_json()
        self.assertIsInstance(data, list)

    def test_cache_invalidate_endpoint(self):
        """Cache invalidation reports its status."""
        resp = self.client.get("/api/cache/invalidate")
        self.assertEqual(resp.status_code, 200)
        data = resp.get_json()
        self.assertEqual(data["status"], "invalidated")

    def test_page_routes_return_200(self):
        """All page routes should return 200."""
        for route in ("/", "/users", "/search", "/chat", "/moderation", "/settings"):
            resp = self.client.get(route)
            self.assertEqual(resp.status_code, 200, f"Route {route} failed")

    def test_user_profile_endpoint(self):
        """Per-user profile carries totals and hourly activity."""
        resp = self.client.get("/api/users?timeframe=all&limit=1")
        data = resp.get_json()
        if data["users"]:
            uid = data["users"][0]["user_id"]
            resp2 = self.client.get(f"/api/user/{uid}/profile")
            self.assertEqual(resp2.status_code, 200)
            profile = resp2.get_json()
            self.assertIn("total_messages", profile)
            self.assertIn("hourly_activity", profile)

    def test_overview_has_expected_keys(self):
        """Overview payload carries all summary keys the UI expects."""
        resp = self.client.get("/api/overview?timeframe=all")
        data = resp.get_json()
        for key in ("total_messages", "total_users", "links_count", "media_count"):
            self.assertIn(key, data, f"Missing key: {key}")
425
+
426
+
427
+ # ---------------------------------------------------------------------------
428
+ # 5. AI Search Schema Test
429
+ # ---------------------------------------------------------------------------
430
+
431
class TestAISearchSchema(unittest.TestCase):
    """Test that AI search schema generation matches the actual DB."""

    def test_dynamic_schema_includes_real_columns(self):
        """The generated schema lists real columns and omits stale names."""
        tmpdir = tempfile.mkdtemp()
        # Clean up even if an assertion fails: the original ran rmtree after
        # the asserts, leaking the temp dir on any failure.
        self.addCleanup(shutil.rmtree, tmpdir, ignore_errors=True)
        db_path = os.path.join(tmpdir, "test.db")

        # Initialize DB with the real project schema.
        from indexer import init_database
        conn = init_database(db_path)
        conn.close()

        from ai_search import AISearchEngine
        # Bypass __init__ so no LLM provider connection is attempted.
        engine = AISearchEngine.__new__(AISearchEngine)
        engine.db_path = db_path

        schema = engine._get_db_schema()

        # Real column names must be present.
        self.assertIn("text_plain", schema)
        self.assertIn("date_unixtime", schema)
        self.assertIn("has_links", schema)
        self.assertIn("has_media", schema)
        self.assertIn("from_id", schema)
        self.assertIn("participants", schema)

        # Old wrong column names must NOT appear in the dynamic output.
        self.assertNotIn("char_count", schema)
        # "media_type" should not be a column name (has_media is the real one).
        self.assertNotIn("media_type (", schema.lower())


# ---------------------------------------------------------------------------
# Runner
# ---------------------------------------------------------------------------

if __name__ == "__main__":
    unittest.main(verbosity=2)
update_hf.py ADDED
@@ -0,0 +1,132 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Automated sync + deploy to Hugging Face Spaces.
4
+
5
+ Usage:
6
+ python update_hf.py # sync + upload everything
7
+ python update_hf.py --db-only # just upload DB (skip sync)
8
+ python update_hf.py --full # upload all files + DB (first time or after code changes)
9
+ """
10
+
11
+ import subprocess
12
+ import sys
13
+ import os
14
+
15
# === CONFIGURATION ===
REPO_ID = "rottg/telegram-analytics"
PROJECT_DIR = os.path.dirname(os.path.abspath(__file__))
DB_PATH = os.path.join(PROJECT_DIR, "telegram.db")

# Files to upload (code + config)
CODE_FILES = [
    "dashboard.py", "ai_search.py", "algorithms.py", "data_structures.py",
    "indexer.py", "search.py", "semantic_search.py", "schema.sql",
    "Dockerfile", "requirements.txt", "README.md",
]
FOLDERS = ["static", "templates"]

# Token resolution: the HF_TOKEN env var wins; otherwise fall back to a
# .hf_token file next to this script.
HF_TOKEN = os.environ.get("HF_TOKEN")
if not HF_TOKEN:
    token_file = os.path.join(PROJECT_DIR, ".hf_token")
    if os.path.exists(token_file):
        with open(token_file) as f:
            HF_TOKEN = f.read().strip()
# Validate the resolved value, not just the file's presence: the original
# accepted an empty/whitespace-only .hf_token file and failed later at upload.
if not HF_TOKEN:
    print("ERROR: Set HF_TOKEN env var or create .hf_token file with your token")
    sys.exit(1)
+
39
+
40
def run_sync():
    """Run daily_sync.py in a subprocess; abort the program if it fails."""
    print("\n=== Step 1: Running daily sync ===")
    script = os.path.join(PROJECT_DIR, "daily_sync.py")
    rc = subprocess.run([sys.executable, script], cwd=PROJECT_DIR).returncode
    if rc != 0:
        print("ERROR: Sync failed!")
        sys.exit(1)
    print("Sync complete.")
49
+
50
+
51
def upload_to_hf(full=False):
    """Upload files to the HF Space using the Hub API (no git needed).

    Args:
        full: If True, upload all code files, static/template folders and
            the DB. If False, only replace telegram.db on the Space.
    """
    from huggingface_hub import HfApi

    api = HfApi(token=HF_TOKEN)

    if full:
        # Upload all code files + folders + DB.
        print("\n=== Uploading all files to HF ===")

        # Collect (local_path, path_in_repo) pairs.
        upload_files = []
        for fname in CODE_FILES:
            path = os.path.join(PROJECT_DIR, fname)
            if os.path.exists(path):
                upload_files.append((path, fname))

        for folder in FOLDERS:
            folder_path = os.path.join(PROJECT_DIR, folder)
            if os.path.exists(folder_path):
                for root, _dirs, files in os.walk(folder_path):
                    for name in files:
                        full_path = os.path.join(root, name)
                        rel_path = os.path.relpath(full_path, PROJECT_DIR)
                        upload_files.append((full_path, rel_path.replace("\\", "/")))

        # Include the DB only if it exists: the original appended it
        # unconditionally and then crashed in os.path.getsize below when
        # telegram.db was missing (code files *were* existence-checked).
        if os.path.exists(DB_PATH):
            upload_files.append((DB_PATH, "telegram.db"))

        print(f"Uploading {len(upload_files)} files...")
        for local_path, repo_path in upload_files:
            size_mb = os.path.getsize(local_path) / (1024 * 1024)
            if size_mb > 1:
                print(f"  {repo_path} ({size_mb:.0f} MB)...")
            else:
                print(f"  {repo_path}")

        # One commit for everything; allow_patterns restricts the folder
        # upload to exactly the files collected above.
        api.upload_folder(
            folder_path=PROJECT_DIR,
            repo_id=REPO_ID,
            repo_type="space",
            allow_patterns=[f for _, f in upload_files],
        )
    else:
        # DB only - delete old, upload new.
        print("\n=== Removing old DB from HF ===")
        try:
            api.delete_file("telegram.db", repo_id=REPO_ID, repo_type="space")
            print("Old DB removed.")
        except Exception as e:
            # Best-effort: a missing remote file is not an error.
            print(f"No old DB to remove ({e})")

        print("\n=== Uploading new DB to HF ===")
        db_size_mb = os.path.getsize(DB_PATH) / (1024 * 1024)
        print(f"Uploading {db_size_mb:.0f} MB...")

        api.upload_file(
            path_or_fileobj=DB_PATH,
            path_in_repo="telegram.db",
            repo_id=REPO_ID,
            repo_type="space",
        )

    print("Upload complete!")
    print("\nSite will rebuild at: https://rottg-telegram-analytics.hf.space")
116
+
117
+
118
def main():
    """CLI entry point: parse flags, optionally sync, then upload.

    Flags:
        --db-only   Skip the sync step and upload only the DB.
        --full      Upload all code files and folders as well as the DB.
    """
    db_only = "--db-only" in sys.argv
    full = "--full" in sys.argv

    if db_only:
        print("Skipping sync (--db-only)")
    else:
        run_sync()

    # The original expression `full or not db_only and "--db-only" not in
    # sys.argv` reduces to `full or not db_only` (the second term repeated
    # the db_only check): any run without --db-only does a full upload,
    # matching the module docstring's "sync + upload everything" default.
    upload_to_hf(full=full or not db_only)
    print("\n=== Done! ===")


if __name__ == "__main__":
    main()
vector_search.py ADDED
@@ -0,0 +1,448 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Vector Search Module for Semantic Similarity
4
+
5
+ Optional module that adds semantic search capabilities using:
6
+ - Sentence embeddings (sentence-transformers)
7
+ - FAISS for efficient similarity search
8
+
9
+ Dependencies (optional, install with):
10
+ pip install sentence-transformers faiss-cpu numpy
11
+
12
+ If dependencies are not installed, the module gracefully degrades.
13
+ """
14
+
15
+ import sqlite3
16
+ import pickle
17
+ from pathlib import Path
18
+ from typing import Optional
19
+
20
+ # Try importing optional dependencies
21
+ VECTOR_SEARCH_AVAILABLE = False
22
+ try:
23
+ import numpy as np
24
+ NUMPY_AVAILABLE = True
25
+ except ImportError:
26
+ NUMPY_AVAILABLE = False
27
+ np = None
28
+
29
+ try:
30
+ import faiss
31
+ FAISS_AVAILABLE = True
32
+ except ImportError:
33
+ FAISS_AVAILABLE = False
34
+ faiss = None
35
+
36
+ try:
37
+ from sentence_transformers import SentenceTransformer
38
+ SENTENCE_TRANSFORMERS_AVAILABLE = True
39
+ except ImportError:
40
+ SENTENCE_TRANSFORMERS_AVAILABLE = False
41
+ SentenceTransformer = None
42
+
43
+ VECTOR_SEARCH_AVAILABLE = all([NUMPY_AVAILABLE, FAISS_AVAILABLE, SENTENCE_TRANSFORMERS_AVAILABLE])
44
+
45
+
46
class VectorSearchUnavailable:
    """Stand-in used when the optional vector-search dependencies are absent.

    Construction always succeeds with any arguments; calling any method
    raises RuntimeError with installation instructions.
    """

    def __init__(self, *args, **kwargs):
        pass

    def __getattr__(self, name):
        def _raise(*args, **kwargs):
            raise RuntimeError(
                "Vector search requires additional dependencies. Install with:\n"
                "pip install sentence-transformers faiss-cpu numpy"
            )
        return _raise
59
+
60
+
61
+ class VectorSearch:
62
+ """
63
+ Semantic search using sentence embeddings and FAISS.
64
+
65
+ Features:
66
+ - Generate embeddings for messages
67
+ - Build FAISS index for fast similarity search
68
+ - Find semantically similar messages (not just keyword match)
69
+ - Supports Hebrew and multilingual text
70
+
71
+ Example:
72
+ vs = VectorSearch(db_path='telegram.db')
73
+ vs.build_index() # One-time, can take a while
74
+
75
+ # Find similar messages
76
+ results = vs.search("מה קורה היום?", limit=10)
77
+ for msg_id, score, text in results:
78
+ print(f"{score:.3f}: {text[:50]}")
79
+ """
80
+
81
+ # Recommended models for multilingual/Hebrew support
82
+ MODELS = {
83
+ 'fast': 'paraphrase-multilingual-MiniLM-L12-v2', # Fast, good multilingual
84
+ 'accurate': 'paraphrase-multilingual-mpnet-base-v2', # More accurate
85
+ 'small': 'all-MiniLM-L6-v2', # Smallest, English-focused
86
+ }
87
+
88
+ def __init__(
89
+ self,
90
+ db_path: str = 'telegram.db',
91
+ model_name: str = 'fast',
92
+ index_path: Optional[str] = None
93
+ ):
94
+ """
95
+ Initialize vector search.
96
+
97
+ Args:
98
+ db_path: Path to SQLite database
99
+ model_name: Model preset ('fast', 'accurate', 'small') or full model name
100
+ index_path: Path to save/load FAISS index (default: db_path + '.faiss')
101
+ """
102
+ if not VECTOR_SEARCH_AVAILABLE:
103
+ raise RuntimeError(
104
+ "Vector search requires additional dependencies. Install with:\n"
105
+ "pip install sentence-transformers faiss-cpu numpy"
106
+ )
107
+
108
+ self.db_path = db_path
109
+ self.index_path = index_path or f"{db_path}.faiss"
110
+ self.id_map_path = f"{self.index_path}.ids"
111
+
112
+ # Load model
113
+ model_id = self.MODELS.get(model_name, model_name)
114
+ print(f"Loading embedding model: {model_id}")
115
+ self.model = SentenceTransformer(model_id)
116
+ self.dimension = self.model.get_sentence_embedding_dimension()
117
+
118
+ # Initialize FAISS index
119
+ self.index = None
120
+ self.id_map: list[int] = [] # Maps FAISS index position to message_id
121
+
122
+ # Try to load existing index
123
+ if Path(self.index_path).exists():
124
+ self.load_index()
125
+
126
+ def _get_connection(self) -> sqlite3.Connection:
127
+ """Get database connection."""
128
+ conn = sqlite3.connect(self.db_path)
129
+ conn.row_factory = sqlite3.Row
130
+ return conn
131
+
132
+ def encode(self, texts: list[str], batch_size: int = 32, show_progress: bool = True) -> 'np.ndarray':
133
+ """
134
+ Encode texts to embeddings.
135
+
136
+ Args:
137
+ texts: List of texts to encode
138
+ batch_size: Batch size for encoding
139
+ show_progress: Show progress bar
140
+
141
+ Returns:
142
+ numpy array of shape (n_texts, dimension)
143
+ """
144
+ return self.model.encode(
145
+ texts,
146
+ batch_size=batch_size,
147
+ show_progress_bar=show_progress,
148
+ convert_to_numpy=True,
149
+ normalize_embeddings=True # For cosine similarity
150
+ )
151
+
152
+ def build_index(
153
+ self,
154
+ batch_size: int = 1000,
155
+ min_text_length: int = 10,
156
+ use_gpu: bool = False
157
+ ) -> None:
158
+ """
159
+ Build FAISS index from all messages in database.
160
+
161
+ Args:
162
+ batch_size: Number of messages to process at once
163
+ min_text_length: Minimum text length to index
164
+ use_gpu: Use GPU acceleration if available
165
+ """
166
+ conn = self._get_connection()
167
+
168
+ # Count messages
169
+ cursor = conn.execute(
170
+ 'SELECT COUNT(*) FROM messages WHERE length(text_plain) >= ?',
171
+ (min_text_length,)
172
+ )
173
+ total = cursor.fetchone()[0]
174
+ print(f"Building index for {total} messages...")
175
+
176
+ # Create FAISS index
177
+ # Using IndexFlatIP (Inner Product) since we normalize embeddings
178
+ self.index = faiss.IndexFlatIP(self.dimension)
179
+
180
+ if use_gpu and faiss.get_num_gpus() > 0:
181
+ print("Using GPU acceleration")
182
+ self.index = faiss.index_cpu_to_gpu(
183
+ faiss.StandardGpuResources(),
184
+ 0,
185
+ self.index
186
+ )
187
+
188
+ self.id_map = []
189
+
190
+ # Process in batches
191
+ offset = 0
192
+ while offset < total:
193
+ cursor = conn.execute(
194
+ '''
195
+ SELECT id, text_plain FROM messages
196
+ WHERE length(text_plain) >= ?
197
+ ORDER BY id
198
+ LIMIT ? OFFSET ?
199
+ ''',
200
+ (min_text_length, batch_size, offset)
201
+ )
202
+
203
+ rows = cursor.fetchall()
204
+ if not rows:
205
+ break
206
+
207
+ ids = [row['id'] for row in rows]
208
+ texts = [row['text_plain'] for row in rows]
209
+
210
+ # Encode batch
211
+ embeddings = self.encode(texts, show_progress=False)
212
+
213
+ # Add to index
214
+ self.index.add(embeddings)
215
+ self.id_map.extend(ids)
216
+
217
+ offset += len(rows)
218
+ print(f"Indexed {offset}/{total} messages ({100*offset/total:.1f}%)")
219
+
220
+ conn.close()
221
+
222
+ # Save index
223
+ self.save_index()
224
+ print(f"Index built: {self.index.ntotal} vectors")
225
+
226
+ def save_index(self) -> None:
227
+ """Save FAISS index and ID map to disk."""
228
+ if self.index is None:
229
+ return
230
+
231
+ # Convert GPU index to CPU for saving
232
+ if hasattr(faiss, 'index_gpu_to_cpu'):
233
+ try:
234
+ cpu_index = faiss.index_gpu_to_cpu(self.index)
235
+ except:
236
+ cpu_index = self.index
237
+ else:
238
+ cpu_index = self.index
239
+
240
+ faiss.write_index(cpu_index, self.index_path)
241
+
242
+ with open(self.id_map_path, 'wb') as f:
243
+ pickle.dump(self.id_map, f)
244
+
245
+ print(f"Index saved to {self.index_path}")
246
+
247
+ def load_index(self) -> bool:
248
+ """Load FAISS index from disk."""
249
+ try:
250
+ self.index = faiss.read_index(self.index_path)
251
+ with open(self.id_map_path, 'rb') as f:
252
+ self.id_map = pickle.load(f)
253
+ print(f"Loaded index with {self.index.ntotal} vectors")
254
+ return True
255
+ except Exception as e:
256
+ print(f"Could not load index: {e}")
257
+ return False
258
+
259
+ def search(
260
+ self,
261
+ query: str,
262
+ limit: int = 10,
263
+ min_score: float = 0.0
264
+ ) -> list[tuple[int, float, str]]:
265
+ """
266
+ Search for semantically similar messages.
267
+
268
+ Args:
269
+ query: Search query text
270
+ limit: Maximum results to return
271
+ min_score: Minimum similarity score (0-1)
272
+
273
+ Returns:
274
+ List of (message_id, score, text) tuples
275
+ """
276
+ if self.index is None or self.index.ntotal == 0:
277
+ raise RuntimeError("Index not built. Call build_index() first.")
278
+
279
+ # Encode query
280
+ query_vector = self.encode([query], show_progress=False)
281
+
282
+ # Search FAISS
283
+ scores, indices = self.index.search(query_vector, limit)
284
+
285
+ # Get message texts from DB
286
+ conn = self._get_connection()
287
+ results = []
288
+
289
+ for score, idx in zip(scores[0], indices[0]):
290
+ if idx == -1 or score < min_score:
291
+ continue
292
+
293
+ message_id = self.id_map[idx]
294
+ cursor = conn.execute(
295
+ 'SELECT text_plain FROM messages WHERE id = ?',
296
+ (message_id,)
297
+ )
298
+ row = cursor.fetchone()
299
+ if row:
300
+ results.append((message_id, float(score), row['text_plain']))
301
+
302
+ conn.close()
303
+ return results
304
+
305
+ def find_similar(
306
+ self,
307
+ message_id: int,
308
+ limit: int = 10,
309
+ exclude_same_user: bool = False
310
+ ) -> list[tuple[int, float, str]]:
311
+ """
312
+ Find messages similar to a specific message.
313
+
314
+ Args:
315
+ message_id: ID of the reference message
316
+ limit: Maximum results to return
317
+ exclude_same_user: Exclude messages from same user
318
+
319
+ Returns:
320
+ List of (message_id, score, text) tuples
321
+ """
322
+ conn = self._get_connection()
323
+
324
+ # Get the reference message
325
+ cursor = conn.execute(
326
+ 'SELECT text_plain, from_id FROM messages WHERE id = ?',
327
+ (message_id,)
328
+ )
329
+ row = cursor.fetchone()
330
+ if not row:
331
+ conn.close()
332
+ return []
333
+
334
+ reference_text = row['text_plain']
335
+ reference_user = row['from_id']
336
+ conn.close()
337
+
338
+ # Search
339
+ results = self.search(reference_text, limit=limit * 2)
340
+
341
+ # Filter
342
+ filtered = []
343
+ for msg_id, score, text in results:
344
+ if msg_id == message_id:
345
+ continue
346
+ if exclude_same_user:
347
+ conn = self._get_connection()
348
+ cursor = conn.execute(
349
+ 'SELECT from_id FROM messages WHERE id = ?',
350
+ (msg_id,)
351
+ )
352
+ msg_row = cursor.fetchone()
353
+ conn.close()
354
+ if msg_row and msg_row['from_id'] == reference_user:
355
+ continue
356
+ filtered.append((msg_id, score, text))
357
+ if len(filtered) >= limit:
358
+ break
359
+
360
+ return filtered
361
+
362
+ def cluster_messages(
363
+ self,
364
+ n_clusters: int = 10,
365
+ sample_size: Optional[int] = None
366
+ ) -> dict[int, list[int]]:
367
+ """
368
+ Cluster messages by semantic similarity using K-means.
369
+
370
+ Args:
371
+ n_clusters: Number of clusters
372
+ sample_size: Number of messages to sample (None = all)
373
+
374
+ Returns:
375
+ Dict mapping cluster_id to list of message_ids
376
+ """
377
+ if self.index is None or self.index.ntotal == 0:
378
+ raise RuntimeError("Index not built. Call build_index() first.")
379
+
380
+ # Get vectors
381
+ n_vectors = self.index.ntotal
382
+ if sample_size and sample_size < n_vectors:
383
+ indices = np.random.choice(n_vectors, sample_size, replace=False)
384
+ vectors = np.array([self.index.reconstruct(int(i)) for i in indices])
385
+ ids = [self.id_map[i] for i in indices]
386
+ else:
387
+ vectors = np.array([self.index.reconstruct(i) for i in range(n_vectors)])
388
+ ids = self.id_map
389
+
390
+ # K-means clustering
391
+ kmeans = faiss.Kmeans(self.dimension, n_clusters, niter=20, verbose=True)
392
+ kmeans.train(vectors)
393
+
394
+ # Assign clusters
395
+ _, assignments = kmeans.index.search(vectors, 1)
396
+
397
+ # Group by cluster
398
+ clusters: dict[int, list[int]] = {i: [] for i in range(n_clusters)}
399
+ for msg_id, cluster_id in zip(ids, assignments.flatten()):
400
+ clusters[int(cluster_id)].append(msg_id)
401
+
402
+ return clusters
403
+
404
    @property
    def stats(self) -> dict:
        """Get index statistics.

        Returns a dict with:
            available: module-level flag for optional dependencies
            model: see NOTE below
            dimension: embedding dimensionality
            index_size: number of vectors currently in the FAISS index
            index_path: file path where the index is persisted
        """
        return {
            'available': VECTOR_SEARCH_AVAILABLE,
            # NOTE(review): this reports get_sentence_embedding_dimension(),
            # i.e. the embedding size — duplicating 'dimension' below.
            # Presumably the intent was the model *name*; confirm before
            # changing, as callers may rely on the current value.
            'model': self.model.get_sentence_embedding_dimension() if self.model else None,
            'dimension': self.dimension,
            'index_size': self.index.ntotal if self.index else 0,
            'index_path': self.index_path
        }
414
+
415
+
416
# Public alias: resolve to the real implementation when the optional
# dependencies are installed, otherwise to the unavailable stub.
SemanticSearch = VectorSearch if VECTOR_SEARCH_AVAILABLE else VectorSearchUnavailable
421
+
422
+
423
def check_dependencies() -> dict:
    """Report which optional dependencies are available.

    Returns:
        Dict mapping each dependency name, plus the combined
        'vector_search_available' flag, to a boolean.
    """
    flags = {
        'numpy': NUMPY_AVAILABLE,
        'faiss': FAISS_AVAILABLE,
        'sentence_transformers': SENTENCE_TRANSFORMERS_AVAILABLE,
    }
    flags['vector_search_available'] = VECTOR_SEARCH_AVAILABLE
    return flags
431
+
432
+
433
if __name__ == '__main__':
    # Quick self-report: show which optional dependencies are installed.
    print("=== Vector Search Dependencies ===")
    for name, available in check_dependencies().items():
        marker = "✓" if available else "✗"
        print(f" {marker} {name}")

    if not VECTOR_SEARCH_AVAILABLE:
        print("\nTo enable vector search, install dependencies:")
        print(" pip install sentence-transformers faiss-cpu numpy")
    else:
        print("\nVector search is available!")
        print("Usage:")
        print(" vs = VectorSearch('telegram.db')")
        print(" vs.build_index() # One-time indexing")
        print(" results = vs.search('מה קורה?')")