#!/usr/bin/env python3
"""
Telegram Chat Search Utilities (Optimized)

Features:
- Full-text search with BM25 ranking
- LRU caching for repeated queries
- Fuzzy search with trigram similarity
- Thread traversal with DFS/BFS
- Autocomplete suggestions

Usage:
    python search.py [options]
    python search.py "שלום" --db telegram.db
    python search.py "link" --user user123 --fuzzy
"""

import sqlite3
import argparse
from datetime import datetime
from typing import Optional
from functools import lru_cache

from data_structures import LRUCache, Trie, TrigramIndex, ReplyGraph, lru_cached


class TelegramSearch:
    """
    High-performance search interface for indexed Telegram messages.

    Features:
    - Full-text search with FTS5 and BM25 ranking
    - Query result caching (LRU)
    - Fuzzy/approximate search with trigrams
    - Thread reconstruction with graph traversal
    - Autocomplete for usernames and common terms
    """

    def __init__(self, db_path: str = 'telegram.db', cache_size: int = 1000):
        """Open the database and prepare lazily-built indexes.

        Args:
            db_path: Path to the SQLite database produced by the indexer.
            cache_size: Max number of entries held by the query LRU cache.
        """
        self.db_path = db_path
        self.conn = sqlite3.connect(db_path)
        # Row factory lets us turn rows into dicts by column name.
        self.conn.row_factory = sqlite3.Row

        # In-memory indexes are built on first use (None = not built yet).
        self.query_cache = LRUCache(maxsize=cache_size)
        self.user_trie: Optional[Trie] = None
        self.trigram_index: Optional[TrigramIndex] = None
        self.reply_graph: Optional[ReplyGraph] = None

    def close(self) -> None:
        """Close the underlying SQLite connection."""
        self.conn.close()

    def __enter__(self):
        return self

    def __exit__(self, *args):
        self.close()

    # ==========================================
    # FULL-TEXT SEARCH
    # ==========================================

    def search(
        self,
        query: str,
        user_id: Optional[str] = None,
        from_date: Optional[int] = None,
        to_date: Optional[int] = None,
        has_links: Optional[bool] = None,
        has_mentions: Optional[bool] = None,
        has_media: Optional[bool] = None,
        limit: int = 100,
        offset: int = 0,
        use_cache: bool = True
    ) -> list[dict]:
        """
        Full-text search with BM25 ranking and optional filters.

        Args:
            query: FTS5 query (supports AND, OR, NOT, "phrase", prefix*)
            user_id: Filter by user ID
            from_date: Unix timestamp lower bound
            to_date: Unix timestamp upper bound
            has_links/has_mentions/has_media: Boolean filters
            limit: Max results
            offset: Pagination offset
            use_cache: Whether to use LRU cache

        Returns:
            List of message dicts with relevance scores
        """
        # Cache key covers every argument that affects the result set.
        cache_key = (
            f"search:{query}:{user_id}:{from_date}:{to_date}:"
            f"{has_links}:{has_mentions}:{has_media}:{limit}:{offset}"
        )
        if use_cache:
            cached = self.query_cache.get(cache_key)
            if cached is not None:
                return cached

        # Build parameterized WHERE conditions (never string-interpolate values).
        conditions = []
        params = []

        if user_id:
            conditions.append("m.from_id = ?")
            params.append(user_id)
        # Explicit None checks: a timestamp of 0 (the epoch) is falsy but valid.
        if from_date is not None:
            conditions.append("m.date_unixtime >= ?")
            params.append(from_date)
        if to_date is not None:
            conditions.append("m.date_unixtime <= ?")
            params.append(to_date)
        if has_links is not None:
            conditions.append("m.has_links = ?")
            params.append(1 if has_links else 0)
        if has_mentions is not None:
            conditions.append("m.has_mentions = ?")
            params.append(1 if has_mentions else 0)
        if has_media is not None:
            conditions.append("m.has_media = ?")
            params.append(1 if has_media else 0)

        where_clause = " AND ".join(conditions) if conditions else "1=1"

        # bm25() returns lower (more negative) scores for better matches,
        # so ascending ORDER BY relevance puts the best hits first.
        sql = f'''
            SELECT
                m.id, m.date, m.date_unixtime, m.from_name, m.from_id,
                m.text_plain, m.reply_to_message_id, m.forwarded_from,
                m.has_links, m.has_mentions, m.has_media,
                bm25(messages_fts, 1.0, 0.5) as relevance
            FROM messages_fts
            JOIN messages m ON messages_fts.rowid = m.id
            WHERE messages_fts MATCH ?
              AND {where_clause}
            ORDER BY relevance
            LIMIT ? OFFSET ?
        '''

        params = [query] + params + [limit, offset]
        cursor = self.conn.execute(sql, params)
        results = [dict(row) for row in cursor.fetchall()]

        if use_cache:
            self.query_cache.put(cache_key, results)

        return results

    def search_prefix(self, prefix: str, limit: int = 100) -> list[dict]:
        """
        Search using prefix matching (autocomplete-style).

        Uses FTS5 prefix index for fast prefix queries.
        """
        # FTS5 prefix search syntax
        query = f'{prefix}*'
        return self.search(query, limit=limit, use_cache=True)

    # ==========================================
    # FUZZY SEARCH
    # ==========================================

    def fuzzy_search(
        self,
        query: str,
        threshold: float = 0.3,
        limit: int = 50
    ) -> list[dict]:
        """
        Fuzzy search using trigram similarity.

        Finds messages even with typos or slight variations.

        Args:
            query: Search query
            threshold: Minimum similarity (0-1)
            limit: Max results

        Returns:
            List of message dicts, each annotated with a 'similarity' score
        """
        # Build trigram index if not exists
        if self.trigram_index is None:
            self._build_trigram_index()

        # Search trigram index
        matches = self.trigram_index.search(query, threshold=threshold, limit=limit)

        # Fetch full messages and attach the similarity score
        results = []
        for msg_id, similarity in matches:
            cursor = self.conn.execute(
                'SELECT * FROM messages WHERE id = ?', (msg_id,)
            )
            row = cursor.fetchone()
            if row:
                msg = dict(row)
                msg['similarity'] = similarity
                results.append(msg)

        return results

    def _build_trigram_index(self) -> None:
        """Build in-memory trigram index from database."""
        print("Building trigram index (first time only)...")
        self.trigram_index = TrigramIndex()

        cursor = self.conn.execute(
            'SELECT id, text_plain FROM messages WHERE text_plain IS NOT NULL'
        )
        for row in cursor.fetchall():
            self.trigram_index.add(row[0], row[1])

        print(f"Trigram index built: {len(self.trigram_index)} documents")

    # ==========================================
    # THREAD TRAVERSAL
    # ==========================================

    def get_thread_dfs(self, message_id: int) -> list[dict]:
        """
        Get full conversation thread using DFS traversal.

        Returns messages in depth-first order (follows reply chains deep).
        """
        if self.reply_graph is None:
            self._build_reply_graph()

        # Find thread root
        root_id = self.reply_graph.get_thread_root(message_id)

        # DFS traversal
        msg_ids = self.reply_graph.dfs_descendants(root_id)

        # Fetch messages in order
        return self._fetch_messages_ordered(msg_ids)

    def get_thread_bfs(self, message_id: int) -> list[dict]:
        """
        Get conversation thread using BFS traversal.

        Returns messages level by level.
        """
        if self.reply_graph is None:
            self._build_reply_graph()

        root_id = self.reply_graph.get_thread_root(message_id)
        msg_ids = self.reply_graph.bfs_descendants(root_id)

        return self._fetch_messages_ordered(msg_ids)

    def get_thread_with_depth(self, message_id: int) -> list[tuple[dict, int]]:
        """
        Get thread with depth information for each message.

        Returns list of (message, depth) tuples.
        """
        if self.reply_graph is None:
            self._build_reply_graph()

        root_id = self.reply_graph.get_thread_root(message_id)
        items = self.reply_graph.bfs_with_depth(root_id)

        results = []
        for msg_id, depth in items:
            cursor = self.conn.execute(
                'SELECT * FROM messages WHERE id = ?', (msg_id,)
            )
            row = cursor.fetchone()
            if row:
                results.append((dict(row), depth))

        return results

    def get_replies(self, message_id: int) -> list[dict]:
        """Get all direct replies to a message."""
        if self.reply_graph is None:
            self._build_reply_graph()

        child_ids = self.reply_graph.get_children(message_id)
        return self._fetch_messages_ordered(child_ids)

    def get_conversation_path(self, message_id: int) -> list[dict]:
        """Get the path from thread root to this message."""
        if self.reply_graph is None:
            self._build_reply_graph()

        path_ids = self.reply_graph.get_thread_path(message_id)
        return self._fetch_messages_ordered(path_ids)

    def _build_reply_graph(self) -> None:
        """Build in-memory reply graph from database."""
        print("Building reply graph (first time only)...")
        self.reply_graph = ReplyGraph()

        cursor = self.conn.execute(
            'SELECT id, reply_to_message_id FROM messages'
        )
        for row in cursor.fetchall():
            self.reply_graph.add_message(row[0], row[1])

        print(f"Reply graph built: {self.reply_graph.stats}")

    def _fetch_messages_ordered(self, msg_ids: list[int]) -> list[dict]:
        """Fetch messages preserving the order of IDs."""
        if not msg_ids:
            return []

        placeholders = ','.join('?' * len(msg_ids))
        cursor = self.conn.execute(
            f'SELECT * FROM messages WHERE id IN ({placeholders})',
            msg_ids
        )

        # Create lookup dict; SQL IN does not preserve input order.
        msg_map = {row['id']: dict(row) for row in cursor.fetchall()}

        # Return in original order, skipping ids missing from the table.
        return [msg_map[mid] for mid in msg_ids if mid in msg_map]

    # ==========================================
    # AUTOCOMPLETE
    # ==========================================

    def autocomplete_user(self, prefix: str, limit: int = 10) -> list[str]:
        """
        Autocomplete username suggestions.

        Uses Trie for O(p + k) lookup where p=prefix length, k=results.
        """
        if self.user_trie is None:
            self._build_user_trie()

        return self.user_trie.autocomplete(prefix, limit=limit)

    def _build_user_trie(self) -> None:
        """Build Trie index for usernames."""
        self.user_trie = Trie()

        cursor = self.conn.execute('SELECT user_id, display_name FROM users')
        for row in cursor.fetchall():
            # Index both display name and raw user id so either can be completed.
            if row['display_name']:
                self.user_trie.insert(row['display_name'], data=row['user_id'])
            if row['user_id']:
                self.user_trie.insert(row['user_id'], data=row['user_id'])

    # ==========================================
    # CONVENIENCE METHODS
    # ==========================================

    def search_by_user(self, user_id: str, limit: int = 100) -> list[dict]:
        """Get all messages from a specific user."""
        sql = '''
            SELECT * FROM messages
            WHERE from_id = ?
            ORDER BY date_unixtime DESC
            LIMIT ?
        '''
        cursor = self.conn.execute(sql, (user_id, limit))
        return [dict(row) for row in cursor.fetchall()]

    def search_by_date_range(
        self,
        from_date: int,
        to_date: int,
        limit: int = 1000
    ) -> list[dict]:
        """Get messages within a date range."""
        sql = '''
            SELECT * FROM messages
            WHERE date_unixtime BETWEEN ? AND ?
            ORDER BY date_unixtime ASC
            LIMIT ?
        '''
        cursor = self.conn.execute(sql, (from_date, to_date, limit))
        return [dict(row) for row in cursor.fetchall()]

    def get_links(self, limit: int = 100) -> list[dict]:
        """Get all extracted links."""
        sql = '''
            SELECT e.value as url, e.message_id, m.from_name, m.date
            FROM entities e
            JOIN messages m ON e.message_id = m.id
            WHERE e.type = 'link'
            ORDER BY m.date_unixtime DESC
            LIMIT ?
        '''
        cursor = self.conn.execute(sql, (limit,))
        return [dict(row) for row in cursor.fetchall()]

    def get_mentions(self, username: Optional[str] = None, limit: int = 100) -> list[dict]:
        """Get mentions, optionally filtered by username."""
        if username:
            sql = '''
                SELECT e.value as mention, e.message_id, m.from_name, m.text_plain, m.date
                FROM entities e
                JOIN messages m ON e.message_id = m.id
                WHERE e.type = 'mention' AND e.value LIKE ?
                ORDER BY m.date_unixtime DESC
                LIMIT ?
            '''
            cursor = self.conn.execute(sql, (f'%{username}%', limit))
        else:
            sql = '''
                SELECT e.value as mention, e.message_id, m.from_name, m.text_plain, m.date
                FROM entities e
                JOIN messages m ON e.message_id = m.id
                WHERE e.type = 'mention'
                ORDER BY m.date_unixtime DESC
                LIMIT ?
            '''
            cursor = self.conn.execute(sql, (limit,))

        return [dict(row) for row in cursor.fetchall()]

    @property
    def cache_stats(self) -> dict:
        """Get cache statistics."""
        return self.query_cache.stats


def format_result(msg: dict, show_depth: bool = False, depth: int = 0) -> str:
    """Format a message for display.

    Args:
        msg: Message dict as returned by the search methods.
        show_depth: If True, indent the line by `depth` levels.
        depth: Thread depth used for indentation.
    """
    date_str = msg.get('date', 'Unknown date')
    from_name = msg.get('from_name', 'Unknown')
    # text_plain may be stored as NULL: the key exists with value None, so
    # .get('text_plain', '') would return None and crash on slicing/len.
    full_text = msg.get('text_plain') or ''
    text = full_text[:200]
    if len(full_text) > 200:
        text += '...'

    flags = []
    if msg.get('has_links'):
        flags.append('[link]')
    if msg.get('has_mentions'):
        flags.append('[mention]')
    if msg.get('has_media'):
        flags.append('[media]')
    # Scores can legitimately be 0.0, so test for presence rather than truthiness.
    if msg.get('similarity') is not None:
        flags.append(f'[sim:{msg["similarity"]:.2f}]')
    if msg.get('relevance') is not None:
        flags.append(f'[rel:{abs(msg["relevance"]):.2f}]')

    flags_str = ' '.join(flags)
    indent = '  ' * depth if show_depth else ''

    return f"{indent}[{date_str}] {from_name}: {text} {flags_str}"


def main():
    """CLI entry point: parse arguments and dispatch to the search API."""
    parser = argparse.ArgumentParser(description='Search indexed Telegram messages')
    parser.add_argument('query', nargs='?', help='Search query')
    parser.add_argument('--db', default='telegram.db', help='Database path')
    parser.add_argument('--user', help='Filter by user ID')
    parser.add_argument('--from-date', help='From date (YYYY-MM-DD)')
    parser.add_argument('--to-date', help='To date (YYYY-MM-DD)')
    parser.add_argument('--links', action='store_true', help='Show only messages with links')
    parser.add_argument('--mentions', action='store_true', help='Show only messages with mentions')
    parser.add_argument('--media', action='store_true', help='Show only messages with media')
    parser.add_argument('--limit', type=int, default=50, help='Max results')
    parser.add_argument('--fuzzy', action='store_true', help='Use fuzzy search')
    parser.add_argument('--threshold', type=float, default=0.3, help='Fuzzy match threshold')
    parser.add_argument('--thread', type=int, help='Show thread for message ID')
    parser.add_argument('--list-links', action='store_true', help='List all extracted links')
    parser.add_argument('--list-mentions', action='store_true', help='List all mentions')
    parser.add_argument('--autocomplete', help='Autocomplete username')
    parser.add_argument('--cache-stats', action='store_true', help='Show cache statistics')

    args = parser.parse_args()

    with TelegramSearch(args.db) as search:
        # Show thread ('is not None' so message id 0 is still honored)
        if args.thread is not None:
            print(f"Thread containing message {args.thread}:\n")
            thread = search.get_thread_with_depth(args.thread)
            for msg, depth in thread:
                print(format_result(msg, show_depth=True, depth=depth))
            return

        # Autocomplete
        if args.autocomplete:
            suggestions = search.autocomplete_user(args.autocomplete)
            print(f"Suggestions for '{args.autocomplete}':")
            for s in suggestions:
                print(f"  {s}")
            return

        # List links
        if args.list_links:
            links = search.get_links(args.limit)
            print(f"Found {len(links)} links:\n")
            for link in links:
                print(f"  {link['url']}")
                print(f"    From: {link['from_name']} at {link['date']}")
            return

        # List mentions
        if args.list_mentions:
            mentions = search.get_mentions(limit=args.limit)
            print(f"Found {len(mentions)} mentions:\n")
            for m in mentions:
                print(f"  {m['mention']} by {m['from_name']}")
            return

        # Cache stats
        if args.cache_stats:
            print(f"Cache stats: {search.cache_stats}")
            return

        if not args.query:
            parser.print_help()
            return

        # Parse dates into Unix timestamps
        from_ts = None
        to_ts = None
        if args.from_date:
            from_ts = int(datetime.strptime(args.from_date, '%Y-%m-%d').timestamp())
        if args.to_date:
            to_ts = int(datetime.strptime(args.to_date, '%Y-%m-%d').timestamp())

        # Fuzzy or regular search
        if args.fuzzy:
            results = search.fuzzy_search(
                query=args.query,
                threshold=args.threshold,
                limit=args.limit
            )
            print(f"Found {len(results)} fuzzy matches for '{args.query}':\n")
        else:
            results = search.search(
                query=args.query,
                user_id=args.user,
                from_date=from_ts,
                to_date=to_ts,
                has_links=True if args.links else None,
                has_mentions=True if args.mentions else None,
                has_media=True if args.media else None,
                limit=args.limit
            )
            print(f"Found {len(results)} results for '{args.query}':\n")

        for msg in results:
            print(format_result(msg))
            print()

        # Show cache stats
        print(f"\nCache: {search.cache_stats}")


if __name__ == '__main__':
    main()