# indexer/store.py

import os
import sqlite3
import numpy as np
import faiss
import yaml


class Store:
    """
    Persist chunk embeddings and their metadata in two side-by-side stores.

    1. FAISS — stores dense vectors for fast similarity search
               Uses IndexHNSWFlat (wrapped in IndexIDMap2) instead of IndexFlatL2
               HNSW = Hierarchical Navigable Small World graph
               - IndexFlatL2  : exact, scans every vector (slow at scale)
               - IndexHNSWFlat: approximate graph-based navigation (fast; recall
                                is tuned through M / efSearch / efConstruction)

    2. SQLite — stores metadata about each chunk (table `chunks`) and about
                each indexed file (table `files`)

    The SQLite primary key of a chunk is also the ID of its vector in FAISS,
    so both stores must always be updated together.
    """

    # HNSW parameter — higher = more accurate but more memory
    # 32 is the standard default, good balance for this use case
    HNSW_M = 32

    def __init__(self, config_path="config.yaml"):
        """
        Load config, set up file paths, initialize FAISS index and SQLite.

        Args:
            config_path — path to a YAML file containing a `data_dir` key.
                          A relative `data_dir` is resolved against the
                          directory of the config file, not the CWD.

        Raises:
            OSError  — if the config file cannot be read or the data
                       directory cannot be created
            KeyError — if the config has no `data_dir` entry
        """
        config_path = os.path.abspath(config_path)
        # explicit encoding: the platform default is not UTF-8 everywhere
        with open(config_path, "r", encoding="utf-8") as f:
            config = yaml.safe_load(f)

        config_dir = os.path.dirname(config_path)
        data_dir = config["data_dir"]
        if os.path.isabs(data_dir):
            self.data_dir = data_dir
        else:
            self.data_dir = os.path.normpath(os.path.join(config_dir, data_dir))
        os.makedirs(self.data_dir, exist_ok=True)

        self.faiss_path = os.path.join(self.data_dir, "index.faiss")
        self.db_path    = os.path.join(self.data_dir, "metadata.db")

        self._init_db()
        self._load_or_create_index()

    def _init_db(self):
        """
        Create SQLite tables if they don't already exist.
        """
        conn = sqlite3.connect(self.db_path)
        try:
            conn.execute('''
                CREATE TABLE IF NOT EXISTS chunks (
                    id          INTEGER PRIMARY KEY,
                    filepath    TEXT    NOT NULL,
                    chunk_text  TEXT    NOT NULL,
                    chunk_index INTEGER,
                    FOREIGN KEY (filepath) REFERENCES files(filepath)
                )
            ''')

            conn.execute('''
                CREATE TABLE IF NOT EXISTS files (
                    filepath     TEXT PRIMARY KEY,
                    file_hash    TEXT NOT NULL,
                    total_chunks INTEGER
                )
            ''')

            conn.commit()
        finally:
            # always release the handle, even if a statement raised
            conn.close()

    def _load_or_create_index(self):
        """
        Load an existing FAISS index from disk, or set to None.
        The actual index is created on first add_chunks() call
        so we know the embedding dimension at that point.
        """
        if os.path.exists(self.faiss_path):
            self.index = faiss.read_index(self.faiss_path)
            print(f"[Store] Loaded FAISS index — {self.index.ntotal} vectors")
        else:
            self.index = None
            print("[Store] No existing index found — will create on first insert")

    def _create_hnsw_index(self, dimension: int):
        """
        Create a new, empty HNSW-based FAISS index and store it on self.index.

        Why HNSW over FlatL2:
            FlatL2   — exact search, scans every vector per query
            HNSWFlat — approximate search over a graph; much faster at scale,
                       with recall controlled by M / efSearch / efConstruction

        IndexIDMap2 wraps HNSW so vectors can be stored under custom integer
        IDs (the SQLite chunk IDs). HNSW itself cannot delete vectors, so
        removal is handled separately by rebuilding the index.

        Args:
            dimension — embedding size (384 for MiniLM and BGE-small)
        """
        hnsw_index = faiss.IndexHNSWFlat(dimension, self.HNSW_M)
        hnsw_index.hnsw.efSearch       = 64  # search quality — higher = better recall
        hnsw_index.hnsw.efConstruction = 64  # build quality  — higher = better graph
        self.index = faiss.IndexIDMap2(hnsw_index)
        print(f"[Store] Created HNSW index — dim={dimension}, M={self.HNSW_M}")

    def _execute(self, sql, params=()):
        """
        Run one SQL statement on a short-lived connection.

        Args:
            sql    — statement text with `?` placeholders
            params — values bound to the placeholders

        Returns:
            list[tuple] — fetched rows (empty for statements that return none)
        """
        conn = sqlite3.connect(self.db_path)
        try:
            rows = conn.execute(sql, params).fetchall()
            conn.commit()
            return rows
        finally:
            conn.close()

    def get_next_id(self):
        """
        Get the next available chunk ID from SQLite.

        Returns:
            int — 0 for an empty `chunks` table, otherwise MAX(id) + 1
        """
        highest = self._execute("SELECT MAX(id) FROM chunks")[0][0]
        return 0 if highest is None else highest + 1

    def add_chunks(self, chunks_with_metadata, embeddings):
        """
        Add new chunks and their embeddings to both FAISS and SQLite.

        The SQLite rows are only committed after the vectors were added and
        the index file was written, so a failure on the FAISS side leaves no
        orphaned metadata rows behind.

        Args:
            chunks_with_metadata (list[dict]) — from chunker.chunk_file()
                Each dict has: text, filepath, chunk_index
            embeddings (numpy.ndarray) — shape (num_chunks, embedding_dim)
                From embedder.embed_chunks()

        Raises:
            ValueError — if embeddings is not 2-D, its row count differs from
                         the number of chunks, or its width differs from the
                         dimension of the existing index
            KeyError   — if a chunk dict lacks one of the required keys
        """
        chunks = list(chunks_with_metadata)
        # FAISS needs a C-contiguous float32 matrix
        embeddings = np.ascontiguousarray(embeddings, dtype="float32")

        # nothing to store — avoid creating an index from an empty batch
        if not chunks and embeddings.size == 0:
            return

        if embeddings.ndim != 2 or embeddings.shape[0] != len(chunks):
            raise ValueError(
                "embeddings must have shape (num_chunks, embedding_dim); "
                f"got {embeddings.shape} for {len(chunks)} chunks"
            )

        start_id = self.get_next_id()
        ids = np.arange(start_id, start_id + len(chunks), dtype=np.int64)

        # build the rows first so a malformed chunk fails before any write
        rows = [
            (int(vector_id), chunk["filepath"], chunk["text"], chunk["chunk_index"])
            for vector_id, chunk in zip(ids, chunks)
        ]

        # create index on first insert — dimension comes from embeddings
        if self.index is None:
            self._create_hnsw_index(embeddings.shape[1])
        elif embeddings.shape[1] != self.index.d:
            raise ValueError(
                f"embedding dimension {embeddings.shape[1]} does not match "
                f"index dimension {self.index.d}"
            )

        conn = sqlite3.connect(self.db_path)
        try:
            conn.executemany(
                "INSERT INTO chunks (id, filepath, chunk_text, chunk_index) "
                "VALUES (?, ?, ?, ?)",
                rows
            )
            self.index.add_with_ids(embeddings, ids)
            faiss.write_index(self.index, self.faiss_path)
            conn.commit()
        finally:
            # closing without commit discards the pending inserts on failure
            conn.close()

    def save_file_info(self, filepath, file_hash, total_chunks):
        """
        Save or update file info in SQLite.

        Args:
            filepath     — file path or fake path e.g. "scifact://12345"
            file_hash    — SHA256 hash or doc_id string
            total_chunks — number of chunks this file was split into
        """
        self._execute(
            "INSERT OR REPLACE INTO files (filepath, file_hash, total_chunks) "
            "VALUES (?, ?, ?)",
            (filepath, file_hash, total_chunks)
        )

    def load_hashes(self):
        """
        Load all stored file hashes from SQLite.

        Returns:
            dict — {filepath: hash_string}
        """
        return dict(self._execute("SELECT filepath, file_hash FROM files"))

    def remove_file_chunks(self, filepath):
        """
        Delete all chunks for a file from both SQLite and FAISS.

        The SQLite deletes are only committed after the vectors were removed
        and the index file was rewritten, so a failure on the FAISS side
        leaves the metadata untouched.

        Args:
            filepath — the filepath to remove
        """
        conn = sqlite3.connect(self.db_path)
        try:
            found = conn.execute(
                "SELECT id FROM chunks WHERE filepath = ?", (filepath,)
            ).fetchall()

            conn.execute("DELETE FROM chunks WHERE filepath = ?", (filepath,))
            conn.execute("DELETE FROM files  WHERE filepath = ?", (filepath,))

            if found and self.index is not None:
                id_array = np.array([row[0] for row in found], dtype=np.int64)
                self._remove_from_index(id_array)
                faiss.write_index(self.index, self.faiss_path)

            conn.commit()
        finally:
            # closing without commit discards the pending deletes on failure
            conn.close()

    def _remove_from_index(self, id_array):
        """
        Remove the given vector IDs from the in-memory FAISS index.

        Tries the index's own remove_ids() first. HNSW graph indexes do not
        implement removal and raise RuntimeError, in which case the index is
        rebuilt from the vectors that should remain.

        Args:
            id_array (numpy.ndarray) — int64 vector IDs to drop
        """
        try:
            self.index.remove_ids(id_array)
        except RuntimeError:
            # removal not supported by this index type — rebuild instead
            self._rebuild_index_without(id_array)

    def _rebuild_index_without(self, id_array):
        """
        Replace self.index with a fresh HNSW index that omits some IDs.

        Reads every stored vector back from the current index, keeps those
        whose ID is not in id_array, and re-inserts them into a new index
        built with this class's HNSW settings. Cost grows with the size of
        the whole index, not with the number of removed vectors.

        Args:
            id_array (numpy.ndarray) — int64 vector IDs to leave out
        """
        previous = self.index
        # id_map[i] is the external ID of the i-th vector in the wrapped index
        stored_ids = faiss.vector_to_array(previous.id_map)
        keep = ~np.isin(stored_ids, id_array)
        if keep.all():
            return  # none of the IDs are present — nothing to rebuild

        vectors = previous.index.reconstruct_n(0, previous.ntotal)

        self._create_hnsw_index(previous.d)
        try:
            if keep.any():
                self.index.add_with_ids(
                    np.ascontiguousarray(vectors[keep], dtype="float32"),
                    np.ascontiguousarray(stored_ids[keep], dtype=np.int64)
                )
        except Exception:
            # keep the old index usable if the rebuild fails midway
            self.index = previous
            raise

    def get_total_vectors(self):
        """
        Return how many vectors are in the FAISS index.

        Returns:
            int — number of vectors, or 0 if index is empty
        """
        if self.index is None:
            return 0
        return self.index.ntotal


if __name__ == "__main__":
    # Manual smoke test: open the store with the default config, insert three
    # sample chunks with random 384-wide vectors, and print the vector count
    # before and after the insert.
    store = Store()

    samples = (
        ("quarterly budget report summary",      "/docs/report.pdf",   0),
        ("revenue increased by fifteen percent", "/docs/report.pdf",   1),
        ("python machine learning tutorial",     "/docs/tutorial.txt", 0),
    )
    demo_chunks = [
        {"text": text, "filepath": path, "chunk_index": position}
        for text, path, position in samples
    ]

    # one random vector per sample chunk
    demo_vectors = np.random.rand(len(demo_chunks), 384).astype("float32")

    count_before = store.get_total_vectors()
    print(f"Vectors before: {count_before}")

    store.add_chunks(demo_chunks, demo_vectors)

    count_after = store.get_total_vectors()
    print(f"Vectors after:  {count_after}")