import os
import json
import faiss
import numpy as np
import argparse
import pickle
from typing import List, Dict
from .embed import get_embedder

def load_processed_data(processed_dir: str) -> List[Dict]:
    chunks = []
    # Always glob for all JSONs to support additive ingestion
    import glob
    json_files = glob.glob(os.path.join(processed_dir, "*.json"))
    
    print(f"Found {len(json_files)} existing documents to index.")
    
    for f_path in json_files:
        if f_path.endswith("manifest.json"): continue
        try:
            with open(f_path, 'r') as f:
                doc_data = json.load(f)
                if 'chunks' in doc_data:
                    chunks.extend(doc_data['chunks'])
        except Exception as e:
            print(f"Error loading {f_path}: {e}")
            
    return chunks

def build_index(processed_dir: str, output_dir: str):
    os.makedirs(output_dir, exist_ok=True)
    
    print("Loading chunks...")
    chunks = load_processed_data(processed_dir)
    print(f"Loaded {len(chunks)} chunks.")
    
    if not chunks:
        print("No chunks found. Exiting.")
        return

    texts = [c['content'] for c in chunks]
    metadatas = [c['metadata'] for c in chunks]
    
    print("Generating embeddings...")
    embedder = get_embedder()
    embeddings = embedder.embed(texts)
    
    dimension = embeddings.shape[1]
    print(f"Embedding dimension: {dimension}")
    
    print("Building FAISS index...")
    # Using simple IndexFlatP, or IVF if dataset large. 
    # For "local lightweight", FlatL2 is safest and exact.
    # Normalize for cosine similarity if using IP (Inner Product).
    # SentenceTransformers are usually cosine-sim optimized.
    
    # Normalize embeddings for Cosine Similarity with IndexFlatIP
    faiss.normalize_L2(embeddings)
    index = faiss.IndexFlatIP(dimension)
    index.add(embeddings)
    
    print(f"Index built with {index.ntotal} vectors.")
    
    # Save index
    faiss.write_index(index, os.path.join(output_dir, "vector.index"))
    
    # Save metadatas (chunks map)
    # We need to map index ID -> Chunk Metadata + Content
    with open(os.path.join(output_dir, "doc_store.pkl"), 'wb') as f:
        pickle.dump(chunks, f)
        
    print(f"Index saved to {output_dir}")

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--processed", required=True, help="Path to processed data directory")
    parser.add_argument("--out", required=True, help="Output directory for index")
    args = parser.parse_args()
    
    build_index(args.processed, args.out)