"""Ingest PDFs, merge their pages, split into token-sized chunks, and embed them.

Pipeline: load every PDF page, merge pages per source file while recording
character ranges per page, split the merged text into overlapping chunks,
map each chunk back to its originating page, embed the chunks with a
fine-tuned Arctic model, and pickle both chunks and embeddings to disk.
"""

import json
import os
import pickle
import shutil
import time
from collections import defaultdict
from functools import lru_cache
from pathlib import Path

import numpy as np
import tiktoken
import torch
import torch.nn.functional as F
from dotenv import load_dotenv
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import DirectoryLoader, PyPDFLoader
from langchain_community.vectorstores import Qdrant
from langchain_core.documents import Document
from qdrant_client import QdrantClient
from qdrant_client.models import Distance, VectorParams
from transformers import AutoModel, AutoTokenizer

# Load environment variables
load_dotenv()


@lru_cache(maxsize=1)
def _gpt4o_mini_encoding():
    """Return (and cache) the tiktoken encoder used to measure chunk length.

    Cached because tiktoken_len is the splitter's length_function and is
    invoked once per candidate chunk; rebuilding the encoder each call is
    pure overhead.
    """
    return tiktoken.encoding_for_model("gpt-4o-mini")


def tiktoken_len(text):
    """Count tokens using the gpt-4o-mini tokenizer."""
    return len(_gpt4o_mini_encoding().encode(text))


def add_page_info_to_splits(splits):
    """Annotate each split with the page number it came from.

    Uses the chunk's character offset (``start_index``, added by the
    splitter) against the ``page_ranges`` metadata recorded when pages were
    merged. The first page range the chunk overlaps wins.

    Args:
        splits: list of langchain ``Document`` chunks (mutated in place).

    Returns:
        The same list, with a ``page`` key added where a match was found.
    """
    for split in splits:
        start_pos = split.metadata.get("start_index", 0)
        end_pos = start_pos + len(split.page_content)

        if "page_ranges" not in split.metadata:
            continue
        for page_range in split.metadata["page_ranges"]:
            # Standard interval-overlap test between the chunk and the page.
            if start_pos <= page_range["end"] and end_pos >= page_range["start"]:
                split.metadata["page"] = page_range["page"]
                break
    return splits


def clean_directory(directory_path):
    """Remove *directory_path* (if present) and recreate it empty.

    Args:
        directory_path: path-like; created (with parents) after cleaning.
    """
    path = Path(directory_path)
    if path.exists():
        print(f"Cleaning directory: {directory_path}")
        shutil.rmtree(path)
        # Wait a moment to ensure OS releases the directory handles
        time.sleep(1)
    path.mkdir(parents=True, exist_ok=True)
    print(f"Created clean directory: {directory_path}")


class ArcticEmbedder:
    """Mean-pooled, L2-normalized sentence embeddings from a HF encoder."""

    def __init__(self, model_name):
        """Load tokenizer + model and move the model to GPU when available."""
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModel.from_pretrained(model_name)
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model.to(self.device)

    def _mean_pooling(self, model_output, attention_mask):
        """Average token embeddings, weighting out padding positions."""
        token_embeddings = model_output.last_hidden_state
        mask = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
        # clamp guards against division by zero for all-padding rows.
        return torch.sum(token_embeddings * mask, 1) / torch.clamp(mask.sum(1), min=1e-9)

    def encode(self, texts, batch_size=32):
        """Embed *texts* in batches.

        Args:
            texts: sequence of strings.
            batch_size: number of texts tokenized/forwarded per step.

        Returns:
            np.ndarray of shape (len(texts), dim) with unit-norm rows.
        """
        if not texts:
            # np.concatenate raises on an empty list; return an empty
            # (0, dim) array so callers can handle zero inputs uniformly.
            return np.empty((0, self.model.config.hidden_size), dtype=np.float32)

        all_embeddings = []
        for i in range(0, len(texts), batch_size):
            batch = texts[i:i + batch_size]
            encoded_input = self.tokenizer(
                batch, padding=True, truncation=True, return_tensors="pt"
            ).to(self.device)
            with torch.no_grad():
                model_output = self.model(**encoded_input)
            batch_embeddings = self._mean_pooling(
                model_output, encoded_input["attention_mask"]
            )
            batch_embeddings = F.normalize(batch_embeddings, p=2, dim=1)
            all_embeddings.append(batch_embeddings.cpu().numpy())
        return np.concatenate(all_embeddings)


def _merge_pdf_pages(all_docs):
    """Merge per-page documents into one Document per source PDF.

    Records, in ``page_ranges`` metadata, the character span each page
    occupies inside the merged text so chunks can later be mapped back to
    their original (1-indexed) page number.
    """
    docs_by_source = defaultdict(list)
    for doc in all_docs:
        docs_by_source[doc.metadata.get("source", "")].append(doc)

    merged_docs = []
    for source, source_docs in docs_by_source.items():
        # Sort by page number if available
        source_docs.sort(key=lambda x: x.metadata.get("page", 0))
        filename = os.path.basename(source)

        merged_content = ""
        page_ranges = []
        for doc in source_docs:
            # Page numbers are 1-indexed for human readability.
            page_num = doc.metadata.get("page", 0) + 1
            if merged_content:
                # Separator between pages for clarity.
                merged_content += "\n\n"
            start_pos = len(merged_content)
            merged_content += doc.page_content
            page_ranges.append({
                "start": start_pos,
                "end": len(merged_content),
                "page": page_num,
                "source": filename,
            })

        merged_docs.append(Document(
            page_content=merged_content,
            metadata={
                "source": filename,
                "title": filename,
                "page_count": len(source_docs),
                "merged": True,
                "page_ranges": page_ranges,  # for chunk→page mapping later
            },
        ))
    return merged_docs


def _split_into_chunks(merged_docs):
    """Split merged documents into overlapping token-sized chunks with page info."""
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=150,
        chunk_overlap=100,
        length_function=tiktoken_len,
        add_start_index=True,  # needed by add_page_info_to_splits
    )
    raw_splits = text_splitter.split_documents(merged_docs)
    return add_page_info_to_splits(raw_splits)


def _embed_chunks(embedding_model, split_chunks, batch_size=50):
    """Embed every chunk and pair each with its text, metadata and an id."""
    embedded_docs = []
    # Embed in batches to avoid API rate limits
    for i in range(0, len(split_chunks), batch_size):
        batch = split_chunks[i:i + batch_size]
        texts = [doc.page_content for doc in batch]
        embeddings = embedding_model.encode(texts)
        for j, doc in enumerate(batch):
            embedded_docs.append({
                "id": i + j,
                "text": doc.page_content,
                "metadata": doc.metadata,
                "embedding": embeddings[j],
            })
        print(f"Embedded {min(i + batch_size, len(split_chunks))}/{len(split_chunks)} chunks")
    return embedded_docs


def process_pdfs():
    """Process PDFs and create vectorstore.

    Side effects: wipes and recreates ``data/processed_data``, then writes
    ``chunks.pkl`` (split chunks) and ``embedded_docs.pkl`` (chunks with
    embeddings) into it.
    """
    print("Processing PDFs...")

    # Start from a clean output directory every run.
    processed_data_dir = Path("data/processed_data")
    clean_directory(processed_data_dir)

    # Load all PDF documents (each page arrives as a separate document).
    pdf_path = "notebook_version_clean/data/"
    print(f"Loading PDFs from: {pdf_path}")
    loader = DirectoryLoader(pdf_path, glob="*.pdf", loader_cls=PyPDFLoader)
    all_docs = loader.load()
    print(f"Loaded {len(all_docs)} document pages.")

    merged_docs = _merge_pdf_pages(all_docs)
    print(f"Created {len(merged_docs)} merged documents.")

    split_chunks = _split_into_chunks(merged_docs)
    print(f"Created {len(split_chunks)} chunks.")

    # Save chunks for later use
    with open(processed_data_dir / "chunks.pkl", "wb") as f:
        pickle.dump(split_chunks, f)

    # Initialize custom embedding model
    try:
        embedding_model = ArcticEmbedder(
            "kamkol/ab_testing_finetuned_arctic_ft-36dfff22-0696-40d2-b3bf-268fe2ff2aec"
        )
        print("Successfully loaded ArcticEmbedder model")
    except Exception as e:
        print(f"Error loading model: {str(e)}")
        raise RuntimeError(f"Error initializing ArcticEmbedder model: {str(e)}")

    print("Embedding document chunks (this may take a while)...")
    embedded_docs = _embed_chunks(embedding_model, split_chunks)

    # Save the embedded docs for later use
    with open(processed_data_dir / "embedded_docs.pkl", "wb") as f:
        pickle.dump(embedded_docs, f)

    print("Processing complete. All data saved to data/processed_data/")


if __name__ == "__main__":
    process_pdfs()