"""
embedding.py — SPECTER2 embedding generation with caching.

Uses AutoAdapterModel (from the `adapters` library) with the allenai/specter2
proximity adapter, which is the correct way to load SPECTER2 for document
similarity and clustering tasks.

Reference: https://huggingface.co/allenai/specter2
"""

import os
import pickle
import hashlib
import numpy as np
import pandas as pd
from typing import Optional
from pathlib import Path


# Directory holding the pickled embedding caches. The path is relative, so it
# resolves against the current working directory of the running process.
CACHE_DIR = Path("cache")
# NOTE: module-level side effect — importing this module creates the directory.
CACHE_DIR.mkdir(exist_ok=True)

# Identifier passed to `from_pretrained` for both the tokenizer and the base model.
MODEL_NAME = "allenai/specter2_base"
# Identifier passed to `load_adapter` on top of the base model.
ADAPTER_NAME = "allenai/specter2"   # proximity adapter (for similarity / clustering)


def _get_cache_key(texts: list[str]) -> str:
    """
    Derive a stable hex digest identifying a list of input texts.

    The texts are concatenated with a ``"||"`` separator, encoded with the
    default string encoding, and hashed with MD5. MD5 serves only as a
    fingerprint for cache file names here, not as a security measure.

    Returns:
        32-character lowercase hexadecimal digest.
    """
    hasher = hashlib.md5()
    hasher.update("||".join(texts).encode())
    return hasher.hexdigest()


def load_or_generate_embeddings(
    df: pd.DataFrame,
    cache_path: Optional[str] = None,
    batch_size: int = 16,
) -> np.ndarray:
    """
    Return SPECTER2 embeddings for each paper's ``combined_text_raw``, cached on disk.

    The cache is a pickle holding the embeddings, the DOI list of ``df`` and a
    fingerprint of the input texts. A cache file is reused only if it matches
    the current input; a stale or unreadable file is regenerated and
    overwritten instead of being returned silently or crashing the run.

    Args:
        df: Frame with a ``combined_text_raw`` column (text to embed) and a
            ``DOI`` column (stored alongside the embeddings in the cache).
        cache_path: Cache file location. Defaults to a file under ``CACHE_DIR``
            whose name contains the fingerprint of the input texts.
        batch_size: Number of texts passed to the model per forward pass.

    Returns:
        np.ndarray of shape (n_papers, embedding_dim)

    Raises:
        KeyError: If a required column is missing from ``df``.
    """
    # Use combined_text_raw (original casing) for embeddings
    texts = df["combined_text_raw"].tolist()
    cache_key = _get_cache_key(texts)

    if cache_path is None:
        cache_path = str(CACHE_DIR / f"embeddings_{cache_key}.pkl")

    cached = _load_valid_cache(cache_path, cache_key, len(texts))
    if cached is not None:
        return cached

    # Resolve everything the cache write needs *before* the expensive model
    # run, so a missing DOI column or an uncreatable directory fails fast
    # instead of discarding freshly computed embeddings.
    dois = df["DOI"].tolist()
    Path(cache_path).parent.mkdir(parents=True, exist_ok=True)

    print(f"[Embedding] Generating SPECTER2 embeddings for {len(texts)} papers...")
    embeddings = _generate_specter2_embeddings(texts, batch_size=batch_size)

    # Write to a temporary sibling and rename, so an interrupted run never
    # leaves a truncated pickle at the final cache path.
    payload = {"embeddings": embeddings, "dois": dois, "cache_key": cache_key}
    tmp_path = f"{cache_path}.tmp"
    with open(tmp_path, "wb") as f:
        pickle.dump(payload, f)
    os.replace(tmp_path, cache_path)
    print(f"[Embedding] Saved embeddings to {cache_path}")

    return embeddings


def _load_valid_cache(
    cache_path: str, cache_key: str, n_texts: int
) -> Optional[np.ndarray]:
    """Return embeddings cached at *cache_path* if they match the current input, else None."""
    if not os.path.exists(cache_path):
        return None

    print(f"[Embedding] Loading cached embeddings from {cache_path}")
    try:
        # SECURITY NOTE: unpickling can execute arbitrary code. Only point
        # cache_path at files written by this module.
        with open(cache_path, "rb") as f:
            data = pickle.load(f)
    except (pickle.UnpicklingError, EOFError) as exc:
        print(f"[Embedding] Cache at {cache_path} is unreadable ({exc!r}); regenerating.")
        return None

    embeddings = data.get("embeddings") if isinstance(data, dict) else None
    if embeddings is None:
        print(f"[Embedding] Cache at {cache_path} has an unexpected layout; regenerating.")
        return None

    stored_key = data.get("cache_key")
    if stored_key is not None:
        is_current = stored_key == cache_key
    else:
        # Cache files written before the fingerprint was stored: the best
        # available check is that the row count matches the input.
        is_current = len(embeddings) == n_texts

    if not is_current:
        print(f"[Embedding] Cache at {cache_path} does not match the input; regenerating.")
        return None
    return embeddings


def _generate_specter2_embeddings(texts: list[str], batch_size: int = 16) -> np.ndarray:
    """
    Generate SPECTER2 embeddings using AutoAdapterModel with the proximity adapter.

    The adapters library allows loading task-specific adapter weights on top of
    the base SPECTER2 model. The 'proximity' adapter is appropriate for
    document similarity and clustering tasks.

    Uses CUDA when it is available, otherwise the CPU.

    Args:
        texts: Documents to embed; each is truncated to 512 tokens.
        batch_size: Number of documents per forward pass (must be >= 1).

    Returns:
        Array with one row per input text, holding the first-token (CLS)
        vector of the model's last hidden state.

    Raises:
        ValueError: If ``texts`` is empty or ``batch_size`` is smaller than 1.
    """
    # Validate before the heavy imports and model load: otherwise an empty
    # input only fails at np.vstack (and a zero batch size at range()) after
    # the whole model has been loaded, with an opaque message.
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")
    if not texts:
        raise ValueError("texts must contain at least one document to embed")

    # Imported lazily so the module can be imported (and caches read) without
    # the model stack being loaded.
    from adapters import AutoAdapterModel
    from transformers import AutoTokenizer
    import torch

    device = "cuda" if torch.cuda.is_available() else "cpu"
    print(f"[Embedding] Using device: {device}")

    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

    # Load base model using adapters' AutoAdapterModel (not transformers AutoModel)
    model = AutoAdapterModel.from_pretrained(MODEL_NAME)

    # Load and activate the proximity adapter from the Hub
    model.load_adapter(ADAPTER_NAME, source="hf", load_as="proximity", set_active=True)

    model.to(device)
    model.eval()

    all_embeddings = []

    # Inference only: no gradients are needed.
    with torch.no_grad():
        for i in range(0, len(texts), batch_size):
            batch = texts[i : i + batch_size]
            inputs = tokenizer(
                batch,
                padding=True,
                truncation=True,
                max_length=512,
                return_tensors="pt",
            ).to(device)

            outputs = model(**inputs)
            # Use CLS token embedding (first token of last hidden state)
            batch_emb = outputs.last_hidden_state[:, 0, :].cpu().numpy()
            all_embeddings.append(batch_emb)

            # Report progress on every fifth batch to keep the log short.
            if (i // batch_size) % 5 == 0:
                print(
                    f"[Embedding] Processed "
                    f"{min(i + batch_size, len(texts))}/{len(texts)} papers"
                )

    embeddings = np.vstack(all_embeddings)
    print(f"[Embedding] Done. Embedding shape: {embeddings.shape}")
    return embeddings