from llm2vec import LLM2Vec
from transformers import AutoTokenizer, AutoModel, AutoConfig
from peft import PeftModel
from src.llm2vectrain.config import access_token
import torch
from torchao.quantization import quantize_, Int8WeightOnlyConfig
import os


def load_llm2vec_model() -> "LLM2Vec":
    """Load the LLM2Vec Sheared-LLaMA MNTP model and wrap it in ``LLM2Vec``.

    The Hugging Face cache directory is read from the ``TRANSFORMERS_CACHE``
    environment variable, defaulting to ``/app/.cache/huggingface``.

    Device handling:
      * CUDA available: the model is loaded in bfloat16 on ``"cuda"`` and is
        NOT quantized.
      * Otherwise: the model is loaded in float32 on ``"cpu"`` and then
        quantized in place with torchao's ``Int8WeightOnlyConfig``. If torchao
        cannot be imported, a warning is printed and the float32 model is
        used unchanged.

    Returns:
        An ``LLM2Vec`` instance built with ``pooling_mode="mean"`` and
        ``max_length=512``.
    """
    cache_dir = os.getenv("TRANSFORMERS_CACHE", "/app/.cache/huggingface")
    model_id = "McGill-NLP/LLM2Vec-Sheared-LLaMA-mntp"

    # NOTE(review): padding/truncation/max_length are normally per-call
    # encoding options; they are kept here unchanged -- confirm they have the
    # intended effect when given to from_pretrained.
    tokenizer = AutoTokenizer.from_pretrained(
        model_id,
        padding=True,
        truncation=True,
        max_length=512,
        token=access_token,  # same credentials as the model load below
        cache_dir=cache_dir,
    )

    config = AutoConfig.from_pretrained(
        model_id,
        trust_remote_code=True,
        token=access_token,
        cache_dir=cache_dir,
    )

    use_cuda = torch.cuda.is_available()
    model = AutoModel.from_pretrained(
        model_id,
        trust_remote_code=True,
        config=config,
        # GPU: bf16 for speed. CPU: fp32, because quantization starts from fp32.
        torch_dtype=torch.bfloat16 if use_cuda else torch.float32,
        device_map="cuda" if use_cuda else "cpu",
        token=access_token,
        cache_dir=cache_dir,
    )

    if not use_cuda:
        # Quantization is a CPU-only step; previously it also ran on the
        # bf16 CUDA model because it sat outside the CPU branch.
        try:
            # Import every torchao name used below inside the guard so the
            # fallback covers all of them. NOTE: this file also imports
            # torchao at module level, so the fallback is only reachable if
            # that import is made optional.
            from torchao.quantization import Int8WeightOnlyConfig, quantize_
        except ImportError:
            print("[WARNING] torchao not installed. Run: pip install torchao")
            print("[WARNING] Falling back to non-quantized CPU model.")
        else:
            print("[INFO] Applying torchao quantization for CPU...")
            quant_config = Int8WeightOnlyConfig(group_size=None)
            print("[INFO] Applying torchao quantization with Int8WeightOnlyConfig...")
            quantize_(model, quant_config)  # modifies `model` in place

    return LLM2Vec(model, tokenizer, pooling_mode="mean", max_length=512)