import os
import time
from dotenv import load_dotenv
from langchain_openai import ChatOpenAI
from langchain_core.runnables import RunnableConfig

# Load variables from a .env file into the process environment so the
# os.getenv("OPENROUTER_API_KEY") lookups below can see them.
load_dotenv()

# Module-level cache for the client built by get_llm(); stays None until the
# first successful get_llm() call assigns it.
_llm_instance = None

# Model identifiers sent to the OpenRouter endpoint.
# Ordered by preference: quality + reliability + speed
# Index 0 is the model used by get_llm() and get_structured_llm();
# invoke_with_fallback() walks the whole list in order.
MODEL_CHAIN = [
    "nvidia/nemotron-3-nano-30b-a3b:free",
    "stepfun/step-3.5-flash:free",
    "arcee-ai/trinity-large-preview:free",
    "google/gemma-3-27b-it:free",
    "meta-llama/llama-3.3-70b-instruct:free",
    "mistralai/mistral-small-3.1-24b-instruct:free",
]


def get_llm() -> ChatOpenAI:
    """Return the shared chat client, creating it on first use.

    The client is built once for the first entry of ``MODEL_CHAIN`` and
    cached in the module-level ``_llm_instance``; later calls return that
    same object.  This function does not itself switch models -- it always
    targets ``MODEL_CHAIN[0]``.

    Returns:
        The cached ``ChatOpenAI`` client pointed at the OpenRouter endpoint
        with ``temperature=0``.

    Raises:
        ValueError: If ``OPENROUTER_API_KEY`` is unset or empty.  Before
            raising, the names (not the values) of environment variables
            containing "API" or "KEY" are logged to help diagnose a
            misnamed secret.
    """
    global _llm_instance

    if _llm_instance is None:
        key = os.getenv("OPENROUTER_API_KEY")

        if not key:
            # Imported here rather than at module top, as in the rest of this file.
            from src.core.logger import get_logger

            log = get_logger(__name__)
            log.error("OPENROUTER_API_KEY not found in environment")
            key_like_names = [
                name
                for name in os.environ
                if any(marker in name for marker in ("API", "KEY"))
            ]
            log.error("Available key-like env vars: %s", key_like_names)
            raise ValueError("OPENROUTER_API_KEY not found. Check your secrets.")

        client_options = {
            "model": MODEL_CHAIN[0],
            "api_key": key,
            "base_url": "https://openrouter.ai/api/v1",
            "temperature": 0,
        }
        _llm_instance = ChatOpenAI(**client_options)

    return _llm_instance


def get_structured_llm(max_tokens: int = 65536) -> ChatOpenAI:
    """Build a new chat client intended for structured-output calls.

    Unlike a cached singleton, every call constructs a fresh ``ChatOpenAI``
    for the first entry of ``MODEL_CHAIN``.  The client uses
    ``temperature=0``, a 120-second request timeout, and the given
    completion budget.

    Args:
        max_tokens: Upper bound on generated tokens.  The large default
            leaves room for reasoning models to think before emitting the
            full structured JSON.

    Returns:
        A newly constructed ``ChatOpenAI`` pointed at the OpenRouter endpoint.

    Raises:
        ValueError: If ``OPENROUTER_API_KEY`` is unset or empty.
    """
    key = os.getenv("OPENROUTER_API_KEY")

    if key:
        client_options = {
            "model": MODEL_CHAIN[0],
            "api_key": key,
            "base_url": "https://openrouter.ai/api/v1",
            "temperature": 0,
            "max_tokens": max_tokens,
            "request_timeout": 120,
        }
        return ChatOpenAI(**client_options)

    raise ValueError("OPENROUTER_API_KEY not found.")


def _http_status(exc: BaseException) -> "int | None":
    """Best-effort HTTP status for a failed LLM call.

    Prefers an integer ``status_code`` attribute when the exception carries
    one.  Otherwise falls back to looking for "429" and then "404" in the
    exception text, which covers errors that only embed the code in their
    message.  Returns ``None`` when neither source yields a status.
    """
    status = getattr(exc, "status_code", None)
    if isinstance(status, int):
        return status
    text = str(exc)
    for code in (429, 404):
        if str(code) in text:
            return code
    return None


def invoke_with_fallback(prompt: str, max_retries: int = 2, run_name: str = "llm_call") -> str:
    """Invoke the LLM, falling back through ``MODEL_CHAIN`` on failure.

    Models are tried in ``MODEL_CHAIN`` order until one returns a response.
    For each model:

    * a 429 (rate limit) or 404 (model unavailable) moves straight on to the
      next model without retrying;
    * any other error is retried on the same model, up to ``max_retries``
      attempts in total, with a one-second pause between attempts.

    Each invocation is tagged ``model:<id>`` and ``attempt:<n>`` and carries
    ``model_id``, ``attempt`` and ``fallback_position`` metadata so runs can
    be filtered per model in LangSmith.

    Args:
        prompt: Prompt text passed to ``ChatOpenAI.invoke``.
        max_retries: Attempts per model for errors other than 429/404.
            Values below 1 are treated as 1 so every model is tried at
            least once.
        run_name: Run name recorded in the ``RunnableConfig``.

    Returns:
        The ``content`` of the first successful response.

    Raises:
        ValueError: If ``OPENROUTER_API_KEY`` is unset or empty.
        RuntimeError: If every model failed; the last underlying exception
            is attached as ``__cause__``.
    """
    # Validate configuration first so a missing key fails fast.
    api_key = os.getenv("OPENROUTER_API_KEY")
    if not api_key:
        raise ValueError("OPENROUTER_API_KEY not found.")

    from src.core.logger import get_logger
    logger = get_logger(__name__)

    # Guarantee at least one call per model even if max_retries <= 0.
    attempts_per_model = max(1, max_retries)
    final_position = len(MODEL_CHAIN) - 1

    last_error = None
    last_model = None

    for position, model_id in enumerate(MODEL_CHAIN):
        last_model = model_id
        for attempt in range(1, attempts_per_model + 1):
            try:
                llm = ChatOpenAI(
                    model=model_id,
                    api_key=api_key,
                    base_url="https://openrouter.ai/api/v1",
                    temperature=0,
                )

                # LangSmith: tag every call with model name + attempt number
                config = RunnableConfig(
                    run_name=run_name,
                    tags=[f"model:{model_id}", f"attempt:{attempt}"],
                    metadata={
                        "model_id": model_id,
                        "attempt": attempt,
                        "fallback_position": position,
                    },
                )

                response = llm.invoke(prompt, config=config)
            except Exception as exc:
                last_error = exc
                status = _http_status(exc)
                if status == 429:
                    logger.warning("Rate-limited on %s (attempt %d), trying next...", model_id, attempt)
                    # Brief pause before the next model; pointless after the last one.
                    if position < final_position:
                        time.sleep(2)
                    break  # move to next model
                if status == 404:
                    logger.warning("Model %s not available, skipping", model_id)
                    break  # move to next model
                logger.error("LLM error on %s: %s", model_id, exc)
                if attempt < attempts_per_model:
                    time.sleep(1)
            else:
                logger.info("LLM response from %s (attempt %d)", model_id, attempt)
                return response.content

    raise RuntimeError(
        f"All {len(MODEL_CHAIN)} models failed. Last tried: {last_model}. Last error: {last_error}"
    ) from last_error