import os
import random
import time
from typing import List

import requests

API_URL = "https://router.huggingface.co/v1/chat/completions"

# 🔁 Multiple models (order does NOT matter, will be shuffled)
MODELS = [
    "mistralai/Mistral-7B-Instruct-v0.2",
    "meta-llama/Llama-3.1-8B-Instruct",
    "HuggingFaceH4/zephyr-7b-beta",
]

TIMEOUT_SECONDS = 30
MAX_RETRIES_PER_MODEL = 2


def load_tokens() -> List[str]:
    """Collect every non-empty HF_TOKEN_* environment variable.

    Returns:
        The token values, in the order the environment yields them.

    Raises:
        RuntimeError: If no HF_TOKEN_* variable is set.
    """
    tokens = [
        v for k, v in os.environ.items()
        if k.startswith("HF_TOKEN_") and v
    ]
    if not tokens:
        raise RuntimeError(
            "No HF_TOKEN_* variables found. "
            "Add at least one token in Space settings."
        )
    return tokens


# Load once at import time; generate_answer() copies this list per call.
HF_TOKENS = load_tokens()


def generate_answer(context: str, query: str, max_context_chars: int = 1500) -> str:
    """Answer *query* grounded in *context* via the HF router chat API.

    For EACH question:
    - shuffle models
    - shuffle tokens
    - try different model-token pairs
    - exponential backoff on 429

    Args:
        context: Retrieved passage text; truncated to *max_context_chars*
            to limit prompt-token usage.
        query: The user's question.
        max_context_chars: Cap applied to *context* (default 1500,
            matching the previous hard-coded limit).

    Returns:
        The model's answer text, or a "library is busy" fallback message
        when every model-token pair fails.
    """
    models = MODELS[:]
    tokens = HF_TOKENS[:]
    random.shuffle(models)
    random.shuffle(tokens)

    # Reduce token pressure (VERY important)
    context = context[:max_context_chars]

    for model in models:
        # The request payload depends only on the model, not the token —
        # build it once per model instead of once per model-token pair.
        payload = {
            "model": model,
            "messages": [
                {
                    "role": "system",
                    "content": "You are a Harry Potter knowledge assistant."
                },
                {
                    "role": "user",
                    "content": (
                        f"Context:\n{context}\n\n"
                        f"Question:\n{query}\n\n"
                        f"Answer:"
                    ),
                },
            ],
            "temperature": 0.3,
            "max_tokens": 300,
        }

        for token in tokens:
            headers = {
                "Authorization": f"Bearer {token}",
                "Content-Type": "application/json",
            }

            for attempt in range(MAX_RETRIES_PER_MODEL):
                try:
                    response = requests.post(
                        API_URL,
                        headers=headers,
                        json=payload,
                        timeout=TIMEOUT_SECONDS,
                    )
                except requests.RequestException:
                    # Network-level failure (DNS, timeout, connection reset):
                    # abandon this model-token pair.
                    break

                # ✅ Success — but guard against a malformed 200 body so a
                # bad response falls through to the next pair instead of
                # crashing the whole function with KeyError/IndexError.
                if response.status_code == 200:
                    try:
                        return response.json()["choices"][0]["message"]["content"]
                    except (ValueError, KeyError, IndexError, TypeError):
                        break

                # ⏳ Rate limited → backoff, but only when another attempt
                # remains; sleeping after the final attempt just wastes time.
                if response.status_code == 429:
                    if attempt + 1 < MAX_RETRIES_PER_MODEL:
                        time.sleep(2 ** attempt)
                    continue

                # ❌ Other error → abandon this model-token pair
                break

    # All combinations exhausted
    return (
        "The library is busy across multiple shelves right now. "
        "Please try again in a moment."
    )