import os
import random
import time
from typing import List

import requests

API_URL = "https://router.huggingface.co/v1/chat/completions"

# 🔁 Multiple models (order does NOT matter, will be shuffled)
MODELS = [
    "mistralai/Mistral-7B-Instruct-v0.2",
    "meta-llama/Llama-3.1-8B-Instruct",
    "HuggingFaceH4/zephyr-7b-beta",
]

TIMEOUT_SECONDS = 30
MAX_RETRIES_PER_MODEL = 2


def load_tokens() -> List[str]:
    """Collect every non-empty HF_TOKEN_* environment variable.

    Returns:
        The token values, in the order the environment yields them.

    Raises:
        RuntimeError: If no HF_TOKEN_* variable is set.
    """
    tokens = [
        v for k, v in os.environ.items()
        if k.startswith("HF_TOKEN_") and v
    ]
    if not tokens:
        raise RuntimeError(
            "No HF_TOKEN_* variables found. "
            "Add at least one token in Space settings."
        )
    return tokens


# Load once at import time; generate_answer() copies this list per call.
HF_TOKENS = load_tokens()


def generate_answer(context: str, query: str, max_context_chars: int = 1500) -> str:
    """Answer *query* grounded in *context* via the HF router chat API.

    For EACH question:
    - shuffle models
    - shuffle tokens
    - try different model-token pairs
    - exponential backoff on 429

    Args:
        context: Retrieved passage text; truncated to *max_context_chars*
            to limit prompt-token usage.
        query: The user's question.
        max_context_chars: Cap applied to *context* (default 1500,
            matching the previous hard-coded limit).

    Returns:
        The model's answer text, or a "library is busy" fallback message
        when every model-token pair fails.
    """
    models = MODELS[:]
    tokens = HF_TOKENS[:]
    random.shuffle(models)
    random.shuffle(tokens)

    # Reduce token pressure (VERY important)
    context = context[:max_context_chars]

    for model in models:
        # The request payload depends only on the model, not the token —
        # build it once per model instead of once per model-token pair.
        payload = {
            "model": model,
            "messages": [
                {
                    "role": "system",
                    "content": "You are a Harry Potter knowledge assistant."
                },
                {
                    "role": "user",
                    "content": (
                        f"Context:\n{context}\n\n"
                        f"Question:\n{query}\n\n"
                        f"Answer:"
                    ),
                },
            ],
            "temperature": 0.3,
            "max_tokens": 300,
        }

        for token in tokens:
            headers = {
                "Authorization": f"Bearer {token}",
                "Content-Type": "application/json",
            }

            for attempt in range(MAX_RETRIES_PER_MODEL):
                try:
                    response = requests.post(
                        API_URL,
                        headers=headers,
                        json=payload,
                        timeout=TIMEOUT_SECONDS,
                    )
                except requests.RequestException:
                    # Network-level failure (DNS, timeout, connection reset):
                    # abandon this model-token pair.
                    break

                # ✅ Success — but guard against a malformed 200 body so a
                # bad response falls through to the next pair instead of
                # crashing the whole function with KeyError/IndexError.
                if response.status_code == 200:
                    try:
                        return response.json()["choices"][0]["message"]["content"]
                    except (ValueError, KeyError, IndexError, TypeError):
                        break

                # ⏳ Rate limited → backoff, but only when another attempt
                # remains; sleeping after the final attempt just wastes time.
                if response.status_code == 429:
                    if attempt + 1 < MAX_RETRIES_PER_MODEL:
                        time.sleep(2 ** attempt)
                    continue

                # ❌ Other error → abandon this model-token pair
                break

    # All combinations exhausted
    return (
        "The library is busy across multiple shelves right now. "
        "Please try again in a moment."
    )