# NOTE: the original paste carried Hugging Face Spaces UI status text
# ("Spaces: Sleeping") above the code — page-scrape residue, not source.
import os
import random
import time
from typing import List

import requests
# Hugging Face router endpoint (OpenAI-compatible chat completions).
API_URL = "https://router.huggingface.co/v1/chat/completions"

# Multiple candidate models. Order does NOT matter here: the list is
# shuffled per request so load spreads across models.
MODELS = [
    "mistralai/Mistral-7B-Instruct-v0.2",
    "meta-llama/Llama-3.1-8B-Instruct",
    "HuggingFaceH4/zephyr-7b-beta",
]

# Per-request HTTP timeout (seconds) and retry budget for 429 rate limits.
TIMEOUT_SECONDS = 30
MAX_RETRIES_PER_MODEL = 2
def load_tokens() -> List[str]:
    """Collect Hugging Face API tokens from ``HF_TOKEN_*`` env variables.

    Variables are read in sorted-name order so the result is deterministic
    across processes (callers shuffle the list anyway). Empty values are
    skipped.

    Returns:
        A non-empty list of token strings.

    Raises:
        RuntimeError: if no non-empty ``HF_TOKEN_*`` variable exists.
    """
    tokens = [
        value
        for name, value in sorted(os.environ.items())
        if name.startswith("HF_TOKEN_") and value
    ]
    if not tokens:
        raise RuntimeError(
            "No HF_TOKEN_* variables found. "
            "Add at least one token in Space settings."
        )
    return tokens
# Load the token pool once at import time so every request reuses it.
# NOTE(review): this raises RuntimeError at import when no HF_TOKEN_* is
# set — presumably intentional fail-fast for the Space; confirm before
# reusing this module elsewhere.
HF_TOKENS = load_tokens()
def generate_answer(context: str, query: str) -> str:
    """Answer *query* grounded in *context* via the HF router chat API.

    Strategy for EACH question:
      - shuffle the model list and the token list,
      - try every model/token pair in turn,
      - exponential backoff on HTTP 429 (up to MAX_RETRIES_PER_MODEL tries),
      - abandon a pair on the first network or non-retryable HTTP error.

    Args:
        context: retrieved passage(s); truncated to 1500 chars to bound
            prompt size (and therefore token usage).
        query: the user's question.

    Returns:
        The model's answer text, or a friendly fallback message when every
        model/token combination fails.
    """
    models = MODELS[:]
    tokens = HF_TOKENS[:]
    random.shuffle(models)
    random.shuffle(tokens)

    # Reduce token pressure (VERY important): cap the prompt context.
    context = context[:1500]

    for model in models:
        for token in tokens:
            headers = {
                "Authorization": f"Bearer {token}",
                "Content-Type": "application/json",
            }
            payload = {
                "model": model,
                "messages": [
                    {
                        "role": "system",
                        "content": "You are a Harry Potter knowledge assistant."
                    },
                    {
                        "role": "user",
                        "content": (
                            f"Context:\n{context}\n\n"
                            f"Question:\n{query}\n\n"
                            f"Answer:"
                        ),
                    },
                ],
                "temperature": 0.3,
                "max_tokens": 300,
            }

            for attempt in range(MAX_RETRIES_PER_MODEL):
                try:
                    response = requests.post(
                        API_URL,
                        headers=headers,
                        json=payload,
                        timeout=TIMEOUT_SECONDS,
                    )
                except requests.RequestException:
                    # Network-level failure: abandon this model-token pair.
                    break

                if response.status_code == 200:
                    # Guard against a malformed success body: fall through to
                    # the next pair instead of crashing the whole request.
                    try:
                        return response.json()["choices"][0]["message"]["content"]
                    except (ValueError, KeyError, IndexError):
                        break

                if response.status_code == 429:
                    # Rate limited: back off only when another attempt remains
                    # (the original slept after the final attempt for nothing).
                    if attempt < MAX_RETRIES_PER_MODEL - 1:
                        time.sleep(2 ** attempt)
                    continue

                # Any other HTTP error: abandon this model-token pair.
                break

    # All combinations exhausted.
    return (
        "The library is busy across multiple shelves right now. "
        "Please try again in a moment."
    )