# app/llm.py — updated by ishmeet-yo (commit 8116767, verified)
import os
import random
import time
import requests
from typing import List
# OpenAI-compatible chat-completions endpoint on the Hugging Face router.
API_URL = "https://router.huggingface.co/v1/chat/completions"
# πŸ” Multiple models (order does NOT matter, will be shuffled per request)
MODELS = [
"mistralai/Mistral-7B-Instruct-v0.2",
"meta-llama/Llama-3.1-8B-Instruct",
"HuggingFaceH4/zephyr-7b-beta",
]
# Per-request HTTP timeout (seconds) passed to requests.post.
TIMEOUT_SECONDS = 30
# Attempts per model-token pair before moving to the next pair (429 backoff).
MAX_RETRIES_PER_MODEL = 2
def load_tokens() -> List[str]:
    """Collect every non-empty ``HF_TOKEN_*`` value from the environment.

    Returns:
        A list of API token strings (order follows ``os.environ``).

    Raises:
        RuntimeError: if no non-empty ``HF_TOKEN_*`` variable is set.
    """
    found: List[str] = []
    for name, value in os.environ.items():
        # Only pick up the token-family variables, skipping blanks.
        if name.startswith("HF_TOKEN_") and value:
            found.append(value)
    if not found:
        raise RuntimeError(
            "No HF_TOKEN_* variables found. "
            "Add at least one token in Space settings."
        )
    return found
# Load once at import time; raises RuntimeError immediately if the Space
# is misconfigured (no HF_TOKEN_* secrets set), which surfaces the problem
# at startup rather than on the first request.
HF_TOKENS = load_tokens()
def generate_answer(context: str, query: str) -> str:
    """
    Answer *query* using *context* via the Hugging Face router API.

    For EACH question:
    - shuffle models
    - shuffle tokens
    - try different model-token pairs
    - backoff on 429

    Returns:
        The model's answer text, or a friendly fallback message when every
        model-token combination has been exhausted.
    """
    models = MODELS[:]
    tokens = HF_TOKENS[:]
    random.shuffle(models)
    random.shuffle(tokens)

    # Reduce token pressure (VERY important): truncate long contexts.
    context = context[:1500]

    for model in models:
        for token in tokens:
            headers = {
                "Authorization": f"Bearer {token}",
                "Content-Type": "application/json",
            }
            payload = {
                "model": model,
                "messages": [
                    {
                        "role": "system",
                        "content": "You are a Harry Potter knowledge assistant."
                    },
                    {
                        "role": "user",
                        "content": (
                            f"Context:\n{context}\n\n"
                            f"Question:\n{query}\n\n"
                            f"Answer:"
                        ),
                    },
                ],
                "temperature": 0.3,
                "max_tokens": 300,
            }
            for attempt in range(MAX_RETRIES_PER_MODEL):
                try:
                    response = requests.post(
                        API_URL,
                        headers=headers,
                        json=payload,
                        timeout=TIMEOUT_SECONDS,
                    )
                except requests.RequestException:
                    # Network-level failure → abandon this model-token pair.
                    break

                # βœ… Success
                if response.status_code == 200:
                    # Guard against a malformed 200 body (invalid JSON or an
                    # unexpected schema) instead of crashing the whole chain;
                    # fall through to the next model-token pair.
                    try:
                        return response.json()["choices"][0]["message"]["content"]
                    except (ValueError, KeyError, IndexError, TypeError):
                        break

                # ⏳ Rate limited → backoff, but skip the sleep after the
                # final attempt — there is no retry left to wait for.
                if response.status_code == 429:
                    if attempt < MAX_RETRIES_PER_MODEL - 1:
                        time.sleep(2 ** attempt)
                    continue

                # ❌ Other error → abandon this model-token pair
                break

    # All combinations exhausted
    return (
        "The library is busy across multiple shelves right now. "
        "Please try again in a moment."
    )