# app/llm.py — updated by ishmeet-yo (commit 8116767, verified)
import os
import random
import time
import requests
from typing import List
# OpenAI-compatible chat-completions endpoint on the Hugging Face router.
API_URL = "https://router.huggingface.co/v1/chat/completions"
# πŸ” Multiple models (order does NOT matter, will be shuffled per request)
MODELS = [
"mistralai/Mistral-7B-Instruct-v0.2",
"meta-llama/Llama-3.1-8B-Instruct",
"HuggingFaceH4/zephyr-7b-beta",
]
# Per-request HTTP timeout (seconds) passed to requests.post.
TIMEOUT_SECONDS = 30
# Attempts per model-token pair before moving to the next pair (429 backoff).
MAX_RETRIES_PER_MODEL = 2
def load_tokens() -> List[str]:
    """Collect every non-empty ``HF_TOKEN_*`` value from the environment.

    Returns:
        A list of API token strings (order follows ``os.environ``).

    Raises:
        RuntimeError: if no non-empty ``HF_TOKEN_*`` variable is set.
    """
    found: List[str] = []
    for name, value in os.environ.items():
        # Only pick up the token-family variables, skipping blanks.
        if name.startswith("HF_TOKEN_") and value:
            found.append(value)
    if not found:
        raise RuntimeError(
            "No HF_TOKEN_* variables found. "
            "Add at least one token in Space settings."
        )
    return found
# Load once at import time; raises RuntimeError immediately if the Space
# is misconfigured (no HF_TOKEN_* secrets set), which surfaces the problem
# at startup rather than on the first request.
HF_TOKENS = load_tokens()
def generate_answer(context: str, query: str) -> str:
    """
    Answer *query* using *context* via the Hugging Face router API.

    For EACH question:
    - shuffle models
    - shuffle tokens
    - try different model-token pairs
    - backoff on 429

    Returns:
        The model's answer text, or a friendly fallback message when every
        model-token combination has been exhausted.
    """
    models = MODELS[:]
    tokens = HF_TOKENS[:]
    random.shuffle(models)
    random.shuffle(tokens)

    # Reduce token pressure (VERY important): truncate long contexts.
    context = context[:1500]

    for model in models:
        for token in tokens:
            headers = {
                "Authorization": f"Bearer {token}",
                "Content-Type": "application/json",
            }
            payload = {
                "model": model,
                "messages": [
                    {
                        "role": "system",
                        "content": "You are a Harry Potter knowledge assistant."
                    },
                    {
                        "role": "user",
                        "content": (
                            f"Context:\n{context}\n\n"
                            f"Question:\n{query}\n\n"
                            f"Answer:"
                        ),
                    },
                ],
                "temperature": 0.3,
                "max_tokens": 300,
            }
            for attempt in range(MAX_RETRIES_PER_MODEL):
                try:
                    response = requests.post(
                        API_URL,
                        headers=headers,
                        json=payload,
                        timeout=TIMEOUT_SECONDS,
                    )
                except requests.RequestException:
                    # Network-level failure → abandon this model-token pair.
                    break

                # βœ… Success
                if response.status_code == 200:
                    # Guard against a malformed 200 body (invalid JSON or an
                    # unexpected schema) instead of crashing the whole chain;
                    # fall through to the next model-token pair.
                    try:
                        return response.json()["choices"][0]["message"]["content"]
                    except (ValueError, KeyError, IndexError, TypeError):
                        break

                # ⏳ Rate limited → backoff, but skip the sleep after the
                # final attempt — there is no retry left to wait for.
                if response.status_code == 429:
                    if attempt < MAX_RETRIES_PER_MODEL - 1:
                        time.sleep(2 ** attempt)
                    continue

                # ❌ Other error → abandon this model-token pair
                break

    # All combinations exhausted
    return (
        "The library is busy across multiple shelves right now. "
        "Please try again in a moment."
    )