Spaces:

ishmeet-yo
/

ISH_harry_potter_rag

Sleeping

App Files Files Community

ishmeet-yo committed on Jan 31

Commit

8116767

verified ·

1 Parent(s): 8182470

Update app/llm.py

Browse files

Files changed (1) hide show

app/llm.py +74 -52

app/llm.py CHANGED Viewed

@@ -6,17 +6,18 @@ from typing import List
 API_URL = "https://router.huggingface.co/v1/chat/completions"
-# MODEL_NAME = "deepseek-ai/DeepSeek-V3.2"
-MODEL_NAME ="mistralai/Mistral-7B-Instruct-v0.2"
 TIMEOUT_SECONDS = 30
-MAX_RETRIES_PER_TOKEN = 3
 def load_tokens() -> List[str]:
-    """
-    Load all Hugging Face tokens that start with HF_TOKEN_
-    from environment variables.
-    """
     tokens = [
         v for k, v in os.environ.items()
         if k.startswith("HF_TOKEN_") and v
@@ -25,65 +26,86 @@ def load_tokens() -> List[str]:
     if not tokens:
         raise RuntimeError(
             "No HF_TOKEN_* variables found. "
-            "Add at least one token in Hugging Face Space settings."
         )
     return tokens
 HF_TOKENS = load_tokens()
 def generate_answer(context: str, query: str) -> str:
     tokens = HF_TOKENS[:]
     random.shuffle(tokens)
-    context = context[:3000]
-    for token in tokens:
-        headers = {
-            "Authorization": f"Bearer {token}",
-            "Content-Type": "application/json",
-        }
-        payload = {
-            "model": MODEL_NAME,
-            "messages": [
-                {
-                    "role": "system",
-                    "content": "You are a Harry Potter knowledge assistant created by Ishmeet Kaur, who is awesome. Answer in short concise sentences, stay grounded and faithful."
-                },
-                {
-                    "role": "user",
-                    "content": f"Context:\n{context}\n\nQuestion:\n{query}\n\nAnswer:"
-                },
-            ],
-            "temperature": 0.3,
-            "max_tokens": 500,
-        }
-        for attempt in range(MAX_RETRIES_PER_TOKEN):
-            try:
-                response = requests.post(
-                    API_URL,
-                    headers=headers,
-                    json=payload,
-                    timeout=TIMEOUT_SECONDS,
-                )
-            except requests.RequestException:
                 break
-            if response.status_code == 200:
-                return response.json()["choices"][0]["message"]["content"]
-            if response.status_code == 429:
-                time.sleep(2 ** attempt)
-                continue
-            break
     return (
-        "The library is a bit crowded right now. "
         "Please try again in a moment."
     )

 API_URL = "https://router.huggingface.co/v1/chat/completions"
+# 🔁 Multiple models (order does NOT matter, will be shuffled)
+MODELS = [
+    "mistralai/Mistral-7B-Instruct-v0.2",
+    "meta-llama/Llama-3.1-8B-Instruct",
+    "HuggingFaceH4/zephyr-7b-beta",
+]
 TIMEOUT_SECONDS = 30
+MAX_RETRIES_PER_MODEL = 2
 def load_tokens() -> List[str]:
     tokens = [
         v for k, v in os.environ.items()
         if k.startswith("HF_TOKEN_") and v
     if not tokens:
         raise RuntimeError(
             "No HF_TOKEN_* variables found. "
+            "Add at least one token in Space settings."
         )
     return tokens
+# Load once
 HF_TOKENS = load_tokens()
 def generate_answer(context: str, query: str) -> str:
     """Answer *query* from *context* via the chat-completions endpoint.

     For each call the model list and the token list are copied and
     shuffled, then every model-token pair is tried in turn until one
     request succeeds.

     Args:
         context: Retrieved passage text; only the first 1500 characters
             are sent, to keep the prompt small.
         query: The user's question.

     Returns:
         The ``content`` of the first choice from the first successful
         (HTTP 200, well-formed) response, or a fixed fallback message
         when every model-token pair has been exhausted.
     """
     models = MODELS[:]
     tokens = HF_TOKENS[:]
     random.shuffle(models)
     random.shuffle(tokens)

     # Reduce token pressure: send only the head of the context.
     context = context[:1500]

     # The conversation does not depend on the model or the token,
     # so build it once instead of once per pair.
     messages = [
         {
             "role": "system",
             "content": "You are a Harry Potter knowledge assistant."
         },
         {
             "role": "user",
             "content": (
                 f"Context:\n{context}\n\n"
                 f"Question:\n{query}\n\n"
                 f"Answer:"
             ),
         },
     ]

     for model in models:
         payload = {
             "model": model,
             "messages": messages,
             "temperature": 0.3,
             "max_tokens": 300,
         }

         for token in tokens:
             headers = {
                 "Authorization": f"Bearer {token}",
                 "Content-Type": "application/json",
             }

             for attempt in range(MAX_RETRIES_PER_MODEL):
                 try:
                     response = requests.post(
                         API_URL,
                         headers=headers,
                         json=payload,
                         timeout=TIMEOUT_SECONDS,
                     )
                 except requests.RequestException:
                     # Network failure or timeout: abandon this pair.
                     break

                 # Success
                 if response.status_code == 200:
                     try:
                         return response.json()["choices"][0]["message"]["content"]
                     except (ValueError, KeyError, IndexError, TypeError):
                         # 200 with an invalid or unexpected body: treat it
                         # like any other failure and try the next pair
                         # instead of letting the exception escape.
                         break

                 # Rate limited: back off, but do not sleep after the
                 # final attempt since no retry would follow it.
                 if response.status_code == 429:
                     if attempt < MAX_RETRIES_PER_MODEL - 1:
                         time.sleep(2 ** attempt)
                     continue

                 # Other error: abandon this model-token pair.
                 break

     # All combinations exhausted
     return (
         "The library is busy across multiple shelves right now. "
         "Please try again in a moment."
     )