ishmeet-yo committed on
Commit
8116767
·
verified ·
1 Parent(s): 8182470

Update app/llm.py

Browse files
Files changed (1) hide show
  1. app/llm.py +74 -52
app/llm.py CHANGED
@@ -6,17 +6,18 @@ from typing import List
6
 
7
  API_URL = "https://router.huggingface.co/v1/chat/completions"
8
 
9
- # MODEL_NAME = "deepseek-ai/DeepSeek-V3.2"
10
- MODEL_NAME ="mistralai/Mistral-7B-Instruct-v0.2"
 
 
 
 
 
11
  TIMEOUT_SECONDS = 30
12
- MAX_RETRIES_PER_TOKEN = 3
13
 
14
 
15
  def load_tokens() -> List[str]:
16
- """
17
- Load all Hugging Face tokens that start with HF_TOKEN_
18
- from environment variables.
19
- """
20
  tokens = [
21
  v for k, v in os.environ.items()
22
  if k.startswith("HF_TOKEN_") and v
@@ -25,65 +26,86 @@ def load_tokens() -> List[str]:
25
  if not tokens:
26
  raise RuntimeError(
27
  "No HF_TOKEN_* variables found. "
28
- "Add at least one token in Hugging Face Space settings."
29
  )
30
 
31
  return tokens
32
 
33
 
 
34
  HF_TOKENS = load_tokens()
35
 
36
 
37
  def generate_answer(context: str, query: str) -> str:
38
-
 
 
 
 
 
 
 
 
39
  tokens = HF_TOKENS[:]
 
 
40
  random.shuffle(tokens)
41
 
42
- context = context[:3000]
43
-
44
- for token in tokens:
45
- headers = {
46
- "Authorization": f"Bearer {token}",
47
- "Content-Type": "application/json",
48
- }
49
-
50
- payload = {
51
- "model": MODEL_NAME,
52
- "messages": [
53
- {
54
- "role": "system",
55
- "content": "You are a Harry Potter knowledge assistant created by Ishmeet Kaur, who is awesome. Answer in short concise sentences, stay grounded and faithful."
56
- },
57
- {
58
- "role": "user",
59
- "content": f"Context:\n{context}\n\nQuestion:\n{query}\n\nAnswer:"
60
- },
61
- ],
62
- "temperature": 0.3,
63
- "max_tokens": 500,
64
- }
65
-
66
- for attempt in range(MAX_RETRIES_PER_TOKEN):
67
- try:
68
- response = requests.post(
69
- API_URL,
70
- headers=headers,
71
- json=payload,
72
- timeout=TIMEOUT_SECONDS,
73
- )
74
- except requests.RequestException:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
75
  break
76
 
77
- if response.status_code == 200:
78
- return response.json()["choices"][0]["message"]["content"]
79
-
80
- if response.status_code == 429:
81
- time.sleep(2 ** attempt)
82
- continue
83
-
84
- break
85
-
86
  return (
87
- "The library is a bit crowded right now. "
88
  "Please try again in a moment."
89
  )
 
6
 
7
# OpenAI-compatible chat-completions endpoint on the Hugging Face router.
API_URL = "https://router.huggingface.co/v1/chat/completions"

# Candidate models; listing order is irrelevant — each request shuffles a copy.
MODELS = [
    "mistralai/Mistral-7B-Instruct-v0.2",
    "meta-llama/Llama-3.1-8B-Instruct",
    "HuggingFaceH4/zephyr-7b-beta",
]

# Per-request HTTP timeout (seconds) and retry budget per (model, token) pair.
TIMEOUT_SECONDS = 30
MAX_RETRIES_PER_MODEL = 2
18
 
19
 
20
def load_tokens() -> List[str]:
    """Collect every non-empty HF_TOKEN_* value from the environment.

    Raises:
        RuntimeError: when no HF_TOKEN_* variable with a value is set.
    """
    found: List[str] = []
    for name, value in os.environ.items():
        if name.startswith("HF_TOKEN_") and value:
            found.append(value)

    if not found:
        raise RuntimeError(
            "No HF_TOKEN_* variables found. "
            "Add at least one token in Space settings."
        )

    return found
33
 
34
 
35
# Resolved once at import time; callers shuffle a copy per request rather
# than mutating this shared list.
HF_TOKENS = load_tokens()
37
 
38
 
39
def generate_answer(context: str, query: str) -> str:
    """Answer *query* grounded in *context* via the HF router chat API.

    Strategy for each call:
      - shuffle models and tokens so load spreads across pairs
      - try each (model, token) pair, with bounded retries per pair
      - exponential backoff on HTTP 429, fall through on other failures

    Args:
        context: Retrieved reference text; truncated to limit prompt size.
        query: The user's question.

    Returns:
        The model's answer string, or a friendly fallback message when
        every model/token combination fails.
    """
    models = MODELS[:]
    tokens = HF_TOKENS[:]

    random.shuffle(models)
    random.shuffle(tokens)

    # Truncate context to reduce prompt-token pressure (helps avoid 429s).
    context = context[:1500]

    for model in models:
        for token in tokens:
            headers = {
                "Authorization": f"Bearer {token}",
                "Content-Type": "application/json",
            }

            payload = {
                "model": model,
                "messages": [
                    {
                        "role": "system",
                        "content": "You are a Harry Potter knowledge assistant."
                    },
                    {
                        "role": "user",
                        "content": (
                            f"Context:\n{context}\n\n"
                            f"Question:\n{query}\n\n"
                            f"Answer:"
                        ),
                    },
                ],
                "temperature": 0.3,
                "max_tokens": 300,
            }

            for attempt in range(MAX_RETRIES_PER_MODEL):
                try:
                    response = requests.post(
                        API_URL,
                        headers=headers,
                        json=payload,
                        timeout=TIMEOUT_SECONDS,
                    )
                except requests.RequestException:
                    # Network-level failure: abandon this model-token pair.
                    break

                # Success — but guard the parse: a malformed 200 body
                # (non-JSON, or missing keys) must not crash the whole
                # failover loop; fall through to the next pair instead.
                if response.status_code == 200:
                    try:
                        return response.json()["choices"][0]["message"]["content"]
                    except (ValueError, KeyError, IndexError, TypeError):
                        break

                # Rate limited: exponential backoff, then retry same pair.
                if response.status_code == 429:
                    time.sleep(2 ** attempt)
                    continue

                # Any other status: abandon this model-token pair.
                break

    # All combinations exhausted.
    return (
        "The library is busy across multiple shelves right now. "
        "Please try again in a moment."
    )