Oleksii Obolonskyi committed on
Commit
052a978
·
1 Parent(s): 8faa6a7

Harden HF inference routing

Browse files
Files changed (2) hide show
  1. README.md +5 -7
  2. app.py +89 -14
README.md CHANGED
@@ -57,11 +57,7 @@ export HF_TOKEN=hf_your_token_here
57
  export RAG_HF_MODEL=meta-llama/Llama-3.2-1B-Instruct
58
  ```
59
 
60
- Optional override if you use a dedicated Inference Endpoint:
61
-
62
- ```bash
63
- export RAG_HF_API_URL=https://api-inference.huggingface.co/models/meta-llama/Llama-3.2-1B-Instruct
64
- ```
65
 
66
  ### 3) Prepare sources
67
 
@@ -106,7 +102,8 @@ export RAG_ARTICLE_MANIFEST_PATH=data/normalized/manifest_articles.json
106
  export RAG_EMBED_MODEL=sentence-transformers/all-MiniLM-L6-v2
107
  export HF_TOKEN=hf_your_token_here
108
  export RAG_HF_MODEL=meta-llama/Llama-3.2-1B-Instruct
109
- export RAG_HF_API_URL=https://api-inference.huggingface.co/models/meta-llama/Llama-3.2-1B-Instruct
 
110
  export RAG_OUT_DIR=data/normalized
111
  export RAG_ARTICLE_SOURCES=sources_articles.json
112
  ```
@@ -115,7 +112,8 @@ export RAG_ARTICLE_SOURCES=sources_articles.json
115
 
116
  1. Create a new Space (Streamlit SDK) and push this repo.
117
  2. In Space Settings → Secrets, set `HF_TOKEN` (required) and optionally `GITHUB_TOKEN`.
118
- 3. In Space Settings → Variables, set `RAG_HF_MODEL` or `RAG_HF_API_URL` if you want to override defaults.
 
119
 
120
  ## Common maintenance tasks
121
 
 
57
  export RAG_HF_MODEL=meta-llama/Llama-3.2-1B-Instruct
58
  ```
59
 
60
+ Optional: set `RAG_HF_FALLBACK_MODEL` to retry if the primary model is gated or unavailable.
 
 
 
 
61
 
62
  ### 3) Prepare sources
63
 
 
102
  export RAG_EMBED_MODEL=sentence-transformers/all-MiniLM-L6-v2
103
  export HF_TOKEN=hf_your_token_here
104
  export RAG_HF_MODEL=meta-llama/Llama-3.2-1B-Instruct
105
+ export RAG_LLM_BACKEND=hf
106
+ export RAG_HF_FALLBACK_MODEL=HuggingFaceH4/zephyr-7b-beta
107
  export RAG_OUT_DIR=data/normalized
108
  export RAG_ARTICLE_SOURCES=sources_articles.json
109
  ```
 
112
 
113
  1. Create a new Space (Streamlit SDK) and push this repo.
114
  2. In Space Settings → Secrets, set `HF_TOKEN` (required) and optionally `GITHUB_TOKEN`.
115
+ 3. In Space Settings → Variables, set `RAG_HF_MODEL` (required) and `RAG_LLM_BACKEND=hf`.
116
+ 4. Optional: `RAG_HF_FALLBACK_MODEL` to retry if the primary model is gated or unavailable.
117
 
118
  ## Common maintenance tasks
119
 
app.py CHANGED
@@ -15,7 +15,6 @@ import numpy as np
15
  import faiss
16
  import requests
17
  from huggingface_hub import InferenceClient
18
- from huggingface_hub import InferenceClient
19
  from sentence_transformers import SentenceTransformer
20
 
21
  load_dotenv(Path(__file__).resolve().parent / ".env", override=True)
@@ -39,6 +38,7 @@ HF_TOKEN = (
39
  or ""
40
  ).strip()
41
  HF_MODEL = os.environ.get("RAG_HF_MODEL", "meta-llama/Llama-3.2-1B-Instruct")
 
42
 
43
  OLLAMA_BASE_URL = os.environ.get("RAG_OLLAMA_URL", "http://localhost:11434").rstrip("/")
44
  OLLAMA_MODEL = os.environ.get("RAG_OLLAMA_MODEL", "llama3.2:1b")
@@ -508,12 +508,15 @@ def answer_question(
508
  return "Model error: Empty response from model", citations, False
509
  return sanitize_answer(answer), citations, True
510
 
511
- def build_hf_prompt(user_prompt: str, model_id: str) -> str:
512
- system_msg = (
513
  f"You are an assistant for {COMPANY_NAME}. Contact: {COMPANY_EMAIL}, "
514
  f"{COMPANY_PHONE}. {COMPANY_ABOUT}. Answer only from the provided context. "
515
  "Keep answers concise. Cite sources using the provided citation tags exactly."
516
  )
 
 
 
517
  if "llama-3" in model_id.lower():
518
  return (
519
  "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n"
@@ -523,15 +526,44 @@ def build_hf_prompt(user_prompt: str, model_id: str) -> str:
523
  return f"System: {system_msg}\nUser: {user_prompt}\nAssistant:"
524
 
525
  @st.cache_resource(show_spinner=False)
526
- def get_hf_client() -> InferenceClient:
527
- return InferenceClient(model=HF_MODEL, token=HF_TOKEN)
528
 
529
  def hf_chat(prompt: str, timeout: Tuple[int, int] = (10, 600)) -> Tuple[str, Optional[str]]:
530
  if not HF_TOKEN:
531
  return "", "Missing HF_TOKEN (or HUGGINGFACEHUB_API_TOKEN)"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
532
  try:
533
- client = get_hf_client()
534
- inp = build_hf_prompt(prompt, HF_MODEL)
535
  out = client.text_generation(
536
  inp,
537
  max_new_tokens=512,
@@ -541,14 +573,41 @@ def hf_chat(prompt: str, timeout: Tuple[int, int] = (10, 600)) -> Tuple[str, Opt
541
  )
542
  return (out or "").strip(), None
543
  except Exception as e:
544
- return "", str(e)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
545
 
546
  def ollama_chat(prompt: str, timeout: Tuple[int, int] = (10, 600)) -> Tuple[str, Optional[str]]:
547
  url = f"{OLLAMA_BASE_URL}/api/chat"
548
  payload = {
549
  "model": OLLAMA_MODEL,
550
  "messages": [
551
- {"role": "system", "content": f"You are an assistant for {COMPANY_NAME}. Contact: {COMPANY_EMAIL}, {COMPANY_PHONE}. {COMPANY_ABOUT}. Answer only from the provided context. Keep answers concise. Cite sources using the provided citation tags exactly."},
552
  {"role": "user", "content": prompt},
553
  ],
554
  "stream": False,
@@ -575,10 +634,17 @@ def llm_chat(prompt: str, timeout: Tuple[int, int] = (10, 600)) -> Tuple[str, Op
575
  return hf_chat(prompt, timeout=timeout)
576
  if backend == "ollama":
577
  return ollama_chat(prompt, timeout=timeout)
 
 
578
  if (HF_TOKEN or "").strip():
579
  return hf_chat(prompt, timeout=timeout)
580
  return ollama_chat(prompt, timeout=timeout)
581
 
 
 
 
 
 
582
  def github_create_issue(title: str, body: str, labels: Optional[List[str]] = None) -> Tuple[Optional[int], Optional[str]]:
583
  if not GITHUB_TOKEN:
584
  return None, "Missing GITHUB_TOKEN"
@@ -646,9 +712,16 @@ with st.sidebar:
646
  st.write("")
647
  st.subheader("LLM")
648
  st.markdown(f"- Model: `{HF_MODEL}`")
649
- st.markdown(f"- URL: `{HF_API_URL}`")
650
- if not HF_TOKEN:
651
- st.warning("HF_TOKEN is not set. LLM requests will fail until you add it.")
 
 
 
 
 
 
 
652
  st.write("")
653
  st.subheader("Embedding model (retrieval)")
654
  st.code(EMBED_MODEL)
@@ -1019,8 +1092,10 @@ if st.session_state.get("active_action"):
1019
  if ok:
1020
  push_message("assistant", answer, citations=citations, not_found=False)
1021
  else:
1022
- push_message("assistant", answer, citations=[], not_found=True)
1023
- st.session_state["ticket_prefill"] = {"question": q_norm, "citations": citations}
 
 
1024
  st.session_state["last_question"] = q_norm
1025
  st.session_state["last_citations"] = citations
1026
  st.session_state["last_answer"] = answer
 
15
  import faiss
16
  import requests
17
  from huggingface_hub import InferenceClient
 
18
  from sentence_transformers import SentenceTransformer
19
 
20
  load_dotenv(Path(__file__).resolve().parent / ".env", override=True)
 
38
  or ""
39
  ).strip()
40
  HF_MODEL = os.environ.get("RAG_HF_MODEL", "meta-llama/Llama-3.2-1B-Instruct")
41
+ HF_FALLBACK_MODEL = os.environ.get("RAG_HF_FALLBACK_MODEL", "").strip()
42
 
43
  OLLAMA_BASE_URL = os.environ.get("RAG_OLLAMA_URL", "http://localhost:11434").rstrip("/")
44
  OLLAMA_MODEL = os.environ.get("RAG_OLLAMA_MODEL", "llama3.2:1b")
 
508
  return "Model error: Empty response from model", citations, False
509
  return sanitize_answer(answer), citations, True
510
 
511
+ def system_message() -> str:
512
+ return (
513
  f"You are an assistant for {COMPANY_NAME}. Contact: {COMPANY_EMAIL}, "
514
  f"{COMPANY_PHONE}. {COMPANY_ABOUT}. Answer only from the provided context. "
515
  "Keep answers concise. Cite sources using the provided citation tags exactly."
516
  )
517
+
518
+ def build_hf_prompt(user_prompt: str, model_id: str) -> str:
519
+ system_msg = system_message()
520
  if "llama-3" in model_id.lower():
521
  return (
522
  "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n"
 
526
  return f"System: {system_msg}\nUser: {user_prompt}\nAssistant:"
527
 
528
  @st.cache_resource(show_spinner=False)
529
+ def get_hf_client(model_id: str) -> InferenceClient:
530
+ return InferenceClient(model=model_id, token=HF_TOKEN)
531
 
532
  def hf_chat(prompt: str, timeout: Tuple[int, int] = (10, 600)) -> Tuple[str, Optional[str]]:
533
  if not HF_TOKEN:
534
  return "", "Missing HF_TOKEN (or HUGGINGFACEHUB_API_TOKEN)"
535
+ return hf_chat_with_model(prompt, HF_MODEL, timeout=timeout)
536
+
537
+ def hf_chat_with_model(
538
+ prompt: str,
539
+ model_id: str,
540
+ timeout: Tuple[int, int] = (10, 600),
541
+ ) -> Tuple[str, Optional[str]]:
542
+ client = get_hf_client(model_id)
543
+ messages = [
544
+ {"role": "system", "content": system_message()},
545
+ {"role": "user", "content": prompt},
546
+ ]
547
+ try:
548
+ chat_api = getattr(getattr(client, "chat", None), "completions", None)
549
+ create_fn = getattr(chat_api, "create", None)
550
+ if create_fn:
551
+ resp = create_fn(
552
+ model=model_id,
553
+ messages=messages,
554
+ max_tokens=512,
555
+ temperature=0.2,
556
+ )
557
+ text = (resp.choices[0].message.content or "").strip()
558
+ return text, None
559
+ except Exception as e:
560
+ fallback = hf_fallback_model_error(str(e), model_id)
561
+ if fallback:
562
+ return hf_chat_with_model(prompt, fallback, timeout=timeout)
563
+ return "", hf_format_error(str(e), model_id)
564
+
565
  try:
566
+ inp = build_hf_prompt(prompt, model_id)
 
567
  out = client.text_generation(
568
  inp,
569
  max_new_tokens=512,
 
573
  )
574
  return (out or "").strip(), None
575
  except Exception as e:
576
+ fallback = hf_fallback_model_error(str(e), model_id)
577
+ if fallback:
578
+ return hf_chat_with_model(prompt, fallback, timeout=timeout)
579
+ return "", hf_format_error(str(e), model_id)
580
+
581
+ def hf_fallback_model_error(err: str, model_id: str) -> Optional[str]:
582
+ if not HF_FALLBACK_MODEL:
583
+ return None
584
+ if model_id == HF_FALLBACK_MODEL:
585
+ return None
586
+ err_low = (err or "").lower()
587
+ if any(k in err_low for k in ["401", "403", "gated", "license", "not authorized", "forbidden"]):
588
+ return HF_FALLBACK_MODEL
589
+ if any(k in err_low for k in ["404", "not found", "provider", "unavailable", "service unavailable"]):
590
+ return HF_FALLBACK_MODEL
591
+ return None
592
+
593
+ def hf_format_error(err: str, model_id: str) -> str:
594
+ err_low = (err or "").lower()
595
+ if any(k in err_low for k in ["401", "403", "gated", "license", "not authorized", "forbidden"]):
596
+ return (
597
+ f"{err} This model is gated. Ensure HF_TOKEN has accepted the model license and has access."
598
+ )
599
+ if any(k in err_low for k in ["404", "not found"]):
600
+ return f"{err} Model not found. Verify RAG_HF_MODEL or RAG_HF_API_URL."
601
+ if any(k in err_low for k in ["provider", "unavailable", "service unavailable"]):
602
+ return f"{err} Provider unavailable. Try again later or set RAG_HF_FALLBACK_MODEL."
603
+ return err
604
 
605
  def ollama_chat(prompt: str, timeout: Tuple[int, int] = (10, 600)) -> Tuple[str, Optional[str]]:
606
  url = f"{OLLAMA_BASE_URL}/api/chat"
607
  payload = {
608
  "model": OLLAMA_MODEL,
609
  "messages": [
610
+ {"role": "system", "content": system_message()},
611
  {"role": "user", "content": prompt},
612
  ],
613
  "stream": False,
 
634
  return hf_chat(prompt, timeout=timeout)
635
  if backend == "ollama":
636
  return ollama_chat(prompt, timeout=timeout)
637
+ if is_running_on_spaces():
638
+ return hf_chat(prompt, timeout=timeout)
639
  if (HF_TOKEN or "").strip():
640
  return hf_chat(prompt, timeout=timeout)
641
  return ollama_chat(prompt, timeout=timeout)
642
 
643
+ def is_running_on_spaces() -> bool:
644
+ if os.environ.get("HF_SPACE_ID") or os.environ.get("SPACE_ID"):
645
+ return True
646
+ return (os.environ.get("SYSTEM") or "").strip().lower() == "spaces"
647
+
648
  def github_create_issue(title: str, body: str, labels: Optional[List[str]] = None) -> Tuple[Optional[int], Optional[str]]:
649
  if not GITHUB_TOKEN:
650
  return None, "Missing GITHUB_TOKEN"
 
712
  st.write("")
713
  st.subheader("LLM")
714
  st.markdown(f"- Model: `{HF_MODEL}`")
715
+ st.markdown(f"- Backend: `{(os.environ.get('RAG_LLM_BACKEND') or 'auto')}`")
716
+ st.markdown(f"- HF token set: `{bool(HF_TOKEN)}`")
717
+ if HF_FALLBACK_MODEL:
718
+ st.markdown(f"- HF fallback: `{HF_FALLBACK_MODEL}`")
719
+ if st.button("Test model", key="hf_test", use_container_width=True, disabled=st.session_state["is_thinking"]):
720
+ test_text, test_err = llm_chat("Say OK.")
721
+ if test_err:
722
+ st.error(test_err)
723
+ else:
724
+ st.success(test_text or "OK")
725
  st.write("")
726
  st.subheader("Embedding model (retrieval)")
727
  st.code(EMBED_MODEL)
 
1092
  if ok:
1093
  push_message("assistant", answer, citations=citations, not_found=False)
1094
  else:
1095
+ is_not_found = answer.strip() == "Not found in dataset."
1096
+ push_message("assistant", answer, citations=[], not_found=is_not_found)
1097
+ if is_not_found:
1098
+ st.session_state["ticket_prefill"] = {"question": q_norm, "citations": citations}
1099
  st.session_state["last_question"] = q_norm
1100
  st.session_state["last_citations"] = citations
1101
  st.session_state["last_answer"] = answer