Oleksii Obolonskyi committed on
Commit
8aac1c0
·
1 Parent(s): 45772d2

Use HF Router OpenAI client

Browse files
Files changed (2) hide show
  1. app.py +33 -100
  2. requirements.txt +1 -1
app.py CHANGED
@@ -14,7 +14,7 @@ import streamlit as st
14
  import numpy as np
15
  import faiss
16
  import requests
17
- from huggingface_hub import InferenceClient
18
  from sentence_transformers import SentenceTransformer
19
 
20
  load_dotenv(Path(__file__).resolve().parent / ".env", override=True)
@@ -83,15 +83,7 @@ OVERLAP_FILTER = CONFIG.overlap_filter
83
  RETRIEVE_TOPK_MULT = CONFIG.retrieve_topk_mult
84
 
85
  HF_TOKEN = os.getenv("HF_TOKEN") or os.getenv("HUGGINGFACEHUB_API_TOKEN")
86
- HF_PROVIDER = os.getenv("RAG_HF_PROVIDER", "hf-inference").strip() or "hf-inference"
87
- HF_MODEL_PRIMARY = os.getenv("RAG_HF_MODEL", os.getenv("RAG_HF_MODEL_PRIMARY", "HuggingFaceTB/SmolLM3-3B")).strip()
88
- HF_MODEL_FALLBACKS_RAW = os.getenv("RAG_HF_MODEL_FALLBACKS", "").strip()
89
- HF_MODEL_FALLBACKS = (
90
- [m.strip() for m in HF_MODEL_FALLBACKS_RAW.split(",") if m.strip()]
91
- if HF_MODEL_FALLBACKS_RAW
92
- else ["HuggingFaceTB/SmolLM3-3B", "HuggingFaceTB/SmolLM2-1.7B", "HuggingFaceTB/SmolLM2-360M"]
93
- )
94
- HF_MODEL_CANDIDATES = [HF_MODEL_PRIMARY] + [m for m in HF_MODEL_FALLBACKS if m != HF_MODEL_PRIMARY]
95
 
96
  OLLAMA_BASE_URL = os.environ.get("RAG_OLLAMA_URL", "http://localhost:11434").rstrip("/")
97
  OLLAMA_MODEL = os.environ.get("RAG_OLLAMA_MODEL", "llama3.2:1b")
@@ -623,82 +615,30 @@ def is_running_on_spaces() -> bool:
623
  return True
624
  return (os.environ.get("SYSTEM") or "").strip().lower() == "spaces"
625
 
626
- def get_hf_client(model_id: str) -> InferenceClient:
627
- return InferenceClient(model=model_id, provider=HF_PROVIDER, token=HF_TOKEN)
628
-
629
- def select_active_hf_model() -> str:
630
- if st.session_state.get("hf_active_model"):
631
- return st.session_state["hf_active_model"]
632
- last_err = ""
633
- for model_id in HF_MODEL_CANDIDATES:
634
- try:
635
- client = get_hf_client(model_id)
636
- client.text_generation(
637
- "ping",
638
- max_new_tokens=2,
639
- temperature=0.0,
640
- do_sample=False,
641
- return_full_text=False,
642
- )
643
- st.session_state["hf_active_model"] = model_id
644
- st.session_state.pop("hf_startup_error", None)
645
- return model_id
646
- except Exception as exc:
647
- last_err = str(exc)
648
- st.session_state["hf_active_model"] = HF_MODEL_PRIMARY
649
- if last_err:
650
- st.session_state["hf_startup_error"] = last_err
651
- return HF_MODEL_PRIMARY
652
-
653
- class LLMClient:
654
- def __init__(self, backend: str) -> None:
655
- self.backend = backend
656
-
657
- def generate(self, prompt: str) -> Tuple[str, Optional[str]]:
658
- if self.backend == "ollama":
659
- return ollama_chat(prompt)
660
- return self._hf_generate(prompt)
661
-
662
- def _hf_generate(self, prompt: str) -> Tuple[str, Optional[str]]:
663
- model_id = select_active_hf_model()
664
- client = get_hf_client(model_id)
665
- messages = [
666
- {"role": "system", "content": system_message()},
667
- {"role": "user", "content": prompt},
668
- ]
669
- try:
670
- chat_api = getattr(getattr(client, "chat", None), "completions", None)
671
- create_fn = getattr(chat_api, "create", None)
672
- if create_fn:
673
- resp = create_fn(
674
- model=model_id,
675
- messages=messages,
676
- max_tokens=MAX_GENERATION_TOKENS,
677
- temperature=0.2,
678
- )
679
- text = (resp.choices[0].message.content or "").strip()
680
- return text, None
681
- except Exception as exc:
682
- chat_err = str(exc)
683
- else:
684
- chat_err = ""
685
-
686
- try:
687
- out = client.text_generation(
688
- prompt,
689
- max_new_tokens=MAX_GENERATION_TOKENS,
690
- temperature=0.2,
691
- do_sample=True,
692
- return_full_text=False,
693
- )
694
- return (out or "").strip(), None
695
- except Exception as exc:
696
- err_msg = str(exc) or chat_err
697
- hint = f"HF model: {model_id}; provider: {HF_PROVIDER}."
698
- err_low = err_msg.lower()
699
- if any(k in err_low for k in ["401", "403", "gated", "license", "not authorized", "forbidden"]):
700
- hint += " This model is gated. Ensure HF_TOKEN has accepted the license."
701
- return "", f"{err_msg} ({hint})"
702
 
703
  def ollama_chat(prompt: str, timeout: Tuple[int, int] = (10, 600)) -> Tuple[str, Optional[str]]:
704
  url = f"{OLLAMA_BASE_URL}/api/chat"
@@ -728,15 +668,15 @@ def llm_chat(prompt: str, timeout: Tuple[int, int] = (10, 600)) -> Tuple[str, Op
728
  """
729
  backend = (os.environ.get("RAG_LLM_BACKEND", "") or "").strip().lower()
730
 
731
- if backend == "hf":
732
- return LLMClient("hf").generate(prompt)
733
  if backend == "ollama":
734
- return LLMClient("ollama").generate(prompt)
735
  if is_running_on_spaces():
736
- return LLMClient("hf").generate(prompt)
737
  if (HF_TOKEN or "").strip():
738
- return LLMClient("hf").generate(prompt)
739
- return LLMClient("ollama").generate(prompt)
740
 
741
  def github_create_issue(title: str, body: str, labels: Optional[List[str]] = None) -> Tuple[Optional[int], Optional[str]]:
742
  if not GITHUB_TOKEN:
@@ -804,14 +744,7 @@ with st.sidebar:
804
  st.session_state["open_ticket_ui"] = True
805
  st.write("")
806
  st.subheader("LLM")
807
- backend = os.getenv("RAG_LLM_BACKEND", "auto").strip().lower()
808
- use_hf = backend == "hf" or (
809
- backend == "auto" and (is_running_on_spaces() or (HF_TOKEN or "").strip())
810
- )
811
- active_model = select_active_hf_model() if use_hf else HF_MODEL_PRIMARY
812
- st.markdown(f"- Active model: `{active_model}`")
813
- if use_hf and st.session_state.get("hf_startup_error"):
814
- st.warning("HF model not available; check token/provider/model list.")
815
  st.write("")
816
  st.subheader("Embedding model (retrieval)")
817
  st.code(EMBED_MODEL)
 
14
  import numpy as np
15
  import faiss
16
  import requests
17
+ from openai import OpenAI
18
  from sentence_transformers import SentenceTransformer
19
 
20
  load_dotenv(Path(__file__).resolve().parent / ".env", override=True)
 
83
  RETRIEVE_TOPK_MULT = CONFIG.retrieve_topk_mult
84
 
85
  HF_TOKEN = os.getenv("HF_TOKEN") or os.getenv("HUGGINGFACEHUB_API_TOKEN")
86
+ HF_MODEL = os.getenv("RAG_HF_MODEL", "Qwen/Qwen2.5-7B-Instruct-1M:featherless-ai").strip()
 
 
 
 
 
 
 
 
87
 
88
  OLLAMA_BASE_URL = os.environ.get("RAG_OLLAMA_URL", "http://localhost:11434").rstrip("/")
89
  OLLAMA_MODEL = os.environ.get("RAG_OLLAMA_MODEL", "llama3.2:1b")
 
615
  return True
616
  return (os.environ.get("SYSTEM") or "").strip().lower() == "spaces"
617
 
618
@st.cache_resource(show_spinner=False)
def get_hf_router_client() -> OpenAI:
    """Return a process-wide OpenAI client pointed at the HF Inference Router.

    The client is built once and cached across Streamlit reruns via
    ``st.cache_resource``. It talks to Hugging Face's OpenAI-compatible
    router endpoint, authenticated with the module-level ``HF_TOKEN``.
    """
    router_base = "https://router.huggingface.co/v1"
    client = OpenAI(base_url=router_base, api_key=HF_TOKEN)
    return client
624
+
625
def hf_chat(prompt: str) -> Tuple[str, Optional[str]]:
    """Generate a chat completion for *prompt* via the HF Router backend.

    Returns ``(text, None)`` on success, or ``("", error_message)`` on any
    failure (missing token, API/network error), so callers never need to
    catch exceptions themselves.
    """
    if not HF_TOKEN:
        return "", "Missing HF_TOKEN (or HUGGINGFACEHUB_API_TOKEN)"
    try:
        client = get_hf_router_client()
        completion = client.chat.completions.create(
            model=HF_MODEL,
            messages=[
                # Use the app's RAG system prompt — the previous HF backend
                # (removed in this commit) sent system_message() here; a
                # generic "helpful assistant" prompt drops the grounding
                # instructions the retrieval pipeline relies on.
                {"role": "system", "content": system_message()},
                {"role": "user", "content": prompt},
            ],
            max_tokens=MAX_GENERATION_TOKENS,
            temperature=0.2,
        )
        # content may be None for empty/refused completions; normalize to "".
        return (completion.choices[0].message.content or "").strip(), None
    except Exception as e:
        # Surface any router/auth/network failure as an error string rather
        # than crashing the Streamlit app.
        return "", str(e)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
642
 
643
  def ollama_chat(prompt: str, timeout: Tuple[int, int] = (10, 600)) -> Tuple[str, Optional[str]]:
644
  url = f"{OLLAMA_BASE_URL}/api/chat"
 
668
  """
669
  backend = (os.environ.get("RAG_LLM_BACKEND", "") or "").strip().lower()
670
 
671
+ if backend == "hf-router":
672
+ return hf_chat(prompt)
673
  if backend == "ollama":
674
+ return ollama_chat(prompt)
675
  if is_running_on_spaces():
676
+ return hf_chat(prompt)
677
  if (HF_TOKEN or "").strip():
678
+ return hf_chat(prompt)
679
+ return ollama_chat(prompt)
680
 
681
  def github_create_issue(title: str, body: str, labels: Optional[List[str]] = None) -> Tuple[Optional[int], Optional[str]]:
682
  if not GITHUB_TOKEN:
 
744
  st.session_state["open_ticket_ui"] = True
745
  st.write("")
746
  st.subheader("LLM")
747
+ st.markdown(f"- Active model: `{HF_MODEL}`")
 
 
 
 
 
 
 
748
  st.write("")
749
  st.subheader("Embedding model (retrieval)")
750
  st.code(EMBED_MODEL)
requirements.txt CHANGED
@@ -3,7 +3,7 @@
3
  # -------------------------
4
  requests>=2.31.0
5
  python-dotenv>=1.0.0
6
- huggingface_hub>=0.24.0
7
  numpy>=1.24.0
8
  faiss-cpu>=1.8.0
9
  sentence-transformers>=2.6.0
 
3
  # -------------------------
4
  requests>=2.31.0
5
  python-dotenv>=1.0.0
6
+ openai>=1.0.0
7
  numpy>=1.24.0
8
  faiss-cpu>=1.8.0
9
  sentence-transformers>=2.6.0