Update backend_hf_api.py
backend_hf_api.py  +44 -11  CHANGED
```diff
@@ -21,24 +21,49 @@ def is_hf_api_available() -> bool:
     return bool(get_hf_token())
 
 
+def _suggest_repo(bad_repo: str) -> str:
+    # why: common Nemotron typo rescue
+    if "nemotron" in bad_repo.lower():
+        return "NVIDIA/Nemotron-3-8B-Instruct"
+    return "mistralai/Mistral-7B-Instruct-v0.2"
+
+
 class HFInferenceBackend:
     """
-
-
-
+    Robust HF Serverless client:
+    - Preflight: verify repo exists (fast) to avoid long blocking errors.
+    - Try text_generation streaming via huggingface_hub.
+    - If provider says 'conversational' only, call HTTP conversational and chunk output.
     """
 
     def __init__(self, model_name: str):
         token = get_hf_token()
         if not token:
             raise RuntimeError("HF_TOKEN not set")
-        self.model = model_name
+        self.model = model_name.strip()
         self.token = token
         self.client = InferenceClient(model=self.model, token=token) if InferenceClient else None
 
+    # ---------- Preflight ----------
+    def _preflight(self) -> tuple[bool, Optional[str]]:
+        """Returns (exists, pipeline_tag_or_None)."""
+        url = f"https://huggingface.co/api/models/{self.model}"
+        headers = {"Authorization": f"Bearer {self.token}"}
+        try:
+            r = requests.get(url, headers=headers, timeout=8)
+            if r.status_code == 404:
+                return False, None
+            if r.ok:
+                data = r.json()
+                # 'pipeline_tag' when known; otherwise None
+                return True, data.get("pipeline_tag")
+            return True, None
+        except Exception:
+            # If API unreachable, don't block the chat; proceed and catch later.
+            return True, None
+
     # ---------- Prompt Builders ----------
     def _build_tg_prompt(self, system_prompt: str, history: List[Tuple[str, str]], user_msg: str) -> str:
-        # Generic instruct-style prompt; works widely including Nemotron chat variants
         parts = [f"<s>[SYSTEM]\n{system_prompt}\n[/SYSTEM]\n"]
         for u, a in history:
             if u:
```
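The preflight uses the Hub's public model-info endpoint, which returns a `pipeline_tag` when one is known. As a standalone sketch of the same check (the function name `check_repo` is mine, not from this commit):

```python
import requests
from typing import Optional

def check_repo(repo_id: str, token: Optional[str] = None) -> tuple[bool, Optional[str]]:
    """Mirror of _preflight above: returns (exists, pipeline_tag_or_None)."""
    headers = {"Authorization": f"Bearer {token}"} if token else {}
    try:
        r = requests.get(f"https://huggingface.co/api/models/{repo_id}",
                         headers=headers, timeout=8)
    except requests.RequestException:
        # Hub unreachable: report "exists" so the chat path can fail later with context.
        return True, None
    if r.status_code == 404:
        return False, None
    return (True, r.json().get("pipeline_tag")) if r.ok else (True, None)
```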
```diff
@@ -94,7 +119,7 @@ class HFInferenceBackend:
             buf.append(delta)
             yield "".join(buf)
 
-    # ---------- Conversational via raw HTTP (non-stream; chunked
+    # ---------- Conversational via raw HTTP (non-stream; chunked) ----------
     def _call_conversational_http(
         self, system_prompt: str, history: List[Tuple[str, str]], user_msg: str, temperature: float, max_new_tokens: int
     ) -> Iterator[str]:
```
```diff
@@ -108,9 +133,8 @@ class HFInferenceBackend:
             "inputs": self._build_conv_inputs(system_prompt, history, user_msg),
             "parameters": {"temperature": float(temperature), "max_new_tokens": int(max_new_tokens)},
         }
-
         try:
-            resp = requests.post(url, headers=headers, data=json.dumps(payload), timeout=
+            resp = requests.post(url, headers=headers, data=json.dumps(payload), timeout=40)
         except Exception as e:
             yield f"[error] network: {type(e).__name__}: {e}"
             return
```
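One possible refinement, not part of this commit: `requests` also accepts a `(connect, read)` timeout tuple, so connection setup and the long model read get separate budgets:

```python
# Hypothetical variant of the call above: 5 s to connect, 40 s to read.
resp = requests.post(url, headers=headers, data=json.dumps(payload),
                     timeout=(5, 40))
```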
```diff
@@ -138,11 +162,9 @@ class HFInferenceBackend:
             item = data[-1]
             if isinstance(item, dict):
                 text = item.get("generated_text") or ""
-
         if not text:
-            text = json.dumps(data)
+            text = json.dumps(data)
 
-        # Chunk to simulate streaming and keep UI responsive
         buf: List[str] = []
         for i in range(0, len(text), 48):
             buf.append(text[i : i + 48])
```
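The 48-character loop presumably yields the accumulated text on each step, matching the `yield "".join(buf)` pattern of the streaming path. A self-contained sketch under that assumption (the helper name is mine):

```python
from typing import Iterator

def chunk_stream(text: str, size: int = 48) -> Iterator[str]:
    """Yield growing prefixes of `text` in `size`-char steps to simulate streaming."""
    buf: list[str] = []
    for i in range(0, len(text), size):
        buf.append(text[i : i + size])
        yield "".join(buf)

# e.g. list(chunk_stream("abcdef", 2)) == ["ab", "abcd", "abcdef"]
```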
```diff
@@ -157,7 +179,18 @@ class HFInferenceBackend:
         temperature: float,
         max_new_tokens: int,
     ) -> Iterator[str]:
+        exists, pipeline_tag = self._preflight()
+        if not exists:
+            suggestion = _suggest_repo(self.model)
+            yield f"[error] Model repository not found: {self.model}. Try: `{suggestion}`"
+            return
+
         try:
+            # If API says conversational, skip straight to conversational fallback.
+            if (pipeline_tag or "").lower() == "conversational":
+                yield from self._call_conversational_http(system_prompt, history, user_msg, temperature, max_new_tokens)
+                return
+
             yield from self._stream_text_generation(system_prompt, history, user_msg, temperature, max_new_tokens)
         except Exception as e:
             msg = str(e).lower()
```
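Taken together, the new flow is: preflight the repo, route `conversational`-tagged models straight to the HTTP fallback, otherwise stream via `text_generation` and fall back on error. A hypothetical driver (the generator's method name is not visible in this diff, so `stream_reply` is assumed):

```python
backend = HFInferenceBackend("mistralai/Mistral-7B-Instruct-v0.2")
for snapshot in backend.stream_reply(
    system_prompt="You are concise.",
    history=[],            # list of (user, assistant) tuples
    user_msg="Hello!",
    temperature=0.7,
    max_new_tokens=128,
):
    print(snapshot)        # each yield is the full text so far
```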