app.py
CHANGED
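The hunks below start at line 11 of app.py, so the file's import block is never shown. For orientation, here is a minimal sketch of what the unshown preamble plausibly contains, inferred from the names the diff relies on (`gr`, `InferenceClient`, `ENDPOINT_URL`, `List`, `Dict`, and the `respond` parameters); the endpoint URL and the order of the first three parameters are assumptions, not part of the diff:

# Sketch only -- reconstructed preamble, not taken from the diff.
from typing import Dict, List

import gradio as gr
from huggingface_hub import InferenceClient

# Assumption: the real file hard-codes its own Inference Endpoint URL here.
ENDPOINT_URL = "https://<your-endpoint>.endpoints.huggingface.cloud"

def respond(
    user_msg: str,            # the new user message (name used in the diff body)
    history: list,            # prior turns as {"role": ..., "content": ...} dicts
    system_message: str,      # bound to the "System message" Textbox
    max_tokens: int,          # from here on the signature matches the diff (line 11)
    temperature: float,
    top_p: float,
    hf_token: gr.OAuthToken,
):
    ...  # body is what the diff modifies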
@@ -11,37 +11,29 @@ def respond(
     max_tokens: int,
     temperature: float,
     top_p: float,
-    hf_token: gr.OAuthToken,
+    hf_token: gr.OAuthToken,  # <-- LoginButton injects this
 ):
-    """
-
-
-
-
-
-    If the endpoint doesn't support OpenAI-style /v1/chat (e.g., plain TGI),
-    we fallback to a single-prompt `.text_generation()` call using a simple
-    prompt format built from the chat history.
-    """
-    # 1) Client that talks directly to your endpoint
+    # 0) Make sure user actually clicked "Login"
+    if hf_token is None or not getattr(hf_token, "token", None):
+        yield "🔒 Please click **Login** (left sidebar) to authorize Hugging Face access."
+        return
+
+    # 1) Create client against your endpoint (not model=)
     client = InferenceClient(
         base_url=ENDPOINT_URL,
-        token=hf_token.token,
+        token=hf_token.token,  # <-- PAT from Login flow
     )
 
-    # 2) Build
+    # 2) Build messages for chat APIs
     messages = []
     if system_message:
         messages.append({"role": "system", "content": system_message})
-
-    # Gradio gives `history` as a list of {"role": "...", "content": "..."} when type="messages"
-    # Append previous turns, then the new user message
     messages.extend(history or [])
     messages.append({"role": "user", "content": user_msg})
 
-    # 3) Try OpenAI-style chat
+    # 3) Try OpenAI-style /v1/chat if your endpoint supports it
     try:
-
+        out = ""
         for chunk in client.chat_completion(
             messages=messages,
             max_tokens=max_tokens,
@@ -49,92 +41,82 @@ def respond(
             top_p=top_p,
             stream=True,
         ):
-            # chunk.choices[0].delta.content is the streamed token (if present)
             token = ""
             if getattr(chunk, "choices", None) and getattr(chunk.choices[0], "delta", None):
                 token = chunk.choices[0].delta.content or ""
-
-            yield
-        return
-    except Exception as
-
-
-
-
-    # 4) Fallback: Plain text generation with a simple chat-to-prompt adapter
+            out += token
+            yield out
+        return
+    except Exception as chat_err:
+        chat_err_msg = str(chat_err)
+
+    # 4) Fallback to plain text-generation (works on vanilla TGI endpoints)
     try:
-        def
+        def to_prompt(msgs: List[Dict[str, str]]) -> str:
             lines = []
             for m in msgs:
                 role = m.get("role", "user")
                 content = m.get("content", "")
-
-
-                elif role == "user":
-                    lines.append(f"[USER] {content}")
-                else:
-                    lines.append(f"[ASSISTANT] {content}")
+                tag = {"system": "SYSTEM", "user": "USER"}.get(role, "ASSISTANT")
+                lines.append(f"[{tag}] {content}")
             lines.append("[ASSISTANT]")  # cue the model to speak
             return "\n".join(lines)
 
-        prompt =
+        prompt = to_prompt(messages)
 
-
-        # stream text_generation tokens if the backend supports it
+        out = ""
         for tok in client.text_generation(
             prompt,
             max_new_tokens=max_tokens,
             temperature=temperature,
             top_p=top_p,
             stream=True,
-            # Many TGI backends respect these kwargs; safe to include
            return_full_text=False,
         ):
-            # `tok` can be a string or an object depending on server; normalize to str
             piece = getattr(tok, "token", tok)
             if isinstance(piece, dict) and "text" in piece:
                 piece = piece["text"]
-
-
-
-
-
-
-        err = (
-            "Failed to query the endpoint.\n\n"
-            f"- Chat attempt error: {fallback_reason}\n"
-            f"- Text-generation fallback error: {e2}\n\n"
-            "Check that your endpoint is running, your token has "
-            "`inference.endpoints.infer.write`, and the runtime supports either "
-            "OpenAI chat (/v1/chat/completions) or TGI text-generation."
-        )
-        yield err
+            out += str(piece)
+            yield out
+
+    except Exception as gen_err:
+        # 5) Clear, helpful errors for auth/permissions/runtime
+        err_text = f"""❗ Failed to query the endpoint.
 
+• Chat API error: {chat_err_msg}
+• Text-generation fallback error: {gen_err}
 
-
-
+Quick checks:
+1) You clicked **Login** and authorized this app.
+2) Your HF token includes `inference.endpoints.infer.write`.
+3) The endpoint is running and supports either OpenAI chat or TGI generation.
+Endpoint: {ENDPOINT_URL}
+"""
+        yield err_text
+
+
+# --- UI ---
+chat = gr.ChatInterface(
     respond,
-    type="messages",
+    type="messages",
     additional_inputs=[
         gr.Textbox(value="You are a friendly Chatbot.", label="System message"),
-        gr.Slider(
-        gr.Slider(
-        gr.Slider(
+        gr.Slider(1, 4096, value=512, step=1, label="Max new tokens"),
+        gr.Slider(0.0, 4.0, value=0.7, step=0.1, label="Temperature"),
+        gr.Slider(0.0, 1.0, value=0.95, step=0.05, label="Top-p"),
     ],
 )
 
 with gr.Blocks() as demo:
     with gr.Sidebar():
         gr.Markdown("### Hugging Face Login")
-        #
-        gr.LoginButton()
-        gr.Markdown(
-            "Make sure your token has **`inference.endpoints.infer.write`** permission."
-        )
+        gr.LoginButton()  # <-- keep this
         gr.Markdown(
-
+            "- Make sure your token has **`inference.endpoints.infer.write`**.\n"
+            "- This app will use your HF token only to call the endpoint."
         )
-
+        gr.Markdown(f"**Endpoint**: `{ENDPOINT_URL}`")
+        chat.render()
 
 if __name__ == "__main__":
     demo.launch()
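
To make the fallback's prompt format concrete, the `to_prompt` helper from the new version (defined inside `respond`; hoisted to module level here purely for illustration) flattens a chat history like this:

from typing import Dict, List

def to_prompt(msgs: List[Dict[str, str]]) -> str:
    # Copied from the new version of app.py above.
    lines = []
    for m in msgs:
        role = m.get("role", "user")
        content = m.get("content", "")
        tag = {"system": "SYSTEM", "user": "USER"}.get(role, "ASSISTANT")
        lines.append(f"[{tag}] {content}")
    lines.append("[ASSISTANT]")  # cue the model to speak
    return "\n".join(lines)

print(to_prompt([
    {"role": "system", "content": "You are a friendly Chatbot."},
    {"role": "user", "content": "Hi!"},
]))
# [SYSTEM] You are a friendly Chatbot.
# [USER] Hi!
# [ASSISTANT]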
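The other behavioral change worth noting is the `out` accumulator in both streaming loops: `gr.ChatInterface` replaces the displayed reply with each value a generator yields, so the function must yield the whole response so far rather than individual tokens. A standalone sketch of the pattern (illustration only, not from app.py):

def stream_demo():
    out = ""
    for token in ["Hel", "lo", "!"]:
        out += token
        yield out  # yield the running text, not just `token`

print(list(stream_demo()))  # ['Hel', 'Hello', 'Hello!']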