Spaces:

kaizen9
/

server

Sleeping

App Files Files Community

kaizen9 committed on Aug 29, 2025

Commit

511336b

1 Parent(s): 6220bad

a

Browse files

Files changed (1) hide show

app.py +51 -108

app.py CHANGED Viewed

@@ -1,120 +1,63 @@
 import gradio as gr
-from huggingface_hub import InferenceClient
-from typing import List, Dict, Optional
-# Your endpoint base URL (note: this value already ends with /v1/)
-ENDPOINT_URL = "https://x6leavj4hgm2fdyx.us-east-2.aws.endpoints.huggingface.cloud/v1/"
-def respond(
-    user_msg: str,
-    history: List[Dict[str, str]],
-    system_message: str,
-    max_tokens: int,
-    temperature: float,
-    top_p: float,
-    hf_token: Optional[gr.OAuthToken],   # from LoginButton (kept)
-    pat_override: str,                   # NEW: user-pasted PAT (password field)
-):
-    """
-    Use PAT override if provided; otherwise fall back to LoginButton token.
-    NOTE: OAuth token from LoginButton usually lacks `inference.endpoints.infer.write`,
-    so for Inference Endpoints you almost always need to paste a PAT here.
-    """
-    # Choose a token: prefer user-supplied PAT with endpoints write scope
-    token = pat_override.strip() or (getattr(hf_token, "token", None) if hf_token else None)
-    if not token:
-        yield "🔒 Please click **Login** OR paste a **Hugging Face PAT** with `inference.endpoints.infer.write`."
-        return
-    client = InferenceClient(base_url=ENDPOINT_URL, token=token)
-    # Build messages (OpenAI-style)
-    messages = []
-    if system_message:
-        messages.append({"role": "system", "content": system_message})
-    messages.extend(history or [])
-    messages.append({"role": "user", "content": user_msg})
-    # Try OpenAI-compatible chat route first: /v1/chat/completions
-    try:
-        out = ""
-        for chunk in client.chat_completion(
-            messages=messages,
-            max_tokens=max_tokens,
-            temperature=temperature,
-            top_p=top_p,
-            stream=True,
-        ):
-            tok = ""
-            if getattr(chunk, "choices", None) and getattr(chunk.choices[0], "delta", None):
-                tok = chunk.choices[0].delta.content or ""
-            out += tok
-            yield out
-        return
-    except Exception as e_chat:
-        chat_err = str(e_chat)
-    # Fallback to plain generation (for non-OpenAI runtimes)
-    try:
-        def to_prompt(msgs: List[Dict[str, str]]) -> str:
-            lines = []
-            for m in msgs:
-                role = m.get("role", "user")
-                content = m.get("content", "")
-                tag = {"system": "SYSTEM", "user": "USER"}.get(role, "ASSISTANT")
-                lines.append(f"[{tag}] {content}")
-            lines.append("[ASSISTANT]")
-            return "\n".join(lines)
-        prompt = to_prompt(messages)
-        out = ""
-        for tok in client.text_generation(
-            prompt,
-            max_new_tokens=max_tokens,
-            temperature=temperature,
-            top_p=top_p,
-            stream=True,
-            return_full_text=False,
-        ):
-            piece = getattr(tok, "token", tok)
-            if isinstance(piece, dict) and "text" in piece:
-                piece = piece["text"]
-            out += str(piece)
-            yield out
-    except Exception as e_gen:
-        yield (
-            "❗ Endpoint call failed.\n\n"
-            f"• Chat API error: {chat_err}\n"
-            f"• Text-generation fallback error: {e_gen}\n\n"
-            "Most likely cause: the token used does NOT have `inference.endpoints.infer.write`.\n"
-            "Paste a PAT with that scope in the sidebar."
-        )
-# --- UI ---
-chat = gr.ChatInterface(
-    respond,
-    type="messages",
-    additional_inputs=[
-        gr.Textbox(value="You are a friendly Chatbot.", label="System message"),
-        gr.Slider(1, 4096, value=512, step=1, label="Max new tokens"),
-        gr.Slider(0.0, 4.0, value=0.7, step=0.1, label="Temperature"),
-        gr.Slider(0.0, 1.0, value=0.95, step=0.05, label="Top-p"),
-        # NEW: secure PAT override
-        gr.Textbox(value="", label="HF PAT (with `inference.endpoints.infer.write`)", type="password"),
-    ],
-)
-with gr.Blocks() as demo:
-    with gr.Sidebar():
-        gr.Markdown("### Hugging Face Login (optional)")
-        gr.LoginButton()
-        gr.Markdown(
-            "**Important:** Inference Endpoints require a PAT with\n"
-            "`inference.endpoints.infer.write`. The Login token usually does **not** have this.\n"
-            "Paste a PAT in the password field if you see 403 errors."
-        )
-        gr.Markdown(f"**Endpoint**: `{ENDPOINT_URL}`")
-    chat.render()
 if __name__ == "__main__":
     demo.launch()

+import os
 import gradio as gr
+from openai import OpenAI
+# Pick up secrets from HF Space
+def _normalize_base_url(raw):
+    """Return *raw* without trailing slashes or a trailing ``/v1`` segment.
+
+    ``/v1`` is appended when the client is built below, so a configured URL
+    that already ends in ``/v1`` or ``/v1/`` would otherwise become
+    ``.../v1/v1``.
+    """
+    base = (raw or "").strip().rstrip("/")
+    if base.endswith("/v1"):
+        base = base[: -len("/v1")].rstrip("/")
+    return base
+# Endpoint root read from the HF_ENDPOINT_URL environment variable.
+BASE = _normalize_base_url(os.getenv("HF_ENDPOINT_URL", ""))
+# Token read from the HF_TOKEN environment variable (None when unset).
+API_KEY = os.getenv("HF_TOKEN")
+# Model name sent with every chat completion request.
+MODEL_ID = "kaizen9/qsft_30_6000_v2"
+# Module-level client shared by every chat_fn call.
+client = OpenAI(
+    base_url=f"{BASE}/v1",
+    api_key=API_KEY,
+)
+def build_messages(history, user_msg, system_msg):
+    """Build an OpenAI-style ``messages`` list for one chat turn.
+
+    Args:
+        history: Prior turns, either as ``(user, assistant)`` pairs or as
+            ``{"role": ..., "content": ...}`` dicts. ``None`` is treated as
+            an empty history.
+        user_msg: The new user message; always appended last.
+        system_msg: Optional system prompt; skipped when blank or ``None``.
+
+    Returns:
+        A list of ``{"role", "content"}`` dicts: the optional system message,
+        the non-empty history entries in order, then the new user message.
+    """
+    msgs = []
+    system_text = (system_msg or "").strip()
+    if system_text:
+        msgs.append({"role": "system", "content": system_text})
+    for turn in history or []:
+        if isinstance(turn, dict):
+            # Messages-format history: copy only role/content so any extra
+            # keys on the entry are not forwarded to the API.
+            if turn.get("content"):
+                msgs.append(
+                    {"role": turn.get("role", "user"), "content": turn["content"]}
+                )
+            continue
+        u, a = turn
+        if u:
+            msgs.append({"role": "user", "content": u})
+        if a:
+            msgs.append({"role": "assistant", "content": a})
+    msgs.append({"role": "user", "content": user_msg})
+    return msgs
+def chat_fn(message, history, system_message, temperature, top_p, max_tokens):
+    """Stream a chat completion for ``message``, yielding the growing reply.
+
+    Args:
+        message: The new user message.
+        history: Prior turns, passed through to ``build_messages``.
+        system_message: System prompt text.
+        temperature: Sampling temperature; coerced to ``float``.
+        top_p: Top-p sampling value; coerced to ``float``.
+        max_tokens: Completion token limit; coerced to ``int``.
+
+    Yields:
+        The accumulated response text after each chunk that carries content.
+    """
+    msgs = build_messages(history, message, system_message)
+    stream = client.chat.completions.create(
+        model=MODEL_ID,
+        messages=msgs,
+        temperature=float(temperature),
+        top_p=float(top_p),
+        max_tokens=int(max_tokens),
+        stream=True,
+    )
+    partial = ""
+    for chunk in stream:
+        # A streamed chunk may carry no choices (e.g. a usage-only chunk);
+        # indexing choices[0] on it would raise IndexError mid-stream.
+        if not chunk.choices:
+            continue
+        delta = chunk.choices[0].delta
+        if delta and delta.content:
+            partial += delta.content
+            yield partial
+# Page layout: a title, the generation controls, then the chat widget.
+with gr.Blocks() as demo:
+    gr.Markdown("# QSFT Chat UI")
+    system_box = gr.Textbox(
+        label="System prompt",
+        value="You are a helpful assistant.",
+        lines=2,
+    )
+    temp = gr.Slider(0.0, 2.0, 0.7, step=0.1, label="Temperature")
+    topp = gr.Slider(0.0, 1.0, 0.95, step=0.01, label="Top-p")
+    maxt = gr.Slider(16, 4096, 512, step=16, label="Max tokens")
+    # additional_inputs order must match chat_fn's extra parameters:
+    # system_message, temperature, top_p, max_tokens.
+    # retry_btn / undo_btn are deliberately not passed: they take a label
+    # string, a gr.Button or None (not a bool) in Gradio 4, and the
+    # parameters no longer exist in Gradio 5.
+    gr.ChatInterface(
+        fn=chat_fn,
+        additional_inputs=[system_box, temp, topp, maxt],
+    )
 if __name__ == "__main__":
     demo.launch()