Update app.py
app.py CHANGED
@@ -9,16 +9,12 @@ def respond(
     max_tokens,
     temperature,
     top_p,
-    # CHANGE: Replace gr.OAuthToken with a standard string parameter
     hf_token_string,
 ):
-    # Use the token passed from the API,
-    # OR if empty, try to get it from Space Secrets (Settings > Secrets)
     token = hf_token_string if hf_token_string else os.getenv("HF_TOKEN")
 
-    # If no token is found at all, the client will fail gracefully
     if not token:
-        yield "Error: No
+        yield "Error: No Token provided."
         return
 
     client = InferenceClient(token=token, model="meta-llama/Meta-Llama-3-8B-Instruct")
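The fallback order above means a per-request token always wins; the Space secret HF_TOKEN (Settings > Secrets) is only consulted when the caller sends an empty string. A quick way to sanity-check a token outside Gradio, sketched with huggingface_hub (same model id as in the diff; the prompt is arbitrary):

import os
from huggingface_hub import InferenceClient

token = os.getenv("HF_TOKEN")  # or paste a token string directly
client = InferenceClient(token=token, model="meta-llama/Meta-Llama-3-8B-Instruct")
out = client.chat_completion([{"role": "user", "content": "ping"}], max_tokens=8)
print(out.choices[0].message.content)  # any reply confirms the token works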
@@ -27,8 +23,8 @@ def respond(
     messages.extend(history)
     messages.append({"role": "user", "content": message})
 
-    response = ""
     try:
+        # We don't need a 'response' string variable here for the API
         for chunk in client.chat_completion(
             messages,
             max_tokens=max_tokens,
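Because the ChatInterface below is built with type="messages", the history already arrives as a list of role/content dicts, the same shape chat_completion expects, so extend() needs no conversion. Illustratively (contents hypothetical):

messages = [
    {"role": "user", "content": "Hi"},
    {"role": "assistant", "content": "Hello! How can I help?"},
    {"role": "user", "content": "What does top-p do?"},  # the message just appended
]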
@@ -39,12 +35,16 @@ def respond(
             if len(chunk.choices) > 0:
                 token_str = chunk.choices[0].delta.content
                 if token_str:
-                    response += token_str
-                    yield response
+                    # OPTIMIZATION: Yield ONLY the new token.
+                    # This is what makes the API streaming "instant".
+                    yield token_str
     except Exception as e:
         yield f"API Error: {str(e)}"
 
-#
+# The ChatInterface will now receive tokens one by one.
+# Note: In the Gradio UI, this might make tokens "replace" each other.
+# If you want the UI to still look normal while keeping the API fast,
+# use the client-side logic below.
 chatbot = gr.ChatInterface(
     respond,
     type="messages",
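One way to write the client-side logic those comments point to, sketched with gradio_client; the Space id is a placeholder, and the endpoint name assumes the default /chat that ChatInterface exposes, with the extra inputs passed positionally after the message:

from gradio_client import Client

api = Client("your-username/your-space")  # placeholder Space id
job = api.submit(
    "Hello!",  # message
    512,       # max new tokens
    0.7,       # temperature
    0.95,      # top-p
    "hf_...",  # token, or "" to fall back to the HF_TOKEN secret
    api_name="/chat",
)

# Each server-side yield now carries only the new token, so the caller
# rebuilds the reply by concatenation as outputs stream in.
response = ""
for token_str in job:
    response += token_str
print(response)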
@@ -53,14 +53,11 @@ chatbot = gr.ChatInterface(
         gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
         gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
         gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p"),
-        # ADDED: This allows you to pass the token via API
         gr.Textbox(label="Hugging Face Token", type="password"),
     ],
 )
 
 with gr.Blocks() as demo:
-    # Optional: Keep the login button for web users,
-    # but the API will use the Textbox instead
     with gr.Sidebar():
         gr.LoginButton()
     chatbot.render()
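If the web UI matters more than raw API latency, the pattern this commit removed can simply be restored: accumulate server-side and yield the running string, so each update replaces the chat bubble with a longer message rather than a lone token. A sketch of that variant (stream=True is presumed from the chunk/delta handling above):

response = ""
try:
    for chunk in client.chat_completion(
        messages,
        max_tokens=max_tokens,
        temperature=temperature,
        top_p=top_p,
        stream=True,
    ):
        if len(chunk.choices) > 0:
            token_str = chunk.choices[0].delta.content
            if token_str:
                response += token_str
                yield response  # cumulative, UI-friendly
except Exception as e:
    yield f"API Error: {str(e)}"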