Prasanga73 committed on
Commit
e2d2024
·
verified ·
1 Parent(s): 5765b31

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +9 -12
app.py CHANGED
@@ -9,16 +9,12 @@ def respond(
9
  max_tokens,
10
  temperature,
11
  top_p,
12
- # CHANGE: Replace gr.OAuthToken with a standard string parameter
13
  hf_token_string,
14
  ):
15
- # Use the token passed from the API,
16
- # OR if empty, try to get it from Space Secrets (Settings > Secrets)
17
  token = hf_token_string if hf_token_string else os.getenv("HF_TOKEN")
18
 
19
- # If no token is found at all, the client will fail gracefully
20
  if not token:
21
- yield "Error: No Hugging Face Token provided. Please provide one in the API call or Space Secrets."
22
  return
23
 
24
  client = InferenceClient(token=token, model="meta-llama/Meta-Llama-3-8B-Instruct")
@@ -27,8 +23,8 @@ def respond(
27
  messages.extend(history)
28
  messages.append({"role": "user", "content": message})
29
 
30
- response = ""
31
  try:
 
32
  for chunk in client.chat_completion(
33
  messages,
34
  max_tokens=max_tokens,
@@ -39,12 +35,16 @@ def respond(
39
  if len(chunk.choices) > 0:
40
  token_str = chunk.choices[0].delta.content
41
  if token_str:
42
- response += token_str
43
- yield response
 
44
  except Exception as e:
45
  yield f"API Error: {str(e)}"
46
 
47
- # Define the interface
 
 
 
48
  chatbot = gr.ChatInterface(
49
  respond,
50
  type="messages",
@@ -53,14 +53,11 @@ chatbot = gr.ChatInterface(
53
  gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
54
  gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
55
  gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p"),
56
- # ADDED: This allows you to pass the token via API
57
  gr.Textbox(label="Hugging Face Token", type="password"),
58
  ],
59
  )
60
 
61
  with gr.Blocks() as demo:
62
- # Optional: Keep the login button for web users,
63
- # but the API will use the Textbox instead
64
  with gr.Sidebar():
65
  gr.LoginButton()
66
  chatbot.render()
 
9
  max_tokens,
10
  temperature,
11
  top_p,
 
12
  hf_token_string,
13
  ):
 
 
14
  token = hf_token_string if hf_token_string else os.getenv("HF_TOKEN")
15
 
 
16
  if not token:
17
+ yield "Error: No Token provided."
18
  return
19
 
20
  client = InferenceClient(token=token, model="meta-llama/Meta-Llama-3-8B-Instruct")
 
23
  messages.extend(history)
24
  messages.append({"role": "user", "content": message})
25
 
 
26
  try:
27
+ # We don't need a 'response' string variable here for the API
28
  for chunk in client.chat_completion(
29
  messages,
30
  max_tokens=max_tokens,
 
35
  if len(chunk.choices) > 0:
36
  token_str = chunk.choices[0].delta.content
37
  if token_str:
38
+ # OPTIMIZATION: Yield ONLY the new token.
39
+ # This is what makes the API streaming "instant".
40
+ yield token_str
41
  except Exception as e:
42
  yield f"API Error: {str(e)}"
43
 
44
+ # The ChatInterface will now receive tokens one by one.
45
+ # Note: In the Gradio UI, this might make tokens "replace" each other.
46
+ # If you want the UI to still look normal while keeping the API fast,
47
+ # use the client-side logic below.
48
  chatbot = gr.ChatInterface(
49
  respond,
50
  type="messages",
 
53
  gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
54
  gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
55
  gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p"),
 
56
  gr.Textbox(label="Hugging Face Token", type="password"),
57
  ],
58
  )
59
 
60
  with gr.Blocks() as demo:
 
 
61
  with gr.Sidebar():
62
  gr.LoginButton()
63
  chatbot.render()