Selinaliu1030 committed on
Commit
55ea8b6
·
1 Parent(s): 66ce9ad

update app.py

Browse files
Files changed (1) hide show
  1. app.py +36 -17
app.py CHANGED
@@ -2,6 +2,27 @@ import gradio as gr
2
  from huggingface_hub import InferenceClient
3
 
4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5
  def respond(
6
  message,
7
  history: list[dict[str, str]],
@@ -11,38 +32,39 @@ def respond(
11
  top_p,
12
  hf_token: gr.OAuthToken,
13
  ):
14
- """
15
- For more information on `huggingface_hub` Inference API support, please check the docs: https://huggingface.co/docs/huggingface_hub/v0.22.2/en/guides/inference
16
- """
17
- client = InferenceClient(token=hf_token.token, model="Selinaliu1030/lora_model")
 
 
18
 
 
19
  messages = [{"role": "system", "content": system_message}]
20
-
21
  messages.extend(history)
22
-
23
  messages.append({"role": "user", "content": message})
24
 
 
 
 
25
  response = ""
26
 
27
- for message in client.text_generation(
28
- messages,
 
29
  max_new_tokens=max_tokens,
30
  stream=True,
31
  temperature=temperature,
32
  top_p=top_p,
33
  ):
34
- choices = message.choices
35
  token = ""
36
- if len(choices) and choices[0].delta.content:
37
- token = choices[0].delta.content
38
 
39
  response += token
40
  yield response
41
 
42
 
43
- """
44
- For information on how to customize the ChatInterface, peruse the gradio docs: https://www.gradio.app/docs/chatinterface
45
- """
46
  chatbot = gr.ChatInterface(
47
  respond,
48
  type="messages",
@@ -65,8 +87,5 @@ with gr.Blocks() as demo:
65
  gr.LoginButton()
66
  chatbot.render()
67
 
68
-
69
  if __name__ == "__main__":
70
  demo.launch()
71
-
72
-
 
2
  from huggingface_hub import InferenceClient
3
 
4
 
5
+ def format_chat_messages(messages):
6
+ """
7
+ Converts Gradio message history into a single text prompt
8
+ in ChatML / Llama-3-like format.
9
+ """
10
+ prompt = ""
11
+ for msg in messages:
12
+ role = msg["role"]
13
+ content = msg["content"]
14
+
15
+ if role == "system":
16
+ prompt += f"<|system|>\n{content}\n"
17
+ elif role == "user":
18
+ prompt += f"<|user|>\n{content}\n"
19
+ else:
20
+ prompt += f"<|assistant|>\n{content}\n"
21
+
22
+ prompt += "<|assistant|>\n" # assistant is about to reply
23
+ return prompt
24
+
25
+
26
  def respond(
27
  message,
28
  history: list[dict[str, str]],
 
32
  top_p,
33
  hf_token: gr.OAuthToken,
34
  ):
35
+ # explicitly specify provider (IMPORTANT for avoiding StopIteration)
36
+ client = InferenceClient(
37
+ model="Selinaliu1030/lora_model",
38
+ token=hf_token.token,
39
+ provider="hf-inference" # <-- ⭐ REQUIRED FIX
40
+ )
41
 
42
+ # Build the message list
43
  messages = [{"role": "system", "content": system_message}]
 
44
  messages.extend(history)
 
45
  messages.append({"role": "user", "content": message})
46
 
47
+ # Convert messages into single prompt
48
+ prompt = format_chat_messages(messages)
49
+
50
  response = ""
51
 
52
+ # Use the correct API for text generation (not chat endpoints)
53
+ for msg in client.text_generation(
54
+ prompt,
55
  max_new_tokens=max_tokens,
56
  stream=True,
57
  temperature=temperature,
58
  top_p=top_p,
59
  ):
 
60
  token = ""
61
+ if msg.token: # new API returns .token
62
+ token = msg.token
63
 
64
  response += token
65
  yield response
66
 
67
 
 
 
 
68
  chatbot = gr.ChatInterface(
69
  respond,
70
  type="messages",
 
87
  gr.LoginButton()
88
  chatbot.render()
89
 
 
90
  if __name__ == "__main__":
91
  demo.launch()