"""Gradio chat front-end that streams completions from an OpenAI-style
chat-completions backend (LLaMa-2-13B-chat behind `api_url`)."""

import json
import os

import gradio as gr
import requests

hf_token = os.getenv('HF_TOKEN')
api_url = "http://region-31.seetacloud.com:46766/v1/chat/completions"
headers = {
    'Content-Type': 'application/json',
}
TIME_OUT_SECONDS = 30

title = "Next llama Chatbot"
description = """..."""  # kept from the original (placeholder description)
css = """.toast-wrap { display: none !important } """
examples = [
    ['Hello there! How are you doing?'],
    # ... other examples
]

# NOTE(review): this history is module-level, so it is shared by every
# session/user of the app and grows without bound.  Gradio's ChatInterface
# already passes the per-session history as the second argument of the
# callback; consider rebuilding the message list from that instead.
message_history = []


def process_api_response(response):
    """Yield the progressively accumulated assistant reply from a streaming
    (server-sent-events) chat-completions `requests.Response`.

    Each SSE line of the form ``data: {...}`` is JSON-decoded; the
    ``choices[0].delta.content`` fragments are concatenated and the running
    total is yielded after every fragment so the UI can render incrementally.
    Non-JSON payloads (e.g. the terminating ``[DONE]`` marker) are logged and
    skipped.  On completion the full reply is appended to the shared
    ``message_history``.
    """
    assistant_response = ""
    for line in response.iter_lines():
        decoded_line = line.decode('utf-8').strip()
        if decoded_line.startswith("data: "):
            decoded_line = decoded_line[6:]  # strip the SSE "data: " prefix
        if decoded_line:
            try:
                json_line = json.loads(decoded_line)
                if "choices" in json_line and "delta" in json_line["choices"][0]:
                    delta = json_line["choices"][0]["delta"]
                    if "content" in delta:
                        assistant_response += delta["content"]  # accumulate the reply
                        yield assistant_response  # stream the running total
            except json.JSONDecodeError:
                # Covers the "[DONE]" sentinel and any malformed chunks.
                print(f"Failed to decode line: {decoded_line}")
    # Persist only the final, complete reply in the history.
    if assistant_response:
        message_history.append({
            "role": "assistant",
            "content": assistant_response,
        })


def predict(system_message, message,
            system_prompt="You are a helpful, respectful and honest assistant.",
            temperature=0.9, max_new_tokens=2048, top_p=0.6,
            repetition_penalty=1.0):
    """Gradio ChatInterface callback: send the conversation to the backend
    and yield the assistant's reply as it streams in.

    Despite the names (kept for compatibility), ``system_message`` is the
    user's new message and ``message`` is the chat history that Gradio
    passes as the second positional argument; the remaining parameters come
    from the ``additional_inputs`` widgets.

    Yields the partial assistant reply (growing string), or an error string
    on HTTP/network failure.
    """
    # BUG FIX: the system prompt was previously appended with role
    # "assistant"; the chat-completions API expects role "system".
    message_history.append({
        "role": "system",
        "content": system_prompt,
    })
    message_history.append({
        "role": "user",
        "content": system_message,
    })

    data = {
        "model": "LLaMa-2-13B-chat",
        "messages": message_history,  # full conversation so far
        "stream": True,
        "temperature": temperature,
        "max_tokens": max_new_tokens,
        # BUG FIX: top_p was accepted (and exposed as a slider) but never
        # forwarded to the backend.
        "top_p": top_p,
        # NOTE(review): repetition_penalty is sent as presence_penalty; these
        # are different samplers in the OpenAI API — confirm backend intent.
        "presence_penalty": repetition_penalty,
    }

    # Log the exact payload sent to the backend API (useful for debugging).
    print("Sending the following data to the backend API:")
    print(json.dumps(data, indent=4))

    try:
        response = requests.post(
            api_url,
            headers=headers,
            data=json.dumps(data),
            auth=('hf', hf_token),
            stream=True,
            timeout=TIME_OUT_SECONDS,
        )
        if response.status_code == 200:
            for assistant_reply in process_api_response(response):
                yield assistant_reply
        elif response.status_code == 401:
            yield "Error: Unauthorized"
        else:
            yield f"Error with status code: {response.status_code}"
    except requests.Timeout:
        yield "Error: Request timed out"
    except requests.RequestException as e:
        yield f"Error: {e}"


additional_inputs = [
    gr.Textbox("", label="Optional system prompt"),
    gr.Slider(
        label="Temperature",
        value=0.9,
        minimum=0.0,
        maximum=1.0,
        step=0.05,
        interactive=True,
        info="Higher values produce more diverse outputs",
    ),
    gr.Slider(
        label="Max new tokens",
        value=256,
        minimum=0,
        maximum=4096,
        step=64,
        interactive=True,
        info="The maximum numbers of new tokens",
    ),
    gr.Slider(
        label="Top-p (nucleus sampling)",
        value=0.6,
        minimum=0.0,
        maximum=1,
        step=0.05,
        interactive=True,
        info="Higher values sample more low-probability tokens",
    ),
    gr.Slider(
        label="Repetition penalty",
        value=1.2,
        minimum=1.0,
        maximum=2.0,
        step=0.05,
        interactive=True,
        info="Penalize repeated tokens",
    ),
]

chatbot_stream = gr.Chatbot()

# NOTE(review): additional_inputs is built above but not wired into the
# interface (the original dropped it for compatibility); also
# cache_examples=True will call the live backend at startup to pre-compute
# example replies — confirm that is intended.
chat_interface_stream = gr.ChatInterface(
    predict,
    title=title,
    description=description,
    chatbot=chatbot_stream,
    css=css,
    examples=examples,
    cache_examples=True,
)

if __name__ == "__main__":
    # Guarded so importing this module does not start the server.
    chat_interface_stream.queue().launch(debug=True)