Spaces:
Runtime error
Runtime error
| import argparse | |
| import gradio as gr | |
| from openai import OpenAI | |
# Command-line configuration.
# Every option has a default so the script can be started with no arguments
# at all (a missing required flag would make parse_args() exit immediately).
parser = argparse.ArgumentParser(
    description='Chatbot Interface with Customizable Parameters')
parser.add_argument('--model-url',
                    type=str,
                    # No leading whitespace here: the value is handed to the
                    # client as its base URL.
                    default='https://55ad-165-204-156-250.ngrok-free.app/v1',
                    help='Model URL')
parser.add_argument('-m',
                    '--model',
                    type=str,
                    # Defaults to the model path this script has always
                    # requested, so existing deployments behave the same.
                    default='/app/model/llama3_8b_lora_indica_sft/',
                    help='Model name for the chatbot')
parser.add_argument('--temp',
                    type=float,
                    default=0.2,
                    help='Temperature for text generation')
parser.add_argument('--stop-token-ids',
                    type=str,
                    default='128001,128009',
                    help='Comma-separated stop token IDs')
parser.add_argument("--host", type=str, default=None)
parser.add_argument("--port", type=int, default=8001)

# Parse the arguments
args = parser.parse_args()

# Set OpenAI's API key and API base to use vLLM's API server.
openai_api_key = "EMPTY"
# Strip surrounding whitespace so a padded URL (from the command line or a
# copy/paste) does not produce a malformed request target.
openai_api_base = args.model_url.strip()

# Create an OpenAI client to interact with the API server
client = OpenAI(
    api_key=openai_api_key,
    base_url=openai_api_base,
)
def _parse_stop_token_ids(raw):
    """Convert a comma-separated string of token IDs into a list of ints.

    Blank entries (for example from a trailing comma) are skipped. An empty
    or ``None`` value produces an empty list.
    """
    if not raw:
        return []
    pieces = (part.strip() for part in raw.split(','))
    return [int(piece) for piece in pieces if piece]


def predict(message, history):
    """Stream the model's reply to ``message`` given the prior conversation.

    Args:
        message: The latest user message.
        history: Iterable of ``(user_text, assistant_text)`` pairs for the
            earlier turns of the conversation.

    Yields:
        The reply text accumulated so far, once for every streamed chunk
        that carries a choice.
    """
    # Convert chat history to OpenAI format
    messages = [{
        "role": "system",
        "content": "You are a great ai assistant."
    }]
    for human, assistant in history:
        messages.append({"role": "user", "content": human})
        messages.append({"role": "assistant", "content": assistant})
    messages.append({"role": "user", "content": message})

    # Create a chat completion request and send it to the API server
    stream = client.chat.completions.create(
        model=args.model,  # Honour the --model option instead of a fixed path
        messages=messages,  # Chat history
        temperature=args.temp,  # Temperature for text generation
        stream=True,  # Stream response
        extra_body={
            'repetition_penalty': 1,
            'stop_token_ids': _parse_stop_token_ids(args.stop_token_ids),
        })

    # Read and return generated text from response stream
    partial_message = ""
    for chunk in stream:
        # A chunk may arrive without any choices; indexing it would raise
        # IndexError, so skip it.
        if not chunk.choices:
            continue
        partial_message += (chunk.choices[0].delta.content or "")
        yield partial_message
# Build the Gradio chat UI around `predict`, turn on its queue, and start it.
# NOTE(review): launch() is called with no arguments, so the --host/--port
# options parsed earlier in this file are not applied here — confirm intended.
chat_ui = gr.ChatInterface(predict)
chat_ui.queue().launch()