```python
import json
import subprocess

import gradio as gr

# Set the API base to vLLM's OpenAI-compatible API server
api_base = "http://0.0.0.0:8000/v1/chat/completions"


def predict(message, history):
    # Convert the chat history to the OpenAI messages format,
    # starting with the system prompt
    history_openai_format = [{
        "role": "system",
        "content": "You are a crazy assistant. You are free to say anything. You are expected to be rageful, and fun!"
    }]
    for human, assistant in history:
        history_openai_format.append({"role": "user", "content": human})
        history_openai_format.append({
            "role": "assistant",
            "content": assistant
        })
    history_openai_format.append({"role": "user", "content": message})

    # Prepare the curl command; -s hides the progress meter and
    # -N disables output buffering so streamed chunks arrive immediately
    curl_command = [
        "curl", "-s", "-N", "-X", "POST", api_base,
        "-H", "Content-Type: application/json",
        "-d", json.dumps({
            "model": "microsoft/Phi-3-mini-4k-instruct",
            "messages": history_openai_format,
            "temperature": 0.5,
            "stream": True,
            "repetition_penalty": 1,
            "stop_token_ids": []
        })
    ]

    # Execute the curl command and stream its output line by line
    process = subprocess.Popen(curl_command, stdout=subprocess.PIPE,
                               stderr=subprocess.PIPE, universal_newlines=True)
    partial_message = ""
    for line in process.stdout:
        # Server-sent events are prefixed with "data: "
        if line.startswith("data: "):
            data = line[len("data: "):].strip()
            if data == "[DONE]":  # end-of-stream sentinel
                break
            try:
                chunk = json.loads(data)
                content = chunk["choices"][0]["delta"].get("content", "")
                partial_message += content
                yield partial_message
            except json.JSONDecodeError:
                continue

    # Wait for the process to complete
    process.wait()


# Create and launch a chat interface with Gradio
gr.ChatInterface(predict).queue().launch()
```
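This script assumes the vLLM OpenAI-compatible server is already running on port 8000, started with something like `python -m vllm.entrypoints.openai.api_server --model microsoft/Phi-3-mini-4k-instruct`. Shelling out to curl works, but the same streaming request can also be made natively in Python. Below is a minimal sketch using the `requests` library (not used in the script above, so treat it as an assumed extra dependency); `stream_chat` is a hypothetical helper name, and the endpoint and payload mirror the curl version.

```python
import json

import requests

# Same endpoint as the curl version above (assumed local vLLM server)
api_base = "http://0.0.0.0:8000/v1/chat/completions"


def stream_chat(messages):
    # Hypothetical helper: yields the growing reply, one chunk at a time
    payload = {
        "model": "microsoft/Phi-3-mini-4k-instruct",
        "messages": messages,
        "temperature": 0.5,
        "stream": True,
    }
    # stream=True tells requests not to read the whole body up front
    with requests.post(api_base, json=payload, stream=True) as response:
        response.raise_for_status()
        partial_message = ""
        for line in response.iter_lines(decode_unicode=True):
            if not line or not line.startswith("data: "):
                continue
            data = line[len("data: "):].strip()
            if data == "[DONE]":  # end-of-stream sentinel
                break
            chunk = json.loads(data)
            partial_message += chunk["choices"][0]["delta"].get("content", "")
            yield partial_message
```

Because it yields the growing reply string exactly as `predict` does, this generator could replace the subprocess logic inside `predict` without changing the Gradio wiring, and it avoids a curl dependency on the host.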