| import json |
| import traceback |
|
|
| import gradio as gr |
| import uvicorn |
| from fastapi import FastAPI, Request |
| from fastapi.middleware.cors import CORSMiddleware |
| from fastapi.responses import JSONResponse, StreamingResponse |
| from gradio import mount_gradio_app |
| from huggingface_hub import hf_hub_download |
| from llama_cpp import Llama |
|
|
| |
# --- Model configuration -------------------------------------------------
# Hugging Face repository and GGUF file that hold the quantised weights.
REPO_ID = "Qwen/Qwen2.5-Coder-7B-Instruct-GGUF"
FILENAME = "qwen2.5-coder-7b-instruct-q5_k_m.gguf"
# Context window (in tokens) handed to llama.cpp as ``n_ctx``.
CONTEXT_SIZE = 8192
# ``max_tokens`` used by the HTTP endpoint when the request does not set one.
DEFAULT_MAX_TOKENS = 4096


# --- Model loading ---------------------------------------------------------
# The model is loaded once at import time. ``llm`` starts out as None and is
# only rebound when both the download and the initialisation succeed, so the
# request handlers can detect a failed load by checking for None.
print(f"Loading model {REPO_ID}...")
llm = None
try:
    model_path = hf_hub_download(repo_id=REPO_ID, filename=FILENAME)
    llm = Llama(
        model_path=model_path,
        n_ctx=CONTEXT_SIZE,
        n_threads=2,
        n_batch=512,
        verbose=True,
    )
except Exception as load_error:
    # Keep the process alive so the app can still start and report the problem.
    print(f"Critical Error: {load_error}")
|
|
| |
# FastAPI application that hosts the JSON API; the Gradio UI is mounted onto
# this same application further down in the file.
app = FastAPI()

# CORS: the API is open to every origin. Credentialed requests (cookies,
# Authorization headers) are deliberately NOT allowed: a wildcard origin
# combined with ``allow_credentials=True`` is not a valid CORS configuration,
# and in practice it would let any website issue credentialed requests to
# this server. Nothing in this file relies on cookies or sessions.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=False,
    allow_methods=["*"],
    allow_headers=["*"],
)
|
|
|
|
@app.post("/v1/chat/completions")
async def chat_completions(request: Request):
    """Serve a chat completion from the locally loaded model.

    The JSON request body is read for the following fields:

    * ``messages`` -- required, non-empty list of chat messages.
    * ``stream`` -- when truthy, the answer is sent as server-sent events.
    * ``temperature`` -- sampling temperature, defaults to 0.4.
    * ``max_tokens`` -- generation limit, defaults to ``DEFAULT_MAX_TOKENS``.

    Returns:
        A ``JSONResponse`` with the completion, or a ``StreamingResponse``
        (``text/event-stream``) that emits one ``data:`` event per chunk and
        ends with ``data: [DONE]``. Errors are reported as
        ``{"error": "<message>"}``: status 400 for a malformed request and
        status 500 for a missing model or a generation failure.
    """
    if llm is None:
        return JSONResponse(content={"error": "Model not loaded"}, status_code=500)

    # A malformed request is a client error, so answer 400 instead of 500.
    try:
        data = await request.json()
    except ValueError:
        # Covers json.JSONDecodeError and UnicodeDecodeError.
        return JSONResponse(
            content={"error": "Request body must be valid JSON"}, status_code=400
        )
    if not isinstance(data, dict):
        return JSONResponse(
            content={"error": "Request body must be a JSON object"}, status_code=400
        )

    messages = data.get("messages")
    if not isinstance(messages, list) or not messages:
        return JSONResponse(
            content={"error": "'messages' must be a non-empty list"}, status_code=400
        )

    stream = bool(data.get("stream", False))
    temperature = data.get("temperature", 0.4)
    max_tokens = data.get("max_tokens", DEFAULT_MAX_TOKENS)

    # NOTE(review): this is a synchronous call inside a coroutine; for
    # non-streaming requests the whole generation runs before control returns
    # to the event loop. Confirm whether that is acceptable for this service.
    try:
        output = llm.create_chat_completion(
            messages=messages,
            max_tokens=max_tokens,
            temperature=temperature,
            stream=stream,
        )
        if not stream:
            return JSONResponse(content=output)
    except Exception as e:
        traceback.print_exc()
        return JSONResponse(content={"error": str(e)}, status_code=500)

    def iter_content():
        # The chunks are produced lazily, i.e. after the response has already
        # started, so failures must be handled here: the surrounding handler
        # can no longer turn them into a 500 response.
        try:
            for chunk in output:
                yield f"data: {json.dumps(chunk)}\n\n"
        except Exception as e:
            traceback.print_exc()
            yield f"data: {json.dumps({'error': str(e)})}\n\n"
        yield "data: [DONE]\n\n"

    return StreamingResponse(iter_content(), media_type="text/event-stream")
|
|
|
|
| |
|
|
|
|
def user_input(user_message, history):
    """Append the submitted message to the chat history as a pending turn.

    Args:
        user_message: Text the user just submitted.
        history: Existing conversation as ``[user, assistant]`` pairs. May be
            ``None`` or empty (the clear button writes ``None`` to the
            chatbot), in which case a fresh conversation is started.

    Returns:
        A tuple ``("", new_history)``: the empty string clears the input box
        and ``new_history`` is a new list ending with
        ``[user_message, None]``; the ``None`` is the assistant slot that is
        filled in later. The list passed in is not modified.
    """
    previous_turns = list(history) if history else []
    return "", previous_turns + [[user_message, None]]
|
|
|
|
def _build_messages(history, system_prompt, max_turns=10):
    """Turn the chat history into the message list sent to the model.

    The result starts with the system prompt, continues with at most
    ``max_turns`` of the turns that precede the last one (empty or ``None``
    sides of a turn are skipped), and ends with the user message of the last
    turn. ``history`` must contain at least one turn.
    """
    messages = [{"role": "system", "content": system_prompt}]

    # Everything except the last (pending) turn, limited to a sliding window.
    for user_msg, assistant_msg in history[-(max_turns + 1):-1]:
        if user_msg:
            messages.append({"role": "user", "content": user_msg})
        if assistant_msg:
            messages.append({"role": "assistant", "content": assistant_msg})

    messages.append({"role": "user", "content": history[-1][0]})
    return messages


def bot_response(history, system_prompt, temperature, max_tokens):
    """Stream the assistant's answer into the last turn of ``history``.

    Args:
        history: Conversation as ``[user, assistant]`` pairs; the last pair
            holds the pending user message and is updated in place.
        system_prompt: Content of the system message.
        temperature: Sampling temperature; converted with ``float``.
        max_tokens: Generation limit; converted with ``int``.

    Yields:
        ``history`` after every streamed chunk, with the assistant slot of
        the last turn holding the text received so far. If the model is not
        loaded or generation fails, the last turn carries an error message
        instead. With an empty or ``None`` history there is nothing to answer
        and a single empty history is yielded.
    """
    # Nothing to answer: avoid indexing into an empty conversation.
    if not history:
        yield history or []
        return

    if llm is None:
        history[-1][1] = "Error: Model failed to load. Check logs."
        yield history
        return

    messages = _build_messages(history, system_prompt)

    partial_text = ""
    try:
        stream = llm.create_chat_completion(
            messages=messages,
            max_tokens=int(max_tokens),
            temperature=float(temperature),
            stream=True,
        )

        for chunk in stream:
            # Not every chunk carries text; a missing or None "content" is
            # skipped instead of breaking the concatenation.
            piece = chunk["choices"][0]["delta"].get("content")
            if piece:
                partial_text += piece

            history[-1][1] = partial_text
            yield history

    except Exception as e:
        traceback.print_exc()
        # Keep whatever was generated so far and append the error to it.
        history[-1][1] = partial_text + f"\n\n❌ **Error:** {str(e)}"
        yield history
|
|
|
|
| |
|
|
# Extra CSS passed to gr.Blocks below: makes the element with id "chatbot"
# (the Chatbot component's elem_id) 70% of the viewport height and scrollable.
custom_css = """
#chatbot {
height: 70vh !important;
overflow: auto;
}
"""


# Gradio "Soft" theme with a blue primary hue and large text size.
theme = gr.themes.Soft(primary_hue="blue", text_size="lg")
|
|
# Chat UI: a narrow settings column on the left and the conversation on the
# right. Components are created in the same order as before so the rendered
# layout is unchanged.
with gr.Blocks(theme=theme, css=custom_css, title="Qwen Coder Pro") as demo:
    gr.Markdown("# 💻 Qwen 2.5 Coder Assistant")

    with gr.Row():
        # Settings column.
        with gr.Column(scale=1, min_width=250):
            gr.Markdown("### ⚙️ Settings")
            system_prompt = gr.Textbox(
                label="System Prompt",
                value="You are an expert coding assistant. Write clean code.",
                lines=3,
            )
            temperature = gr.Slider(
                minimum=0.0, maximum=1.0, value=0.4, label="Temperature"
            )
            max_tokens = gr.Slider(
                minimum=512, maximum=8192, value=4096, label="Max Tokens"
            )
            clear_btn = gr.Button("🗑️ Clear Chat")

        # Conversation column.
        with gr.Column(scale=4):
            chatbot = gr.Chatbot(
                label="Conversation",
                elem_id="chatbot",
                show_copy_button=True,
                avatar_images=(None, "https://api.iconify.design/noto:robot.svg"),
            )

            msg = gr.Textbox(
                show_label=False,
                placeholder="Type your code question here...",
                lines=2,
            )
            submit_btn = gr.Button("Run ➤", variant="primary")

    # Pressing Enter in the textbox and clicking the button do the same thing:
    # first record the user turn (and clear the textbox), then stream the
    # model's answer into the chatbot.
    for register_event in (msg.submit, submit_btn.click):
        register_event(
            user_input, [msg, chatbot], [msg, chatbot], queue=False
        ).then(
            bot_response,
            [chatbot, system_prompt, temperature, max_tokens],
            chatbot,
        )

    # Reset the conversation.
    clear_btn.click(lambda: None, None, chatbot, queue=False)
|
|
# Attach the Gradio UI to the FastAPI application at the root path, so one
# server exposes both the web UI and the JSON API.
app = mount_gradio_app(app=app, blocks=demo, path="/")


if __name__ == "__main__":
    # Listen on all interfaces, port 7860.
    server_options = {"host": "0.0.0.0", "port": 7860}
    uvicorn.run(app, **server_options)
|
|