"""Qwen 2.5 Coder assistant.

Serves a chat-completions style HTTP endpoint (FastAPI) and a Gradio chat UI
from the same process, both backed by one llama.cpp model instance that is
loaded when the module is imported.
"""

import json
import traceback

import gradio as gr
import uvicorn
from fastapi import FastAPI, Request
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import JSONResponse, StreamingResponse
from gradio import mount_gradio_app
from huggingface_hub import hf_hub_download
from llama_cpp import Llama

# --- CONFIGURATION ---
REPO_ID = "Qwen/Qwen2.5-Coder-7B-Instruct-GGUF"
FILENAME = "qwen2.5-coder-7b-instruct-q5_k_m.gguf"
CONTEXT_SIZE = 8192
DEFAULT_MAX_TOKENS = 4096
DEFAULT_TEMPERATURE = 0.4
# Number of earlier (user, assistant) exchanges replayed to the model by the UI.
MAX_HISTORY_TURNS = 10

print(f"Loading model {REPO_ID}...")
try:
    model_path = hf_hub_download(repo_id=REPO_ID, filename=FILENAME)
    llm = Llama(
        model_path=model_path,
        n_ctx=CONTEXT_SIZE,
        n_threads=2,
        n_batch=512,
        verbose=True,
    )
except Exception as e:
    # Top-level boundary: keep the server up so both the API and the UI can
    # report that the model is unavailable instead of the process dying.
    print(f"Critical Error: {e}")
    traceback.print_exc()
    llm = None

# --- API (FastAPI) ---
app = FastAPI()

# NOTE(review): wildcard origins combined with allow_credentials=True is a very
# permissive CORS policy - confirm this is intended for the deployment.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)


@app.post("/v1/chat/completions")
async def chat_completions(request: Request):
    """Run a chat completion for the JSON request body.

    Reads ``messages``, ``stream``, ``temperature`` and ``max_tokens`` from the
    body. With ``stream`` set, the chunks are sent as ``text/event-stream``
    followed by a final ``data: [DONE]`` event; otherwise the completion is
    returned as a single JSON document.

    Errors are reported as ``{"error": ...}``: status 400 when the body is not
    a JSON object, status 500 when the model is not loaded or generation fails.
    """
    if not llm:
        return JSONResponse(content={"error": "Model not loaded"}, status_code=500)

    # A malformed body is a client error, so it is reported separately from
    # failures that happen while generating.
    try:
        data = await request.json()
    except ValueError as e:  # json.JSONDecodeError is a ValueError subclass
        return JSONResponse(
            content={"error": f"Invalid JSON body: {e}"}, status_code=400
        )
    if not isinstance(data, dict):
        return JSONResponse(
            content={"error": "Request body must be a JSON object"},
            status_code=400,
        )

    try:
        messages = data.get("messages", [])
        stream = data.get("stream", False)
        temperature = data.get("temperature")
        if temperature is None:
            # Covers both a missing key and an explicit JSON null.
            temperature = DEFAULT_TEMPERATURE
        max_tokens = data.get("max_tokens", DEFAULT_MAX_TOKENS)

        # NOTE(review): this is a synchronous call inside an async handler, so
        # the event loop is occupied until it returns - confirm acceptable.
        output = llm.create_chat_completion(
            messages=messages,
            max_tokens=max_tokens,
            temperature=temperature,
            stream=stream,
        )

        if stream:

            def iter_content():
                for chunk in output:
                    yield f"data: {json.dumps(chunk)}\n\n"
                yield "data: [DONE]\n\n"

            return StreamingResponse(iter_content(), media_type="text/event-stream")

        return JSONResponse(content=output)
    except Exception as e:
        # Log the traceback so the failure can be diagnosed from the server
        # output, matching what bot_response does for the UI.
        traceback.print_exc()
        return JSONResponse(content={"error": str(e)}, status_code=500)


# --- GENERATION LOGIC FOR GRADIO ---
def user_input(user_message, history):
    """Append the user's message to the chat history and clear the textbox.

    Returns a tuple ``("", new_history)`` where ``new_history`` is a new list
    ending with ``[user_message, None]``; the ``None`` slot is filled in later
    by ``bot_response``. A ``None`` history is treated as an empty one.
    """
    return "", (history or []) + [[user_message, None]]


def _build_messages(history, system_prompt):
    """Convert Gradio pair-style history into a list of role/content dicts.

    The result starts with the system prompt, replays up to
    ``MAX_HISTORY_TURNS`` exchanges that precede the last entry (skipping empty
    sides), and ends with the user message of the last entry.
    """
    messages = [{"role": "system", "content": system_prompt}]

    # The last entry is the pending question, so it is excluded from the window.
    earlier_turns = history[-(MAX_HISTORY_TURNS + 1):-1] if len(history) > 1 else []
    for user_msg, assistant_msg in earlier_turns:
        if user_msg:
            messages.append({"role": "user", "content": user_msg})
        if assistant_msg:
            messages.append({"role": "assistant", "content": assistant_msg})

    messages.append({"role": "user", "content": history[-1][0]})
    return messages


def bot_response(history, system_prompt, temperature, max_tokens):
    """Stream the assistant's reply into the last entry of ``history``.

    This is a generator: it yields ``history`` after each received piece of
    text, with ``history[-1][1]`` holding the reply accumulated so far. The
    list is updated in place. If the model is not loaded, or generation raises,
    an error message is written into the reply slot instead.
    """
    if not history:
        # Nothing to answer; without this guard history[-1] would raise.
        yield history or []
        return

    if not llm:
        history[-1][1] = "Error: Model failed to load. Check logs."
        yield history
        return

    messages = _build_messages(history, system_prompt)

    partial_text = ""
    try:
        stream = llm.create_chat_completion(
            messages=messages,
            max_tokens=int(max_tokens),
            temperature=float(temperature),
            stream=True,
        )

        for chunk in stream:
            piece = chunk["choices"][0]["delta"].get("content")
            if piece is None:
                # Deltas without text (missing or null content) carry nothing
                # to display.
                continue
            partial_text += piece
            # Update the assistant side of the last pair (classic pair format).
            history[-1][1] = partial_text
            yield history

    except Exception as e:
        traceback.print_exc()
        # Keep whatever was generated so far and append the error to it.
        history[-1][1] = partial_text + f"\n\n❌ **Error:** {str(e)}"
        yield history


# --- INTERFACE (Gradio Blocks) ---
custom_css = """
#chatbot { height: 70vh !important; overflow: auto; }
"""

theme = gr.themes.Soft(primary_hue="blue", text_size="lg")

with gr.Blocks(theme=theme, css=custom_css, title="Qwen Coder Pro") as demo:
    gr.Markdown("# 💻 Qwen 2.5 Coder Assistant")

    with gr.Row():
        # Settings
        with gr.Column(scale=1, min_width=250):
            gr.Markdown("### ⚙️ Settings")
            system_prompt = gr.Textbox(
                label="System Prompt",
                value="You are an expert coding assistant. Write clean code.",
                lines=3,
            )
            temperature = gr.Slider(
                0.0, 1.0, value=DEFAULT_TEMPERATURE, label="Temperature"
            )
            max_tokens = gr.Slider(
                512, CONTEXT_SIZE, value=DEFAULT_MAX_TOKENS, label="Max Tokens"
            )
            clear_btn = gr.Button("🗑️ Clear Chat")

        # Chat
        with gr.Column(scale=4):
            # IMPORTANT: type="messages" was removed; the default pair format
            # is used, which is what user_input and bot_response operate on.
            chatbot = gr.Chatbot(
                label="Conversation",
                elem_id="chatbot",
                show_copy_button=True,  # Requires gradio>=3.37 (see requirements.txt)
                avatar_images=(None, "https://api.iconify.design/noto:robot.svg"),
            )
            msg = gr.Textbox(
                show_label=False,
                placeholder="Type your code question here...",
                lines=2,
            )
            submit_btn = gr.Button("Run ➤", variant="primary")

    # Event wiring: pressing Enter and clicking the button run the same chain -
    # first record the user's message, then stream the reply.
    generation_inputs = [chatbot, system_prompt, temperature, max_tokens]
    for trigger in (msg.submit, submit_btn.click):
        trigger(user_input, [msg, chatbot], [msg, chatbot], queue=False).then(
            bot_response, generation_inputs, chatbot
        )

    clear_btn.click(lambda: None, None, chatbot, queue=False)

app = mount_gradio_app(app, demo, path="/")

if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=7860)