import gradio as gr
from llama_cpp import Llama
from huggingface_hub import hf_hub_download
from fastapi import FastAPI, Request
from fastapi.responses import JSONResponse
import uvicorn

# 1. Initialize FastAPI
app = FastAPI()

# 2. Download and load the model (big-data architect's tip: use Q4_K_M to balance performance and memory)
model_path = hf_hub_download(
    repo_id="Qwen/Qwen2.5-Coder-7B-Instruct-GGUF",
    filename="qwen2.5-coder-7b-instruct-q4_k_m.gguf"
)

llm = Llama(
    model_path=model_path,
    n_ctx=4096,
    n_threads=2,
    verbose=False
)

# 3. Hand-rolled OpenAI-compatible endpoint (for Dify to call)
@app.post("/v1/chat/completions")
async def chat_completions(request: Request):
    body = await request.json()
    messages = body.get("messages", [])

    # create_chat_completion accepts OpenAI-style messages directly,
    # so the request body can be forwarded without conversion
    response = llm.create_chat_completion(
        messages=messages,
        temperature=body.get("temperature", 0.3),
        max_tokens=body.get("max_tokens", 1024),
        stream=False
    )

    # llama-cpp already returns an OpenAI-shaped payload; pass it through
    return JSONResponse(content=response)

# 4. Gradio chat UI logic (for manual debugging)
def predict(message, history):
    system_prompt = "You are a senior big-data operations expert. Provide professional, safe, and efficient scripts and tuning advice."
    msgs = [{"role": "system", "content": system_prompt}]
    # Gradio's tuple-style history: each entry is a (user, assistant) pair
    for h in history:
        msgs.append({"role": "user", "content": h[0]})
        msgs.append({"role": "assistant", "content": h[1]})
    msgs.append({"role": "user", "content": message})

    output = llm.create_chat_completion(messages=msgs)
    return output["choices"][0]["message"]["content"]

demo = gr.ChatInterface(
    fn=predict,
    title="BigData Ops Copilot (Lightweight Mode)",
    description="Compile-free lightweight build - supports Dify integration"
)

# 5. Mount Gradio onto the FastAPI app and start the server
app = gr.mount_gradio_app(app, demo, path="/")

if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=7860)
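
# ------------------------------------------------------------------
# Quick smoke test for the /v1/chat/completions endpoint.
# A minimal sketch, not part of the server: run it from a separate
# shell or file while the server above is up. It assumes the default
# host/port (localhost:7860) were not changed and that the `requests`
# package is installed.
#
#   import requests
#
#   resp = requests.post(
#       "http://localhost:7860/v1/chat/completions",
#       json={
#           "messages": [
#               {"role": "user", "content": "Write a script to check HDFS disk usage."}
#           ],
#           "temperature": 0.3,
#           "max_tokens": 256,
#       },
#       timeout=300,  # generous timeout: CPU inference on a 7B model is slow
#   )
#   print(resp.json()["choices"][0]["message"]["content"])
#
# Dify can be pointed at the same URL as an OpenAI-compatible
# model provider, since the endpoint mirrors the OpenAI schema.
# ------------------------------------------------------------------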