import os
import time
import gradio as gr
from llama_cpp import Llama
from huggingface_hub import hf_hub_download
from fastapi import FastAPI, Request
from fastapi.responses import JSONResponse
import uvicorn

# 1. Initialize FastAPI
app = FastAPI()

# 2. Download and load the model (big-data architect's tip: Q4_K_M balances quality and memory footprint)
model_path = hf_hub_download(
    repo_id="Qwen/Qwen2.5-Coder-7B-Instruct-GGUF",
    filename="qwen2.5-coder-7b-instruct-q4_k_m.gguf"
)
llm = Llama(
    model_path=model_path,
    n_ctx=4096,       # context window size
    n_threads=2,      # keep in line with the CPU cores available to the Space
    verbose=False
)
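
# Optional sanity check, a minimal sketch: run once after loading to confirm the
# GGUF model responds (the prompt here is just a placeholder):
# print(llm.create_chat_completion(
#     messages=[{"role": "user", "content": "Say hello in one word."}]
# )["choices"][0]["message"]["content"])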

# 3. Hand-rolled OpenAI-compatible endpoint (for Dify to call)
@app.post("/v1/chat/completions")
async def chat_completions(request: Request):
    body = await request.json()
    messages = body.get("messages", [])
    # llama-cpp-python accepts OpenAI-style messages directly, so no format conversion is needed
    response = llm.create_chat_completion(
        messages=messages,
        temperature=body.get("temperature", 0.3),
        max_tokens=body.get("max_tokens", 1024),
        stream=False
    )
    # create_chat_completion already returns an OpenAI-shaped dict, so it can be passed through as-is
    return JSONResponse(content=response)
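
# Quick client-side check (hedged example; assumes the Space is reachable on
# localhost:7860, adjust the host for a deployed Space):
# curl -X POST http://localhost:7860/v1/chat/completions \
#   -H "Content-Type: application/json" \
#   -d '{"messages": [{"role": "user", "content": "hello"}]}'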

# 4. Gradio chat UI logic (for manual debugging)
def predict(message, history):
    system_prompt = (
        "You are a senior big-data operations expert. "
        "Provide professional, safe, and efficient scripts and tuning advice."
    )
    msgs = [{"role": "system", "content": system_prompt}]
    # history arrives as (user, assistant) tuples; rebuild it as chat messages
    for h in history:
        msgs.append({"role": "user", "content": h[0]})
        msgs.append({"role": "assistant", "content": h[1]})
    msgs.append({"role": "user", "content": message})
    output = llm.create_chat_completion(messages=msgs)
    return output["choices"][0]["message"]["content"]
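
# Untested streaming variant (a sketch): llama-cpp-python yields OpenAI-style
# delta chunks when stream=True, and gr.ChatInterface also accepts generator functions.
# def predict_stream(message, history):
#     msgs = [{"role": "system", "content": system_prompt}]  # same prompt as predict()
#     for h in history:
#         msgs.append({"role": "user", "content": h[0]})
#         msgs.append({"role": "assistant", "content": h[1]})
#     msgs.append({"role": "user", "content": message})
#     partial = ""
#     for chunk in llm.create_chat_completion(messages=msgs, stream=True):
#         delta = chunk["choices"][0]["delta"]
#         if "content" in delta:
#             partial += delta["content"]
#             yield partial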

demo = gr.ChatInterface(
    fn=predict,
    title="BigData Ops Copilot (Lightweight Mode)",
    description="Compile-free lightweight build - supports Dify integration"
)

# 5. Mount Gradio and launch
app = gr.mount_gradio_app(app, demo, path="/")

if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=7860)
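
# To wire this into Dify: add an OpenAI-API-compatible model provider and point its
# base URL at http://<your-space-host>/v1 so requests hit the /v1/chat/completions
# route above. (Provider naming assumed from Dify's generic OpenAI-compatible
# option; verify against your Dify version.)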