nagose committed on
Commit
48a1762
·
verified ·
1 Parent(s): b6585a8

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +43 -62
app.py CHANGED
@@ -4,36 +4,22 @@ import time
4
  import uuid
5
  from typing import List, Optional, Dict, Any, Union
6
 
7
- from fastapi import FastAPI
 
8
  from fastapi.responses import StreamingResponse
9
  from fastapi.middleware.cors import CORSMiddleware
10
  from pydantic import BaseModel
11
- from llama_cpp import Llama
12
 
13
- # 配置日志
14
  logging.basicConfig(level=logging.INFO)
15
  logger = logging.getLogger(__name__)
16
 
17
- # ====================== 模型配置 ======================
18
- # 使用 Hugging Face GGUF 模型(4B Q4_K_M 版本)
19
- REPO_ID = "lmstudio-community/Qwen3.5-4B-GGUF"
20
- FILENAME = "Qwen3.5-4B-Q4_K_M.gguf"
21
- MODEL_ID = "qwen3.5-4b" # CoPaw 中配置的模型名称
22
-
23
- # 加载模型(自动从 HF 下载并缓存)
24
- logger.info(f"正在从 {REPO_ID} 加载模型 {FILENAME}...")
25
- llm = Llama.from_pretrained(
26
- repo_id=REPO_ID,
27
- filename=FILENAME,
28
- n_ctx=4096, # 上下文窗口,可根据需求调整
29
- n_threads=None, # 自动使用所有 CPU 线程
30
- verbose=False,
31
- )
32
- logger.info("模型加载完成!")
33
 
34
- app = FastAPI(title="Qwen3.5-4B GGUF API (CoPaw兼容)")
35
 
36
- # ====================== CORS 中间件 ======================
37
  app.add_middleware(
38
  CORSMiddleware,
39
  allow_origins=["*"],
@@ -42,7 +28,7 @@ app.add_middleware(
42
  allow_headers=["*"],
43
  )
44
 
45
- # ====================== CoPaw 所需端点 ======================
46
  @app.get("/health")
47
  async def health():
48
  return {"status": "healthy"}
@@ -74,7 +60,7 @@ async def list_models():
74
  ]
75
  }
76
 
77
- # ====================== 请求/响应数据模型 ======================
78
  class Message(BaseModel):
79
  role: str
80
  content: Optional[Union[str, List[Dict[str, Any]]]] = None
@@ -88,8 +74,8 @@ class ChatRequest(BaseModel):
88
  tools: Optional[List[Dict[str, Any]]] = None
89
  tool_choice: Optional[str] = None
90
 
91
- # ====================== 辅助函数 ======================
92
  def convert_content_to_str(content: Optional[Union[str, List[Dict[str, Any]]]]) -> str:
 
93
  if content is None:
94
  return ""
95
  if isinstance(content, str):
@@ -105,10 +91,10 @@ def convert_content_to_str(content: Optional[Union[str, List[Dict[str, Any]]]])
105
  # ====================== 聊天接口 ======================
106
  @app.post("/v1/chat/completions")
107
  async def chat_completions(req: ChatRequest):
108
- # 转换消息格式
109
  messages = [{"role": m.role, "content": convert_content_to_str(m.content)} for m in req.messages]
110
 
111
- # 处理 tools:将工具描述合并到 system 消息中
112
  if req.tools:
113
  tools_json = json.dumps(req.tools, ensure_ascii=False)
114
  tool_prompt = (
@@ -122,47 +108,42 @@ async def chat_completions(req: ChatRequest):
122
  else:
123
  messages.insert(0, {"role": "system", "content": tool_prompt})
124
 
125
- # 流式处理
126
- if req.stream:
127
- stream = llm.create_chat_completion_openai_v1(
128
- messages=messages,
129
- temperature=req.temperature,
130
- max_tokens=req.max_tokens,
131
- stream=True,
132
- )
133
 
 
 
134
  async def generate():
135
- chunk_id = f"chatcmpl-{uuid.uuid4().hex}"
136
- for chunk in stream:
137
- if chunk.choices:
138
- delta = chunk.choices[0].delta
139
- finish_reason = chunk.choices[0].finish_reason
140
- response_chunk = {
141
- "id": chunk_id,
142
- "object": "chat.completion.chunk",
143
- "created": int(time.time()),
144
- "model": req.model,
145
- "choices": [{
146
- "index": 0,
147
- "delta": delta.model_dump(exclude_none=True),
148
- "finish_reason": finish_reason
149
- }]
150
- }
151
- yield f"data: {json.dumps(response_chunk)}\n\n"
152
- if finish_reason:
153
- yield "data: [DONE]\n\n"
154
  return StreamingResponse(generate(), media_type="text/event-stream")
155
 
156
- # 非流式处理
157
  else:
158
- response = llm.create_chat_completion_openai_v1(
159
- messages=messages,
160
- temperature=req.temperature,
161
- max_tokens=req.max_tokens,
162
- stream=False,
163
- )
164
- return response
 
 
165
 
166
  @app.get("/")
167
  async def root():
168
- return {"status": "running", "model": f"{REPO_ID}/{FILENAME}"}
 
4
  import uuid
5
  from typing import List, Optional, Dict, Any, Union
6
 
7
+ import httpx
8
+ from fastapi import FastAPI, HTTPException
9
  from fastapi.responses import StreamingResponse
10
  from fastapi.middleware.cors import CORSMiddleware
11
  from pydantic import BaseModel
 
12
 
 
13
  logging.basicConfig(level=logging.INFO)
14
  logger = logging.getLogger(__name__)
15
 
16
+ # ====================== 配置 ======================
17
+ MODEL_ID = "qwen3.5-4b" # CoPaw 中填写的模型名称
18
+ LLAMA_SERVER_URL = "http://127.0.0.1:8080" # 本地 llama-server 地址
 
 
 
 
 
 
 
 
 
 
 
 
 
19
 
20
+ app = FastAPI(title="Qwen3.5-4B Proxy for CoPaw")
21
 
22
+ # CORS 中间件(CoPaw 必须)
23
  app.add_middleware(
24
  CORSMiddleware,
25
  allow_origins=["*"],
 
28
  allow_headers=["*"],
29
  )
30
 
31
+ # ====================== CoPaw 所需额外端点 ======================
32
@app.get("/health")
async def health():
    """Liveness probe used by CoPaw; always reports a healthy status."""
    status_payload = {"status": "healthy"}
    return status_payload
 
60
  ]
61
  }
62
 
63
+ # ====================== 请求/响应模型 ======================
64
  class Message(BaseModel):
65
  role: str
66
  content: Optional[Union[str, List[Dict[str, Any]]]] = None
 
74
  tools: Optional[List[Dict[str, Any]]] = None
75
  tool_choice: Optional[str] = None
76
 
 
77
  def convert_content_to_str(content: Optional[Union[str, List[Dict[str, Any]]]]) -> str:
78
+ """将 OpenAI 结构化 content 转换为纯文本"""
79
  if content is None:
80
  return ""
81
  if isinstance(content, str):
 
91
  # ====================== 聊天接口 ======================
92
  @app.post("/v1/chat/completions")
93
  async def chat_completions(req: ChatRequest):
94
+ # 1. 转换消息格式
95
  messages = [{"role": m.role, "content": convert_content_to_str(m.content)} for m in req.messages]
96
 
97
+ # 2. 处理 tools(简单提示工程)
98
  if req.tools:
99
  tools_json = json.dumps(req.tools, ensure_ascii=False)
100
  tool_prompt = (
 
108
  else:
109
  messages.insert(0, {"role": "system", "content": tool_prompt})
110
 
111
+ # 3. 构造转发给 llama-server 的请求体
112
+ payload = {
113
+ "messages": messages,
114
+ "temperature": req.temperature,
115
+ "max_tokens": req.max_tokens,
116
+ "stream": req.stream,
117
+ "model": "local" # llama-server 可能忽略此字段
118
+ }
119
 
120
+ # 4. 流式处理
121
+ if req.stream:
122
  async def generate():
123
+ async with httpx.AsyncClient(timeout=None) as client:
124
+ async with client.stream(
125
+ "POST",
126
+ f"{LLAMA_SERVER_URL}/v1/chat/completions",
127
+ json=payload,
128
+ headers={"Content-Type": "application/json"}
129
+ ) as response:
130
+ async for line in response.aiter_lines():
131
+ if line.startswith("data: "):
132
+ yield line + "\n\n"
 
 
 
 
 
 
 
 
 
133
  return StreamingResponse(generate(), media_type="text/event-stream")
134
 
135
+ # 5. 非流式处理
136
  else:
137
+ async with httpx.AsyncClient(timeout=300.0) as client:
138
+ resp = await client.post(
139
+ f"{LLAMA_SERVER_URL}/v1/chat/completions",
140
+ json=payload,
141
+ headers={"Content-Type": "application/json"}
142
+ )
143
+ if resp.status_code != 200:
144
+ raise HTTPException(status_code=resp.status_code, detail=resp.text)
145
+ return resp.json()
146
 
147
@app.get("/")
async def root():
    """Root status endpoint: reports that the proxy is running and which model backs it."""
    info = {"status": "running", "model": "Qwen3.5-4B via llama-server"}
    return info