Spaces:

han145
/

llm

Sleeping

App Files Files Community

han145 committed on May 1

Commit

59f0dd7

verified ·

1 Parent(s): 1091096

Update app.py

Browse files

Files changed (1) hide show

app.py +229 -132

app.py CHANGED Viewed

@@ -1,139 +1,236 @@
-import streamlit as st
-from llama_cpp import Llama
-from huggingface_hub import hf_hub_download
 import os
 import time
-import threading
-# ===== 配置区（CPU友好型模型）=====
-MODEL_REPO = "Qwen/Qwen2.5-1.5B-Instruct-GGUF"
-MODEL_FILENAME = "qwen2.5-1_5b-instruct-q4_k_m.gguf"
-MODEL_DIR = "/app/models"  # Spaces持久化目录
-# 全局模型变量（避免重复加载）
-llm_instance = None
-model_loading = False
-model_error = None
-def background_model_load():
-    """后台线程加载模型（避免阻塞Streamlit主线程）"""
-    global llm_instance, model_loading, model_error
-    if model_loading:
-        return
-    model_loading = True
     try:
-        # 创建目录
-        os.makedirs(MODEL_DIR, exist_ok=True)
-        model_path = os.path.join(MODEL_DIR, MODEL_FILENAME)
-        # 检查是否已下载
-        if not os.path.exists(model_path):
-            st.session_state.download_status = "downloading"
-            # 下载模型（自动断点续传）
-            model_path = hf_hub_download(
-                repo_id=MODEL_REPO,
-                filename=MODEL_FILENAME,
-                local_dir=MODEL_DIR,
-                resume_download=True,
-                token=None  # 公开模型无需token
-            )
-            st.session_state.download_status = "downloaded"
-        # 加载模型到内存
-        st.session_state.download_status = "loading"
-        start = time.time()
-        llm_instance = Llama(
-            model_path=model_path,
-            n_ctx=2048,
-            n_threads=4,      # Spaces CPU通常4核
-            n_gpu_layers=0,   # 纯CPU
-            verbose=False,
-            n_batch=512       # 优化批处理
         )
-        st.session_state.download_status = "ready"
-        st.session_state.load_time = time.time() - start
     except Exception as e:
-        model_error = str(e)
-        st.session_state.download_status = "error"
-    finally:
-        model_loading = False
-# ===== Streamlit UI =====
-st.set_page_config(page_title="🦙 CPU LLM Demo", page_icon="🦙", layout="wide")
-# 初始化状态
-if "download_status" not in st.session_state:
-    st.session_state.download_status = "idle"
-    st.session_state.load_time = 0
-    # 启动后台加载线程
-    threading.Thread(target=background_model_load, daemon=True).start()
-# 顶部状态栏
-col1, col2 = st.columns([3, 1])
-with col1:
-    status_map = {
-        "idle": "⏳ 准备加载模型...",
-        "downloading": "⬇️ 正在下载模型 (1.0GB)...",
-        "downloaded": "✅ 模型下载完成，正在加载到内存...",
-        "loading": "🧠 正在加载模型到内存（约60-90秒）...",
-        "ready": f"✅ 模型就绪！加载耗时 {st.session_state.load_time:.1f} 秒",
-        "error": f"❌ 加载失败: {model_error}"
     }
-    st.info(status_map.get(st.session_state.download_status, "❓ 未知状态"))
-with col2:
-    st.caption("💡 首次加载需1-2分钟 | 休眠后需重新下载")
-# 模型未就绪时禁止聊天
-if st.session_state.download_status != "ready":
-    st.stop()
-# 聊天界面
-st.title("🦙 本地CPU大模型 (Qwen2.5-1.5B)")
-st.caption("完全离线运行 · 无外部API调用 · 适合演示用途")
-if "messages" not in st.session_state:
-    st.session_state.messages = []
-# 显示历史消息
-for msg in st.session_state.messages:
-    with st.chat_message(msg["role"]):
-        st.markdown(msg["content"])
-# 用户输入
-if prompt := st.chat_input("问点什么吧..."):
-    # 保存用户消息
-    st.session_state.messages.append({"role": "user", "content": prompt})
-    with st.chat_message("user"):
-        st.markdown(prompt)
-    # 生成回复
-    with st.chat_message("assistant"):
-        message_placeholder = st.empty()
-        full_response = ""
-        # Qwen2.5对话模板
-        messages = [
-            {"role": "system", "content": "You are a helpful assistant."},
-            *[{"role": m["role"], "content": m["content"]} for m in st.session_state.messages]
-        ]
-        # 流式生成（CPU较慢，需耐心）
-        try:
-            for chunk in llm_instance.create_chat_completion(
-                messages=messages,
-                max_tokens=256,   # 限制长度避免超时
-                temperature=0.7,
-                stream=True
-            ):
-                delta = chunk["choices"][0]["delta"]
-                if "content" in delta:
-                    full_response += delta["content"]
-                    message_placeholder.markdown(full_response + "▌")
-            message_placeholder.markdown(full_response)
-            st.session_state.messages.append({"role": "assistant", "content": full_response})
-        except Exception as e:
-            st.error(f"生成失败: {str(e)}")
-            message_placeholder.markdown("❌ 生成超时，请缩短问题长度重试")

+import gc
+import logging
 import os
+import secrets
 import time
+from typing import Optional
+import torch
+from fastapi import FastAPI, Request, HTTPException, Depends, status
+from fastapi.responses import JSONResponse
+from fastapi.security import HTTPBearer, HTTPAuthorizationCredentials
+from transformers import AutoTokenizer, AutoModelForCausalLM
+# Logging setup: one module-level logger, INFO level, timestamped lines.
+logger = logging.getLogger(__name__)
+logging.basicConfig(format='%(asctime)s - %(levelname)s - %(message)s', level=logging.INFO)
+# Model settings.
+MODEL_NAME = "Qwen/Qwen1.5-0.5B-Chat"
+MAX_TOKENS = 512
+DEVICE = "cpu"  # inference is pinned to the CPU
+# Module-level handles, filled in by load_model(); None until loading succeeds.
+model = tokenizer = None
+# API key configuration.
+# API_KEYS is a comma-separated list; surrounding whitespace is stripped and
+# empty entries are dropped so that "a, b," yields ["a", "b"].
+_DEFAULT_API_KEYS = "your-secret-key-1,your-secret-key-2"
+API_KEYS = [
+    key.strip()
+    for key in os.getenv("API_KEYS", _DEFAULT_API_KEYS).split(",")
+    if key.strip()
+]
+API_AUTH_ENABLED = os.getenv("API_AUTH_ENABLED", "true").lower() == "true"
+if API_AUTH_ENABLED and "API_KEYS" not in os.environ:
+    # The placeholder defaults are kept for backward compatibility, but they
+    # are public knowledge: anyone can authenticate with them.
+    logging.getLogger(__name__).warning(
+        "API_KEYS is not set; falling back to built-in placeholder keys. "
+        "Set API_KEYS before exposing this service."
+    )
+# Bearer authentication.
+# auto_error=False makes the scheme hand us None instead of rejecting the
+# request itself, so API_AUTH_ENABLED=false really allows header-less calls.
+security = HTTPBearer(auto_error=False)
+def verify_api_key(credentials: Optional[HTTPAuthorizationCredentials] = Depends(security)):
+    """Validate the bearer token of the incoming request.
+
+    Args:
+        credentials: Parsed ``Authorization`` header, or ``None`` when the
+            header is missing or does not use the Bearer scheme.
+
+    Returns:
+        ``True`` when authentication is disabled or the token is a known key.
+
+    Raises:
+        HTTPException: 401 when authentication is enabled and the header is
+            missing, uses another scheme, or carries an unknown key.
+    """
+    if not API_AUTH_ENABLED:
+        return True
+    # HTTP auth scheme names are case-insensitive, so accept "bearer" too.
+    if credentials is None or credentials.scheme.lower() != "bearer":
+        raise HTTPException(
+            status_code=status.HTTP_401_UNAUTHORIZED,
+            detail="Invalid authentication scheme. Use 'Bearer' token",
+            headers={"WWW-Authenticate": "Bearer"},
+        )
+    api_key = credentials.credentials.encode("utf-8")
+    # Compare against every key in constant time (no early exit) so response
+    # timing does not reveal how much of a key matched.
+    matches = [
+        secrets.compare_digest(api_key, known.encode("utf-8"))
+        for known in API_KEYS
+    ]
+    if not any(matches):
+        raise HTTPException(
+            status_code=status.HTTP_401_UNAUTHORIZED,
+            detail="Invalid API key",
+            headers={"WWW-Authenticate": "Bearer"},
+        )
+    return True
+def load_model():
+    """Load the tokenizer and model into the module-level globals.
+
+    Returns:
+        ``True`` when both objects were loaded, ``False`` otherwise. On
+        failure both globals are reset to ``None`` so callers never see a
+        tokenizer without a model.
+    """
+    global model, tokenizer
     try:
+        logger.info(f"开始加载模型: {MODEL_NAME}")
+        tokenizer = AutoTokenizer.from_pretrained(
+            MODEL_NAME,
+            trust_remote_code=True
         )
+        if tokenizer.pad_token is None:
+            tokenizer.pad_token = tokenizer.eos_token
+        # Half precision is slow on CPU and some PyTorch builds lack CPU
+        # kernels for it, so use float32 whenever inference runs on the CPU.
+        dtype = torch.float32 if DEVICE == "cpu" else torch.float16
+        model = AutoModelForCausalLM.from_pretrained(
+            MODEL_NAME,
+            torch_dtype=dtype,
+            device_map=None,
+            low_cpu_mem_usage=True,
+            trust_remote_code=True
+        )
+        model = model.to(DEVICE)
+        model.eval()
+        logger.info("模型加载成功")
+        return True
+    except Exception as e:
+        # Drop any partially loaded state and keep the traceback in the log.
+        model = None
+        tokenizer = None
+        logger.error(f"模型加载失败: {e}", exc_info=True)
+        return False
+def apply_chat_template(messages):
+    """Render chat messages as a ChatML prompt ending with an open assistant turn.
+
+    Args:
+        messages: Iterable of dicts with ``role`` and ``content``. ``content``
+            is either a string or a list of parts; only string parts and
+            ``{"type": "text", "text": ...}`` parts are used, joined by spaces.
+
+    Returns:
+        The prompt string. Messages with empty or ``None`` content, and
+        messages whose role is not system/user/assistant, are skipped.
+    """
+    known_roles = ("system", "user", "assistant")
+    segments = []
+    for msg in messages:
+        # "role": null must not crash on .lower().
+        role = str(msg.get("role") or "").lower()
+        content = msg.get("content")
+        # "content": null is a missing value, not the text "None".
+        if content is None:
+            continue
+        # Content may be a list of parts (multimodal-style payloads).
+        if isinstance(content, list):
+            text_parts = []
+            for item in content:
+                if isinstance(item, dict):
+                    if item.get("type") == "text" and item.get("text") is not None:
+                        text_parts.append(str(item["text"]))
+                elif isinstance(item, str):
+                    text_parts.append(item)
+            content_str = " ".join(p for p in text_parts if p).strip()
+        else:
+            content_str = str(content).strip()
+        if not content_str or role not in known_roles:
+            continue
+        segments.append(f"<|im_start|>{role}\n{content_str}<|im_end|>\n")
+    # Leave the assistant turn open so the model continues from here.
+    segments.append("<|im_start|>assistant\n")
+    return "".join(segments)
+def generate_chat_response(messages, max_tokens=512, temperature=0.7):
+    """Generate an assistant reply for a list of chat messages.
+
+    Args:
+        messages: Chat messages as accepted by ``apply_chat_template``.
+        max_tokens: Upper bound on newly generated tokens; ``None`` means
+            ``MAX_TOKENS``. The value is clamped to ``1..MAX_TOKENS``.
+        temperature: Sampling temperature. Values <= 0 select greedy decoding.
+
+    Returns:
+        ``{"text": reply}`` on success, or ``{"error": message}`` when the
+        model is not loaded or generation fails.
+    """
+    if model is None or tokenizer is None:
+        return {"error": "模型未加载"}
+    try:
+        prompt = apply_chat_template(messages)
+        logger.info(f"输入文本类型: {type(prompt)}, 长度: {len(prompt)}")
+        inputs = tokenizer(
+            [prompt],
+            return_tensors="pt",
+            truncation=True,
+            max_length=2048,           # cap the context so long prompts do not degrade generation
+            padding=True
+        )
+        inputs = {k: v.to(DEVICE) for k, v in inputs.items()}
+        # Honour the caller's limit instead of a hard-coded value.
+        requested = MAX_TOKENS if max_tokens is None else int(max_tokens)
+        generation_kwargs = {
+            "max_new_tokens": max(1, min(requested, MAX_TOKENS)),
+            "repetition_penalty": 1.05,
+            "pad_token_id": tokenizer.eos_token_id,
+            "eos_token_id": tokenizer.eos_token_id,
+        }
+        # Sampling requires a strictly positive temperature; fall back to
+        # greedy decoding otherwise.
+        temperature = 0.7 if temperature is None else float(temperature)
+        if temperature > 0:
+            generation_kwargs.update(do_sample=True, temperature=temperature, top_p=0.85)
+        else:
+            generation_kwargs["do_sample"] = False
+        with torch.no_grad():
+            outputs = model.generate(**inputs, **generation_kwargs)
+        # generate() returns prompt + continuation; keep only the new tokens.
+        prompt_length = inputs["input_ids"].shape[1]
+        response = tokenizer.decode(
+            outputs[0][prompt_length:],
+            skip_special_tokens=True
+        ).strip()
+        return {"text": response}
     except Exception as e:
+        logger.error(f"生成失败: {str(e)}", exc_info=True)
+        return {"error": str(e)}
+# FastAPI application object; the metadata below feeds the generated API docs.
+_app_metadata = {
+    "title": "Qwen OpenAI-compatible API",
+    "version": "1.0",
+    "description": "仅提供 /v1/chat/completions 端点",
+}
+app = FastAPI(**_app_metadata)
+@app.on_event("startup")
+async def startup_event():
+    """Load the model when the server starts and log the outcome."""
+    loaded = load_model()
+    if not loaded:
+        # The server keeps running; /health reports the missing model.
+        logger.error("模型加载失败，服务可能无法正常工作")
+        return
+    logger.info("服务启动完成")
+# Health check
+@app.get("/health")
+async def health_check():
+    """Report whether the model is loaded, with the current Unix timestamp."""
+    is_loaded = model is not None
+    if is_loaded:
+        state = "healthy"
+    else:
+        state = "model loading failed"
+    now = int(time.time())
+    return {
+        "status": state,
+        "model_loaded": is_loaded,
+        "timestamp": now
     }
+# Root path
+@app.get("/")
+async def root():
+    """Return a short banner naming the only supported endpoint."""
+    banner = "Qwen API 服务运行中，仅支持 /v1/chat/completions"
+    return {"message": banner}
+# Core endpoint
+def _error_response(status_code, message, error_type):
+    """Build an OpenAI-style error body with the given HTTP status code."""
+    return JSONResponse(
+        status_code=status_code,
+        content={
+            "error": {
+                "message": message,
+                "type": error_type
+            }
+        }
+    )
+@app.post("/v1/chat/completions")
+async def create_chat_completion(
+    request: Request,
+    auth_valid: bool = Depends(verify_api_key)
+):
+    """Handle an OpenAI-style chat completion request.
+
+    The JSON body must be an object with a non-empty ``messages`` list of
+    objects; ``max_tokens`` and ``temperature`` are optional.
+
+    Returns:
+        A ``chat.completion`` payload on success, a 400 error body for a
+        malformed request, or a 500 error body when generation fails.
+    """
+    # Malformed requests are the client's fault: answer 400, not 500.
+    try:
+        data = await request.json()
+    except ValueError:
+        return _error_response(400, "Request body must be valid JSON", "invalid_request_error")
+    if not isinstance(data, dict):
+        return _error_response(400, "Request body must be a JSON object", "invalid_request_error")
+    messages = data.get("messages")
+    # Validate the type before calling len() on it.
+    if not isinstance(messages, list) or not messages:
+        return _error_response(400, "messages 必须是非空列表", "invalid_request_error")
+    if not all(isinstance(item, dict) for item in messages):
+        return _error_response(400, "Each item in messages must be an object", "invalid_request_error")
+    try:
+        max_tokens = data.get("max_tokens", MAX_TOKENS)
+        temperature = data.get("temperature", 0.7)
+        logger.info(f"收到请求: messages_count={len(messages)}")
+        result = generate_chat_response(messages, max_tokens, temperature)
+        if "error" in result:
+            raise RuntimeError(result["error"])
+        response_data = {
+            "id": f"chatcmpl-{int(time.time()*1000)}",
+            "object": "chat.completion",
+            "created": int(time.time()),
+            "model": MODEL_NAME,
+            "choices": [
+                {
+                    "index": 0,
+                    "message": {
+                        "role": "assistant",
+                        "content": result["text"]
+                    },
+                    "finish_reason": "stop"
+                }
+            ]
+        }
+        return response_data
+    except Exception as e:
+        logger.error(f"Chat Completions 错误: {str(e)}", exc_info=True)
+        return _error_response(500, str(e), "internal_server_error")
+if __name__ == "__main__":
+    import uvicorn
+    # Single worker, listening on all interfaces at port 7860.
+    server_options = {
+        "host": "0.0.0.0",
+        "port": 7860,
+        "workers": 1,
+        "log_level": "info",
+    }
+    uvicorn.run(app, **server_options)