Spaces:

han145
/

deepseek

Sleeping

App Files Files Community

han145 committed on Feb 8

Commit

354a3ef

verified ·

1 Parent(s): 9417203

Update app.py

Browse files

Files changed (1) hide show

app.py +116 -457

app.py CHANGED Viewed

@@ -1,433 +1,168 @@
-import gradio as gr
-from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
-import torch
-import torch.nn as nn
 import json
-import asyncio
-from fastapi import FastAPI, Request, HTTPException, Security, Depends
-from fastapi.security import HTTPBearer, HTTPAuthorizationCredentials
-from fastapi.responses import JSONResponse
 import logging
-import time
-import os
-import uuid
-from typing import Optional, List, Dict, Any
-import psutil
 import gc
-from contextlib import asynccontextmanager
-# 配置日志
 logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
 logger = logging.getLogger(__name__)
 # 全局变量
 model = None
 tokenizer = None
-device = "cpu"
-# 性能和安全配置
-TEST_MODE: bool = os.getenv("TEST_MODE", "true").lower() == "true"
-API_KEYS = os.getenv("API_KEYS", "123456,789012").split(",")
-# 性能限制配置 - 使用更保守的值
-MAX_CONCURRENT_REQUESTS = int(os.getenv("MAX_CONCURRENT_REQUESTS", "1"))
-MAX_TOKENS_LIMIT = int(os.getenv("MAX_TOKENS_LIMIT", "128"))
-REQUEST_TIMEOUT = int(os.getenv("REQUEST_TIMEOUT", "30"))
-MEMORY_THRESHOLD = int(os.getenv("MEMORY_THRESHOLD", "50"))  # 大幅降低内存阈值
-# 请求管理
-active_requests = 0
-request_semaphore = None
-# 使用OpenAI兼容的Bearer认证
-security = HTTPBearer(auto_error=False)
-@asynccontextmanager
-async def lifespan(app: FastAPI):
-    """应用生命周期管理"""
-    global request_semaphore
-    # 初始化信号量限制并发请求
-    request_semaphore = asyncio.Semaphore(MAX_CONCURRENT_REQUESTS)
-    # 异步加载量化模型
-    asyncio.create_task(load_quantized_model_async())
-    yield
-    # 关闭时清理资源
-    cleanup_resources()
-def quantize_model(model):
-    """应用静态量化到模型"""
-    try:
-        logger.info("开始应用静态量化...")
-        # 设置模型为评估模式
-        model.eval()
-        # 准备量化配置
-        model.qconfig = torch.quantization.get_default_qconfig('fbgemm')
-        # 准备模型进行量化
-        model_prepared = torch.quantization.prepare(model, inplace=False)
-        # 由于我们无法进行完整的校准，使用简单的静态量化
-        # 在实际应用中，应该使用校准数据集进行校准
-        logger.info("应用静态量化完成")
-        # 转换模型
-        model_quantized = torch.quantization.convert(model_prepared, inplace=False)
-        logger.info("模型量化完成，内存占用大幅降低")
-        return model_quantized
-    except Exception as e:
-        logger.warning(f"量化失败，使用原模型: {e}")
-        return model
-async def load_quantized_model_async():
-    """异步加载并量化模型"""
-    global model, tokenizer, device
     try:
-        # 使用极小的模型
-        # 选项1: Microsoft的极小型对话模型
-        model_name = "microsoft/DialoGPT-small"  # 仅117M参数
-        # 选项2: 超小型模型
-        # model_name = "sshleifer/tiny-gpt2"  # 仅几十MB
-        logger.info(f"开始加载并量化模型: {model_name}")
-        # 强制使用CPU
-        device = "cpu"
         # 加载tokenizer
-        tokenizer = AutoTokenizer.from_pretrained(model_name)
         if tokenizer.pad_token is None:
             tokenizer.pad_token = tokenizer.eos_token
-        # 以FP32精度加载模型（量化需要）
         model = AutoModelForCausalLM.from_pretrained(
-            model_name,
-            torch_dtype=torch.float32,
-            low_cpu_mem_usage=True
         )
-        # 应用静态量化
-        model = quantize_model(model)
         # 移动到CPU
-        model = model.to(device)
         model.eval()  # 设置为评估模式
-        logger.info(f"量化模型加载成功! 模型大小大幅减少")
-        # 记录内存使用情况
-        log_memory_usage("模型加载后")
-    except Exception as e:
-        logger.error(f"量化模型加载失败: {e}")
-        # 如果量化失败，尝试加载更小的模型
-        await load_tiny_model()
-async def load_tiny_model():
-    """加载超小型模型作为备用"""
-    global model, tokenizer
-    try:
-        # 使用最小的可用模型
-        model_name = "sshleifer/tiny-gpt2"  # 仅33M参数
-        logger.info(f"尝试加载超小型模型: {model_name}")
-        tokenizer = AutoTokenizer.from_pretrained(model_name)
-        model = AutoModelForCausalLM.from_pretrained(model_name)
-        # 确保有pad_token
-        if tokenizer.pad_token is None:
-            tokenizer.pad_token = tokenizer.eos_token
-        model = model.to(device)
-        model.eval()
-        logger.info("超小型模型加载成功!")
-        log_memory_usage("超小型模型加载后")
-    except Exception as e:
-        logger.error(f"超小型模型也加载失败: {e}")
-        logger.info("将使用模拟响应模式")
-def log_memory_usage(stage):
-    """记录内存使用情况"""
-    try:
-        memory = psutil.virtual_memory()
-        logger.info(f"{stage} - 内存使用: {memory.percent}%")
     except Exception as e:
-        logger.error(f"记录内存使用失败: {e}")
-def get_system_health():
-    """获取系统健康状态"""
-    try:
-        memory = psutil.virtual_memory()
-        health = {
-            "memory_used_percent": round(memory.percent, 1),
-            "memory_available_gb": round(memory.available / (1024**3), 2),
-            "active_requests": active_requests,
-            "timestamp": int(time.time()),
-            "model_loaded": model is not None
-        }
-        return health
-    except Exception as e:
-        return {"error": str(e)}
-def check_system_resources():
-    """检查系统资源是否充足"""
-    try:
-        health = get_system_health()
-        # 内存使用超过阈值时拒绝新请求
-        if health.get("memory_used_percent", 0) > MEMORY_THRESHOLD:
-            return False, f"内存使用率过高: {health['memory_used_percent']}%"
-        # 活跃请求数超过限制
-        if active_requests >= MAX_CONCURRENT_REQUESTS:
-            return False, f"并发请求数已达上限: {active_requests}/{MAX_CONCURRENT_REQUESTS}"
-        return True, "资源充足"
-    except Exception as e:
-        return False, f"系统监控异常: {str(e)}"
-async def rate_limit_check():
-    """速率限制和资源检查"""
-    global active_requests
-    # 检查系统资源
-    is_healthy, message = check_system_resources()
-    if not is_healthy:
-        raise HTTPException(
-            status_code=503,
-            detail={
-                "error": {
-                    "message": f"系统资源紧张: {message}",
-                    "type": "service_unavailable",
-                    "code": "resource_unavailable"
-                }
-            }
-        )
-    # 使用信号量控制并发
-    await request_semaphore.acquire()
-    active_requests += 1
-def verify_openai_api_key(credentials: Optional[HTTPAuthorizationCredentials] = Depends(security)):
-    """简化版API密钥验证"""
-    if TEST_MODE:
-        return "test_mode"
-    if not credentials:
-        raise HTTPException(
-            status_code=401,
-            detail={
-                "error": {
-                    "message": "缺少API密钥",
-                    "type": "invalid_request_error",
-                    "code": "missing_api_key"
-                }
-            }
-        )
-    api_key = credentials.credentials
-    # 移除sk-前缀后验证
-    if api_key.startswith("sk-"):
-        key_core = api_key[3:]
-        if key_core in API_KEYS:
-            return api_key
-    raise HTTPException(
-        status_code=401,
-        detail={
-            "error": {
-                "message": "无效的API密钥",
-                "type": "invalid_request_error",
-                "code": "invalid_api_key"
-            }
-        }
-    )
-def generate_quantized_response(messages, max_tokens=64, temperature=0.7):
-    """使用量化模型生成响应"""
     if model is None or tokenizer is None:
-        return "模型未就绪，当前使用模拟响应模式"
     try:
         # 提取用户消息
         user_message = ""
         for msg in messages:
             if msg.get("role") == "user":
-                user_message = msg.get("content", "")[:500]  # 限制输入长度
                 break
         if not user_message:
-            return "未找到有效的用户消息"
-        # 构建提示词
-        prompt = f"User: {user_message}\nAI:"
         # 编码输入
-        inputs = tokenizer(
-            prompt,
-            return_tensors="pt",
-            truncation=True,
-            max_length=256  # 进一步限制输入长度
-        )
         # 生成响应
         with torch.no_grad():
             outputs = model.generate(
                 **inputs,
-                max_new_tokens=min(max_tokens, MAX_TOKENS_LIMIT),
-                temperature=min(max(temperature, 0.1), 1.0),
-                top_p=0.9,
                 do_sample=True,
-                pad_token_id=tokenizer.eos_token_id,
-                repetition_penalty=1.1,
-                eos_token_id=tokenizer.eos_token_id
             )
         # 解码响应
-        response = tokenizer.decode(
-            outputs[0][inputs.input_ids.shape[-1]:],
-            skip_special_tokens=True
-        )
-        return response.strip()
-    except Exception as e:
-        logger.error(f"生成响应时出错: {str(e)}")
-        return f"生成响应时出错: {str(e)}"
-def cleanup_resources():
-    """清理资源"""
-    global model, tokenizer
-    try:
-        if model is not None:
-            del model
-            model = None
-        if tokenizer is not None:
-            del tokenizer
-            tokenizer = None
-        gc.collect()
-        logger.info("资源清理完成")
     except Exception as e:
-        logger.error(f"资源清理失败: {e}")
-# 创建FastAPI应用
-app = FastAPI(
-    title="量化大模型API服务",
-    description="使用静态量化技术大幅降低内存占用的API服务",
-    version="1.0.0",
-    lifespan=lifespan
-)
-# 健康检查端点
-@app.get("/")
-async def root():
-    health = get_system_health()
-    return {
-        "message": "量化大模型API服务运行中",
-        "status": "healthy" if model is not None else "loading",
-        "model_loaded": model is not None,
-        "quantized": True,
-        "device": device,
-        "system_health": health,
-        "memory_threshold": f"{MEMORY_THRESHOLD}%"
-    }
 @app.get("/health")
 async def health_check():
-    health = get_system_health()
-    is_healthy, message = check_system_resources()
     return {
-        "status": "healthy" if is_healthy else "degraded",
         "model_loaded": model is not None,
-        "quantized": True,
-        "active_requests": active_requests,
-        "system_health": health,
-        "message": message
-    }
-@app.get("/v1/models")
-async def list_models():
-    """OpenAI兼容的模型列表端点"""
-    return {
-        "object": "list",
-        "data": [
-            {
-                "id": "quantized-dialogpt",
-                "object": "model",
-                "created": int(time.time()),
-                "owned_by": "microsoft",
-                "quantized": True
-            }
-        ]
     }
 @app.post("/v1/chat/completions")
-async def chat_completion(
-    request: Request,
-    api_key: str = Depends(verify_openai_api_key)
-):
-    """OpenAI兼容的聊天完成端点（使用量化模型）"""
-    start_time = time.time()
     try:
-        # 速率限制和资源检查
-        await rate_limit_check()
-        # 解析请求数据
-        try:
-            body = await asyncio.wait_for(request.json(), timeout=5.0)
-        except asyncio.TimeoutError:
-            raise HTTPException(status_code=400, detail="请求体解析超时")
-        messages = body.get("messages", [])
-        max_tokens = min(body.get("max_tokens", 64), MAX_TOKENS_LIMIT)  # 进一步减少
-        temperature = body.get("temperature", 0.7)
-        # 验证消息格式
-        if not messages or not any(msg.get("role") == "user" for msg in messages):
-            raise HTTPException(status_code=400, detail="无效的消息格式")
-        # 生成响应（带超时保护）
-        try:
-            response_text = await asyncio.wait_for(
-                asyncio.get_event_loop().run_in_executor(
-                    None,
-                    generate_quantized_response,
-                    messages, max_tokens, temperature
-                ),
-                timeout=REQUEST_TIMEOUT
             )
-        except asyncio.TimeoutError:
-            raise HTTPException(status_code=504, detail="模型响应超时")
-        # 构建响应
-        response_data = {
-            "id": f"chatcmpl-{uuid.uuid4().hex}",
             "object": "chat.completion",
             "created": int(time.time()),
-            "model": "quantized-dialogpt",
             "choices": [{
                 "index": 0,
                 "message": {
-                    "role": "assistant",
-                    "content": response_text
                 },
                 "finish_reason": "stop"
             }],
@@ -438,111 +173,35 @@ async def chat_completion(
             }
         }
-        return JSONResponse(content=response_data)
-    except HTTPException:
-        raise
     except Exception as e:
-        logger.error(f"处理请求时出错: {str(e)}")
-        raise HTTPException(status_code=500, detail="内部服务器错误")
-    finally:
-        # 释放资源
-        global active_requests
-        active_requests = max(0, active_requests - 1)
-        if request_semaphore:
-            request_semaphore.release()
-# 创建极简Gradio界面
-with gr.Blocks(title="量化大模型API", theme=gr.themes.Soft()) as demo:
-    gr.Markdown("""
-    # 量化大模型API服务
-    *使用静态量化技术大幅降低内存占用*
-    ## 技术特性
-    - ✅ **静态量化**: 模型大小减少约75%
-    - ✅ **CPU优化**: 专为低内存环境设计
-    - ✅ **极简架构**: 最小化资源占用
-    ## 当前配置
-    - **模型**: DialoGPT-small (117M参数，量化后约30MB)
-    - **设备**: CPU模式
-    - **并发限制**: 1个请求
-    - **内存阈值**: 50%
-    - **生成长度**: 128 tokens
-    """)
-    # 系统状态
-    with gr.Row():
-        with gr.Column():
-            status_html = gr.HTML("""
-            <div id="status">
-                <p>🔄 加载量化模型中...</p>
-            </div>
-            """)
-            health_btn = gr.Button("刷新系统状态")
-            health_output = gr.JSON(label="系统状态")
-    # 测试界面
-    with gr.Row():
-        with gr.Column():
-            test_input = gr.Textbox(
-                label="测试输入",
-                placeholder="请输入简短的问题...",
-                lines=2
-            )
-            test_btn = gr.Button("测试量化模型", variant="primary")
-            clear_btn = gr.Button("清除")
-            test_output = gr.Textbox(label="测试输出", lines=4)
-    def refresh_status():
-        health = get_system_health()
-        status_text = f"""
-        <div id="status">
-            <p><b>模型状态:</b> {'✅ 已加载(量化)' if model else '❌ 未加载'}</p>
-            <p><b>内存使用:</b> {health.get('memory_used_percent', 0)}% (阈值: {MEMORY_THRESHOLD}%)</p>
-            <p><b>活跃请求:</b> {active_requests}/{MAX_CONCURRENT_REQUESTS}</p>
-            <p><b>量化模式:</b> ✅ 已启用</p>
-        </div>
-        """
-        return status_text, health
-    def test_model(message):
-        if not message.strip():
-            return "请输入消息"
-        if model is None:
-            return "量化模型未加载，请稍后重试"
-        messages = [{"role": "user", "content": message}]
-        return generate_quantized_response(messages)
-    def clear_chat():
-        return ""
-    # 事件绑定
-    health_btn.click(refresh_status, outputs=[status_html, health_output])
-    test_btn.click(test_model, inputs=test_input, outputs=test_output)
-    clear_btn.click(clear_chat, outputs=test_output)
-    # 初始加载状态
-    demo.load(refresh_status, outputs=[status_html, health_output])
-# 挂载Gradio应用到FastAPI
-app = gr.mount_gradio_app(app, demo, path="/")
 if __name__ == "__main__":
     import uvicorn
-    # 优化UVicorn配置
-    config = uvicorn.Config(
-        app,
-        host="0.0.0.0",
         port=7860,
-        workers=1,
-        loop="asyncio",
-        timeout_keep_alive=5,
-        limit_max_requests=100,
-    )
-    server = uvicorn.Server(config)
-    server.run()

+import os
+import time
 import json
 import logging
+from fastapi import FastAPI, Request, HTTPException
+from fastapi.responses import JSONResponse
+from transformers import AutoTokenizer, AutoModelForCausalLM
+import torch
 import gc
+# 极简日志配置
 logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
 logger = logging.getLogger(__name__)
 # 全局变量
 model = None
 tokenizer = None
+# 配置
+MODEL_NAME = "Qwen/Qwen1.5-0.5B-Chat"
+MAX_TOKENS = 256
+DEVICE = "cpu"  # 强制使用CPU
+def load_model():
+    """极简模型加载"""
+    global model, tokenizer
     try:
+        logger.info(f"开始加载模型: {MODEL_NAME}")
         # 加载tokenizer
+        tokenizer = AutoTokenizer.from_pretrained(
+            MODEL_NAME,
+            trust_remote_code=True
+        )
+        # 确保有pad_token
         if tokenizer.pad_token is None:
             tokenizer.pad_token = tokenizer.eos_token
+        # 以最低内存占用加载模型
         model = AutoModelForCausalLM.from_pretrained(
+            MODEL_NAME,
+            torch_dtype=torch.float16,  # 使用半精度减少内存
+            device_map=None,  # 不使用自动设备映射
+            low_cpu_mem_usage=True,  # 优化CPU内存使用
+            trust_remote_code=True
         )
         # 移动到CPU
+        model = model.to(DEVICE)
         model.eval()  # 设置为评估模式
+        logger.info("模型加载成功!")
+        return True
     except Exception as e:
+        logger.error(f"模型加载失败: {e}")
+        return False
+def generate_response(messages):
+    """极简响应生成"""
     if model is None or tokenizer is None:
+        return {"error": "模型未加载"}
     try:
         # 提取用户消息
         user_message = ""
         for msg in messages:
             if msg.get("role") == "user":
+                user_message = msg.get("content", "")
                 break
         if not user_message:
+            return {"error": "未找到用户消息"}
+        # 使用模型内置的聊天模板
+        text = tokenizer.apply_chat_template(
+            messages,
+            tokenize=False,
+            add_generation_prompt=True
+        )
         # 编码输入
+        inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=1024)
+        inputs = inputs.to(DEVICE)
         # 生成响应
         with torch.no_grad():
             outputs = model.generate(
                 **inputs,
+                max_new_tokens=MAX_TOKENS,
                 do_sample=True,
+                temperature=0.7,
+                top_p=0.9,
+                pad_token_id=tokenizer.eos_token_id
             )
         # 解码响应
+        response = tokenizer.decode(outputs[0][len(inputs[0]):], skip_special_tokens=True)
+        # 立即清理内存
+        del inputs, outputs
+        if torch.cuda.is_available():
+            torch.cuda.empty_cache()
+        gc.collect()
+        return {"content": response.strip()}
     except Exception as e:
+        logger.error(f"生成响应失败: {e}")
+        return {"error": f"生成失败: {str(e)}"}
+# 创建极简FastAPI应用
+app = FastAPI(title="Qwen1.5-0.5B API", version="1.0")
+# 启动时加载模型
+@app.on_event("startup")
+async def startup_event():
+    load_model()
+# 健康检查端点（OpenClaw可能需要）
 @app.get("/health")
 async def health_check():
     return {
+        "status": "healthy" if model is not None else "loading",
         "model_loaded": model is not None,
+        "timestamp": int(time.time())
     }
+# OpenAI兼容的聊天端点
 @app.post("/v1/chat/completions")
+async def chat_completion(request: Request):
+    """极简版OpenAI兼容端点"""
     try:
+        # 解析请求
+        data = await request.json()
+        messages = data.get("messages", [])
+        model_name = data.get("model", "qwen1.5-0.5b-chat")
+        # 生成响应
+        result = generate_response(messages)
+        if "error" in result:
+            return JSONResponse(
+                status_code=500,
+                content={
+                    "error": {
+                        "message": result["error"],
+                        "type": "internal_error"
+                    }
+                }
             )
+        # 返回OpenAI兼容格式
+        return {
+            "id": f"chatcmpl-{int(time.time())}",
             "object": "chat.completion",
             "created": int(time.time()),
+            "model": model_name,
             "choices": [{
                 "index": 0,
                 "message": {
+                    "role": "assistant",
+                    "content": result["content"]
                 },
                 "finish_reason": "stop"
             }],
             }
         }
     except Exception as e:
+        logger.error(f"API错误: {e}")
+        return JSONResponse(
+            status_code=500,
+            content={
+                "error": {
+                    "message": f"内部服务器错误: {str(e)}",
+                    "type": "internal_error"
+                }
+            }
+        )
+# 根端点
+@app.get("/")
+async def root():
+    return {
+        "message": "Qwen1.5-0.5B-Chat API服务运行中",
+        "model_loaded": model is not None,
+        "endpoint": "/v1/chat/completions"
+    }
 if __name__ == "__main__":
     import uvicorn
+    # 极简UVicorn配置
+    uvicorn.run(
+        app,
+        host="0.0.0.0",
         port=7860,
+        workers=1,  # 单worker减少内存占用
+        log_level="info"
+    )