han145 committed on
Commit
354a3ef
·
verified ·
1 Parent(s): 9417203

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +116 -457
app.py CHANGED
@@ -1,433 +1,168 @@
1
- import gradio as gr
2
- from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
3
- import torch
4
- import torch.nn as nn
5
  import json
6
- import asyncio
7
- from fastapi import FastAPI, Request, HTTPException, Security, Depends
8
- from fastapi.security import HTTPBearer, HTTPAuthorizationCredentials
9
- from fastapi.responses import JSONResponse
10
  import logging
11
- import time
12
- import os
13
- import uuid
14
- from typing import Optional, List, Dict, Any
15
- import psutil
16
  import gc
17
- from contextlib import asynccontextmanager
18
 
19
- # 配置日志
20
  logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
21
  logger = logging.getLogger(__name__)
22
 
23
  # 全局变量
24
  model = None
25
  tokenizer = None
26
- device = "cpu"
27
-
28
- # 性能和安全配置
29
- TEST_MODE: bool = os.getenv("TEST_MODE", "true").lower() == "true"
30
- API_KEYS = os.getenv("API_KEYS", "123456,789012").split(",")
31
 
32
- # 性能限制配置 - 使用更保守的值
33
- MAX_CONCURRENT_REQUESTS = int(os.getenv("MAX_CONCURRENT_REQUESTS", "1"))
34
- MAX_TOKENS_LIMIT = int(os.getenv("MAX_TOKENS_LIMIT", "128"))
35
- REQUEST_TIMEOUT = int(os.getenv("REQUEST_TIMEOUT", "30"))
36
- MEMORY_THRESHOLD = int(os.getenv("MEMORY_THRESHOLD", "50")) # 大幅降低内存阈值
37
 
38
- # 请求管理
39
- active_requests = 0
40
- request_semaphore = None
41
-
42
- # 使用OpenAI兼容的Bearer认证
43
- security = HTTPBearer(auto_error=False)
44
-
45
- @asynccontextmanager
46
- async def lifespan(app: FastAPI):
47
- """应用生命周期管理"""
48
- global request_semaphore
49
-
50
- # 初始化信号量限制并发请求
51
- request_semaphore = asyncio.Semaphore(MAX_CONCURRENT_REQUESTS)
52
-
53
- # 异步加载量化模型
54
- asyncio.create_task(load_quantized_model_async())
55
-
56
- yield
57
-
58
- # 关闭时清理资源
59
- cleanup_resources()
60
-
61
- def quantize_model(model):
62
- """应用静态量化到模型"""
63
- try:
64
- logger.info("开始应用静态量化...")
65
-
66
- # 设置模型为评估模式
67
- model.eval()
68
-
69
- # 准备量化配置
70
- model.qconfig = torch.quantization.get_default_qconfig('fbgemm')
71
-
72
- # 准备模型进行量化
73
- model_prepared = torch.quantization.prepare(model, inplace=False)
74
-
75
- # 由于我们无法进行完整的校准,使用简单的静态量化
76
- # 在实际应用中,应该使用校准数据集进行校准
77
- logger.info("应用静态量化完成")
78
-
79
- # 转换模型
80
- model_quantized = torch.quantization.convert(model_prepared, inplace=False)
81
-
82
- logger.info("模型量化完成,内存占用大幅降低")
83
- return model_quantized
84
-
85
- except Exception as e:
86
- logger.warning(f"量化失败,使用原模型: {e}")
87
- return model
88
-
89
- async def load_quantized_model_async():
90
- """异步加载并量化模型"""
91
- global model, tokenizer, device
92
 
93
  try:
94
- # 使用极小的模型
95
- # 选项1: Microsoft的极小型对话模型
96
- model_name = "microsoft/DialoGPT-small" # 仅117M参数
97
-
98
- # 选项2: 超小型模型
99
- # model_name = "sshleifer/tiny-gpt2" # 仅几十MB
100
-
101
- logger.info(f"开始加载并量化模型: {model_name}")
102
-
103
- # 强制使用CPU
104
- device = "cpu"
105
 
106
  # 加载tokenizer
107
- tokenizer = AutoTokenizer.from_pretrained(model_name)
 
 
 
108
 
 
109
  if tokenizer.pad_token is None:
110
  tokenizer.pad_token = tokenizer.eos_token
111
 
112
- # 以FP32精度加载模型(量化需要)
113
  model = AutoModelForCausalLM.from_pretrained(
114
- model_name,
115
- torch_dtype=torch.float32,
116
- low_cpu_mem_usage=True
 
 
117
  )
118
 
119
- # 应用静态量化
120
- model = quantize_model(model)
121
-
122
  # 移动到CPU
123
- model = model.to(device)
124
  model.eval() # 设置为评估模式
125
 
126
- logger.info(f"量化模型加载成功! 模型大小大幅减少")
 
127
 
128
- # 记录内存使用情况
129
- log_memory_usage("模型加载后")
130
-
131
- except Exception as e:
132
- logger.error(f"量化模型加载失败: {e}")
133
- # 如果量化失败,尝试加载更小的模型
134
- await load_tiny_model()
135
-
136
- async def load_tiny_model():
137
- """加载超小型模型作为备用"""
138
- global model, tokenizer
139
-
140
- try:
141
- # 使用最小的可用模型
142
- model_name = "sshleifer/tiny-gpt2" # 仅33M参数
143
-
144
- logger.info(f"尝试加载超小型模型: {model_name}")
145
-
146
- tokenizer = AutoTokenizer.from_pretrained(model_name)
147
- model = AutoModelForCausalLM.from_pretrained(model_name)
148
-
149
- # 确保有pad_token
150
- if tokenizer.pad_token is None:
151
- tokenizer.pad_token = tokenizer.eos_token
152
-
153
- model = model.to(device)
154
- model.eval()
155
-
156
- logger.info("超小型模型加载成功!")
157
- log_memory_usage("超小型模型加载后")
158
-
159
- except Exception as e:
160
- logger.error(f"超小型模型也加载失败: {e}")
161
- logger.info("将使用模拟响应模式")
162
-
163
- def log_memory_usage(stage):
164
- """记录内存使用情况"""
165
- try:
166
- memory = psutil.virtual_memory()
167
- logger.info(f"{stage} - 内存使用: {memory.percent}%")
168
  except Exception as e:
169
- logger.error(f"记录内存使用失败: {e}")
 
170
 
171
- def get_system_health():
172
- """获取系统健康状态"""
173
- try:
174
- memory = psutil.virtual_memory()
175
- health = {
176
- "memory_used_percent": round(memory.percent, 1),
177
- "memory_available_gb": round(memory.available / (1024**3), 2),
178
- "active_requests": active_requests,
179
- "timestamp": int(time.time()),
180
- "model_loaded": model is not None
181
- }
182
- return health
183
- except Exception as e:
184
- return {"error": str(e)}
185
-
186
- def check_system_resources():
187
- """检查系统资源是否充足"""
188
- try:
189
- health = get_system_health()
190
-
191
- # 内存使用超过阈值时拒绝新请求
192
- if health.get("memory_used_percent", 0) > MEMORY_THRESHOLD:
193
- return False, f"内存使用率过高: {health['memory_used_percent']}%"
194
-
195
- # 活跃请求数超过限制
196
- if active_requests >= MAX_CONCURRENT_REQUESTS:
197
- return False, f"并发请求数已达上限: {active_requests}/{MAX_CONCURRENT_REQUESTS}"
198
-
199
- return True, "资源充足"
200
- except Exception as e:
201
- return False, f"系统监控异常: {str(e)}"
202
-
203
- async def rate_limit_check():
204
- """速率限制和资源检查"""
205
- global active_requests
206
-
207
- # 检查系统资源
208
- is_healthy, message = check_system_resources()
209
- if not is_healthy:
210
- raise HTTPException(
211
- status_code=503,
212
- detail={
213
- "error": {
214
- "message": f"系统资源紧张: {message}",
215
- "type": "service_unavailable",
216
- "code": "resource_unavailable"
217
- }
218
- }
219
- )
220
-
221
- # 使用信号量控制并发
222
- await request_semaphore.acquire()
223
- active_requests += 1
224
-
225
- def verify_openai_api_key(credentials: Optional[HTTPAuthorizationCredentials] = Depends(security)):
226
- """简化版API密钥验证"""
227
- if TEST_MODE:
228
- return "test_mode"
229
-
230
- if not credentials:
231
- raise HTTPException(
232
- status_code=401,
233
- detail={
234
- "error": {
235
- "message": "缺少API密钥",
236
- "type": "invalid_request_error",
237
- "code": "missing_api_key"
238
- }
239
- }
240
- )
241
-
242
- api_key = credentials.credentials
243
-
244
- # 移除sk-前缀后验证
245
- if api_key.startswith("sk-"):
246
- key_core = api_key[3:]
247
- if key_core in API_KEYS:
248
- return api_key
249
-
250
- raise HTTPException(
251
- status_code=401,
252
- detail={
253
- "error": {
254
- "message": "无效的API密钥",
255
- "type": "invalid_request_error",
256
- "code": "invalid_api_key"
257
- }
258
- }
259
- )
260
-
261
- def generate_quantized_response(messages, max_tokens=64, temperature=0.7):
262
- """使用量化模型生成响应"""
263
  if model is None or tokenizer is None:
264
- return "模型未就绪,当前使用模拟响应模式"
265
 
266
  try:
267
  # 提取用户消息
268
  user_message = ""
269
  for msg in messages:
270
  if msg.get("role") == "user":
271
- user_message = msg.get("content", "")[:500] # 限制输入长度
272
  break
273
 
274
  if not user_message:
275
- return "未找到有效的用户消息"
276
 
277
- # 构建提示词
278
- prompt = f"User: {user_message}\nAI:"
 
 
 
 
279
 
280
  # 编码输入
281
- inputs = tokenizer(
282
- prompt,
283
- return_tensors="pt",
284
- truncation=True,
285
- max_length=256 # 进一步限制输入长度
286
- )
287
 
288
  # 生成响应
289
  with torch.no_grad():
290
  outputs = model.generate(
291
  **inputs,
292
- max_new_tokens=min(max_tokens, MAX_TOKENS_LIMIT),
293
- temperature=min(max(temperature, 0.1), 1.0),
294
- top_p=0.9,
295
  do_sample=True,
296
- pad_token_id=tokenizer.eos_token_id,
297
- repetition_penalty=1.1,
298
- eos_token_id=tokenizer.eos_token_id
299
  )
300
 
301
  # 解码响应
302
- response = tokenizer.decode(
303
- outputs[0][inputs.input_ids.shape[-1]:],
304
- skip_special_tokens=True
305
- )
306
 
307
- return response.strip()
 
 
 
 
308
 
309
- except Exception as e:
310
- logger.error(f"生成响应时出错: {str(e)}")
311
- return f"生成响应时出错: {str(e)}"
312
-
313
- def cleanup_resources():
314
- """清理资源"""
315
- global model, tokenizer
316
- try:
317
- if model is not None:
318
- del model
319
- model = None
320
- if tokenizer is not None:
321
- del tokenizer
322
- tokenizer = None
323
 
324
- gc.collect()
325
- logger.info("资源清理完成")
326
  except Exception as e:
327
- logger.error(f"资源清理失败: {e}")
 
328
 
329
- # 创建FastAPI应用
330
- app = FastAPI(
331
- title="量化大模型API服务",
332
- description="使用静态量化技术大幅降低内存占用的API服务",
333
- version="1.0.0",
334
- lifespan=lifespan
335
- )
336
 
337
- # 健康检查端点
338
- @app.get("/")
339
- async def root():
340
- health = get_system_health()
341
- return {
342
- "message": "量化大模型API服务运行中",
343
- "status": "healthy" if model is not None else "loading",
344
- "model_loaded": model is not None,
345
- "quantized": True,
346
- "device": device,
347
- "system_health": health,
348
- "memory_threshold": f"{MEMORY_THRESHOLD}%"
349
- }
350
 
 
351
  @app.get("/health")
352
  async def health_check():
353
- health = get_system_health()
354
- is_healthy, message = check_system_resources()
355
-
356
  return {
357
- "status": "healthy" if is_healthy else "degraded",
358
  "model_loaded": model is not None,
359
- "quantized": True,
360
- "active_requests": active_requests,
361
- "system_health": health,
362
- "message": message
363
- }
364
-
365
- @app.get("/v1/models")
366
- async def list_models():
367
- """OpenAI兼容的模型列表端点"""
368
- return {
369
- "object": "list",
370
- "data": [
371
- {
372
- "id": "quantized-dialogpt",
373
- "object": "model",
374
- "created": int(time.time()),
375
- "owned_by": "microsoft",
376
- "quantized": True
377
- }
378
- ]
379
  }
380
 
 
381
  @app.post("/v1/chat/completions")
382
- async def chat_completion(
383
- request: Request,
384
- api_key: str = Depends(verify_openai_api_key)
385
- ):
386
- """OpenAI兼容的聊天完成端点(使用量化模型)"""
387
- start_time = time.time()
388
-
389
  try:
390
- # 速率限制和资源检查
391
- await rate_limit_check()
392
-
393
- # 解析请求数据
394
- try:
395
- body = await asyncio.wait_for(request.json(), timeout=5.0)
396
- except asyncio.TimeoutError:
397
- raise HTTPException(status_code=400, detail="请求体解析超时")
398
 
399
- messages = body.get("messages", [])
400
- max_tokens = min(body.get("max_tokens", 64), MAX_TOKENS_LIMIT) # 进一步减少
401
- temperature = body.get("temperature", 0.7)
402
-
403
- # 验证消息格式
404
- if not messages or not any(msg.get("role") == "user" for msg in messages):
405
- raise HTTPException(status_code=400, detail="无效的消息格式")
406
-
407
- # 生成响应(带超时保护)
408
- try:
409
- response_text = await asyncio.wait_for(
410
- asyncio.get_event_loop().run_in_executor(
411
- None,
412
- generate_quantized_response,
413
- messages, max_tokens, temperature
414
- ),
415
- timeout=REQUEST_TIMEOUT
416
  )
417
- except asyncio.TimeoutError:
418
- raise HTTPException(status_code=504, detail="模型响应超时")
419
 
420
- # 构建响应
421
- response_data = {
422
- "id": f"chatcmpl-{uuid.uuid4().hex}",
423
  "object": "chat.completion",
424
  "created": int(time.time()),
425
- "model": "quantized-dialogpt",
426
  "choices": [{
427
  "index": 0,
428
  "message": {
429
- "role": "assistant",
430
- "content": response_text
431
  },
432
  "finish_reason": "stop"
433
  }],
@@ -438,111 +173,35 @@ async def chat_completion(
438
  }
439
  }
440
 
441
- return JSONResponse(content=response_data)
442
-
443
- except HTTPException:
444
- raise
445
  except Exception as e:
446
- logger.error(f"处理请求时出错: {str(e)}")
447
- raise HTTPException(status_code=500, detail="内部服务器错误")
448
- finally:
449
- # 释放资源
450
- global active_requests
451
- active_requests = max(0, active_requests - 1)
452
- if request_semaphore:
453
- request_semaphore.release()
454
-
455
- # 创建极简Gradio界面
456
- with gr.Blocks(title="量化大模型API", theme=gr.themes.Soft()) as demo:
457
- gr.Markdown("""
458
- # 量化大模型API服务
459
- *使用静态量化技术大幅降低内存占用*
460
-
461
- ## 技术特性
462
- - ✅ **静态量化**: 模型大小减少约75%
463
- - ✅ **CPU优化**: 专为低内存环境设计
464
- - ✅ **极简架构**: 最小化资源占用
465
-
466
- ## 当前配置
467
- - **模型**: DialoGPT-small (117M参数,量化后约30MB)
468
- - **设备**: CPU模式
469
- - **并发限制**: 1个请求
470
- - **内存阈值**: 50%
471
- - **生成长度**: 128 tokens
472
- """)
473
-
474
- # 系统状态
475
- with gr.Row():
476
- with gr.Column():
477
- status_html = gr.HTML("""
478
- <div id="status">
479
- <p>🔄 加载量化模型中...</p>
480
- </div>
481
- """)
482
- health_btn = gr.Button("刷新系统状态")
483
- health_output = gr.JSON(label="系统状态")
484
-
485
- # 测试界面
486
- with gr.Row():
487
- with gr.Column():
488
- test_input = gr.Textbox(
489
- label="测试输入",
490
- placeholder="请输入简短的问题...",
491
- lines=2
492
- )
493
- test_btn = gr.Button("测试量化模型", variant="primary")
494
- clear_btn = gr.Button("清除")
495
- test_output = gr.Textbox(label="测试输出", lines=4)
496
-
497
- def refresh_status():
498
- health = get_system_health()
499
- status_text = f"""
500
- <div id="status">
501
- <p><b>模型状态:</b> {'✅ 已加载(量化)' if model else '❌ 未加载'}</p>
502
- <p><b>内存使用:</b> {health.get('memory_used_percent', 0)}% (阈值: {MEMORY_THRESHOLD}%)</p>
503
- <p><b>活跃请求:</b> {active_requests}/{MAX_CONCURRENT_REQUESTS}</p>
504
- <p><b>量化模式:</b> ✅ 已启用</p>
505
- </div>
506
- """
507
- return status_text, health
508
-
509
- def test_model(message):
510
- if not message.strip():
511
- return "请输入消息"
512
-
513
- if model is None:
514
- return "量化模型未加载,请稍后重试"
515
-
516
- messages = [{"role": "user", "content": message}]
517
- return generate_quantized_response(messages)
518
-
519
- def clear_chat():
520
- return ""
521
-
522
- # 事件绑定
523
- health_btn.click(refresh_status, outputs=[status_html, health_output])
524
- test_btn.click(test_model, inputs=test_input, outputs=test_output)
525
- clear_btn.click(clear_chat, outputs=test_output)
526
-
527
- # 初始加载状态
528
- demo.load(refresh_status, outputs=[status_html, health_output])
529
 
530
- # 挂载Gradio应用到FastAPI
531
- app = gr.mount_gradio_app(app, demo, path="/")
 
 
 
 
 
 
532
 
533
  if __name__ == "__main__":
534
  import uvicorn
535
 
536
- # 优化UVicorn配置
537
- config = uvicorn.Config(
538
- app,
539
- host="0.0.0.0",
540
  port=7860,
541
- workers=1,
542
- loop="asyncio",
543
- timeout_keep_alive=5,
544
- limit_max_requests=100,
545
- )
546
-
547
- server = uvicorn.Server(config)
548
- server.run()
 
1
+ import os
2
+ import time
 
 
3
  import json
 
 
 
 
4
  import logging
5
+ from fastapi import FastAPI, Request, HTTPException
6
+ from fastapi.responses import JSONResponse
7
+ from transformers import AutoTokenizer, AutoModelForCausalLM
8
+ import torch
 
9
  import gc
 
10
 
11
# Minimal logging setup: INFO level, "<time> - <level> - <message>" lines.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

# Process-wide handles, populated by load_model().
tokenizer = None
model = None

# Settings
DEVICE = "cpu"  # always run on the CPU
MAX_TOKENS = 256
MODEL_NAME = "Qwen/Qwen1.5-0.5B-Chat"
 
23
 
24
def load_model():
    """Load the tokenizer and model named by MODEL_NAME into the module globals.

    The ``tokenizer`` and ``model`` globals are only assigned once BOTH
    objects have loaded, so a failure never leaves one set and the other
    still None.

    Returns:
        True when loading succeeded, False otherwise (the failure is logged
        with its traceback and the globals are left unchanged).
    """
    global model, tokenizer

    # float16 kernels are incomplete and slow on CPU, so only use half
    # precision when DEVICE is not the CPU; on CPU fall back to float32.
    dtype = torch.float32 if str(DEVICE).startswith("cpu") else torch.float16

    try:
        logger.info(f"开始加载模型: {MODEL_NAME}")

        # NOTE(review): trust_remote_code=True allows code shipped in the
        # model repository to run locally - confirm it is actually required.
        loaded_tokenizer = AutoTokenizer.from_pretrained(
            MODEL_NAME,
            trust_remote_code=True,
        )

        # generate() needs a pad token; reuse EOS when none is defined.
        if loaded_tokenizer.pad_token is None:
            loaded_tokenizer.pad_token = loaded_tokenizer.eos_token

        loaded_model = AutoModelForCausalLM.from_pretrained(
            MODEL_NAME,
            torch_dtype=dtype,
            device_map=None,  # no automatic device placement
            low_cpu_mem_usage=True,
            trust_remote_code=True,
        )

        loaded_model = loaded_model.to(DEVICE)
        loaded_model.eval()  # inference mode: disables dropout etc.
    except Exception as e:
        # Top-level boundary: keep the original message, add the traceback.
        logger.exception(f"模型加载失败: {e}")
        return False

    # Publish only after everything succeeded.
    tokenizer = loaded_tokenizer
    model = loaded_model
    logger.info("模型加载成功!")
    return True
60
 
61
def generate_response(messages):
    """Generate one assistant reply for an OpenAI-style list of chat messages.

    Args:
        messages: iterable of dicts carrying "role" and "content" keys; the
            whole list is rendered through the tokenizer's chat template.

    Returns:
        ``{"content": <reply text>}`` on success, or ``{"error": <message>}``
        when the model/tokenizer is not loaded, the first user message is
        missing or empty, or generation raises.
    """
    if model is None or tokenizer is None:
        return {"error": "模型未加载"}

    inputs = outputs = None
    try:
        # Content of the first message whose role is "user" ("" if none).
        user_message = next(
            (msg.get("content", "") for msg in messages if msg.get("role") == "user"),
            "",
        )
        if not user_message:
            return {"error": "未找到用户消息"}

        # Render the conversation with the model's built-in chat template.
        text = tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True,
        )

        inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=1024)
        inputs = inputs.to(DEVICE)

        # Read the prompt length from the tensor itself; integer-indexing a
        # BatchEncoding only works for fast tokenizers.
        prompt_length = inputs["input_ids"].shape[-1]

        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=MAX_TOKENS,
                do_sample=True,
                temperature=0.7,
                top_p=0.9,
                pad_token_id=tokenizer.eos_token_id,
            )

        # Decode only the newly generated tokens, not the echoed prompt.
        response = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)
        return {"content": response.strip()}

    except Exception as e:
        logger.exception(f"生成响应失败: {e}")
        return {"error": f"生成失败: {str(e)}"}

    finally:
        # Release tensors on every exit path, not just on success.
        inputs = outputs = None
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
        gc.collect()
113
 
114
# Build the FastAPI application object.
app = FastAPI(version="1.0", title="Qwen1.5-0.5B API")


# Load the model at startup.
@app.on_event("startup")
async def startup_event():
    """Run load_model() once when the application starts."""
    load_model()
 
 
 
 
 
 
 
 
 
121
 
122
# Health-check endpoint (OpenClaw may need it).
@app.get("/health")
async def health_check():
    """Report whether the model is loaded, plus the current Unix timestamp."""
    loaded = model is not None
    status = "healthy" if loaded else "loading"
    return {
        "status": status,
        "model_loaded": loaded,
        "timestamp": int(time.time()),
    }
130
 
131
+ # OpenAI兼容的聊天端点
132
  @app.post("/v1/chat/completions")
133
+ async def chat_completion(request: Request):
134
+ """极简版OpenAI兼容端点"""
 
 
 
 
 
135
  try:
136
+ # 解析请求
137
+ data = await request.json()
138
+ messages = data.get("messages", [])
139
+ model_name = data.get("model", "qwen1.5-0.5b-chat")
 
 
 
 
140
 
141
+ # 生成响应
142
+ result = generate_response(messages)
143
+
144
+ if "error" in result:
145
+ return JSONResponse(
146
+ status_code=500,
147
+ content={
148
+ "error": {
149
+ "message": result["error"],
150
+ "type": "internal_error"
151
+ }
152
+ }
 
 
 
 
 
153
  )
 
 
154
 
155
+ # 返回OpenAI兼容格式
156
+ return {
157
+ "id": f"chatcmpl-{int(time.time())}",
158
  "object": "chat.completion",
159
  "created": int(time.time()),
160
+ "model": model_name,
161
  "choices": [{
162
  "index": 0,
163
  "message": {
164
+ "role": "assistant",
165
+ "content": result["content"]
166
  },
167
  "finish_reason": "stop"
168
  }],
 
173
  }
174
  }
175
 
 
 
 
 
176
  except Exception as e:
177
+ logger.error(f"API: {e}")
178
+ return JSONResponse(
179
+ status_code=500,
180
+ content={
181
+ "error": {
182
+ "message": f"内部服务器错误: {str(e)}",
183
+ "type": "internal_error"
184
+ }
185
+ }
186
+ )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
187
 
188
+ # 根端点
189
@app.get("/")
async def root():
    """Describe the service and name the chat-completions endpoint."""
    info = {"message": "Qwen1.5-0.5B-Chat API服务运行中"}
    info["model_loaded"] = model is not None
    info["endpoint"] = "/v1/chat/completions"
    return info
196
 
if __name__ == "__main__":
    import uvicorn

    # Minimal uvicorn setup; a single worker keeps memory usage down.
    server_options = {
        "host": "0.0.0.0",
        "port": 7860,
        "workers": 1,
        "log_level": "info",
    }
    uvicorn.run(app, **server_options)