File size: 3,445 Bytes
702fae5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
from pydantic import BaseModel
from typing import List, Optional, Dict, Any
import time
import re

# Chat response models (OpenAI-compatible chat completion schema)
class ChatChoice(BaseModel):
    """One completion choice inside a chat completion response."""
    index: int  # position of this choice in the response's choices list
    message: Dict[str, str]  # {"role": ..., "content": ...}
    finish_reason: str  # e.g. "stop"


class ChatUsage(BaseModel):
    """Token accounting for a chat completion (OpenAI `usage` object)."""
    prompt_tokens: int  # tokens in the input messages
    completion_tokens: int  # tokens in the generated reply
    total_tokens: int  # prompt_tokens + completion_tokens


class ChatResponse(BaseModel):
    """Top-level OpenAI-compatible chat completion response."""
    id: str  # e.g. "chatcmpl-<timestamp>"
    object: str  # always "chat.completion" here
    created: int  # Unix timestamp (seconds)
    model: str  # model name echoed from the request
    choices: List[ChatChoice]
    usage: ChatUsage


def convert_json_format(input_data):
    """Convert HF pipeline chat output into a ``{"generations": ...}`` dict.

    Args:
        input_data: list of pipeline result items, each carrying a
            ``generated_text`` list of ``{"role", "content"}`` messages.

    Returns:
        ``{"generations": [[{"text": ..., "generationInfo": {"finish_reason": "stop"}}], ...]}``
        with one inner list per input item.  Any ``<think>...</think>``
        reasoning block is stripped from the assistant content.
    """
    output_generations = []
    for item in input_data:
        generated_text_list = item.get('generated_text', [])

        # Use the first assistant message; empty string if none present.
        assistant_content = ""
        for message in generated_text_list:
            if message.get('role') == 'assistant':
                assistant_content = message.get('content', '')
                break

        # Strip <think>...</think> reasoning blocks emitted by reasoning
        # models.  BUG FIX: the previous pattern was r'\s*', which removed
        # ALL whitespace from the reply (fusing words together) instead of
        # removing the tag span the comment promised.
        clean_content = re.sub(
            r'<think>.*?</think>\s*', '', assistant_content, flags=re.DOTALL
        ).strip()

        output_generations.append([
            {
                "text": clean_content,
                "generationInfo": {
                    "finish_reason": "stop"
                }
            }
        ])

    return {"generations": output_generations}


def create_chat_response(request: Any, pipe=None, tokenizer=None) -> ChatResponse:
    """Build an OpenAI-style chat completion response.

    When *pipe* (a HF text-generation pipeline) is available, the
    request's ``messages`` are run through it and the assistant reply is
    extracted; otherwise a placeholder "model initializing" message is
    returned.  Token counts come from *tokenizer* when provided, else a
    rough 4-characters-per-token estimate.
    """
    if pipe is None:
        # Pipeline not initialized yet — answer with a placeholder.
        completion_text = "模型正在初始化中,请稍后重试..."
    else:
        # Forward the caller's max_tokens; None lets the pipeline use
        # its own default generation length.
        raw_output = pipe(request.messages, max_new_tokens=request.max_tokens)
        converted = convert_json_format(raw_output)
        completion_text = converted["generations"][0][0]["text"]

    response_message = {
        "role": "assistant",
        "content": completion_text,
    }

    # Token accounting: exact counts when a tokenizer is available,
    # otherwise a crude character-length estimate.
    if tokenizer:
        prompt_tokens = sum(
            len(tokenizer.encode(msg.get("content", "")))
            for msg in request.messages
        )
        completion_tokens = len(tokenizer.encode(completion_text))
    else:
        total_chars = sum(len(msg.get("content", "")) for msg in request.messages)
        prompt_tokens = total_chars // 4
        completion_tokens = len(completion_text) // 4

    choice = ChatChoice(index=0, message=response_message, finish_reason="stop")
    usage = ChatUsage(
        prompt_tokens=prompt_tokens,
        completion_tokens=completion_tokens,
        total_tokens=prompt_tokens + completion_tokens,
    )
    return ChatResponse(
        id=f"chatcmpl-{int(time.time())}",
        object="chat.completion",
        created=int(time.time()),
        model=request.model,
        choices=[choice],
        usage=usage,
    )