import torch
import gradio as gr
from threading import Thread
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
from utils import EN_US

ZH2EN = {
    "有算力的可自行克隆至本地或复刻至购买了 GPU 环境的账号测试": "If you have computing power, you can test by cloning to local or forking to an account with purchased GPU environment",
    "⚙️ 参数设置": "⚙️ Parameters",
    "系统提示词": "System prompt",
    "最大 token 数": "Max new tokens",
    "温度参数": "Temperature",
    "Top-K 采样": "Top K sampling",
    "Top-P 采样": "Top P sampling",
    "重复性惩罚": "Repetition penalty",
}


def _L(zh_txt: str):
    # Return the English translation of a UI string when the interface runs in English
    # mode, otherwise the Chinese original
    # (e.g. _L("⚙️ 参数设置") -> "⚙️ Parameters" when EN_US is set).
    return ZH2EN[zh_txt] if EN_US else zh_txt


MODEL_ID = "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B"
MODEL_NAME = MODEL_ID.split("/")[-1]
CONTEXT_LENGTH = 16000
DESCRIPTION = (
    f"This is a Hugging Face deployment instance of the {MODEL_NAME} model. If you have the compute, you can clone it locally or fork it to an account with a paid GPU environment to test it yourself."
    if EN_US
    else f"当前仅提供 {MODEL_NAME} 模型的 ModelScope 版部署实例,有算力的可自行克隆至本地或复刻至购买了 GPU 环境的账号测试"
)

# Use the GPU when available; on CPU the model is never loaded and predict() just
# reports an error instead of generating.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if device == torch.device("cuda"):
    if EN_US:
        import huggingface_hub

        # English deployment: fetch the weights from the Hugging Face Hub.
        MODEL_DIR = huggingface_hub.snapshot_download(
            MODEL_ID, cache_dir="./__pycache__"
        )

    else:
        import modelscope

        # Chinese deployment: fetch the weights from ModelScope.
        MODEL_DIR = modelscope.snapshot_download(MODEL_ID, cache_dir="./__pycache__")

    tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR)
    model = AutoModelForCausalLM.from_pretrained(MODEL_DIR, device_map="auto")


def predict(msg, history, prompt, temper, max_tokens, top_k, repeat_penalty, top_p):
    # Format the running chat history with the ChatML template the model expects.
    stop_tokens = ["<|endoftext|>", "<|im_end|>", "|im_end|"]
    instruction = "<|im_start|>system\n" + prompt + "\n<|im_end|>\n"
    for user, assistant in history:
        instruction += f"<|im_start|>user\n{user}\n<|im_end|>\n<|im_start|>assistant\n{assistant}\n<|im_end|>\n"

    instruction += f"<|im_start|>user\n{msg}\n<|im_end|>\n<|im_start|>assistant\n"
    try:
        # Without a GPU the model was never loaded, so surface a hint instead of generating.
        if device == torch.device("cpu"):
            raise EnvironmentError(
                _L("有算力的可自行克隆至本地或复刻至购买了 GPU 环境的账号测试")
            )

        streamer = TextIteratorStreamer(
            tokenizer,
            skip_prompt=True,
            skip_special_tokens=True,
        )
        enc = tokenizer(instruction, return_tensors="pt", padding=True, truncation=True)
        input_ids, attention_mask = enc.input_ids, enc.attention_mask
        # Keep only the most recent CONTEXT_LENGTH tokens so long chats still fit the context window.
        if input_ids.shape[1] > CONTEXT_LENGTH:
            input_ids = input_ids[:, -CONTEXT_LENGTH:]
            attention_mask = attention_mask[:, -CONTEXT_LENGTH:]

        # All sampling knobs (temperature, top-k/top-p, repetition penalty) come from the UI sliders.
        generate_kwargs = dict(
            input_ids=input_ids.to(device),
            attention_mask=attention_mask.to(device),
            streamer=streamer,
            do_sample=True,
            temperature=temper,
            max_new_tokens=max_tokens,
            top_k=top_k,
            repetition_penalty=repeat_penalty,
            top_p=top_p,
        )
        # Run generation in a background thread; the streamer yields tokens as they are produced.
        t = Thread(target=model.generate, kwargs=generate_kwargs)
        t.start()

    except Exception as e:
        # Fall back to "streaming" the error message itself (iterating a string yields characters).
        streamer = f"{e}"

    # Relay tokens to the UI as they arrive, stopping early on any stop token.
    outputs = []
    for new_token in streamer:
        outputs.append(new_token)
        if new_token in stop_tokens:
            break

        yield "".join(outputs)


def DeepSeek_R1_Qwen_7B():
    # Build the parameter accordion and the chat interface for this model tab.
    with gr.Accordion(label=_L("⚙️ 参数设置"), open=False) as ds_acc:
        prompt = gr.Textbox(
            "You are a helpful assistant. First recognize the user's request, then think and reply carefully.",
            label=_L("系统提示词"),
        )
        # Lower bounds avoid invalid sampling settings: temperature, max_new_tokens and
        # repetition_penalty must all be strictly positive.
        temper = gr.Slider(0.01, 1, 0.6, label=_L("温度参数"))
        maxtoken = gr.Slider(1, 32000, 10000, label=_L("最大 token 数"))
        topk = gr.Slider(1, 80, 40, label=_L("Top-K 采样"))
        repet = gr.Slider(1, 2, 1.1, label=_L("重复性惩罚"))
        topp = gr.Slider(0, 1, 0.95, label=_L("Top-P 采样"))

    return gr.ChatInterface(
        predict,
        description=DESCRIPTION,
        additional_inputs_accordion=ds_acc,
        additional_inputs=[prompt, temper, maxtoken, topk, repet, topp],
    ).queue()
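

# Usage sketch (an assumption, not part of the original Space, which imports this
# module as one tab of a larger app): DeepSeek_R1_Qwen_7B() returns a queued
# gr.ChatInterface, so it can also be served standalone for local testing.
if __name__ == "__main__":
    DeepSeek_R1_Qwen_7B().launch()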