File size: 2,646 Bytes
9cd8054
 
 
e38a095
9cd8054
 
 
 
 
e38a095
 
 
 
 
 
 
 
9cd8054
 
 
e38a095
 
9cd8054
e38a095
9cd8054
e38a095
9cd8054
 
 
 
 
 
 
55494c5
e38a095
9cd8054
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e38a095
9cd8054
 
e38a095
9cd8054
e38a095
9cd8054
7e1b878
e38a095
9cd8054
e38a095
9cd8054
e38a095
9cd8054
 
 
e38a095
9cd8054
e38a095
9cd8054
 
e38a095
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
import os
import torch
import gradio as gr
from transformers import AutoModel, AutoTokenizer, BitsAndBytesConfig

MODEL_ID = os.getenv("MODEL_ID", "elyza/ELYZA-Diffusion-Instruct-1.0-Dream-7B")

# Target device for model weights and for the tensors fed to generation.
DEVICE = "cpu"

# 4-bit NF4 quantization config for bitsandbytes.
# NOTE(review): bitsandbytes 4-bit loading has traditionally required a CUDA
# GPU; confirm the installed bitsandbytes version actually supports 4-bit
# inference on CPU before relying on DEVICE = "cpu" here.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,   # compute in fp16
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
)

print(f"Starting CPU quant Space: DEVICE={DEVICE}, MODEL_ID={MODEL_ID}")

# trust_remote_code is required: the diffusion-LM model class (and the
# diffusion_generate() method used below) ships with the model repo, not
# with the transformers library itself.
model = AutoModel.from_pretrained(
    MODEL_ID,
    quantization_config=bnb_config,
    device_map={"": DEVICE},
    trust_remote_code=True,
).eval()

tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)

@torch.no_grad()
def generate(prompt, steps, max_new_tokens, temperature, top_p, alg_temp):
    """Run one diffusion-based chat generation and return the decoded text.

    Returns only the newly generated continuation (prompt tokens stripped).
    An empty/blank prompt short-circuits to a Japanese "please enter a
    prompt" message without touching the model.
    """
    text = (prompt or "").strip()
    if not text:
        return "プロンプトを入力してください。"

    # Clamp the sampling knobs to the ranges the UI advertises.
    steps = min(max(int(steps), 4), 64)
    max_new_tokens = min(max(int(max_new_tokens), 16), 128)

    encoded = tokenizer.apply_chat_template(
        [{"role": "user", "content": text}],
        return_tensors="pt",
        return_dict=True,
        add_generation_prompt=True,
    )
    ids = encoded.input_ids.to(DEVICE)
    mask = encoded.attention_mask.to(DEVICE)

    result = model.diffusion_generate(
        ids,
        attention_mask=mask,
        steps=steps,
        max_new_tokens=max_new_tokens,
        temperature=float(temperature),
        top_p=float(top_p),
        alg="entropy",
        alg_temp=float(alg_temp),
    )

    # Decode only the continuation: everything after the prompt tokens.
    prompt_len = ids.size(1)
    return tokenizer.decode(result.sequences[0][prompt_len:], skip_special_tokens=True)

# --- Gradio UI -------------------------------------------------------------
with gr.Blocks() as demo:
    gr.Markdown("## ELYZA Diffusion LLM (CPU 4bit quant)")

    # Slider bounds mirror the clamping ranges enforced inside generate().
    prompt = gr.Textbox(label="Prompt", lines=6, value="拡散言語モデルについて教えて")
    with gr.Row():
        steps = gr.Slider(4, 64, value=16, step=1, label="steps")
        max_new_tokens = gr.Slider(16, 128, value=96, step=1, label="max_new_tokens")
    with gr.Row():
        temperature = gr.Slider(0.1, 1.5, value=0.8, step=0.05, label="temperature")
        top_p = gr.Slider(0.1, 1.0, value=0.95, step=0.01, label="top_p")
        alg_temp = gr.Slider(0.1, 1.5, value=0.8, step=0.05, label="alg_temp")

    run = gr.Button("Generate")
    out = gr.Textbox(label="Output", lines=14)
    run.click(generate, [prompt, steps, max_new_tokens, temperature, top_p, alg_temp], out)

# Bound the request queue so slow CPU generations don't pile up unbounded.
demo.queue(max_size=8)

if __name__ == "__main__":
    demo.launch()