File size: 3,939 Bytes
303e3b5
 
 
 
cc74715
303e3b5
 
191ba31
d13590e
303e3b5
cc74715
191ba31
303e3b5
cc74715
303e3b5
 
b9ed3f3
dd6166a
b9ed3f3
303e3b5
 
cc74715
3363a7d
303e3b5
cc74715
b9ed3f3
cc74715
b9ed3f3
cc74715
 
7026a44
cc74715
191ba31
 
 
 
 
d1a4213
191ba31
cc74715
 
 
 
 
 
 
191ba31
303e3b5
cc74715
7026a44
cc74715
b9ed3f3
cc74715
303e3b5
 
cc74715
303e3b5
 
cc74715
 
191ba31
303e3b5
 
cc74715
191ba31
cc74715
 
 
 
 
 
 
5f18925
cc74715
 
 
 
 
 
7026a44
cc74715
dd6166a
 
 
 
cc74715
 
 
5f18925
cc74715
dd6166a
 
 
 
5f18925
7026a44
 
cc74715
 
303e3b5
dd6166a
cc74715
dd6166a
 
c325527
303e3b5
cc74715
dd6166a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
cc74715
dd6166a
 
303e3b5
 
cc74715
dd6166a
 
cc74715
303e3b5
cc74715
 
dd6166a
 
cc74715
191ba31
 
303e3b5
cc74715
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
import gradio as gr
import torch
import time
import psutil

from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
from threading import Thread

# Hugging Face Hub repository id of the chat model used by this app.
MODEL_ID = "microsoft/Phi-4-mini-instruct"

# Loading runs at import time; announce which checkpoint is being fetched.
print(f"Loading {MODEL_ID}...")

# Keyword arguments forwarded to ``from_pretrained`` for the model weights.
# NOTE(review): trust_remote_code=True permits custom code from the model
# repository to be used while loading -- confirm it is still required.
_MODEL_LOAD_KWARGS = dict(
    device_map="cpu",
    torch_dtype="auto",
    trust_remote_code=True,
)

tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
model = AutoModelForCausalLM.from_pretrained(MODEL_ID, **_MODEL_LOAD_KWARGS)

def get_ram():
    """Return the available system memory as a string such as ``"3.42 GB"``.

    The value is ``psutil.virtual_memory().available`` divided by 1024**3
    and formatted with two decimal places.
    """
    available_bytes = psutil.virtual_memory().available
    available_gib = available_bytes / (1 << 30)  # 1 << 30 == 1024**3
    return "{:.2f} GB".format(available_gib)

def generate_reply(history, system_prompt, temp, top_p, max_tokens, rep_penalty):
    """Stream a model reply for a conversation, yielding partial text and stats.

    Args:
        history: Iterable of chat messages (dicts with ``"role"`` and
            ``"content"`` keys) placed after the optional system message and
            rendered through the tokenizer's chat template.
        system_prompt: Text of the system message; skipped when empty.
        temp: Sampling temperature. A value ``<= 0`` selects greedy decoding.
        top_p: Nucleus-sampling threshold, used only when sampling.
        max_tokens: Maximum number of new tokens to generate.
        rep_penalty: Repetition penalty forwarded to ``model.generate``.

    Yields:
        ``(output, stats)`` tuples, where ``output`` is the reply text
        accumulated so far and ``stats`` is a one-line throughput/RAM summary.
    """
    messages = []
    if system_prompt:
        messages.append({"role": "system", "content": system_prompt})
    messages.extend(history)

    # return_dict=True also yields the attention mask, so generate() does not
    # have to guess it from the input ids.
    inputs = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        return_tensors="pt",
        return_dict=True,
    ).to("cpu")

    streamer = TextIteratorStreamer(
        tokenizer,
        skip_prompt=True,
        skip_special_tokens=True,
    )

    sampling = temp > 0
    generation_kwargs = dict(
        **inputs,
        streamer=streamer,
        max_new_tokens=int(max_tokens),
        do_sample=sampling,
        repetition_penalty=float(rep_penalty),
    )
    if sampling:
        # Sampling-only flags are passed only when sampling is enabled; with
        # greedy decoding they are unused and temperature=0 is not a valid
        # sampling temperature.
        generation_kwargs["temperature"] = float(temp)
        generation_kwargs["top_p"] = float(top_p)

    def _run_generation():
        """Run generation; on failure, end the stream so the reader stops."""
        try:
            model.generate(**generation_kwargs)
        except Exception:
            # Without this the consumer loop below would block forever
            # waiting for text that will never arrive.
            streamer.end()
            raise

    thread = Thread(target=_run_generation)
    thread.start()

    output = ""
    chunks = 0
    start = time.perf_counter()  # monotonic clock for elapsed-time measurement

    for new_text in streamer:
        output += new_text
        # The streamer yields decoded text chunks, so this count (and the
        # displayed tok/s) is an approximation of the true token rate.
        chunks += 1
        elapsed = time.perf_counter() - start
        tps = chunks / elapsed if elapsed > 0 else 0

        stats = f"⚡ {tps:.2f} tok/s | RAM: {get_ram()}"
        yield output, stats

    # The stream has ended, so the worker is finishing; reap it.
    thread.join()


# Chat UI: a sidebar with generation settings, a chat window, and an input row.
with gr.Blocks(title="Phi-4 Mini Chat", fill_height=True) as demo:

    with gr.Sidebar():
        system_prompt = gr.Textbox(
            value="You are a helpful AI assistant.",
            label="System Prompt",
            lines=3,
        )

        temp = gr.Slider(0.0, 2.0, 0.7, step=0.1, label="Temperature")
        top_p = gr.Slider(0.0, 1.0, 0.9, step=0.05, label="Top-p")
        rep_penalty = gr.Slider(1.0, 2.0, 1.1, step=0.05, label="Repetition Penalty")
        max_tokens = gr.Slider(64, 1024, 256, step=64, label="Max Tokens")

        # Updated with throughput / RAM figures while a reply is streaming.
        stats_box = gr.Markdown("Ready")

    gr.Markdown("# 🤖 Phi-4 Mini")

    # No ``type`` argument is passed; the callbacks below treat the chat
    # history as a list of (user, bot) pairs.
    # NOTE(review): confirm the installed Gradio version uses pair-style
    # history by default.
    chatbot = gr.Chatbot(height=350)

    with gr.Row():
        user_input = gr.Textbox(placeholder="Type message...", scale=4)
        send_btn = gr.Button("Send", scale=1)

    def user_fn(msg, history):
        """Append the submitted message as a pending turn and clear the textbox.

        Returns the new textbox value (empty string) and the updated history,
        whose last entry is ``(msg, None)`` until the bot reply is generated.
        """
        history = history or []
        history.append((msg, None))
        return "", history

    def bot_fn(history, system_prompt, t, p, mt, rp):
        """Stream the assistant reply for the last (pending) turn in ``history``.

        Converts the pair-style history into role/content messages, runs
        ``generate_reply`` and yields ``(history, stats)`` after every chunk so
        the chat window and the stats box update while text is generated.
        """
        user_msg = history[-1][0]

        # Convert completed turns to the message format expected by the model.
        msg_history = []
        for u, b in history[:-1]:
            msg_history.append({"role": "user", "content": u})
            if b:
                msg_history.append({"role": "assistant", "content": b})
        # The pending user message must be part of the prompt; otherwise the
        # model never sees what the user just asked.
        msg_history.append({"role": "user", "content": user_msg})

        history[-1] = (user_msg, "")

        for text, stats in generate_reply(msg_history, system_prompt, t, p, mt, rp):
            history[-1] = (user_msg, text)
            yield history, stats

    # Pressing Enter in the textbox and clicking "Send" run the same pipeline:
    # record the user turn first, then stream the bot reply.
    for trigger in (user_input.submit, send_btn.click):
        trigger(user_fn, [user_input, chatbot], [user_input, chatbot]).then(
            bot_fn,
            [chatbot, system_prompt, temp, top_p, max_tokens, rep_penalty],
            [chatbot, stats_box],
        )

# Launch the Gradio app only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    demo.launch()