| import gradio as gr |
| import torch |
| import time |
| import psutil |
|
|
| from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer |
| from threading import Thread |
|
|
# Hugging Face model id served by this app; loaded once at import time.
MODEL_ID = "microsoft/Phi-4-mini-instruct"


print(f"Loading {MODEL_ID}...")


# Tokenizer provides the chat template applied in generate_reply().
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)


# CPU-only load; dtype is taken from the checkpoint ("auto").
# NOTE: trust_remote_code executes code shipped with the checkpoint — this is
# only acceptable because MODEL_ID is a fixed, trusted constant above.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    device_map="cpu",
    torch_dtype="auto",
    trust_remote_code=True
)
|
|
def get_ram():
    """Return currently-available system RAM as a string like '3.41 GB'."""
    available_bytes = psutil.virtual_memory().available
    gib = available_bytes / (1024 ** 3)
    return f"{gib:.2f} GB"
|
|
def generate_reply(history, system_prompt, temp, top_p, max_tokens, rep_penalty):
    """Stream a chat completion, yielding (partial_text, stats) tuples.

    Args:
        history: list of {"role": ..., "content": ...} message dicts
            (no system entry; that is added here).
        system_prompt: optional system message, prepended when non-empty.
        temp: sampling temperature; <= 0 switches to greedy decoding.
        top_p: nucleus-sampling cutoff (used only when sampling).
        max_tokens: cap on newly generated tokens.
        rep_penalty: repetition penalty passed to model.generate.

    Yields:
        (accumulated_output, stats_markdown) after every streamed chunk.
    """
    messages = []
    if system_prompt:
        messages.append({"role": "system", "content": system_prompt})
    messages.extend(history)

    input_ids = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        return_tensors="pt",
    ).to("cpu")

    # skip_prompt keeps the echoed input out of the stream.
    streamer = TextIteratorStreamer(
        tokenizer,
        skip_prompt=True,
        skip_special_tokens=True,
    )

    do_sample = temp > 0
    generation_kwargs = dict(
        input_ids=input_ids,
        streamer=streamer,
        max_new_tokens=int(max_tokens),
        do_sample=do_sample,
        repetition_penalty=float(rep_penalty),
    )
    # FIX: only pass sampling knobs when actually sampling. The original
    # always passed temperature/top_p, so temp == 0 sent temperature=0.0
    # with do_sample=False, which transformers' GenerationConfig validation
    # warns about (and rejects in recent versions).
    if do_sample:
        generation_kwargs["temperature"] = float(temp)
        generation_kwargs["top_p"] = float(top_p)

    thread = Thread(target=model.generate, kwargs=generation_kwargs)
    thread.start()

    output = ""
    start = time.time()
    chunks = 0  # streamer chunks; roughly one per token, so used as a tok/s proxy

    try:
        for new_text in streamer:
            output += new_text
            chunks += 1
            elapsed = time.time() - start
            tps = chunks / elapsed if elapsed > 0 else 0

            # NOTE(review): "β‘" looks like a mojibake'd lightning emoji —
            # literal kept as-is; confirm the intended glyph.
            stats = f"β‘ {tps:.2f} tok/s | RAM: {get_ram()}"
            yield output, stats
    finally:
        # FIX: reap the generation thread even if the consumer stops
        # iterating early; the original never joined it.
        thread.join()
|
|
|
|
# ---- UI definition -------------------------------------------------------
# Sidebar holds the system prompt and generation knobs; the main column holds
# the title, the chat transcript, and the input row.
with gr.Blocks(title="Phi-4 Mini Chat", fill_height=True) as demo:

    with gr.Sidebar():
        # Prepended as the "system" message of every request (when non-empty).
        system_prompt = gr.Textbox(
            value="You are a helpful AI assistant.",
            label="System Prompt",
            lines=3
        )

        # Generation hyper-parameters (Slider positional args: min, max, default).
        temp = gr.Slider(0.0, 2.0, 0.7, step=0.1, label="Temperature")
        top_p = gr.Slider(0.0, 1.0, 0.9, step=0.05, label="Top-p")
        rep_penalty = gr.Slider(1.0, 2.0, 1.1, step=0.05, label="Repetition Penalty")
        max_tokens = gr.Slider(64, 1024, 256, step=64, label="Max Tokens")

        # Live throughput / free-RAM readout, overwritten while streaming.
        stats_box = gr.Markdown("Ready")

    # NOTE(review): "π€" looks like a mojibake'd robot emoji — literal kept
    # as-is; confirm the intended glyph.
    gr.Markdown("# π€ Phi-4 Mini")

    # Transcript in (user, bot) tuple format — must match what user_fn/bot_fn
    # append below.
    chatbot = gr.Chatbot(height=350)

    with gr.Row():
        user_input = gr.Textbox(placeholder="Type message...", scale=4)
        send_btn = gr.Button("Send", scale=1)
def user_fn(msg, history):
    """Stage the submitted message: clear the textbox and append a
    (msg, None) turn whose bot side is filled in later by bot_fn."""
    turns = history or []
    turns.append((msg, None))
    return "", turns
|
|
def bot_fn(history, system_prompt, t, p, mt, rp):
    """Stream the assistant's reply into the last chat turn.

    Converts the chatbot's (user, bot) tuple history into role/content
    message dicts, streams tokens from generate_reply, and yields the
    updated history plus a stats string after each chunk.
    """
    # FIX: guard empty history — the original raised IndexError on
    # history[-1] if the event fired with no staged message.
    if not history:
        return

    user_msg = history[-1][0]

    # Rebuild the conversation as chat-template messages. Completed turns
    # contribute both sides; the final (pending) turn contributes only the
    # user message.
    msg_history = []
    for u, b in history[:-1]:
        msg_history.append({"role": "user", "content": u})
        if b:
            msg_history.append({"role": "assistant", "content": b})
    # BUG FIX: the original built msg_history from history[:-1] only and
    # never added the current message, so the model never saw the latest
    # user question.
    msg_history.append({"role": "user", "content": user_msg})

    generator = generate_reply(
        msg_history,
        system_prompt,
        t,
        p,
        mt,
        rp
    )

    # Seed the pending turn with an empty reply, then grow it per chunk.
    history[-1] = (user_msg, "")

    for text, stats in generator:
        history[-1] = (user_msg, text)
        yield history, stats
|
|
# Register the identical two-step pipeline for both triggers (pressing
# Enter in the textbox and clicking Send): stage the user message first,
# then stream the bot reply into the transcript.
for _trigger in (user_input.submit, send_btn.click):
    _trigger(
        user_fn,
        [user_input, chatbot],
        [user_input, chatbot],
    ).then(
        bot_fn,
        [chatbot, system_prompt, temp, top_p, max_tokens, rep_penalty],
        [chatbot, stats_box],
    )
|
|
if __name__ == "__main__":
    # Start the Gradio server (blocking) when run as a script.
    demo.launch()