"""Gradio chat UI for microsoft/Phi-4-mini-instruct running on CPU.

Streams tokens from the model into a (tuple-style, Gradio 3.x compatible)
Chatbot while reporting generation speed and available system RAM.
"""

import time
from threading import Thread

import gradio as gr
import psutil
import torch
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TextIteratorStreamer,
)

MODEL_ID = "microsoft/Phi-4-mini-instruct"

print(f"Loading {MODEL_ID}...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    device_map="cpu",
    torch_dtype="auto",
    trust_remote_code=True,
)


def get_ram():
    """Return available system RAM formatted as e.g. '3.21 GB'."""
    return f"{psutil.virtual_memory().available / (1024**3):.2f} GB"


def generate_reply(history, system_prompt, temp, top_p, max_tokens, rep_penalty):
    """Stream a model reply for the given chat history.

    Args:
        history: list of ``{"role": ..., "content": ...}`` message dicts,
            already including the latest user message.
        system_prompt: optional system message prepended to the conversation.
        temp: sampling temperature; ``temp == 0`` switches to greedy decoding.
        top_p: nucleus-sampling probability mass (only used when sampling).
        max_tokens: maximum number of new tokens to generate.
        rep_penalty: repetition penalty passed to ``model.generate``.

    Yields:
        ``(partial_text, stats_markdown)`` tuples as tokens stream in.
    """
    messages = []
    if system_prompt:
        messages.append({"role": "system", "content": system_prompt})
    messages.extend(history)

    input_ids = tokenizer.apply_chat_template(
        messages, add_generation_prompt=True, return_tensors="pt"
    ).to("cpu")

    streamer = TextIteratorStreamer(
        tokenizer, skip_prompt=True, skip_special_tokens=True
    )

    do_sample = temp > 0
    generation_kwargs = dict(
        input_ids=input_ids,
        streamer=streamer,
        max_new_tokens=int(max_tokens),
        do_sample=do_sample,
        repetition_penalty=float(rep_penalty),
    )
    # Only pass sampling parameters when sampling is enabled: transformers
    # rejects temperature=0 under sampling and warns about unused sampling
    # flags under greedy decoding.
    if do_sample:
        generation_kwargs["temperature"] = float(temp)
        generation_kwargs["top_p"] = float(top_p)

    # generate() blocks, so run it in a worker thread and consume the streamer.
    thread = Thread(target=model.generate, kwargs=generation_kwargs)
    thread.start()

    output = ""
    start = time.time()
    tokens = 0
    for new_text in streamer:
        output += new_text
        tokens += 1  # approximate: one streamer chunk ~ one token
        elapsed = time.time() - start
        tps = tokens / elapsed if elapsed > 0 else 0
        stats = f"⚡ {tps:.2f} tok/s | RAM: {get_ram()}"
        yield output, stats
    thread.join()  # make sure the generation thread has fully finished


with gr.Blocks(title="Phi-4 Mini Chat", fill_height=True) as demo:
    with gr.Sidebar():
        system_prompt = gr.Textbox(
            value="You are a helpful AI assistant.",
            label="System Prompt",
            lines=3,
        )
        temp = gr.Slider(0.0, 2.0, 0.7, step=0.1, label="Temperature")
        top_p = gr.Slider(0.0, 1.0, 0.9, step=0.05, label="Top-p")
        rep_penalty = gr.Slider(1.0, 2.0, 1.1, step=0.05, label="Repetition Penalty")
        max_tokens = gr.Slider(64, 1024, 256, step=64, label="Max Tokens")
        stats_box = gr.Markdown("Ready")

    gr.Markdown("# 🤖 Phi-4 Mini")

    # Tuple-style history for Gradio 3.x compatibility.
    chatbot = gr.Chatbot(height=350)

    with gr.Row():
        user_input = gr.Textbox(placeholder="Type message...", scale=4)
        send_btn = gr.Button("Send", scale=1)

    def user_fn(msg, history):
        """Append the user's message to tuple-style history and clear the box."""
        history = history or []
        if msg:  # ignore empty submissions instead of queuing a blank turn
            history.append((msg, None))
        return "", history

    def bot_fn(history, system_prompt, t, p, mt, rp):
        """Stream the assistant's reply into the last history entry."""
        # Nothing pending (e.g. empty submission) -> nothing to generate.
        if not history or history[-1][1] is not None:
            yield history, "Ready"
            return

        user_msg = history[-1][0]

        # Convert tuple-style history to the message format the model expects.
        msg_history = []
        for u, b in history[:-1]:
            msg_history.append({"role": "user", "content": u})
            if b:
                msg_history.append({"role": "assistant", "content": b})
        # FIX: include the latest user message — previously it was dropped,
        # so the model generated a reply without ever seeing the new prompt.
        msg_history.append({"role": "user", "content": user_msg})

        history[-1] = (user_msg, "")
        for text, stats in generate_reply(msg_history, system_prompt, t, p, mt, rp):
            history[-1] = (user_msg, text)
            yield history, stats

    user_input.submit(
        user_fn, [user_input, chatbot], [user_input, chatbot]
    ).then(
        bot_fn,
        [chatbot, system_prompt, temp, top_p, max_tokens, rep_penalty],
        [chatbot, stats_box],
    )
    send_btn.click(
        user_fn, [user_input, chatbot], [user_input, chatbot]
    ).then(
        bot_fn,
        [chatbot, system_prompt, temp, top_p, max_tokens, rep_penalty],
        [chatbot, stats_box],
    )


if __name__ == "__main__":
    demo.launch()