import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
import gradio as gr

# Supported models (text-only for now)
MODEL_OPTIONS = {
    "Phi-3.5 Mini Instruct": "microsoft/Phi-3.5-mini-instruct",
    "Phi-3.5 MoE Instruct": "microsoft/Phi-3.5-MoE-instruct",
    "Phi-3 Mini 4K Instruct": "microsoft/Phi-3-mini-4k-instruct",
    "Phi-3 Mini 128K Instruct": "microsoft/Phi-3-mini-128k-instruct",
}

# Cache for loaded models so switching back to a model is instant
loaded_models = {}


# Load a tokenizer/model pair on demand and cache it
def load_model(model_id):
    if model_id not in loaded_models:
        tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
        model = AutoModelForCausalLM.from_pretrained(
            model_id,
            trust_remote_code=True,
            torch_dtype=torch.float32,  # full precision for CPU inference
        )
        model.eval()
        loaded_models[model_id] = (tokenizer, model)
    return loaded_models[model_id]


# Generate a single-turn response from the selected model
def chat_with_model(user_input, model_choice):
    model_id = MODEL_OPTIONS[model_choice]
    tokenizer, model = load_model(model_id)

    messages = [{"role": "user", "content": user_input}]
    inputs = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        tokenize=True,
        return_dict=True,
        return_tensors="pt",
    ).to("cpu")

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=100,
            do_sample=True,  # sampling must be on for temperature/top_p to take effect
            temperature=0.7,
            top_p=0.9,
        )

    # Slice off the prompt tokens so only the newly generated text is decoded
    response = tokenizer.decode(
        outputs[0][inputs["input_ids"].shape[-1]:], skip_special_tokens=True
    )
    return response.strip()


# Gradio UI
with gr.Blocks(title="Phi-3 Instruct Explorer") as demo:
    gr.Markdown(
        "## 🧠 Phi-3 Instruct Explorer\n"
        "Switch between Phi-3 instruct models and test responses on CPU."
    )
    with gr.Row():
        model_choice = gr.Dropdown(
            label="Choose a model",
            choices=list(MODEL_OPTIONS.keys()),
            value="Phi-3.5 Mini Instruct",
        )
    with gr.Row():
        user_input = gr.Textbox(label="Your message", placeholder="Ask me anything...")
    with gr.Row():
        output = gr.Textbox(label="Model response")
    with gr.Row():
        submit = gr.Button("Generate")

    submit.click(fn=chat_with_model, inputs=[user_input, model_choice], outputs=output)

demo.launch()
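
# ---------------------------------------------------------------------------
# Optional GPU variant (a sketch, not part of the app above). If a CUDA device
# is available, loading in half precision with device_map="auto" is far faster
# than CPU float32. Assumes the `accelerate` package is installed, which
# device_map="auto" requires; swap this in for load_model() if applicable.
#
# def load_model_gpu(model_id):
#     tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
#     model = AutoModelForCausalLM.from_pretrained(
#         model_id,
#         trust_remote_code=True,
#         torch_dtype=torch.float16,  # half precision roughly halves memory use
#         device_map="auto",          # place weights on available GPU(s)
#     )
#     model.eval()
#     return tokenizer, model
#
# Note: with this variant, drop the .to("cpu") call in chat_with_model and move
# the tokenized inputs to model.device instead.
# ---------------------------------------------------------------------------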