import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
import gradio as gr

# Supported models (text-only for now)
MODEL_OPTIONS = {
    "Phi-3.5 Mini Instruct": "microsoft/Phi-3.5-mini-instruct",
    "Phi-3.5 MoE Instruct": "microsoft/Phi-3.5-MoE-instruct",
    "Phi-3 Mini 4K Instruct": "microsoft/Phi-3-mini-4k-instruct",
    "Phi-3 Mini 128K Instruct": "microsoft/Phi-3-mini-128k-instruct"
}
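# Note: the MoE model is far larger than the Mini variants; CPU-only inference may be impractical.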

# Cache for loaded models
loaded_models = {}

# Load model/tokenizer on demand
def load_model(model_id):
    if model_id not in loaded_models:
        tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
        model = AutoModelForCausalLM.from_pretrained(
            model_id,
            trust_remote_code=True,    # Phi-3 repos ship custom modeling code
            torch_dtype=torch.float32  # full precision is the safe default for CPU inference
        )
        model.eval()
        loaded_models[model_id] = (tokenizer, model)
    return loaded_models[model_id]

# Chat function
def chat_with_model(user_input, model_choice):
    model_id = MODEL_OPTIONS[model_choice]
    tokenizer, model = load_model(model_id)

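    # Wrap the prompt as a single-turn chat and tokenize it with the model's chat template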
    messages = [{"role": "user", "content": user_input}]
    inputs = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        tokenize=True,
        return_dict=True,
        return_tensors="pt"
    ).to("cpu")

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=100,
            do_sample=False  # greedy decoding; temperature/top_p only take effect when do_sample=True
        )

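    # Decode only the newly generated tokens, slicing off the prompt portion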
    response = tokenizer.decode(outputs[0][inputs["input_ids"].shape[-1]:], skip_special_tokens=True)
    return response.strip()

# Gradio UI
with gr.Blocks(title="Phi-3 Instruct Explorer") as demo:
    gr.Markdown("## 🧠 Phi-3 Instruct Explorer\nSwitch between Phi-3 instruct models and test responses on CPU.")
    with gr.Row():
        model_choice = gr.Dropdown(label="Choose a model", choices=list(MODEL_OPTIONS.keys()), value="Phi-3.5 Mini Instruct")
    with gr.Row():
        user_input = gr.Textbox(label="Your message", placeholder="Ask me anything...")
    with gr.Row():
        output = gr.Textbox(label="Model response")
    with gr.Row():
        submit = gr.Button("Generate")

    submit.click(fn=chat_with_model, inputs=[user_input, model_choice], outputs=output)

demo.launch()