from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import gradio as gr
import torch


# Set quantization config (4-bit NF4 to cut memory use and speed up inference)
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,  # 4-bit precision
    bnb_4bit_quant_type="nf4",  # NF4 quantization for better accuracy
    bnb_4bit_compute_dtype=torch.float16,  # use float16 as the compute dtype
)
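# Note: bitsandbytes 4-bit quantization typically requires a CUDA-capable GPU.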
# Load Phi-2 (smaller model with high-quality responses)
model_name = "microsoft/phi-2"
tokenizer = AutoTokenizer.from_pretrained(model_name)

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=quant_config,  # apply the 4-bit config defined above
    device_map="auto",  # device placement belongs here, not in BitsAndBytesConfig
)
# Compile the model for faster inference
model = torch.compile(model)
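# Note (assumption): torch.compile gains with 4-bit bitsandbytes layers vary by
# PyTorch/bitsandbytes version, and .generate() may bypass the compiled forward;
# if compilation errors out, this line can simply be removed.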

def respond(message, history):  # history is unused; each turn is answered independently
    inputs = tokenizer(message, return_tensors="pt").to(model.device)
    # do_sample=True is required for temperature/top_p to take effect
    outputs = model.generate(**inputs, max_new_tokens=50, do_sample=True, temperature=0.7, top_p=0.9)
    # Decode only the newly generated tokens, skipping the echoed prompt
    response = tokenizer.decode(outputs[0][inputs.input_ids.shape[-1]:], skip_special_tokens=True)
    return response

# Gradio Chat Interface
gr.ChatInterface(
    respond,
    title="🤖 Phi-2 Chatbot",
    description="Ask me anything! Powered by Phi-2.",
    examples=["What's your favorite book?", "Tell me a fun fact about space!"],
    theme="soft"
).launch()
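# launch() starts a local Gradio server (default: http://127.0.0.1:7860).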