File size: 1,857 Bytes
dc0bb4c
 
2ba0f71
 
dc0bb4c
9a972c0
 
0e4c2bd
9a972c0
 
 
 
 
 
 
 
 
 
 
 
2ba0f71
0e4c2bd
9a972c0
2ba0f71
 
9a972c0
 
2ba0f71
dc0bb4c
 
9a972c0
2ba0f71
 
0e4c2bd
9a972c0
dc0bb4c
9a972c0
 
2ba0f71
9a972c0
2ba0f71
 
 
 
 
9a972c0
2ba0f71
9a972c0
dc0bb4c
 
 
9a972c0
dc0bb4c
9a972c0
2ba0f71
 
dc0bb4c
9a972c0
dc0bb4c
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
import gradio as gr
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel

# Your adapter (LoRA fine-tuned model on Hugging Face)
ADAPTER_ID = "Anabury/My_Finetuned_Phi-4"

# Detect device
USE_GPU = torch.cuda.is_available()

# Pick base model depending on device
if USE_GPU:
    BASE_MODEL = "unsloth/phi-4-unsloth-bnb-4bit"   # fast + quantized
else:
    BASE_MODEL = "unsloth/phi-4"                   # full precision for CPU

print(f"Loading base model: {BASE_MODEL} on {'GPU' if USE_GPU else 'CPU'}")

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, trust_remote_code=True)

# Load base model
base = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    device_map="auto" if USE_GPU else None,
    torch_dtype=torch.float16 if USE_GPU else torch.float32,
    trust_remote_code=True
)

# Attach your LoRA adapter
model = PeftModel.from_pretrained(base, ADAPTER_ID)
model.eval()

# Chat function
def chat(message, history):
    """Generate a reply to *message* and append the exchange to *history*.

    Args:
        message: The user's input text from the Gradio textbox.
        history: List of (user, bot) tuples maintained by the Chatbot widget.

    Returns:
        (history, history) — duplicated because the UI wiring routes the
        same value to two outputs.
    """
    # Plain completion prompt; a chat template could be swapped in later.
    inputs = tokenizer(message, return_tensors="pt").to(model.device)
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=256,
            do_sample=True,
            temperature=0.7,
            top_p=0.9,
            pad_token_id=tokenizer.eos_token_id,
        )
    # BUG FIX: generate() returns prompt tokens followed by the completion.
    # Decoding outputs[0] whole echoed the user's message at the start of
    # every reply — slice off the prompt before decoding.
    prompt_len = inputs["input_ids"].shape[-1]
    reply = tokenizer.decode(outputs[0][prompt_len:], skip_special_tokens=True)
    history.append((message, reply))
    return history, history

# Gradio UI: a chat pane, an input box, and a clear button.
with gr.Blocks() as demo:
    gr.Markdown("# 🧠 Phi-4 Chatbot (Fine-tuned)")
    # Conversation pane; displays the (user, bot) tuple list built by chat().
    chatbot = gr.Chatbot(height=420)
    msg = gr.Textbox(placeholder="Ask me anything…")
    clear = gr.Button("Clear")

    # Enter in the textbox calls chat(message, history). Both outputs point
    # at the same Chatbot because chat() returns (history, history).
    msg.submit(chat, [msg, chatbot], [chatbot, chatbot])
    # Clear resets the chat pane to an empty list; queue=False makes the
    # reset immediate instead of waiting behind queued generations.
    clear.click(lambda: [], None, chatbot, queue=False)

demo.launch()