anabury committed
Commit 2ba0f71 · verified · 1 Parent(s): 0e4c2bd

Update app.py

Files changed (1): app.py +30 -24
app.py CHANGED
@@ -1,42 +1,48 @@
 import gradio as gr
-from transformers import AutoTokenizer, AutoConfig, PhiForCausalLM
 import torch
+from transformers import AutoTokenizer, AutoModelForCausalLM
+from peft import PeftModel
 
-model_id = "Anabury/My_Finetuned_Phi-4"
+BASE_MODEL = "unsloth/phi-4-unsloth-bnb-4bit"  # base that you finetuned from
+ADAPTER_ID = "Anabury/My_Finetuned_Phi-4"      # your adapter repo
 
-# Load config to confirm model type
-config = AutoConfig.from_pretrained(model_id)
+# tokenizer (either base or adapter works; use base)
+tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, trust_remote_code=True)
 
-# Load tokenizer
-tokenizer = AutoTokenizer.from_pretrained(model_id)
-
-# Use PhiForCausalLM for Phi-4 architecture
-model = PhiForCausalLM.from_pretrained(
-    model_id,
+# load base model (4-bit quant is fine on Spaces GPU/CPU)
+base = AutoModelForCausalLM.from_pretrained(
+    BASE_MODEL,
     device_map="auto",
-    torch_dtype=torch.float16,
-    trust_remote_code=True  # if needed for custom implementations
+    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
+    trust_remote_code=True
 )
 
-model.config.use_cache = True  # enables faster inference
+# attach your LoRA adapter
+model = PeftModel.from_pretrained(base, ADAPTER_ID)
+model.eval()
 
-# Define the chat interface
 def chat(message, history):
-    inputs = tokenizer(message, return_tensors="pt").to(model.device)
-    outputs = model.generate(
-        **inputs,
-        max_new_tokens=200,
-        pad_token_id=tokenizer.eos_token_id
-    )
-    reply = tokenizer.decode(outputs[0], skip_special_tokens=True)
+    # build a simple prompt; adapt if you have a chat template in your repo
+    prompt = message
+    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
+    with torch.no_grad():
+        output = model.generate(
+            **inputs,
+            max_new_tokens=256,
+            do_sample=True,
+            temperature=0.7,
+            top_p=0.9,
+            pad_token_id=tokenizer.eos_token_id
+        )
+    reply = tokenizer.decode(output[0], skip_special_tokens=True)
     history.append((message, reply))
    return history, history
 
 with gr.Blocks() as demo:
-    chatbot = gr.Chatbot()
-    msg = gr.Textbox(placeholder="Type your message here...")
+    gr.Markdown("# Phi-4 Chat (LoRA)")
+    chatbot = gr.Chatbot(height=420)
+    msg = gr.Textbox(placeholder="Ask me anything…")
     clear = gr.Button("Clear")
-
 
     msg.submit(chat, [msg, chatbot], [chatbot, chatbot])
     clear.click(lambda: [], None, chatbot, queue=False)
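
Note that as committed, chat() feeds the raw message to the model and decodes the entire output sequence, so the reply echoes the prompt back; it also skips the chat template that its own inline comment alludes to. A minimal sketch of a variant addressing both, assuming the tokenizer ships a chat template (the stock Phi-4 tokenizer does); all other names match the app above:

# sketch only: assumes tokenizer provides a chat template
def chat(message, history):
    messages = [{"role": "user", "content": message}]
    # format the turn with the model's expected chat tags
    prompt = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    with torch.no_grad():
        output = model.generate(
            **inputs,
            max_new_tokens=256,
            do_sample=True,
            temperature=0.7,
            top_p=0.9,
            pad_token_id=tokenizer.eos_token_id,
        )
    # decode only the newly generated tokens so the reply does not echo the prompt
    reply = tokenizer.decode(
        output[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True
    )
    history.append((message, reply))
    return history, history

Since the update pulls in peft and a bnb-4bit base checkpoint, the Space's requirements.txt presumably needs something like the following (assumed; pin versions as appropriate):

# requirements.txt (assumed contents)
gradio
torch
transformers
peft
accelerate    # needed for device_map="auto"
bitsandbytes  # needed for the bnb-4bit base checkpoint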