richardprobe committed on
Commit 6a6269f · verified · 1 Parent(s): 49295f2

Update app.py

Files changed (1)
  1. app.py +86 -79
app.py CHANGED
@@ -1,84 +1,89 @@
 import os
 import torch
 import gradio as gr
-from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
-from peft import PeftModel
 
-# --- CONFIG ---
-BASE_MODEL = "microsoft/Phi-4-mini-instruct"
-ADAPTER_REPO = "richardprobe/phi4-mini-chris-assistant-richard-adapter"
 SYSTEM_PROMPT = "You are Richard. Be concise and casual."
-LOAD_4BIT = True
 
-def load_model():
     print("Loading tokenizer...")
-    tok = AutoTokenizer.from_pretrained(BASE_MODEL, use_fast=True)
-    print("Loading base model...")
-    kwargs = dict(device_map="auto")
-    if LOAD_4BIT:
-        kwargs["quantization_config"] = BitsAndBytesConfig(
-            load_in_4bit=True,
-            bnb_4bit_compute_dtype=torch.bfloat16,
-            bnb_4bit_use_double_quant=True,
-            bnb_4bit_quant_type="nf4",
-        )
-        kwargs["torch_dtype"] = torch.bfloat16
-    else:
-        kwargs["torch_dtype"] = torch.bfloat16 if torch.cuda.is_available() else torch.float32
-
-    base = AutoModelForCausalLM.from_pretrained(BASE_MODEL, **kwargs)
-
-    print("Loading adapter...")
-    # HF Hub auth if needed
-    model = PeftModel.from_pretrained(base, ADAPTER_REPO, use_auth_token=os.getenv("HF_TOKEN"))
-    model.eval()
-
-    # make sure pad token exists
-    if tok.pad_token_id is None:
-        tok.pad_token = tok.eos_token
 
-    return tok, model
 
-tok, model = load_model()
 
-def _normalize_history(history):
-    """Accepts either tuples [(u,a), ...] or messages-style [{'role','content'}, ...]."""
-    msgs = []
     if SYSTEM_PROMPT:
-        msgs.append({"role": "system", "content": SYSTEM_PROMPT})
-
-    if not history:
-        return msgs
-
-    # messages-style
-    if isinstance(history[0], dict):
-        for m in history:
-            role = m.get("role")
-            content = m.get("content", "")
-            if isinstance(content, list):  # v5 can send [{"type":"text","text":"..."}]
-                content = "".join(
-                    c.get("text", "") if isinstance(c, dict) else str(c) for c in content
-                )
-            if role in {"user", "assistant", "system"}:
-                msgs.append({"role": role, "content": content})
-    else:
-        # tuples-style
-        for u, a in history:
-            if u:
-                msgs.append({"role": "user", "content": u})
-            if a:
-                msgs.append({"role": "assistant", "content": a})
-    return msgs
 
 def chat_generate(message, history, temperature=0.7, top_p=0.95, max_new_tokens=256, repetition_penalty=1.1):
-    # Build messages
-    messages = _normalize_history(history)
-    if message:
-        messages.append({"role": "user", "content": message})
 
-    inputs = tok.apply_chat_template(
-        messages, add_generation_prompt=True, return_tensors="pt"
-    ).to(model.device)
 
     gen_kwargs = dict(
         max_new_tokens=int(max_new_tokens),
@@ -86,38 +91,40 @@ def chat_generate(message, history, temperature=0.7, top_p=0.95, max_new_tokens=
         top_p=float(top_p),
         do_sample=float(temperature) > 0,
         repetition_penalty=float(repetition_penalty),
-        eos_token_id=tok.eos_token_id,
-        pad_token_id=tok.pad_token_id,
     )
 
     with torch.inference_mode():
-        with torch.cuda.amp.autocast(enabled=torch.cuda.is_available(), dtype=torch.bfloat16):
-            out = model.generate(inputs, **gen_kwargs)
 
-    gen_tokens = out[0][inputs.shape[-1]:]
-    text = tok.decode(gen_tokens, skip_special_tokens=True, errors="ignore")
-    return text.strip()
 
 demo = gr.ChatInterface(
     fn=chat_generate,
-    title="Phi-4 Mini + LoRA Adapter (Chris style)",
-    description="Base: microsoft/Phi-4-mini-instruct + your LoRA adapter. Style-tuned chat.",
     additional_inputs=[
         gr.Slider(0.0, 1.5, value=0.7, step=0.1, label="Temperature"),
         gr.Slider(0.5, 1.0, value=0.95, step=0.01, label="Top-p"),
         gr.Slider(16, 512, value=256, step=16, label="Max new tokens"),
         gr.Slider(1.0, 1.5, value=1.1, step=0.05, label="Repetition penalty"),
     ],
-    # Each example is: [message, *additional_inputs]
     examples=[
         ["What are you up to?", 0.7, 0.95, 256, 1.1],
         ["You coming?", 0.7, 0.95, 256, 1.1],
         ["I'm on the can", 0.7, 0.95, 256, 1.1],
     ],
-    cache_examples=False,  # turn off while debugging; turn on later if you want
 )
 
 if __name__ == "__main__":
     demo.queue(max_size=8)
-    # Hide API docs to avoid the schema crash toast
     demo.launch(server_name="0.0.0.0", server_port=7860, show_api=False, show_error=True)
 
+# app.py
 import os
 import torch
 import gradio as gr
+from transformers import AutoTokenizer, AutoModelForCausalLM
+from peft import PeftModel, PeftConfig
 
+# ---- CONFIG ----
+ADAPTER_REPO = "richardprobe/opt-350-chris-adapter"  # your LoRA repo
+ADAPTER_NAME = "finetune_adapter"  # how you saved it
 SYSTEM_PROMPT = "You are Richard. Be concise and casual."
 
+# If the adapter is private on the Hub, set HF_TOKEN in the Space secrets
+HF_TOKEN = os.getenv("HF_TOKEN", None)
+
+# ------------- Loading -------------
+def load_model_and_tokenizer():
+    # Inspect adapter to get its base
+    print("Reading adapter config...")
+    peft_cfg = PeftConfig.from_pretrained(ADAPTER_REPO, token=HF_TOKEN)
+    base_id = peft_cfg.base_model_name_or_path
+    print(f"Base model detected: {base_id}")
+
+    # Tokenizer from base (adapter may also carry added tokens)
     print("Loading tokenizer...")
+    tok = AutoTokenizer.from_pretrained(base_id, use_fast=True, token=HF_TOKEN)
 
+    # Safety: many decoder-only models don't define a pad token
+    if tok.pad_token is None and tok.eos_token is not None:
+        tok.pad_token = tok.eos_token
+    tok.padding_side = "right"
 
+    # Non-quantized load so we can merge
+    print("Loading base model...")
+    dtype = torch.bfloat16 if torch.cuda.is_available() else torch.float32
+    base = AutoModelForCausalLM.from_pretrained(
+        base_id, torch_dtype=dtype, device_map="auto", token=HF_TOKEN
+    )
 
+    print("Loading adapter and merging...")
+    peft = PeftModel.from_pretrained(
+        base, ADAPTER_REPO, adapter_name=ADAPTER_NAME, token=HF_TOKEN
+    )
+    # This bakes LoRA weights into the base weights and returns a plain model
+    merged = peft.merge_and_unload()  # equivalent to merge_adapter + unload
+    merged.eval()
+
+    # We’ll use <|end|> as EOS if it exists
+    try:
+        end_id = tok.convert_tokens_to_ids("<|end|>")
+        if end_id is not None and end_id != tok.unk_token_id:
+            merged.config.eos_token_id = end_id
+    except Exception:
+        pass
+
+    return tok, merged
+
+tokenizer, model = load_model_and_tokenizer()
+
+# ------------- Prompt building -------------
+def build_prompt(history, user_msg):
+    """
+    Render your chat format using the added tokens that were used during training.
+    History is a list of (user, assistant) tuples from ChatInterface.
+    """
+    segments = []
     if SYSTEM_PROMPT:
+        # If you trained with a system token, add it here. Otherwise keep as plain text.
+        segments.append(f"<|system|>{SYSTEM_PROMPT}<|end|>")
+
+    for u, a in history or []:
+        if u:
+            segments.append(f"<|user|>{u}<|end|>")
+        if a:
+            segments.append(f"<|assistant|>{a}<|end|>")
+
+    segments.append(f"<|user|>{user_msg}<|end|>")
+    segments.append("<|assistant|>")
+    return "\n".join(segments)
 
+# ------------- Inference -------------
 def chat_generate(message, history, temperature=0.7, top_p=0.95, max_new_tokens=256, repetition_penalty=1.1):
+    prompt = build_prompt(history, message)
 
+    inputs = tokenizer(prompt, add_special_tokens=False, return_tensors="pt")
+    inputs = {k: v.to(model.device) for k, v in inputs.items()}
 
     gen_kwargs = dict(
         max_new_tokens=int(max_new_tokens),
@@ -86,38 +91,40 @@ def chat_generate(message, history, temperature=0.7, top_p=0.95, max_new_tokens=
         top_p=float(top_p),
         do_sample=float(temperature) > 0,
         repetition_penalty=float(repetition_penalty),
+        eos_token_id=getattr(model.config, "eos_token_id", tokenizer.eos_token_id),
+        pad_token_id=tokenizer.pad_token_id or tokenizer.eos_token_id,
     )
 
     with torch.inference_mode():
+        out = model.generate(**inputs, **gen_kwargs)
 
+    # Return only the assistant part
+    gen_tokens = out[0][inputs["input_ids"].shape[-1]:]
+    text = tokenizer.decode(gen_tokens, skip_special_tokens=True, errors="ignore")
+    # If your <|end|> isn’t marked as special, strip it manually
+    text = text.replace("<|end|>", "").strip()
+    return text
 
+# ------------- UI -------------
 demo = gr.ChatInterface(
     fn=chat_generate,
+    title="OPT-350M + LoRA (Chris style)",
+    description="Loads the base model from the adapter's config, merges LoRA, and chats using your training tokens.",
    additional_inputs=[
         gr.Slider(0.0, 1.5, value=0.7, step=0.1, label="Temperature"),
         gr.Slider(0.5, 1.0, value=0.95, step=0.01, label="Top-p"),
         gr.Slider(16, 512, value=256, step=16, label="Max new tokens"),
         gr.Slider(1.0, 1.5, value=1.1, step=0.05, label="Repetition penalty"),
     ],
     examples=[
         ["What are you up to?", 0.7, 0.95, 256, 1.1],
         ["You coming?", 0.7, 0.95, 256, 1.1],
         ["I'm on the can", 0.7, 0.95, 256, 1.1],
     ],
+    cache_examples=False,
 )
 
 if __name__ == "__main__":
+    # queue helps avoid device contention; hide API to avoid schema issues
     demo.queue(max_size=8)
     demo.launch(server_name="0.0.0.0", server_port=7860, show_api=False, show_error=True)
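
Note on the new prompt format: the rewritten app.py stops using tok.apply_chat_template and instead renders <|system|>/<|user|>/<|assistant|>/<|end|> tags by hand in build_prompt. Below is a minimal, standalone sketch of that rendering (the render_prompt name and the sample history are illustrative, not part of the commit, and the system prompt is assumed non-empty); it runs without loading any model and shows exactly what string gets tokenized before generate():

    # Standalone sketch mirroring the prompt rendering in the new app.py
    # (system prompt assumed non-empty; names and sample history are hypothetical).
    SYSTEM_PROMPT = "You are Richard. Be concise and casual."

    def render_prompt(history, user_msg):
        segments = [f"<|system|>{SYSTEM_PROMPT}<|end|>"]
        for u, a in history or []:
            if u:
                segments.append(f"<|user|>{u}<|end|>")
            if a:
                segments.append(f"<|assistant|>{a}<|end|>")
        segments.append(f"<|user|>{user_msg}<|end|>")
        segments.append("<|assistant|>")  # generation continues after this tag
        return "\n".join(segments)

    print(render_prompt([("You coming?", "Yeah, give me 10.")], "What are you up to?"))
    # <|system|>You are Richard. Be concise and casual.<|end|>
    # <|user|>You coming?<|end|>
    # <|assistant|>Yeah, give me 10.<|end|>
    # <|user|>What are you up to?<|end|>
    # <|assistant|>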