richardprobe committed on
Commit 6a6269f · verified · 1 Parent(s): 49295f2

Update app.py

Files changed (1)
  1. app.py +86 -79
app.py CHANGED
@@ -1,84 +1,89 @@
 import os
 import torch
 import gradio as gr
-from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
-from peft import PeftModel
 
-# --- CONFIG ---
-BASE_MODEL = "microsoft/Phi-4-mini-instruct"
-ADAPTER_REPO = "richardprobe/phi4-mini-chris-assistant-richard-adapter"
 SYSTEM_PROMPT = "You are Richard. Be concise and casual."
-LOAD_4BIT = True
 
-def load_model():
     print("Loading tokenizer...")
-    tok = AutoTokenizer.from_pretrained(BASE_MODEL, use_fast=True)
-    print("Loading base model...")
-    kwargs = dict(device_map="auto")
-    if LOAD_4BIT:
-        kwargs["quantization_config"] = BitsAndBytesConfig(
-            load_in_4bit=True,
-            bnb_4bit_compute_dtype=torch.bfloat16,
-            bnb_4bit_use_double_quant=True,
-            bnb_4bit_quant_type="nf4",
-        )
-        kwargs["torch_dtype"] = torch.bfloat16
-    else:
-        kwargs["torch_dtype"] = torch.bfloat16 if torch.cuda.is_available() else torch.float32
-
-    base = AutoModelForCausalLM.from_pretrained(BASE_MODEL, **kwargs)
-
-    print("Loading adapter...")
-    # HF Hub auth if needed
-    model = PeftModel.from_pretrained(base, ADAPTER_REPO, use_auth_token=os.getenv("HF_TOKEN"))
-    model.eval()
-
-    # make sure pad token exists
-    if tok.pad_token_id is None:
-        tok.pad_token = tok.eos_token
 
-    return tok, model
 
-tok, model = load_model()
 
-def _normalize_history(history):
-    """Accepts either tuples [(u,a), ...] or messages-style [{'role','content'}, ...]."""
-    msgs = []
     if SYSTEM_PROMPT:
-        msgs.append({"role": "system", "content": SYSTEM_PROMPT})
-
-    if not history:
-        return msgs
-
-    # messages-style
-    if isinstance(history[0], dict):
-        for m in history:
-            role = m.get("role")
-            content = m.get("content", "")
-            if isinstance(content, list):  # v5 can send [{"type":"text","text":"..."}]
-                content = "".join(
-                    c.get("text", "") if isinstance(c, dict) else str(c) for c in content
-                )
-            if role in {"user", "assistant", "system"}:
-                msgs.append({"role": role, "content": content})
-    else:
-        # tuples-style
-        for u, a in history:
-            if u:
-                msgs.append({"role": "user", "content": u})
-            if a:
-                msgs.append({"role": "assistant", "content": a})
-    return msgs
 
 def chat_generate(message, history, temperature=0.7, top_p=0.95, max_new_tokens=256, repetition_penalty=1.1):
-    # Build messages
-    messages = _normalize_history(history)
-    if message:
-        messages.append({"role": "user", "content": message})
 
-    inputs = tok.apply_chat_template(
-        messages, add_generation_prompt=True, return_tensors="pt"
-    ).to(model.device)
 
     gen_kwargs = dict(
         max_new_tokens=int(max_new_tokens),
@@ -86,38 +91,40 @@ def chat_generate(message, history, temperature=0.7, top_p=0.95, max_new_tokens=
         top_p=float(top_p),
         do_sample=float(temperature) > 0,
         repetition_penalty=float(repetition_penalty),
-        eos_token_id=tok.eos_token_id,
-        pad_token_id=tok.pad_token_id,
     )
 
     with torch.inference_mode():
-        with torch.cuda.amp.autocast(enabled=torch.cuda.is_available(), dtype=torch.bfloat16):
-            out = model.generate(inputs, **gen_kwargs)
 
-    gen_tokens = out[0][inputs.shape[-1]:]
-    text = tok.decode(gen_tokens, skip_special_tokens=True, errors="ignore")
-    return text.strip()
 
 demo = gr.ChatInterface(
     fn=chat_generate,
-    title="Phi-4 Mini + LoRA Adapter (Chris style)",
-    description="Base: microsoft/Phi-4-mini-instruct + your LoRA adapter. Style-tuned chat.",
     additional_inputs=[
         gr.Slider(0.0, 1.5, value=0.7, step=0.1, label="Temperature"),
         gr.Slider(0.5, 1.0, value=0.95, step=0.01, label="Top-p"),
         gr.Slider(16, 512, value=256, step=16, label="Max new tokens"),
         gr.Slider(1.0, 1.5, value=1.1, step=0.05, label="Repetition penalty"),
     ],
-    # Each example is: [message, *additional_inputs]
     examples=[
         ["What are you up to?", 0.7, 0.95, 256, 1.1],
         ["You coming?", 0.7, 0.95, 256, 1.1],
         ["I'm on the can", 0.7, 0.95, 256, 1.1],
     ],
-    cache_examples=False,  # turn off while debugging; turn on later if you want
 )
 
 if __name__ == "__main__":
     demo.queue(max_size=8)
-    # Hide API docs to avoid the schema crash toast
     demo.launch(server_name="0.0.0.0", server_port=7860, show_api=False, show_error=True)
 
+# app.py
 import os
 import torch
 import gradio as gr
+from transformers import AutoTokenizer, AutoModelForCausalLM
+from peft import PeftModel, PeftConfig
 
+# ---- CONFIG ----
+ADAPTER_REPO = "richardprobe/opt-350-chris-adapter"  # your LoRA repo
+ADAPTER_NAME = "finetune_adapter"  # how you saved it
 SYSTEM_PROMPT = "You are Richard. Be concise and casual."
 
+# If the adapter is private on the Hub, set HF_TOKEN in the Space secrets
+HF_TOKEN = os.getenv("HF_TOKEN", None)
+
+# ------------- Loading -------------
+def load_model_and_tokenizer():
+    # Inspect adapter to get its base
+    print("Reading adapter config...")
+    peft_cfg = PeftConfig.from_pretrained(ADAPTER_REPO, token=HF_TOKEN)
+    base_id = peft_cfg.base_model_name_or_path
+    print(f"Base model detected: {base_id}")
+
+    # Tokenizer from base (adapter may also carry added tokens)
     print("Loading tokenizer...")
+    tok = AutoTokenizer.from_pretrained(base_id, use_fast=True, token=HF_TOKEN)
 
+    # Safety: many decoder-only models don't define a pad token
+    if tok.pad_token is None and tok.eos_token is not None:
+        tok.pad_token = tok.eos_token
+    tok.padding_side = "right"
 
+    # Non-quantized load so we can merge
+    print("Loading base model...")
+    dtype = torch.bfloat16 if torch.cuda.is_available() else torch.float32
+    base = AutoModelForCausalLM.from_pretrained(
+        base_id, torch_dtype=dtype, device_map="auto", token=HF_TOKEN
+    )
 
+    print("Loading adapter and merging...")
+    peft = PeftModel.from_pretrained(
+        base, ADAPTER_REPO, adapter_name=ADAPTER_NAME, token=HF_TOKEN
+    )
+    # This bakes LoRA weights into the base weights and returns a plain model
+    merged = peft.merge_and_unload()  # equivalent to merge_adapter + unload
+    merged.eval()
+
+    # We’ll use <|end|> as EOS if it exists
+    try:
+        end_id = tok.convert_tokens_to_ids("<|end|>")
+        if end_id is not None and end_id != tok.unk_token_id:
+            merged.config.eos_token_id = end_id
+    except Exception:
+        pass
+
+    return tok, merged
+
+tokenizer, model = load_model_and_tokenizer()
+
+# ------------- Prompt building -------------
+def build_prompt(history, user_msg):
+    """
+    Render your chat format using the added tokens that were used during training.
+    History is a list of (user, assistant) tuples from ChatInterface.
+    """
+    segments = []
     if SYSTEM_PROMPT:
+        # If you trained with a system token, add it here. Otherwise keep as plain text.
+        segments.append(f"<|system|>{SYSTEM_PROMPT}<|end|>")
+
+    for u, a in history or []:
+        if u:
+            segments.append(f"<|user|>{u}<|end|>")
+        if a:
+            segments.append(f"<|assistant|>{a}<|end|>")
+
+    segments.append(f"<|user|>{user_msg}<|end|>")
+    segments.append("<|assistant|>")
+    return "\n".join(segments)
 
+# ------------- Inference -------------
 def chat_generate(message, history, temperature=0.7, top_p=0.95, max_new_tokens=256, repetition_penalty=1.1):
+    prompt = build_prompt(history, message)
 
+    inputs = tokenizer(prompt, add_special_tokens=False, return_tensors="pt")
+    inputs = {k: v.to(model.device) for k, v in inputs.items()}
 
     gen_kwargs = dict(
         max_new_tokens=int(max_new_tokens),
@@ -86,38 +91,40 @@ def chat_generate(message, history, temperature=0.7, top_p=0.95, max_new_tokens=
         top_p=float(top_p),
         do_sample=float(temperature) > 0,
         repetition_penalty=float(repetition_penalty),
+        eos_token_id=getattr(model.config, "eos_token_id", tokenizer.eos_token_id),
+        pad_token_id=tokenizer.pad_token_id or tokenizer.eos_token_id,
     )
 
     with torch.inference_mode():
+        out = model.generate(**inputs, **gen_kwargs)
 
+    # Return only the assistant part
+    gen_tokens = out[0][inputs["input_ids"].shape[-1]:]
+    text = tokenizer.decode(gen_tokens, skip_special_tokens=True, errors="ignore")
+    # If your <|end|> isn’t marked as special, strip it manually
+    text = text.replace("<|end|>", "").strip()
+    return text
 
+# ------------- UI -------------
 demo = gr.ChatInterface(
     fn=chat_generate,
+    title="OPT-350M + LoRA (Chris style)",
+    description="Loads the base model from the adapter's config, merges LoRA, and chats using your training tokens.",
    additional_inputs=[
         gr.Slider(0.0, 1.5, value=0.7, step=0.1, label="Temperature"),
         gr.Slider(0.5, 1.0, value=0.95, step=0.01, label="Top-p"),
         gr.Slider(16, 512, value=256, step=16, label="Max new tokens"),
         gr.Slider(1.0, 1.5, value=1.1, step=0.05, label="Repetition penalty"),
     ],
     examples=[
         ["What are you up to?", 0.7, 0.95, 256, 1.1],
         ["You coming?", 0.7, 0.95, 256, 1.1],
         ["I'm on the can", 0.7, 0.95, 256, 1.1],
     ],
+    cache_examples=False,
 )
 
 if __name__ == "__main__":
+    # queue helps avoid device contention; hide API to avoid schema issues
     demo.queue(max_size=8)
     demo.launch(server_name="0.0.0.0", server_port=7860, show_api=False, show_error=True)
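
Note on the new prompt format: the rewritten app.py stops using tok.apply_chat_template and instead renders <|system|>/<|user|>/<|assistant|>/<|end|> tags by hand in build_prompt. Below is a minimal, standalone sketch of that rendering (the render_prompt name and the sample history are illustrative, not part of the commit, and the system prompt is assumed non-empty); it runs without loading any model and shows exactly what string gets tokenized before generate():

    # Standalone sketch mirroring the prompt rendering in the new app.py
    # (system prompt assumed non-empty; names and sample history are hypothetical).
    SYSTEM_PROMPT = "You are Richard. Be concise and casual."

    def render_prompt(history, user_msg):
        segments = [f"<|system|>{SYSTEM_PROMPT}<|end|>"]
        for u, a in history or []:
            if u:
                segments.append(f"<|user|>{u}<|end|>")
            if a:
                segments.append(f"<|assistant|>{a}<|end|>")
        segments.append(f"<|user|>{user_msg}<|end|>")
        segments.append("<|assistant|>")  # generation continues after this tag
        return "\n".join(segments)

    print(render_prompt([("You coming?", "Yeah, give me 10.")], "What are you up to?"))
    # <|system|>You are Richard. Be concise and casual.<|end|>
    # <|user|>You coming?<|end|>
    # <|assistant|>Yeah, give me 10.<|end|>
    # <|user|>What are you up to?<|end|>
    # <|assistant|>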