role fix attempt
- Correct left/right roles by switching to `Chatbot(type="messages")` and mapping UserLM's turns to `role="assistant"` (rendered on the left) and your replies to `role="user"` (rendered on the right), per Gradio's message schema; a minimal example of this mapping follows the list.
- Guardrail 1 is now actually applied, via a custom `LogitsProcessor` that forbids the six problematic first tokens on the first generated token only; the other three guardrails are enforced as in Appendix C.1 (3–25 word length, blocking `<|endconversation|>`, verbatim-repetition filtering).
- Defaults align with the model card/paper: `temperature=1.0`, `top_p=0.8`, stop on `<|eot_id|>`, block `<|endconversation|>` (all still tunable via the sliders).
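For reference, a single exchange in this schema looks like the sketch below (the message contents are hypothetical, not taken from the app):

```python
# One UserLM turn followed by one human reply, in Gradio's "messages" format.
# Note the inversion: the simulated *user* renders as role="assistant" (left),
# and the human playing the *assistant* renders as role="user" (right).
example_ui_messages = [
    {"role": "assistant", "content": "Hi, I need a function for a special sequence."},  # UserLM, left
    {"role": "user", "content": "Sure - can you describe how the sequence is built?"},  # you, right
]
```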
`app.py` (CHANGED, full updated file):

```python
from __future__ import annotations

import os
from typing import Any, Dict, List, Optional, Tuple

import gradio as gr
import spaces
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers.generation.logits_process import LogitsProcessor, LogitsProcessorList

# ======================
# Config
# ======================
MODEL_ID = os.getenv("MODEL_ID", "microsoft/UserLM-8b")
DEFAULT_SYSTEM_PROMPT = (
    "You are a user who wants to implement a special type of sequence. "
    # … (middle line of the prompt not shown in the diff view)
    "The first two numbers in the sequence are 1 and 1."
)

# ======================
# Load model
# ======================
def load_model(model_id: str = MODEL_ID):
    tok = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
    mdl = AutoModelForCausalLM.from_pretrained(
        model_id,
        trust_remote_code=True,
        torch_dtype="auto",
        device_map="auto",
    )

    # Special tokens
    eot = "<|eot_id|>"
    end_conv = "<|endconversation|>"
    eot_ids = tok.encode(eot, add_special_tokens=False)
    end_conv_ids = tok.encode(end_conv, add_special_tokens=False)
    eos_token_id = eot_ids[0] if len(eot_ids) > 0 else tok.eos_token_id
    bad_words_ids = [[tid] for tid in end_conv_ids] if len(end_conv_ids) > 0 else None

    # Guardrail 1: problematic first tokens (Appendix C.1)
    prob_first_tokens = ["I", "You", "Here", "i", "you", "here"]
    first_token_filter_ids = []
    for w in prob_first_tokens:
        ids = tok.encode(w, add_special_tokens=False)
        if ids:
            first_token_filter_ids.append(ids[0])

    return tok, mdl, eos_token_id, bad_words_ids, first_token_filter_ids


tokenizer, model, EOS_TOKEN_ID, BAD_WORDS_IDS, FIRST_TOKEN_FILTER_IDS = load_model()
model.eval()

# ======================
# Guardrail helpers
# ======================
def is_valid_length(text: str, min_words: int = 3, max_words: int = 25) -> bool:
    wc = len(text.split())
    return min_words <= wc <= max_words


def is_verbatim_repetition(
    new_text: str, history_pairs: List[Tuple[str, Optional[str]]], system_prompt: str
) -> bool:
    t = new_text.strip().lower()
    if t == system_prompt.strip().lower():
        return True
    for model_user, _ in history_pairs:
        if model_user and t == model_user.strip().lower():
            return True
    return False


class ForbidFirstToken(LogitsProcessor):
    """Set -inf on a token list for the *first* generated token only."""

    def __init__(self, forbid_ids: List[int], prompt_len: int):
        self.forbid = list(set(int(x) for x in forbid_ids))
        self.prompt_len = int(prompt_len)

    def __call__(
        self, input_ids: torch.LongTensor, scores: torch.FloatTensor
    ) -> torch.FloatTensor:
        # Apply only when generating the very first token (seq len == prompt_len)
        if input_ids.shape[1] == self.prompt_len and self.forbid:
            scores[:, self.forbid] = float("-inf")
        return scores


# ======================
# Message utilities
# ======================
def build_hf_messages(
    system_prompt: str, history_pairs: List[Tuple[str, Optional[str]]]
) -> List[Dict[str, str]]:
    """
    Construct messages for tokenizer.apply_chat_template.
    history_pairs = list of (model_user, human_assistant)
    """
    msgs: List[Dict[str, str]] = []
    if system_prompt.strip():
        msgs.append({"role": "system", "content": system_prompt.strip()})
    for model_user, human_assistant in history_pairs:
        if model_user:
            msgs.append({"role": "user", "content": model_user})
        if human_assistant:
            msgs.append({"role": "assistant", "content": human_assistant})
    return msgs


def pairs_to_ui_messages(
    history_pairs: List[Tuple[str, Optional[str]]]
) -> List[Dict[str, str]]:
    """
    Convert (model_user, human_assistant) pairs to Gradio Chatbot(type='messages') UI messages.
    Visual convention:
      - LEFT (role='assistant'): UserLM's utterances (the simulator)
      - RIGHT (role='user'): your replies (you play the assistant)
    """
    ui: List[Dict[str, str]] = []
    for model_user, human_assistant in history_pairs:
        if model_user:
            ui.append({"role": "assistant", "content": model_user})
        if human_assistant:
            ui.append({"role": "user", "content": human_assistant})
    return ui


# ======================
# Generation
# ======================
@spaces.GPU
def generate_reply(
    system_prompt: str,
    history_pairs: List[Tuple[str, Optional[str]]],
    max_new_tokens: int = 128,
    temperature: float = 1.0,
    top_p: float = 0.8,
    max_retries: int = 5,
) -> str:
    """Implements the 4 guardrails from Appendix C.1."""
    messages = build_hf_messages(system_prompt, history_pairs)
    inputs = tokenizer.apply_chat_template(
        messages, return_tensors="pt", add_generation_prompt=True
    ).to(model.device)

    for _ in range(max_retries):
        lp = LogitsProcessorList(
            [ForbidFirstToken(FIRST_TOKEN_FILTER_IDS, prompt_len=inputs.shape[1])]
        )

        with torch.no_grad():
            out = model.generate(
                input_ids=inputs,
                do_sample=True,
                top_p=top_p,
                temperature=temperature,
                max_new_tokens=max_new_tokens,
                eos_token_id=EOS_TOKEN_ID,
                pad_token_id=tokenizer.eos_token_id,
                bad_words_ids=BAD_WORDS_IDS,  # Guardrail 2: block <|endconversation|>
                logits_processor=lp,  # Guardrail 1
            )

        gen = out[0][inputs.shape[1]:]
        text = tokenizer.decode(gen, skip_special_tokens=True).strip()

        # Guardrails 3 & 4
        if not is_valid_length(text, min_words=3, max_words=25):
            continue
        if is_verbatim_repetition(text, history_pairs, system_prompt):
            continue
        return text

    raise RuntimeError("Failed to generate a valid user utterance after retries.")


# ======================
# Gradio UI
# ======================
def respond(
    your_reply: str,
    history_pairs: List[Tuple[str, Optional[str]]],
    system_prompt: str,
    max_new_tokens: int,
    temperature: float,
    top_p: float,
):
    # First turn: ignore your_reply and generate the initial UserLM utterance
    if not history_pairs:
        userlm = generate_reply(
            system_prompt,
            [],
            max_new_tokens=max_new_tokens,
            temperature=temperature,
            top_p=top_p,
        )
        history_pairs = [(userlm, None)]
        return pairs_to_ui_messages(history_pairs), history_pairs, ""

    # Subsequent turns require your reply
    if not your_reply.strip():
        gr.Info("Type your (assistant) reply on the right, then click Generate.")
        return pairs_to_ui_messages(history_pairs), history_pairs, ""

    # Close the last pair with your reply
    last_userlm, _ = history_pairs[-1]
    history_pairs[-1] = (last_userlm, your_reply.strip())

    # Generate the next UserLM utterance
    userlm = generate_reply(
        system_prompt,
        history_pairs,
        max_new_tokens=max_new_tokens,
        temperature=temperature,
        top_p=top_p,
    )
    history_pairs.append((userlm, None))

    return pairs_to_ui_messages(history_pairs), history_pairs, ""


def _clear():
    return [], [], DEFAULT_SYSTEM_PROMPT, ""


with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown(
        f"""
        # UserLM-8b: User Language Model Demo
        **Model:** `{MODEL_ID}`

        The AI plays the **user**, you play the **assistant**. Your messages appear on the **right**.
        """
    )

    system_box = gr.Textbox(
        label="User Intent",
        value=DEFAULT_SYSTEM_PROMPT,
        lines=3,
        placeholder="Enter the user's goal or intent",
    )

    # Use messages format so we can control left/right explicitly
    chatbot = gr.Chatbot(
        label="Conversation",
        height=420,
        type="messages",  # modern format; tuples are deprecated
        render_markdown=True,
        autoscroll=True,
        show_copy_button=True,
        # You can set avatar images like: avatar_images=("assets/you.png", "assets/userlm.png")
    )

    # Your reply box (you play the assistant)
    msg = gr.Textbox(
        label="Your Reply (assistant)",
        placeholder="Type your assistant response here…",
        lines=2,
    )

    with gr.Accordion("Generation Settings", open=False):
        max_new_tokens = gr.Slider(16, 512, value=128, step=16, label="max_new_tokens")
        temperature = gr.Slider(0.0, 2.0, value=1.0, step=0.05, label="temperature")
        top_p = gr.Slider(0.0, 1.0, value=0.8, step=0.01, label="top_p")

    submit_btn = gr.Button("Generate", variant="primary")
    clear_btn = gr.Button("Clear")

    # Internal state keeps the compact (userLM, you) pairs used for decoding
    history_pairs_state = gr.State([])  # List[Tuple[str, Optional[str]]]

    with gr.Accordion("Implementation Details", open=False):
        gr.Markdown(
            """
            - Decoding defaults from the model card: `temperature=1.0`, `top_p=0.8`,
              stop on `<|eot_id|>`, and block `<|endconversation|>`.
            - Guardrails from Appendix C.1: (1) first-token logit filter,
              (2) block endconversation, (3) 3–25 word length, (4) verbatim repetition filter.
            """
        )

    def _submit(your_text, pairs, sys_prompt, mnt, temp, tp):
        ui_msgs, new_pairs, cleared_text = respond(
            your_text, pairs, sys_prompt, mnt, temp, tp
        )
        return ui_msgs, new_pairs, cleared_text

    submit_btn.click(
        fn=_submit,
        inputs=[msg, history_pairs_state, system_box, max_new_tokens, temperature, top_p],
        outputs=[chatbot, history_pairs_state, msg],
    )
    msg.submit(
        fn=_submit,
        inputs=[msg, history_pairs_state, system_box, max_new_tokens, temperature, top_p],
        outputs=[chatbot, history_pairs_state, msg],
    )

    clear_btn.click(
        fn=_clear,
        outputs=[chatbot, history_pairs_state, system_box, msg],
    )

if __name__ == "__main__":
    demo.queue().launch()
```