explicit attention mask in generate_reply
Browse files
app.py
CHANGED
|
@@ -142,12 +142,20 @@ def generate_reply(
|
|
| 142 |
top_p: float = 0.8,
|
| 143 |
max_retries: int = 10,
|
| 144 |
) -> str:
|
| 145 |
-
"""Implements the 4 guardrails from Appendix C.1."""
|
| 146 |
messages = build_hf_messages(system_prompt, history_pairs)
|
| 147 |
inputs = tokenizer.apply_chat_template(
|
| 148 |
messages, return_tensors="pt", add_generation_prompt=True
|
| 149 |
).to(model.device)
|
| 150 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 151 |
for _ in range(max_retries):
|
| 152 |
lp = LogitsProcessorList(
|
| 153 |
[ForbidFirstToken(FIRST_TOKEN_FILTER_IDS, prompt_len=inputs.shape[1])]
|
|
@@ -156,17 +164,18 @@ def generate_reply(
|
|
| 156 |
with torch.no_grad():
|
| 157 |
out = model.generate(
|
| 158 |
input_ids=inputs,
|
|
|
|
| 159 |
do_sample=True,
|
| 160 |
top_p=top_p,
|
| 161 |
temperature=temperature,
|
| 162 |
max_new_tokens=max_new_tokens,
|
| 163 |
eos_token_id=EOS_TOKEN_ID,
|
| 164 |
pad_token_id=tokenizer.eos_token_id,
|
| 165 |
-
bad_words_ids=BAD_WORDS_IDS,
|
| 166 |
-
logits_processor=lp,
|
| 167 |
)
|
| 168 |
|
| 169 |
-
gen = out[0][inputs.shape[1]:]
|
| 170 |
text = tokenizer.decode(gen, skip_special_tokens=True).strip()
|
| 171 |
|
| 172 |
# Guardrails 3 & 4
|
|
@@ -179,6 +188,7 @@ def generate_reply(
|
|
| 179 |
raise RuntimeError("Failed to generate a valid user utterance after retries.")
|
| 180 |
|
| 181 |
|
|
|
|
| 182 |
# ======================
|
| 183 |
# Gradio UI
|
| 184 |
# ======================
|
|
|
|
| 142 |
top_p: float = 0.8,
|
| 143 |
max_retries: int = 10,
|
| 144 |
) -> str:
|
| 145 |
+
"""Implements the 4 guardrails from Appendix C.1 and passes an explicit attention_mask."""
|
| 146 |
messages = build_hf_messages(system_prompt, history_pairs)
|
| 147 |
inputs = tokenizer.apply_chat_template(
|
| 148 |
messages, return_tensors="pt", add_generation_prompt=True
|
| 149 |
).to(model.device)
|
| 150 |
|
| 151 |
+
# Robust attention mask even when pad_token_id == eos_token_id.
|
| 152 |
+
# If no padding is present (usual single-sequence case), use all-ones.
|
| 153 |
+
pad_id = tokenizer.pad_token_id if tokenizer.pad_token_id is not None else tokenizer.eos_token_id
|
| 154 |
+
if pad_id is not None and (inputs == pad_id).any():
|
| 155 |
+
attention_mask = (inputs != pad_id).long()
|
| 156 |
+
else:
|
| 157 |
+
attention_mask = torch.ones_like(inputs, dtype=torch.long)
|
| 158 |
+
|
| 159 |
for _ in range(max_retries):
|
| 160 |
lp = LogitsProcessorList(
|
| 161 |
[ForbidFirstToken(FIRST_TOKEN_FILTER_IDS, prompt_len=inputs.shape[1])]
|
|
|
|
| 164 |
with torch.no_grad():
|
| 165 |
out = model.generate(
|
| 166 |
input_ids=inputs,
|
| 167 |
+
attention_mask=attention_mask, # <-- explicit mask to silence warning & be robust
|
| 168 |
do_sample=True,
|
| 169 |
top_p=top_p,
|
| 170 |
temperature=temperature,
|
| 171 |
max_new_tokens=max_new_tokens,
|
| 172 |
eos_token_id=EOS_TOKEN_ID,
|
| 173 |
pad_token_id=tokenizer.eos_token_id,
|
| 174 |
+
bad_words_ids=BAD_WORDS_IDS, # Guardrail 2: block <|endconversation|>
|
| 175 |
+
logits_processor=lp, # Guardrail 1: first-token filter
|
| 176 |
)
|
| 177 |
|
| 178 |
+
gen = out[0][inputs.shape[1]:]
|
| 179 |
text = tokenizer.decode(gen, skip_special_tokens=True).strip()
|
| 180 |
|
| 181 |
# Guardrails 3 & 4
|
|
|
|
| 188 |
raise RuntimeError("Failed to generate a valid user utterance after retries.")
|
| 189 |
|
| 190 |
|
| 191 |
+
|
| 192 |
# ======================
|
| 193 |
# Gradio UI
|
| 194 |
# ======================
|