Update handler.py

handler.py (CHANGED, +53 -45)
@@ -1,14 +1,7 @@
 # handler.py
-#
-# Hugging Face Inference Endpoints custom handler for teapotai/tinyteapot (T5/Flan-T5 style seq2seq).
-# - Uses the mounted model directory (`path`, typically "/repository") exactly like your notebook loads from Hub.
-# - Forces the *slow* SentencePiece tokenizer (use_fast=False) to avoid tokenizer.json / fast-tokenizer mismatch issues.
-#   => Requires `spiece.model` to be present in the repo root.
-# - Left-truncates inputs to keep only the most recent 512 tokens.
-# - Deterministic generation (do_sample=False).
-
 from __future__ import annotations
 
+import os
 from typing import Any, Dict, Union
 
 import torch
@@ -27,75 +20,89 @@ DEFAULT_SYSTEM_PROMPT = (
 )
 
 
-
-
-class EndpointHandler:
-
+def _path_exists(p: str) -> bool:
+    try:
+        return os.path.exists(p)
+    except Exception:
+        return False
+
 
+class EndpointHandler:
     def __init__(self, path: str = ""):
-        #
-
+        # Sanity: ensure key files exist in the mounted repo
+        spiece_path = os.path.join(path, "spiece.model")
+        tokjson_path = os.path.join(path, "tokenizer.json")
+        cfg_path = os.path.join(path, "config.json")
+
+        print(f"[teapot] model_dir={path}")
+        print(f"[teapot] exists config.json={_path_exists(cfg_path)} tokenizer.json={_path_exists(tokjson_path)} spiece.model={_path_exists(spiece_path)}")
+
+        # Force SentencePiece tokenizer (slow)
         self.tokenizer = AutoTokenizer.from_pretrained(
             path,
             use_fast=False,
             model_max_length=MAX_INPUT_TOKENS,
         )
-
         self.model = AutoModelForSeq2SeqLM.from_pretrained(path)
 
-        # CPU by default on small models; endpoints sets device to CPU in your logs.
         self.device = torch.device("cpu")
         self.model.to(self.device)
        self.model.eval()
 
+        # ----------------------------
+        # CRITICAL CONSISTENCY CHECKS
+        # ----------------------------
+        tok_len = len(self.tokenizer)  # includes added tokens
+        tok_vocab_size = getattr(self.tokenizer, "vocab_size", None)  # base vocab (T5 SentencePiece)
+        cfg_vocab = getattr(self.model.config, "vocab_size", None)
+        emb_rows = int(self.model.get_input_embeddings().weight.shape[0])
+
+        print(f"[teapot] tokenizer_class={type(self.tokenizer).__name__} use_fast={getattr(self.tokenizer, 'is_fast', None)}")
+        print(f"[teapot] len(tokenizer)={tok_len} tokenizer.vocab_size={tok_vocab_size} model.config.vocab_size={cfg_vocab} embedding_rows={emb_rows}")
+        print(f"[teapot] special_tokens: pad={self.tokenizer.pad_token} eos={self.tokenizer.eos_token} unk={self.tokenizer.unk_token}")
+
+        # If you ever resized embeddings, these MUST match:
+        # - embedding rows must equal len(tokenizer)
+        # - config vocab_size should match embedding rows
+        if emb_rows != tok_len:
+            raise RuntimeError(
+                f"[teapot] FATAL: embedding_rows ({emb_rows}) != len(tokenizer) ({tok_len}). "
+                "This means your model weights and tokenizer files are out of sync in the repo. "
+                "Fix by re-saving model+tokenizer together after resize_token_embeddings."
+            )
+        if cfg_vocab is not None and cfg_vocab != emb_rows:
+            raise RuntimeError(
+                f"[teapot] FATAL: model.config.vocab_size ({cfg_vocab}) != embedding_rows ({emb_rows}). "
+                "Your config.json is inconsistent with the weights. Re-save model to update config."
+            )
+
         self.system_prompt = DEFAULT_SYSTEM_PROMPT
 
     @torch.inference_mode()
     def __call__(self, data: Dict[str, Any]) -> Dict[str, str]:
-        """
-
-        - {"inputs": "<full prompt string>"} (raw mode)
-        - {"inputs": {"context": "...", "question": "...", "system_prompt": "..."}}
-        Optional generation knobs:
-        - {"parameters": {"max_new_tokens": 128}}
-        """
-        if not isinstance(data, dict):
-            raise ValueError("Request payload must be a JSON object.")
-
-        if "inputs" not in data:
-            raise ValueError("Missing required field: 'inputs'.")
+        if not isinstance(data, dict) or "inputs" not in data:
+            raise ValueError("Request must be JSON with an 'inputs' field.")
 
         inputs: Union[str, Dict[str, Any]] = data["inputs"]
-
-        # Optional: generation parameters
         params = data.get("parameters") or {}
-        try:
-            max_new_tokens = int(params.get("max_new_tokens", DEFAULT_MAX_NEW_TOKENS))
-        except Exception:
-            max_new_tokens = DEFAULT_MAX_NEW_TOKENS
 
-
-
+        max_new_tokens = int(params.get("max_new_tokens", DEFAULT_MAX_NEW_TOKENS))
+
         if isinstance(inputs, str):
             prompt = inputs
         elif isinstance(inputs, dict):
             context = inputs.get("context", "")
             question = inputs.get("question", "")
             system_prompt = inputs.get("system_prompt", self.system_prompt)
-
-            if not isinstance(context, str) or not isinstance(question, str) or not isinstance(system_prompt, str):
-                raise ValueError("'context', 'question', and 'system_prompt' must be strings.")
-
             prompt = f"{context}\n{system_prompt}\n{question}\n"
         else:
             raise ValueError("'inputs' must be a string or an object with {context, question}.")
 
-        # Tokenize
         enc = self.tokenizer(prompt, return_tensors="pt")
         input_ids = enc["input_ids"]
-        attention_mask = enc.get("attention_mask"
+        attention_mask = enc.get("attention_mask")
 
-        #
+        # Keep most recent tokens (left truncate)
         if input_ids.shape[1] > MAX_INPUT_TOKENS:
             input_ids = input_ids[:, -MAX_INPUT_TOKENS:]
             if attention_mask is not None:
@@ -105,14 +112,15 @@ class EndpointHandler:
         if attention_mask is not None:
             attention_mask = attention_mask.to(self.device)
 
-        # Generate deterministically
         out = self.model.generate(
            input_ids=input_ids,
             attention_mask=attention_mask,
             do_sample=False,
             num_beams=1,
             max_new_tokens=max_new_tokens,
-
+            # Band-aid to prevent pathological repeats, but not a real fix:
+            repetition_penalty=1.05,
+            no_repeat_ngram_size=3,
         )
 
         text = self.tokenizer.decode(out[0], skip_special_tokens=True)
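For quick local verification of the request contract before redeploying, a minimal smoke test can instantiate the handler directly against a local copy of the model repo. This is a sketch, not part of the commit: the "./tinyteapot" path and the example strings are placeholders, and the key of the returned dict depends on the return statement below the last hunk, so the sketch just prints the whole response.

# smoke_test.py - hypothetical local check, not part of this commit.
# Assumes the repo files (config.json, spiece.model, weights) were
# downloaded to ./tinyteapot beforehand.
from handler import EndpointHandler

handler = EndpointHandler(path="./tinyteapot")

# Structured mode: context, system prompt, and question are joined
# into one prompt by __call__.
print(handler({
    "inputs": {
        "context": "The tiny teapot holds 200 ml of tea.",
        "question": "How much tea does the teapot hold?",
    },
    "parameters": {"max_new_tokens": 32},
}))

# Raw mode: the string is passed through as the full prompt.
print(handler({"inputs": "What is a teapot?"}))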
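If either FATAL check fires at startup, the likely cause is that resize_token_embeddings was called during fine-tuning but the model and tokenizer were pushed from different states. A hedged repair sketch, assuming the original fine-tuning checkpoint is still available locally (both directory paths are placeholders):

# resave_repo.py - hypothetical repair sketch; both paths are placeholders.
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("./finetuned-checkpoint", use_fast=False)
model = AutoModelForSeq2SeqLM.from_pretrained("./finetuned-checkpoint")

# Make the embedding rows equal len(tokenizer); this also updates
# model.config.vocab_size, so the rewritten config.json stays consistent.
model.resize_token_embeddings(len(tokenizer))

# Save both artifacts into the same output directory so the weights,
# config.json, and spiece.model land in one repo snapshot.
model.save_pretrained("./repo-out")
tokenizer.save_pretrained("./repo-out")

Pushing ./repo-out as a single commit keeps the checked files in sync, which is exactly the invariant the startup checks assert.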