Update handler.py

handler.py  +85 -25  (CHANGED)
@@ -1,59 +1,119 @@
-# handler.py
-
+# handler.py
+#
+# Hugging Face Inference Endpoints custom handler for teapotai/tinyteapot (T5/Flan-T5 style seq2seq).
+# - Uses the mounted model directory (`path`, typically "/repository") exactly like your notebook loads from Hub.
+# - Forces the *slow* SentencePiece tokenizer (use_fast=False) to avoid tokenizer.json / fast-tokenizer mismatch issues.
+#   => Requires `spiece.model` to be present in the repo root.
+# - Left-truncates inputs to keep only the most recent 512 tokens (matches your request).
+# - Deterministic generation (do_sample=False).
+
+from __future__ import annotations
+
+from typing import Any, Dict, Union
+
 import torch
+from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
+

 MAX_INPUT_TOKENS = 512
+DEFAULT_MAX_NEW_TOKENS = 128
+
+DEFAULT_SYSTEM_PROMPT = (
+    "You are Teapot, an open-source AI assistant optimized for low-end devices, "
+    "providing short, accurate responses without hallucinating while excelling at "
+    "information extraction and text summarization. "
+    "If the context does not answer the question, reply exactly: "
+    "'I am sorry but I don't have any information on that'."
+)
+

 class EndpointHandler:
+    """
+    HF Inference Endpoints will instantiate this class once, then call it per-request.
+    """
+
     def __init__(self, path: str = ""):
-        #
-
+        # Force slow tokenizer to guarantee consistency with SentencePiece vocab (spiece.model).
+        # This avoids fast-tokenizer init paths that can diverge across environments.
+        self.tokenizer = AutoTokenizer.from_pretrained(
+            path,
+            use_fast=False,
+            model_max_length=MAX_INPUT_TOKENS,
+        )
+
         self.model = AutoModelForSeq2SeqLM.from_pretrained(path)

-
+        # CPU by default on small models; endpoints sets device to CPU in your logs.
         self.device = torch.device("cpu")
         self.model.to(self.device)
+        self.model.eval()

-        self.system_prompt = (
-            "You are Teapot, an open-source AI assistant optimized for low-end devices, "
-            "providing short, accurate responses without hallucinating while excelling at "
-            "information extraction and text summarization. "
-            "If the context does not answer the question, reply exactly: "
-            "'I am sorry but I don't have any information on that'."
-        )
+        self.system_prompt = DEFAULT_SYSTEM_PROMPT

     @torch.inference_mode()
-    def __call__(self, data):
-
-
-
+    def __call__(self, data: Dict[str, Any]) -> Dict[str, str]:
+        """
+        Accepts either:
+          - {"inputs": "<full prompt string>"}  (raw mode)
+          - {"inputs": {"context": "...", "question": "...", "system_prompt": "..."}}
+        Optional generation knobs:
+          - {"parameters": {"max_new_tokens": 128}}
+        """
+        if not isinstance(data, dict):
+            raise ValueError("Request payload must be a JSON object.")
+
+        if "inputs" not in data:
+            raise ValueError("Missing required field: 'inputs'.")

+        inputs: Union[str, Dict[str, Any]] = data["inputs"]
+
+        # Optional: generation parameters
+        params = data.get("parameters") or {}
+        try:
+            max_new_tokens = int(params.get("max_new_tokens", DEFAULT_MAX_NEW_TOKENS))
+        except Exception:
+            max_new_tokens = DEFAULT_MAX_NEW_TOKENS
+
+        # Build prompt exactly like your notebook logic:
+        #   prompt = f"{context}\n{system_prompt}\n{question}\n"
         if isinstance(inputs, str):
             prompt = inputs
         elif isinstance(inputs, dict):
             context = inputs.get("context", "")
             question = inputs.get("question", "")
-            prompt = f"{context}\n{self.system_prompt}\n{question}\n"
+            system_prompt = inputs.get("system_prompt", self.system_prompt)
+
+            if not isinstance(context, str) or not isinstance(question, str) or not isinstance(system_prompt, str):
+                raise ValueError("'context', 'question', and 'system_prompt' must be strings.")
+
+            prompt = f"{context}\n{system_prompt}\n{question}\n"
         else:
-            raise ValueError("inputs must be a string or a dict")
+            raise ValueError("'inputs' must be a string or an object with {context, question}.")

+        # Tokenize
         enc = self.tokenizer(prompt, return_tensors="pt")
         input_ids = enc["input_ids"]
-        attention_mask = enc["attention_mask"]
+        attention_mask = enc.get("attention_mask", None)

-        # keep most recent
+        # Left-truncate to keep only most recent tokens (last 512)
         if input_ids.shape[1] > MAX_INPUT_TOKENS:
             input_ids = input_ids[:, -MAX_INPUT_TOKENS:]
-            attention_mask = attention_mask[:, -MAX_INPUT_TOKENS:]
+            if attention_mask is not None:
+                attention_mask = attention_mask[:, -MAX_INPUT_TOKENS:]

         input_ids = input_ids.to(self.device)
-        attention_mask = attention_mask.to(self.device)
+        if attention_mask is not None:
+            attention_mask = attention_mask.to(self.device)

-        out = self.model.generate(
+        # Generate deterministically
+        out = self.model.generate(
             input_ids=input_ids,
             attention_mask=attention_mask,
             do_sample=False,
+            num_beams=1,
+            max_new_tokens=max_new_tokens,
+            use_cache=True,
         )

-
-        return {"generated_text": ...}
+        text = self.tokenizer.decode(out[0], skip_special_tokens=True)
+        return {"generated_text": text}
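Once deployed, the same JSON body goes over HTTP. A sketch using requests, assuming a standard Inference Endpoints setup; ENDPOINT_URL and HF_TOKEN are placeholders for your endpoint's URL and an access token authorized to call it:

# query_endpoint.py -- sketch of a POST to the deployed endpoint.
import requests

ENDPOINT_URL = "https://<your-endpoint>.endpoints.huggingface.cloud"  # placeholder
HF_TOKEN = "hf_..."  # placeholder token with access to the endpoint

payload = {
    "inputs": {
        "context": "Teapot is optimized for low-end devices.",
        "question": "What is Teapot optimized for?",
    },
    "parameters": {"max_new_tokens": 64},
}

r = requests.post(
    ENDPOINT_URL,
    headers={"Authorization": f"Bearer {HF_TOKEN}", "Content-Type": "application/json"},
    json=payload,
    timeout=60,
)
r.raise_for_status()
print(r.json()["generated_text"])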