teapotai
/

tinyteapot

@@ -1,19 +1,14 @@
 from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
 import torch
-MODEL_NAME = "teapotai/tinyteapot"
 MAX_INPUT_TOKENS = 512
 class EndpointHandler:
     def __init__(self, path: str = ""):
-        # EXACT same as your snippet BUT force slow tokenizer
-        # This prevents the fast tokenizer crash from extra_special_tokens list
-        self.tokenizer = AutoTokenizer.from_pretrained(
-            MODEL_NAME,
-            use_fast=False
-        )
-        self.model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)
         self.model.eval()
         self.device = torch.device("cpu")
@@ -30,11 +25,9 @@ class EndpointHandler:
     @torch.inference_mode()
     def __call__(self, data):
         inputs = data.get("inputs")
         if inputs is None:
-            raise ValueError("Missing 'inputs' field")
-        # Match your ask() behavior
         if isinstance(inputs, str):
             prompt = inputs
         elif isinstance(inputs, dict):
@@ -42,15 +35,13 @@ class EndpointHandler:
             question = inputs.get("question", "")
             prompt = f"{context}\n{self.system_prompt}\n{question}\n"
         else:
-            raise ValueError("inputs must be a string or dict")
-        # EXACT tokenizer call like your code
         enc = self.tokenizer(prompt, return_tensors="pt")
         input_ids = enc["input_ids"]
         attention_mask = enc["attention_mask"]
-        # NEW requirement: keep most recent 512 tokens
         if input_ids.shape[1] > MAX_INPUT_TOKENS:
             input_ids = input_ids[:, -MAX_INPUT_TOKENS:]
             attention_mask = attention_mask[:, -MAX_INPUT_TOKENS:]
@@ -58,15 +49,11 @@ class EndpointHandler:
         input_ids = input_ids.to(self.device)
         attention_mask = attention_mask.to(self.device)
-        # EXACT generation settings from your snippet
         outputs = self.model.generate(
             input_ids=input_ids,
             attention_mask=attention_mask,
-            do_sample=False
         )
         answer = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
-        return {
-            "generated_text": answer
-        }

+# handler.py (repo root)
 from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
 import torch
 MAX_INPUT_TOKENS = 512
 class EndpointHandler:
     def __init__(self, path: str = ""):
+        # Load the tokenizer and model from `path`, the mounted model dir (e.g. "/repository")
+        self.tokenizer = AutoTokenizer.from_pretrained(path)
+        self.model = AutoModelForSeq2SeqLM.from_pretrained(path)
         self.model.eval()
         self.device = torch.device("cpu")
     @torch.inference_mode()
     def __call__(self, data):
         inputs = data.get("inputs")
         if inputs is None:
+            raise ValueError("Missing required field 'inputs'.")
         if isinstance(inputs, str):
             prompt = inputs
         elif isinstance(inputs, dict):
             question = inputs.get("question", "")
             prompt = f"{context}\n{self.system_prompt}\n{question}\n"
         else:
+            raise ValueError("inputs must be a string or dict.")
         enc = self.tokenizer(prompt, return_tensors="pt")
         input_ids = enc["input_ids"]
         attention_mask = enc["attention_mask"]
+        # Keep only the most recent MAX_INPUT_TOKENS (512) tokens; earlier tokens are dropped from the left
         if input_ids.shape[1] > MAX_INPUT_TOKENS:
             input_ids = input_ids[:, -MAX_INPUT_TOKENS:]
             attention_mask = attention_mask[:, -MAX_INPUT_TOKENS:]
         input_ids = input_ids.to(self.device)
         attention_mask = attention_mask.to(self.device)
         outputs = self.model.generate(
             input_ids=input_ids,
             attention_mask=attention_mask,
+            do_sample=False,
         )
         answer = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
+        return {"generated_text": answer}