Update handler.py
handler.py  CHANGED  (+51 -70)
@@ -1,71 +1,70 @@
 # handler.py
-from typing import Any, Dict, List, Union
-import torch
 from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
+import torch
 
-MAX_INPUT_TOKENS = 512
+MODEL_NAME = "."  # HF mounts the repo at /repository, so "." loads local files
+MAX_INPUT_TOKENS = 512
 
 
 class EndpointHandler:
-    """
-    HF Inference Endpoints custom handler that reproduces the exact style of
-    your shared Colab code:
-      - slow tokenizer (use_fast=False)
-      - Seq2Seq model
-      - deterministic generation by default (do_sample=False)
-      - decode skip_special_tokens=True
-      - if input > 512 tokens, keep only the MOST RECENT tokens (left-truncate)
-    """
-
     def __init__(self, path: str = ""):
-        self.tokenizer = AutoTokenizer.from_pretrained(
-        self.model = AutoModelForSeq2SeqLM.from_pretrained(
+        # EXACTLY your loading logic (no use_fast, no overrides)
+        self.tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
+        self.model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)
 
         self.model.eval()
         self.device = torch.device("cpu")
         self.model.to(self.device)
 
+        # Your exact system prompt
+        self.system_prompt = (
+            "You are Teapot, an open-source AI assistant optimized for low-end devices, "
+            "providing short, accurate responses without hallucinating while excelling at "
+            "information extraction and text summarization. "
+            "If the context does not answer the question, reply exactly: "
+            "'I am sorry but I don't have any information on that'."
+        )
+
     @torch.inference_mode()
-    def __call__(self, data
+    def __call__(self, data):
         """
+        Expected input format:
+        {
+            "inputs": {
+                "context": "...",
+                "question": "..."
+            }
+        }
+        OR
+        {
+            "inputs": "full prebuilt prompt string"
+        }
         """
-        if "inputs" not in data:
-            raise ValueError("Missing required field 'inputs'.")
 
-        inputs = data
+        inputs = data.get("inputs")
+
+        if inputs is None:
+            raise ValueError("Missing 'inputs' field")
 
+        # Support BOTH:
+        # 1) Full prompt string (closest to your ask() function)
+        # 2) Structured {context, question}
         if isinstance(inputs, str):
+            prompt = inputs
+        elif isinstance(inputs, dict):
+            context = inputs.get("context", "")
+            question = inputs.get("question", "")
+            prompt = f"{context}\n{self.system_prompt}\n{question}\n"
         else:
+            raise ValueError("inputs must be a string or dict with context/question")
 
-        enc = self.tokenizer(
-            prompts,
-            return_tensors="pt",
-            padding=True,
-            truncation=False,
-        )
+        # EXACT tokenizer call like your code
+        enc = self.tokenizer(prompt, return_tensors="pt")
 
         input_ids = enc["input_ids"]
         attention_mask = enc["attention_mask"]
 
+        # NEW requirement: truncate to MOST RECENT 512 tokens
         if input_ids.shape[1] > MAX_INPUT_TOKENS:
             input_ids = input_ids[:, -MAX_INPUT_TOKENS:]
             attention_mask = attention_mask[:, -MAX_INPUT_TOKENS:]
@@ -73,34 +72,16 @@ class EndpointHandler:
         input_ids = input_ids.to(self.device)
         attention_mask = attention_mask.to(self.device)
 
-        # Keep them overrideable via "parameters".
-        gen_kwargs = {
-            "do_sample": params.pop("do_sample", False),
-        }
-
-        # Optional knobs (only applied if provided)
-        if "max_new_tokens" in params:
-            gen_kwargs["max_new_tokens"] = params.pop("max_new_tokens")
-        if "num_beams" in params:
-            gen_kwargs["num_beams"] = params.pop("num_beams")
-        if "temperature" in params:
-            gen_kwargs["temperature"] = params.pop("temperature")
-        if "top_p" in params:
-            gen_kwargs["top_p"] = params.pop("top_p")
-        if "top_k" in params:
-            gen_kwargs["top_k"] = params.pop("top_k")
-
-        # Allow any remaining generate() kwargs through, in case you pass them
-        gen_kwargs.update(params)
-
+        # EXACT generation call from your snippet
         outputs = self.model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
+           do_sample=False
        )
 
+        # EXACT decode logic
+        answer = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
 
+        return {
+            "generated_text": answer
+        }
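
For quick verification, a minimal smoke test sketch, assuming it is run from the repository root so that MODEL_NAME = "." resolves to the local model files; the context and question strings below are illustrative placeholders, not part of this commit:

# smoke_test.py — hypothetical local check of the updated handler
from handler import EndpointHandler

handler = EndpointHandler()

# 1) Structured input: __call__ builds the prompt as context + system prompt + question
result = handler({
    "inputs": {
        "context": "Teapot is an open-source assistant optimized for low-end devices.",
        "question": "What kind of devices does Teapot target?",
    }
})
print(result["generated_text"])

# 2) Prebuilt prompt string: tokenized as-is, left-truncated to the last 512 tokens if longer
result = handler({"inputs": "full prebuilt prompt string"})
print(result["generated_text"])

A deployed Inference Endpoint passes the same JSON body to __call__ as data, so a POST with either payload shape should come back as the same {"generated_text": ...} object.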