OzTianlu committed (verified)
Commit bd6a668 · Parent(s): 8f3f2c2

Update handler.py

Files changed (1): handler.py (+61, -103)
handler.py CHANGED
@@ -1,129 +1,87 @@
 # handler.py
-from __future__ import annotations
-
-from typing import Any, Dict, List, Union
-
+from typing import Any, Dict, List
 import torch
-from transformers import AutoModelForCausalLM, AutoTokenizer
-
+from transformers import AutoTokenizer, AutoModelForCausalLM

 Json = Dict[str, Any]
-Messages = List[Dict[str, str]]  # [{"role":"user|assistant|system", "content":"..."}]
-
-
-def _is_messages(x: Any) -> bool:
-    return (
-        isinstance(x, list)
-        and len(x) > 0
-        and all(isinstance(m, dict) and "role" in m and "content" in m for m in x)
-    )
-

 class EndpointHandler:
     """
-    Hugging Face Inference Endpoints custom handler.
-    Expects:
-      - request body is a dict
-      - always contains `inputs`
-      - may contain `parameters` for generation
+    Minimal custom handler for Hugging Face Inference Endpoints.
+
+    Implements __init__() to load the model/tokenizer,
+    and __call__() to handle inference requests.
     """

     def __init__(self, model_dir: str):
-        self.model_dir = model_dir
-
-        # Pick dtype/device
-        self.device = "cuda" if torch.cuda.is_available() else "cpu"
-        if self.device == "cuda":
-            # bfloat16 is usually safe on A100/H100; if your instance doesn't support bf16, change to float16
-            self.dtype = torch.bfloat16
-        else:
-            self.dtype = torch.float32
-
-        # IMPORTANT: trust_remote_code=True because repo contains AsteriskForCausalLM.py + auto_map
+        """
+        Called once on endpoint startup.
+
+        Args:
+            model_dir (str): Local path where the model repo was downloaded.
+        """
+        # Load tokenizer and model
+        # Set trust_remote_code=True if the model repo has custom code
         self.tokenizer = AutoTokenizer.from_pretrained(
             model_dir,
-            trust_remote_code=True,
+            trust_remote_code=True,  # allow custom code in repo
             use_fast=True,
         )

-        # Make sure pad token exists (your config uses pad_token_id=2 which equals eos_token_id in many llama-like models)
-        if self.tokenizer.pad_token_id is None and self.tokenizer.eos_token_id is not None:
-            self.tokenizer.pad_token = self.tokenizer.eos_token
-
         self.model = AutoModelForCausalLM.from_pretrained(
             model_dir,
             trust_remote_code=True,
-            torch_dtype=self.dtype,
-            device_map="auto" if self.device == "cuda" else None,
         )

-        if self.device != "cuda":
-            self.model.to(self.device)
-
+        # Put model in eval mode
         self.model.eval()

     @torch.inference_mode()
-    def __call__(self, data: Json) -> Union[Json, List[Json]]:
+    def __call__(self, data: Json) -> List[Json]:
+        """
+        Called for each inference request.
+
+        Args:
+            data (dict): {"inputs": str or list[str], "parameters": {...}}
+
+        Returns:
+            List[dict]: list of output dicts (each must be serializable).
+        """
+        # Parse incoming prompt(s)
         inputs = data.get("inputs", "")
         params = data.get("parameters", {}) or {}

-        # Generation defaults (can be overridden via `parameters`)
-        max_new_tokens = int(params.get("max_new_tokens", 256))
-        temperature = float(params.get("temperature", 0.7))
-        top_p = float(params.get("top_p", 0.95))
-        top_k = int(params.get("top_k", 0))
-        repetition_penalty = float(params.get("repetition_penalty", 1.0))
-
-        do_sample = bool(params.get("do_sample", temperature > 0))
-        num_beams = int(params.get("num_beams", 1))
-
-        def _one(item: Any) -> Json:
-            # Accept:
-            #   1) string prompt
-            #   2) messages list: [{"role":"user","content":"..."}]
-            #   3) dict {"messages":[...]} (common chat style)
-            if isinstance(item, dict) and "messages" in item:
-                item = item["messages"]
-
-            if _is_messages(item):
-                rendered = self.tokenizer.apply_chat_template(
-                    item,
-                    tokenize=False,
-                    add_generation_prompt=True,
-                )
-                enc = self.tokenizer(rendered, return_tensors="pt")
-                input_ids = enc["input_ids"]
-                attention_mask = enc.get("attention_mask", None)
-            else:
-                enc = self.tokenizer(str(item), return_tensors="pt")
-                input_ids = enc["input_ids"]
-                attention_mask = enc.get("attention_mask", None)
-            input_ids = input_ids.to(self.model.device)
-            if attention_mask is not None:
-                attention_mask = attention_mask.to(self.model.device)
-
-
-            gen_ids = self.model.generate(
-                input_ids=input_ids,
-                attention_mask=attention_mask,
-                max_new_tokens=max_new_tokens,
-                do_sample=do_sample,
-                temperature=temperature if do_sample else None,
-                top_p=top_p if do_sample else None,
-                top_k=top_k if do_sample and top_k > 0 else None,
-                num_beams=num_beams,
-                repetition_penalty=repetition_penalty,
-                pad_token_id=self.tokenizer.pad_token_id,
-                eos_token_id=self.tokenizer.eos_token_id,
-            )
-
-            # Only return newly generated tokens
-            new_tokens = gen_ids
-            text = self.tokenizer.decode(new_tokens, skip_special_tokens=True)
-            return {"generated_text": text}
-
-        # Batch support
-        if isinstance(inputs, list) and not _is_messages(inputs):
-            return [_one(x) for x in inputs]
-        else:
-            return _one(inputs)
+        # Tokenize
+        enc = self.tokenizer(
+            inputs,
+            return_tensors="pt",
+            padding=True,
+        )
+
+        input_ids = enc["input_ids"]
+        attention_mask = enc["attention_mask"]
+
+        # Move tensors to model device
+        device = next(self.model.parameters()).device
+        input_ids = input_ids.to(device)
+        attention_mask = attention_mask.to(device)
+
+        # Generation parameters (optional overrides)
+        max_new_tokens = int(params.get("max_new_tokens", 128))
+        temperature = float(params.get("temperature", 1.0))
+
+        # Run generation
+        output_ids = self.model.generate(
+            input_ids,
+            attention_mask=attention_mask,
+            max_new_tokens=max_new_tokens,
+            temperature=temperature,
+        )
+
+        # Decode to text
+        outputs = []
+        for seq in output_ids:
+            text = self.tokenizer.decode(seq, skip_special_tokens=True)
+            outputs.append({"generated_text": text})
+
+        return outputs
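
As a quick sanity check before redeploying, the updated handler can be exercised locally with the same request shape the endpoint receives. The sketch below is not part of the commit: the ./model path and the prompts are placeholders, and batched inputs assume the tokenizer defines a pad token, since __call__ now tokenizes with padding=True.

# smoke_test.py: hypothetical local check, not part of this commit
from handler import EndpointHandler

handler = EndpointHandler("./model")  # placeholder: local snapshot of the model repo

# Body shape matches the __call__ docstring:
# {"inputs": str or list[str], "parameters": {...}}
request = {
    "inputs": ["Hello, who are you?", "Write one sentence about GPUs."],
    "parameters": {"max_new_tokens": 64, "temperature": 0.8},
}

for item in handler(request):
    print(item["generated_text"])

A deployed endpoint receives the same JSON body via POST. Note that each {"generated_text": ...} entry contains the prompt plus the continuation, since the new code decodes the full output sequence rather than only the newly generated tokens.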