ongilLabs committed on
Commit
982f6ce
·
verified ·
1 Parent(s): 06b1920

Add custom handler for Inference Endpoints

Browse files
Files changed (1) hide show
  1. handler.py +93 -0
handler.py ADDED
@@ -0,0 +1,93 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Custom handler for Hugging Face Inference Endpoints
3
+ Model: ongilLabs/IB-Math-Instruct-7B
4
+ """
5
+
6
+ from typing import Dict, List, Any
7
+ from transformers import AutoModelForCausalLM, AutoTokenizer
8
+ import torch
9
+
10
+
11
class EndpointHandler:
    """Custom inference handler for ongilLabs/IB-Math-Instruct-7B.

    Loads the tokenizer and model once at endpoint startup; each request is
    served through ``__call__`` with chat-template formatting and greedy or
    sampled generation.
    """

    def __init__(self, path: str = ""):
        """Load tokenizer and model from *path* and switch to eval mode.

        Args:
            path: Local directory (or hub repo id) containing the model.
        """
        self.tokenizer = AutoTokenizer.from_pretrained(
            path,
            trust_remote_code=True
        )
        self.model = AutoModelForCausalLM.from_pretrained(
            path,
            torch_dtype=torch.bfloat16,
            device_map="auto",
            trust_remote_code=True
        )
        self.model.eval()

        # Default system prompt used when the request does not supply one.
        self.default_system = """You are an expert IB Mathematics tutor. When solving problems:
1. Show your work step by step
2. Explain your reasoning clearly
3. Use proper mathematical notation
4. Provide the final answer clearly marked"""

    def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]:
        """
        Handle an inference request.

        Args:
            data: Dictionary with 'inputs' (prompt string or chat message
                list) and optional 'parameters' (max_new_tokens, temperature,
                top_p, system_prompt).

        Returns:
            ``{"generated_text": str}`` on success, ``{"error": str}`` on
            malformed input.
        """
        inputs = data.get("inputs", "")
        # `or {}` also guards against an explicit `"parameters": null` in the
        # JSON payload, which `.get(..., {})` alone would not catch.
        parameters = data.get("parameters") or {}

        # Extract generation parameters with defaults.
        max_new_tokens = parameters.get("max_new_tokens", 1024)
        temperature = parameters.get("temperature", 0.7)
        top_p = parameters.get("top_p", 0.9)
        system_prompt = parameters.get("system_prompt", self.default_system)

        # Normalize the input into a chat message list; None means invalid.
        messages = self._build_messages(inputs, system_prompt)
        if messages is None:
            return {"error": "Invalid input format. Expected string or list of messages."}

        # Apply the model's chat template to produce the raw prompt string.
        prompt = self.tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True
        )

        model_inputs = self.tokenizer(prompt, return_tensors="pt").to(self.model.device)

        do_sample = temperature > 0
        gen_kwargs: Dict[str, Any] = {
            "max_new_tokens": max_new_tokens,
            "do_sample": do_sample,
            "pad_token_id": self.tokenizer.eos_token_id,
        }
        # Only pass sampling knobs when sampling: supplying temperature/top_p
        # with do_sample=False has no effect and triggers transformers warnings.
        if do_sample:
            gen_kwargs["temperature"] = temperature
            gen_kwargs["top_p"] = top_p

        with torch.no_grad():
            outputs = self.model.generate(**model_inputs, **gen_kwargs)

        # Decode only the newly generated tokens (exclude the prompt).
        response = self.tokenizer.decode(
            outputs[0][model_inputs["input_ids"].shape[1]:],
            skip_special_tokens=True
        )

        return {"generated_text": response}

    def _build_messages(self, inputs: Any, system_prompt: str):
        """Normalize *inputs* into a chat message list.

        Returns the message list, or None when *inputs* is neither a string
        nor a list of dicts (previously a malformed list raised
        AttributeError and surfaced as a 500 instead of an error response).
        """
        if isinstance(inputs, str):
            return [
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": inputs}
            ]
        if isinstance(inputs, list):
            # Reject lists whose elements are not message dicts.
            if not all(isinstance(m, dict) for m in inputs):
                return None
            messages = list(inputs)
            # Prepend the system prompt if the caller did not include one.
            if messages and messages[0].get("role") != "system":
                messages = [{"role": "system", "content": system_prompt}] + messages
            return messages
        return None