prashanthbsp committed on
Commit
4225454
·
1 Parent(s): 974d5cf

add custom handler

Browse files
Files changed (1) hide show
  1. handler.py +28 -39
handler.py CHANGED
@@ -1,36 +1,25 @@
1
  from typing import Dict, List, Any
2
- from unsloth import FastLanguageModel
3
- class EndpointHandler():
4
- def __init__(self, path="prashanthbsp/DeepSeek-R1-Distill-Llama-8B-unsloth-bnb-4bit-reasoning-cpg-entity-v1"):
5
- # Preload all the elements you are going to need at inference.
6
- # pseudo:
7
- # self.model= load_model(path)
8
- max_seq_length = 2048
9
- dtype = None
10
- load_in_4bit = True
11
- model, tokenizer = FastLanguageModel.from_pretrained(
12
- model_name = path,
13
- max_seq_length = max_seq_length,
14
- dtype = dtype,
15
- load_in_4bit = load_in_4bit,
16
- )
17
- self.model = model
18
- self.tokenizer = tokenizer
19
 
20
- def __call__(self, data: Dict[str, Any]) -> List[Dict[str, Any]]:
 
 
 
 
 
 
21
  """
22
- data args:
23
- inputs (:obj: `str` | `PIL.Image` | `np.array`)
24
- kwargs
25
- Return:
26
- A :obj:`list` | `dict`: will be serialized and returned
27
  """
28
-
29
- # pseudo
30
- # self.model(input)
31
  inputs = data.pop("inputs", data)
32
  context = inputs.pop("context", inputs)
33
- prompt_style = """Below is an instruction that describes a task, paired with an input that provides further context.
 
 
34
  Write a response that appropriately completes the request.
35
  Before answering, think carefully about the task to ensure a logical and accurate response.
36
 
@@ -65,16 +54,16 @@ class EndpointHandler():
65
  }}
66
 
67
  ### Social Media Post:
68
- {0}
69
  ### Response:
70
- <think>{1}"""
71
- FastLanguageModel.for_inference(model)
72
- inputs = tokenizer([prompt_style.format(context, "")], return_tensors="pt").to("cuda")
73
- outputs = model.generate(
74
- input_ids=inputs.input_ids,
75
- attention_mask=inputs.attention_mask,
76
- max_new_tokens=1200,
77
- use_cache=True,
78
- )
79
- response = tokenizer.batch_decode(outputs)
80
- return response[0].split("### Response:")[1]
 
1
  from typing import Dict, List, Any
2
+ from transformers import AutoModelForCausalLM, AutoTokenizer
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3
 
4
+ class EndpointHandler:
5
+ def __init__(self, path="prashanthbsp/reasoning-cpg-entity-v1"):
6
+ # Standard HF model loading - compatible with TGI
7
+ self.tokenizer = AutoTokenizer.from_pretrained(path)
8
+ # Model is loaded by the TGI server, not by the handler
9
+
10
+ def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]:
11
  """
12
+ data args:
13
+ inputs: text or dict containing text
14
+ Return:
15
+ A dict with the model's response
 
16
  """
17
+ # Extract inputs
 
 
18
  inputs = data.pop("inputs", data)
19
  context = inputs.pop("context", inputs)
20
+
21
+ # Format prompt according to your requirements
22
+ prompt = f"""Below is an instruction that describes a task, paired with an input that provides further context.
23
  Write a response that appropriately completes the request.
24
  Before answering, think carefully about the task to ensure a logical and accurate response.
25
 
 
54
  }}
55
 
56
  ### Social Media Post:
57
+ {context}
58
  ### Response:
59
+ <think>"""
60
+
61
+ # For TGI, we return a dict with the prompt and generation params
62
+ return {
63
+ "inputs": prompt,
64
+ "parameters": {
65
+ "max_new_tokens": 1200,
66
+ "do_sample": False,
67
+ "return_full_text": False # Only return the generated text, not the prompt
68
+ }
69
+ }