taylorj94
/

Llama-3.2-1B

@@ -1,33 +1,26 @@
 import os
 import torch
 from llama_cpp import Llama  # Library for GGUF model handling
-from typing import Any, List, Dict, Union
-from transformers import LogitsProcessorList
class FixedVocabLogitsProcessor(torch.nn.Module):
    """
    A custom logits processor for GGUF-compatible models.

    Restricts generation to a fixed vocabulary by overwriting the score of
    every token ID that is not in ``allowed_ids`` with ``fill_value``.
    """

    def __init__(self, allowed_ids: set[int], fill_value=float('-inf')):
        """
        Args:
            allowed_ids (set[int]): Token IDs that remain selectable.
            fill_value (float): Score written to every other token ID.
        """
        super().__init__()
        self.allowed_ids = allowed_ids
        self.fill_value = fill_value

    def __call__(self, input_ids: torch.Tensor, scores: torch.Tensor) -> torch.Tensor:
        """
        Modify logits to restrict to allowed token IDs.

        Args:
            input_ids (torch.Tensor): Input IDs (unused; kept for the
                processor call signature).
            scores (torch.Tensor): Logits scores, indexed as
                ``scores[:, token_id]``. Modified in place.

        Returns:
            torch.Tensor: The same ``scores`` tensor, with every column whose
            index is not in ``allowed_ids`` set to ``fill_value``.
        """
        vocab_size = scores.size(1)
        # Only IDs that actually index a column matter; anything outside
        # [0, vocab_size) is ignored, exactly as the per-column loop did.
        kept = [tid for tid in self.allowed_ids if 0 <= tid < vocab_size]

        # One boolean mask + one assignment instead of a Python-level loop
        # over the whole vocabulary on every generation step.
        banned = torch.ones(vocab_size, dtype=torch.bool, device=scores.device)
        if kept:
            banned[torch.tensor(kept, dtype=torch.long, device=scores.device)] = False
        scores[:, banned] = self.fill_value
        return scores
 class EndpointHandler:
@@ -38,8 +31,8 @@ class EndpointHandler:
             path (str): Path to the GGUF file.
         """
         self.model = Llama.from_pretrained(
-            repo_id="taylorj94/Llama-3.2-1B",
-            filename="model.gguf",
         )
         self.tokenizer = self.model.tokenizer  # GGUF-specific tokenizer, if available
@@ -54,8 +47,11 @@ class EndpointHandler:
         # Extract inputs and parameters
         inputs = data.pop("inputs", data)
         parameters = data.pop("parameters", {})
-        vocab_list = data.pop("vocab_list", None)
         if not vocab_list:
             raise ValueError("You must provide a 'vocab_list' to define allowed tokens.")
@@ -68,19 +64,15 @@ class EndpointHandler:
         # Tokenize input
         input_ids = self.model.tokenize(inputs)
-        # Prepare logits processor
-        logits_processor = LogitsProcessorList([
-            FixedVocabLogitsProcessor(allowed_ids)
-        ])
         # Perform inference
         output_ids = self.model.generate(
-            input_ids=input_ids,
             max_tokens=parameters.get("max_length", 30),
-            logits_processor=logits_processor
         )
         # Decode the output
         generated_text = self.model.detokenize(output_ids)
-        return [{"generated_text": generated_text}]

 import os
 import torch
 from llama_cpp import Llama  # Library for GGUF model handling
+from typing import Any, List, Dict
class FixedVocabLogitsProcessor:
    """
    A custom logits processor for GGUF-compatible models.

    Restricts generation to a fixed vocabulary by overwriting the logit of
    every token ID that is not in ``allowed_ids`` with ``fill_value``.
    """

    def __init__(self, allowed_ids: set[int], fill_value=float('-inf')):
        """
        Args:
            allowed_ids (set[int]): Token IDs that remain selectable.
            fill_value (float): Value written to every other position.
        """
        self.allowed_ids = allowed_ids
        self.fill_value = fill_value

    def __call__(self, input_ids, scores):
        """
        Two-argument entry point, ``processor(input_ids, scores)``.

        Lets an instance be used directly wherever a callable taking
        ``(input_ids, scores)`` is expected; ``input_ids`` is ignored and the
        work is delegated to :meth:`apply`.
        """
        return self.apply(scores)

    def apply(self, logits: torch.FloatTensor):
        """
        Modify logits to restrict to allowed token IDs.

        Args:
            logits: Sequence of logits indexed by token ID along its first
                dimension. Modified in place.

        Returns:
            The same ``logits`` object, with every index not in
            ``allowed_ids`` set to ``fill_value``.
        """
        size = len(logits)

        if isinstance(logits, torch.Tensor):
            # Vectorized path: one boolean mask + one assignment instead of a
            # Python-level loop over the whole vocabulary on every step.
            # IDs outside [0, size) are ignored, as in the generic loop below.
            kept = [tid for tid in self.allowed_ids if 0 <= tid < size]
            banned = torch.ones(size, dtype=torch.bool, device=logits.device)
            if kept:
                banned[torch.tensor(kept, dtype=torch.long, device=logits.device)] = False
            logits[banned] = self.fill_value
            return logits

        # Generic fallback for any other mutable, indexable sequence
        # (e.g. a plain list), preserving the original element-wise behavior.
        for token_id in range(size):
            if token_id not in self.allowed_ids:
                logits[token_id] = self.fill_value
        return logits
 class EndpointHandler:
             path (str): Path to the GGUF file.
         """
         self.model = Llama.from_pretrained(
+        	repo_id="taylorj94/Llama-3.2-1B",
+        	filename="model.gguf",
         )
         self.tokenizer = self.model.tokenizer  # GGUF-specific tokenizer, if available
         # Extract inputs and parameters
         inputs = data.pop("inputs", data)
         parameters = data.pop("parameters", {})
+        print('Debug 1')
+        vocab_list = data.pop("vocab_list", [])
+        print('Debug 2')
         if not vocab_list:
             raise ValueError("You must provide a 'vocab_list' to define allowed tokens.")
         # Tokenize input
         input_ids = self.model.tokenize(inputs)
+        print('Debug 3')
         # Perform inference
         output_ids = self.model.generate(
+            input_ids,
             max_tokens=parameters.get("max_length", 30),
+            logits_processor=lambda logits: FixedVocabLogitsProcessor(allowed_ids).apply(logits)
         )
         # Decode the output
         generated_text = self.model.detokenize(output_ids)
+        return [{"generated_text": generated_text}]