Schmadge committed on
Commit
225c8e1
·
1 Parent(s): f54fe31

Update handler.py

Browse files
Files changed (1) hide show
  1. handler.py +53 -21
handler.py CHANGED
@@ -1,17 +1,51 @@
 
1
  import torch
2
-
3
- from typing import Any, Dict
4
  from transformers import AutoModelForCausalLM, AutoTokenizer
 
5
 
6
-
7
class EndpointHandler:
    """Custom Inference-Endpoints handler (pre-update version, removed by this commit).

    Loads a causal LM from *path* and a hard-coded GPT-NeoX tokenizer.
    """

    def __init__(self, path=''):
        # load model and tokenizer from path
        # NOTE(review): the tokenizer is pinned to EleutherAI/gpt-neox-20b while
        # the model comes from `path` — vocabularies only match if the deployed
        # checkpoint is NeoX-based; confirm against the actual model at `path`.
        self.tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-neox-20b")
        # trust_remote_code=True executes code shipped inside the model repo —
        # only safe for trusted checkpoints.
        self.model = AutoModelForCausalLM.from_pretrained(
            path, device_map="auto", torch_dtype=torch.float16, trust_remote_code=True
        )
        # device_map="auto" already places the model across devices; this
        # string is kept for moving input tensors in __call__.
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
15
 
16
  def __call__(self, data: Dict[str, Any]) -> Dict[str, str]:
17
  # process input
@@ -19,15 +53,13 @@ class EndpointHandler:
19
  parameters = data.pop("parameters", None)
20
 
21
  # preprocess
22
- inputs = self.tokenizer(inputs, return_tensors="pt").to(self.device)
23
-
24
- # pass inputs with all kwargs in data
25
- if parameters is not None:
26
- outputs = self.model.generate(**inputs, **parameters)
27
- else:
28
- outputs = self.model.generate(**inputs)
29
-
30
- # postprocess the prediction
31
- prediction = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
32
-
33
- return [{"generated_text": prediction}]
 
1
+ import warnings
2
  import torch
 
 
3
  from transformers import AutoModelForCausalLM, AutoTokenizer
4
+ from typing import Any, Dict
5
 
6
class InstructionTextGenerationPipeline:
    """Instruction-following text-generation handler built around MPT-7B-Instruct."""

    def __init__(
        self,
        path,
        torch_dtype=torch.bfloat16,
        trust_remote_code=True,
    ) -> None:
        # Model weights come from `path`. trust_remote_code=True executes code
        # shipped inside the checkpoint repo — only use with trusted sources.
        self.model = AutoModelForCausalLM.from_pretrained(
            path,
            torch_dtype=torch_dtype,
            trust_remote_code=trust_remote_code
        )
        # Tokenizer is pinned to the mosaicml/mpt-7b-instruct repo regardless
        # of `path` — assumes the model at `path` shares that vocabulary.
        tokenizer = AutoTokenizer.from_pretrained(
            "mosaicml/mpt-7b-instruct",
            trust_remote_code=trust_remote_code
        )
        if tokenizer.pad_token_id is None:
            warnings.warn(
                "pad_token_id is not set for the tokenizer. Using eos_token_id as pad_token_id."
            )
            # Reuse EOS as the padding token so generate() can pad batches.
            tokenizer.pad_token = tokenizer.eos_token

        tokenizer.padding_side = "right"  # "left"
        self.tokenizer = tokenizer

        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model.eval()
        # NOTE(review): dtype was already set via from_pretrained above, so the
        # dtype argument here is redundant (but harmless); the call's real job
        # is moving the model to self.device.
        self.model.to(device=self.device, dtype=torch_dtype)

        # Default generation settings; per-request "parameters" override these
        # in __call__ (temperature 0.01 with do_sample=True is near-greedy).
        self.generate_kwargs = {
            "temperature": 0.01,
            "top_p": 0.92,
            "top_k": 0,
            "max_new_tokens": 512,
            "use_cache": True,
            "do_sample": True,
            "eos_token_id": self.tokenizer.eos_token_id,
            "pad_token_id": self.tokenizer.pad_token_id,
            "repetition_penalty": 1.0
        }
46
+
47
+ def format_instruction(self, instruction):
48
+ return PROMPT_FOR_GENERATION_FORMAT.format(instruction=instruction)
49
 
50
  def __call__(self, data: Dict[str, Any]) -> Dict[str, str]:
51
  # process input
 
53
  parameters = data.pop("parameters", None)
54
 
55
  # preprocess
56
+ s = PROMPT_FOR_GENERATION_FORMAT.format(instruction=inputs)
57
+ input_ids = self.tokenizer(s, return_tensors="pt").input_ids.to(self.device)
58
+ gkw = {**self.generate_kwargs, **parameters}
59
+ # pass inputs with all kwargs in data
60
+ with torch.no_grad():
61
+ output_ids = self.model.generate(input_ids, **gkw)
62
+ # Slice the output_ids tensor to get only new tokens
63
+ new_tokens = output_ids[0, len(input_ids[0]) :]
64
+ output_text = self.tokenizer.decode(new_tokens, skip_special_tokens=True)
65
+ return [{"generated_text": output_text}]