Srijith Rajamohan committed on
Commit
1dcfe80
·
1 Parent(s): 64c0ea0

Added model inference code

Browse files
Files changed (1) hide show
  1. handler.py +29 -3
handler.py CHANGED
@@ -21,6 +21,10 @@ class EndpointHandler():
21
  quantization_config=None,
22
  torch_dtype=torch.float, # data type is float
23
  device_map="auto")
 
 
 
 
24
 
25
  def __call__(self, data: Dict[str, Any]) -> List[Dict[str, Any]]:
26
  """
@@ -31,8 +35,30 @@ class EndpointHandler():
31
  A :obj:`list` | `dict`: will be serialized and returned
32
  """
33
 
34
- # pseudo
35
  inputs = data.pop("inputs", data)
36
- #self.model(input)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
37
 
38
- return [{"outputs": inputs}]
 
21
  quantization_config=None,
22
  torch_dtype=torch.float, # data type is float
23
  device_map="auto")
24
+ self.tokenizer = AutoTokenizer.from_pretrained(path, trust_remote_code=True)
25
+ self.tokenizer.padding_side = "left"
26
+ self.tokenizer.pad_token = self.tokenizer.eos_token
27
+ self.tokenizer.add_eos_token = True
28
 
29
  def __call__(self, data: Dict[str, Any]) -> List[Dict[str, Any]]:
30
  """
 
35
  A :obj:`list` | `dict`: will be serialized and returned
36
  """
37
 
 
38
  inputs = data.pop("inputs", data)
39
+ messages = [
40
+ {
41
+ "role": "user",
42
+ "content": ""
43
+ + inputs,
44
+ },
45
+ ]
46
+ encodeds = self.tokenizer.apply_chat_template(messages, return_tensors="pt")
47
+ encoded_length = len(encodeds[0])
48
+ model_inputs = encodeds.to('cuda')
49
+ result = self.model.generate(model_inputs,
50
+ do_sample=False,
51
+ output_scores=True,
52
+ return_dict_in_generate=True,
53
+ output_attentions=True,
54
+ output_hidden_states=True,
55
+ #num_beams=3,
56
+ #no_repeat_ngram_size=1,
57
+ early_stopping = True,
58
+ #top_k=0,
59
+ max_new_tokens=400)
60
+ x, logits_gen = result.sequences, result.scores
61
+ x = x[:,encoded_length:]
62
+ decoded = self.tokenizer.batch_decode(x)
63
 
64
+ return [{"outputs": decoded[0]}]