busybisi committed
Commit 0aee091 · verified · 1 Parent(s): 7faf46b

Upload handler.py with huggingface_hub

Files changed (1): handler.py +9 -15
handler.py CHANGED

@@ -5,7 +5,8 @@ import torch
 
 class EndpointHandler:
     """
-    Custom handler for DoloresAI model on HuggingFace Inference Endpoints.
+    Custom handler for DoloresAI model - GREEDY DECODING ONLY
+    This avoids sampling issues with resized embeddings.
     """
 
     def __init__(self, path=""):
@@ -30,18 +31,14 @@ class EndpointHandler:
 
     def __call__(self, data: Dict[str, Any]) -> List[Dict[str, str]]:
         """
-        Process inference requests.
+        Process inference requests using GREEDY DECODING ONLY.
 
         Args:
            data (Dict): Input data with format:
                {
                    "inputs": str,  # The prompt text
                    "parameters": {  # Optional generation parameters
-                       "max_new_tokens": int,
-                       "temperature": float,
-                       "top_p": float,
-                       "do_sample": bool,
-                       "repetition_penalty": float
+                       "max_new_tokens": int
                    }
                }
 
@@ -52,13 +49,9 @@
        inputs = data.pop("inputs", data)
        parameters = data.pop("parameters", {})
 
-       # Default generation parameters - use greedy decoding to avoid sampling issues
+       # Get max tokens (only parameter we use)
        max_new_tokens = parameters.get("max_new_tokens", 512)
 
-       # Use greedy decoding (do_sample=False) to avoid probability tensor issues
-       # This is more stable for models with potential embedding issues
-       do_sample = False  # Force greedy decoding
-
        # Tokenize input
        input_ids = self.tokenizer(
            inputs,
@@ -67,13 +60,14 @@
            max_length=self.model.config.max_position_embeddings - max_new_tokens
        ).input_ids.to(self.model.device)
 
-       # Generate response with greedy decoding (no sampling)
+       # Generate response with GREEDY DECODING ONLY
+       # This is stable and avoids NaN/inf errors from sampling
        with torch.no_grad():
            outputs = self.model.generate(
                input_ids,
                max_new_tokens=max_new_tokens,
-               do_sample=False,  # Greedy decoding - most stable
-               num_beams=1,  # No beam search for speed
+               do_sample=False,  # GREEDY - no sampling
+               num_beams=1,  # No beam search
                pad_token_id=self.tokenizer.pad_token_id if self.tokenizer.pad_token_id is not None else self.tokenizer.eos_token_id,
                eos_token_id=self.tokenizer.eos_token_id,
            )
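For context, a minimal smoke-test sketch (not part of this commit) of how the updated handler is invoked: the import path and model directory below are assumptions, and the payload shape follows the docstring in the diff above.

# Minimal local smoke test (sketch, not part of this commit).
# Assumes handler.py is importable from the working directory and that
# MODEL_DIR points at the model weights; both are placeholders.
from handler import EndpointHandler

MODEL_DIR = "."  # placeholder: directory containing the model weights

handler = EndpointHandler(path=MODEL_DIR)
result = handler({
    "inputs": "Hello, who are you?",
    "parameters": {"max_new_tokens": 64},  # the only parameter read after this change
})
print(result)

Note that after this commit, any sampling parameters a client still sends (temperature, top_p, do_sample, repetition_penalty) are silently ignored; generation is always greedy.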