Fix: Add pad_token and attention_mask for proper generation

#1
Files changed (1) hide show
  1. handler.py +10 -2
handler.py CHANGED
@@ -15,6 +15,10 @@ class EndpointHandler:
15
 
16
  print(f"Loading tokenizer from {path}...")
17
  self.tokenizer = AutoTokenizer.from_pretrained(path)
 
 
 
 
18
 
19
  print(f"Loading model from {path} on device: {self.device}...")
20
  self.model = AutoModelForCausalLM.from_pretrained(
@@ -62,11 +66,15 @@ class EndpointHandler:
62
  print(f"Generating with parameters: {gen_kwargs}")
63
 
64
  # Tokenize input
65
- enc = self.tokenizer(inputs, return_tensors="pt").to(self.device)
 
 
66
 
67
  # Generate with no_grad for efficiency
68
  with torch.no_grad():
69
- out = self.model.generate(**enc, **gen_kwargs)
 
 
70
 
71
  # Decode output
72
  generated_text = self.tokenizer.decode(out[0], skip_special_tokens=True)
 
15
 
16
  print(f"Loading tokenizer from {path}...")
17
  self.tokenizer = AutoTokenizer.from_pretrained(path)
18
+
19
+ # Fall back to EOS as the pad token when the tokenizer defines none
20
+ if self.tokenizer.pad_token is None:
21
+ self.tokenizer.pad_token = self.tokenizer.eos_token
22
 
23
  print(f"Loading model from {path} on device: {self.device}...")
24
  self.model = AutoModelForCausalLM.from_pretrained(
 
66
  print(f"Generating with parameters: {gen_kwargs}")
67
 
68
  # Tokenize input
69
+ enc = self.tokenizer(inputs, return_tensors="pt",padding=True, # Enable padding
70
+ truncation=True, # Truncate if needed
71
+ max_length=2048).to(self.device)
72
 
73
  # Generate with no_grad for efficiency
74
  with torch.no_grad():
75
+ out = self.model.generate(**enc,
76
+ **gen_kwargs,
77
+ pad_token_id=self.tokenizer.pad_token_id) # Tell model which token is padding
78
 
79
  # Decode output
80
  generated_text = self.tokenizer.decode(out[0], skip_special_tokens=True)