vivekchakraverty committed on
Commit
6246295
·
1 Parent(s): 2709f63

Restore max_new_tokens to 512 (4-bit gen is fast: ~25 tok/s on GPU)

Browse files
Files changed (1) hide show
  1. generate.py +1 -1
generate.py CHANGED
@@ -71,7 +71,7 @@ def _render(messages, tok) -> str:
71
 
72
 
73
  @GPU(duration=180)
74
- def generate(messages: list[dict], max_new_tokens: int = 256,
75
  temperature: float = 0.2) -> str:
76
  """Generate an assistant reply for chat-format ``messages``."""
77
  if STUB:
 
71
 
72
 
73
  @GPU(duration=180)
74
+ def generate(messages: list[dict], max_new_tokens: int = 512,
75
  temperature: float = 0.2) -> str:
76
  """Generate an assistant reply for chat-format ``messages``."""
77
  if STUB: