fix(remove): use_cached_output is not an option
main.py CHANGED

@@ -64,7 +64,6 @@ engine_llama_3_2: LLM = LLM(
     # Your Tesla T4 GPU has compute capability 7.5.
     # You can use float16 instead by explicitly setting the `dtype` flag in CLI, for example: --dtype=half.
     dtype='half',  # Use 'half' for T4
-    use_cached_outputs=True,  # Enable caching
 )

 # ValueError: max_num_batched_tokens (512) is smaller than max_model_len (32768).
@@ -80,7 +79,6 @@ engine_sailor_chat: LLM = LLM(
     max_model_len=32768,
     enforce_eager=True,  # Disable CUDA graph
     dtype='half',  # Use 'half' for T4
-    use_cached_outputs=True,  # Enable caching
 )

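For context on the removal: `use_cached_outputs` is an internal flag of vLLM's engine plumbing (the `LLM` wrapper sets it itself when it builds its engine), not a keyword the public `LLM`/`EngineArgs` constructor accepts, so passing it fails with an unexpected-keyword `TypeError` before any model loads. Below is a minimal sketch of one engine as it stands after this commit. The checkpoint name is a placeholder (the Space's real model IDs are not in this diff), the `max_num_batched_tokens` value is an assumption added to clear the ValueError quoted in the comments, and `enable_prefix_caching` is mentioned only as the supported caching knob in case caching was the original intent.

from vllm import LLM

# Sketch of the corrected constructor after this commit.
# Assumptions: the model ID is hypothetical, and max_num_batched_tokens is
# raised to satisfy vLLM's check that it is >= max_model_len.
engine_sailor_chat: LLM = LLM(
    model="sail/Sailor-1.8B-Chat",  # placeholder checkpoint for illustration
    max_model_len=32768,
    max_num_batched_tokens=32768,   # avoids: ValueError: max_num_batched_tokens (512) is smaller than max_model_len (32768)
    enforce_eager=True,             # Disable CUDA graph
    dtype='half',                   # T4 (compute capability 7.5) has no bfloat16 support
    enable_prefix_caching=True,     # supported caching option; use_cached_outputs is not
)

The same two-line change (drop `use_cached_outputs`) applies to `engine_llama_3_2` in the first hunk.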