Update README.md
Browse files
README.md
CHANGED
|
@@ -57,14 +57,12 @@ tokenizer = AutoTokenizer.from_pretrained(
|
|
| 57 |
trust_remote_code=True
|
| 58 |
)
|
| 59 |
|
| 60 |
-
# IMPORTANT: Load with flash-attention for correct behavior
|
| 61 |
model, tokenizer = FastLanguageModel.from_pretrained(
|
| 62 |
model_name=model_id,
|
| 63 |
token=hf_token,
|
| 64 |
max_seq_length=2048, # Adjust based on your memory constraints
|
| 65 |
dtype=None, # Auto-detect best dtype
|
| 66 |
load_in_4bit=True, # Use 4-bit quantization for efficiency
|
| 67 |
-
use_flash_attention=True # REQUIRED for correct results
|
| 68 |
)
|
| 69 |
|
| 70 |
# Enable fast inference mode
|
|
|
|
| 57 |
trust_remote_code=True
|
| 58 |
)
|
| 59 |
|
|
|
|
| 60 |
model, tokenizer = FastLanguageModel.from_pretrained(
|
| 61 |
model_name=model_id,
|
| 62 |
token=hf_token,
|
| 63 |
max_seq_length=2048, # Adjust based on your memory constraints
|
| 64 |
dtype=None, # Auto-detect best dtype
|
| 65 |
load_in_4bit=True, # Use 4-bit quantization for efficiency
|
|
|
|
| 66 |
)
|
| 67 |
|
| 68 |
# Enable fast inference mode
|