Update README.md
Browse files
README.md
CHANGED
|
@@ -243,7 +243,7 @@ from llama_cpp import Llama
|
|
| 243 |
|
| 244 |
# Set n_gpu_layers to the number of layers to offload to GPU. Set to 0 if no GPU acceleration is available on your system.
|
| 245 |
llm = Llama(
|
| 246 |
-
model_path="./
|
| 247 |
n_ctx=2048, # The max sequence length to use - note that longer sequence lengths require much more resources
|
| 248 |
n_threads=8, # The number of CPU threads to use, tailor to your system and the resulting performance
|
| 249 |
n_gpu_layers=35 # The number of layers to offload to GPU, if you have GPU acceleration available
|
|
@@ -251,7 +251,7 @@ llm = Llama(
|
|
| 251 |
|
| 252 |
# Simple inference example
|
| 253 |
output = llm(
|
| 254 |
-
"
|
| 255 |
max_tokens=512, # Generate up to 512 tokens
|
| 256 |
stop=["</s>"], # Example stop token - not necessarily correct for this specific model! Please check before using.
|
| 257 |
echo=True # Whether to echo the prompt
|
|
@@ -259,7 +259,7 @@ output = llm(
|
|
| 259 |
|
| 260 |
# Chat Completion API
|
| 261 |
|
| 262 |
-
llm = Llama(model_path="./
|
| 263 |
llm.create_chat_completion(
|
| 264 |
messages = [
|
| 265 |
{"role": "system", "content": "You are a story writing assistant."},
|
|
|
|
| 243 |
|
| 244 |
# Set n_gpu_layers to the number of layers to offload to GPU. Set to 0 if no GPU acceleration is available on your system.
|
| 245 |
llm = Llama(
|
| 246 |
+
model_path="./NT-Java-1.1B_Q4_K_M.gguf", # Download the model file first
|
| 247 |
n_ctx=2048, # The max sequence length to use - note that longer sequence lengths require much more resources
|
| 248 |
n_threads=8, # The number of CPU threads to use, tailor to your system and the resulting performance
|
| 249 |
n_gpu_layers=35 # The number of layers to offload to GPU, if you have GPU acceleration available
|
|
|
|
| 251 |
|
| 252 |
# Simple inference example
|
| 253 |
output = llm(
|
| 254 |
+
"{prompt}", # Prompt
|
| 255 |
max_tokens=512, # Generate up to 512 tokens
|
| 256 |
stop=["</s>"], # Example stop token - not necessarily correct for this specific model! Please check before using.
|
| 257 |
echo=True # Whether to echo the prompt
|
|
|
|
| 259 |
|
| 260 |
# Chat Completion API
|
| 261 |
|
| 262 |
+
llm = Llama(model_path="./NT-Java-1.1B_Q4_K_M.gguf", chat_format="llama-2") # Set chat_format according to the model you are using
|
| 263 |
llm.create_chat_completion(
|
| 264 |
messages = [
|
| 265 |
{"role": "system", "content": "You are a story writing assistant."},
|