Update app.py
Browse files
app.py
CHANGED
|
@@ -49,7 +49,16 @@ def load_model():
|
|
| 49 |
logging.info("uploading model from hf pub")
|
| 50 |
#model_path = '/content/llama.cpp/models/llama-2-7b-chat.ggmlv3.q4_K_M.bin'
|
| 51 |
llm = LlamaCpp(model_path=model_path, n_ctx=4096)
|
| 52 |
-
llm_chain = LLMChain(llm=llm, prompt=prompt)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 53 |
#llm_chain = ConversationChain(llm=llm, prompt=promptmemory=ConversationBufferMemory())
|
| 54 |
logging.info("uploading model done")
|
| 55 |
return llm_chain
|
|
|
|
| 49 |
logging.info("uploading model from hf pub")
|
| 50 |
#model_path = '/content/llama.cpp/models/llama-2-7b-chat.ggmlv3.q4_K_M.bin'
|
| 51 |
llm = LlamaCpp(model_path=model_path, n_ctx=4096)
|
| 52 |
+
#llm_chain = LLMChain(llm=llm, prompt=prompt)
|
| 53 |
+
n_gpu_layers = 1 # Change this value based on your model and your GPU VRAM pool.
|
| 54 |
+
n_batch = 512 # Should be between 1 and n_ctx, consider the amount of VRAM in your GPU.
|
| 55 |
+
llm = LlamaCpp(model_path=model_path, n_ctx=2048,
|
| 56 |
+
input={"temperature": 0.75, "max_length": 2000, "top_p": 1},
|
| 57 |
+
callback_manager=callback_manager,
|
| 58 |
+
n_gpu_layers=n_gpu_layers,
|
| 59 |
+
n_batch=n_batch,
|
| 60 |
+
verbose=True,)
|
| 61 |
+
|
| 62 |
#llm_chain = ConversationChain(llm=llm, prompt=promptmemory=ConversationBufferMemory())
|
| 63 |
logging.info("uploading model done")
|
| 64 |
return llm_chain
|