| | import os |
| | import gradio as gr |
| | from llama_cpp import Llama |
| |
|
| | |
# Path to the GGUF model weights. Overridable via the MODEL_PATH environment
# variable so deployments can swap models without editing code; the default
# preserves the original hard-coded location.
MODEL_PATH = os.environ.get("MODEL_PATH", "./models/mistral.gguf")

# Load the model once at import time so every request reuses the same
# in-memory weights (Llama() is expensive: it reads the whole GGUF file).
llm = Llama(
    model_path=MODEL_PATH,
    n_ctx=2048,       # context window in tokens (prompt + completion)
    n_threads=9,      # NOTE(review): hard-coded — consider os.cpu_count(); 9 looks machine-specific
    n_batch=128,      # tokens processed per batch during prompt evaluation
    use_mlock=True,   # lock model pages in RAM to avoid swapping
    use_mmap=True,    # memory-map the weights instead of copying them
    verbose=False,
)
| |
|
| | |
def generate_response(prompt, history=None):
    """Stream a completion for *prompt*, yielding the accumulated reply text.

    Bug fix: ``gr.ChatInterface`` invokes its ``fn`` as ``fn(message,
    history)``, so the original one-parameter signature raised ``TypeError``
    on every chat turn. ``history`` is now accepted with a default of
    ``None`` — it is ignored (each prompt is answered statelessly), and
    existing single-argument callers keep working.

    Args:
        prompt: The user's message; wrapped in the Mistral ``[INST]`` tags.
        history: Prior chat turns supplied by Gradio; unused.

    Yields:
        Progressively longer strings — the full response accumulated so
        far — so the UI can render the answer as it streams.
    """
    stream = llm(
        prompt=f"[INST] {prompt.strip()} [/INST]",  # Mistral-instruct prompt template
        max_tokens=512,
        stop=["</s>"],   # stop at the model's end-of-sequence marker
        stream=True,     # get per-token chunks instead of one final string
    )
    partial = ""
    for chunk in stream:
        partial += chunk["choices"][0]["text"]
        yield partial  # yield the running text so the chat UI updates live
| |
|
| | |
# Wire the streaming generator into a chat UI and start the web server.
# ChatInterface calls fn(message, history) and renders each yielded string
# as the progressively updated assistant reply.
gr.ChatInterface(
    fn=generate_response,
    title="Leo9 AI Tutor",
    # Fixed user-facing copy (was: "An ai chatbots who answer any question.").
    description="An AI chatbot that answers any question.",
).launch()
| |
|