hermes3-llama-cpp

Running

Jodaro committed 1 day ago

Commit

689f1fc

verified ·

1 Parent(s): 1e86ec8

Switch to llama_cpp

Files changed (1) hide show

app.py CHANGED Viewed

@@ -1,19 +1,21 @@
 import gradio as gr
-from ctransformers import AutoModelForCausalLM
 MODEL_REPO = "bartowski/Hermes-3-Llama-3.1-8B-GGUF"
 MODEL_FILE = "Hermes-3-Llama-3.1-8B-Q4_K_M.gguf"
 print("Loading model...")
-model = AutoModelForCausalLM.from_pretrained(
-    MODEL_REPO,
-    model_file=MODEL_FILE,
-    model_type="llama",
-    gpu_layers=0,
-    context_length=4096,
-)
-def respond(message, history):
     prompt = ""
     for user_msg, bot_msg in history:
         prompt += f"<|im_start|>user\n{user_msg}\n<|im_end|>\n"
@@ -21,10 +23,22 @@ def respond(message, history):
     prompt += f"<|im_start|>user\n{message}\n<|im_end|>\n<|im_start|>assistant\n"
-    output = model(prompt, max_new_tokens=512, temperature=0.7, top_p=0.9)
-    return output
 iface = gr.ChatInterface(respond)
 if __name__ == "__main__":
-    iface.launch()

 import gradio as gr
+from huggingface_hub import hf_hub_download
+from llama_cpp import Llama
 MODEL_REPO = "bartowski/Hermes-3-Llama-3.1-8B-GGUF"
 MODEL_FILE = "Hermes-3-Llama-3.1-8B-Q4_K_M.gguf"
+print("Downloading model...")
+model_path = hf_hub_download(repo_id=MODEL_REPO, filename=MODEL_FILE)
 print("Loading model...")
+llm = Llama(model_path=model_path, n_ctx=4096, n_threads=2)
+STOP_TOKENS = ["<|im_end|>"]
+def respond(message: str, history: list[list[str]]) -> str:
+    # Assemble the prompt from the conversation history
     prompt = ""
     for user_msg, bot_msg in history:
         prompt += f"<|im_start|>user\n{user_msg}\n<|im_end|>\n"
     prompt += f"<|im_start|>user\n{message}\n<|im_end|>\n<|im_start|>assistant\n"
+    out = llm(
+        prompt,
+        max_tokens=512,
+        temperature=0.7,
+        top_p=0.9,
+        stop=STOP_TOKENS,
+    )
+    text = out["choices"][0]["text"]
+    # Cut the text at the first occurrence of any stop token
+    for s in STOP_TOKENS:
+        text = text.split(s)[0]
+    return text
 iface = gr.ChatInterface(respond)
 if __name__ == "__main__":
+    iface.launch()