astegaras committed
Commit e497580 · verified · 1 Parent(s): 66b667b

Update app.py

Files changed (1): app.py (+26 -36)
app.py CHANGED
@@ -1,57 +1,47 @@
 import gradio as gr
-from huggingface_hub import hf_hub_download
 from llama_cpp import Llama
 
 # ----------------------------------------------------
-# 1. Download GGUF model from HuggingFace
+# Load GGUF model
 # ----------------------------------------------------
 
-REPO_ID = "astegaras/merged_kaggle"  # your GGUF repo
-FILENAME = "llama-3.2-3b-instruct.Q4_K_M.gguf"  # your GGUF file
+MODEL_PATH = "astegaras/merged_kaggle"  # HF repo containing your .gguf
 
-print("Downloading GGUF model...")
-model_path = hf_hub_download(repo_id=REPO_ID, filename=FILENAME)
-print("Model downloaded:", model_path)
-
-# ----------------------------------------------------
-# 2. Load llama.cpp model
-# ----------------------------------------------------
-
-llm = Llama(
-    model_path=model_path,
-    n_ctx=4096,    # context size
-    n_threads=8,   # use HF Space CPU
+# llama_cpp automatically downloads from HF Hub if you provide the repo
+llm = Llama.from_pretrained(
+    repo_id=MODEL_PATH,
+    filename="llama-3.2-3b-instruct.Q4_K_M.gguf",
+    n_threads=8,
+    n_ctx=4096,
+    verbose=False,
 )
 
 # ----------------------------------------------------
-# 3. Chat / inference function
+# Chat function
 # ----------------------------------------------------
+
 def respond(message, history):
-    prompt = ""
+    messages = []
+
+    for user, assistant in history:
+        messages.append({"role": "user", "content": user})
+        messages.append({"role": "assistant", "content": assistant})
 
-    # Build prompt manually
-    for user_msg, bot_msg in history:
-        prompt += f"User: {user_msg}\nAssistant: {bot_msg}\n"
-    prompt += f"User: {message}\nAssistant:"
+    messages.append({"role": "user", "content": message})
 
-    # Generate response
-    output = llm(
-        prompt,
-        max_tokens=256,
-        temperature=0.7,
-        top_p=0.9,
-        stop=["User:", "Assistant:"]
-    )
+    output = llm.create_chat_completion(messages=messages)
+    reply = output["choices"][0]["message"]["content"]
 
-    assistant_reply = output["choices"][0]["text"].strip()
-    return assistant_reply
+    return reply
 
 # ----------------------------------------------------
-# 4. Launch Gradio Chat Interface
+# Launch Gradio app
 # ----------------------------------------------------
+
 gr.ChatInterface(
-    fn=respond,
-    title="My Llama.cpp GGUF Model",
-    description="Chat with your fine-tuned GGUF model!",
+    respond,
+    title="My Llama 3.2 GGUF Chatbot",
+    description="Running GGUF with llama.cpp inside a HuggingFace Space",
 ).launch()
+
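
A possible follow-up, not part of this commit: create_chat_completion also accepts stream=True, and gr.ChatInterface treats a generator function as a streaming response, so the reply could be rendered token by token instead of appearing in one block. A minimal sketch under those assumptions (the respond_stream name and the standalone-script framing are illustrative, not from the commit):

import gradio as gr
from llama_cpp import Llama

# Assumption: same model handle that app.py builds above.
llm = Llama.from_pretrained(
    repo_id="astegaras/merged_kaggle",
    filename="llama-3.2-3b-instruct.Q4_K_M.gguf",
    n_threads=8,
    n_ctx=4096,
    verbose=False,
)

def respond_stream(message, history):
    # Rebuild the OpenAI-style message list exactly as respond() does.
    messages = []
    for user, assistant in history:
        messages.append({"role": "user", "content": user})
        messages.append({"role": "assistant", "content": assistant})
    messages.append({"role": "user", "content": message})

    # With stream=True, create_chat_completion yields OpenAI-style chunks;
    # each chunk's delta may carry a piece of the assistant text. Yielding
    # the growing string lets ChatInterface update the chat bubble live.
    partial = ""
    for chunk in llm.create_chat_completion(messages=messages, stream=True):
        delta = chunk["choices"][0]["delta"]
        if "content" in delta:
            partial += delta["content"]
            yield partial

gr.ChatInterface(respond_stream).launch()

One dependency note: Llama.from_pretrained still downloads through huggingface_hub internally, so dropping the explicit import does not remove that package from the Space's requirements.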