astegaras committed
Commit e497580 · verified · 1 Parent(s): 66b667b

Update app.py

Files changed (1): app.py (+26 -36)
app.py CHANGED
@@ -1,57 +1,47 @@
 import gradio as gr
-from huggingface_hub import hf_hub_download
 from llama_cpp import Llama
 
 # ----------------------------------------------------
-# 1. Download GGUF model from HuggingFace
+# Load GGUF model
 # ----------------------------------------------------
 
-REPO_ID = "astegaras/merged_kaggle"  # your GGUF repo
-FILENAME = "llama-3.2-3b-instruct.Q4_K_M.gguf"  # your GGUF file
+MODEL_PATH = "astegaras/merged_kaggle"  # HF repo containing your .gguf
 
-print("Downloading GGUF model...")
-model_path = hf_hub_download(repo_id=REPO_ID, filename=FILENAME)
-print("Model downloaded:", model_path)
-
-# ----------------------------------------------------
-# 2. Load llama.cpp model
-# ----------------------------------------------------
-
-llm = Llama(
-    model_path=model_path,
-    n_ctx=4096,    # context size
-    n_threads=8,   # use HF Space CPU
+# llama_cpp automatically downloads from HF Hub if you provide the repo
+llm = Llama.from_pretrained(
+    repo_id=MODEL_PATH,
+    filename="llama-3.2-3b-instruct.Q4_K_M.gguf",
+    n_threads=8,
+    n_ctx=4096,
+    verbose=False,
 )
 
 # ----------------------------------------------------
-# 3. Chat / inference function
+# Chat function
 # ----------------------------------------------------
+
 def respond(message, history):
-    prompt = ""
+    messages = []
+
+    for user, assistant in history:
+        messages.append({"role": "user", "content": user})
+        messages.append({"role": "assistant", "content": assistant})
 
-    # Build prompt manually
-    for user_msg, bot_msg in history:
-        prompt += f"User: {user_msg}\nAssistant: {bot_msg}\n"
-    prompt += f"User: {message}\nAssistant:"
+    messages.append({"role": "user", "content": message})
 
-    # Generate response
-    output = llm(
-        prompt,
-        max_tokens=256,
-        temperature=0.7,
-        top_p=0.9,
-        stop=["User:", "Assistant:"]
-    )
+    output = llm.create_chat_completion(messages=messages)
+    reply = output["choices"][0]["message"]["content"]
 
-    assistant_reply = output["choices"][0]["text"].strip()
-    return assistant_reply
+    return reply
 
 # ----------------------------------------------------
-# 4. Launch Gradio Chat Interface
+# Launch Gradio app
 # ----------------------------------------------------
+
 gr.ChatInterface(
-    fn=respond,
-    title="My Llama.cpp GGUF Model",
-    description="Chat with your fine-tuned GGUF model!",
+    respond,
+    title="My Llama 3.2 GGUF Chatbot",
+    description="Running GGUF with llama.cpp inside a HuggingFace Space",
 ).launch()
+
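
A possible follow-up, not part of this commit: create_chat_completion also accepts stream=True, and gr.ChatInterface treats a generator function as a streaming response, so the reply could be rendered token by token instead of appearing in one block. A minimal sketch under those assumptions (the respond_stream name and the standalone-script framing are illustrative, not from the commit):

import gradio as gr
from llama_cpp import Llama

# Assumption: same model handle that app.py builds above.
llm = Llama.from_pretrained(
    repo_id="astegaras/merged_kaggle",
    filename="llama-3.2-3b-instruct.Q4_K_M.gguf",
    n_threads=8,
    n_ctx=4096,
    verbose=False,
)

def respond_stream(message, history):
    # Rebuild the OpenAI-style message list exactly as respond() does.
    messages = []
    for user, assistant in history:
        messages.append({"role": "user", "content": user})
        messages.append({"role": "assistant", "content": assistant})
    messages.append({"role": "user", "content": message})

    # With stream=True, create_chat_completion yields OpenAI-style chunks;
    # each chunk's delta may carry a piece of the assistant text. Yielding
    # the growing string lets ChatInterface update the chat bubble live.
    partial = ""
    for chunk in llm.create_chat_completion(messages=messages, stream=True):
        delta = chunk["choices"][0]["delta"]
        if "content" in delta:
            partial += delta["content"]
            yield partial

gr.ChatInterface(respond_stream).launch()

One dependency note: Llama.from_pretrained still downloads through huggingface_hub internally, so dropping the explicit import does not remove that package from the Space's requirements.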