Jodaro committed on
Commit
e9ddae9
·
verified ·
1 Parent(s): b39253d

Use ctransformers for Qwen

Browse files
Files changed (1) hide show
  1. app.py +18 -20
app.py CHANGED
@@ -1,34 +1,32 @@
1
  import gradio as gr
2
- from llama_cpp import Llama
3
 
4
  MODEL_REPO = "Qwen/Qwen3-4B-GGUF"
5
  MODEL_FILE = "Qwen3-4B-Q4_K_M.gguf"
6
 
7
  print("Loading model...")
8
- llm = Llama.from_pretrained(
9
- repo_id=MODEL_REPO,
10
- filename=MODEL_FILE,
11
- n_ctx=4096,
12
- n_threads=2,
 
13
  )
14
 
15
-
16
  def respond(message: str, history: list[list[str]]) -> str:
17
- messages = []
18
  for user_msg, bot_msg in history:
19
- messages.append({"role": "user", "content": user_msg})
20
- messages.append({"role": "assistant", "content": bot_msg})
21
-
22
- messages.append({"role": "user", "content": message})
23
-
24
- out = llm.create_chat_completion(
25
- messages=messages,
26
- max_tokens=512,
27
  temperature=0.7,
28
  top_p=0.9,
 
29
  )
 
30
 
31
- return out["choices"][0]["message"]["content"]
32
-
33
-
34
- gr.ChatInterface(respond).launch()
 
1
  import gradio as gr
2
+ from ctransformers import AutoModelForCausalLM
3
 
4
  MODEL_REPO = "Qwen/Qwen3-4B-GGUF"
5
  MODEL_FILE = "Qwen3-4B-Q4_K_M.gguf"
6
 
7
  print("Loading model...")
8
+ llm = AutoModelForCausalLM.from_pretrained(
9
+ MODEL_REPO,
10
+ model_file=MODEL_FILE,
11
+ model_type="llama",
12
+ gpu_layers=0,
13
+ context_length=4096,
14
  )
15
 
 
16
  def respond(message: str, history: list[list[str]]) -> str:
17
+ prompt = ""
18
  for user_msg, bot_msg in history:
19
+ prompt += f"<|im_start|>user\n{user_msg}\n<|im_end|>\n"
20
+ prompt += f"<|im_start|>assistant\n{bot_msg}\n<|im_end|>\n"
21
+ prompt += f"<|im_start|>user\n{message}\n<|im_end|>\n<|im_start|>assistant\n"
22
+ out = llm(
23
+ prompt,
24
+ max_new_tokens=512,
 
 
25
  temperature=0.7,
26
  top_p=0.9,
27
+ stop=["<|im_end|>"],
28
  )
29
+ return out
30
 
31
+ if __name__ == "__main__":
32
+ gr.ChatInterface(respond).launch()