Neon-tech committed on
Commit
ac3ae82
·
verified ·
1 Parent(s): 9cf4e50

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +11 -11
app.py CHANGED
@@ -1,12 +1,15 @@
1
- from transformers import AutoModelForCausalLM, AutoTokenizer
2
- import gradio as gr
3
  import os
 
 
4
 
5
- os.environ["HF_HOME"] = "/app/hf_cache"
6
- model_name = "Qwen/Qwen3.5-35B-A3B-FP8"
7
 
8
- tokenizer = AutoTokenizer.from_pretrained(model_name)
9
- model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto", offload_folder="/tmp/offload")
 
 
 
 
10
 
11
  def chat(message, history):
12
  messages = []
@@ -15,10 +18,7 @@ def chat(message, history):
15
  messages.append({"role": "assistant", "content": assistant})
16
  messages.append({"role": "user", "content": message})
17
 
18
- text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
19
- inputs = tokenizer([text], return_tensors="pt").to(model.device)
20
- outputs = model.generate(**inputs, max_new_tokens=512)
21
- output = tokenizer.decode(outputs[0][inputs.input_ids.shape[-1]:], skip_special_tokens=True)
22
- return output
23
 
24
  gr.ChatInterface(chat).launch(server_name="0.0.0.0", server_port=7860)
 
 
 
1
  import os
2
+ import gradio as gr
3
+ from llama_cpp import Llama
4
 
5
+ os.environ["HF_HOME"] = "/tmp/hf_cache"
 
6
 
7
+ model = Llama.from_pretrained(
8
+ repo_id="bartowski/Qwen3.5-35B-A3B-GGUF",
9
+ filename="Qwen3.5-35B-A3B-Q3_K_M.gguf",
10
+ n_ctx=2048,
11
+ n_threads=2,
12
+ )
13
 
14
  def chat(message, history):
15
  messages = []
 
18
  messages.append({"role": "assistant", "content": assistant})
19
  messages.append({"role": "user", "content": message})
20
 
21
+ response = model.create_chat_completion(messages=messages, max_tokens=512)
22
+ return response["choices"][0]["message"]["content"]
 
 
 
23
 
24
  gr.ChatInterface(chat).launch(server_name="0.0.0.0", server_port=7860)