Neon-tech committed
Commit 75503bc · verified · 1 Parent(s): a8525a1

Update app.py

Files changed (1)
  1. app.py +15 -21
app.py CHANGED
@@ -1,39 +1,33 @@
 import os
 import gradio as gr
-from llama_cpp import Llama
 import psutil
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+os.environ["HF_HOME"] = "/tmp/hf_cache"
+
+tokenizer = AutoTokenizer.from_pretrained("openai/gpt-oss-120b")
+model = AutoModelForCausalLM.from_pretrained("openai/gpt-oss-120b", device_map="auto", offload_folder="/tmp/offload")
 
 def get_stats():
     process = psutil.Process(os.getpid())
     ram = process.memory_info().rss / 1024 ** 3
-    disk = psutil.disk_usage('/').used / 1024 ** 3
+    disk_tmp = psutil.disk_usage('/tmp').used / 1024 ** 3
+    disk_app = psutil.disk_usage('/').used / 1024 ** 3
     cpu = psutil.cpu_percent(interval=1)
-    return f"RAM: {ram:.2f} GB | Disk: {disk:.2f} GB | CPU: {cpu:.1f}%"
-os.environ["HF_HOME"] = "/tmp/hf_cache"
-
-model = Llama.from_pretrained(
-    repo_id="unsloth/Qwen3.5-35B-A3B-GGUF",
-    filename="Qwen3.5-35B-A3B-Q3_K_M.gguf",
-    n_ctx=2048,
-    n_threads=2,
-)
+    return f"RAM: {ram:.2f} GB | /tmp: {disk_tmp:.2f} GB | Disk: {disk_app:.2f} GB | CPU: {cpu:.1f}%"
 
 def chat(message, history):
-    messages = [{"role": "system", "content": "Reply directly without any reasoning or thinking process."}]
+    messages = []
    for user, assistant in history:
         messages.append({"role": "user", "content": user})
         messages.append({"role": "assistant", "content": assistant})
     messages.append({"role": "user", "content": message})
 
-    output = ""
-    for chunk in model.create_chat_completion(
-        messages=messages,
-        max_tokens=2048,
-        stream=True
-    ):
-        delta = chunk["choices"][0]["delta"].get("content", "")
-        output += delta
-        yield output
+    text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+    inputs = tokenizer([text], return_tensors="pt").to(model.device)
+    outputs = model.generate(**inputs, max_new_tokens=512)
+    output = tokenizer.decode(outputs[0][inputs.input_ids.shape[-1]:], skip_special_tokens=True)
+    return output
 
 with gr.Blocks() as demo:
     stats = gr.Textbox(label="System Stats", value=get_stats, every=5)
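
The hunk ends just after the stats Textbox, so the commit does not show how chat() is attached to the rest of the gr.Blocks() page. A minimal sketch of one plausible continuation, assuming gr.ChatInterface handles the chat UI; the ChatInterface call and demo.launch() are assumptions, not part of this commit:

with gr.Blocks() as demo:
    stats = gr.Textbox(label="System Stats", value=get_stats, every=5)
    # Assumed wiring: chat(message, history) unpacks history as
    # (user, assistant) pairs, i.e. the tuple-style history that
    # gr.ChatInterface passes by default on older Gradio releases
    # (newer releases accept type="tuples" to keep that format).
    gr.ChatInterface(fn=chat)

demo.launch()

Since the new chat() returns one string rather than yielding partial output, the response appears all at once in such a UI; the streaming behavior of the removed llama_cpp loop is not preserved by this commit.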