everydaytok committed on
Commit
ff4ac49
·
verified ·
1 Parent(s): 8fbb150

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +96 -92
app.py CHANGED
@@ -1,69 +1,89 @@
1
  import gradio as gr
2
- from llama_cpp import Llama
 
 
 
 
 
3
  from threading import Thread
4
  import time
5
  import psutil
6
  import os
 
7
 
8
  from fastapi import FastAPI, HTTPException
9
  from pydantic import BaseModel
10
  import uvicorn
11
- from huggingface_hub import hf_hub_download
12
 
13
  # ─────────────────────────────────────────────────────────────
14
  # CONFIGURATION
 
 
15
  # ─────────────────────────────────────────────────────────────
16
# GGUF build of Qwen2.5-0.5B-Instruct, downloaded from the Hugging Face Hub.
MODEL_REPO = "bartowski/Qwen2.5-0.5B-Instruct-GGUF"
GGUF_FILE = "Qwen2.5-0.5B-Instruct-Q4_K_M.gguf"

# Shared state: written by load_model() on its background thread, read by
# the UI and API handlers.
model = None  # stays None until loading has succeeded
load_status = "🔄 Initializing..."  # human-readable progress line
load_start = time.time()  # reference point for the reported load time
22
 
23
 
24
  # ─────────────────────────────────────────────────────────────
25
- # RAM — process-only, not container/host
26
  # ─────────────────────────────────────────────────────────────
27
  def get_process_ram_mb() -> float:
28
  return psutil.Process(os.getpid()).memory_info().rss / 1024**2
29
 
30
def get_stats_md() -> str:
    """Build the Markdown status panel: load status plus a RAM usage bar.

    Returns:
        Two Markdown lines separated by a hard line break. The bar has ten
        cells, one per 100 MB of process RAM, and saturates at ten.
    """
    mb = get_process_ram_mb()
    filled = min(int(mb / 100), 10)  # 1 block per 100MB, max 10
    bar = "█" * filled + "░" * (10 - filled)
    # Markdown needs two trailing spaces before "\n" for a hard line break;
    # with a single space both lines render as one paragraph.
    return (
        f"**Status:** {load_status}  \n"
        f"**Process RAM:** `[{bar}]` **{mb:.0f} MB**"
    )
 
 
 
 
 
38
 
39
 
40
  # ─────────────────────────────────────────────────────────────
41
- # MODEL LOADING
 
42
  # ─────────────────────────────────────────────────────────────
43
  def load_model():
44
- global model, load_status
45
  try:
46
- load_status = "πŸ”„ Downloading GGUF (~350 MB)..."
47
  print(load_status)
48
 
49
- model_path = hf_hub_download(
50
- repo_id=MODEL_REPO,
51
- filename=GGUF_FILE
52
  )
53
 
54
- load_status = "πŸ”„ Loading into llama.cpp..."
55
  print(load_status)
56
 
57
- model = Llama(
58
- model_path=model_path,
59
- n_ctx=2048,
60
- n_threads=os.cpu_count() or 4,
61
- n_gpu_layers=0,
62
- verbose=False
 
 
63
  )
 
64
 
65
  elapsed = time.time() - load_start
66
- load_status = f"βœ… Ready β€” {get_process_ram_mb():.0f} MB Β· {elapsed:.0f}s load time"
 
 
 
 
67
  print(load_status)
68
 
69
  except Exception as e:
@@ -74,7 +94,7 @@ Thread(target=load_model, daemon=True).start()
74
 
75
 
76
  # ─────────────────────────────────────────────────────────────
77
- # PROMPT — Qwen2.5 ChatML format
78
  # ─────────────────────────────────────────────────────────────
79
  def build_prompt(system: str, history: list, user: str) -> str:
80
  parts = []
@@ -94,49 +114,51 @@ def build_prompt(system: str, history: list, user: str) -> str:
94
  # STREAMING GENERATOR
95
  # ─────────────────────────────────────────────────────────────
96
  def chat(message: str, history: list, system_prompt: str):
97
- if model is None:
98
  yield "⏳ Model still loading β€” please wait.", get_stats_md()
99
  return
100
 
101
- prompt = build_prompt(system_prompt, history, message)
102
- t0 = time.time()
103
- output = ""
104
- count = 0
105
 
106
- stream = model(
107
- prompt,
108
- max_tokens=512,
 
 
 
 
 
 
 
 
109
  temperature=0.7,
110
  top_p=0.9,
111
- repeat_penalty=1.1,
112
- stop=["<|im_end|>", "<|im_start|>"],
113
- stream=True
114
  )
115
 
116
- for chunk in stream:
117
- token = chunk["choices"][0]["text"]
118
- output += token
 
 
 
 
 
 
119
  count += 1
120
  elapsed = time.time() - t0
121
  tps = count / elapsed if elapsed > 0 else 0
122
- stats = (
123
- f"**Status:** {load_status} \n"
124
- f"**Process RAM:** {get_process_ram_mb():.0f} MB \n"
125
- f"**Speed:** {tps:.1f} t/s Β· "
126
- f"**Tokens:** {count} Β· "
127
- f"**Elapsed:** {elapsed:.1f}s"
128
- )
129
- yield output, stats
130
 
131
 
132
  # ─────────────────────────────────────────────────────────────
133
  # GRADIO UI
134
  # ─────────────────────────────────────────────────────────────
135
  CSS = """
136
- /* hide empty chatbot SVG placeholder */
137
- .empty.svelte-byatnx { display: none !important; }
138
- .wrap.svelte-byatnx { min-height: 20px !important; }
139
-
140
  #stats {
141
  background: #0f172a;
142
  color: #94a3b8;
@@ -146,31 +168,18 @@ CSS = """
146
  line-height: 1.7;
147
  margin-bottom: 8px;
148
  }
149
-
150
- #chatbot .message {
151
- font-size: 0.95rem;
152
- line-height: 1.5;
153
- }
154
-
155
- /* full-width send on mobile */
156
- @media (max-width: 600px) {
157
- #send-btn { width: 100% !important; margin-top: 6px; }
158
- }
159
-
160
  footer { display: none !important; }
161
  """
162
 
163
  with gr.Blocks(theme=gr.themes.Default(), css=CSS, title="Qwen 0.5B") as demo:
164
 
165
- gr.Markdown("## 🧠 Qwen2.5-0.5B · Q4_K_M · CPU")
166
 
167
- # ── always-visible status bar ────────────────────────────
168
  stats_md = gr.Markdown(
169
  value=get_stats_md(),
170
  elem_id="stats"
171
  )
172
 
173
- # ── optional system prompt ───────────────────────────────
174
  with gr.Accordion("βš™οΈ System Prompt", open=False):
175
  system_box = gr.Textbox(
176
  value="You are a helpful assistant.",
@@ -178,7 +187,6 @@ with gr.Blocks(theme=gr.themes.Default(), css=CSS, title="Qwen 0.5B") as demo:
178
  show_label=False
179
  )
180
 
181
- # ── conversation ─────────────────────────────────────────
182
  chatbot = gr.Chatbot(
183
  value=[],
184
  show_label=False,
@@ -187,27 +195,23 @@ with gr.Blocks(theme=gr.themes.Default(), css=CSS, title="Qwen 0.5B") as demo:
187
  bubble_full_width=False
188
  )
189
 
190
- # ── input row ────────────────────────────────────────────
191
  with gr.Row(equal_height=True):
192
  msg = gr.Textbox(
193
  placeholder="Type a message…",
194
  show_label=False,
195
  scale=9,
196
  lines=1,
197
- max_lines=5,
198
- elem_id="msg"
199
  )
200
  send_btn = gr.Button(
201
  "➀",
202
  variant="primary",
203
  scale=1,
204
- min_width=48,
205
- elem_id="send-btn"
206
  )
207
 
208
  clear = gr.Button("πŸ—‘οΈ Clear", size="sm")
209
 
210
- # ── wiring ───────────────────────────────────────────────
211
  def user_turn(message, history):
212
  return "", history + [[message, ""]]
213
 
@@ -261,33 +265,33 @@ def api_chat(req: ChatRequest):
261
  raise HTTPException(status_code=503, detail=load_status)
262
 
263
  prompt = build_prompt(req.system, [], req.message)
 
 
 
 
 
 
 
 
 
 
 
 
264
 
265
- result = model(
266
- prompt,
267
- max_tokens=req.max_tokens,
268
- temperature=req.temperature,
269
- top_p=0.9,
270
- repeat_penalty=1.1,
271
- stop=["<|im_end|>", "<|im_start|>"]
272
  )
273
 
274
- text = result["choices"][0]["text"].strip()
275
-
276
  return {
277
- "response": text,
278
- "tokens": result["usage"]["completion_tokens"],
279
  "process_ram_mb": round(get_process_ram_mb(), 1)
280
  }
281
 
282
-
283
- # ─────────────────────────────────────────────────────────────
284
- # MOUNT + RUN
285
- # ─────────────────────────────────────────────────────────────
286
  app = gr.mount_gradio_app(app, demo, path="/")
287
 
288
  if __name__ == "__main__":
289
- print("\n🌐 Starting on http://0.0.0.0:7860")
290
- print(" UI β†’ http://0.0.0.0:7860/")
291
- print(" API β†’ POST http://0.0.0.0:7860/chat")
292
- print(" Health β†’ GET http://0.0.0.0:7860/health\n")
293
  uvicorn.run(app, host="0.0.0.0", port=7860)
 
1
  import gradio as gr
2
+ from transformers import (
3
+ AutoTokenizer,
4
+ AutoModelForCausalLM,
5
+ TextIteratorStreamer,
6
+ BitsAndBytesConfig
7
+ )
8
  from threading import Thread
9
  import time
10
  import psutil
11
  import os
12
+ import torch
13
 
14
  from fastapi import FastAPI, HTTPException
15
  from pydantic import BaseModel
16
  import uvicorn
 
17
 
18
  # ─────────────────────────────────────────────────────────────
19
  # CONFIGURATION
20
+ # Load the standard HF checkpoint and quantize it to int8 at load time — no GGUF needed
21
+ # Qwen2.5-0.5B in int8 via bitsandbytes = ~500MB, no compilation
22
  # ─────────────────────────────────────────────────────────────
23
MODEL_ID = "Qwen/Qwen2.5-0.5B-Instruct"

# Shared state: written by load_model() on its background thread, read by
# the UI and API handlers.
model = None  # stays None until loading has succeeded
tokenizer = None  # set by load_model() before the model itself
load_status = "🔄 Initializing..."  # human-readable progress line
load_start = time.time()  # reference point for the reported load time
29
 
30
 
31
  # ─────────────────────────────────────────────────────────────
32
+ # RAM — process only
33
  # ─────────────────────────────────────────────────────────────
34
  def get_process_ram_mb() -> float:
35
  return psutil.Process(os.getpid()).memory_info().rss / 1024**2
36
 
37
def get_stats_md(tps=None, tokens=None, elapsed=None) -> str:
    """Build the Markdown status panel shown above the chat.

    Args:
        tps: Generation speed in tokens per second. When ``None`` the
            speed/tokens/elapsed line is omitted entirely.
        tokens: Number of chunks streamed so far; treated as 0 when missing.
        elapsed: Seconds since generation started; treated as 0 when missing.

    Returns:
        Markdown with the load status, a ten-cell RAM bar (one cell per
        150 MB, saturating at ten) and, if ``tps`` is given, a throughput
        line. Lines are separated by Markdown hard line breaks.
    """
    mb = get_process_ram_mb()
    filled = min(int(mb / 150), 10)  # 1 block per 150MB
    bar = "█" * filled + "░" * (10 - filled)
    lines = [
        f"**Status:** {load_status}",
        f"**RAM:** `[{bar}]` **{mb:.0f} MB**",
    ]
    if tps is not None:
        # Formatting None with ":.1f" raises TypeError, so a call that only
        # passes tps falls back to zeros instead of crashing the UI update.
        tokens = 0 if tokens is None else tokens
        elapsed = 0.0 if elapsed is None else elapsed
        lines.append(
            f"**Speed:** {tps:.1f} t/s · "
            f"**Tokens:** {tokens} · "
            f"**Elapsed:** {elapsed:.1f}s"
        )
    # Markdown needs two trailing spaces before "\n" for a hard line break.
    return "  \n".join(lines)
50
 
51
 
52
  # ─────────────────────────────────────────────────────────────
53
+ # MODEL LOADING — int8 quantization via bitsandbytes
54
+ # No compilation, installs in seconds, stays ~450-500MB RAM
55
  # ─────────────────────────────────────────────────────────────
56
  def load_model():
57
+ global model, tokenizer, load_status
58
  try:
59
+ load_status = "πŸ”„ Loading tokenizer..."
60
  print(load_status)
61
 
62
+ tokenizer = AutoTokenizer.from_pretrained(
63
+ MODEL_ID,
64
+ trust_remote_code=True
65
  )
66
 
67
+ load_status = "πŸ”„ Loading model (int8 quantized)..."
68
  print(load_status)
69
 
70
+ quant_config = BitsAndBytesConfig(load_in_8bit=True)
71
+
72
+ model = AutoModelForCausalLM.from_pretrained(
73
+ MODEL_ID,
74
+ quantization_config=quant_config,
75
+ device_map="cpu",
76
+ trust_remote_code=True,
77
+ low_cpu_mem_usage=True
78
  )
79
+ model.eval()
80
 
81
  elapsed = time.time() - load_start
82
+ load_status = (
83
+ f"βœ… Ready β€” "
84
+ f"{get_process_ram_mb():.0f} MB Β· "
85
+ f"{elapsed:.0f}s"
86
+ )
87
  print(load_status)
88
 
89
  except Exception as e:
 
94
 
95
 
96
  # ─────────────────────────────────────────────────────────────
97
+ # PROMPT — Qwen2.5 ChatML
98
  # ─────────────────────────────────────────────────────────────
99
  def build_prompt(system: str, history: list, user: str) -> str:
100
  parts = []
 
114
  # STREAMING GENERATOR
115
  # ─────────────────────────────────────────────────────────────
116
  def chat(message: str, history: list, system_prompt: str):
117
+ if model is None or tokenizer is None:
118
  yield "⏳ Model still loading β€” please wait.", get_stats_md()
119
  return
120
 
121
+ prompt = build_prompt(system_prompt, history, message)
122
+ inputs = tokenizer(prompt, return_tensors="pt")
 
 
123
 
124
+ streamer = TextIteratorStreamer(
125
+ tokenizer,
126
+ skip_prompt=True,
127
+ skip_special_tokens=True
128
+ )
129
+
130
+ gen_kwargs = dict(
131
+ **inputs,
132
+ streamer=streamer,
133
+ max_new_tokens=512,
134
+ do_sample=True,
135
  temperature=0.7,
136
  top_p=0.9,
137
+ repetition_penalty=1.1,
138
+ pad_token_id=tokenizer.eos_token_id
 
139
  )
140
 
141
+ thread = Thread(target=model.generate, kwargs=gen_kwargs)
142
+ thread.start()
143
+
144
+ t0 = time.time()
145
+ output = ""
146
+ count = 0
147
+
148
+ for chunk in streamer:
149
+ output += chunk
150
  count += 1
151
  elapsed = time.time() - t0
152
  tps = count / elapsed if elapsed > 0 else 0
153
+ yield output, get_stats_md(tps=tps, tokens=count, elapsed=elapsed)
154
+
155
+ thread.join()
 
 
 
 
 
156
 
157
 
158
  # ─────────────────────────────────────────────────────────────
159
  # GRADIO UI
160
  # ─────────────────────────────────────────────────────────────
161
  CSS = """
 
 
 
 
162
  #stats {
163
  background: #0f172a;
164
  color: #94a3b8;
 
168
  line-height: 1.7;
169
  margin-bottom: 8px;
170
  }
 
 
 
 
 
 
 
 
 
 
 
171
  footer { display: none !important; }
172
  """
173
 
174
  with gr.Blocks(theme=gr.themes.Default(), css=CSS, title="Qwen 0.5B") as demo:
175
 
176
+ gr.Markdown("## 🧠 Qwen2.5-0.5B · int8 · CPU")
177
 
 
178
  stats_md = gr.Markdown(
179
  value=get_stats_md(),
180
  elem_id="stats"
181
  )
182
 
 
183
  with gr.Accordion("βš™οΈ System Prompt", open=False):
184
  system_box = gr.Textbox(
185
  value="You are a helpful assistant.",
 
187
  show_label=False
188
  )
189
 
 
190
  chatbot = gr.Chatbot(
191
  value=[],
192
  show_label=False,
 
195
  bubble_full_width=False
196
  )
197
 
 
198
  with gr.Row(equal_height=True):
199
  msg = gr.Textbox(
200
  placeholder="Type a message…",
201
  show_label=False,
202
  scale=9,
203
  lines=1,
204
+ max_lines=5
 
205
  )
206
  send_btn = gr.Button(
207
  "➀",
208
  variant="primary",
209
  scale=1,
210
+ min_width=48
 
211
  )
212
 
213
  clear = gr.Button("πŸ—‘οΈ Clear", size="sm")
214
 
 
215
  def user_turn(message, history):
216
  return "", history + [[message, ""]]
217
 
 
265
  raise HTTPException(status_code=503, detail=load_status)
266
 
267
  prompt = build_prompt(req.system, [], req.message)
268
+ inputs = tokenizer(prompt, return_tensors="pt")
269
+
270
+ with torch.no_grad():
271
+ outputs = model.generate(
272
+ **inputs,
273
+ max_new_tokens=req.max_tokens,
274
+ do_sample=req.temperature > 0,
275
+ temperature=max(req.temperature, 1e-4),
276
+ top_p=0.9,
277
+ repetition_penalty=1.1,
278
+ pad_token_id=tokenizer.eos_token_id
279
+ )
280
 
281
+ input_length = inputs.input_ids.shape[1]
282
+ response_text = tokenizer.decode(
283
+ outputs[0][input_length:],
284
+ skip_special_tokens=True
 
 
 
285
  )
286
 
 
 
287
  return {
288
+ "response": response_text,
289
+ "tokens": len(outputs[0]) - input_length,
290
  "process_ram_mb": round(get_process_ram_mb(), 1)
291
  }
292
 
 
 
 
 
293
  app = gr.mount_gradio_app(app, demo, path="/")
294
 
295
  if __name__ == "__main__":
296
+ print("\n🌐 http://0.0.0.0:7860")
 
 
 
297
  uvicorn.run(app, host="0.0.0.0", port=7860)