everydaytok committed on
Commit
c44a1f7
·
verified ·
1 Parent(s): b88168f

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +56 -58
app.py CHANGED
@@ -1,7 +1,6 @@
1
  import gradio as gr
2
  from llama_cpp import Llama
3
  from threading import Thread
4
- from queue import Queue, Empty
5
  import time
6
  import psutil
7
  import os
@@ -17,47 +16,39 @@ from huggingface_hub import hf_hub_download
17
  MODEL_REPO = "bartowski/Qwen2.5-0.5B-Instruct-GGUF"
18
  GGUF_FILE = "Qwen2.5-0.5B-Instruct-Q4_K_M.gguf"
19
 
20
- # This is the actual RAM budget — Q4_K_M 0.5B should be ~350MB
21
- EXPECTED_MB = 350
22
-
23
  model = None
24
  load_status = "πŸ”„ Initializing..."
25
  load_start = time.time()
26
 
 
27
  # ─────────────────────────────────────────────────────────────
28
- # ACCURATE RAM READING — process RSS only, not whole container
29
  # ─────────────────────────────────────────────────────────────
30
  def get_process_ram_mb() -> float:
31
- """Returns only THIS process's RAM in MB."""
32
- proc = psutil.Process(os.getpid())
33
- return proc.memory_info().rss / 1024**2
34
 
35
  def get_stats_md() -> str:
36
- used_mb = get_process_ram_mb()
37
- used_gb = used_mb / 1024
38
- pct = min((used_mb / (EXPECTED_MB * 4)) * 100, 100) # scale bar to 4x expected
39
- filled = int(pct / 10)
40
- bar = "β–ˆ" * filled + "β–‘" * (10 - filled)
41
  return (
42
  f"**Status:** {load_status} \n"
43
- f"**Process RAM:** `[{bar}]` "
44
- f"**{used_mb:.0f} MB** ({used_gb:.2f} GB)"
45
  )
46
 
47
 
48
  # ─────────────────────────────────────────────────────────────
49
- # MODEL LOADING — llama-cpp-python runs GGUF natively
50
- # stays at ~350MB instead of dequantizing to float32
51
  # ─────────────────────────────────────────────────────────────
52
  def load_model():
53
  global model, load_status
54
  try:
55
- load_status = "πŸ”„ Downloading GGUF (~350MB)..."
56
  print(load_status)
57
 
58
  model_path = hf_hub_download(
59
  repo_id=MODEL_REPO,
60
- filename=GGUF_FILE,
61
  )
62
 
63
  load_status = "πŸ”„ Loading into llama.cpp..."
@@ -65,15 +56,14 @@ def load_model():
65
 
66
  model = Llama(
67
  model_path=model_path,
68
- n_ctx=2048, # context window
69
- n_threads=4, # CPU threads
70
- n_gpu_layers=0, # CPU only
71
  verbose=False
72
  )
73
 
74
  elapsed = time.time() - load_start
75
- ram_mb = get_process_ram_mb()
76
- load_status = f"βœ… Ready in {elapsed:.0f}s Β· {ram_mb:.0f} MB used"
77
  print(load_status)
78
 
79
  except Exception as e:
@@ -84,7 +74,7 @@ Thread(target=load_model, daemon=True).start()
84
 
85
 
86
  # ─────────────────────────────────────────────────────────────
87
- # PROMPT FORMAT — Qwen2.5 ChatML
88
  # ─────────────────────────────────────────────────────────────
89
  def build_prompt(system: str, history: list, user: str) -> str:
90
  parts = []
@@ -105,15 +95,14 @@ def build_prompt(system: str, history: list, user: str) -> str:
105
  # ─────────────────────────────────────────────────────────────
106
  def chat(message: str, history: list, system_prompt: str):
107
  if model is None:
108
- yield "⏳ Model still loading...", get_stats_md()
109
  return
110
 
111
- prompt = build_prompt(system_prompt, history, message)
112
- t0 = time.time()
113
- output = ""
114
- count = 0
115
 
116
- # llama-cpp-python native streaming
117
  stream = model(
118
  prompt,
119
  max_tokens=512,
@@ -125,7 +114,7 @@ def chat(message: str, history: list, system_prompt: str):
125
  )
126
 
127
  for chunk in stream:
128
- token = chunk["choices"][0]["text"]
129
  output += token
130
  count += 1
131
  elapsed = time.time() - t0
@@ -141,46 +130,47 @@ def chat(message: str, history: list, system_prompt: str):
141
 
142
 
143
  # ─────────────────────────────────────────────────────────────
144
- # GRADIO UI — mobile-first, minimal, no broken SVG
145
  # ─────────────────────────────────────────────────────────────
146
  CSS = """
147
- /* ── reset Gradio's giant empty-chatbot SVG placeholder ── */
148
  .empty.svelte-byatnx { display: none !important; }
149
  .wrap.svelte-byatnx { min-height: 20px !important; }
150
 
151
- /* ── stats panel ── */
152
  #stats {
153
  background: #0f172a;
154
  color: #94a3b8;
155
  border-radius: 8px;
156
  padding: 10px 14px;
157
  font-size: 0.82rem;
158
- line-height: 1.6;
159
- margin-bottom: 6px;
160
  }
161
 
162
- /* ── make textbox taller on mobile ── */
163
- #msg textarea { font-size: 1rem; }
 
 
164
 
165
- /* ── send button full width on small screens ── */
166
  @media (max-width: 600px) {
167
- #send-btn { width: 100% !important; }
168
  }
169
 
170
  footer { display: none !important; }
171
  """
172
 
173
- with gr.Blocks(theme=gr.themes.Default(), css=CSS) as demo:
174
 
175
- gr.Markdown("## 🧠 Qwen2.5-0.5B Q4_K_M")
176
 
177
- # Live stats — always at top
178
  stats_md = gr.Markdown(
179
  value=get_stats_md(),
180
  elem_id="stats"
181
  )
182
 
183
- # System prompt — hidden by default
184
  with gr.Accordion("βš™οΈ System Prompt", open=False):
185
  system_box = gr.Textbox(
186
  value="You are a helpful assistant.",
@@ -188,37 +178,36 @@ with gr.Blocks(theme=gr.themes.Default(), css=CSS) as demo:
188
  show_label=False
189
  )
190
 
191
- # Chat
192
  chatbot = gr.Chatbot(
193
  value=[],
194
- label="",
195
  show_label=False,
196
- height=380,
197
- # No placeholder icon
198
- placeholder=None
199
  )
200
 
201
- # Input row
202
  with gr.Row(equal_height=True):
203
  msg = gr.Textbox(
204
- placeholder="Message…",
205
  show_label=False,
206
- scale=8,
207
  lines=1,
208
- max_lines=4,
209
  elem_id="msg"
210
  )
211
  send_btn = gr.Button(
212
  "➀",
213
  variant="primary",
214
  scale=1,
215
- elem_id="send-btn",
216
- min_width=48
217
  )
218
 
219
  clear = gr.Button("πŸ—‘οΈ Clear", size="sm")
220
 
221
- # ── event wiring ────────────────────────────────────────
222
  def user_turn(message, history):
223
  return "", history + [[message, ""]]
224
 
@@ -250,7 +239,7 @@ with gr.Blocks(theme=gr.themes.Default(), css=CSS) as demo:
250
  # ─────────────────────────────────────────────────────────────
251
  # FASTAPI
252
  # ─────────────────────────────────────────────────────────────
253
- app = FastAPI(title="Qwen2.5-0.5B")
254
 
255
  class ChatRequest(BaseModel):
256
  message: str
@@ -283,13 +272,22 @@ def api_chat(req: ChatRequest):
283
  )
284
 
285
  text = result["choices"][0]["text"].strip()
 
286
  return {
287
  "response": text,
288
  "tokens": result["usage"]["completion_tokens"],
289
  "process_ram_mb": round(get_process_ram_mb(), 1)
290
  }
291
 
 
 
 
 
292
  app = gr.mount_gradio_app(app, demo, path="/")
293
 
294
  if __name__ == "__main__":
 
 
 
 
295
  uvicorn.run(app, host="0.0.0.0", port=7860)
 
1
  import gradio as gr
2
  from llama_cpp import Llama
3
  from threading import Thread
 
4
  import time
5
  import psutil
6
  import os
 
16
  MODEL_REPO = "bartowski/Qwen2.5-0.5B-Instruct-GGUF"
17
  GGUF_FILE = "Qwen2.5-0.5B-Instruct-Q4_K_M.gguf"
18
 
 
 
 
19
  model = None
20
  load_status = "πŸ”„ Initializing..."
21
  load_start = time.time()
22
 
23
+
24
  # ─────────────────────────────────────────────────────────────
25
+ # RAM — process-only, not container/host
26
  # ─────────────────────────────────────────────────────────────
27
  def get_process_ram_mb() -> float:
28
+ return psutil.Process(os.getpid()).memory_info().rss / 1024**2
 
 
29
 
30
  def get_stats_md() -> str:
31
+ mb = get_process_ram_mb()
32
+ filled = min(int(mb / 100), 10) # 1 block per 100MB, max 10
33
+ bar = "β–ˆ" * filled + "β–‘" * (10 - filled)
 
 
34
  return (
35
  f"**Status:** {load_status} \n"
36
+ f"**Process RAM:** `[{bar}]` **{mb:.0f} MB**"
 
37
  )
38
 
39
 
40
  # ─────────────────────────────────────────────────────────────
41
+ # MODEL LOADING
 
42
  # ─────────────────────────────────────────────────────────────
43
  def load_model():
44
  global model, load_status
45
  try:
46
+ load_status = "πŸ”„ Downloading GGUF (~350 MB)..."
47
  print(load_status)
48
 
49
  model_path = hf_hub_download(
50
  repo_id=MODEL_REPO,
51
+ filename=GGUF_FILE
52
  )
53
 
54
  load_status = "πŸ”„ Loading into llama.cpp..."
 
56
 
57
  model = Llama(
58
  model_path=model_path,
59
+ n_ctx=2048,
60
+ n_threads=os.cpu_count() or 4,
61
+ n_gpu_layers=0,
62
  verbose=False
63
  )
64
 
65
  elapsed = time.time() - load_start
66
+ load_status = f"βœ… Ready β€” {get_process_ram_mb():.0f} MB Β· {elapsed:.0f}s load time"
 
67
  print(load_status)
68
 
69
  except Exception as e:
 
74
 
75
 
76
  # ─────────────────────────────────────────────────────────────
77
+ # PROMPT — Qwen2.5 ChatML format
78
  # ─────────────────────────────────────────────────────────────
79
  def build_prompt(system: str, history: list, user: str) -> str:
80
  parts = []
 
95
  # ─────────────────────────────────────────────────────────────
96
  def chat(message: str, history: list, system_prompt: str):
97
  if model is None:
98
+ yield "⏳ Model still loading β€” please wait.", get_stats_md()
99
  return
100
 
101
+ prompt = build_prompt(system_prompt, history, message)
102
+ t0 = time.time()
103
+ output = ""
104
+ count = 0
105
 
 
106
  stream = model(
107
  prompt,
108
  max_tokens=512,
 
114
  )
115
 
116
  for chunk in stream:
117
+ token = chunk["choices"][0]["text"]
118
  output += token
119
  count += 1
120
  elapsed = time.time() - t0
 
130
 
131
 
132
  # ─────────────────────────────────────────────────────────────
133
+ # GRADIO UI
134
  # ─────────────────────────────────────────────────────────────
135
  CSS = """
136
+ /* hide empty chatbot SVG placeholder */
137
  .empty.svelte-byatnx { display: none !important; }
138
  .wrap.svelte-byatnx { min-height: 20px !important; }
139
 
 
140
  #stats {
141
  background: #0f172a;
142
  color: #94a3b8;
143
  border-radius: 8px;
144
  padding: 10px 14px;
145
  font-size: 0.82rem;
146
+ line-height: 1.7;
147
+ margin-bottom: 8px;
148
  }
149
 
150
+ #chatbot .message {
151
+ font-size: 0.95rem;
152
+ line-height: 1.5;
153
+ }
154
 
155
+ /* full-width send on mobile */
156
  @media (max-width: 600px) {
157
+ #send-btn { width: 100% !important; margin-top: 6px; }
158
  }
159
 
160
  footer { display: none !important; }
161
  """
162
 
163
+ with gr.Blocks(theme=gr.themes.Default(), css=CSS, title="Qwen 0.5B") as demo:
164
 
165
+ gr.Markdown("## 🧠 Qwen2.5-0.5B · Q4_K_M · CPU")
166
 
167
+ # ── always-visible status bar ────────────────────────────
168
  stats_md = gr.Markdown(
169
  value=get_stats_md(),
170
  elem_id="stats"
171
  )
172
 
173
+ # ── optional system prompt ───────────────────────────────
174
  with gr.Accordion("βš™οΈ System Prompt", open=False):
175
  system_box = gr.Textbox(
176
  value="You are a helpful assistant.",
 
178
  show_label=False
179
  )
180
 
181
+ # ── conversation ─────────────────────────────────────────
182
  chatbot = gr.Chatbot(
183
  value=[],
 
184
  show_label=False,
185
+ height=400,
186
+ placeholder=None,
187
+ bubble_full_width=False
188
  )
189
 
190
+ # ── input row ────────────────────────────────────────────
191
  with gr.Row(equal_height=True):
192
  msg = gr.Textbox(
193
+ placeholder="Type a message…",
194
  show_label=False,
195
+ scale=9,
196
  lines=1,
197
+ max_lines=5,
198
  elem_id="msg"
199
  )
200
  send_btn = gr.Button(
201
  "➀",
202
  variant="primary",
203
  scale=1,
204
+ min_width=48,
205
+ elem_id="send-btn"
206
  )
207
 
208
  clear = gr.Button("πŸ—‘οΈ Clear", size="sm")
209
 
210
+ # ── wiring ───────────────────────────────────────────────
211
  def user_turn(message, history):
212
  return "", history + [[message, ""]]
213
 
 
239
  # ─────────────────────────────────────────────────────────────
240
  # FASTAPI
241
  # ─────────────────────────────────────────────────────────────
242
+ app = FastAPI(title="Qwen2.5-0.5B API")
243
 
244
  class ChatRequest(BaseModel):
245
  message: str
 
272
  )
273
 
274
  text = result["choices"][0]["text"].strip()
275
+
276
  return {
277
  "response": text,
278
  "tokens": result["usage"]["completion_tokens"],
279
  "process_ram_mb": round(get_process_ram_mb(), 1)
280
  }
281
 
282
+
283
+ # ─────────────────────────────────────────────────────────────
284
+ # MOUNT + RUN
285
+ # ─────────────────────────────────────────────────────────────
286
  app = gr.mount_gradio_app(app, demo, path="/")
287
 
288
  if __name__ == "__main__":
289
+ print("\n🌐 Starting on http://0.0.0.0:7860")
290
+ print(" UI β†’ http://0.0.0.0:7860/")
291
+ print(" API β†’ POST http://0.0.0.0:7860/chat")
292
+ print(" Health β†’ GET http://0.0.0.0:7860/health\n")
293
  uvicorn.run(app, host="0.0.0.0", port=7860)