Fu01978 committed
Commit 9ed7335 · verified · Parent(s): 50d2614

Update app.py

Files changed (1): app.py (+81, -121)
app.py CHANGED
@@ -1,3 +1,4 @@
 import os
 import shutil
 import time
@@ -7,7 +8,7 @@ from huggingface_hub import hf_hub_download, hf_hub_url
 from llama_cpp import Llama
 import gradio as gr

-# ------------------ CONFIG ------------------
 REPO_ID = "mradermacher/EuroLLM-1.7B-Instruct-GGUF"
 FILENAME = "EuroLLM-1.7B-Instruct.Q8_0.gguf"
 SYSTEM_PROMPT = "You are a helpful assistant. Answer concisely and helpfully."
@@ -15,24 +16,18 @@ MODEL_DIR = os.path.join(os.path.abspath(os.path.dirname(__file__)), "models")
 os.makedirs(MODEL_DIR, exist_ok=True)
 DEST_PATH = os.path.join(MODEL_DIR, FILENAME)

-# Llama runtime params
 N_CTX = 2048
 MAX_TOKENS = 512
 TEMPERATURE = 0.2
 TOP_P = 0.95
-# Threads: be conservative; Spaces gives limited CPU
 N_THREADS = min(4, max(1, (os.cpu_count() or 1) // 2))
-# --------------------------------------------

 def robust_download(repo_id: str, filename: str, dest: str, max_attempts: int = 2) -> str:
-    """
-    Download model robustly:
-    1) Try hf_hub_download (may return a cached path)
-    2) If cached path differs, copy to dest, fsync, chmod
-    3) If hf_hub_download fails repeat attempts then fallback to direct URL via requests
-    Returns the final dest path (guaranteed to exist and be readable or raises)
-    """
-    # quick exit
     if os.path.exists(dest) and os.path.getsize(dest) > 0:
         print(f"[robust_download] Already present: {dest} ({os.path.getsize(dest)} bytes)")
         return dest
@@ -40,45 +35,32 @@ def robust_download(repo_id: str, filename: str, dest: str, max_attempts: int =
     last_err = None
     for attempt in range(1, max_attempts + 1):
         try:
-            print(f"[robust_download] Attempt {attempt}: hf_hub_download(repo_id={repo_id}, filename={filename})")
             cached_path = hf_hub_download(repo_id=repo_id, filename=filename, cache_dir=MODEL_DIR)
             print("[robust_download] hf_hub_download returned:", cached_path)
-
-            # If hf_hub_download already saved at dest, good.
             if os.path.abspath(cached_path) != os.path.abspath(dest):
-                print(f"[robust_download] Copying cached file -> {dest}")
                 shutil.copy2(cached_path, dest)
-            else:
-                print("[robust_download] Cached path equals dest; no copy needed.")
-
-            # ensure it's synced to disk
             with open(dest, "rb") as f:
                 try:
                     f.flush()
                     os.fsync(f.fileno())
                 except Exception:
-                    # some hosted filesystems don't support fsync; ignore but we tried
                    pass
-
-            # set readable permissions
-            os.chmod(dest, stat.S_IRUSR | stat.S_IWUSR | stat.S_IRGRP | stat.S_IROTH)  # 0o644
-
             size = os.path.getsize(dest)
             if size == 0:
                 raise RuntimeError("Downloaded file has size 0 after copy")
             print(f"[robust_download] Success: {dest} ({size} bytes)")
             return dest
-
         except Exception as e:
             print(f"[robust_download] hf_hub_download attempt {attempt} failed: {e}")
             last_err = e
             time.sleep(1)

-    # fallback: direct URL using hf_hub_url + requests
     try:
         print("[robust_download] Falling back to direct download via requests...")
         url = hf_hub_url(repo_id=repo_id, filename=filename)
-        print("[robust_download] Downloading from URL:", url)
         tmp_path = dest + ".part"
         with requests.get(url, stream=True, timeout=120) as r:
             r.raise_for_status()
@@ -88,8 +70,6 @@ def robust_download(repo_id: str, filename: str, dest: str, max_attempts: int =
                     f.write(chunk)
             f.flush()
         shutil.move(tmp_path, dest)
-
-        # fsync + chmod
         with open(dest, "rb") as f:
             try:
                 os.fsync(f.fileno())
@@ -102,11 +82,9 @@ def robust_download(repo_id: str, filename: str, dest: str, max_attempts: int =
         print("[robust_download] Direct download failed:", e2)
         raise RuntimeError(f"All download attempts failed. last_err={last_err}, fallback_err={e2}")

-# ------------------ Ensure model exists ------------------
 print("Ensuring model present at:", DEST_PATH)
 model_path = robust_download(REPO_ID, FILENAME, DEST_PATH)
-
-# Debug listing of model dir (useful in logs)
 print("DEBUG: listing model dir:", MODEL_DIR)
 for fn in sorted(os.listdir(MODEL_DIR)):
     p = os.path.join(MODEL_DIR, fn)
@@ -115,38 +93,42 @@ for fn in sorted(os.listdir(MODEL_DIR)):
         print(f" - {fn}: exists, size={st.st_size}, mode={oct(st.st_mode)}")
     except FileNotFoundError:
         print(f" - {fn}: NOT FOUND after copy")
-
-# tiny delay for some hosted FS race conditions
 time.sleep(0.2)

-# ------------------ Initialize Llama ------------------
 try:
     print("Initializing Llama with model_path:", model_path)
     llm = Llama(
         model_path=model_path,
         n_ctx=N_CTX,
         n_threads=N_THREADS,
         n_gpu_layers=0,
     )
-    print("Llama initialized successfully.")
-except ValueError as ve:
-    print("Llama init ValueError:", ve)
-    print("Model dir listing at failure:")
-    for fn in sorted(os.listdir(MODEL_DIR)):
-        p = os.path.join(MODEL_DIR, fn)
-        try:
-            st = os.stat(p)
-            print(f" * {p}: size={st.st_size}, mode={oct(st.st_mode)}")
-        except Exception as ex:
-            print(" * stat failed for", p, ex)
     raise

-# ------------------ Helpers ------------------
 def build_messages(history, user_message, system_prompt=SYSTEM_PROMPT):
-    """
-    Convert Gradio history (list of [user, assistant]) into chat messages:
-    [{role: system}, {role: user}, {role: assistant}, ..., {role: user}]
-    """
     messages = []
     if system_prompt:
         messages.append({"role": "system", "content": system_prompt})
@@ -158,10 +140,6 @@ def build_messages(history, user_message, system_prompt=SYSTEM_PROMPT):
     return messages

 def parse_final_response(resp):
-    """
-    Normalize single-shot responses to a string.
-    Handles common llama-cpp-python shapes.
-    """
     try:
         if resp is None:
             return ""
@@ -171,28 +149,20 @@ def parse_final_response(resp):
         choices = resp.get("choices", [])
         if len(choices) > 0:
             c = choices[0]
-            # message.content (chat format)
             if isinstance(c.get("message"), dict):
                 return c["message"].get("content", "") or ""
-            # text fallback
             if "text" in c and c["text"]:
                 return c["text"]
-            # delta fallback
             if "delta" in c and isinstance(c["delta"], dict):
                 return c["delta"].get("content", "") or ""
         return str(resp)
     except Exception:
         return str(resp)

-# ------------------ Robust streaming chat function ------------------
 def chat_fn(user_message, history):
-    """
-    Generator for Gradio ChatInterface streaming.
-    Yields progressive partial strings. Falls back to non-streaming if needed.
-    Ensures Gradio never sees an empty generator (avoids StopAsyncIteration crash).
-    """
     messages = build_messages(history or [], user_message)
-    # Attempt streaming call
     try:
         stream = llm.create_chat_completion(
             messages=messages,
@@ -202,75 +172,69 @@ def chat_fn(user_message, history):
             stream=True
         )
     except Exception as e:
-        # immediate failure: try non-stream fallback
         try:
-            final = llm.create_chat_completion(
-                messages=messages,
-                max_tokens=MAX_TOKENS,
-                temperature=TEMPERATURE,
-                top_p=TOP_P,
-                stream=False
-            )
-            text = parse_final_response(final)
-            yield text
             return
         except Exception as e2:
             yield f"[error] create_chat_completion failed: {e} | fallback error: {e2}"
             return

-    # If the returned object isn't iterable, treat as final
     if not hasattr(stream, "__iter__"):
-        final_text = parse_final_response(stream)
-        yield final_text
         return

     partial = ""
     yielded_any = False

     try:
         for chunk in stream:
-            # Debug: uncomment to log raw chunks
-            # print("STREAM CHUNK:", repr(chunk))

             if not chunk:
                 continue
-            # expected format: {"choices":[{"delta":{"content":"..."}}], ...}
             try:
-                choices = chunk.get("choices", [])
-            except Exception:
-                # some weird shapes might be simple strings
                 chunk_str = str(chunk)
                 if chunk_str:
                     partial += chunk_str
                     yielded_any = True
                     yield partial
-                continue
-
-            if len(choices) == 0:
-                continue
-
-            c0 = choices[0]
-            # delta content (usual)
-            delta = c0.get("delta", {})
-            if isinstance(delta, dict) and "content" in delta:
-                partial += delta["content"]
-                yielded_any = True
-                yield partial
-                continue
-
-            # final message object (some runners)
-            msg = c0.get("message") or c0.get("text") or {}
-            if isinstance(msg, dict):
-                content = msg.get("content") or msg.get("content_text") or ""
-                if content:
-                    partial = content
-                    yielded_any = True
-                    yield partial
-                continue
-            elif isinstance(msg, str) and msg:
-                partial += msg
-                yielded_any = True
-                yield partial
                continue

     except StopIteration:
@@ -283,11 +247,7 @@ def chat_fn(user_message, history):
     if not yielded_any:
         try:
             final = llm.create_chat_completion(
-                messages=messages,
-                max_tokens=MAX_TOKENS,
-                temperature=TEMPERATURE,
-                top_p=TOP_P,
-                stream=False
             )
             final_text = parse_final_response(final)
             yield final_text if final_text is not None else ""
@@ -296,13 +256,13 @@ def chat_fn(user_message, history):
             yield f"[error] fallback non-stream failed: {e}"
     return

-# ------------------ Launch Gradio ------------------
 demo = gr.ChatInterface(
     fn=chat_fn,
-    title="EuroLLM 1.7B (GGUF) Robust streaming chat",
-    description="System prompt enabled. Streaming ON.",
 )

 if __name__ == "__main__":
-    # If you want a public link during testing, pass share=True in launch()
     demo.launch()
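The body of build_messages is collapsed in the diff above. Going by the removed docstring ("Convert Gradio history (list of [user, assistant]) into chat messages"), the conversion it performs can be sketched roughly as below. This is an illustrative, standalone snippet rather than code from the commit; the helper name and details are hypothetical.

# Illustrative sketch only: the history-to-messages conversion the removed
# build_messages docstring describes. Function name is hypothetical.
SYSTEM_PROMPT = "You are a helpful assistant. Answer concisely and helpfully."

def history_to_messages(history, user_message, system_prompt=SYSTEM_PROMPT):
    messages = []
    if system_prompt:
        messages.append({"role": "system", "content": system_prompt})
    # Gradio ChatInterface history is a list of [user, assistant] pairs
    for user_turn, assistant_turn in history or []:
        if user_turn:
            messages.append({"role": "user", "content": user_turn})
        if assistant_turn:
            messages.append({"role": "assistant", "content": assistant_turn})
    messages.append({"role": "user", "content": user_message})
    return messages

print(history_to_messages([["Hi", "Hello!"]], "How are you?"))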
 
 
app.py (new version):
+# app.py (robust streaming + chat_format + debug)
 import os
 import shutil
 import time

 from llama_cpp import Llama
 import gradio as gr

+# ------------- CONFIG -------------
 REPO_ID = "mradermacher/EuroLLM-1.7B-Instruct-GGUF"
 FILENAME = "EuroLLM-1.7B-Instruct.Q8_0.gguf"
 SYSTEM_PROMPT = "You are a helpful assistant. Answer concisely and helpfully."

 os.makedirs(MODEL_DIR, exist_ok=True)
 DEST_PATH = os.path.join(MODEL_DIR, FILENAME)

 N_CTX = 2048
 MAX_TOKENS = 512
 TEMPERATURE = 0.2
 TOP_P = 0.95
 N_THREADS = min(4, max(1, (os.cpu_count() or 1) // 2))
+
+# Debug controls
+DEBUG_CHUNKS = True               # prints every raw stream chunk to logs
+DEBUG_SINGLESHOT_AT_START = True  # run a non-stream single-shot test at startup and log result
+# -----------------------------------

 def robust_download(repo_id: str, filename: str, dest: str, max_attempts: int = 2) -> str:

     if os.path.exists(dest) and os.path.getsize(dest) > 0:
         print(f"[robust_download] Already present: {dest} ({os.path.getsize(dest)} bytes)")
         return dest

     last_err = None
     for attempt in range(1, max_attempts + 1):
         try:
+            print(f"[robust_download] Attempt {attempt}: hf_hub_download...")
             cached_path = hf_hub_download(repo_id=repo_id, filename=filename, cache_dir=MODEL_DIR)
             print("[robust_download] hf_hub_download returned:", cached_path)
             if os.path.abspath(cached_path) != os.path.abspath(dest):
                 shutil.copy2(cached_path, dest)
             with open(dest, "rb") as f:
                 try:
                     f.flush()
                     os.fsync(f.fileno())
                 except Exception:
                     pass
+            os.chmod(dest, stat.S_IRUSR | stat.S_IWUSR | stat.S_IRGRP | stat.S_IROTH)
             size = os.path.getsize(dest)
             if size == 0:
                 raise RuntimeError("Downloaded file has size 0 after copy")
             print(f"[robust_download] Success: {dest} ({size} bytes)")
             return dest
         except Exception as e:
             print(f"[robust_download] hf_hub_download attempt {attempt} failed: {e}")
             last_err = e
             time.sleep(1)

+    # fallback: direct url
     try:
         print("[robust_download] Falling back to direct download via requests...")
         url = hf_hub_url(repo_id=repo_id, filename=filename)
         tmp_path = dest + ".part"
         with requests.get(url, stream=True, timeout=120) as r:
             r.raise_for_status()

                     f.write(chunk)
             f.flush()
         shutil.move(tmp_path, dest)
         with open(dest, "rb") as f:
             try:
                 os.fsync(f.fileno())

         print("[robust_download] Direct download failed:", e2)
         raise RuntimeError(f"All download attempts failed. last_err={last_err}, fallback_err={e2}")

+# Ensure model
 print("Ensuring model present at:", DEST_PATH)
 model_path = robust_download(REPO_ID, FILENAME, DEST_PATH)
 print("DEBUG: listing model dir:", MODEL_DIR)
 for fn in sorted(os.listdir(MODEL_DIR)):
     p = os.path.join(MODEL_DIR, fn)

         print(f" - {fn}: exists, size={st.st_size}, mode={oct(st.st_mode)}")
     except FileNotFoundError:
         print(f" - {fn}: NOT FOUND after copy")
 time.sleep(0.2)

+# ----------------- Llama init -----------------
 try:
     print("Initializing Llama with model_path:", model_path)
+    # *** IMPORTANT: set chat_format so the bindings format messages correctly ***
     llm = Llama(
         model_path=model_path,
         n_ctx=N_CTX,
         n_threads=N_THREADS,
         n_gpu_layers=0,
+        chat_format="chatml",  # <- often fixes blank replies for Llama-family GGUFs. See docs.
     )
+    print("Llama initialized.")
+except Exception as e:
+    print("Llama init failed:", e)
     raise

+# optional single-shot debug test at startup (prints final structure)
+def run_startup_test():
+    try:
+        test_messages = [
+            {"role": "system", "content": SYSTEM_PROMPT},
+            {"role": "user", "content": "Say hello in one short sentence."}
+        ]
+        print("[startup_test] Running single-shot create_chat_completion (stream=False)...")
+        out = llm.create_chat_completion(messages=test_messages, max_tokens=64, stream=False)
+        print("[startup_test] Single-shot response (raw):", out)
+    except Exception as e:
+        print("[startup_test] Error during single-shot test:", e)
+
+if DEBUG_SINGLESHOT_AT_START:
+    run_startup_test()
+
+# ----------------- helpers -----------------
 def build_messages(history, user_message, system_prompt=SYSTEM_PROMPT):

     messages = []
     if system_prompt:
         messages.append({"role": "system", "content": system_prompt})

     return messages

 def parse_final_response(resp):

     try:
         if resp is None:
             return ""

         choices = resp.get("choices", [])
         if len(choices) > 0:
             c = choices[0]
             if isinstance(c.get("message"), dict):
                 return c["message"].get("content", "") or ""
             if "text" in c and c["text"]:
                 return c["text"]
             if "delta" in c and isinstance(c["delta"], dict):
                 return c["delta"].get("content", "") or ""
         return str(resp)
     except Exception:
         return str(resp)

+# ----------------- robust streaming chat -----------------
 def chat_fn(user_message, history):

     messages = build_messages(history or [], user_message)
+
     try:
         stream = llm.create_chat_completion(
             messages=messages,

             stream=True
         )
     except Exception as e:
+        # immediate failure -> non-stream fallback
         try:
+            final = llm.create_chat_completion(messages=messages, max_tokens=MAX_TOKENS, stream=False)
+            yield parse_final_response(final)
             return
         except Exception as e2:
             yield f"[error] create_chat_completion failed: {e} | fallback error: {e2}"
             return

+    # If not iterable, treat as final
     if not hasattr(stream, "__iter__"):
+        yield parse_final_response(stream)
         return

     partial = ""
     yielded_any = False
+    buffer_for_unicode = ""  # helper to accumulate partial bytes/characters

     try:
         for chunk in stream:
+            if DEBUG_CHUNKS:
+                print("STREAM CHUNK:", repr(chunk))

             if not chunk:
                 continue
+
+            # try normal shape
+            choices = chunk.get("choices", []) if isinstance(chunk, dict) else []
+            if choices and len(choices) > 0:
+                c0 = choices[0]
+                delta = c0.get("delta", {})
+                if isinstance(delta, dict) and "content" in delta:
+                    # accumulate and yield only when new non-empty content appears
+                    new = delta["content"]
+                    if new:
+                        partial += new
+                        yielded_any = True
+                        yield partial
+                    continue
+
+                # some runners provide 'message' as full object
+                msg = c0.get("message") or c0.get("text")
+                if isinstance(msg, dict):
+                    content = msg.get("content") or msg.get("content_text") or ""
+                    if content:
+                        partial = content
+                        yielded_any = True
+                        yield partial
+                    continue
+                elif isinstance(msg, str) and msg:
+                    partial += msg
+                    yielded_any = True
+                    yield partial
+                    continue
+
+            # fallback: if chunk is a plain string or other shape, append its string form
             try:
                 chunk_str = str(chunk)
                 if chunk_str:
                     partial += chunk_str
                     yielded_any = True
                     yield partial
+            except Exception:
                 continue

     except StopIteration:

     if not yielded_any:
         try:
             final = llm.create_chat_completion(
+                messages=messages, max_tokens=MAX_TOKENS, temperature=TEMPERATURE, top_p=TOP_P, stream=False
             )
             final_text = parse_final_response(final)
             yield final_text if final_text is not None else ""

             yield f"[error] fallback non-stream failed: {e}"
     return

+# --------------- Launch Gradio ----------------
 demo = gr.ChatInterface(
     fn=chat_fn,
+    title="EuroLLM 1.7B (with robust streaming & chat_format)",
+    description="Streaming ON. Check logs for STREAM CHUNK lines if behaviour is blank.",
 )

 if __name__ == "__main__":
     demo.launch()
+
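For context on the new chat path, here is a minimal standalone sketch of the pattern the updated file relies on: constructing Llama with an explicit chat_format and accumulating the streamed delta content. It is illustrative only; the model path, thread count, and prompts below are placeholders, not values taken from this Space.

# Minimal sketch (assumption: llama-cpp-python is installed; the GGUF path is a placeholder).
from llama_cpp import Llama

llm = Llama(
    model_path="models/EuroLLM-1.7B-Instruct.Q8_0.gguf",  # placeholder path
    n_ctx=2048,
    n_threads=2,
    chat_format="chatml",  # have the bindings apply a chat template to the messages
)

messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Say hello in one short sentence."},
]

partial = ""
for chunk in llm.create_chat_completion(messages=messages, max_tokens=64, stream=True):
    delta = chunk["choices"][0].get("delta", {})
    piece = delta.get("content")
    if piece:
        partial += piece
        print(partial)  # a Gradio generator would `yield partial` here instead

Passing chat_format explicitly tells llama-cpp-python which prompt template to apply to the message list; when it is omitted, the bindings try to infer a template from the GGUF metadata, which the added comment in the commit suggests was producing blank replies here.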