Fu01978 committed
Commit dc8348c (verified) · 1 Parent(s): 9ed7335

Update app.py

Files changed (1):
  1. app.py (+63 -43)
app.py CHANGED
@@ -1,4 +1,4 @@
-# app.py (robust streaming + chat_format + debug)
+# app.py robust downloader + chat_format + streaming parser that handles role-only and plain-string chunks
 import os
 import shutil
 import time
@@ -23,7 +23,7 @@ TOP_P = 0.95
 N_THREADS = min(4, max(1, (os.cpu_count() or 1) // 2))
 
 # Debug controls
-DEBUG_CHUNKS = True # prints every raw stream chunk to logs
+DEBUG_CHUNKS = True # prints every raw stream chunk to logs (turn off if noisy)
 DEBUG_SINGLESHOT_AT_START = True # run a non-stream single-shot test at startup and log result
 # -----------------------------------
 
@@ -98,13 +98,12 @@ time.sleep(0.2)
 # ----------------- Llama init -----------------
 try:
     print("Initializing Llama with model_path:", model_path)
-    # *** IMPORTANT: set chat_format so the bindings format messages correctly ***
     llm = Llama(
         model_path=model_path,
         n_ctx=N_CTX,
         n_threads=N_THREADS,
         n_gpu_layers=0,
-        chat_format="chatml", # <- often fixes blank replies for Llama-family GGUFs. See docs.
+        chat_format="chatml", # important so the binding formats messages correctly
     )
     print("Llama initialized.")
 except Exception as e:
@@ -163,6 +162,7 @@ def parse_final_response(resp):
 def chat_fn(user_message, history):
     messages = build_messages(history or [], user_message)
 
+    # Try streaming
     try:
         stream = llm.create_chat_completion(
             messages=messages,
@@ -181,61 +181,84 @@ def chat_fn(user_message, history):
             yield f"[error] create_chat_completion failed: {e} | fallback error: {e2}"
             return
 
-    # If not iterable, treat as final
+    # Non-iterable stream -> final
     if not hasattr(stream, "__iter__"):
         yield parse_final_response(stream)
         return
 
     partial = ""
     yielded_any = False
-    buffer_for_unicode = "" # helper to accumulate partial bytes/characters
 
     try:
         for chunk in stream:
             if DEBUG_CHUNKS:
                 print("STREAM CHUNK:", repr(chunk))
 
-            if not chunk:
-                continue
-
-            # try normal shape
-            choices = chunk.get("choices", []) if isinstance(chunk, dict) else []
-            if choices and len(choices) > 0:
-                c0 = choices[0]
-                delta = c0.get("delta", {})
-                if isinstance(delta, dict) and "content" in delta:
-                    # accumulate and yield only when new non-empty content appears
-                    new = delta["content"]
-                    if new:
-                        partial += new
+            # Case A: chunk is a dict with "choices" (normal)
+            if isinstance(chunk, dict):
+                choices = chunk.get("choices", []) or []
+                if len(choices) > 0:
+                    c0 = choices[0]
+
+                    # 1) delta with content
+                    delta = c0.get("delta", {})
+                    if isinstance(delta, dict) and "content" in delta and delta["content"]:
+                        partial += delta["content"]
                         yielded_any = True
                         yield partial
                         continue
 
-            # some runners provide 'message' as full object
-            msg = c0.get("message") or c0.get("text")
-            if isinstance(msg, dict):
-                content = msg.get("content") or msg.get("content_text") or ""
-                if content:
-                    partial = content
+                    # 2) delta with role only (e.g. {"role":"assistant"}) -> ignore for content
+                    if isinstance(delta, dict) and "role" in delta and not delta.get("content"):
+                        # role announcement, not content
+                        continue
+
+                    # 3) sometimes a 'message' object appears with content
+                    msg = c0.get("message") or c0.get("text")
+                    if isinstance(msg, dict):
+                        content = msg.get("content") or msg.get("content_text") or ""
+                        if content:
+                            partial = content
+                            yielded_any = True
+                            yield partial
+                            continue
+                    elif isinstance(msg, str) and msg:
+                        partial += msg
                         yielded_any = True
                         yield partial
                         continue
-            elif isinstance(msg, str) and msg:
-                partial += msg
-                yielded_any = True
-                yield partial
-                continue
 
-            # fallback: if chunk is a plain string or other shape, append its string form
-            try:
-                chunk_str = str(chunk)
-                if chunk_str:
-                    partial += chunk_str
-                    yielded_any = True
-                    yield partial
-            except Exception:
-                continue
+                    # 4) finish reason with empty delta -> if we have accumulated text, yield it; else fallback
+                    finish_reason = c0.get("finish_reason")
+                    if finish_reason:
+                        if partial:
+                            # we already have content; ensure UI gets it
+                            if not yielded_any:
+                                yield partial
+                            return
+                        else:
+                            # no content accumulated — do a non-stream final fetch
+                            try:
+                                final = llm.create_chat_completion(messages=messages, max_tokens=MAX_TOKENS, stream=False)
+                                final_text = parse_final_response(final)
+                                yield final_text
+                                return
+                            except Exception as e:
+                                yield f"[error] fallback non-stream at finish failed: {e}"
+                                return
+
+            # Case B: chunk is not a dict (plain string or other)
+            else:
+                try:
+                    chunk_str = str(chunk)
+                    if chunk_str and chunk_str.strip():
+                        partial += chunk_str
+                        yielded_any = True
+                        yield partial
+                        continue
+                except Exception:
+                    # ignore weird chunk -> continue
+                    continue
 
     except StopIteration:
         pass
@@ -243,12 +266,10 @@ def chat_fn(user_message, history):
         yield f"[error] stream iteration error: {e}"
         return
 
-    # If streaming produced nothing, fallback to non-stream
+    # If streaming produced nothing, final non-stream fallback
     if not yielded_any:
         try:
-            final = llm.create_chat_completion(
-                messages=messages, max_tokens=MAX_TOKENS, temperature=TEMPERATURE, top_p=TOP_P, stream=False
-            )
+            final = llm.create_chat_completion(messages=messages, max_tokens=MAX_TOKENS, temperature=TEMPERATURE, top_p=TOP_P, stream=False)
             final_text = parse_final_response(final)
             yield final_text if final_text is not None else ""
             return
@@ -265,4 +286,3 @@ demo = gr.ChatInterface(
 
 if __name__ == "__main__":
     demo.launch()
-
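
Note on the chat_format="chatml" line: in recent llama-cpp-python versions, leaving chat_format unset makes the binding try to infer a chat template from the GGUF's tokenizer.chat_template metadata, falling back to a default when none is found; a mismatched template is a common cause of the blank replies this commit is debugging. A minimal sketch of the explicit override, with model_path as a placeholder (app.py resolves the real path earlier in the file):

from llama_cpp import Llama

model_path = "model.gguf"  # placeholder; app.py downloads and resolves the real path

# Force the ChatML template instead of relying on GGUF metadata autodetection.
llm = Llama(model_path=model_path, n_ctx=2048, n_threads=2, chat_format="chatml")
out = llm.create_chat_completion(
    messages=[{"role": "user", "content": "Say hi."}],
    max_tokens=32,
    stream=False,
)
print(out["choices"][0]["message"]["content"])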
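
For reference, the chunk shapes the rewritten stream loop distinguishes look roughly like this. The dicts below are hand-written stand-ins for llama-cpp-python's OpenAI-style streaming output, not a captured trace; real chunks carry extra fields (id, model, created, ...) that the parser ignores:

# Illustrative inputs for the four branches of chat_fn's loop.
role_only = {"choices": [{"index": 0, "delta": {"role": "assistant"}, "finish_reason": None}]}  # A-2: skipped
content = {"choices": [{"index": 0, "delta": {"content": "Hel"}, "finish_reason": None}]}       # A-1: appended
finish = {"choices": [{"index": 0, "delta": {}, "finish_reason": "stop"}]}                      # A-4: flush or fallback
plain = "lo"                                                                                    # B: str() and appended

partial = ""
for chunk in (role_only, content, finish, plain):
    if isinstance(chunk, dict):
        c0 = (chunk.get("choices") or [{}])[0]
        if c0.get("delta", {}).get("content"):
            partial += c0["delta"]["content"]
        elif c0.get("finish_reason"):
            print("finish_reason seen with partial =", repr(partial))  # 'Hel'
    else:
        partial += str(chunk)
print(repr(partial))  # 'Hello'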
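
parse_final_response() is called on every non-stream result but is defined outside the hunks shown above (near old line 162). Assuming the non-stream create_chat_completion result is the usual OpenAI-shaped dict, a hypothetical sketch of what such a helper has to unpack; the real one in app.py may differ:

# Hypothetical sketch only; app.py's actual parse_final_response is not in this diff.
def parse_final_response(resp):
    # Non-stream results look like {"choices": [{"message": {"content": ...}}]}.
    if isinstance(resp, dict):
        c0 = (resp.get("choices") or [{}])[0]
        msg = c0.get("message") or {}
        if isinstance(msg, dict) and msg.get("content"):
            return msg["content"]
        # some runners put plain text under "text" instead
        return c0.get("text") or ""
    return str(resp)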