Update app.py
app.py CHANGED
@@ -1,4 +1,4 @@
-# app.py
+# app.py — robust downloader + chat_format + streaming parser that handles role-only and plain-string chunks
 import os
 import shutil
 import time
@@ -23,7 +23,7 @@ TOP_P = 0.95
 N_THREADS = min(4, max(1, (os.cpu_count() or 1) // 2))
 
 # Debug controls
-DEBUG_CHUNKS = True  # prints every raw stream chunk to logs
+DEBUG_CHUNKS = True  # prints every raw stream chunk to logs (turn off if noisy)
 DEBUG_SINGLESHOT_AT_START = True  # run a non-stream single-shot test at startup and log result
 # -----------------------------------
 
@@ -98,13 +98,12 @@ time.sleep(0.2)
 # ----------------- Llama init -----------------
 try:
     print("Initializing Llama with model_path:", model_path)
-    # *** IMPORTANT: set chat_format so the bindings format messages correctly ***
     llm = Llama(
         model_path=model_path,
         n_ctx=N_CTX,
         n_threads=N_THREADS,
         n_gpu_layers=0,
-        chat_format="chatml",  #
+        chat_format="chatml",  # important so the binding formats messages correctly
     )
     print("Llama initialized.")
 except Exception as e:
@@ -163,6 +162,7 @@ def parse_final_response(resp):
 def chat_fn(user_message, history):
     messages = build_messages(history or [], user_message)
 
+    # Try streaming
     try:
         stream = llm.create_chat_completion(
            messages=messages,
@@ -181,61 +181,84 @@ def chat_fn(user_message, history):
        yield f"[error] create_chat_completion failed: {e} | fallback error: {e2}"
        return
 
-    #
+    # Non-iterable stream -> final
     if not hasattr(stream, "__iter__"):
        yield parse_final_response(stream)
        return
 
     partial = ""
     yielded_any = False
-    buffer_for_unicode = ""  # helper to accumulate partial bytes/characters
 
     try:
        for chunk in stream:
            if DEBUG_CHUNKS:
                print("STREAM CHUNK:", repr(chunk))
 
-            delta = c0.get("delta", {})
-            if isinstance(delta, dict) and "content" in delta:
-                # accumulate and yield only when new non-empty content appears
-                new = delta["content"]
-                if new:
-                    partial += new
-                    yielded_any = True
-                    yield partial
-                    continue
-            elif isinstance(msg, str) and msg:
-                partial += msg
-                yielded_any = True
-                yield partial
-                continue
+            # Case A: chunk is a dict with "choices" (normal)
+            if isinstance(chunk, dict):
+                choices = chunk.get("choices", []) or []
+                if len(choices) > 0:
+                    c0 = choices[0]
+
+                    # 1) delta with content
+                    delta = c0.get("delta", {})
+                    if isinstance(delta, dict) and "content" in delta and delta["content"]:
+                        partial += delta["content"]
+                        yielded_any = True
+                        yield partial
+                        continue
+
+                    # 2) delta with role only (e.g. {"role":"assistant"}) -> ignore for content
+                    if isinstance(delta, dict) and "role" in delta and not delta.get("content"):
+                        # role announcement, not content
+                        continue
+
+                    # 3) sometimes a 'message' object appears with content
+                    msg = c0.get("message") or c0.get("text")
+                    if isinstance(msg, dict):
+                        content = msg.get("content") or msg.get("content_text") or ""
+                        if content:
+                            partial = content
+                            yielded_any = True
+                            yield partial
+                            continue
+                    elif isinstance(msg, str) and msg:
+                        partial += msg
+                        yielded_any = True
+                        yield partial
+                        continue
+
+                    # 4) finish reason with empty delta -> if we have accumulated text, yield it; else fallback
+                    finish_reason = c0.get("finish_reason")
+                    if finish_reason:
+                        if partial:
+                            # we already have content; ensure UI gets it
+                            if not yielded_any:
+                                yield partial
+                            return
+                        else:
+                            # no content accumulated — do a non-stream final fetch
+                            try:
+                                final = llm.create_chat_completion(messages=messages, max_tokens=MAX_TOKENS, stream=False)
+                                final_text = parse_final_response(final)
+                                yield final_text
+                                return
+                            except Exception as e:
+                                yield f"[error] fallback non-stream at finish failed: {e}"
+                                return
+
+            # Case B: chunk is not a dict (plain string or other)
+            else:
+                try:
+                    chunk_str = str(chunk)
+                    if chunk_str and chunk_str.strip():
+                        partial += chunk_str
+                        yielded_any = True
+                        yield partial
+                        continue
+                except Exception:
+                    # ignore weird chunk -> continue
+                    continue
 
     except StopIteration:
        pass
@@ -243,12 +266,10 @@ def chat_fn(user_message, history):
        yield f"[error] stream iteration error: {e}"
        return
 
-    # If streaming produced nothing,
+    # If streaming produced nothing, final non-stream fallback
     if not yielded_any:
        try:
-            final = llm.create_chat_completion(
-                messages=messages, max_tokens=MAX_TOKENS, temperature=TEMPERATURE, top_p=TOP_P, stream=False
-            )
+            final = llm.create_chat_completion(messages=messages, max_tokens=MAX_TOKENS, temperature=TEMPERATURE, top_p=TOP_P, stream=False)
            final_text = parse_final_response(final)
            yield final_text if final_text is not None else ""
            return
@@ -265,4 +286,3 @@ demo = gr.ChatInterface(
 
 if __name__ == "__main__":
     demo.launch()
-
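
For reference, a minimal sketch of the chunk shapes the rewritten parser distinguishes. The payloads below are assumptions inferred from the branches in chat_fn, not captured llama-cpp-python output, and the helper is a simplified restatement of the accumulation logic, not the Space's code:

    # Illustrative only: representative stream chunk shapes (assumed, not captured output).
    example_chunks = [
        {"choices": [{"delta": {"role": "assistant"}}]},        # role-only delta -> skipped (case 2)
        {"choices": [{"delta": {"content": "Hel"}}]},           # content delta -> accumulated (case 1)
        {"choices": [{"delta": {"content": "lo"}}]},
        " there",                                                # non-dict chunk -> Case B
        {"choices": [{"delta": {}, "finish_reason": "stop"}]},  # finish chunk -> flush or fallback (case 4)
    ]

    def accumulate(chunks):
        """Simplified restatement of the accumulation logic, for the shapes above only."""
        partial = ""
        for chunk in chunks:
            if isinstance(chunk, dict):
                c0 = (chunk.get("choices") or [{}])[0]
                delta = c0.get("delta") or {}
                if delta.get("content"):
                    partial += delta["content"]
                elif c0.get("finish_reason"):
                    break
            else:
                partial += str(chunk)
        return partial

    print(accumulate(example_chunks))  # -> "Hello there"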