Fu01978 committed
Commit 9ed7335 · verified · Parent(s): 50d2614

Update app.py

Files changed (1): app.py (+81, -121)
app.py CHANGED
@@ -1,3 +1,4 @@
 import os
 import shutil
 import time
@@ -7,7 +8,7 @@ from huggingface_hub import hf_hub_download, hf_hub_url
 from llama_cpp import Llama
 import gradio as gr

-# ------------------ CONFIG ------------------
 REPO_ID = "mradermacher/EuroLLM-1.7B-Instruct-GGUF"
 FILENAME = "EuroLLM-1.7B-Instruct.Q8_0.gguf"
 SYSTEM_PROMPT = "You are a helpful assistant. Answer concisely and helpfully."
@@ -15,24 +16,18 @@ MODEL_DIR = os.path.join(os.path.abspath(os.path.dirname(__file__)), "models")
 os.makedirs(MODEL_DIR, exist_ok=True)
 DEST_PATH = os.path.join(MODEL_DIR, FILENAME)

-# Llama runtime params
 N_CTX = 2048
 MAX_TOKENS = 512
 TEMPERATURE = 0.2
 TOP_P = 0.95
-# Threads: be conservative; Spaces gives limited CPU
 N_THREADS = min(4, max(1, (os.cpu_count() or 1) // 2))
-# --------------------------------------------

 def robust_download(repo_id: str, filename: str, dest: str, max_attempts: int = 2) -> str:
-    """
-    Download model robustly:
-    1) Try hf_hub_download (may return a cached path)
-    2) If cached path differs, copy to dest, fsync, chmod
-    3) If hf_hub_download fails repeat attempts then fallback to direct URL via requests
-    Returns the final dest path (guaranteed to exist and be readable or raises)
-    """
-    # quick exit
     if os.path.exists(dest) and os.path.getsize(dest) > 0:
         print(f"[robust_download] Already present: {dest} ({os.path.getsize(dest)} bytes)")
         return dest
@@ -40,45 +35,32 @@ def robust_download(repo_id: str, filename: str, dest: str, max_attempts: int =
     last_err = None
     for attempt in range(1, max_attempts + 1):
         try:
-            print(f"[robust_download] Attempt {attempt}: hf_hub_download(repo_id={repo_id}, filename={filename})")
             cached_path = hf_hub_download(repo_id=repo_id, filename=filename, cache_dir=MODEL_DIR)
             print("[robust_download] hf_hub_download returned:", cached_path)
-
-            # If hf_hub_download already saved at dest, good.
             if os.path.abspath(cached_path) != os.path.abspath(dest):
-                print(f"[robust_download] Copying cached file -> {dest}")
                 shutil.copy2(cached_path, dest)
-            else:
-                print("[robust_download] Cached path equals dest; no copy needed.")
-
-            # ensure it's synced to disk
             with open(dest, "rb") as f:
                 try:
                     f.flush()
                     os.fsync(f.fileno())
                 except Exception:
-                    # some hosted filesystems don't support fsync; ignore but we tried
                    pass
-
-            # set readable permissions
-            os.chmod(dest, stat.S_IRUSR | stat.S_IWUSR | stat.S_IRGRP | stat.S_IROTH)  # 0o644
-
             size = os.path.getsize(dest)
             if size == 0:
                 raise RuntimeError("Downloaded file has size 0 after copy")
             print(f"[robust_download] Success: {dest} ({size} bytes)")
             return dest
-
         except Exception as e:
             print(f"[robust_download] hf_hub_download attempt {attempt} failed: {e}")
             last_err = e
             time.sleep(1)

-    # fallback: direct URL using hf_hub_url + requests
     try:
         print("[robust_download] Falling back to direct download via requests...")
         url = hf_hub_url(repo_id=repo_id, filename=filename)
-        print("[robust_download] Downloading from URL:", url)
         tmp_path = dest + ".part"
         with requests.get(url, stream=True, timeout=120) as r:
             r.raise_for_status()
@@ -88,8 +70,6 @@ def robust_download(repo_id: str, filename: str, dest: str, max_attempts: int =
                     f.write(chunk)
             f.flush()
         shutil.move(tmp_path, dest)
-
-        # fsync + chmod
         with open(dest, "rb") as f:
             try:
                 os.fsync(f.fileno())
@@ -102,11 +82,9 @@ def robust_download(repo_id: str, filename: str, dest: str, max_attempts: int =
         print("[robust_download] Direct download failed:", e2)
         raise RuntimeError(f"All download attempts failed. last_err={last_err}, fallback_err={e2}")

-# ------------------ Ensure model exists ------------------
 print("Ensuring model present at:", DEST_PATH)
 model_path = robust_download(REPO_ID, FILENAME, DEST_PATH)
-
-# Debug listing of model dir (useful in logs)
 print("DEBUG: listing model dir:", MODEL_DIR)
 for fn in sorted(os.listdir(MODEL_DIR)):
     p = os.path.join(MODEL_DIR, fn)
@@ -115,38 +93,42 @@ for fn in sorted(os.listdir(MODEL_DIR)):
         print(f" - {fn}: exists, size={st.st_size}, mode={oct(st.st_mode)}")
     except FileNotFoundError:
         print(f" - {fn}: NOT FOUND after copy")
-
-# tiny delay for some hosted FS race conditions
 time.sleep(0.2)

-# ------------------ Initialize Llama ------------------
 try:
     print("Initializing Llama with model_path:", model_path)
     llm = Llama(
         model_path=model_path,
         n_ctx=N_CTX,
         n_threads=N_THREADS,
         n_gpu_layers=0,
     )
-    print("Llama initialized successfully.")
-except ValueError as ve:
-    print("Llama init ValueError:", ve)
-    print("Model dir listing at failure:")
-    for fn in sorted(os.listdir(MODEL_DIR)):
-        p = os.path.join(MODEL_DIR, fn)
-        try:
-            st = os.stat(p)
-            print(f" * {p}: size={st.st_size}, mode={oct(st.st_mode)}")
-        except Exception as ex:
-            print(" * stat failed for", p, ex)
     raise

-# ------------------ Helpers ------------------
 def build_messages(history, user_message, system_prompt=SYSTEM_PROMPT):
-    """
-    Convert Gradio history (list of [user, assistant]) into chat messages:
-    [{role: system}, {role: user}, {role: assistant}, ..., {role: user}]
-    """
     messages = []
     if system_prompt:
         messages.append({"role": "system", "content": system_prompt})
@@ -158,10 +140,6 @@ def build_messages(history, user_message, system_prompt=SYSTEM_PROMPT):
     return messages

 def parse_final_response(resp):
-    """
-    Normalize single-shot responses to a string.
-    Handles common llama-cpp-python shapes.
-    """
     try:
         if resp is None:
             return ""
@@ -171,28 +149,20 @@ def parse_final_response(resp):
         choices = resp.get("choices", [])
         if len(choices) > 0:
             c = choices[0]
-            # message.content (chat format)
             if isinstance(c.get("message"), dict):
                 return c["message"].get("content", "") or ""
-            # text fallback
             if "text" in c and c["text"]:
                 return c["text"]
-            # delta fallback
             if "delta" in c and isinstance(c["delta"], dict):
                 return c["delta"].get("content", "") or ""
         return str(resp)
     except Exception:
         return str(resp)

-# ------------------ Robust streaming chat function ------------------
 def chat_fn(user_message, history):
-    """
-    Generator for Gradio ChatInterface streaming.
-    Yields progressive partial strings. Falls back to non-streaming if needed.
-    Ensures Gradio never sees an empty generator (avoids StopAsyncIteration crash).
-    """
     messages = build_messages(history or [], user_message)
-    # Attempt streaming call
     try:
         stream = llm.create_chat_completion(
             messages=messages,
@@ -202,75 +172,69 @@ def chat_fn(user_message, history):
             stream=True
         )
     except Exception as e:
-        # immediate failure: try non-stream fallback
         try:
-            final = llm.create_chat_completion(
-                messages=messages,
-                max_tokens=MAX_TOKENS,
-                temperature=TEMPERATURE,
-                top_p=TOP_P,
-                stream=False
-            )
-            text = parse_final_response(final)
-            yield text
             return
         except Exception as e2:
             yield f"[error] create_chat_completion failed: {e} | fallback error: {e2}"
             return

-    # If the returned object isn't iterable, treat as final
     if not hasattr(stream, "__iter__"):
-        final_text = parse_final_response(stream)
-        yield final_text
         return

     partial = ""
     yielded_any = False

     try:
         for chunk in stream:
-            # Debug: uncomment to log raw chunks
-            # print("STREAM CHUNK:", repr(chunk))

             if not chunk:
                 continue
-            # expected format: {"choices":[{"delta":{"content":"..."}}], ...}
             try:
-                choices = chunk.get("choices", [])
-            except Exception:
-                # some weird shapes might be simple strings
                 chunk_str = str(chunk)
                 if chunk_str:
                     partial += chunk_str
                     yielded_any = True
                     yield partial
-                continue
-
-            if len(choices) == 0:
-                continue
-
-            c0 = choices[0]
-            # delta content (usual)
-            delta = c0.get("delta", {})
-            if isinstance(delta, dict) and "content" in delta:
-                partial += delta["content"]
-                yielded_any = True
-                yield partial
-                continue
-
-            # final message object (some runners)
-            msg = c0.get("message") or c0.get("text") or {}
-            if isinstance(msg, dict):
-                content = msg.get("content") or msg.get("content_text") or ""
-                if content:
-                    partial = content
-                    yielded_any = True
-                    yield partial
-                continue
-            elif isinstance(msg, str) and msg:
-                partial += msg
-                yielded_any = True
-                yield partial
                continue

     except StopIteration:
@@ -283,11 +247,7 @@ def chat_fn(user_message, history):
     if not yielded_any:
         try:
             final = llm.create_chat_completion(
-                messages=messages,
-                max_tokens=MAX_TOKENS,
-                temperature=TEMPERATURE,
-                top_p=TOP_P,
-                stream=False
             )
             final_text = parse_final_response(final)
             yield final_text if final_text is not None else ""
@@ -296,13 +256,13 @@ def chat_fn(user_message, history):
             yield f"[error] fallback non-stream failed: {e}"
     return

-# ------------------ Launch Gradio ------------------
 demo = gr.ChatInterface(
     fn=chat_fn,
-    title="EuroLLM 1.7B (GGUF) Robust streaming chat",
-    description="System prompt enabled. Streaming ON.",
 )

 if __name__ == "__main__":
-    # If you want a public link during testing, pass share=True in launch()
     demo.launch()
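The body of build_messages is collapsed in the diff above. Going by the removed docstring ("Convert Gradio history (list of [user, assistant]) into chat messages"), the conversion it performs can be sketched roughly as below. This is an illustrative, standalone snippet rather than code from the commit; the helper name and details are hypothetical.

# Illustrative sketch only: the history-to-messages conversion the removed
# build_messages docstring describes. Function name is hypothetical.
SYSTEM_PROMPT = "You are a helpful assistant. Answer concisely and helpfully."

def history_to_messages(history, user_message, system_prompt=SYSTEM_PROMPT):
    messages = []
    if system_prompt:
        messages.append({"role": "system", "content": system_prompt})
    # Gradio ChatInterface history is a list of [user, assistant] pairs
    for user_turn, assistant_turn in history or []:
        if user_turn:
            messages.append({"role": "user", "content": user_turn})
        if assistant_turn:
            messages.append({"role": "assistant", "content": assistant_turn})
    messages.append({"role": "user", "content": user_message})
    return messages

print(history_to_messages([["Hi", "Hello!"]], "How are you?"))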
 
 
app.py (new version):
+# app.py (robust streaming + chat_format + debug)
 import os
 import shutil
 import time

 from llama_cpp import Llama
 import gradio as gr

+# ------------- CONFIG -------------
 REPO_ID = "mradermacher/EuroLLM-1.7B-Instruct-GGUF"
 FILENAME = "EuroLLM-1.7B-Instruct.Q8_0.gguf"
 SYSTEM_PROMPT = "You are a helpful assistant. Answer concisely and helpfully."

 os.makedirs(MODEL_DIR, exist_ok=True)
 DEST_PATH = os.path.join(MODEL_DIR, FILENAME)

 N_CTX = 2048
 MAX_TOKENS = 512
 TEMPERATURE = 0.2
 TOP_P = 0.95
 N_THREADS = min(4, max(1, (os.cpu_count() or 1) // 2))
+
+# Debug controls
+DEBUG_CHUNKS = True               # prints every raw stream chunk to logs
+DEBUG_SINGLESHOT_AT_START = True  # run a non-stream single-shot test at startup and log result
+# -----------------------------------

 def robust_download(repo_id: str, filename: str, dest: str, max_attempts: int = 2) -> str:

     if os.path.exists(dest) and os.path.getsize(dest) > 0:
         print(f"[robust_download] Already present: {dest} ({os.path.getsize(dest)} bytes)")
         return dest

     last_err = None
     for attempt in range(1, max_attempts + 1):
         try:
+            print(f"[robust_download] Attempt {attempt}: hf_hub_download...")
             cached_path = hf_hub_download(repo_id=repo_id, filename=filename, cache_dir=MODEL_DIR)
             print("[robust_download] hf_hub_download returned:", cached_path)
             if os.path.abspath(cached_path) != os.path.abspath(dest):
                 shutil.copy2(cached_path, dest)
             with open(dest, "rb") as f:
                 try:
                     f.flush()
                     os.fsync(f.fileno())
                 except Exception:
                     pass
+            os.chmod(dest, stat.S_IRUSR | stat.S_IWUSR | stat.S_IRGRP | stat.S_IROTH)
             size = os.path.getsize(dest)
             if size == 0:
                 raise RuntimeError("Downloaded file has size 0 after copy")
             print(f"[robust_download] Success: {dest} ({size} bytes)")
             return dest
         except Exception as e:
             print(f"[robust_download] hf_hub_download attempt {attempt} failed: {e}")
             last_err = e
             time.sleep(1)

+    # fallback: direct url
     try:
         print("[robust_download] Falling back to direct download via requests...")
         url = hf_hub_url(repo_id=repo_id, filename=filename)
         tmp_path = dest + ".part"
         with requests.get(url, stream=True, timeout=120) as r:
             r.raise_for_status()

                     f.write(chunk)
             f.flush()
         shutil.move(tmp_path, dest)
         with open(dest, "rb") as f:
             try:
                 os.fsync(f.fileno())

         print("[robust_download] Direct download failed:", e2)
         raise RuntimeError(f"All download attempts failed. last_err={last_err}, fallback_err={e2}")

+# Ensure model
 print("Ensuring model present at:", DEST_PATH)
 model_path = robust_download(REPO_ID, FILENAME, DEST_PATH)
 print("DEBUG: listing model dir:", MODEL_DIR)
 for fn in sorted(os.listdir(MODEL_DIR)):
     p = os.path.join(MODEL_DIR, fn)

         print(f" - {fn}: exists, size={st.st_size}, mode={oct(st.st_mode)}")
     except FileNotFoundError:
         print(f" - {fn}: NOT FOUND after copy")
 time.sleep(0.2)

+# ----------------- Llama init -----------------
 try:
     print("Initializing Llama with model_path:", model_path)
+    # *** IMPORTANT: set chat_format so the bindings format messages correctly ***
     llm = Llama(
         model_path=model_path,
         n_ctx=N_CTX,
         n_threads=N_THREADS,
         n_gpu_layers=0,
+        chat_format="chatml",  # <- often fixes blank replies for Llama-family GGUFs. See docs.
     )
+    print("Llama initialized.")
+except Exception as e:
+    print("Llama init failed:", e)
     raise

+# optional single-shot debug test at startup (prints final structure)
+def run_startup_test():
+    try:
+        test_messages = [
+            {"role": "system", "content": SYSTEM_PROMPT},
+            {"role": "user", "content": "Say hello in one short sentence."}
+        ]
+        print("[startup_test] Running single-shot create_chat_completion (stream=False)...")
+        out = llm.create_chat_completion(messages=test_messages, max_tokens=64, stream=False)
+        print("[startup_test] Single-shot response (raw):", out)
+    except Exception as e:
+        print("[startup_test] Error during single-shot test:", e)
+
+if DEBUG_SINGLESHOT_AT_START:
+    run_startup_test()
+
+# ----------------- helpers -----------------
 def build_messages(history, user_message, system_prompt=SYSTEM_PROMPT):

     messages = []
     if system_prompt:
         messages.append({"role": "system", "content": system_prompt})

     return messages

 def parse_final_response(resp):

     try:
         if resp is None:
             return ""

         choices = resp.get("choices", [])
         if len(choices) > 0:
             c = choices[0]
             if isinstance(c.get("message"), dict):
                 return c["message"].get("content", "") or ""
             if "text" in c and c["text"]:
                 return c["text"]
             if "delta" in c and isinstance(c["delta"], dict):
                 return c["delta"].get("content", "") or ""
         return str(resp)
     except Exception:
         return str(resp)

+# ----------------- robust streaming chat -----------------
 def chat_fn(user_message, history):

     messages = build_messages(history or [], user_message)
+
     try:
         stream = llm.create_chat_completion(
             messages=messages,

             stream=True
         )
     except Exception as e:
+        # immediate failure -> non-stream fallback
         try:
+            final = llm.create_chat_completion(messages=messages, max_tokens=MAX_TOKENS, stream=False)
+            yield parse_final_response(final)
             return
         except Exception as e2:
             yield f"[error] create_chat_completion failed: {e} | fallback error: {e2}"
             return

+    # If not iterable, treat as final
     if not hasattr(stream, "__iter__"):
+        yield parse_final_response(stream)
         return

     partial = ""
     yielded_any = False
+    buffer_for_unicode = ""  # helper to accumulate partial bytes/characters

     try:
         for chunk in stream:
+            if DEBUG_CHUNKS:
+                print("STREAM CHUNK:", repr(chunk))

             if not chunk:
                 continue
+
+            # try normal shape
+            choices = chunk.get("choices", []) if isinstance(chunk, dict) else []
+            if choices and len(choices) > 0:
+                c0 = choices[0]
+                delta = c0.get("delta", {})
+                if isinstance(delta, dict) and "content" in delta:
+                    # accumulate and yield only when new non-empty content appears
+                    new = delta["content"]
+                    if new:
+                        partial += new
+                        yielded_any = True
+                        yield partial
+                    continue
+
+                # some runners provide 'message' as full object
+                msg = c0.get("message") or c0.get("text")
+                if isinstance(msg, dict):
+                    content = msg.get("content") or msg.get("content_text") or ""
+                    if content:
+                        partial = content
+                        yielded_any = True
+                        yield partial
+                    continue
+                elif isinstance(msg, str) and msg:
+                    partial += msg
+                    yielded_any = True
+                    yield partial
+                    continue
+
+            # fallback: if chunk is a plain string or other shape, append its string form
             try:
                 chunk_str = str(chunk)
                 if chunk_str:
                     partial += chunk_str
                     yielded_any = True
                     yield partial
+            except Exception:
                 continue

     except StopIteration:

     if not yielded_any:
         try:
             final = llm.create_chat_completion(
+                messages=messages, max_tokens=MAX_TOKENS, temperature=TEMPERATURE, top_p=TOP_P, stream=False
             )
             final_text = parse_final_response(final)
             yield final_text if final_text is not None else ""

             yield f"[error] fallback non-stream failed: {e}"
     return

+# --------------- Launch Gradio ----------------
 demo = gr.ChatInterface(
     fn=chat_fn,
+    title="EuroLLM 1.7B (with robust streaming & chat_format)",
+    description="Streaming ON. Check logs for STREAM CHUNK lines if behaviour is blank.",
 )

 if __name__ == "__main__":
     demo.launch()
+
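For context on the new chat path, here is a minimal standalone sketch of the pattern the updated file relies on: constructing Llama with an explicit chat_format and accumulating the streamed delta content. It is illustrative only; the model path, thread count, and prompts below are placeholders, not values taken from this Space.

# Minimal sketch (assumption: llama-cpp-python is installed; the GGUF path is a placeholder).
from llama_cpp import Llama

llm = Llama(
    model_path="models/EuroLLM-1.7B-Instruct.Q8_0.gguf",  # placeholder path
    n_ctx=2048,
    n_threads=2,
    chat_format="chatml",  # have the bindings apply a chat template to the messages
)

messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Say hello in one short sentence."},
]

partial = ""
for chunk in llm.create_chat_completion(messages=messages, max_tokens=64, stream=True):
    delta = chunk["choices"][0].get("delta", {})
    piece = delta.get("content")
    if piece:
        partial += piece
        print(partial)  # a Gradio generator would `yield partial` here instead

Passing chat_format explicitly tells llama-cpp-python which prompt template to apply to the message list; when it is omitted, the bindings try to infer a template from the GGUF metadata, which the added comment in the commit suggests was producing blank replies here.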