Fu01978 committed on
Commit 50d2614 · verified · 1 Parent(s): b53b3ec

Update app.py

Files changed (1)
  1. app.py +190 -53
app.py CHANGED
@@ -7,22 +7,32 @@ from huggingface_hub import hf_hub_download, hf_hub_url
 from llama_cpp import Llama
 import gradio as gr

-# --------- config ----------
+# ------------------ CONFIG ------------------
 REPO_ID = "mradermacher/EuroLLM-1.7B-Instruct-GGUF"
 FILENAME = "EuroLLM-1.7B-Instruct.Q8_0.gguf"
 SYSTEM_PROMPT = "You are a helpful assistant. Answer concisely and helpfully."
 MODEL_DIR = os.path.join(os.path.abspath(os.path.dirname(__file__)), "models")
 os.makedirs(MODEL_DIR, exist_ok=True)
 DEST_PATH = os.path.join(MODEL_DIR, FILENAME)
-# ---------------------------
+
+# Llama runtime params
+N_CTX = 2048
+MAX_TOKENS = 512
+TEMPERATURE = 0.2
+TOP_P = 0.95
+# Threads: be conservative; Spaces gives limited CPU
+N_THREADS = min(4, max(1, (os.cpu_count() or 1) // 2))
+# --------------------------------------------

 def robust_download(repo_id: str, filename: str, dest: str, max_attempts: int = 2) -> str:
     """
-    Try hf_hub_download first. If it returns a cached path, copy it into `dest`.
-    If hf_hub_download fails, fallback to huggingface direct URL via requests.
-    After placing the file at dest, fsync and chmod to ensure llama-cpp-python can read it.
+    Download model robustly:
+      1) Try hf_hub_download (may return a cached path)
+      2) If cached path differs, copy to dest, fsync, chmod
+      3) If hf_hub_download fails repeat attempts then fallback to direct URL via requests
+    Returns the final dest path (guaranteed to exist and be readable or raises)
     """
-    # quick exit if already present with some reasonable size
+    # quick exit
     if os.path.exists(dest) and os.path.getsize(dest) > 0:
         print(f"[robust_download] Already present: {dest} ({os.path.getsize(dest)} bytes)")
         return dest
@@ -30,31 +40,29 @@ def robust_download(repo_id: str, filename: str, dest: str, max_attempts: int =
     last_err = None
     for attempt in range(1, max_attempts + 1):
         try:
-            print(f"[robust_download] Attempt {attempt}: hf_hub_download...")
+            print(f"[robust_download] Attempt {attempt}: hf_hub_download(repo_id={repo_id}, filename={filename})")
             cached_path = hf_hub_download(repo_id=repo_id, filename=filename, cache_dir=MODEL_DIR)
             print("[robust_download] hf_hub_download returned:", cached_path)

-            # If hf_hub_download already saved directly at dest, great
-            if os.path.abspath(cached_path) == os.path.abspath(dest):
-                print("[robust_download] File is already at dest.")
-            else:
-                # copy to dest (safer than os.replace in some mount setups)
+            # If hf_hub_download already saved at dest, good.
+            if os.path.abspath(cached_path) != os.path.abspath(dest):
                 print(f"[robust_download] Copying cached file -> {dest}")
                 shutil.copy2(cached_path, dest)
+            else:
+                print("[robust_download] Cached path equals dest; no copy needed.")

-            # ensure it's synced to disk and readable
+            # ensure it's synced to disk
             with open(dest, "rb") as f:
-                f.flush()
                 try:
+                    f.flush()
                     os.fsync(f.fileno())
-                except OSError:
-                    # some environments / filesystems may not support fsync; ignore if so
+                except Exception:
+                    # some hosted filesystems don't support fsync; ignore but we tried
                     pass

-            # set sane permissions
+            # set readable permissions
             os.chmod(dest, stat.S_IRUSR | stat.S_IWUSR | stat.S_IRGRP | stat.S_IROTH)  # 0o644

-            # verify
             size = os.path.getsize(dest)
             if size == 0:
                 raise RuntimeError("Downloaded file has size 0 after copy")
@@ -62,44 +70,43 @@ def robust_download(repo_id: str, filename: str, dest: str, max_attempts: int =
             return dest

         except Exception as e:
-            print(f"[robust_download] hf_hub_download attempt failed: {e}")
+            print(f"[robust_download] hf_hub_download attempt {attempt} failed: {e}")
             last_err = e
-            # fallback to direct requests download next loop or after attempts exhausted
             time.sleep(1)

-    # fallback: direct URL using hf_hub_url and requests
+    # fallback: direct URL using hf_hub_url + requests
     try:
         print("[robust_download] Falling back to direct download via requests...")
         url = hf_hub_url(repo_id=repo_id, filename=filename)
         print("[robust_download] Downloading from URL:", url)
+        tmp_path = dest + ".part"
         with requests.get(url, stream=True, timeout=120) as r:
             r.raise_for_status()
-            tmp_path = dest + ".part"
             with open(tmp_path, "wb") as f:
                 for chunk in r.iter_content(chunk_size=8192):
                     if chunk:
                         f.write(chunk)
                 f.flush()
-            # move into place
-            shutil.move(tmp_path, dest)
+        shutil.move(tmp_path, dest)
+
         # fsync + chmod
         with open(dest, "rb") as f:
             try:
                 os.fsync(f.fileno())
-            except OSError:
+            except Exception:
                 pass
         os.chmod(dest, stat.S_IRUSR | stat.S_IWUSR | stat.S_IRGRP | stat.S_IROTH)
         print("[robust_download] Direct download success:", dest)
         return dest
     except Exception as e2:
         print("[robust_download] Direct download failed:", e2)
-        raise RuntimeError(f"All download attempts failed. last_err={last_err}, last_fallback_err={e2}")
+        raise RuntimeError(f"All download attempts failed. last_err={last_err}, fallback_err={e2}")

-# ---------- ensure model present ----------
-print("Trying to ensure model is present at DEST_PATH:", DEST_PATH)
+# ------------------ Ensure model exists ------------------
+print("Ensuring model present at:", DEST_PATH)
 model_path = robust_download(REPO_ID, FILENAME, DEST_PATH)

-# debug check: list the models folder and stat
+# Debug listing of model dir (useful in logs)
 print("DEBUG: listing model dir:", MODEL_DIR)
 for fn in sorted(os.listdir(MODEL_DIR)):
     p = os.path.join(MODEL_DIR, fn)
@@ -107,22 +114,22 @@ for fn in sorted(os.listdir(MODEL_DIR)):
         st = os.stat(p)
         print(f" - {fn}: exists, size={st.st_size}, mode={oct(st.st_mode)}")
     except FileNotFoundError:
-        print(f" - {fn}: NOT FOUND after copy (weird)")
+        print(f" - {fn}: NOT FOUND after copy")

-# small safety wait for FS to settle (rarely needed, but prevents race in some hosted FS)
+# tiny delay for some hosted FS race conditions
 time.sleep(0.2)

-# ---------- initialize llama -----------
+# ------------------ Initialize Llama ------------------
 try:
     print("Initializing Llama with model_path:", model_path)
     llm = Llama(
         model_path=model_path,
-        n_ctx=2048,
-        n_threads=4,
+        n_ctx=N_CTX,
+        n_threads=N_THREADS,
         n_gpu_layers=0,
     )
+    print("Llama initialized successfully.")
 except ValueError as ve:
-    # dump extra debug info for logs and re-raise with context
     print("Llama init ValueError:", ve)
     print("Model dir listing at failure:")
     for fn in sorted(os.listdir(MODEL_DIR)):
@@ -134,38 +141,168 @@ except ValueError as ve:
             print(" * stat failed for", p, ex)
     raise

-# ---------- chat utilities ----------
+# ------------------ Helpers ------------------
 def build_messages(history, user_message, system_prompt=SYSTEM_PROMPT):
+    """
+    Convert Gradio history (list of [user, assistant]) into chat messages:
+    [{role: system}, {role: user}, {role: assistant}, ... , {role: user}]
+    """
     messages = []
     if system_prompt:
         messages.append({"role": "system", "content": system_prompt})
     for user_msg, assistant_msg in history:
         messages.append({"role": "user", "content": user_msg})
-        if assistant_msg:
+        if assistant_msg is not None and assistant_msg != "":
             messages.append({"role": "assistant", "content": assistant_msg})
     messages.append({"role": "user", "content": user_message})
     return messages

+def parse_final_response(resp):
+    """
+    Normalize single-shot responses to a string.
+    Handles common llama-cpp-python shapes.
+    """
+    try:
+        if resp is None:
+            return ""
+        if isinstance(resp, str):
+            return resp
+        if isinstance(resp, dict):
+            choices = resp.get("choices", [])
+            if len(choices) > 0:
+                c = choices[0]
+                # message.content (chat format)
+                if isinstance(c.get("message"), dict):
+                    return c["message"].get("content", "") or ""
+                # text fallback
+                if "text" in c and c["text"]:
+                    return c["text"]
+                # delta fallback
+                if "delta" in c and isinstance(c["delta"], dict):
+                    return c["delta"].get("content", "") or ""
+        return str(resp)
+    except Exception:
+        return str(resp)
+
+# ------------------ Robust streaming chat function ------------------
 def chat_fn(user_message, history):
+    """
+    Generator for Gradio ChatInterface streaming.
+    Yields progressive partial strings. Falls back to non-streaming if needed.
+    Ensures Gradio never sees an empty generator (avoids StopAsyncIteration crash).
+    """
     messages = build_messages(history or [], user_message)
-    stream = llm.create_chat_completion(
-        messages=messages,
-        max_tokens=512,
-        temperature=0.2,
-        top_p=0.95,
-        stream=True
-    )
-    partial = ""
-    for chunk in stream:
+    # Attempt streaming call
+    try:
+        stream = llm.create_chat_completion(
+            messages=messages,
+            max_tokens=MAX_TOKENS,
+            temperature=TEMPERATURE,
+            top_p=TOP_P,
+            stream=True
+        )
+    except Exception as e:
+        # immediate failure: try non-stream fallback
         try:
-            if "choices" in chunk and len(chunk["choices"]) > 0:
-                delta = chunk["choices"][0].get("delta", {})
-                if "content" in delta:
-                    partial += delta["content"]
+            final = llm.create_chat_completion(
+                messages=messages,
+                max_tokens=MAX_TOKENS,
+                temperature=TEMPERATURE,
+                top_p=TOP_P,
+                stream=False
+            )
+            text = parse_final_response(final)
+            yield text
+            return
+        except Exception as e2:
+            yield f"[error] create_chat_completion failed: {e} | fallback error: {e2}"
+            return
+
+    # If the returned object isn't iterable, treat as final
+    if not hasattr(stream, "__iter__"):
+        final_text = parse_final_response(stream)
+        yield final_text
+        return
+
+    partial = ""
+    yielded_any = False
+
+    try:
+        for chunk in stream:
+            # Debug: uncomment to log raw chunks
+            # print("STREAM CHUNK:", repr(chunk))
+
+            if not chunk:
+                continue
+            # expected format: {"choices":[{"delta":{"content":"..."}}], ...}
+            try:
+                choices = chunk.get("choices", [])
+            except Exception:
+                # some weird shapes might be simple strings
+                chunk_str = str(chunk)
+                if chunk_str:
+                    partial += chunk_str
+                    yielded_any = True
                     yield partial
-        except Exception:
-            continue
+                continue
+
+            if len(choices) == 0:
+                continue
+
+            c0 = choices[0]
+            # delta content (usual)
+            delta = c0.get("delta", {})
+            if isinstance(delta, dict) and "content" in delta:
+                partial += delta["content"]
+                yielded_any = True
+                yield partial
+                continue
+
+            # final message object (some runners)
+            msg = c0.get("message") or c0.get("text") or {}
+            if isinstance(msg, dict):
+                content = msg.get("content") or msg.get("content_text") or ""
+                if content:
+                    partial = content
+                    yielded_any = True
+                    yield partial
+                    continue
+            elif isinstance(msg, str) and msg:
+                partial += msg
+                yielded_any = True
+                yield partial
+                continue
+
+    except StopIteration:
+        pass
+    except Exception as e:
+        yield f"[error] stream iteration error: {e}"
+        return
+
+    # If streaming produced nothing, fallback to non-stream
+    if not yielded_any:
+        try:
+            final = llm.create_chat_completion(
+                messages=messages,
+                max_tokens=MAX_TOKENS,
+                temperature=TEMPERATURE,
+                top_p=TOP_P,
+                stream=False
+            )
+            final_text = parse_final_response(final)
+            yield final_text if final_text is not None else ""
+            return
+        except Exception as e:
+            yield f"[error] fallback non-stream failed: {e}"
+            return
+
+# ------------------ Launch Gradio ------------------
+demo = gr.ChatInterface(
+    fn=chat_fn,
+    title="EuroLLM 1.7B (GGUF) — Robust streaming chat",
+    description="System prompt enabled. Streaming ON.",
+)

-demo = gr.ChatInterface(fn=chat_fn, title="EuroLLM 1.7B (robust loader + streaming)")
 if __name__ == "__main__":
+    # If you want a public link during testing, pass share=True in launch()
     demo.launch()
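For reference, the core pattern this commit introduces in chat_fn (try a streaming create_chat_completion, accumulate delta content, and fall back to a single non-streaming call if the stream produces nothing) can be sketched in isolation. The sketch below is illustrative only and is not part of the commit; stream_with_fallback is a hypothetical helper name, it assumes `llm` is any llama-cpp-python Llama instance such as the one app.py builds, and it relies on the same response shapes shown in the diff: streaming chunks carry choices[0]["delta"]["content"], non-streaming responses carry choices[0]["message"]["content"].

def stream_with_fallback(llm, messages, max_tokens=64):
    # Minimal sketch of the chat_fn pattern: stream first, fall back if nothing arrives.
    partial = ""
    yielded_any = False
    try:
        stream = llm.create_chat_completion(messages=messages, max_tokens=max_tokens,
                                            temperature=0.2, top_p=0.95, stream=True)
        for chunk in stream:
            delta = (chunk.get("choices") or [{}])[0].get("delta", {})
            if delta.get("content"):
                partial += delta["content"]
                yielded_any = True
                yield partial  # each yield is a progressively longer partial reply
    except Exception:
        pass  # fall through to the non-streaming call below
    if not yielded_any:
        final = llm.create_chat_completion(messages=messages, max_tokens=max_tokens,
                                           temperature=0.2, top_p=0.95, stream=False)
        yield final["choices"][0]["message"]["content"]

Consumed as `for partial in stream_with_fallback(llm, build_messages([], "Hello")): ...`, the last value yielded is the full reply, which is what gr.ChatInterface ends up displaying; the generator never finishes without yielding at least once, which is the failure mode the commit guards against.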