Ana-2

Runtime error

App Files Files Community

OrbitMC committed on Apr 1

Commit

3cd46db

verified ·

1 Parent(s): 6baa494

Update app.py

Browse files

Files changed (1) hide show

app.py +85 -51

app.py CHANGED Viewed

@@ -5,16 +5,14 @@ import base64
 import threading
 import traceback
 import asyncio
 from pathlib import Path
 from flask import Flask, request, jsonify, send_from_directory, Response
-# ══════════════════════════════════════════════════════════════════
-# LLAMA.CPP BACKEND IMPORTS (Universal GGUF Support)
-# ══════════════════════════════════════════════════════════════════
-# Install via: pip install llama-cpp-python huggingface-hub
-from llama_cpp import Llama
 from huggingface_hub import hf_hub_download
 import edge_tts
 # ══════════════════════════════════════════════════════════════════
@@ -27,10 +25,12 @@ TTS_RATE       = int(os.environ.get("TTS_RATE",  "-4"))
 TTS_PITCH      = int(os.environ.get("TTS_PITCH", "7"))
 IMG_DIR        = Path(__file__).parent / "img"
-# You can swap this with ANY GGUF model supported by llama.cpp
 GGUF_REPO      = os.environ.get("GGUF_REPO", "Qwen/Qwen2.5-1.5B-Instruct-GGUF")
 GGUF_FILE      = os.environ.get("GGUF_FILE", "qwen2.5-1.5b-instruct-q4_k_m.gguf")
 MODEL_DIR      = Path(__file__).parent / "models"
 # ══════════════════════════════════════════════════════════════════
 # SYSTEM PROMPT
@@ -76,38 +76,70 @@ def clean_for_tts(text: str) -> str:
     return clean
 # ══════════════════════════════════════════════════════════════════
-# MODEL LOADING (llama.cpp CPU)
 # ══════════════════════════════════════════════════════════════════
 print("=" * 60)
-print("  Visual AI -- Booting Systems (llama.cpp Backend)")
 print("=" * 60)
-model = None
-try:
     MODEL_DIR.mkdir(parents=True, exist_ok=True)
-    print(f"[MODEL] Verifying/Downloading {GGUF_FILE} from {GGUF_REPO} ...")
     model_path = hf_hub_download(
         repo_id=GGUF_REPO,
         filename=GGUF_FILE,
         local_dir=str(MODEL_DIR),
         local_dir_use_symlinks=False
     )
-    print(f"[MODEL] Loading {GGUF_FILE} on CPU with llama.cpp ...")
-    # n_ctx determines context length.
-    # n_threads utilizes optimal CPU cores automatically if set to None.
-    model = Llama(
-        model_path=model_path,
-        n_ctx=4096,
-        n_threads=max(1, os.cpu_count() - 1),
-        verbose=False # Set to True to see llama.cpp debug logs
-    )
-    print("  OK  Model loaded successfully!")
-except Exception as exc:
-    print(f"  FAILED  Model load error: {exc}")
-    traceback.print_exc()
 # ══════════════════════════════════════════════════════════════════
 # CHAT MEMORY
@@ -127,7 +159,7 @@ def add_to_memory(sid: str, role: str, content: str):
             sessions[sid] = sessions[sid][-(MAX_MEMORY * 2):]
 # ══════════════════════════════════════════════════════════════════
-# RESPONSE GENERATION
 # ══════════════════════════════════════════════════════════════════
 STOP_TOKENS = [
     "<end_of_turn>", "<start_of_turn>",
@@ -135,38 +167,44 @@ STOP_TOKENS = [
 ]
 def generate_response(user_input: str, session_id: str) -> str:
-    if model is None:
-        return "[sad] My mind is offline right now. Please give me a moment."
     memory = get_memory(session_id)
     recent = memory[-(6 * 2):]
-    # Build prompt string
     prompt = f"System: {SYSTEM_PROMPT}\n\n"
     for msg in recent:
         label = "User" if msg["role"] == "user" else "Ana"
         prompt += f"{label}: {msg['content']}\n"
     prompt += f"User: {user_input}\nAna:"
     try:
-        # llama.cpp Generation
-        output = model.create_completion(
-            prompt=prompt,
-            max_tokens=MAX_NEW_TOKENS,
-            temperature=0.90,
-            top_k=50,
-            top_p=0.95,
-            repeat_penalty=1.1,
-            stop=STOP_TOKENS,
-            echo=False
-        )
-        response = output["choices"][0]["text"].strip()
     except Exception as exc:
-        print(f"[GENERATE] Error: {exc}")
         traceback.print_exc()
         return "[sad] Something went wrong in my mind. Could you say that again?"
     # Post-process cleanup
     if "\n\n" in response:
         response = response.split("\n\n")[0].strip()
@@ -244,10 +282,6 @@ body{
   justify-content:center;
 }
-/*
-  object-fit: contain prevents cuts/overflow and displays the full image intact.
-  No transitions = INSTANT image swapping.
-*/
 #bgImg{
   width:100%;
   height:100%;
@@ -607,8 +641,8 @@ def clear():
 @app.route("/health")
 def health():
     return jsonify({
-        "model_loaded": model is not None,
-        "backend":      "llama.cpp (CPU GGUF)",
     })
 if __name__ == "__main__":

 import threading
 import traceback
 import asyncio
+import urllib.request
+import zipfile
+import subprocess
+import time
+import requests
 from pathlib import Path
 from flask import Flask, request, jsonify, send_from_directory, Response
 from huggingface_hub import hf_hub_download
 import edge_tts
 # ══════════════════════════════════════════════════════════════════
 TTS_PITCH      = int(os.environ.get("TTS_PITCH", "7"))
 IMG_DIR        = Path(__file__).parent / "img"
+# You can swap this with ANY GGUF model
 GGUF_REPO      = os.environ.get("GGUF_REPO", "Qwen/Qwen2.5-1.5B-Instruct-GGUF")
 GGUF_FILE      = os.environ.get("GGUF_FILE", "qwen2.5-1.5b-instruct-q4_k_m.gguf")
 MODEL_DIR      = Path(__file__).parent / "models"
+LLAMA_BIN_DIR  = Path(__file__).parent / "llama_bin"
+LLAMA_EXE      = LLAMA_BIN_DIR / "llama-server"
 # ══════════════════════════════════════════════════════════════════
 # SYSTEM PROMPT
     return clean
 # ══════════════════════════════════════════════════════════════════
+# NATIVE LLAMA.CPP SERVER (BYPASS PIP COMPILATION ENTIRELY)
 # ══════════════════════════════════════════════════════════════════
 print("=" * 60)
+print("  Visual AI -- Booting Systems (Native llama.cpp Backend)")
 print("=" * 60)
+def setup_and_start_backend():
+    # 1. Download Model
     MODEL_DIR.mkdir(parents=True, exist_ok=True)
+    print(f"[SETUP] Verifying/Downloading Model: {GGUF_FILE} ...")
     model_path = hf_hub_download(
         repo_id=GGUF_REPO,
         filename=GGUF_FILE,
         local_dir=str(MODEL_DIR),
         local_dir_use_symlinks=False
     )
+    # 2. Download Pre-compiled Binary (Instant)
+    if not LLAMA_EXE.exists():
+        print("[SETUP] Bypassing python pip - Downloading pre-compiled C++ binary directly...")
+        LLAMA_BIN_DIR.mkdir(parents=True, exist_ok=True)
+        zip_path = LLAMA_BIN_DIR / "llama.zip"
+        url = "https://github.com/ggerganov/llama.cpp/releases/download/b3800/llama-b3800-bin-ubuntu-x64.zip"
+        urllib.request.urlretrieve(url, zip_path)
+        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
+            zip_ref.extractall(LLAMA_BIN_DIR)
+        os.remove(zip_path)
+        # Locate the binary in the unzipped folder
+        for root, _, files in os.walk(LLAMA_BIN_DIR):
+            if "llama-server" in files:
+                found_exe = os.path.join(root, "llama-server")
+                os.chmod(found_exe, 0o755)
+                if found_exe != str(LLAMA_EXE):
+                    os.rename(found_exe, str(LLAMA_EXE))
+                break
+    # 3. Boot Server in Background
+    cpu_cores = os.cpu_count() or 2
+    threads = str(max(1, cpu_cores - 1))
+    print(f"[SETUP] Starting Native llama-server engine on {threads} CPU threads...")
+    subprocess.Popen([
+        str(LLAMA_EXE),
+        "-m", model_path,
+        "-c", "4096",
+        "--port", "8080",
+        "--host", "127.0.0.1",
+        "-t", threads
+    ], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
+    # 4. Wait for Server to wake up
+    for _ in range(30):
+        try:
+            if requests.get("http://127.0.0.1:8080/").status_code == 200:
+                print("[SETUP] llama-server backend is ONLINE and ready!")
+                return True
+        except requests.exceptions.ConnectionError:
+            time.sleep(1)
+    print("[SETUP] FAILED to start llama-server backend.")
+    return False
+backend_ready = setup_and_start_backend()
 # ══════════════════════════════════════════════════════════════════
 # CHAT MEMORY
             sessions[sid] = sessions[sid][-(MAX_MEMORY * 2):]
 # ══════════════════════════════════════════════════════════════════
+# RESPONSE GENERATION (Proxied to local native binary)
 # ══════════════════════════════════════════════════════════════════
 STOP_TOKENS = [
     "<end_of_turn>", "<start_of_turn>",
 ]
 def generate_response(user_input: str, session_id: str) -> str:
+    if not backend_ready:
+        return "[sad] My core engine failed to start. Please check the logs."
     memory = get_memory(session_id)
     recent = memory[-(6 * 2):]
+    # Build prompt string explicitly
     prompt = f"System: {SYSTEM_PROMPT}\n\n"
     for msg in recent:
         label = "User" if msg["role"] == "user" else "Ana"
         prompt += f"{label}: {msg['content']}\n"
     prompt += f"User: {user_input}\nAna:"
+    payload = {
+        "prompt": prompt,
+        "n_predict": MAX_NEW_TOKENS,
+        "temperature": 0.90,
+        "top_k": 50,
+        "top_p": 0.95,
+        "repeat_penalty": 1.1,
+        "stop": STOP_TOKENS,
+        "stream": False
+    }
     try:
+        # Request completion natively from our C++ binary
+        res = requests.post("http://127.0.0.1:8080/completion", json=payload, timeout=60).json()
+        response = res.get("content", "").strip()
     except Exception as exc:
+        print(f"[GENERATE] Error communicating with llama-server: {exc}")
         traceback.print_exc()
         return "[sad] Something went wrong in my mind. Could you say that again?"
     # Post-process cleanup
+    for stop in STOP_TOKENS:
+        if stop in response:
+            response = response.split(stop)[0].strip()
     if "\n\n" in response:
         response = response.split("\n\n")[0].strip()
   justify-content:center;
 }
 #bgImg{
   width:100%;
   height:100%;
 @app.route("/health")
 def health():
     return jsonify({
+        "backend_ready": backend_ready,
+        "type": "native-llama-server"
     })
 if __name__ == "__main__":