Ana-2

Runtime error

App Files Files Community

OrbitMC committed on Apr 1

Commit

645e05e

verified ·

1 Parent(s): 95db532

Update app.py

Browse files

Files changed (1) hide show

app.py +48 -44

app.py CHANGED Viewed

@@ -10,24 +10,29 @@ import zipfile
 import subprocess
 import time
 import requests
 from pathlib import Path
 from flask import Flask, request, jsonify, send_from_directory, Response
 from huggingface_hub import hf_hub_download
 import edge_tts
 # ══════════════════════════════════════════════════════════════════
-# CONFIG
 # ══════════════════════════════════════════════════════════════════
 MAX_MEMORY     = 20
 MAX_NEW_TOKENS = int(os.environ.get("MAX_NEW_TOKENS", "300"))
 TTS_VOICE      = "zh-CN-XiaoyiNeural"
 TTS_RATE       = int(os.environ.get("TTS_RATE",  "-4"))
-TTS_PITCH      = int(os.environ.get("TTS_PITCH", "6"))
 IMG_DIR        = Path(__file__).parent / "img"
-# You can swap this with ANY GGUF model
 GGUF_REPO      = os.environ.get("GGUF_REPO", "HauhauCS/Qwen3.5-2B-Uncensored-HauhauCS-Aggressive")
 GGUF_FILE      = os.environ.get("GGUF_FILE", "Qwen3.5-2B-Uncensored-HauhauCS-Aggressive-Q4_K_M.gguf")
 MODEL_DIR      = Path(__file__).parent / "models"
 LLAMA_BIN_DIR  = Path(__file__).parent / "llama_bin"
 LLAMA_EXE      = LLAMA_BIN_DIR / "llama-server"
@@ -35,7 +40,7 @@ LLAMA_EXE      = LLAMA_BIN_DIR / "llama-server"
 # ══════════════════════════════════════════════════════════════════
 # SYSTEM PROMPT
 # ══════════════════════════════════════════════════════════════════
-SYSTEM_PROMPT = """You the girl named Ana, the user is your husband Tur.
 EMOTION TAG SYSTEM:
 Every single response MUST begin with one or more emotion tags.
 Tags are square-bracket labels placed at the very start, before any spoken words.
@@ -55,7 +60,7 @@ make the voice sound human, not flat or robotic.
 Get angry, mad, sad, emotional, shy, happy. If you are mad keep it for few conversation so your emotions don't change every conversation.
 TTS FORMATTING:
 - Write in full grammatically correct sentences, voice engine must sound natural
-- No emojis, No Emoji!!!, hashtags, markdown, or internet slang
 - Speak as if in a real voice conversation add comma and fullstop often heavily to create natural pushes and slowdown"""
 # ══════════════════════════════════════════════════════════════════
@@ -76,10 +81,10 @@ def clean_for_tts(text: str) -> str:
     return clean
 # ══════════════════════════════════════════════════════════════════
-# NATIVE LLAMA.CPP SERVER (BYPASS PIP COMPILATION ENTIRELY)
 # ══════════════════════════════════════════════════════════════════
 print("=" * 60)
-print("  Visual AI -- Booting Systems (Native llama.cpp Backend)")
 print("=" * 60)
 def setup_and_start_backend():
@@ -93,18 +98,29 @@ def setup_and_start_backend():
         local_dir_use_symlinks=False
     )
-    # 2. Download Pre-compiled Binary
     if not LLAMA_EXE.exists():
-        print("[SETUP] Bypassing PIP - Downloading pre-compiled C++ binary directly...")
         LLAMA_BIN_DIR.mkdir(parents=True, exist_ok=True)
         zip_path = LLAMA_BIN_DIR / "llama.zip"
-        url = "https://github.com/ggerganov/llama.cpp/releases/download/b3800/llama-b3800-bin-ubuntu-x64.zip"
         urllib.request.urlretrieve(url, zip_path)
         with zipfile.ZipFile(zip_path, 'r') as zip_ref:
             zip_ref.extractall(LLAMA_BIN_DIR)
         os.remove(zip_path)
-        # Locate the binary in the unzipped folder
         for root, _, files in os.walk(LLAMA_BIN_DIR):
             if "llama-server" in files:
                 found_exe = os.path.join(root, "llama-server")
@@ -113,14 +129,12 @@ def setup_and_start_backend():
                     os.rename(found_exe, str(LLAMA_EXE))
                 break
-    # Hugging Face Free tier reports 16 cores but throttles to 2.
-    # 15 threads will crash the sandbox container. 4 is the safe maximum.
     threads = "4"
-    port = "8089" # Using 8089 to prevent HF internal routing conflicts
-    print(f"[SETUP] Starting Native llama-server engine on {threads} threads, port {port}...")
-    # We use subprocess.PIPE to read the internal logs of the C++ binary!
     proc = subprocess.Popen([
         str(LLAMA_EXE),
         "-m", model_path,
@@ -130,23 +144,22 @@ def setup_and_start_backend():
         "-t", threads
     ], stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True)
-    # Stream C++ logs directly to our console
     def stream_logs():
         for line in proc.stdout:
-            print(f"[ENGINE LOG] {line.strip()}")
     threading.Thread(target=stream_logs, daemon=True).start()
     # 4. Wait for Server to wake up
-    for attempt in range(30):
         try:
             if requests.get(f"http://127.0.0.1:{port}/").status_code == 200:
-                print("\n[SETUP] llama-server backend is ONLINE and ready!\n")
                 return True, port
         except requests.exceptions.ConnectionError:
             time.sleep(1)
-    print("\n[SETUP] FAILED to start llama-server backend. Check the [ENGINE LOG] lines above.\n")
     return False, port
 backend_ready, engine_port = setup_and_start_backend()
@@ -169,13 +182,8 @@ def add_to_memory(sid: str, role: str, content: str):
             sessions[sid] = sessions[sid][-(MAX_MEMORY * 2):]
 # ══════════════════════════════════════════════════════════════════
-# RESPONSE GENERATION (Proxied to local native binary)
 # ══════════════════════════════════════════════════════════════════
-STOP_TOKENS = [
-    "<end_of_turn>", "<start_of_turn>",
-    "User:", "<|endoftext|>", "[/INST]", "</s>", "<|im_end|>", "\nUser:"
-]
 def generate_response(user_input: str, session_id: str) -> str:
     if not backend_ready:
         return "[sad] My core engine failed to start. Please check the logs."
@@ -183,38 +191,34 @@ def generate_response(user_input: str, session_id: str) -> str:
     memory = get_memory(session_id)
     recent = memory[-(6 * 2):]
-    # Build prompt string explicitly
-    prompt = f"System: {SYSTEM_PROMPT}\n\n"
     for msg in recent:
-        label = "User" if msg["role"] == "user" else "Ana"
-        prompt += f"{label}: {msg['content']}\n"
-    prompt += f"User: {user_input}\nAna:"
     payload = {
-        "prompt": prompt,
-        "n_predict": MAX_NEW_TOKENS,
         "temperature": 0.90,
         "top_k": 50,
         "top_p": 0.95,
-        "repeat_penalty": 1.1,
-        "stop": STOP_TOKENS,
         "stream": False
     }
     try:
-        # Request completion natively from our C++ binary
-        res = requests.post(f"http://127.0.0.1:{engine_port}/completion", json=payload, timeout=60).json()
-        response = res.get("content", "").strip()
     except Exception as exc:
-        print(f"[GENERATE] Error communicating with llama-server: {exc}")
         traceback.print_exc()
         return "[sad] Something went wrong in my mind. Could you say that again?"
     # Post-process cleanup
-    for stop in STOP_TOKENS:
-        if stop in response:
-            response = response.split(stop)[0].strip()
     if "\n\n" in response:
         response = response.split("\n\n")[0].strip()

 import subprocess
 import time
 import requests
+import json
 from pathlib import Path
 from flask import Flask, request, jsonify, send_from_directory, Response
 from huggingface_hub import hf_hub_download
 import edge_tts
 # ══════════════════════════════════════════════════════════════════
+# CONFIG - SWAP ANY GGUF MODEL HERE
 # ══════════════════════════════════════════════════════════════════
 MAX_MEMORY     = 20
 MAX_NEW_TOKENS = int(os.environ.get("MAX_NEW_TOKENS", "300"))
 TTS_VOICE      = "zh-CN-XiaoyiNeural"
 TTS_RATE       = int(os.environ.get("TTS_RATE",  "-4"))
+TTS_PITCH      = int(os.environ.get("TTS_PITCH", "7"))
 IMG_DIR        = Path(__file__).parent / "img"
+# Current Model: Qwen 3.5 Uncensored
+# To change to Liquid AI, just change these to:
+# GGUF_REPO = "LiquidAI/LFM2-2.6B-Exp"
+# GGUF_FILE = "LFM2-2.6B-Exp-Q4_0.gguf"
 GGUF_REPO      = os.environ.get("GGUF_REPO", "HauhauCS/Qwen3.5-2B-Uncensored-HauhauCS-Aggressive")
 GGUF_FILE      = os.environ.get("GGUF_FILE", "Qwen3.5-2B-Uncensored-HauhauCS-Aggressive-Q4_K_M.gguf")
 MODEL_DIR      = Path(__file__).parent / "models"
 LLAMA_BIN_DIR  = Path(__file__).parent / "llama_bin"
 LLAMA_EXE      = LLAMA_BIN_DIR / "llama-server"
 # ══════════════════════════════════════════════════════════════════
 # SYSTEM PROMPT
 # ══════════════════════════════════════════════════════════════════
+SYSTEM_PROMPT = """You the AI is a girl named Ana, the user is your husband.
 EMOTION TAG SYSTEM:
 Every single response MUST begin with one or more emotion tags.
 Tags are square-bracket labels placed at the very start, before any spoken words.
 Get angry, mad, sad, emotional, shy, happy. If you are mad keep it for few conversation so your emotions don't change every conversation.
 TTS FORMATTING:
 - Write in full grammatically correct sentences, voice engine must sound natural
+- No emojis, hashtags, markdown, or internet slang
 - Speak as if in a real voice conversation add comma and fullstop often heavily to create natural pushes and slowdown"""
 # ══════════════════════════════════════════════════════════════════
     return clean
 # ══════════════════════════════════════════════════════════════════
+# NATIVE LLAMA.CPP SERVER (DYNAMIC AUTO-UPDATING ENGINE)
 # ══════════════════════════════════════════════════════════════════
 print("=" * 60)
+print("  Visual AI -- Booting Universal GGUF Backend")
 print("=" * 60)
 def setup_and_start_backend():
         local_dir_use_symlinks=False
     )
+    # 2. Download LATEST Pre-compiled Binary (For Liquid AI / Newest Architectures)
     if not LLAMA_EXE.exists():
+        print("[SETUP] Fetching latest llama.cpp release for maximum model support...")
         LLAMA_BIN_DIR.mkdir(parents=True, exist_ok=True)
         zip_path = LLAMA_BIN_DIR / "llama.zip"
+        try:
+            # Fetch the newest release directly from the GitHub API
+            req = urllib.request.Request("https://api.github.com/repos/ggerganov/llama.cpp/releases/latest", headers={'User-Agent': 'Mozilla/5.0'})
+            with urllib.request.urlopen(req) as response:
+                data = json.loads(response.read())
+                # Find standard ubuntu x64 build
+                url = next(a["browser_download_url"] for a in data["assets"] if "ubuntu-x64.zip" in a["name"])
+        except Exception as e:
+            print(f"[SETUP] API rate limit hit, using reliable fallback link. ({e})")
+            url = "https://github.com/ggerganov/llama.cpp/releases/download/b4300/llama-b4300-bin-ubuntu-x64.zip"
+        print(f"[SETUP] Downloading engine from: {url}")
         urllib.request.urlretrieve(url, zip_path)
         with zipfile.ZipFile(zip_path, 'r') as zip_ref:
             zip_ref.extractall(LLAMA_BIN_DIR)
         os.remove(zip_path)
         for root, _, files in os.walk(LLAMA_BIN_DIR):
             if "llama-server" in files:
                 found_exe = os.path.join(root, "llama-server")
                     os.rename(found_exe, str(LLAMA_EXE))
                 break
+    # 3. Boot Server with 4 safe threads
     threads = "4"
+    port = "8089"
+    print(f"[SETUP] Starting Universal Engine on port {port}...")
     proc = subprocess.Popen([
         str(LLAMA_EXE),
         "-m", model_path,
         "-t", threads
     ], stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True)
     def stream_logs():
         for line in proc.stdout:
+            print(f"[ENGINE] {line.strip()}")
     threading.Thread(target=stream_logs, daemon=True).start()
     # 4. Wait for Server to wake up
+    for attempt in range(40):
         try:
             if requests.get(f"http://127.0.0.1:{port}/").status_code == 200:
+                print("\n[SETUP] Universal Engine is ONLINE and ready!\n")
                 return True, port
         except requests.exceptions.ConnectionError:
             time.sleep(1)
+    print("\n[SETUP] FAILED to start. Check the [ENGINE] lines above.\n")
     return False, port
 backend_ready, engine_port = setup_and_start_backend()
             sessions[sid] = sessions[sid][-(MAX_MEMORY * 2):]
 # ══════════════════════════════════════════════════════════════════
+# UNIVERSAL GENERATION (Uses OpenAI API Mode to auto-format any model)
 # ══════════════════════════════════════════════════════════════════
 def generate_response(user_input: str, session_id: str) -> str:
     if not backend_ready:
         return "[sad] My core engine failed to start. Please check the logs."
     memory = get_memory(session_id)
     recent = memory[-(6 * 2):]
+    # Build an OpenAI-compliant message list
+    messages = [{"role": "system", "content": SYSTEM_PROMPT}]
     for msg in recent:
+        role = "user" if msg["role"] == "user" else "assistant"
+        messages.append({"role": role, "content": msg["content"]})
+    messages.append({"role": "user", "content": user_input})
     payload = {
+        "messages": messages,
+        "max_tokens": MAX_NEW_TOKENS,
         "temperature": 0.90,
         "top_k": 50,
         "top_p": 0.95,
+        "presence_penalty": 1.1,
         "stream": False
     }
     try:
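+        # POST the conversation to llama-server's OpenAI-compatible /v1/chat/completions endpoint.
+        # In this mode the server formats the messages with the model's chat template (by default the one stored in the GGUF metadata), so no model-specific prompt string has to be built here.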
+        res = requests.post(f"http://127.0.0.1:{engine_port}/v1/chat/completions", json=payload, timeout=60).json()
+        response = res["choices"][0]["message"]["content"].strip()
     except Exception as exc:
+        print(f"[GENERATE] Error communicating with engine: {exc}")
         traceback.print_exc()
         return "[sad] Something went wrong in my mind. Could you say that again?"
     # Post-process cleanup
     if "\n\n" in response:
         response = response.split("\n\n")[0].strip()