Ana-2

Runtime error

App Files Files Community

OrbitMC committed on Apr 1

Commit

6baa494

verified ·

1 Parent(s): 353bafa

Update app.py

Browse files

Files changed (1) hide show

app.py +29 -23

app.py CHANGED Viewed

@@ -8,8 +8,11 @@ import asyncio
 from pathlib import Path
 from flask import Flask, request, jsonify, send_from_directory, Response
-# GGUF / CPU Backend Imports
-from gpt4all import GPT4All
 from huggingface_hub import hf_hub_download
 import edge_tts
@@ -24,9 +27,9 @@ TTS_RATE       = int(os.environ.get("TTS_RATE",  "-4"))
 TTS_PITCH      = int(os.environ.get("TTS_PITCH", "7"))
 IMG_DIR        = Path(__file__).parent / "img"
-# GGUF Model Config (defaults to the Qwen3.5-2B GGUF build named below; override via the GGUF_REPO / GGUF_FILE env vars)
-GGUF_REPO      = os.environ.get("GGUF_REPO", "HauhauCS/Qwen3.5-2B-Uncensored-HauhauCS-Aggressive")
-GGUF_FILE      = os.environ.get("GGUF_FILE", "Qwen3.5-2B-Uncensored-HauhauCS-Aggressive-Q4_K_M.gguf")
 MODEL_DIR      = Path(__file__).parent / "models"
 # ══════════════════════════════════════════════════════════════════
@@ -73,10 +76,10 @@ def clean_for_tts(text: str) -> str:
     return clean
 # ══════════════════════════════════════════════════════════════════
-# MODEL LOADING (GPT4ALL CPU)
 # ══════════════════════════════════════════════════════════════════
 print("=" * 60)
-print("  Visual AI -- Booting Systems (GGUF CPU Backend)")
 print("=" * 60)
 model = None
@@ -85,15 +88,22 @@ try:
     MODEL_DIR.mkdir(parents=True, exist_ok=True)
     print(f"[MODEL] Verifying/Downloading {GGUF_FILE} from {GGUF_REPO} ...")
-    hf_hub_download(
         repo_id=GGUF_REPO,
         filename=GGUF_FILE,
         local_dir=str(MODEL_DIR),
         local_dir_use_symlinks=False
     )
-    print(f"[MODEL] Loading {GGUF_FILE} on CPU ...")
-    model = GPT4All(GGUF_FILE, model_path=str(MODEL_DIR), allow_download=False, device="cpu")
     print("  OK  Model loaded successfully!")
 except Exception as exc:
     print(f"  FAILED  Model load error: {exc}")
@@ -121,7 +131,7 @@ def add_to_memory(sid: str, role: str, content: str):
 # ══════════════════════════════════════════════════════════════════
 STOP_TOKENS = [
     "<end_of_turn>", "<start_of_turn>",
-    "User:", "<|endoftext|>", "[/INST]", "</s>", "<|im_end|>"
 ]
 def generate_response(user_input: str, session_id: str) -> str:
@@ -131,7 +141,7 @@ def generate_response(user_input: str, session_id: str) -> str:
     memory = get_memory(session_id)
     recent = memory[-(6 * 2):]
-    # Build prompt string explicitly for strict control
     prompt = f"System: {SYSTEM_PROMPT}\n\n"
     for msg in recent:
         label = "User" if msg["role"] == "user" else "Ana"
@@ -139,28 +149,24 @@ def generate_response(user_input: str, session_id: str) -> str:
     prompt += f"User: {user_input}\nAna:"
     try:
-        # GPT4All Generation
-        response = model.generate(
             prompt=prompt,
             max_tokens=MAX_NEW_TOKENS,
-            temp=0.90,
             top_k=50,
             top_p=0.95,
             repeat_penalty=1.1,
-            streaming=False
         )
     except Exception as exc:
         print(f"[GENERATE] Error: {exc}")
         traceback.print_exc()
         return "[sad] Something went wrong in my mind. Could you say that again?"
-    response = response.strip()
     # Post-process cleanup
-    for stop in STOP_TOKENS:
-        if stop in response:
-            response = response.split(stop)[0].strip()
     if "\n\n" in response:
         response = response.split("\n\n")[0].strip()
@@ -602,7 +608,7 @@ def clear():
 def health():
     return jsonify({
         "model_loaded": model is not None,
-        "backend":      "gpt4all (CPU GGUF)",
     })
 if __name__ == "__main__":

 from pathlib import Path
 from flask import Flask, request, jsonify, send_from_directory, Response
+# ══════════════════════════════════════════════════════════════════
+# LLAMA.CPP BACKEND IMPORTS (Universal GGUF Support)
+# ══════════════════════════════════════════════════════════════════
+# Install via: pip install llama-cpp-python huggingface-hub
+from llama_cpp import Llama
 from huggingface_hub import hf_hub_download
 import edge_tts
 TTS_PITCH      = int(os.environ.get("TTS_PITCH", "7"))
 IMG_DIR        = Path(__file__).parent / "img"
+# You can swap this with ANY GGUF model supported by llama.cpp
+GGUF_REPO      = os.environ.get("GGUF_REPO", "Qwen/Qwen2.5-1.5B-Instruct-GGUF")
+GGUF_FILE      = os.environ.get("GGUF_FILE", "qwen2.5-1.5b-instruct-q4_k_m.gguf")
 MODEL_DIR      = Path(__file__).parent / "models"
 # ══════════════════════════════════════════════════════════════════
     return clean
 # ══════════════════════════════════════════════════════════════════
+# MODEL LOADING (llama.cpp CPU)
 # ══════════════════════════════════════════════════════════════════
 print("=" * 60)
+print("  Visual AI -- Booting Systems (llama.cpp Backend)")
 print("=" * 60)
 model = None
     MODEL_DIR.mkdir(parents=True, exist_ok=True)
     print(f"[MODEL] Verifying/Downloading {GGUF_FILE} from {GGUF_REPO} ...")
+    model_path = hf_hub_download(
         repo_id=GGUF_REPO,
         filename=GGUF_FILE,
         local_dir=str(MODEL_DIR),
         local_dir_use_symlinks=False
     )
+    print(f"[MODEL] Loading {GGUF_FILE} on CPU with llama.cpp ...")
+    # n_ctx sets the context window length in tokens.
+    # n_threads is set explicitly to all but one CPU core (minimum 1);
+    # passing None instead would let llama.cpp choose the thread count itself.
+    model = Llama(
+        model_path=model_path,
+        n_ctx=4096,
+        n_threads=max(1, os.cpu_count() - 1),
+        verbose=False # Set to True to see llama.cpp debug logs
+    )
     print("  OK  Model loaded successfully!")
 except Exception as exc:
     print(f"  FAILED  Model load error: {exc}")
 # ══════════════════════════════════════════════════════════════════
 STOP_TOKENS = [
     "<end_of_turn>", "<start_of_turn>",
+    "User:", "<|endoftext|>", "[/INST]", "</s>", "<|im_end|>", "\nUser:"
 ]
 def generate_response(user_input: str, session_id: str) -> str:
     memory = get_memory(session_id)
     recent = memory[-(6 * 2):]
+    # Build prompt string
     prompt = f"System: {SYSTEM_PROMPT}\n\n"
     for msg in recent:
         label = "User" if msg["role"] == "user" else "Ana"
     prompt += f"User: {user_input}\nAna:"
     try:
+        # llama.cpp Generation
+        output = model.create_completion(
             prompt=prompt,
             max_tokens=MAX_NEW_TOKENS,
+            temperature=0.90,
             top_k=50,
             top_p=0.95,
             repeat_penalty=1.1,
+            stop=STOP_TOKENS,
+            echo=False
         )
+        response = output["choices"][0]["text"].strip()
     except Exception as exc:
         print(f"[GENERATE] Error: {exc}")
         traceback.print_exc()
         return "[sad] Something went wrong in my mind. Could you say that again?"
     # Post-process cleanup
     if "\n\n" in response:
         response = response.split("\n\n")[0].strip()
 def health():
     return jsonify({
         "model_loaded": model is not None,
+        "backend":      "llama.cpp (CPU GGUF)",
     })
 if __name__ == "__main__":