Spaces:
OrbitMC
/
Runtime error

OrbitMC committed on
Commit
6baa494
·
verified ·
1 Parent(s): 353bafa

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +29 -23
app.py CHANGED
@@ -8,8 +8,11 @@ import asyncio
8
  from pathlib import Path
9
  from flask import Flask, request, jsonify, send_from_directory, Response
10
 
11
- # GGUF / CPU Backend Imports
12
- from gpt4all import GPT4All
 
 
 
13
  from huggingface_hub import hf_hub_download
14
 
15
  import edge_tts
@@ -24,9 +27,9 @@ TTS_RATE = int(os.environ.get("TTS_RATE", "-4"))
24
  TTS_PITCH = int(os.environ.get("TTS_PITCH", "7"))
25
  IMG_DIR = Path(__file__).parent / "img"
26
 
27
- # GGUF Model Config (Defaulting to the Mistral Instruct from your example)
28
- GGUF_REPO = os.environ.get("GGUF_REPO", "HauhauCS/Qwen3.5-2B-Uncensored-HauhauCS-Aggressive")
29
- GGUF_FILE = os.environ.get("GGUF_FILE", "Qwen3.5-2B-Uncensored-HauhauCS-Aggressive-Q4_K_M.gguf")
30
  MODEL_DIR = Path(__file__).parent / "models"
31
 
32
  # ══════════════════════════════════════════════════════════════════
@@ -73,10 +76,10 @@ def clean_for_tts(text: str) -> str:
73
  return clean
74
 
75
  # ══════════════════════════════════════════════════════════════════
76
- # MODEL LOADING (GPT4ALL CPU)
77
  # ══════════════════════════════════════════════════════════════════
78
  print("=" * 60)
79
- print(" Visual AI -- Booting Systems (GGUF CPU Backend)")
80
  print("=" * 60)
81
 
82
  model = None
@@ -85,15 +88,22 @@ try:
85
  MODEL_DIR.mkdir(parents=True, exist_ok=True)
86
 
87
  print(f"[MODEL] Verifying/Downloading {GGUF_FILE} from {GGUF_REPO} ...")
88
- hf_hub_download(
89
  repo_id=GGUF_REPO,
90
  filename=GGUF_FILE,
91
  local_dir=str(MODEL_DIR),
92
  local_dir_use_symlinks=False
93
  )
94
 
95
- print(f"[MODEL] Loading {GGUF_FILE} on CPU ...")
96
- model = GPT4All(GGUF_FILE, model_path=str(MODEL_DIR), allow_download=False, device="cpu")
 
 
 
 
 
 
 
97
  print(" OK Model loaded successfully!")
98
  except Exception as exc:
99
  print(f" FAILED Model load error: {exc}")
@@ -121,7 +131,7 @@ def add_to_memory(sid: str, role: str, content: str):
121
  # ══════════════════════════════════════════════════════════════════
122
  STOP_TOKENS = [
123
  "<end_of_turn>", "<start_of_turn>",
124
- "User:", "<|endoftext|>", "[/INST]", "</s>", "<|im_end|>"
125
  ]
126
 
127
  def generate_response(user_input: str, session_id: str) -> str:
@@ -131,7 +141,7 @@ def generate_response(user_input: str, session_id: str) -> str:
131
  memory = get_memory(session_id)
132
  recent = memory[-(6 * 2):]
133
 
134
- # Build prompt string explicitly for strict control
135
  prompt = f"System: {SYSTEM_PROMPT}\n\n"
136
  for msg in recent:
137
  label = "User" if msg["role"] == "user" else "Ana"
@@ -139,28 +149,24 @@ def generate_response(user_input: str, session_id: str) -> str:
139
  prompt += f"User: {user_input}\nAna:"
140
 
141
  try:
142
- # GPT4All Generation
143
- response = model.generate(
144
  prompt=prompt,
145
  max_tokens=MAX_NEW_TOKENS,
146
- temp=0.90,
147
  top_k=50,
148
  top_p=0.95,
149
  repeat_penalty=1.1,
150
- streaming=False
 
151
  )
 
152
  except Exception as exc:
153
  print(f"[GENERATE] Error: {exc}")
154
  traceback.print_exc()
155
  return "[sad] Something went wrong in my mind. Could you say that again?"
156
 
157
- response = response.strip()
158
-
159
  # Post-process cleanup
160
- for stop in STOP_TOKENS:
161
- if stop in response:
162
- response = response.split(stop)[0].strip()
163
-
164
  if "\n\n" in response:
165
  response = response.split("\n\n")[0].strip()
166
 
@@ -602,7 +608,7 @@ def clear():
602
  def health():
603
  return jsonify({
604
  "model_loaded": model is not None,
605
- "backend": "gpt4all (CPU GGUF)",
606
  })
607
 
608
  if __name__ == "__main__":
 
8
  from pathlib import Path
9
  from flask import Flask, request, jsonify, send_from_directory, Response
10
 
11
+ # ══════════════════════════════════════════════════════════════════
12
+ # LLAMA.CPP BACKEND IMPORTS (Universal GGUF Support)
13
+ # ══════════════════════════════════════════════════════════════════
14
+ # Install via: pip install llama-cpp-python huggingface-hub
15
+ from llama_cpp import Llama
16
  from huggingface_hub import hf_hub_download
17
 
18
  import edge_tts
 
27
  TTS_PITCH = int(os.environ.get("TTS_PITCH", "7"))
28
  IMG_DIR = Path(__file__).parent / "img"
29
 
30
+ # You can swap this with ANY GGUF model supported by llama.cpp
31
+ GGUF_REPO = os.environ.get("GGUF_REPO", "Qwen/Qwen2.5-1.5B-Instruct-GGUF")
32
+ GGUF_FILE = os.environ.get("GGUF_FILE", "qwen2.5-1.5b-instruct-q4_k_m.gguf")
33
  MODEL_DIR = Path(__file__).parent / "models"
34
 
35
  # ══════════════════════════════════════════════════════════════════
 
76
  return clean
77
 
78
  # ══════════════════════════════════════════════════════════════════
79
+ # MODEL LOADING (llama.cpp CPU)
80
  # ══════════════════════════════════════════════════════════════════
81
  print("=" * 60)
82
+ print(" Visual AI -- Booting Systems (llama.cpp Backend)")
83
  print("=" * 60)
84
 
85
  model = None
 
88
  MODEL_DIR.mkdir(parents=True, exist_ok=True)
89
 
90
  print(f"[MODEL] Verifying/Downloading {GGUF_FILE} from {GGUF_REPO} ...")
91
+ model_path = hf_hub_download(
92
  repo_id=GGUF_REPO,
93
  filename=GGUF_FILE,
94
  local_dir=str(MODEL_DIR),
95
  local_dir_use_symlinks=False
96
  )
97
 
98
+ print(f"[MODEL] Loading {GGUF_FILE} on CPU with llama.cpp ...")
99
+ # n_ctx determines context length.
100
+ # n_threads utilizes optimal CPU cores automatically if set to None.
101
+ model = Llama(
102
+ model_path=model_path,
103
+ n_ctx=4096,
104
+ n_threads=max(1, os.cpu_count() - 1),
105
+ verbose=False # Set to True to see llama.cpp debug logs
106
+ )
107
  print(" OK Model loaded successfully!")
108
  except Exception as exc:
109
  print(f" FAILED Model load error: {exc}")
 
131
  # ══════════════════════════════════════════════════════════════════
132
  STOP_TOKENS = [
133
  "<end_of_turn>", "<start_of_turn>",
134
+ "User:", "<|endoftext|>", "[/INST]", "</s>", "<|im_end|>", "\nUser:"
135
  ]
136
 
137
  def generate_response(user_input: str, session_id: str) -> str:
 
141
  memory = get_memory(session_id)
142
  recent = memory[-(6 * 2):]
143
 
144
+ # Build prompt string
145
  prompt = f"System: {SYSTEM_PROMPT}\n\n"
146
  for msg in recent:
147
  label = "User" if msg["role"] == "user" else "Ana"
 
149
  prompt += f"User: {user_input}\nAna:"
150
 
151
  try:
152
+ # llama.cpp Generation
153
+ output = model.create_completion(
154
  prompt=prompt,
155
  max_tokens=MAX_NEW_TOKENS,
156
+ temperature=0.90,
157
  top_k=50,
158
  top_p=0.95,
159
  repeat_penalty=1.1,
160
+ stop=STOP_TOKENS,
161
+ echo=False
162
  )
163
+ response = output["choices"][0]["text"].strip()
164
  except Exception as exc:
165
  print(f"[GENERATE] Error: {exc}")
166
  traceback.print_exc()
167
  return "[sad] Something went wrong in my mind. Could you say that again?"
168
 
 
 
169
  # Post-process cleanup
 
 
 
 
170
  if "\n\n" in response:
171
  response = response.split("\n\n")[0].strip()
172
 
 
608
  def health():
609
  return jsonify({
610
  "model_loaded": model is not None,
611
+ "backend": "llama.cpp (CPU GGUF)",
612
  })
613
 
614
  if __name__ == "__main__":