CharlieBonito committed on
Commit
febdc07
verified
1 Parent(s): 2412ad0

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +79 -20
app.py CHANGED
@@ -17,8 +17,12 @@ LLAMA_SERVER = "/opt/llama-cpp/llama-server"
17
  MODEL_DIR = "/app/models"
18
  SERVER_URL = "http://127.0.0.1:8080"
19
  LOG_FILE = "/app/startup.log"
 
 
 
20
 
21
  server_ready = False
 
22
 
23
  def log(msg):
24
  stamp = datetime.now().strftime("%H:%M:%S")
@@ -29,7 +33,8 @@ def log(msg):
29
 
30
  def start_server():
31
  os.makedirs(MODEL_DIR, exist_ok=True)
32
- log("Descargando modelo (solo CPU)...")
 
33
  try:
34
  m_path = hf_hub_download(repo_id=MODEL_REPO, filename=MODEL_FILE, local_dir=MODEL_DIR)
35
  log(f"Modelo descargado en: {m_path}")
@@ -42,61 +47,115 @@ def start_server():
42
 
43
  env = os.environ.copy()
44
  env["LD_LIBRARY_PATH"] = "/usr/local/lib:" + env.get("LD_LIBRARY_PATH", "")
 
 
45
 
46
- # Comando CPU con TODO: 8 hilos, contexto 4096
47
  cmd = [
48
  LLAMA_SERVER,
49
  "-m", m_path,
50
  "--host", "127.0.0.1",
51
  "--port", "8080",
52
- "-c", "4096", # 4k de contexto (sobra RAM con 32 GB)
53
- "-ngl", "0", # Sin GPU
54
- "-t", "8", # Aprovechar los 8 núcleos
 
55
  "-np", "1",
 
 
 
 
 
 
56
  ]
57
- log(f"Lanzando (CPU, 8 hilos): {' '.join(cmd)}")
58
  return subprocess.Popen(
59
  cmd, env=env,
60
  stdout=subprocess.PIPE, stderr=subprocess.STDOUT,
61
  text=True, bufsize=1
62
  )
63
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
64
  def monitor_engine():
65
- global server_ready
66
  try:
67
  log("Arrancando monitor...")
68
  proc = start_server()
69
  log(f"PID llama-server: {proc.pid}")
 
70
  for line in proc.stdout:
71
  line = line.strip()
72
  log(f"[llama] {line}")
73
- if "HTTP server listening" in line:
74
- server_ready = True
75
- log("🔥 MOTOR EN LÍNEA (CPU 8C)")
76
  ret = proc.wait()
 
 
77
  log(f"llama-server terminó con código: {ret}")
78
  except Exception as e:
 
79
  log(f"EXCEPCIÓN MONITOR: {e}")
80
  log(traceback.format_exc())
81
 
82
  def respond(history):
83
  if not server_ready:
84
- yield "Motor cargando... aún no está listo. Esto puede tardar un poco en CPU."
 
 
 
85
  return
86
 
87
- api_messages = [{"role": "system", "content": "Eres ClarityGuard, un asistente neuroinclusivo. Responde con claridad y empatía."}]
 
 
 
 
 
 
 
88
  for m in history:
89
- content = m["content"]
90
- if isinstance(content, list):
91
- content = " ".join(p.get("text", "") for p in content if isinstance(p, dict))
92
- api_messages.append({"role": m["role"], "content": str(content)})
 
 
 
 
 
 
 
 
 
93
 
94
  try:
95
  r = requests.post(
96
  f"{SERVER_URL}/v1/chat/completions",
97
- json={"messages": api_messages, "stream": True, "temperature": 0.2, "max_tokens": 512},
98
- stream=True, timeout=120
 
 
 
 
 
 
99
  )
 
100
  full_text = ""
101
  for line in r.iter_lines():
102
  if not line:
@@ -117,7 +176,7 @@ def respond(history):
117
  yield f"Error: {e}"
118
 
119
  with gr.Blocks() as demo:
120
- gr.Markdown("# 🔍 ClarityGuard (CPU 8 hilos)")
121
  chatbot = gr.Chatbot(height=500)
122
  msg = gr.Textbox(placeholder="Escribe tu mensaje y presiona Enter...")
123
 
@@ -141,4 +200,4 @@ if __name__ == "__main__":
141
  with open(LOG_FILE, "w") as f:
142
  f.write("Iniciando...\n")
143
  threading.Thread(target=monitor_engine, daemon=True).start()
144
- demo.launch(server_name="0.0.0.0", server_port=7860)
 
17
  MODEL_DIR = "/app/models"
18
  SERVER_URL = "http://127.0.0.1:8080"
19
  LOG_FILE = "/app/startup.log"
20
+ CPU_THREADS = int(os.getenv("CPU_THREADS", "8"))
21
+ LLAMA_CTX = int(os.getenv("LLAMA_CTX", "4096"))
22
+ LLAMA_MAX_TOKENS = int(os.getenv("LLAMA_MAX_TOKENS", "4096"))
23
 
24
  server_ready = False
25
+ server_error = None
26
 
27
  def log(msg):
28
  stamp = datetime.now().strftime("%H:%M:%S")
 
33
 
34
  def start_server():
35
  os.makedirs(MODEL_DIR, exist_ok=True)
36
+ log(f"Configuración: CPU_THREADS={CPU_THREADS}, LLAMA_CTX={LLAMA_CTX}, LLAMA_MAX_TOKENS={LLAMA_MAX_TOKENS}")
37
+ log("Descargando modelo para inferencia CPU-only...")
38
  try:
39
  m_path = hf_hub_download(repo_id=MODEL_REPO, filename=MODEL_FILE, local_dir=MODEL_DIR)
40
  log(f"Modelo descargado en: {m_path}")
 
47
 
48
  env = os.environ.copy()
49
  env["LD_LIBRARY_PATH"] = "/usr/local/lib:" + env.get("LD_LIBRARY_PATH", "")
50
+ env["OMP_NUM_THREADS"] = str(CPU_THREADS)
51
+ env["OMP_PROC_BIND"] = "false"
52
 
53
+ # CPU-only: no GPU probing, no CUDA offload, tuned for 8 vCPU Spaces.
54
  cmd = [
55
  LLAMA_SERVER,
56
  "-m", m_path,
57
  "--host", "127.0.0.1",
58
  "--port", "8080",
59
+ "-c", str(LLAMA_CTX),
60
+ "-ngl", "0",
61
+ "-t", str(CPU_THREADS),
62
+ "-tb", str(CPU_THREADS),
63
  "-np", "1",
64
+ "-b", "512",
65
+ "-ub", "128",
66
+ "--threads-http", "2",
67
+ "--fit", "off",
68
+ "--no-mmap",
69
+ "--jinja",
70
  ]
71
+ log(f"Lanzando llama-server CPU-only: {' '.join(cmd)}")
72
  return subprocess.Popen(
73
  cmd, env=env,
74
  stdout=subprocess.PIPE, stderr=subprocess.STDOUT,
75
  text=True, bufsize=1
76
  )
77
 
78
+ def wait_until_ready(proc, timeout=900):
79
+ global server_ready, server_error
80
+ start = time.time()
81
+ while proc.poll() is None and time.time() - start < timeout:
82
+ try:
83
+ r = requests.get(f"{SERVER_URL}/health", timeout=2)
84
+ if r.status_code == 200:
85
+ server_ready = True
86
+ log("MOTOR EN LINEA (CPU-only)")
87
+ return
88
+ except Exception:
89
+ pass
90
+ time.sleep(2)
91
+ if proc.poll() is None:
92
+ server_error = "El motor no respondió al health-check dentro del tiempo esperado."
93
+ else:
94
+ server_error = f"El motor terminó antes de estar listo. Código: {proc.returncode}"
95
+
96
  def monitor_engine():
97
+ global server_error
98
  try:
99
  log("Arrancando monitor...")
100
  proc = start_server()
101
  log(f"PID llama-server: {proc.pid}")
102
+ threading.Thread(target=wait_until_ready, args=(proc,), daemon=True).start()
103
  for line in proc.stdout:
104
  line = line.strip()
105
  log(f"[llama] {line}")
 
 
 
106
  ret = proc.wait()
107
+ if ret != 0 and not server_error:
108
+ server_error = f"llama-server terminó con código {ret}"
109
  log(f"llama-server terminó con código: {ret}")
110
  except Exception as e:
111
+ server_error = str(e)
112
  log(f"EXCEPCIÓN MONITOR: {e}")
113
  log(traceback.format_exc())
114
 
115
  def respond(history):
116
  if not server_ready:
117
+ if server_error:
118
+ yield f"Motor no disponible: {server_error}"
119
+ return
120
+ yield "Motor cargando en CPU... puede tardar varios minutos la primera vez."
121
  return
122
 
123
+ api_messages = [{
124
+ "role": "system",
125
+ "content": (
126
+ "Eres ClarityGuard, un asistente neuroinclusivo. Responde con claridad, "
127
+ "empatía y estructura. Prioriza respuestas completas, pero evita repetir ideas "
128
+ "y termina con una conclusión o siguiente paso concreto."
129
+ ),
130
+ }]
131
  for m in history:
132
+ if isinstance(m, dict):
133
+ content = m.get("content", "")
134
+ if isinstance(content, list):
135
+ content = " ".join(p.get("text", "") for p in content if isinstance(p, dict))
136
+ api_messages.append({"role": m.get("role", "user"), "content": str(content)})
137
+ continue
138
+
139
+ if isinstance(m, (list, tuple)) and len(m) >= 2:
140
+ user_msg, assistant_msg = m[0], m[1]
141
+ if user_msg:
142
+ api_messages.append({"role": "user", "content": str(user_msg)})
143
+ if assistant_msg:
144
+ api_messages.append({"role": "assistant", "content": str(assistant_msg)})
145
 
146
  try:
147
  r = requests.post(
148
  f"{SERVER_URL}/v1/chat/completions",
149
+ json={
150
+ "model": MODEL_FILE,
151
+ "messages": api_messages,
152
+ "stream": True,
153
+ "temperature": 0.2,
154
+ "max_tokens": LLAMA_MAX_TOKENS,
155
+ },
156
+ stream=True, timeout=600
157
  )
158
+ r.raise_for_status()
159
  full_text = ""
160
  for line in r.iter_lines():
161
  if not line:
 
176
  yield f"Error: {e}"
177
 
178
  with gr.Blocks() as demo:
179
+ gr.Markdown("# ClarityGuard (CPU-only backup)")
180
  chatbot = gr.Chatbot(height=500)
181
  msg = gr.Textbox(placeholder="Escribe tu mensaje y presiona Enter...")
182
 
 
200
  with open(LOG_FILE, "w") as f:
201
  f.write("Iniciando...\n")
202
  threading.Thread(target=monitor_engine, daemon=True).start()
203
+ demo.launch(server_name="0.0.0.0", server_port=7860)