Spaces:

CharlieBonito
/

ClarityGuardAgent

Sleeping

App Files Community

CharlieBonito committed on Apr 24

Commit

4e37b96

verified ·

1 Parent(s): 66f6169

Update app.py

Browse files

Files changed (1) hide show

app.py +27 -38

app.py CHANGED Viewed

@@ -30,12 +30,12 @@ def log(msg):
 def start_server():
     os.makedirs(MODEL_DIR, exist_ok=True)
-    log("Descargando modelo (puede tardar varios minutos)...")
     try:
         m_path = hf_hub_download(repo_id=MODEL_REPO, filename=MODEL_FILE, local_dir=MODEL_DIR)
-        log(f"Modelo descargado en: {m_path}")
         mm_path = hf_hub_download(repo_id=MODEL_REPO, filename=MMPROJ_FILE, local_dir=MODEL_DIR)
-        log(f"mmproj descargado en: {mm_path}")
     except Exception as e:
         log(f"FALLO en descarga: {e}")
         raise
@@ -45,30 +45,22 @@ def start_server():
     log(f"Binario OK: {LLAMA_SERVER}")
-    # Chequeo rápido de dependencias (opcional)
-    try:
-        result = subprocess.run(["ldd", LLAMA_SERVER], capture_output=True, text=True)
-        log("ldd output:\n" + result.stdout)
-    except Exception:
-        log("No se pudo ejecutar ldd, continuando...")
     env = os.environ.copy()
-    # Dejar que el sistema maneje las librerías
-    ld_path = "/usr/local/lib:/usr/local/cuda/lib64:/usr/local/nvidia/lib64:" + env.get("LD_LIBRARY_PATH", "")
-    env["LD_LIBRARY_PATH"] = ld_path
     cmd = [
         LLAMA_SERVER,
         "-m", m_path,
-        "--mmproj", mm_path,        # Visión activada
         "--host", "127.0.0.1",
         "--port", "8080",
-        "-c", "4096",
-        "-ngl", "99",
-        "-fa", "on",                # Flash Attention
-        "-np", "1",
-        "-fit", "off",
-        # Sin --no-mmap, el binario precompilado maneja bien mmap
     ]
     log(f"Lanzando: {' '.join(cmd)}")
     return subprocess.Popen(
@@ -77,32 +69,30 @@ def start_server():
         text=True, bufsize=1
     )
 def monitor_engine():
     global server_ready
     try:
-        log("Arrancando monitor_engine...")
         proc = start_server()
-        log(f"PID llama-server: {proc.pid}")
         for line in proc.stdout:
             line = line.strip()
             log(f"[llama] {line}")
             if "HTTP server listening" in line:
                 server_ready = True
-                log("MOTOR EN LINEA")
         ret = proc.wait()
-        log(f"llama-server terminó con código: {ret}")
     except Exception as e:
-        log(f"EXCEPCION MONITOR: {e}")
         log(traceback.format_exc())
 def respond(history):
     if not server_ready:
-        yield "Motor cargando... aún no está listo. Consulta los logs para más información."
         return
-    api_messages = [{"role": "system", "content": "Eres ClarityGuard. Responde de forma breve y amigable."}]
     for m in history:
         content = m["content"]
         if isinstance(content, list):
@@ -112,8 +102,8 @@ def respond(history):
     try:
         r = requests.post(
             f"{SERVER_URL}/v1/chat/completions",
-            json={"messages": api_messages, "stream": True, "temperature": 0.2, "max_tokens": 512},
-            stream=True, timeout=90
         )
         full_text = ""
         for line in r.iter_lines():
@@ -126,18 +116,17 @@ def respond(history):
             if chunk == "[DONE]":
                 break
             try:
-                delta = json.loads(chunk)["choices"][0].get("delta", {}).get("content", "")
                 full_text += delta
                 yield full_text
-            except Exception:
                 continue
     except Exception as e:
-        yield f"Error: {e}"
 with gr.Blocks() as demo:
-    gr.Markdown("# 🔍 ClarityGuard — Neuro-inclusive Communication Assistant")
-    chatbot = gr.Chatbot(height=450)
     msg = gr.Textbox(placeholder="Escribe tu mensaje y presiona Enter...")
     def user_fn(message, history):
@@ -149,7 +138,7 @@ with gr.Blocks() as demo:
     def bot_fn(history):
         history.append({"role": "assistant", "content": ""})
         for chunk in respond(history[:-1]):
-            history[-1] = {"role": "assistant", "content": chunk}
             yield history
     msg.submit(user_fn, [msg, chatbot], [msg, chatbot]).then(

 def start_server():
     os.makedirs(MODEL_DIR, exist_ok=True)
+    log("Descargando modelo...")
     try:
         m_path = hf_hub_download(repo_id=MODEL_REPO, filename=MODEL_FILE, local_dir=MODEL_DIR)
+        log(f"Modelo en: {m_path}")
         mm_path = hf_hub_download(repo_id=MODEL_REPO, filename=MMPROJ_FILE, local_dir=MODEL_DIR)
+        log(f"mmproj en: {mm_path}")
     except Exception as e:
         log(f"FALLO en descarga: {e}")
         raise
     log(f"Binario OK: {LLAMA_SERVER}")
     env = os.environ.copy()
+    env["LD_LIBRARY_PATH"] = "/usr/local/lib:/usr/local/cuda/lib64:" + env.get("LD_LIBRARY_PATH", "")
+    # A100 con 80 GB VRAM → contexto amplio y Flash Attention
     cmd = [
         LLAMA_SERVER,
         "-m", m_path,
+        "--mmproj", mm_path,
         "--host", "127.0.0.1",
         "--port", "8080",
+        "-c", "16384",       # 16k de contexto (sobra espacio)
+        "-ngl", "99",        # todas las capas a GPU
+        "-fa", "on",         # Flash Attention
+        "-np", "1",          # un solo slot de inferencia
+        "-fit", "off",       # desactivar ajuste automático
+        # mmap por defecto (rápido y fiable)
     ]
     log(f"Lanzando: {' '.join(cmd)}")
     return subprocess.Popen(
         text=True, bufsize=1
     )
 def monitor_engine():
     global server_ready
     try:
+        log("Arrancando monitor...")
         proc = start_server()
+        log(f"PID: {proc.pid}")
         for line in proc.stdout:
             line = line.strip()
             log(f"[llama] {line}")
             if "HTTP server listening" in line:
                 server_ready = True
+                log("🔥 MOTOR EN LÍNEA (A100)")
         ret = proc.wait()
+        log(f"Servidor terminó con código: {ret}")
     except Exception as e:
+        log(f"EXCEPCIÓN MONITOR: {e}")
         log(traceback.format_exc())
 def respond(history):
     if not server_ready:
+        yield "⚡ Cargando el motor en la A100... espera unos segundos."
         return
+    api_messages = [{"role": "system", "content": "Eres ClarityGuard, un asistente neuroinclusivo de análisis de comunicación. Responde con empatía y precisión."}]
     for m in history:
         content = m["content"]
         if isinstance(content, list):
     try:
         r = requests.post(
             f"{SERVER_URL}/v1/chat/completions",
+            json={"messages": api_messages, "stream": True, "temperature": 0.2, "max_tokens": 1024},
+            stream=True, timeout=120
         )
         full_text = ""
         for line in r.iter_lines():
             if chunk == "[DONE]":
                 break
             try:
+                delta = json.loads(chunk)["choices"][0]["delta"]["content"]
                 full_text += delta
                 yield full_text
+            except:
                 continue
     except Exception as e:
+        yield f"❌ Error: {e}"
 with gr.Blocks() as demo:
+    gr.Markdown("# 🔍 ClarityGuard — Asistente Neuroinclusivo (A100)")
+    chatbot = gr.Chatbot(height=500)
     msg = gr.Textbox(placeholder="Escribe tu mensaje y presiona Enter...")
     def user_fn(message, history):
     def bot_fn(history):
         history.append({"role": "assistant", "content": ""})
         for chunk in respond(history[:-1]):
+            history[-1]["content"] = chunk
             yield history
     msg.submit(user_fn, [msg, chatbot], [msg, chatbot]).then(